diff --git a/README.md b/README.md index 0f95dca..a1e6789 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,6 @@ # BEDMS -BEDMS (BED Metadata Standardizer) is a tool used to standardize genomics/epigenomics metadata based on a schema chosen by the user ( eg. ENCODE, FAIRTRACKS, BEDBASE). - +BEDMS (BED Metadata Standardizer) is a tool designed to standardize genomics and epigenomics metadata attributes according to user-selected schemas such as `ENCODE`, `FAIRTRACKS` and `BEDBASE`. BEDMS ensures consistency and FAIRness of metadata across different platforms. Additionally, users have the option to train their own standardizer model using a custom schema (`CUSTOM`), allowing for the standardization of attributes based on users' specific research requirements. ## Installation @@ -16,22 +15,72 @@ pip install git+https://github.com/databio/bedms.git ## Usage +### Standardizing based on available schemas + +To choose the schema you want to standardize according to, please refer to the [HuggingFace repository](https://huggingface.co/databio/attribute-standardizer-model6). Based on the schema design `.yaml` files, you can select which schema best represents your attributes. In the example below, we have chosen the `encode` schema. + ```python from bedms import AttrStandardizer -model = AttrStandardizer("ENCODE") +model = AttrStandardizer( + repo_id="databio/attribute-standardizer-model6", model_name="encode" +) results = model.standardize(pep="geo/gse228634:default") assert results ``` +### Training custom schemas +Training your custom schema is very easy with `BEDMS`. You would need two things to get started: +1. Training Sets +2. 
`training_config.yaml` + +To instantiate the `AttrStandardizerTrainer` class: + +```python +from bedms.train import AttrStandardizerTrainer + +trainer = AttrStandardizerTrainer("training_config.yaml") -To see the available schemas, you can run: ``` -from bedms.constants import AVAILABLE_SCHEMAS -print(AVAILABLE_SCHEMAS) +To load the datasets and encode them: -# >> ['ENCODE', 'FAIRTRACKS', 'BEDBASE'] +```python +train_data, val_data, test_data, label_encoder, vectorizer = trainer.load_data() +``` + +To train the custom model: +```python +trainer.train() +``` + +To test the custom model: + +```python +test_results_dict = trainer.test() ``` -AVAILABLE_SCHEMAS is a list of available schemas that you can use to standardize your metadata. + +To generate visualizations such as Learning Curves, Confusion Matrices, and ROC Curve: + +```python +acc_fig, loss_fig, conf_fig, roc_fig = trainer.plot_visualizations() +``` + +Where `acc_fig` is the Accuracy Curve figure object, `loss_fig` is the Loss Curve figure object, `conf_fig` is the Confusion Matrix figure object, and `roc_fig` is the ROC Curve figure object. + + +### Standardizing based on custom schema + +For standardizing based on a custom schema, your model should be on HuggingFace. The directory structure should follow the instructions mentioned on [HuggingFace](https://huggingface.co/databio/attribute-standardizer-model6). 
+ +```python +from bedms import AttrStandardizer + +model = AttrStandardizer( + repo_id="name/of/your/hf/repo", model_name="model/name" +) +results = model.standardize(pep="geo/gse228634:default") + +print(results) #Dictionary of suggested predictions with their confidence: {'attr_1':{'prediction_1': 0.70, 'prediction_2':0.30}} +``` \ No newline at end of file diff --git a/bedms/__init__.py b/bedms/__init__.py index d0d13a5..99bc695 100644 --- a/bedms/__init__.py +++ b/bedms/__init__.py @@ -3,3 +3,6 @@ """ from .attr_standardizer import AttrStandardizer +from .train import AttrStandardizerTrainer + +__all__ = ["AttrStandardizer", "AttrStandardizerTrainer"] diff --git a/bedms/_version.py b/bedms/_version.py index 3dc1f76..d3ec452 100644 --- a/bedms/_version.py +++ b/bedms/_version.py @@ -1 +1 @@ -__version__ = "0.1.0" +__version__ = "0.2.0" diff --git a/bedms/attr_standardizer.py b/bedms/attr_standardizer.py index 6fa3b2e..c823890 100644 --- a/bedms/attr_standardizer.py +++ b/bedms/attr_standardizer.py @@ -3,44 +3,26 @@ """ import logging -from typing import Dict, Tuple, Union +import glob +import os +import yaml +from typing import Dict, Tuple, Union, Optional import pickle import peppy import torch from torch import nn import torch.nn.functional as torch_functional - +import yaml +from huggingface_hub import hf_hub_download from .const import ( AVAILABLE_SCHEMAS, CONFIDENCE_THRESHOLD, - DROPOUT_PROB, - EMBEDDING_SIZE, - HIDDEN_SIZE, - INPUT_SIZE_BOW_BEDBASE, - INPUT_SIZE_BOW_ENCODE, - INPUT_SIZE_BOW_FAIRTRACKS, - OUTPUT_SIZE_BEDBASE, - OUTPUT_SIZE_ENCODE, - OUTPUT_SIZE_FAIRTRACKS, PROJECT_NAME, SENTENCE_TRANSFORMER_MODEL, - REPO_ID, - ENCODE_VECTORIZER_FILENAME, - ENCODE_LABEL_ENCODER_FILENAME, - FAIRTRACKS_VECTORIZER_FILENAME, - FAIRTRACKS_LABEL_ENCODER_FILENAME, - BEDBASE_VECTORIZER_FILENAME, - BEDBASE_LABEL_ENCODER_FILENAME, ) from .model import BoWSTModel -from .utils import ( - data_encoding, - data_preprocessing, - fetch_from_pephub, - get_any_pep, - 
load_from_huggingface, -) -from huggingface_hub import hf_hub_download +from .utils import data_encoding, data_preprocessing, fetch_from_pephub, get_any_pep + logging.basicConfig(level=logging.INFO) logger = logging.getLogger(PROJECT_NAME) @@ -51,16 +33,27 @@ class AttrStandardizer: This is the AttrStandardizer class which holds the models for Attribute Standardization. """ - def __init__(self, schema: str, confidence: int = CONFIDENCE_THRESHOLD) -> None: + def __init__( + self, + repo_id: str, + model_name: str, + custom_param: Optional[str] = None, + confidence: int = CONFIDENCE_THRESHOLD, + ) -> None: """ Initializes the attribute standardizer with user provided schema, loads the model. - :param str schema: User provided schema, can be "ENCODE" or "FAIRTRACKS" + :param str repo_id: HuggingFace repository ID + :param str model_name: Name of the schema model + :param str custom_param: User provided config file for + custom parameters, if they choose "CUSTOM" schema. :param int confidence: Confidence threshold for the predictions. """ - self.schema = schema - self.model, self.vectorizer, self.label_encoder = self._load_model() + self.repo_id = repo_id + self.model_name = model_name self.conf_threshold = confidence + self.custom_param = custom_param + self.model, self.vectorizer, self.label_encoder = self._load_model() def _get_parameters(self) -> Tuple[int, int, int, int, int, float]: """ @@ -68,36 +61,27 @@ def _get_parameters(self) -> Tuple[int, int, int, int, int, float]: :return Tuple[int, int, int, int, int, int, float]: Tuple containing the model parameters. 
""" - if self.schema == "ENCODE": - return ( - INPUT_SIZE_BOW_ENCODE, - EMBEDDING_SIZE, - EMBEDDING_SIZE, - HIDDEN_SIZE, - OUTPUT_SIZE_ENCODE, - DROPOUT_PROB, - ) - if self.schema == "FAIRTRACKS": - return ( - INPUT_SIZE_BOW_FAIRTRACKS, - EMBEDDING_SIZE, - EMBEDDING_SIZE, - HIDDEN_SIZE, - OUTPUT_SIZE_FAIRTRACKS, - DROPOUT_PROB, - ) - if self.schema == "BEDBASE": - return ( - INPUT_SIZE_BOW_BEDBASE, - EMBEDDING_SIZE, - EMBEDDING_SIZE, - HIDDEN_SIZE, - OUTPUT_SIZE_BEDBASE, - DROPOUT_PROB, - ) - raise ValueError( - f"Schema not available: {self.schema}." - "Presently, three schemas are available: ENCODE , FAIRTRACKS, BEDBASE" + config_filename = f"config_{self.model_name}.yaml" + config_pth = hf_hub_download( + repo_id=self.repo_id, + filename=os.path.join(self.model_name, config_filename), + ) + with open(config_pth, "r") as file: + config = yaml.safe_load(file) + + input_size_bow = config["params"]["input_size_bow"] + embedding_size = config["params"]["embedding_size"] + hidden_size = config["params"]["hidden_size"] + output_size = config["params"]["output_size"] + dropout_prob = config["params"]["dropout_prob"] + + return ( + input_size_bow, + embedding_size, + embedding_size, + hidden_size, + output_size, + dropout_prob, ) def _load_model(self) -> Tuple[nn.Module, object, object]: @@ -108,63 +92,54 @@ def _load_model(self) -> Tuple[nn.Module, object, object]: :return object: The scikit learn vectorizer for bag of words encoding. :return object: Label encoder object for the labels (y). 
""" - try: - if self.schema == "ENCODE": - filename_vc = ENCODE_VECTORIZER_FILENAME - filename_lb = ENCODE_LABEL_ENCODER_FILENAME - elif self.schema == "FAIRTRACKS": - filename_vc = FAIRTRACKS_VECTORIZER_FILENAME - filename_lb = FAIRTRACKS_LABEL_ENCODER_FILENAME - elif self.schema == "BEDBASE": - filename_vc = BEDBASE_VECTORIZER_FILENAME - filename_lb = BEDBASE_LABEL_ENCODER_FILENAME - - vectorizer = None - label_encoder = None - - vc_path = hf_hub_download( - repo_id=REPO_ID, - filename=filename_vc, - ) + model_filename = f"model_{self.model_name}.pth" + label_encoder_filename = f"label_encoder_{self.model_name}.pkl" + vectorizer_filename = f"vectorizer_{self.model_name}.pkl" - with open(vc_path, "rb") as f: - vectorizer = pickle.load(f) + model_pth = hf_hub_download( + repo_id=self.repo_id, filename=os.path.join(self.model_name, model_filename) + ) - lb_path = hf_hub_download( - repo_id=REPO_ID, - filename=filename_lb, - ) + vc_path = hf_hub_download( + repo_id=self.repo_id, + filename=os.path.join(self.model_name, vectorizer_filename), + ) - with open(lb_path, "rb") as f: - label_encoder = pickle.load(f) + lb_path = hf_hub_download( + repo_id=self.repo_id, + filename=os.path.join(self.model_name, label_encoder_filename), + ) - model = load_from_huggingface(self.schema) - state_dict = torch.load(model) + with open(vc_path, "rb") as f: + vectorizer = pickle.load(f) + + with open(lb_path, "rb") as f: + label_encoder = pickle.load(f) + + state_dict = torch.load(model_pth) + + ( + input_size_values, + input_size_values_embeddings, + input_size_headers, + hidden_size, + output_size, + dropout_prob, + ) = self._get_parameters() + + model = BoWSTModel( + input_size_values, + input_size_values_embeddings, + input_size_headers, + hidden_size, + output_size, + dropout_prob, + ) - ( - input_size_values, - input_size_values_embeddings, - input_size_headers, - hidden_size, - output_size, - dropout_prob, - ) = self._get_parameters() - - model = BoWSTModel( - input_size_values, 
- input_size_values_embeddings, - input_size_headers, - hidden_size, - output_size, - dropout_prob, - ) - model.load_state_dict(state_dict) - model.eval() - return model, vectorizer, label_encoder + model.load_state_dict(state_dict) + model.eval() - except Exception as e: - logger.error(f"Error loading the model: {str(e)}") - raise + return model, vectorizer, label_encoder def standardize( self, pep: Union[str, peppy.Project] diff --git a/bedms/const.py b/bedms/const.py index 86916c6..c36f5f4 100644 --- a/bedms/const.py +++ b/bedms/const.py @@ -2,29 +2,10 @@ This module contains constant values used in the 'bedms' package. """ -PROJECT_NAME = "bedmess" +PROJECT_NAME = "bedms" -AVAILABLE_SCHEMAS = ["ENCODE", "FAIRTRACKS", "BEDBASE"] +AVAILABLE_SCHEMAS = ["ENCODE", "FAIRTRACKS", "BEDBASE", "CUSTOM"] PEP_FILE_TYPES = ["yaml", "csv"] -REPO_ID = "databio/attribute-standardizer-model6" -MODEL_ENCODE = "model_encode.pth" -MODEL_FAIRTRACKS = "model_fairtracks.pth" -MODEL_BEDBASE = "model_bedbase.pth" -ENCODE_VECTORIZER_FILENAME = "vectorizer_encode.pkl" -FAIRTRACKS_VECTORIZER_FILENAME = "vectorizer_fairtracks.pkl" -BEDBASE_VECTORIZER_FILENAME = "vectorizer_bedbase.pkl" -ENCODE_LABEL_ENCODER_FILENAME = "label_encoder_encode.pkl" -FAIRTRACKS_LABEL_ENCODER_FILENAME = "label_encoder_fairtracks.pkl" -BEDBASE_LABEL_ENCODER_FILENAME = "label_encoder_bedbase.pkl" SENTENCE_TRANSFORMER_MODEL = "all-MiniLM-L6-v2" -HIDDEN_SIZE = 32 -DROPOUT_PROB = 0.113 CONFIDENCE_THRESHOLD = 0.70 -EMBEDDING_SIZE = 384 -INPUT_SIZE_BOW_ENCODE = 10459 -INPUT_SIZE_BOW_FAIRTRACKS = 13617 -OUTPUT_SIZE_FAIRTRACKS = 15 -OUTPUT_SIZE_ENCODE = 18 NUM_CLUSTERS = 3 -INPUT_SIZE_BOW_BEDBASE = 13708 -OUTPUT_SIZE_BEDBASE = 12 diff --git a/bedms/train.py b/bedms/train.py new file mode 100644 index 0000000..b7a5c77 --- /dev/null +++ b/bedms/train.py @@ -0,0 +1,296 @@ +""" This is the training script with which the user can train their own models.""" + +import logging +import torch +from torch import nn +from torch 
import optim +from torch.utils.data import DataLoader +from sklearn.metrics import ( + precision_score, + recall_score, + f1_score, +) +from sklearn.preprocessing import LabelEncoder +from sklearn.feature_extraction.text import CountVectorizer +import matplotlib.pyplot as plt +from typing import List, Dict, Tuple +import yaml +from .utils_train import ( + load_training_files_from_dir, + accumulate_data, + training_encoding, + data_loader, + train_model, + plot_learning_curve, + model_testing, + plot_confusion_matrix, + auc_roc_curve, +) +from .const import PROJECT_NAME +from .model import BoWSTModel + + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(PROJECT_NAME) + + +class AttrStandardizerTrainer: + """ + This is the training class responsible for + managing the training process for the standardizer model. + """ + + def __init__(self, config: str) -> None: + """ + Initializes the TrainStandardizer object with the given configuration. + + :param str config: Path to the config file which has the training parameters provided by the user. 
+ """ + self.label_encoder: LabelEncoder = None + self.vectorizer: CountVectorizer = None + self.train_loader: DataLoader = None + self.val_loader: DataLoader = None + self.test_loader: DataLoader = None + self.output_size: int = 0 + self.criterion: nn.Module = None + self.train_accuracies: List[float] = [] + self.val_accuracies: List[float] = [] + self.train_losses: List[float] = [] + self.val_losses: List[float] = [] + self.model: BoWSTModel = None + self.fpr: Dict[int, float] = {} + self.tpr: Dict[int, float] = {} + self.roc_auc: Dict[int, float] = {} + self.all_labels: List[int] = [] + self.all_preds: List[int] = [] + + with open(config, "r") as file: + self.config = yaml.safe_load(file) + + def load_data( + self, + ) -> Tuple[ + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + LabelEncoder, + CountVectorizer, + ]: + """ + Loads and prepares the encoded training, testing and validation datasets. 
+ :return Tuple[ + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + LabelEncoder, + CountVectorizer]: A tuple containing: + - training dataset tensor + - validation dataset tensor + - testing dataset tensor + - label encoder + - bag of words vectorizer + """ + values_files_list = load_training_files_from_dir( + self.config["dataset"]["values_dir_pth"] + ) + headers_files_list = load_training_files_from_dir( + self.config["dataset"]["headers_dir_pth"] + ) + + if len(values_files_list) != len(headers_files_list): + logger.error( + f"Mismatch in number of value files ({len(values_files_list)}) \ + and header files ({len(headers_files_list)})" + ) + return + + total_files = len(values_files_list) + + paired_files = list(zip(values_files_list, headers_files_list)) + + train_size = self.config["data_split"]["train_set"] + test_size = self.config["data_split"]["test_set"] + val_size = self.config["data_split"]["val_set"] + + if train_size + val_size + test_size > total_files: + logger.error( + f"Data split sizes exceed total number of files: " + f"train({train_size}) + val({val_size}) + \ + test({test_size}) > total_files({total_files})" + ) + return + + train_files = paired_files[:train_size] + val_files = paired_files[train_size : train_size + val_size] + test_files = paired_files[ + train_size + val_size : train_size + val_size + test_size + ] + + logger.info(f"Training on {len(train_files)} file sets") + logger.info(f"Validating on {len(val_files)} file sets") + logger.info(f"Testing on {len(test_files)} file sets") + + x_values_train_list, x_headers_train_list, y_train_list = accumulate_data( + train_files + ) + x_values_test_list, x_headers_test_list, y_test_list = accumulate_data( + test_files + ) + x_values_val_list, x_headers_val_list, y_val_list = accumulate_data(val_files) + + logger.info("Accumulation Done.") + + 
num_cluster = self.config["training"]["num_cluster"] + vectorizer_pth = self.config["training"]["vectorizer_pth"] + label_encoder_pth = self.config["training"]["label_encoder_pth"] + sentence_transformer_model = self.config["training"][ + "sentence_transformer_model" + ] + + ( + train_encoded_data, + test_encoded_data, + val_encoded_data, + self.label_encoder, + self.vectorizer, + ) = training_encoding( + x_values_train_list, + x_headers_train_list, + y_train_list, + x_values_test_list, + x_headers_test_list, + y_test_list, + x_values_val_list, + x_headers_val_list, + y_val_list, + num_cluster, + vectorizer_pth, + label_encoder_pth, + sentence_transformer_model, + ) + logger.info("Encoding Done.") + + batch_size = self.config["training"]["batch_size"] + self.train_loader = data_loader(train_encoded_data, batch_size) + self.test_loader = data_loader(test_encoded_data, batch_size) + self.val_loader = data_loader(val_encoded_data, batch_size) + + logger.info("Loading Done.") + + return ( + train_encoded_data, + val_encoded_data, + test_encoded_data, + self.label_encoder, + self.vectorizer, + ) + + def train(self) -> None: + """ + Trains the model. 
+ """ + input_size_values = len(self.vectorizer.vocabulary_) + input_size_values_embeddings = self.config["training"]["embedding_size"] + input_size_headers = self.config["training"]["embedding_size"] + hidden_size = self.config["model"]["hidden_size"] + self.output_size = len(self.label_encoder.classes_) # Number of classes + dropout_prob = self.config["model"]["dropout_prob"] + + self.model = BoWSTModel( + input_size_values, + input_size_values_embeddings, + input_size_headers, + hidden_size, + self.output_size, + dropout_prob, + ) + + learning_rate = self.config["training"]["learning_rate"] + self.criterion = nn.CrossEntropyLoss() + l2_reg_lambda = self.config["training"]["l2_regularization"] + optimizer = optim.Adam( + self.model.parameters(), lr=learning_rate, weight_decay=l2_reg_lambda + ) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.model.to(self.device) + + # Training the model + num_epochs = self.config["training"]["num_epochs"] + + model_pth = self.config["training"]["model_pth"] + bow_drops = self.config["training"]["bow_drops"] + + ( + self.train_accuracies, + self.val_accuracies, + self.train_losses, + self.val_losses, + self.fpr, + self.tpr, + self.roc_auc, + ) = train_model( + self.model, + self.train_loader, + self.val_loader, + self.criterion, + optimizer, + self.device, + num_epochs, + self.output_size, + model_pth, + bow_drops, + ) + + logger.info("Training Done.") + + def test(self) -> Dict[str, float]: + """ + Model testing. 
+ + :return Dict[str, float]: Precision, Recall, and F1 values + """ + self.all_preds, self.all_labels = model_testing( + self.model, self.device, self.test_loader, self.criterion + ) + precision = precision_score(self.all_labels, self.all_preds, average="macro") + recall = recall_score(self.all_labels, self.all_preds, average="macro") + f1 = f1_score(self.all_labels, self.all_preds, average="macro") + logger.info(f"Precision:{precision}, Recall: {recall}, F1 Score: {f1}") + return {"precision": precision, "recall": recall, "f1": f1} + + def plot_visualizations( + self, + ) -> Tuple[plt.Figure, plt.Figure, plt.Figure, plt.Figure]: + """ + Generates visualizations for training ( accuracy and loss curves) + and testing( confusion matrix, roc curve) + + :return Tuple[plt.Figure, plt.Figure, plt.Figure, plt.Figure]: + A Tuple containing: + - accuracy figure + - loss figure + - confusion matrix figure + - ROC curve figure + """ + num_epochs = self.config["training"]["num_epochs"] + accuracy_fig_pth = self.config["visualization"]["accuracy_fig_pth"] + loss_fig_pth = self.config["visualization"]["loss_fig_pth"] + cm_pth = self.config["visualization"]["confusion_matrix_fig_pth"] + roc_pth = self.config["visualization"]["roc_fig_pth"] + acc_fig, loss_fig = plot_learning_curve( + num_epochs, + self.train_accuracies, + self.val_accuracies, + self.train_losses, + self.val_losses, + accuracy_fig_pth, + loss_fig_pth, + ) + conf_fig = plot_confusion_matrix( + self.all_labels, self.all_preds, self.label_encoder.classes_, cm_pth + ) + roc_fig = auc_roc_curve( + self.fpr, self.tpr, self.roc_auc, self.output_size, roc_pth + ) + + return acc_fig, loss_fig, conf_fig, roc_fig diff --git a/bedms/utils.py b/bedms/utils.py index 0dcb613..20e7128 100644 --- a/bedms/utils.py +++ b/bedms/utils.py @@ -2,6 +2,7 @@ This module has all util functions for 'bedms' """ +import logging import warnings from collections import Counter from typing import Any, List, Optional, Tuple, Union @@ -15,16 
+16,15 @@ from sentence_transformers import SentenceTransformer from sklearn.cluster import KMeans from sklearn.preprocessing import LabelEncoder - from .const import ( - MODEL_BEDBASE, - MODEL_ENCODE, - MODEL_FAIRTRACKS, NUM_CLUSTERS, - REPO_ID, PEP_FILE_TYPES, + PROJECT_NAME, ) +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(PROJECT_NAME) + # TODO : convert to single np array before converting to tensor warnings.filterwarnings( "ignore", @@ -46,22 +46,6 @@ def fetch_from_pephub(project: peppy.Project) -> pd.DataFrame: return csv_file_df -def load_from_huggingface(schema: str) -> Optional[Any]: - """ - Load a model from HuggingFace based on the schema of choice. - - :param str schema: Schema Type - :return Optional[Any]: Loaded model object - """ - if schema == "ENCODE": - model = hf_hub_download(repo_id=REPO_ID, filename=MODEL_ENCODE) - elif schema == "FAIRTRACKS": - model = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FAIRTRACKS) - elif schema == "BEDBASE": - model = hf_hub_download(repo_id=REPO_ID, filename=MODEL_BEDBASE) - return model - - def data_preprocessing( df: pd.DataFrame, ) -> Tuple[List[List[str]], List[str], List[List[str]], int]: diff --git a/bedms/utils_train.py b/bedms/utils_train.py new file mode 100644 index 0000000..9661988 --- /dev/null +++ b/bedms/utils_train.py @@ -0,0 +1,707 @@ +""" +This module has all training util functions for 'bedms' +""" + +import os +import logging +from glob import glob +import warnings +from collections import Counter +from typing import List, Tuple, Iterator, Dict +import pickle +import random + + +import numpy as np +import pandas as pd +import torch +from torch.utils.data import TensorDataset, DataLoader +from sentence_transformers import SentenceTransformer +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.cluster import KMeans +from sklearn.preprocessing import LabelEncoder, label_binarize +from sklearn.metrics import ( + confusion_matrix, + auc, + roc_curve, 
+) +import matplotlib.pyplot as plt +import seaborn as sns +from .const import PROJECT_NAME + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(PROJECT_NAME) + +warnings.filterwarnings( + "ignore", + category=UserWarning, + message="Creating a tensor from a list of numpy.ndarrays is extremely slow.", +) + + +def load_training_files_from_dir(dir: str) -> List[str]: + """ + Loads each file from the directory path. + + :param str dir: Path to the directory. + :return: List:paths to each file in the directory. + """ + return glob(os.path.join(dir, "*.csv")) + + +def load_and_preprocess_files(file_path: str) -> pd.DataFrame: + """ + Loads and Preprocesses each csv file as a Pandas DataFrame. + + :param str file_path: Path to each csv file. + :return pandas.DataFrame: df of each csv file. + """ + df = pd.read_csv(file_path, sep=",") + df.replace("NA", np.nan, inplace=True) + for column in df.columns: + most_common_val = df[column].mode().iloc[0] + df[column] = df[column].fillna(most_common_val) + return df + + +def accumulate_data( + files: List[Tuple[str, str]] +) -> Tuple[List[List[List[str]]], List[List[List[str]]], List[pd.Index]]: + """ + Accumulates data from multiple files into lists. + + :param List[Tuple[str, str]] files: List containing + sublists of values or header files. + :return Tuple[List[List[List[str]]], List[List[List[str]]],[List[str]]: + Lists of values, headers, labels. + A tuple containing three lists: + - A nested list of values (list of tables where + each table is a list of lists for columns), + - A nested list of headers (similar structure to values), + - A list of Pandas Index objects containing column labels. 
+ """ + x_values_list = [] + x_headers_list = [] + y_list = [] + for values_file, headers_file in files: + df_values = load_and_preprocess_files(values_file) + df_headers = load_and_preprocess_files(headers_file) + df_values = df_values.fillna("") + df_headers = df_headers.fillna("") + y = df_values.columns + table_list = [] + # values list + for col in df_values.columns: + sublist_list = df_values[col].tolist() + table_list.append(sublist_list) + x_values_list.append(table_list) + # headers list + table_list = [] + for col in df_headers.columns: + sublist_list = df_headers[col].tolist() + table_list.append(sublist_list) + x_headers_list.append(table_list) + # y list + y_list.append(y) + + return x_values_list, x_headers_list, y_list + + +def lazy_loading(data_list: List, batch_size: int) -> Iterator[List]: + """ + Lazy loading for data in batches. + + :param List data_list: List of data to be loaded lazily. + :param int batch_size: Size of batch. + """ + for i in range(0, len(data_list), batch_size): + yield data_list[i : i + batch_size] + + +def get_top_training_cluster_averaged( + embeddings: List[torch.tensor], num: int +) -> torch.Tensor: + """ + Computes the clutser-averaged top training embeddings using k-means clustering. + + :param List[torch.tensor] embeddings: List of embedding tensors to cluster. + :param int num: Number of clusters to be created using k-means. + :return torch.Tensor: A tensor representing the + average of embeddings in the most common cluster. 
+ """ + embeddings_list = [embedding.tolist() for embedding in embeddings] + kmeans = KMeans(n_clusters=num, random_state=0).fit(embeddings_list) + labels_kmeans = kmeans.labels_ + cluster_counts = Counter(labels_kmeans) + most_common_cluster = max(cluster_counts, key=cluster_counts.get) + most_common_indices = [ + idx for idx, label in enumerate(labels_kmeans) if label == most_common_cluster + ] + most_common_embeddings = [ + torch.tensor(embeddings[idx]) for idx in most_common_indices + ] + + if most_common_embeddings: + top_k_average = torch.mean( + torch.stack(most_common_embeddings), dim=0 + ).unsqueeze(0) + else: + top_k_average = torch.zeros_like(most_common_embeddings[0]).unsqueeze(0) + return top_k_average + + +def training_encoding( + x_values_train_list: List[List[List[str]]], + x_headers_train_list: List[List[List[str]]], + y_train_list: List[pd.Index], + x_values_test_list: List[List[List[str]]], + x_headers_test_list: List[List[List[str]]], + y_test_list: List[pd.Index], + x_values_val_list: List[List[List[str]]], + x_headers_val_list: List[List[List[str]]], + y_val_list: List[pd.Index], + num_cluster: int, + vectorizer_pth: str, + label_encoder_pth: str, + sentence_transformer_model: str, +) -> Tuple[ + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + LabelEncoder, + List[str], + CountVectorizer, +]: + """ + Generates encoded headers and values. + + :param List[List[List[str]]] x_values_train_list: + Nested list containing the training set for values. + :param List[List[List[str]]] x_headers_train_list: + Nested list containing the training set for headers. + :param List[pd.Index] y_train_list: + List of the column labels ( attributes) for training. + :param List[List[List[str]]] x_values_test_list: + Nested list containing the testing set for values. 
+ :param List[List[List[str]]] x_headers_test_list: + Nested list containing the testing set for headers. + :param List[pd.Index] y_test_list: + List of the column labels ( attributes) for testing. + :param List[List[List[str]]] x_values_val_list: + Nested list containing the validation set for values. + :param List[List[List[str]]] x_headers_val_list: + Nested list containing the validation set for headers. + :param List[pd.Index] y_val_list: + List of the column labels ( attributes) for validation. + :return Tuple[ + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + LabelEncoder, + CountVectorizer]: Returns a tuple of + - training dataset tensor + - testing dataset tensor + - validation dataset tensor + - trained label encoder + - Trained vectorizer for Bag of Words representation + + """ + # Bag of Words + flattened_list = [ + item for sublist in x_values_train_list for col in sublist for item in col + ] + vectorizer = CountVectorizer() + vectorizer.fit(flattened_list) + with open(vectorizer_pth, "wb") as f: + pickle.dump(vectorizer, f) + vocabulary_size = len(vectorizer.vocabulary_) + logger.info(f"Vocabulary size: {vocabulary_size}") + + # Sentence Transformers + model_name = sentence_transformer_model + sentence_encoder = SentenceTransformer(model_name) + + # Label Encoders + label_encoder = LabelEncoder() + flat_y_train = [",".join(y) for y in y_train_list] + individual_values = [value.strip() for y in flat_y_train for value in y.split(",")] + unique_values = set(individual_values) + unique_values_list = list(unique_values) + label_encoder.fit(unique_values_list) + + with open(label_encoder_pth, "wb") as f: + pickle.dump(label_encoder, f) + + def encode_data( + x_values_list: List[List[List[str]]], + x_headers_list: List[List[List[str]]], + y_list: List[pd.Index], + num_cluster: int, + ) -> Tuple[torch.Tensor, 
torch.Tensor, torch.Tensor, torch.Tensor]: + """ + This nested function encodes the values, headers and labels data. + It is called for thrice - training, testing, validation. + + :param List[List[List[str]]] x_values_list: Nested list containing values. + :param List[List[List[str]]] x_headers_list: Nested list containing headers. + :param List[pd.Index] y_list: Labels (attributes) list. + :param int num_cluster: Number of clusters to be generated. + """ + x_values_bow_tensors = [] + x_values_embeddings_tensors = [] + x_headers_embeddings_tensors = [] + y_tensors = [] + + for x_values, x_headers, y in zip(x_values_list, x_headers_list, y_list): + + for i in range(len(x_values)): # Iterate over columns + # BoW Representation + x_values_bow = vectorizer.transform(x_values[i]).toarray() + x_values_bow_tensor = ( + torch.tensor(x_values_bow, dtype=torch.float32) + .mean(dim=0) + .unsqueeze(0) + .clone() + .detach() + ) + + # Embeddings for Values + embeddings_values = [ + sentence_encoder.encode(str(value), show_progress_bar=False) + for value in x_values[i] + ] + + top_k_average_values = get_top_training_cluster_averaged( + embeddings_values, num_cluster + ) # Average of all embeddings + x_values_embeddings_tensor = top_k_average_values.clone().detach() + + # Embeddings for Headers + embeddings_headers = [ + sentence_encoder.encode(str(header), show_progress_bar=False) + for header in x_headers[i] + ] + + top_k_average_headers = get_top_training_cluster_averaged( + embeddings_headers, num_cluster + ) # Average of all embeddings + x_headers_embeddings_tensor = top_k_average_headers.clone().detach() + + # Labels + y_col = label_encoder.transform([y[i]]) + y_col_tensor = torch.tensor(y_col, dtype=torch.long).clone().detach() + + x_values_bow_tensors.append(x_values_bow_tensor) + x_values_embeddings_tensors.append(x_values_embeddings_tensor) + x_headers_embeddings_tensors.append(x_headers_embeddings_tensor) + y_tensors.append(y_col_tensor) + + x_values_bow_tensor = 
def data_loader(
    encoded_data: Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor],
    batch_size: int,
    shuffle: bool = True,
) -> DataLoader:
    """
    Creates a DataLoader from encoded tensor data.

    :param Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] encoded_data:
        Tuple containing tensors for
        values bag of words, values embeddings, headers embeddings, and labels
        (in that order).
    :param int batch_size: The number of samples per batch for the DataLoader.
    :param bool shuffle: Whether the DataLoader reshuffles the samples on every
        pass. Defaults to True, which is what this function always did; pass
        False for validation/test loaders when a stable order is wanted.
    :return DataLoader: A PyTorch DataLoader which yields
        batches of data from the given tensors. Every batch is a 4-tuple
        in the same order as `encoded_data`.
    """
    # Unpacking into exactly four names fails early on a malformed tuple
    # instead of failing later inside a training loop.
    (
        x_values_bow_tensor,
        x_values_embeddings_tensor,
        x_headers_embeddings_tensor,
        y_tensor,
    ) = encoded_data
    # Convert data to TensorDataset
    dataset = TensorDataset(
        x_values_bow_tensor,
        x_values_embeddings_tensor,
        x_headers_embeddings_tensor,
        y_tensor,
    )
    # Create DataLoader
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
def drop_bow(bow_tensor: torch.Tensor, num_drops: int) -> torch.Tensor:
    """
    Randomly drops a specified number of columns in the
    Bag of Words tensor for regularization.

    The entries along the first dimension of `bow_tensor` are the units that
    get dropped: the selected entries are set to zero in a copy, the input
    tensor itself is never modified.

    :param torch.Tensor bow_tensor: Bag of Words tensor.
    :param int num_drops: Number of columns to be randomly
        dropped from the Bag of Words tensor. Values larger than the number
        of available entries are clamped (so a short final batch does not
        raise); zero or negative values drop nothing.
    :return torch.Tensor: Bag of Words tensor with dropped columns.
    """
    num_columns = bow_tensor.size(0)
    dropped_bow_tensor = bow_tensor.clone()

    # random.sample raises ValueError when asked for more items than exist,
    # which happens whenever a batch is smaller than the configured drop count.
    num_drops = min(num_drops, num_columns)
    if num_drops <= 0:
        return dropped_bow_tensor

    columns_to_drop = random.sample(range(num_columns), num_drops)

    # Indexing along dim 0 zeroes the selected entries for any tensor rank.
    dropped_bow_tensor[columns_to_drop] = 0.0

    return dropped_bow_tensor
def train_model(
    model: torch.nn.Module,
    train_loader: DataLoader,
    val_loader: DataLoader,
    criterion: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    device: torch.device,
    num_epochs: int,
    output_size: int,
    model_pth: str,
    bow_drops: int,
) -> Tuple[
    List[float],
    List[float],
    List[float],
    List[float],
    Dict[int, np.ndarray],
    Dict[int, np.ndarray],
    Dict[int, float],
]:
    """
    Trains and validates the neural network model.

    Every epoch runs one pass over `train_loader` (with Bag of Words dropping
    applied to each batch) followed by one pass over `val_loader`. The model
    weights are written to `model_pth` each time the validation loss improves.
    ROC data is computed from the validation outputs of the last epoch run.

    :param torch.nn.Module model: The neural network model to be trained.
    :param DataLoader train_loader: DataLoader for the training set.
    :param DataLoader val_loader: DataLoader for the validation set.
    :param torch.nn.Module criterion: The loss function used to compute loss during training.
    :param torch.optim.Optimizer optimizer: The optimizer to update the model parameters.
    :param torch.device device: The device (CPU or GPU) on which the model will be trained.
    :param int num_epochs: The number of epochs to train the model.
    :param int output_size: The size of the model's output layer.
    :param str model_pth: The file path to where the model would be saved.
    :param int bow_drops: The number of Bag of Words columns to be dropped.
    :return Tuple:
        - List[float]: Train accuracy per epoch.
        - List[float]: Validation accuracy per epoch.
        - List[float]: Train loss per epoch.
        - List[float]: Validation loss per epoch.
        - Dict[int, np.ndarray]: Dictionary of False Positive Rates (FPR).
        - Dict[int, np.ndarray]: Dictionary of True Positive Rates (TPR).
        - Dict[int, float]: Dictionary of Area Under the ROC Curve for different classes.
    """
    patience = 3
    train_accuracies = []
    val_accuracies = []
    train_losses = []
    val_losses = []

    best_val_loss = float("inf")
    best_epoch = 0
    early_stop = False

    for epoch in range(num_epochs):
        # The validation pass at the end of each epoch switches the model to
        # eval mode, so training mode must be restored here on every epoch.
        model.train()

        total_samples = 0
        correct_predictions = 0
        train_loss = 0.0
        for x_values_bow, x_values_embeddings, x_headers_embeddings, y in train_loader:
            x_values_bow = x_values_bow.to(device)
            x_values_embeddings = x_values_embeddings.to(device)
            x_headers_embeddings = x_headers_embeddings.to(device)
            y = y.to(device)

            x_values_bow = drop_bow(x_values_bow, bow_drops)

            optimizer.zero_grad()
            outputs = model(x_values_bow, x_values_embeddings, x_headers_embeddings)

            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
            # Weight the batch loss by the batch size so that the division by
            # the dataset size below gives a per-sample average.
            train_loss += loss.item() * x_values_bow.size(0)

            _, predicted = torch.max(outputs, 1)
            total_samples += y.size(0)
            correct_predictions += (predicted == y).sum().item()

        train_accuracy = correct_predictions / total_samples * 100
        train_accuracies.append(train_accuracy)
        train_loss = train_loss / len(train_loader.dataset)
        train_losses.append(train_loss)

        model.eval()
        val_loss = 0.0
        correct_predictions_val = 0
        total_samples_val = 0
        # Reset per epoch: only the last epoch's validation outputs feed the ROC.
        y_true = []
        y_scores = []
        with torch.no_grad():
            for (
                x_values_bow,
                x_values_embeddings,
                x_headers_embeddings,
                y,
            ) in val_loader:
                x_values_bow = x_values_bow.to(device)
                x_values_embeddings = x_values_embeddings.to(device)
                x_headers_embeddings = x_headers_embeddings.to(device)
                y = y.to(device)
                outputs = model(x_values_bow, x_values_embeddings, x_headers_embeddings)
                loss = criterion(outputs, y)
                val_loss += loss.item() * x_values_bow.size(0)

                _, predicted = torch.max(outputs, 1)
                total_samples_val += y.size(0)
                correct_predictions_val += (predicted == y).sum().item()
                y_true.extend(y.cpu().numpy())
                y_scores.extend(outputs.cpu().numpy())

        val_loss = val_loss / len(val_loader.dataset)
        val_accuracy = correct_predictions_val / total_samples_val * 100
        val_accuracies.append(val_accuracy)
        val_losses.append(val_loss)

        # Adjacent f-strings instead of a backslash continuation, which would
        # embed the source indentation in the printed message.
        print(
            f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {train_loss:.4f}, "
            f"Training Accuracy: {train_accuracy:.2f}%, "
            f"Validation Loss: {val_loss:.4f}, "
            f"Validation Accuracy: {val_accuracy:.2f}%"
        )

        # Early stop
        # NOTE(review): the flag only reports, it does not interrupt the loop,
        # so all per-epoch lists keep exactly `num_epochs` entries. Callers that
        # plot these lists against `num_epochs` rely on that; confirm with them
        # before adding a `break`. The checkpoint at `model_pth` is always the
        # best-validation-loss model regardless.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_epoch = epoch
            torch.save(model.state_dict(), model_pth)
        elif epoch - best_epoch >= patience:
            early_stop = True
        if early_stop:
            print(f"Early stop at {best_epoch + 1} epoch.")

    y_true = label_binarize(y_true, classes=list(range(output_size)))

    y_true = np.array(y_true)
    y_scores = np.array(y_scores)

    # For a two-class problem label_binarize yields a single column (the
    # indicator of the second class); expand it so that every class has its
    # own column for the per-class loop below.
    if output_size == 2 and y_true.ndim == 2 and y_true.shape[1] == 1:
        y_true = np.hstack([1 - y_true, y_true])

    # Calculate ROC curves and AUC
    fpr = {}
    tpr = {}
    roc_auc = {}

    for i in range(output_size):
        fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_scores[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    return train_accuracies, val_accuracies, train_losses, val_losses, fpr, tpr, roc_auc
def model_testing(
    model: torch.nn.Module,
    device: torch.device,
    test_loader: DataLoader,
    loss_fn: torch.nn.Module,
) -> Tuple[List[int], List[int]]:
    """
    This function tests the model.

    Runs the model over `test_loader` in eval mode without gradients and logs
    the overall accuracy and the per-sample average loss.

    :param torch.nn.Module model: The trained model.
    :param torch.device device: The device the batches are moved to before
        being passed to the model.
    :param DataLoader test_loader: DataLoader for the testing set.
    :param torch.nn.Module loss_fn: The loss function used to compute loss.
    :return Tuple:
        - List[int]: List of all the predictions made by the model.
        - List[int]: List of all the true labels ( Ground truth)
    """
    all_preds = []
    all_labels = []
    model.eval()
    total_loss_test = 0.0
    total_correct_test = 0
    total_samples_test = 0
    with torch.no_grad():
        # Batches are forwarded to the model positionally, in the order the
        # loader yields them; the names follow the order used by the training
        # loop (bag of words, values embeddings, headers embeddings, labels).
        for bow_batch, values_batch, headers_batch, labels in test_loader:
            bow_batch = bow_batch.to(device)
            values_batch = values_batch.to(device)
            headers_batch = headers_batch.to(device)
            labels = labels.to(device)
            outputs = model(bow_batch, values_batch, headers_batch)
            loss = loss_fn(outputs, labels)
            # Weight by batch size so a short final batch does not skew the
            # average (assumes loss_fn returns the batch mean, as the training
            # loop does).
            total_loss_test += loss.item() * labels.size(0)
            _, predicted_test = torch.max(outputs, 1)
            correct_test = (predicted_test == labels).sum().item()
            total_correct_test += correct_test
            total_samples_test += labels.size(0)
            all_preds.extend(predicted_test.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    if total_samples_test == 0:
        # Nothing to average over; avoid a ZeroDivisionError on an empty loader.
        logger.warning("Test loader yielded no samples; no test metrics computed.")
        return all_preds, all_labels

    test_accuracy = total_correct_test / total_samples_test
    test_loss = total_loss_test / total_samples_test
    logger.info(f"Test Accuracy: {test_accuracy}, Test Loss: {test_loss}")

    return all_preds, all_labels
def _plot_metric_curve(
    train_series: List[float],
    val_series: List[float],
    metric_name: str,
    fig_pth: str,
) -> plt.Figure:
    """Draws one training-vs-validation curve, saves it as SVG and returns its Figure."""
    fig, ax = plt.subplots()
    # The x axis follows the number of recorded epochs of each series, so the
    # plot cannot fail on a length mismatch with the configured epoch count.
    ax.plot(
        range(1, len(train_series) + 1), train_series, label=f"Training {metric_name}"
    )
    ax.plot(
        range(1, len(val_series) + 1), val_series, label=f"Validation {metric_name}"
    )
    ax.set_xlabel("Epoch")
    ax.set_ylabel(metric_name)
    ax.set_title("Learning Curve")
    ax.legend()
    ax.grid(True)
    fig.savefig(fig_pth, format="svg")
    plt.show()
    # Closing only unregisters the figure from pyplot; the returned object
    # remains usable by the caller.
    plt.close(fig)
    return fig


def plot_learning_curve(
    num_epochs: int,
    train_accuracies: List[float],
    val_accuracies: List[float],
    train_losses: List[float],
    val_losses: List[float],
    accuracy_fig_pth: str,
    loss_fig_pth: str,
) -> Tuple[plt.Figure, plt.Figure]:
    """
    Plots the learning curves - accuracy and loss for Training and Validation of the model.

    :param int num_epochs: Number of epochs for which the model was trained.
        Kept for interface compatibility; the x axis is derived from the
        length of the recorded histories.
    :param List[float] train_accuracies: List of training accuracies for each epoch.
    :param List[float] val_accuracies: List of validation accuracies for each epoch.
    :param List[float] train_losses: List of training losses for each epoch.
    :param List[float] val_losses: List of validation losses for each epoch.
    :param str accuracy_fig_pth: Path where the accuracy curve figure will be saved.
    :param str loss_fig_pth: Path where the loss curve figure will be saved.

    :return Tuple[plt.Figure, plt.Figure]: Accuracy and Loss curves
    """
    # accuracy
    acc = _plot_metric_curve(
        train_accuracies, val_accuracies, "Accuracy", accuracy_fig_pth
    )

    # loss
    loss = _plot_metric_curve(train_losses, val_losses, "Loss", loss_fig_pth)
    return acc, loss
def plot_confusion_matrix(
    y_true: List[int],
    y_pred: List[int],
    unique_values_list: List[str],
    confusion_matrix_fig_pth: str,
) -> plt.Figure:
    """
    Plots confusion matrix for the test data.

    Also prints the accuracy of every class (`nan` for classes that have no
    true samples in `y_true`).

    :param List[int] y_true: List of true labels ( Ground Truth)
    :param List[int] y_pred: List of predictions made by the model.
    :param List[str] unique_values_list: List of all the classes that the model predicted.
    :param str confusion_matrix_fig_pth: Path where the confusion matrix figure will be saved.

    :return plt.Figure: Confusion matrix figure
    """
    class_names = np.unique(unique_values_list)
    # Fix the label set to one row/column per class so the matrix always lines
    # up with the tick labels, even when some class never occurs in the test
    # data. Assumes integer label i corresponds to the i-th sorted class name
    # - TODO confirm against the label encoder used by the caller.
    labels = list(range(len(class_names))) or None
    conf_matrix = confusion_matrix(y_true, y_pred, labels=labels)

    fig = plt.figure(figsize=(12, 12))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.savefig(confusion_matrix_fig_pth, format="svg")
    plt.show()
    plt.close()

    # Rows without any true sample would divide by zero; report them as nan.
    row_totals = conf_matrix.sum(axis=1)
    class_accuracy = np.divide(
        conf_matrix.diagonal(),
        row_totals,
        out=np.full(len(row_totals), np.nan),
        where=row_totals > 0,
    )
    for i, acc in enumerate(class_accuracy):
        print(f"Accuracy for class {i}: {acc:.4f}")
    return fig
def auc_roc_curve(
    fpr: Dict[int, np.ndarray],
    tpr: Dict[int, np.ndarray],
    roc_auc: Dict[int, float],
    output_size: int,
    roc_fig_pth: str,
) -> plt.Figure:
    """
    Plots the ROC Curve.

    One curve is drawn per class index in `range(output_size)`, together with
    the dashed diagonal of a random classifier; the figure is saved as SVG.

    :param Dict[int, np.ndarray] fpr: Dictionary of False Positive Rates
    :param Dict[int, np.ndarray] tpr: Dictionary of True Positive Rates
    :param Dict[int, float] roc_auc: Dictionary of Area Under Curve for ROC for different classes.
    :param int output_size: The number of classes the model predicted into.
    :param str roc_fig_pth: Path to where the ROC figure will be saved.

    :return plt.Figure: Figure for the ROC Curve.
    """
    roc_figure = plt.figure(figsize=(12, 12))

    # One ROC curve per class, labelled with its AUC.
    class_idx = 0
    while class_idx < output_size:
        curve_label = "ROC curve (class %d) (AUC = %0.2f)" % (
            class_idx,
            roc_auc[class_idx],
        )
        plt.plot(fpr[class_idx], tpr[class_idx], lw=2, label=curve_label)
        class_idx += 1

    # Chance-level reference line.
    diagonal = [0, 1]
    plt.plot(diagonal, diagonal, color="navy", lw=2, linestyle="--")

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver Operating Characteristic (ROC) Curve")
    plt.legend(loc="lower right")
    plt.savefig(roc_fig_pth, format="svg")
    return roc_figure
+## [0.2.0] - 2024-12-03 +### Added +- Added generic way of initialization of all schemas +- Added TrainStandardizer module for custom model creation + +### Fixed +- Fixed Typo in README #23 + ## [0.1.0] - 2024-09-16 ### Added -- initial project release \ No newline at end of file +- initial project release diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 3f373a4..daeede2 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -4,3 +4,5 @@ torch sentence-transformers pephubclient>=0.4.2 peppy>=0.40.6 +matplotlib +seaborn \ No newline at end of file diff --git a/tests/test_bedms.py b/tests/test_bedms.py index a47dfb1..fc654a1 100755 --- a/tests/test_bedms.py +++ b/tests/test_bedms.py @@ -3,7 +3,9 @@ class TestBEDMES: def test_bedmes(self): - model = AttrStandardizer("ENCODE") + model = AttrStandardizer( + repo_id="databio/attribute-standardizer-model6", model_name="encode" + ) results = model.standardize(pep="geo/gse228634:default") assert results diff --git a/training_config.yaml b/training_config.yaml new file mode 100644 index 0000000..75910d2 --- /dev/null +++ b/training_config.yaml @@ -0,0 +1,31 @@ +dataset: + values_dir_pth: "/path/to/training/values/directory" #Path to the values directory + headers_dir_pth: "path/to/training/headers/directory" #Path to the attributes directory + +data_split: + train_set: 8000 #Number of csv value-attribute file pairs for training set + test_set: 100 #Number of csv value-attribute file pairs for testing set + val_set: 100 #Number of csv value-attribute file pairs for validation set + +model: + hidden_size: 32 #Hidden size for training the model + dropout_prob: 0.113 #Dropout probability for training the model + +training: + batch_size: 32 #Batch size for training + num_epochs: 20 #Number of training epochs + learning_rate: 0.001 #Learning rate of the model + l2_regularization: 0.001 #L2 regularization strength applied to the optimizer (Avoids 
overfitting, can be set to 0) + model_pth: "path/to/custom/trained/model.pth" #Path to where you want to save the custom model + num_cluster: 3 #Number of clusters for KMeans + vectorizer_pth: "path/to/vectorizer.pkl" #Path to where you want to save the Bag of Words vectorizer + label_encoder_pth: "/path/to/label_encoder.pkl" #Path to where you want to save the Label Encoder + sentence_transformer_model: "all-MiniLM-L6-v2" #Name of the sentence transformer model you wish to use from HuggingFace + embedding_size: 384 #Dimensionality of the embedding produced by the chosen sentence transformer + bow_drops: 2 #Number of Bag of Words columns you wish to drop out during the training process (Avoids overfitting, can be set to 0) + +visualization: + accuracy_fig_pth: "/path/to/accuracy_fig.svg" #Path to where you wish to save the Accuracy Curve image + loss_fig_pth: "/path/to/loss_fig.svg" #Path to where you wish to save the Loss Curve image + confusion_matrix_fig_pth: "/path/to/confusion_matrix.svg" #Path to where you wish to save the confusion matrix image + roc_fig_pth: "/path/to/roc_fig.svg" #Path to where you wish to save the ROC curve image \ No newline at end of file