Skip to content

Commit

Permalink
reviewer request changes
Browse files Browse the repository at this point in the history
  • Loading branch information
saanikat committed Oct 15, 2024
1 parent 732b505 commit dfa1a02
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 54 deletions.
13 changes: 8 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,27 +46,30 @@ trainer = AttrStandardizerTrainer("training_config.yaml")
To load the datasets and encode them:

```python
trainer.load_encode_data()
train_data, val_data, test_data, label_encoder, vectorizer = trainer.load_data()
```

To train the custom model:

```python
trainer.training()
trainer.train()
```

To test the custom model:

```python
trainer.testing()
test_results_dict = trainer.test()
```

To generate visualizations such as Learning Curves, Confusion Matrices, and ROC Curve:

```python
trainer.plot_visualizations()
acc_fig, loss_fig, conf_fig, roc_fig = trainer.plot_visualizations()
```

Where `acc_fig` is the Accuracy Curve figure object, `loss_fig` is the Loss Curve figure object, `conf_fig` is the Confusion Matrix figure object, and `roc_fig` is the ROC Curve figure object.


### Standardizing based on custom schema

To standardize based on a custom schema, your model should be hosted on HuggingFace. The directory structure should follow the instructions given on [HuggingFace](https://huggingface.co/databio/attribute-standardizer-model6).
Expand All @@ -79,5 +82,5 @@ model = AttrStandardizer(
)
results = model.standardize(pep="geo/gse228634:default")

assert results
print(results) #Dictionary of suggested predictions with their confidence: {'attr_1':{'prediction_1': 0.70, 'prediction_2':0.30}}
```
118 changes: 86 additions & 32 deletions bedms/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,19 @@
import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
from sklearn.metrics import (
precision_score,
recall_score,
f1_score,
)
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from typing import List, Dict, Tuple
import yaml
from .utils_train import (
load_from_dir,
load_training_files_from_dir,
accumulate_data,
training_encoding,
data_loader,
Expand Down Expand Up @@ -41,33 +46,56 @@ def __init__(self, config: str) -> None:
:param str config: Path to the config file which has the training parameters provided by the user.
"""
self.label_encoder = None
self.vectorizer = None
self.train_loader = None
self.val_loader = None
self.test_loader = None
self.output_size = None
self.criterion = None
self.train_accuracies = None
self.val_accuracies = None
self.train_losses = None
self.val_losses = None
self.model = None
self.fpr = None
self.tpr = None
self.roc_auc = None
self.all_labels = None
self.all_preds = None
self.label_encoder: LabelEncoder = None
self.vectorizer: CountVectorizer = None
self.train_loader: DataLoader = None
self.val_loader: DataLoader = None
self.test_loader: DataLoader = None
self.output_size: int = 0
self.criterion: nn.Module = None
self.train_accuracies: List[float] = []
self.val_accuracies: List[float] = []
self.train_losses: List[float] = []
self.val_losses: List[float] = []
self.model: BoWSTModel = None
self.fpr: Dict[int, float] = {}
self.tpr: Dict[int, float] = {}
self.roc_auc: Dict[int, float] = {}
self.all_labels: List[int] = []
self.all_preds: List[int] = []

with open(config, "r") as file:
self.config = yaml.safe_load(file)

def load_encode_data(self) -> None:
def load_data(
self,
) -> Tuple[
Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor],
Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor],
Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor],
LabelEncoder,
CountVectorizer,
]:
"""
Loads and prepares the encoded training, testing and validation datasets.
:return Tuple[
Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor],
Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor],
Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor],
LabelEncoder,
CountVectorizer]: A tuple containing:
- tuple of training dataset tensors
- tuple of validation dataset tensors
- tuple of testing dataset tensors
- label encoder
- bag of words vectorizer
"""
values_files_list = load_from_dir(self.config["dataset"]["values_dir_pth"])
headers_files_list = load_from_dir(self.config["dataset"]["headers_dir_pth"])
values_files_list = load_training_files_from_dir(
self.config["dataset"]["values_dir_pth"]
)
headers_files_list = load_training_files_from_dir(
self.config["dataset"]["headers_dir_pth"]
)

if len(values_files_list) != len(headers_files_list):
logger.error(
Expand Down Expand Up @@ -149,13 +177,21 @@ def load_encode_data(self) -> None:

logger.info("Loading Done.")

def training(self):
return (
train_encoded_data,
val_encoded_data,
test_encoded_data,
self.label_encoder,
self.vectorizer,
)

def train(self) -> None:
"""
Trains the model.
"""
input_size_values = len(self.vectorizer.vocabulary_)
input_size_values_embeddings = EMBEDDING_SIZE
input_size_headers = EMBEDDING_SIZE
input_size_values_embeddings = self.config["training"]["embedding_size"]
input_size_headers = self.config["training"]["embedding_size"]
hidden_size = self.config["model"]["hidden_size"]
self.output_size = len(self.label_encoder.classes_) # Number of classes
dropout_prob = self.config["model"]["dropout_prob"]
Expand All @@ -175,7 +211,9 @@ def training(self):
optimizer = optim.Adam(
self.model.parameters(), lr=learning_rate, weight_decay=l2_reg_lambda
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.model.to(self.device)

# Training the model
num_epochs = self.config["training"]["num_epochs"]

Expand All @@ -196,7 +234,7 @@ def training(self):
self.val_loader,
self.criterion,
optimizer,
device,
self.device,
num_epochs,
self.output_size,
model_pth,
Expand All @@ -205,29 +243,41 @@ def training(self):

logger.info("Training Done.")

def testing(self):
def test(self) -> Dict[str, float]:
"""
Model testing.
:return Dict[str, float]: Precision, Recall, and F1 values
"""
self.all_preds, self.all_labels = model_testing(
self.model, self.test_loader, self.criterion
self.model, self.device, self.test_loader, self.criterion
)
precision = precision_score(self.all_labels, self.all_preds, average="macro")
recall = recall_score(self.all_labels, self.all_preds, average="macro")
f1 = f1_score(self.all_labels, self.all_preds, average="macro")
logger.info(f"Precision:{precision}, Recall: {recall}, F1 Score: {f1}")
return {"precision": precision, "recall": recall, "f1": f1}

def plot_visualizations(self):
def plot_visualizations(
self,
) -> Tuple[plt.Figure, plt.Figure, plt.Figure, plt.Figure]:
"""
Generates visualizations for training (accuracy and loss curves)
and testing (confusion matrix, ROC curve).
:return Tuple[plt.Figure, plt.Figure, plt.Figure, plt.Figure]:
A Tuple containing:
- accuracy figure
- loss figure
- confusion matrix figure
- ROC curve figure
"""
num_epochs = self.config["training"]["num_epochs"]
accuracy_fig_pth = self.config["visualization"]["accuracy_fig_pth"]
loss_fig_pth = self.config["visualization"]["loss_fig_pth"]
cm_pth = self.config["visualization"]["confusion_matrix_fig_pth"]
roc_pth = self.config["visualization"]["roc_fig_pth"]
plot_learning_curve(
acc_fig, loss_fig = plot_learning_curve(
num_epochs,
self.train_accuracies,
self.val_accuracies,
Expand All @@ -236,7 +286,11 @@ def plot_visualizations(self):
accuracy_fig_pth,
loss_fig_pth,
)
plot_confusion_matrix(
conf_fig = plot_confusion_matrix(
self.all_labels, self.all_preds, self.label_encoder.classes_, cm_pth
)
auc_roc_curve(self.fpr, self.tpr, self.roc_auc, self.output_size, roc_pth)
roc_fig = auc_roc_curve(
self.fpr, self.tpr, self.roc_auc, self.output_size, roc_pth
)

return acc_fig, loss_fig, conf_fig, roc_fig
Loading

0 comments on commit dfa1a02

Please sign in to comment.