From 5a721157477d50ae631bb3545fe95c2c9147a7e1 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 28 Aug 2024 16:26:54 -0400 Subject: [PATCH 1/2] fixed #16 --- README.md | 3 +- attribute_standardizer/__init__.py | 4 +- ...dardizer_class.py => attr_standardizer.py} | 55 ++++++++++--------- attribute_standardizer/utils.py | 24 +++++++- requirements/requirements-all.txt | 4 +- scripts/model1.py | 13 +++-- trial.py | 4 +- 7 files changed, 65 insertions(+), 42 deletions(-) rename attribute_standardizer/{attr_standardizer_class.py => attr_standardizer.py} (84%) diff --git a/README.md b/README.md index 062f222..33800c5 100644 --- a/README.md +++ b/README.md @@ -20,9 +20,10 @@ Using Python, this is how you can run `attribute_standardizer` and print the res ``` -from attribute_standardizer.attr_standardizer_class import AttrStandardizer +from attribute_standardizer import AttrStandardizer model = AttrStandardizer("ENCODE") +model = AttrStandardizer("FAIRTRACKS") results = model.standardize(pep ="geo/gse178283:default") diff --git a/attribute_standardizer/__init__.py b/attribute_standardizer/__init__.py index 6e82403..374c0be 100644 --- a/attribute_standardizer/__init__.py +++ b/attribute_standardizer/__init__.py @@ -1,3 +1 @@ -# from .attribute_standardizer import attr_standardizer - -from .attr_standardizer_class import AttrStandardizer +from .attr_standardizer import AttrStandardizer diff --git a/attribute_standardizer/attr_standardizer_class.py b/attribute_standardizer/attr_standardizer.py similarity index 84% rename from attribute_standardizer/attr_standardizer_class.py rename to attribute_standardizer/attr_standardizer.py index db36fc2..7f76024 100644 --- a/attribute_standardizer/attr_standardizer_class.py +++ b/attribute_standardizer/attr_standardizer.py @@ -1,12 +1,19 @@ -# TODO take the pep object as input, add a function for that and then add the present fetch_from_pep as the wrapper -# TODO use the peppy constructor to take the Peppy.Project object - prj = peppy.Project(pep) - -import pandas as pd -import numpy as np import torch import torch.nn as nn -import torch.nn.functional as F +import torch.nn.functional as torch_functional import logging +import peppy + +from typing import Dict, Tuple, Union + +from .model import BoWSTModel +from .utils import ( + fetch_from_pephub, + load_from_huggingface, + data_preprocessing, + data_encoding, + get_any_pep, +) from .const import ( HIDDEN_SIZE, DROPOUT_PROB, @@ -21,33 +28,25 @@ OUTPUT_SIZE_BEDBASE, ) -from .utils import ( - fetch_from_pephub, - load_from_huggingface, - data_preprocessing, - data_encoding, -) -from .model import BoWSTModel -from huggingface_hub import hf_hub_download -from typing import Dict, List, Tuple, Any, Union - logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) class AttrStandardizer: - def __init__(self, schema: str) -> None: + def __init__(self, schema: str, confidence: int = CONFIDENCE_THRESHOLD) -> None: """ Initializes the attribute standardizer with user provided schema, loads the model. :param str schema: User provided schema, can be "ENCODE" or "FAIRTRACKS" + :param int confidence: Confidence threshold for the predictions. """ self.schema = schema self.model = self._load_model() + self.conf_threshold = confidence def _get_parameters(self) -> Tuple[int, int, int, int, int, float]: """ - Gets the model parameters as per the chosen schema. + Get the model parameters as per the chosen schema. :return Tuple[int, int, int, int, int, int, float]: Tuple containing the model parameters. """ @@ -118,16 +117,22 @@ def _load_model(self) -> nn.Module: logger.error(f"Error loading the model: {str(e)}") raise - def standardize(self, pep: str) -> Dict[str, Dict[str, float]]: + def standardize( + self, pep: Union[str, peppy.Project] + ) -> Dict[str, Dict[str, float]]: """ Fetches the user provided PEP from the PEPHub registry path, returns the predictions. - :param str pep: User provided path to the PEP. + :param str pep: peppy.Project object or PEPHub registry path to PEP. :return Dict[str, Dict[str, float]]: Suggestions to the user. """ - if not pep: + if isinstance(pep, str): + pep = get_any_pep(pep) + elif isinstance(pep, peppy.Project): + pass + else: raise ValueError( - "PEP path is missing or empty. Please provide the PEPHub registry path to PEP" + f"PEP should be either a path to PEPHub registry or peppy.Project object." ) try: csv_file = fetch_from_pephub(pep) @@ -153,7 +158,7 @@ def standardize(self, pep: str) -> Dict[str, Dict[str, float]]: X_values_embeddings_tensor, X_headers_embeddings_tensor, ) - probabilities = F.softmax(outputs, dim=1) + probabilities = torch_functional.softmax(outputs, dim=1) # confidence, predicted = torch.max(probabilities, 1) values, indices = torch.topk(probabilities, k=3, dim=1) @@ -167,11 +172,11 @@ def standardize(self, pep: str) -> Dict[str, Dict[str, float]]: suggestions = {} for i, category in enumerate(X_headers_st): category_suggestions = {} - if top_confidences[i][0] >= CONFIDENCE_THRESHOLD: + if top_confidences[i][0] >= self.conf_threshold: for j in range(3): prediction = decoded_predictions[i][j] probability = top_confidences[i][j] - if probability >= CONFIDENCE_THRESHOLD: + if probability >= self.conf_threshold: category_suggestions[prediction] = probability else: break diff --git a/attribute_standardizer/utils.py b/attribute_standardizer/utils.py index 8922941..ef45bb0 100644 --- a/attribute_standardizer/utils.py +++ b/attribute_standardizer/utils.py @@ -9,8 +9,10 @@ from sklearn.cluster import KMeans from collections import Counter from huggingface_hub import hf_hub_download -from sklearn.metrics import silhouette_score from typing import Optional, Any, List, Tuple, Union +import warnings +import peppy + from .const import ( REPO_ID, MODEL_ENCODE, @@ -22,10 +24,8 @@ FAIRTRACKS_VECTORIZER_FILENAME, BEDBASE_VECTORIZER_FILENAME, BEDBASE_LABEL_ENCODER_FILENAME, - SENTENCE_TRANSFORMER_MODEL, NUM_CLUSTERS, ) -import warnings # TODO : convert to single np array before converting to tensor @@ -255,3 +255,21 @@ def data_encoding( X_values_bow_tensor, label_encoder, ) + + +def get_any_pep(pep: str) -> peppy.Project: + """ + Get the PEP file from the local system or from PEPhub. + + :param pep: Path to the PEP file or PEPhub registry path. + + :return: peppy.Project object. + """ + + PEP_FILE_TYPES = ["yaml", "csv"] + + res = list(filter(pep.endswith, PEP_FILE_TYPES)) != [] + if res: + return peppy.Project(pep) + else: + return peppy.Project.from_pephub(pep) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 6642681..848e7e8 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -2,5 +2,5 @@ pandas numpy torch sentence-transformers -pephubclient - +pephubclient>=0.4.2 +peppy>=0.40.5 diff --git a/scripts/model1.py b/scripts/model1.py index bef41fb..0118add 100644 --- a/scripts/model1.py +++ b/scripts/model1.py @@ -29,7 +29,8 @@ class NN1(nn.Module): - """ Simple Neural Network with a single Hidden Layer.""" + """Simple Neural Network with a single Hidden Layer.""" + def __init__(self, input_size, hidden_size, output_size): """ Initializes the NN1 model. @@ -45,7 +46,7 @@ def __init__(self, input_size, hidden_size, output_size): def forward(self, x): """ - Defines the forward pass of the neural network. + Defines the forward pass of the neural network. :param torch.Tensor x: Input tensor. :return torch.Tensor: Output tensor after passing through the network. @@ -86,14 +87,14 @@ def data_split(df_values): df_values_temp, test_size=0.5, random_state=42 ) - #Snippet for testing on unseen data + # Snippet for testing on unseen data """ df_values_test = pd.read_csv( "/home/saanika/curation/scripts/bedmess_archive/data/encode_metadata_values_moderate.csv", sep=",", ) """ - #Comment out the above for training on seen data. + # Comment out the above for training on seen data. X_values_train = [ df_values_train[column].astype(str).tolist() @@ -135,9 +136,9 @@ def data_split(df_values): def encoding(X_values_train, X_values_test, X_values_val, y_train, y_test, y_val): """ - Encodes the values for the model. + Encodes the values for the model. - :param list X_values_train: Training features. + :param list X_values_train: Training features. :param list X_values_test: Testing features. :param list X_values_val: Validation features. :param list y_train: Training labels. diff --git a/trial.py b/trial.py index cfa1a8b..88a257e 100644 --- a/trial.py +++ b/trial.py @@ -1,7 +1,7 @@ -from attribute_standardizer.attr_standardizer_class import AttrStandardizer +from attribute_standardizer.attr_standardizer import AttrStandardizer model = AttrStandardizer("ENCODE") -results = model.standardize(pep ="geo/gse178283:default") +results = model.standardize(pep="geo/gse178283:default") print(results) From 82940db3f893ceeaf39bd7385e243a0680825821 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 28 Aug 2024 16:38:52 -0400 Subject: [PATCH 2/2] cleaning --- attribute_standardizer/attr_standardizer.py | 47 +++++++++++---------- attribute_standardizer/const.py | 2 + attribute_standardizer/utils.py | 44 +++++++++---------- 3 files changed, 45 insertions(+), 48 deletions(-) diff --git a/attribute_standardizer/attr_standardizer.py b/attribute_standardizer/attr_standardizer.py index 7f76024..f29adc6 100644 --- a/attribute_standardizer/attr_standardizer.py +++ b/attribute_standardizer/attr_standardizer.py @@ -1,35 +1,36 @@ +import logging +from typing import Dict, Tuple, Union + +import peppy import torch import torch.nn as nn import torch.nn.functional as torch_functional -import logging -import peppy - -from typing import Dict, Tuple, Union -from .model import BoWSTModel -from .utils import ( - fetch_from_pephub, - load_from_huggingface, - data_preprocessing, - data_encoding, - get_any_pep, -) from .const import ( - HIDDEN_SIZE, - DROPOUT_PROB, CONFIDENCE_THRESHOLD, + DROPOUT_PROB, EMBEDDING_SIZE, - SENTENCE_TRANSFORMER_MODEL, - INPUT_SIZE_BOW_FAIRTRACKS, + HIDDEN_SIZE, + INPUT_SIZE_BOW_BEDBASE, INPUT_SIZE_BOW_ENCODE, + INPUT_SIZE_BOW_FAIRTRACKS, + OUTPUT_SIZE_BEDBASE, OUTPUT_SIZE_ENCODE, OUTPUT_SIZE_FAIRTRACKS, - INPUT_SIZE_BOW_BEDBASE, - OUTPUT_SIZE_BEDBASE, + SENTENCE_TRANSFORMER_MODEL, + PROJECT_NAME, +) +from .model import BoWSTModel +from .utils import ( + data_encoding, + data_preprocessing, + fetch_from_pephub, + get_any_pep, + load_from_huggingface, ) logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) +logger = logging.getLogger(PROJECT_NAME) class AttrStandardizer: @@ -132,11 +133,11 @@ def standardize( pass else: raise ValueError( - f"PEP should be either a path to PEPHub registry or peppy.Project object." + "PEP should be either a path to PEPHub registry or peppy.Project object." ) try: csv_file = fetch_from_pephub(pep) - schema = self.schema + X_values_st, X_headers_st, X_values_bow = data_preprocessing(csv_file) ( X_headers_embeddings_tensor, @@ -147,9 +148,10 @@ def standardize( X_values_st, X_headers_st, X_values_bow, - schema, + self.schema, model_name=SENTENCE_TRANSFORMER_MODEL, ) + logger.info("Data Preprocessing completed.") with torch.no_grad(): @@ -159,7 +161,6 @@ def standardize( X_headers_embeddings_tensor, ) probabilities = torch_functional.softmax(outputs, dim=1) - # confidence, predicted = torch.max(probabilities, 1) values, indices = torch.topk(probabilities, k=3, dim=1) top_preds = indices.tolist() diff --git a/attribute_standardizer/const.py b/attribute_standardizer/const.py index 460abfa..54e9b06 100644 --- a/attribute_standardizer/const.py +++ b/attribute_standardizer/const.py @@ -1,3 +1,5 @@ +PROJECT_NAME = "bedmess" + REPO_ID = "databio/attribute-standardizer-model6" MODEL_ENCODE = "model_encode.pth" MODEL_FAIRTRACKS = "model_fairtracks.pth" diff --git a/attribute_standardizer/utils.py b/attribute_standardizer/utils.py index ef45bb0..8e798ba 100644 --- a/attribute_standardizer/utils.py +++ b/attribute_standardizer/utils.py @@ -1,33 +1,33 @@ -import pandas as pd +import pickle +import warnings +from collections import Counter +from typing import Any, List, Optional, Tuple, Union + import numpy as np +import pandas as pd +import peppy import torch +from huggingface_hub import hf_hub_download from pephubclient import PEPHubClient from sentence_transformers import SentenceTransformer -import pickle -from sklearn.preprocessing import LabelEncoder -from sklearn.feature_extraction.text import CountVectorizer from sklearn.cluster import KMeans -from collections import Counter -from huggingface_hub import hf_hub_download -from typing import Optional, Any, List, Tuple, Union -import warnings -import peppy +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.preprocessing import LabelEncoder from .const import ( - REPO_ID, - MODEL_ENCODE, - MODEL_FAIRTRACKS, - MODEL_BEDBASE, + BEDBASE_LABEL_ENCODER_FILENAME, + BEDBASE_VECTORIZER_FILENAME, ENCODE_LABEL_ENCODER_FILENAME, - FAIRTRACKS_LABEL_ENCODER_FILENAME, ENCODE_VECTORIZER_FILENAME, + FAIRTRACKS_LABEL_ENCODER_FILENAME, FAIRTRACKS_VECTORIZER_FILENAME, - BEDBASE_VECTORIZER_FILENAME, - BEDBASE_LABEL_ENCODER_FILENAME, + MODEL_BEDBASE, + MODEL_ENCODE, + MODEL_FAIRTRACKS, NUM_CLUSTERS, + REPO_ID, ) - # TODO : convert to single np array before converting to tensor warnings.filterwarnings( "ignore", @@ -36,20 +36,14 @@ ) -def fetch_pep(pep): - # input of python object of peppy.Project and output of csv_fle_df - raise NotImplementedError - - -def fetch_from_pephub(pep: str) -> pd.DataFrame: +def fetch_from_pephub(project: peppy.Project) -> pd.DataFrame: """ Fetches metadata from PEPhub registry. :param str pep: Path to the PEPhub registry containing the metadata csv file :return pd.DataFrame: path to the CSV file on the local system. """ - phc = PEPHubClient() - project = phc.load_project(pep) + sample_table = project.sample_table csv_file_df = pd.DataFrame(sample_table) return csv_file_df