From 5a721157477d50ae631bb3545fe95c2c9147a7e1 Mon Sep 17 00:00:00 2001
From: Khoroshevskyi <sasha99250@gmail.com>
Date: Wed, 28 Aug 2024 16:26:54 -0400
Subject: [PATCH 1/2] fixed #16

---
 README.md                                     |  3 +-
 attribute_standardizer/__init__.py            |  4 +-
 ...dardizer_class.py => attr_standardizer.py} | 55 ++++++++++---------
 attribute_standardizer/utils.py               | 24 +++++++-
 requirements/requirements-all.txt             |  4 +-
 scripts/model1.py                             | 13 +++--
 trial.py                                      |  4 +-
 7 files changed, 65 insertions(+), 42 deletions(-)
 rename attribute_standardizer/{attr_standardizer_class.py => attr_standardizer.py} (84%)

diff --git a/README.md b/README.md
index 062f222..33800c5 100644
--- a/README.md
+++ b/README.md
@@ -20,9 +20,10 @@ Using Python, this is how you can run `attribute_standardizer` and print the res
 
 
 ```
-from attribute_standardizer.attr_standardizer_class import AttrStandardizer
+from attribute_standardizer import AttrStandardizer
 
 model = AttrStandardizer("ENCODE")
+# Or, to use the FAIRTRACKS schema instead: model = AttrStandardizer("FAIRTRACKS")
 
 results = model.standardize(pep ="geo/gse178283:default")
 
diff --git a/attribute_standardizer/__init__.py b/attribute_standardizer/__init__.py
index 6e82403..374c0be 100644
--- a/attribute_standardizer/__init__.py
+++ b/attribute_standardizer/__init__.py
@@ -1,3 +1 @@
-# from .attribute_standardizer import attr_standardizer
-
-from .attr_standardizer_class import AttrStandardizer
+from .attr_standardizer import AttrStandardizer
diff --git a/attribute_standardizer/attr_standardizer_class.py b/attribute_standardizer/attr_standardizer.py
similarity index 84%
rename from attribute_standardizer/attr_standardizer_class.py
rename to attribute_standardizer/attr_standardizer.py
index db36fc2..7f76024 100644
--- a/attribute_standardizer/attr_standardizer_class.py
+++ b/attribute_standardizer/attr_standardizer.py
@@ -1,12 +1,19 @@
-# TODO take the pep object as input, add a function for that and then add the present fetch_from_pep as the wrapper
-# TODO use the peppy constructor to take the Peppy.Project object -   prj = peppy.Project(pep)
-
-import pandas as pd
-import numpy as np
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
+import torch.nn.functional as torch_functional
 import logging
+import peppy
+
+from typing import Dict, Tuple, Union
+
+from .model import BoWSTModel
+from .utils import (
+    fetch_from_pephub,
+    load_from_huggingface,
+    data_preprocessing,
+    data_encoding,
+    get_any_pep,
+)
 from .const import (
     HIDDEN_SIZE,
     DROPOUT_PROB,
@@ -21,33 +28,25 @@
     OUTPUT_SIZE_BEDBASE,
 )
 
-from .utils import (
-    fetch_from_pephub,
-    load_from_huggingface,
-    data_preprocessing,
-    data_encoding,
-)
-from .model import BoWSTModel
-from huggingface_hub import hf_hub_download
-from typing import Dict, List, Tuple, Any, Union
-
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
 class AttrStandardizer:
-    def __init__(self, schema: str) -> None:
+    def __init__(self, schema: str, confidence: float = CONFIDENCE_THRESHOLD) -> None:
         """
         Initializes the attribute standardizer with user provided schema, loads the model.
 
         :param str schema: User provided schema, can be "ENCODE" or "FAIRTRACKS"
+        :param float confidence: Minimum probability a prediction must reach to be suggested.
         """
         self.schema = schema
         self.model = self._load_model()
+        self.conf_threshold = confidence
 
     def _get_parameters(self) -> Tuple[int, int, int, int, int, float]:
         """
-        Gets the model parameters as per the chosen schema.
+        Get the model parameters as per the chosen schema.
 
         :return Tuple[int, int, int, int, int, int, float]: Tuple containing the model parameters.
         """
@@ -118,16 +117,22 @@ def _load_model(self) -> nn.Module:
             logger.error(f"Error loading the model: {str(e)}")
             raise
 
-    def standardize(self, pep: str) -> Dict[str, Dict[str, float]]:
+    def standardize(
+        self, pep: Union[str, peppy.Project]
+    ) -> Dict[str, Dict[str, float]]:
         """
         Fetches the user provided PEP from the PEPHub registry path, returns the predictions.
 
-        :param str pep: User provided path to the PEP.
+        :param Union[str, peppy.Project] pep: peppy.Project object, path to a local PEP file, or PEPHub registry path.
         :return Dict[str, Dict[str, float]]: Suggestions to the user.
         """
-        if not pep:
+        if isinstance(pep, str):
+            pep = get_any_pep(pep)
+        elif isinstance(pep, peppy.Project):
+            pass
+        else:
             raise ValueError(
-                "PEP path is missing or empty. Please provide the PEPHub registry path to PEP"
+                f"PEP should be either a path to PEPHub registry or peppy.Project object."
             )
         try:
             csv_file = fetch_from_pephub(pep)
@@ -153,7 +158,7 @@ def standardize(self, pep: str) -> Dict[str, Dict[str, float]]:
                     X_values_embeddings_tensor,
                     X_headers_embeddings_tensor,
                 )
-                probabilities = F.softmax(outputs, dim=1)
+                probabilities = torch_functional.softmax(outputs, dim=1)
                 # confidence, predicted = torch.max(probabilities, 1)
 
                 values, indices = torch.topk(probabilities, k=3, dim=1)
@@ -167,11 +172,11 @@ def standardize(self, pep: str) -> Dict[str, Dict[str, float]]:
                 suggestions = {}
             for i, category in enumerate(X_headers_st):
                 category_suggestions = {}
-                if top_confidences[i][0] >= CONFIDENCE_THRESHOLD:
+                if top_confidences[i][0] >= self.conf_threshold:
                     for j in range(3):
                         prediction = decoded_predictions[i][j]
                         probability = top_confidences[i][j]
-                        if probability >= CONFIDENCE_THRESHOLD:
+                        if probability >= self.conf_threshold:
                             category_suggestions[prediction] = probability
                         else:
                             break
diff --git a/attribute_standardizer/utils.py b/attribute_standardizer/utils.py
index 8922941..ef45bb0 100644
--- a/attribute_standardizer/utils.py
+++ b/attribute_standardizer/utils.py
@@ -9,8 +9,10 @@
 from sklearn.cluster import KMeans
 from collections import Counter
 from huggingface_hub import hf_hub_download
-from sklearn.metrics import silhouette_score
 from typing import Optional, Any, List, Tuple, Union
+import warnings
+import peppy
+
 from .const import (
     REPO_ID,
     MODEL_ENCODE,
@@ -22,10 +24,8 @@
     FAIRTRACKS_VECTORIZER_FILENAME,
     BEDBASE_VECTORIZER_FILENAME,
     BEDBASE_LABEL_ENCODER_FILENAME,
-    SENTENCE_TRANSFORMER_MODEL,
     NUM_CLUSTERS,
 )
-import warnings
 
 
 # TODO : convert to single np array before converting to tensor
@@ -255,3 +255,21 @@ def data_encoding(
         X_values_bow_tensor,
         label_encoder,
     )
+
+
+def get_any_pep(pep: str) -> peppy.Project:
+    """
+    Load a PEP from a local file or from PEPhub.
+
+    :param str pep: Path to a local PEP file (.yaml or .csv) or PEPhub registry path.
+    :return peppy.Project: Project built from the local file or fetched from PEPhub.
+    """
+    # Match real file extensions only: a bare "csv"/"yaml" suffix test would
+    # misroute registry paths whose name or tag merely ends with those letters
+    # (e.g. "databio/mycsv") to the local-file loader.
+    pep_file_types = (".yaml", ".csv")
+
+    # str.endswith accepts a tuple, so one call checks every extension.
+    if pep.endswith(pep_file_types):
+        return peppy.Project(pep)
+    return peppy.Project.from_pephub(pep)
diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt
index 6642681..848e7e8 100644
--- a/requirements/requirements-all.txt
+++ b/requirements/requirements-all.txt
@@ -2,5 +2,5 @@ pandas
 numpy
 torch
 sentence-transformers
-pephubclient
-
+pephubclient>=0.4.2
+peppy>=0.40.5
diff --git a/scripts/model1.py b/scripts/model1.py
index bef41fb..0118add 100644
--- a/scripts/model1.py
+++ b/scripts/model1.py
@@ -29,7 +29,8 @@
 
 
 class NN1(nn.Module):
-    """ Simple Neural Network with a single Hidden Layer."""
+    """Simple Neural Network with a single Hidden Layer."""
+
     def __init__(self, input_size, hidden_size, output_size):
         """
         Initializes the NN1 model.
@@ -45,7 +46,7 @@ def __init__(self, input_size, hidden_size, output_size):
 
     def forward(self, x):
         """
-        Defines the forward pass of the neural network. 
+        Defines the forward pass of the neural network.
 
         :param torch.Tensor x: Input tensor.
         :return torch.Tensor: Output tensor after passing through the network.
@@ -86,14 +87,14 @@ def data_split(df_values):
         df_values_temp, test_size=0.5, random_state=42
     )
 
-    #Snippet for testing on unseen data 
+    # Snippet for testing on unseen data
     """
     df_values_test = pd.read_csv(
         "/home/saanika/curation/scripts/bedmess_archive/data/encode_metadata_values_moderate.csv",
         sep=",",
     )
     """
-    #Comment out the above for training on seen data. 
+    # Comment out the above for training on seen data.
 
     X_values_train = [
         df_values_train[column].astype(str).tolist()
@@ -135,9 +136,9 @@ def data_split(df_values):
 
 def encoding(X_values_train, X_values_test, X_values_val, y_train, y_test, y_val):
     """
-    Encodes the values for the model. 
+    Encodes the values for the model.
 
-    :param list X_values_train: Training features. 
+    :param list X_values_train: Training features.
     :param list X_values_test: Testing features.
     :param list X_values_val: Validation features.
     :param list y_train: Training labels.
diff --git a/trial.py b/trial.py
index cfa1a8b..88a257e 100644
--- a/trial.py
+++ b/trial.py
@@ -1,7 +1,7 @@
-from attribute_standardizer.attr_standardizer_class import AttrStandardizer
+from attribute_standardizer.attr_standardizer import AttrStandardizer
 
 model = AttrStandardizer("ENCODE")
 
-results = model.standardize(pep ="geo/gse178283:default")
+results = model.standardize(pep="geo/gse178283:default")
 
 print(results)

From 82940db3f893ceeaf39bd7385e243a0680825821 Mon Sep 17 00:00:00 2001
From: Khoroshevskyi <sasha99250@gmail.com>
Date: Wed, 28 Aug 2024 16:38:52 -0400
Subject: [PATCH 2/2] cleaning

---
 attribute_standardizer/attr_standardizer.py | 47 +++++++++++----------
 attribute_standardizer/const.py             |  2 +
 attribute_standardizer/utils.py             | 44 +++++++++----------
 3 files changed, 45 insertions(+), 48 deletions(-)

diff --git a/attribute_standardizer/attr_standardizer.py b/attribute_standardizer/attr_standardizer.py
index 7f76024..f29adc6 100644
--- a/attribute_standardizer/attr_standardizer.py
+++ b/attribute_standardizer/attr_standardizer.py
@@ -1,35 +1,36 @@
+import logging
+from typing import Dict, Tuple, Union
+
+import peppy
 import torch
 import torch.nn as nn
 import torch.nn.functional as torch_functional
-import logging
-import peppy
-
-from typing import Dict, Tuple, Union
 
-from .model import BoWSTModel
-from .utils import (
-    fetch_from_pephub,
-    load_from_huggingface,
-    data_preprocessing,
-    data_encoding,
-    get_any_pep,
-)
 from .const import (
-    HIDDEN_SIZE,
-    DROPOUT_PROB,
     CONFIDENCE_THRESHOLD,
+    DROPOUT_PROB,
     EMBEDDING_SIZE,
-    SENTENCE_TRANSFORMER_MODEL,
-    INPUT_SIZE_BOW_FAIRTRACKS,
+    HIDDEN_SIZE,
+    INPUT_SIZE_BOW_BEDBASE,
     INPUT_SIZE_BOW_ENCODE,
+    INPUT_SIZE_BOW_FAIRTRACKS,
+    OUTPUT_SIZE_BEDBASE,
     OUTPUT_SIZE_ENCODE,
     OUTPUT_SIZE_FAIRTRACKS,
-    INPUT_SIZE_BOW_BEDBASE,
-    OUTPUT_SIZE_BEDBASE,
+    SENTENCE_TRANSFORMER_MODEL,
+    PROJECT_NAME,
+)
+from .model import BoWSTModel
+from .utils import (
+    data_encoding,
+    data_preprocessing,
+    fetch_from_pephub,
+    get_any_pep,
+    load_from_huggingface,
 )
 
 logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
+logger = logging.getLogger(PROJECT_NAME)
 
 
 class AttrStandardizer:
@@ -132,11 +133,11 @@ def standardize(
             pass
         else:
             raise ValueError(
-                f"PEP should be either a path to PEPHub registry or peppy.Project object."
+                "PEP should be either a path to PEPHub registry or peppy.Project object."
             )
         try:
             csv_file = fetch_from_pephub(pep)
-            schema = self.schema
+
             X_values_st, X_headers_st, X_values_bow = data_preprocessing(csv_file)
             (
                 X_headers_embeddings_tensor,
@@ -147,9 +148,10 @@ def standardize(
                 X_values_st,
                 X_headers_st,
                 X_values_bow,
-                schema,
+                self.schema,
                 model_name=SENTENCE_TRANSFORMER_MODEL,
             )
+
             logger.info("Data Preprocessing completed.")
 
             with torch.no_grad():
@@ -159,7 +161,6 @@ def standardize(
                     X_headers_embeddings_tensor,
                 )
                 probabilities = torch_functional.softmax(outputs, dim=1)
-                # confidence, predicted = torch.max(probabilities, 1)
 
                 values, indices = torch.topk(probabilities, k=3, dim=1)
                 top_preds = indices.tolist()
diff --git a/attribute_standardizer/const.py b/attribute_standardizer/const.py
index 460abfa..54e9b06 100644
--- a/attribute_standardizer/const.py
+++ b/attribute_standardizer/const.py
@@ -1,3 +1,5 @@
+PROJECT_NAME = "bedmess"
+
 REPO_ID = "databio/attribute-standardizer-model6"
 MODEL_ENCODE = "model_encode.pth"
 MODEL_FAIRTRACKS = "model_fairtracks.pth"
diff --git a/attribute_standardizer/utils.py b/attribute_standardizer/utils.py
index ef45bb0..8e798ba 100644
--- a/attribute_standardizer/utils.py
+++ b/attribute_standardizer/utils.py
@@ -1,33 +1,33 @@
-import pandas as pd
+import pickle
+import warnings
+from collections import Counter
+from typing import Any, List, Optional, Tuple, Union
+
 import numpy as np
+import pandas as pd
+import peppy
 import torch
+from huggingface_hub import hf_hub_download
 from pephubclient import PEPHubClient
 from sentence_transformers import SentenceTransformer
-import pickle
-from sklearn.preprocessing import LabelEncoder
-from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.cluster import KMeans
-from collections import Counter
-from huggingface_hub import hf_hub_download
-from typing import Optional, Any, List, Tuple, Union
-import warnings
-import peppy
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.preprocessing import LabelEncoder
 
 from .const import (
-    REPO_ID,
-    MODEL_ENCODE,
-    MODEL_FAIRTRACKS,
-    MODEL_BEDBASE,
+    BEDBASE_LABEL_ENCODER_FILENAME,
+    BEDBASE_VECTORIZER_FILENAME,
     ENCODE_LABEL_ENCODER_FILENAME,
-    FAIRTRACKS_LABEL_ENCODER_FILENAME,
     ENCODE_VECTORIZER_FILENAME,
+    FAIRTRACKS_LABEL_ENCODER_FILENAME,
     FAIRTRACKS_VECTORIZER_FILENAME,
-    BEDBASE_VECTORIZER_FILENAME,
-    BEDBASE_LABEL_ENCODER_FILENAME,
+    MODEL_BEDBASE,
+    MODEL_ENCODE,
+    MODEL_FAIRTRACKS,
     NUM_CLUSTERS,
+    REPO_ID,
 )
 
-
 # TODO : convert to single np array before converting to tensor
 warnings.filterwarnings(
     "ignore",
@@ -36,20 +36,14 @@
 )
 
 
-def fetch_pep(pep):
-    # input of python object of peppy.Project and output of csv_fle_df
-    raise NotImplementedError
-
-
-def fetch_from_pephub(pep: str) -> pd.DataFrame:
+def fetch_from_pephub(project: peppy.Project) -> pd.DataFrame:
     """
-    Fetches metadata from PEPhub registry.
+    Return the sample table of a PEP as a pandas DataFrame.
 
-    :param str pep: Path to the PEPhub registry containing the metadata csv file
-    :return pd.DataFrame: path to the CSV file on the local system.
+    :param peppy.Project project: Already-loaded PEP whose sample table is extracted.
+    :return pd.DataFrame: Sample table (metadata) of the project.
     """
-    phc = PEPHubClient()
-    project = phc.load_project(pep)
+
     sample_table = project.sample_table
     csv_file_df = pd.DataFrame(sample_table)
     return csv_file_df