Skip to content

Commit

Permalink
minor changes to trial.py
Browse files Browse the repository at this point in the history
  • Loading branch information
saanikat committed Aug 30, 2024
2 parents f9bdffe + 36d9783 commit 04024cd
Show file tree
Hide file tree
Showing 8 changed files with 92 additions and 72 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@ Using Python, this is how you can run `attribute_standardizer` and print the res


```
from attribute_standardizer.attr_standardizer_class import AttrStandardizer
from attribute_standardizer import AttrStandardizer
model = AttrStandardizer("ENCODE")
model = AttrStandardizer("FAIRTRACKS")
results = model.standardize(pep ="geo/gse178283:default")
Expand Down
4 changes: 1 addition & 3 deletions attribute_standardizer/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1 @@
# from .attribute_standardizer import attr_standardizer

from .attr_standardizer_class import AttrStandardizer
from .attr_standardizer import AttrStandardizer
Original file line number Diff line number Diff line change
@@ -1,53 +1,53 @@
# TODO take the pep object as input, add a function for that and then add the present fetch_from_pep as the wrapper
# TODO use the peppy constructor to take the Peppy.Project object - prj = peppy.Project(pep)
import logging
from typing import Dict, Tuple, Union

import pandas as pd
import numpy as np
import peppy
import torch
import torch.nn as nn
import torch.nn.functional as F
import logging
import torch.nn.functional as torch_functional

from .const import (
HIDDEN_SIZE,
DROPOUT_PROB,
CONFIDENCE_THRESHOLD,
DROPOUT_PROB,
EMBEDDING_SIZE,
SENTENCE_TRANSFORMER_MODEL,
INPUT_SIZE_BOW_FAIRTRACKS,
HIDDEN_SIZE,
INPUT_SIZE_BOW_BEDBASE,
INPUT_SIZE_BOW_ENCODE,
INPUT_SIZE_BOW_FAIRTRACKS,
OUTPUT_SIZE_BEDBASE,
OUTPUT_SIZE_ENCODE,
OUTPUT_SIZE_FAIRTRACKS,
INPUT_SIZE_BOW_BEDBASE,
OUTPUT_SIZE_BEDBASE,
SENTENCE_TRANSFORMER_MODEL,
PROJECT_NAME,
)

from .model import BoWSTModel
from .utils import (
data_encoding,
data_preprocessing,
fetch_from_pephub,
get_any_pep,
load_from_huggingface,
data_preprocessing,
data_encoding,
)
from .model import BoWSTModel
from huggingface_hub import hf_hub_download
from typing import Dict, List, Tuple, Any, Union

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger = logging.getLogger(PROJECT_NAME)


class AttrStandardizer:
def __init__(self, schema: str) -> None:
def __init__(self, schema: str, confidence: int = CONFIDENCE_THRESHOLD) -> None:
"""
Initializes the attribute standardizer with user provided schema, loads the model.
:param str schema: User provided schema, can be "ENCODE" or "FAIRTRACKS"
:param int confidence: Confidence threshold for the predictions.
"""
self.schema = schema
self.model = self._load_model()
self.conf_threshold = confidence

def _get_parameters(self) -> Tuple[int, int, int, int, int, float]:
"""
Gets the model parameters as per the chosen schema.
Get the model parameters as per the chosen schema.
:return Tuple[int, int, int, int, int, int, float]: Tuple containing the model parameters.
"""
Expand Down Expand Up @@ -118,20 +118,26 @@ def _load_model(self) -> nn.Module:
logger.error(f"Error loading the model: {str(e)}")
raise

def standardize(self, pep: str) -> Dict[str, Dict[str, float]]:
def standardize(
self, pep: Union[str, peppy.Project]
) -> Dict[str, Dict[str, float]]:
"""
Fetches the user provided PEP from the PEPHub registry path, returns the predictions.
:param str pep: User provided path to the PEP.
:param Union[str, peppy.Project] pep: peppy.Project object or PEPHub registry path to PEP.
:return Dict[str, Dict[str, float]]: Suggestions to the user.
"""
if not pep:
if isinstance(pep, str):
pep = get_any_pep(pep)
elif isinstance(pep, peppy.Project):
pass
else:
raise ValueError(
"PEP path is missing or empty. Please provide the PEPHub registry path to PEP"
"PEP should be either a path to PEPHub registry or peppy.Project object."
)
try:
csv_file = fetch_from_pephub(pep)
schema = self.schema

X_values_st, X_headers_st, X_values_bow = data_preprocessing(csv_file)
(
X_headers_embeddings_tensor,
Expand All @@ -142,9 +148,10 @@ def standardize(self, pep: str) -> Dict[str, Dict[str, float]]:
X_values_st,
X_headers_st,
X_values_bow,
schema,
self.schema,
model_name=SENTENCE_TRANSFORMER_MODEL,
)

logger.info("Data Preprocessing completed.")

with torch.no_grad():
Expand All @@ -153,8 +160,7 @@ def standardize(self, pep: str) -> Dict[str, Dict[str, float]]:
X_values_embeddings_tensor,
X_headers_embeddings_tensor,
)
probabilities = F.softmax(outputs, dim=1)
# confidence, predicted = torch.max(probabilities, 1)
probabilities = torch_functional.softmax(outputs, dim=1)

values, indices = torch.topk(probabilities, k=3, dim=1)
top_preds = indices.tolist()
Expand All @@ -167,11 +173,11 @@ def standardize(self, pep: str) -> Dict[str, Dict[str, float]]:
suggestions = {}
for i, category in enumerate(X_headers_st):
category_suggestions = {}
if top_confidences[i][0] >= CONFIDENCE_THRESHOLD:
if top_confidences[i][0] >= self.conf_threshold:
for j in range(3):
prediction = decoded_predictions[i][j]
probability = top_confidences[i][j]
if probability >= CONFIDENCE_THRESHOLD:
if probability >= self.conf_threshold:
category_suggestions[prediction] = probability
else:
break
Expand Down
2 changes: 2 additions & 0 deletions attribute_standardizer/const.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
PROJECT_NAME = "bedmess"

REPO_ID = "databio/attribute-standardizer-model6"
MODEL_ENCODE = "model_encode.pth"
MODEL_FAIRTRACKS = "model_fairtracks.pth"
Expand Down
64 changes: 38 additions & 26 deletions attribute_standardizer/utils.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,32 @@
import pandas as pd
import pickle
import warnings
from collections import Counter
from typing import Any, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
import peppy
import torch
from huggingface_hub import hf_hub_download
from pephubclient import PEPHubClient
from sentence_transformers import SentenceTransformer
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from collections import Counter
from huggingface_hub import hf_hub_download
from sklearn.metrics import silhouette_score
from typing import Optional, Any, List, Tuple, Union
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

from .const import (
REPO_ID,
MODEL_ENCODE,
MODEL_FAIRTRACKS,
MODEL_BEDBASE,
BEDBASE_LABEL_ENCODER_FILENAME,
BEDBASE_VECTORIZER_FILENAME,
ENCODE_LABEL_ENCODER_FILENAME,
FAIRTRACKS_LABEL_ENCODER_FILENAME,
ENCODE_VECTORIZER_FILENAME,
FAIRTRACKS_LABEL_ENCODER_FILENAME,
FAIRTRACKS_VECTORIZER_FILENAME,
BEDBASE_VECTORIZER_FILENAME,
BEDBASE_LABEL_ENCODER_FILENAME,
SENTENCE_TRANSFORMER_MODEL,
MODEL_BEDBASE,
MODEL_ENCODE,
MODEL_FAIRTRACKS,
NUM_CLUSTERS,
REPO_ID,
)
import warnings


# TODO : convert to single np array before converting to tensor
warnings.filterwarnings(
Expand All @@ -36,20 +36,14 @@
)


def fetch_pep(pep):
    """
    Placeholder: input is a peppy.Project Python object, output is csv_file_df.

    Not implemented; calling it always raises NotImplementedError.
    """
    raise NotImplementedError


def fetch_from_pephub(pep: str) -> pd.DataFrame:
def fetch_from_pephub(project: peppy.Project) -> pd.DataFrame:
"""
Fetches metadata from PEPhub registry.
:param peppy.Project project: Project object whose sample table holds the metadata.
:return pd.DataFrame: Sample table of the project as a DataFrame.
"""
phc = PEPHubClient()
project = phc.load_project(pep)

sample_table = project.sample_table
csv_file_df = pd.DataFrame(sample_table)
return csv_file_df
Expand Down Expand Up @@ -255,3 +249,21 @@ def data_encoding(
X_values_bow_tensor,
label_encoder,
)


def get_any_pep(pep: str) -> peppy.Project:
    """
    Load a PEP from the local system or from PEPhub.

    A path ending in "yaml" or "csv" is handed to the peppy.Project
    constructor; any other string is handed to peppy.Project.from_pephub.

    :param pep: Path to the PEP file or PEPhub registry path.
    :return: peppy.Project object.
    """
    local_suffixes = ("yaml", "csv")

    # str.endswith accepts a tuple and is True if any suffix matches.
    if pep.endswith(local_suffixes):
        return peppy.Project(pep)
    return peppy.Project.from_pephub(pep)
4 changes: 2 additions & 2 deletions requirements/requirements-all.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ pandas
numpy
torch
sentence-transformers
pephubclient

pephubclient>=0.4.2
peppy>=0.40.5
13 changes: 7 additions & 6 deletions scripts/model1.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@


class NN1(nn.Module):
""" Simple Neural Network with a single Hidden Layer."""
"""Simple Neural Network with a single Hidden Layer."""

def __init__(self, input_size, hidden_size, output_size):
"""
Initializes the NN1 model.
Expand All @@ -45,7 +46,7 @@ def __init__(self, input_size, hidden_size, output_size):

def forward(self, x):
"""
Defines the forward pass of the neural network.
Defines the forward pass of the neural network.
:param torch.Tensor x: Input tensor.
:return torch.Tensor: Output tensor after passing through the network.
Expand Down Expand Up @@ -86,14 +87,14 @@ def data_split(df_values):
df_values_temp, test_size=0.5, random_state=42
)

#Snippet for testing on unseen data
# Snippet for testing on unseen data
"""
df_values_test = pd.read_csv(
"/home/saanika/curation/scripts/bedmess_archive/data/encode_metadata_values_moderate.csv",
sep=",",
)
"""
#Comment out the above for training on seen data.
# Comment out the above for training on seen data.

X_values_train = [
df_values_train[column].astype(str).tolist()
Expand Down Expand Up @@ -135,9 +136,9 @@ def data_split(df_values):

def encoding(X_values_train, X_values_test, X_values_val, y_train, y_test, y_val):
"""
Encodes the values for the model.
Encodes the values for the model.
:param list X_values_train: Training features.
:param list X_values_train: Training features.
:param list X_values_test: Testing features.
:param list X_values_val: Validation features.
:param list y_train: Training labels.
Expand Down
6 changes: 3 additions & 3 deletions trial.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from attribute_standardizer.attr_standardizer_class import AttrStandardizer
from attribute_standardizer.attr_standardizer import AttrStandardizer

model = AttrStandardizer("BEDBASE")
model = AttrStandardizer("ENCODE")

results = model.standardize(pep ="geo/gse228815:default")
results = model.standardize(pep="geo/gse178283:default")

print(results)

0 comments on commit 04024cd

Please sign in to comment.