diff --git a/bedms/__init__.py b/bedms/__init__.py index cdbc2d6..d0d13a5 100644 --- a/bedms/__init__.py +++ b/bedms/__init__.py @@ -1,4 +1,5 @@ """ This module initializes 'bedms' package. """ + from .attr_standardizer import AttrStandardizer diff --git a/bedms/attr_standardizer.py b/bedms/attr_standardizer.py index c61f4cb..6fa3b2e 100644 --- a/bedms/attr_standardizer.py +++ b/bedms/attr_standardizer.py @@ -100,7 +100,7 @@ def _get_parameters(self) -> Tuple[int, int, int, int, int, float]: "Presently, three schemas are available: ENCODE , FAIRTRACKS, BEDBASE" ) - def _load_model(self) -> tuple[nn.Module, object, object]: + def _load_model(self) -> Tuple[nn.Module, object, object]: """ Calls function to load the model from HuggingFace repository load vectorizer and label encoder and sets to eval(). diff --git a/bedms/model.py b/bedms/model.py index 94bd9da..52eed64 100644 --- a/bedms/model.py +++ b/bedms/model.py @@ -20,9 +20,9 @@ def __init__( Initializes the BoWSTModel. :param int input_size_values: Size of the input for the values (BoW). - :param int inout_size_values_embeddings: Size of the input + :param int inout_size_values_embeddings: Size of the input for the values sentence transformer embeddings. - :param int input_size_headers: Size of the input + :param int input_size_headers: Size of the input for the headers with sentence transformer embeddings. :param int hidden_size: Size of the hidden layer. :param int output_size: Size of the output layer. diff --git a/bedms/utils.py b/bedms/utils.py index f27fd82..0dcb613 100644 --- a/bedms/utils.py +++ b/bedms/utils.py @@ -1,6 +1,7 @@ """ This module has all util functions for 'bedms' """ + import warnings from collections import Counter from typing import Any, List, Optional, Tuple, Union @@ -21,7 +22,7 @@ MODEL_FAIRTRACKS, NUM_CLUSTERS, REPO_ID, - PEP_FILE_TYPES + PEP_FILE_TYPES, ) # TODO : convert to single np array before converting to tensor @@ -69,10 +70,10 @@ def data_preprocessing( :param pd.DataFrame df: The input DataFrame (user chosen PEP) to preprocess. :return Tuple[List[List[str]], List[str], List[List[str]]]: - - Nested list containing the comma separated values + - Nested list containing the comma separated values in each column for sentence transformer embeddings. - List containing the headers of the DataFrame. - - Nested list containing the comma separated values + - Nested list containing the comma separated values in each column for Bag of Words encoding. - Number of rows in the metadata csv """ @@ -163,15 +164,15 @@ def data_encoding( :param object vectorizer: scikit-learn vectorizer for bag of words encoding. :param object label_encoder" Label encoder object storing labels (y) :param int num_rows: Number of rows in the sample metadata - :param list X_values_st: Nested list containing the comma separated values + :param list X_values_st: Nested list containing the comma separated values in each column for sentence transformer embeddings. :param list X_headers_st: List containing the headers of the DataFrame. - :param list X_values_bow: Nested list containing the comma separated values + :param list X_values_bow: Nested list containing the comma separated values in each column for Bag of Words encoding. :param str schema: Schema type chosen by the user for standardization. - :return Tuple[torch.Tensor, torch.Tensor, torch.Tensor, - Union[LabelEncoder, None]]: Tuple containing - torch tensors for encoded embeddings and Bag of Words representations, + :return Tuple[torch.Tensor, torch.Tensor, torch.Tensor, + Union[LabelEncoder, None]]: Tuple containing + torch tensors for encoded embeddings and Bag of Words representations, and label encoder object. """ # Sentence Transformer Model