
Merge pull request #19 from SkywardAI/feat/log
Redesign the methods by using classmethod
Aisuko authored Apr 6, 2024
2 parents 5856521 + 7bc0dd4 commit baf353c
Showing 14 changed files with 338 additions and 147 deletions.
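The redesign drops the instantiable Auto class in favour of factory classes that are used purely through classmethods. A minimal sketch of the pattern, with illustrative names (WidgetFactory and load are not part of kimchima):

class WidgetFactory:
    """Never instantiated; construction goes through a classmethod."""
    def __init__(self):
        raise EnvironmentError("WidgetFactory is designed to be used via its classmethods.")

    @classmethod
    def load(cls, name, **kwargs):
        # Build and return the underlying object rather than a factory instance.
        return {"name": name, **kwargs}

widget = WidgetFactory.load("demo", size=3)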
30 changes: 26 additions & 4 deletions examples/examples.py
@@ -1,13 +1,35 @@
-from kimchima import Auto, get_device, get_capability
+from kimchima import (
+    ModelFactory,
+    TokenizerFactory,
+    EmbeddingsFactory
+)
 
-model = Auto(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2")
 
+from kimchima import (
+    get_device,
+    get_capability)
+
+pretrained_model_name_or_path = "sentence-transformers/all-MiniLM-L6-v2"
+
+model = ModelFactory.auto_model(pretrained_model_name_or_path=pretrained_model_name_or_path)
+tokenizer = TokenizerFactory.auto_tokenizer(pretrained_model_name_or_path=pretrained_model_name_or_path)
+
 # computing embeddings for single text
-embeddings = model.get_embeddings(text="Melbourne")
+embeddings = EmbeddingsFactory.auto_embeddings(
+    model=model,
+    tokenizer=tokenizer,
+    prompt='Melbourne',
+    device='cpu'
+)
 print(embeddings.shape)
 
 # computing embeddings for multiple texts
-embeddings = model.get_embeddings(text=["Melbourne", "Sydney"])
+embeddings = EmbeddingsFactory.auto_embeddings(
+    model=model,
+    tokenizer=tokenizer,
+    prompt=['Melbourne', 'Sydney'],
+    device='cpu'
+)
 print(embeddings.shape)
 
 # Checking the device: GPU, mps and CPU
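The truncated tail of examples.py exercises the device helpers imported above; a short sketch of that usage (outputs depend on the machine it runs on):

from kimchima import get_device, get_capability

# Returns a Devices member (GPU, mps, or CPU) for the current machine.
print(get_device())
# CUDA compute capability as a (major, minor) tuple; (0, 0) without a GPU.
print(get_capability())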
12 changes: 9 additions & 3 deletions src/kimchima/__init__.py
@@ -12,11 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__="0.2.2"
+__version__="0.2.3"
 
+from .pkg import (
+    ModelFactory,
+    TokenizerFactory,
+    EmbeddingsFactory,
+    Devices
+)
+
+
 from .pkg import (
-    Auto,
-    Devices,
     get_device,
     get_capability
 )
2 changes: 1 addition & 1 deletion src/kimchima/cmds/__init__.py
@@ -1 +1 @@
-from .auto_cli import CommandAuto
+from .auto_cli import CommandAutoModel
11 changes: 5 additions & 6 deletions src/kimchima/cmds/auto_cli.py
@@ -19,12 +19,12 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from kimchima.pkg import Auto
+from kimchima.pkg import ModelFactory
 
 
-class CommandAuto:
+class CommandAutoModel:
     """
-    A class for auto command.
+    A class for loading models.
     """
 
     @staticmethod
@@ -38,7 +38,6 @@ def auto(args):
         Returns:
            torch.tensor: The embeddings of text.
        """
-        model = Auto(model_name_or_path=args.model_name_or_path)
-        embeddings = model.get_embeddings(text=args.text)
-        print(embeddings)
+        model = ModelFactory.auto_model(pretrained_model_name_or_path=args.model_name)
+        print(model.config)
 
4 changes: 2 additions & 2 deletions src/kimchima/cmds/kimchima_cli.py
@@ -14,7 +14,7 @@
 
 import argparse
 
-from kimchima.cmds.auto_cli import CommandAuto
+from kimchima.cmds.auto_cli import CommandAutoModel
 
 
 def main():
@@ -31,7 +31,7 @@ def main():
     parser_auto=subparsers.add_parser("auto", help="auto help")
     parser_auto.add_argument("model_name_or_path", default="sentence-transformers/all-MiniLM-L6-v2", help="model name or path")
     parser_auto.add_argument("text", help="text str or list of text str")
-    parser_auto.set_defaults(func=CommandAuto.auto)
+    parser_auto.set_defaults(func=CommandAutoModel.auto)
 
     args = parser.parse_args()
     args.func(args)
20 changes: 19 additions & 1 deletion src/kimchima/pkg/__init__.py
@@ -1,4 +1,22 @@
-from .auto import Auto
+# coding=utf-8
+# Copyright [2024] [SkywardAI]
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from .model_factory import ModelFactory
+from .tokenizer_factory import TokenizerFactory
+from .embeddings import EmbeddingsFactory
+
 from .devices import (
     Devices,
     get_device,
86 changes: 0 additions & 86 deletions src/kimchima/pkg/auto.py

This file was deleted.

4 changes: 2 additions & 2 deletions src/kimchima/pkg/devices.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright [2024] [Aisuko]
+# Copyright [2024] [SkywardAI]
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -50,4 +50,4 @@ def get_capability()-> Tuple[int, int]:
     """
     if get_device() == Devices.GPU:
         return torch.cuda.get_device_capability()
-    return (0, 0)
+    return (0, 0)
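Since get_capability() falls back to (0, 0) off-GPU, the tuple can drive dtype selection. A sketch under our own assumption that bfloat16 is only worth enabling on compute capability 8.0 (Ampere) or newer; the bf16 rule is ours, not kimchima API:

import torch

from kimchima import Devices, get_device, get_capability

device = "cuda" if get_device() == Devices.GPU else "cpu"
# Python compares tuples lexicographically, so (8, 6) >= (8, 0) holds on Ampere.
dtype = torch.bfloat16 if get_capability() >= (8, 0) else torch.float32
print(device, dtype)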
97 changes: 97 additions & 0 deletions src/kimchima/pkg/embeddings.py
@@ -0,0 +1,97 @@
+# coding=utf-8
+# Copyright [2024] [SkywardAI]
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import torch
+import torch.nn.functional as F
+from kimchima.pkg import logging
+
+from .model_factory import ModelFactory
+from .tokenizer_factory import TokenizerFactory
+
+
+logger = logging.get_logger(__name__)
+
+
+class EmbeddingsFactory:
+    r"""
+    Embeddings class to get embeddings from the specified model and tokenizer.
+    The embeddings mean pooling is used to get the embeddings from the model,
+    and the embeddings are normalized using L2 normalization.
+    Args:
+        pretrained_model_name_or_path: pretrained model name or path
+    Returns:
+        sentence_embeddings: sentence embeddings type torch.Tensor
+    """
+
+    @classmethod
+    def __init__(cls):
+        raise EnvironmentError(
+            "Embeddings is designed to be instantiated "
+            "using the `Embeddings.from_pretrained(pretrained_model_name_or_path)` method."
+        )
+
+    @classmethod
+    def auto_embeddings(cls, *args, **kwargs) -> torch.Tensor:
+        r"""
+        Get embeddings from the model.
+        Args:
+            prompt: prompt text
+            device: device to run the model
+            max_length: maximum length of the input text
+        """
+        model = kwargs.pop('model', None)
+        tokenizer = kwargs.pop('tokenizer', None)
+        prompt = kwargs.pop('prompt', None)
+        device = kwargs.pop('device', 'cpu')
+        max_length = kwargs.pop('max_length', 512)
+
+        inputs_ids = tokenizer(prompt, return_tensors='pt', max_length=max_length, padding=True, truncation=True).to(device)
+
+        model = model.to(device)
+        with torch.no_grad():
+            output = model(**inputs_ids)
+
+        embeddings = cls.mean_pooling(model_output=output, attention_mask=inputs_ids['attention_mask'])
+        logger.debug(f"Embedding mean pooling: {embeddings.shape}")
+
+        # Normalize embeddings
+        sentence_embeddings = F.normalize(embeddings, p=2, dim=1)
+
+        return sentence_embeddings
+
+    # Mean Pooling - Take attention mask into account for correct averaging
+    @classmethod
+    def mean_pooling(cls, **kwargs) -> torch.Tensor:
+        r"""
+        Mean Pooling - Take attention mask into account for correct averaging.
+        Args:
+            model_output: model output
+            attention_mask: attention mask
+        """
+        model_output = kwargs.get('model_output')
+        attention_mask = kwargs.get('attention_mask')
+        token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
+        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
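mean_pooling averages token vectors while zeroing out padding positions. A standalone sketch with dummy tensors (shapes chosen for illustration only):

import torch

# Toy "model output": batch of 2 sequences, 4 tokens each, hidden size 3.
token_embeddings = torch.randn(2, 4, 3)
# The second sequence ends in a padding token (mask 0) that must not count.
attention_mask = torch.tensor([[1, 1, 1, 1],
                               [1, 1, 1, 0]])

mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
pooled = (token_embeddings * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
print(pooled.shape)  # torch.Size([2, 3]) - one vector per sequence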
6 changes: 6 additions & 0 deletions src/kimchima/pkg/logging.py
@@ -12,6 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# The original code in Huggingface Transformers library is licensed under the Apache 2.0 license.
+# Adaptor: Aisuko
+# TODO: Will tweak the code to fit the Kimchima library
+
+from __future__ import annotations
+
 import logging
 import os
 import sys
50 changes: 50 additions & 0 deletions src/kimchima/pkg/model_factory.py
@@ -0,0 +1,50 @@
+# coding=utf-8
+# Copyright [2024] [SkywardAI]
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from transformers import AutoModel
+from kimchima.pkg import logging
+
+logger = logging.get_logger(__name__)
+
+
+class ModelFactory:
+    r"""
+    ModelFactory class to get the model from the specified model.
+    Args:
+        pretrained_model_name_or_path: pretrained model name or path
+    """
+    def __init__(self):
+        raise EnvironmentError(
+            "ModelFactory is designed to be instantiated "
+            "using the `ModelFactory.from_pretrained(pretrained_model_name_or_path)` method."
+        )
+
+    @classmethod
+    def auto_model(cls, pretrained_model_name_or_path, **kwargs) -> AutoModel:
+        r"""
+        It is used to get the model from the Hugging Face Transformers AutoModel.
+        Args:
+            pretrained_model_name_or_path: pretrained model name or path
+        """
+        if pretrained_model_name_or_path is None:
+            raise ValueError("pretrained_model_name_or_path cannot be None")
+        model = AutoModel.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        logger.debug(f"Loaded model: {pretrained_model_name_or_path}")
+        return model
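Putting the three factories together mirrors examples.py; because auto_embeddings L2-normalizes its output, the dot product of two rows is already their cosine similarity. A sketch (TokenizerFactory.auto_tokenizer is used exactly as in examples.py above):

from kimchima import ModelFactory, TokenizerFactory, EmbeddingsFactory

name = "sentence-transformers/all-MiniLM-L6-v2"
model = ModelFactory.auto_model(pretrained_model_name_or_path=name)
tokenizer = TokenizerFactory.auto_tokenizer(pretrained_model_name_or_path=name)

embeddings = EmbeddingsFactory.auto_embeddings(
    model=model,
    tokenizer=tokenizer,
    prompt=["Melbourne", "Sydney"],
    device="cpu",
)
# Rows are unit-length, so the dot product equals cosine similarity.
print(float(embeddings[0] @ embeddings[1]))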
