
Merge pull request #19 from SkywardAI/feat/log
Redesign the methods by using classmethod
Aisuko authored Apr 6, 2024
2 parents 5856521 + 7bc0dd4 commit baf353c
Showing 14 changed files with 338 additions and 147 deletions.
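The redesign drops the instantiable Auto class in favour of factory classes that are used purely through classmethods. A minimal sketch of the pattern, with illustrative names (WidgetFactory and load are not part of kimchima):

class WidgetFactory:
    """Never instantiated; construction goes through a classmethod."""
    def __init__(self):
        raise EnvironmentError("WidgetFactory is designed to be used via its classmethods.")

    @classmethod
    def load(cls, name, **kwargs):
        # Build and return the underlying object rather than a factory instance.
        return {"name": name, **kwargs}

widget = WidgetFactory.load("demo", size=3)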
30 changes: 26 additions & 4 deletions examples/examples.py
@@ -1,13 +1,35 @@
-from kimchima import Auto, get_device, get_capability
+from kimchima import (
+    ModelFactory,
+    TokenizerFactory,
+    EmbeddingsFactory
+)
 
-model = Auto(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2")
 
+from kimchima import (
+    get_device,
+    get_capability)
+
+pretrained_model_name_or_path = "sentence-transformers/all-MiniLM-L6-v2"
+
+model = ModelFactory.auto_model(pretrained_model_name_or_path=pretrained_model_name_or_path)
+tokenizer = TokenizerFactory.auto_tokenizer(pretrained_model_name_or_path=pretrained_model_name_or_path)
+
 # computing embeddings for single text
-embeddings = model.get_embeddings(text="Melbourne")
+embeddings = EmbeddingsFactory.auto_embeddings(
+    model=model,
+    tokenizer=tokenizer,
+    prompt='Melbourne',
+    device='cpu'
+)
 print(embeddings.shape)
 
 # computing embeddings for multiple texts
-embeddings = model.get_embeddings(text=["Melbourne", "Sydney"])
+embeddings = EmbeddingsFactory.auto_embeddings(
+    model=model,
+    tokenizer=tokenizer,
+    prompt=['Melbourne', 'Sydney'],
+    device='cpu'
+)
 print(embeddings.shape)
 
 # Checking the device: GPU, mps and CPU
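The truncated tail of examples.py exercises the device helpers imported above; a short sketch of that usage (outputs depend on the machine it runs on):

from kimchima import get_device, get_capability

# Returns a Devices member (GPU, mps, or CPU) for the current machine.
print(get_device())
# CUDA compute capability as a (major, minor) tuple; (0, 0) without a GPU.
print(get_capability())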
12 changes: 9 additions & 3 deletions src/kimchima/__init__.py
@@ -12,11 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__="0.2.2"
+__version__="0.2.3"
 
+from .pkg import (
+    ModelFactory,
+    TokenizerFactory,
+    EmbeddingsFactory,
+    Devices
+)
+
+
 from .pkg import (
-    Auto,
-    Devices,
     get_device,
     get_capability
 )
2 changes: 1 addition & 1 deletion src/kimchima/cmds/__init__.py
@@ -1 +1 @@
-from .auto_cli import CommandAuto
+from .auto_cli import CommandAutoModel
11 changes: 5 additions & 6 deletions src/kimchima/cmds/auto_cli.py
@@ -19,12 +19,12 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from kimchima.pkg import Auto
+from kimchima.pkg import ModelFactory
 
 
-class CommandAuto:
+class CommandAutoModel:
     """
-    A class for auto command.
+    A class for loading models.
     """
 
     @staticmethod
@@ -38,7 +38,6 @@ def auto(args):
         Returns:
            torch.tensor: The embeddings of text.
        """
-        model = Auto(model_name_or_path=args.model_name_or_path)
-        embeddings = model.get_embeddings(text=args.text)
-        print(embeddings)
+        model = ModelFactory.auto_model(pretrained_model_name_or_path=args.model_name)
+        print(model.config)
 
4 changes: 2 additions & 2 deletions src/kimchima/cmds/kimchima_cli.py
@@ -14,7 +14,7 @@
 
 import argparse
 
-from kimchima.cmds.auto_cli import CommandAuto
+from kimchima.cmds.auto_cli import CommandAutoModel
 
 
 def main():
@@ -31,7 +31,7 @@ def main():
     parser_auto=subparsers.add_parser("auto", help="auto help")
     parser_auto.add_argument("model_name_or_path", default="sentence-transformers/all-MiniLM-L6-v2", help="model name or path")
     parser_auto.add_argument("text", help="text str or list of text str")
-    parser_auto.set_defaults(func=CommandAuto.auto)
+    parser_auto.set_defaults(func=CommandAutoModel.auto)
 
     args = parser.parse_args()
     args.func(args)
20 changes: 19 additions & 1 deletion src/kimchima/pkg/__init__.py
@@ -1,4 +1,22 @@
-from .auto import Auto
+# coding=utf-8
+# Copyright [2024] [SkywardAI]
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from .model_factory import ModelFactory
+from .tokenizer_factory import TokenizerFactory
+from .embeddings import EmbeddingsFactory
+
 from .devices import (
     Devices,
     get_device,
86 changes: 0 additions & 86 deletions src/kimchima/pkg/auto.py

This file was deleted.

4 changes: 2 additions & 2 deletions src/kimchima/pkg/devices.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright [2024] [Aisuko]
+# Copyright [2024] [SkywardAI]
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -50,4 +50,4 @@ def get_capability()-> Tuple[int, int]:
     """
     if get_device() == Devices.GPU:
         return torch.cuda.get_device_capability()
-    return (0, 0)
+    return (0, 0)
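Since get_capability() falls back to (0, 0) off-GPU, the tuple can drive dtype selection. A sketch under our own assumption that bfloat16 is only worth enabling on compute capability 8.0 (Ampere) or newer; the bf16 rule is ours, not kimchima API:

import torch

from kimchima import Devices, get_device, get_capability

device = "cuda" if get_device() == Devices.GPU else "cpu"
# Python compares tuples lexicographically, so (8, 6) >= (8, 0) holds on Ampere.
dtype = torch.bfloat16 if get_capability() >= (8, 0) else torch.float32
print(device, dtype)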
97 changes: 97 additions & 0 deletions src/kimchima/pkg/embeddings.py
@@ -0,0 +1,97 @@
+# coding=utf-8
+# Copyright [2024] [SkywardAI]
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import torch
+import torch.nn.functional as F
+from kimchima.pkg import logging
+
+from .model_factory import ModelFactory
+from .tokenizer_factory import TokenizerFactory
+
+
+logger = logging.get_logger(__name__)
+
+
+class EmbeddingsFactory:
+    r"""
+    Embeddings class to get embeddings from the specified model and tokenizer.
+    The embeddings mean pooling is used to get the embeddings from the model,
+    and the embeddings are normalized using L2 normalization.
+    Args:
+        pretrained_model_name_or_path: pretrained model name or path
+    Returns:
+        sentence_embeddings: sentence embeddings type torch.Tensor
+    """
+
+    @classmethod
+    def __init__(cls):
+        raise EnvironmentError(
+            "Embeddings is designed to be instantiated "
+            "using the `Embeddings.from_pretrained(pretrained_model_name_or_path)` method."
+        )
+
+    @classmethod
+    def auto_embeddings(cls, *args, **kwargs) -> torch.Tensor:
+        r"""
+        Get embeddings from the model.
+        Args:
+            prompt: prompt text
+            device: device to run the model
+            max_length: maximum length of the input text
+        """
+        model = kwargs.pop('model', None)
+        tokenizer = kwargs.pop('tokenizer', None)
+        prompt = kwargs.pop('prompt', None)
+        device = kwargs.pop('device', 'cpu')
+        max_length = kwargs.pop('max_length', 512)
+
+        inputs_ids = tokenizer(prompt, return_tensors='pt', max_length=max_length, padding=True, truncation=True).to(device)
+
+        model = model.to(device)
+        with torch.no_grad():
+            output = model(**inputs_ids)
+
+        embeddings = cls.mean_pooling(model_output=output, attention_mask=inputs_ids['attention_mask'])
+        logger.debug(f"Embedding mean pooling: {embeddings.shape}")
+
+        # Normalize embeddings
+        sentence_embeddings = F.normalize(embeddings, p=2, dim=1)
+
+        return sentence_embeddings
+
+    # Mean Pooling - Take attention mask into account for correct averaging
+    @classmethod
+    def mean_pooling(cls, **kwargs) -> torch.Tensor:
+        r"""
+        Mean Pooling - Take attention mask into account for correct averaging.
+        Args:
+            model_output: model output
+            attention_mask: attention mask
+        """
+        model_output = kwargs.get('model_output')
+        attention_mask = kwargs.get('attention_mask')
+        token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
+        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
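mean_pooling averages token vectors while zeroing out padding positions. A standalone sketch with dummy tensors (shapes chosen for illustration only):

import torch

# Toy "model output": batch of 2 sequences, 4 tokens each, hidden size 3.
token_embeddings = torch.randn(2, 4, 3)
# The second sequence ends in a padding token (mask 0) that must not count.
attention_mask = torch.tensor([[1, 1, 1, 1],
                               [1, 1, 1, 0]])

mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
pooled = (token_embeddings * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
print(pooled.shape)  # torch.Size([2, 3]) - one vector per sequence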
6 changes: 6 additions & 0 deletions src/kimchima/pkg/logging.py
@@ -12,6 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# The original code in Huggingface Transformers library is licensed under the Apache 2.0 license.
+# Adaptor: Aisuko
+# TODO: Will tweak the code to fit the Kimchima library
+
+from __future__ import annotations
+
 import logging
 import os
 import sys
50 changes: 50 additions & 0 deletions src/kimchima/pkg/model_factory.py
@@ -0,0 +1,50 @@
+# coding=utf-8
+# Copyright [2024] [SkywardAI]
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from transformers import AutoModel
+from kimchima.pkg import logging
+
+logger = logging.get_logger(__name__)
+
+
+class ModelFactory:
+    r"""
+    ModelFactory class to get the model from the specified model.
+    Args:
+        pretrained_model_name_or_path: pretrained model name or path
+    """
+    def __init__(self):
+        raise EnvironmentError(
+            "ModelFactory is designed to be instantiated "
+            "using the `ModelFactory.from_pretrained(pretrained_model_name_or_path)` method."
+        )
+
+    @classmethod
+    def auto_model(cls, pretrained_model_name_or_path, **kwargs) -> AutoModel:
+        r"""
+        It is used to get the model from the Hugging Face Transformers AutoModel.
+        Args:
+            pretrained_model_name_or_path: pretrained model name or path
+        """
+        if pretrained_model_name_or_path is None:
+            raise ValueError("pretrained_model_name_or_path cannot be None")
+        model = AutoModel.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        logger.debug(f"Loaded model: {pretrained_model_name_or_path}")
+        return model
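Putting the three factories together mirrors examples.py; because auto_embeddings L2-normalizes its output, the dot product of two rows is already their cosine similarity. A sketch (TokenizerFactory.auto_tokenizer is used exactly as in examples.py above):

from kimchima import ModelFactory, TokenizerFactory, EmbeddingsFactory

name = "sentence-transformers/all-MiniLM-L6-v2"
model = ModelFactory.auto_model(pretrained_model_name_or_path=name)
tokenizer = TokenizerFactory.auto_tokenizer(pretrained_model_name_or_path=name)

embeddings = EmbeddingsFactory.auto_embeddings(
    model=model,
    tokenizer=tokenizer,
    prompt=["Melbourne", "Sydney"],
    device="cpu",
)
# Rows are unit-length, so the dot product equals cosine similarity.
print(float(embeddings[0] @ embeddings[1]))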
