Skip to content

Commit

Permalink
Merge pull request #2 from Minitour/feature/support-generator
Browse files Browse the repository at this point in the history
Added generator support
  • Loading branch information
Minitour authored Jan 7, 2025
2 parents fc0c82e + bc7b73c commit 15ae69e
Show file tree
Hide file tree
Showing 6 changed files with 92 additions and 73 deletions.
6 changes: 3 additions & 3 deletions evaluations/raw.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os

from verbalizer.nlp import LlamaModelParaphrase, ChatGptModelParaphrase
from verbalizer.nlp import ChatGptModelParaphrase

examples = [
"""
Expand Down Expand Up @@ -556,9 +556,9 @@
]

if __name__ == '__main__':
llama_model = LlamaModelParaphrase('http://localhost:11434/v1', temperature=0.1)
# llama_model = LlamaModelParaphrase('http://localhost:11434/v1', temperature=0.1)
openai_model = ChatGptModelParaphrase(api_key=os.getenv('OPENAI_API_KEY'), model='gpt-4o', temperature=0.7)
models = [openai_model, llama_model]
models = [openai_model]

for model in models:
print(f'Running on {model.name}:')
Expand Down
5 changes: 2 additions & 3 deletions playground.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
import os

from verbalizer.nlp import ChatGptModelParaphrase, LlamaModelParaphrase
from verbalizer.nlp import ChatGptModelParaphrase
from verbalizer.process import Processor
from verbalizer.sampler import Sampler
from verbalizer.verbalizer import Verbalizer
Expand Down Expand Up @@ -83,9 +83,8 @@
}

if __name__ == '__main__':
llama_model = LlamaModelParaphrase('http://localhost:11434/v1', temperature=0.1)
openai_model = ChatGptModelParaphrase(api_key=os.getenv('OPENAI_API_KEY'), model='gpt-4o', temperature=0.7)
models = [openai_model, llama_model]
models = [openai_model]

sampler = Sampler(sample_n=100, seed=42)

Expand Down
3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "ontology-verbalizer"
version = "1.1.0"
version = "1.1.1"
description = "A Python package for ontology verbalization"
authors = ["Antonio Zaitoun <tony.z.1711@gmail.com>"]
license = "MIT"
Expand All @@ -12,7 +12,6 @@ repository = "https://github.com/Minitour/ontology-verbalizer"
[tool.poetry.dependencies]
python = "^3.12"
rdflib = "~7.0.0"
openai = "~1.12.0"
pandas = "~2.2.0"
tqdm = "~4.66.2"

Expand Down
14 changes: 14 additions & 0 deletions tests/test_verbalization.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import types
import unittest

from rdflib import Graph
Expand Down Expand Up @@ -64,3 +65,16 @@ def test_verbalization_with_sampler(self):

# although we sampled 10, only 7 were applicable.
self.assertEqual(7, len(results))

def test_verbalization_with_generator(self):
    # load the ontology under test
    ontology = Processor.from_file('./data/foaf.owl')

    # build the vocabulary with the standard ignore/rename rules
    vocabulary = Vocabulary(ontology, ignore=ignore_iri, rephrased=rename_iri)

    # create the verbalizer
    verbalizer = Verbalizer(vocabulary)

    # as_generator=True should yield results lazily rather than return a list
    results = Processor.verbalize_with(verbalizer, namespace='foaf', as_generator=True)
    self.assertTrue(isinstance(results, types.GeneratorType))

    # draining the generator yields all 12 verbalized entries
    self.assertEqual(12, len(list(results)))
98 changes: 37 additions & 61 deletions verbalizer/nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,40 @@
from abc import ABC, abstractmethod
from typing import Optional

from openai import OpenAI
try:
from openai import OpenAI

logging.getLogger("openai").setLevel(logging.ERROR)
logging.getLogger("httpx").setLevel(logging.ERROR)
logging.getLogger("openai").setLevel(logging.ERROR)
logging.getLogger("httpx").setLevel(logging.ERROR)
except ModuleNotFoundError as err:
OpenAI = None


class ParaphraseLanguageModel(ABC):
    """
    Abstract interface for language models that paraphrase controlled
    natural language (CNL) statements into fluent text.

    Subclasses must implement :meth:`pseudo_to_text`; the default
    :attr:`cost` and :attr:`name` properties may be overridden by models
    that track usage cost or expose a model identifier.
    """

    @abstractmethod
    def pseudo_to_text(self, pseudo_text: str, extra: Optional[str] = None) -> str:
        """
        Given a pseudo text or controlled natural language, return a rephrased version of that same text.
        :param pseudo_text: The CNL set of statements.
        :param extra: Additional context to include as part of the prompt.
        :return: Paraphrased text.
        """
        # Default implementation: echo the input unchanged.
        return pseudo_text

    @property
    def cost(self) -> float:
        """
        The usage cost so far of the model.
        :return: Accumulated cost in dollars; 0.0 by default.
        """
        return 0.0

    @property
    def name(self) -> str:
        """
        The name of the model used.
        :return: Human-readable model identifier; 'Unknown' by default.
        """
        return 'Unknown'


def get_messages(pseudo_text: str, extra_context: Optional[str] = None):
Expand Down Expand Up @@ -50,33 +80,6 @@ def get_messages(pseudo_text: str, extra_context: Optional[str] = None):
]


class ParaphraseLanguageModel(ABC):

@abstractmethod
def pseudo_to_text(self, pseudo_text: str, extra: str = None) -> str:
"""
Given a pseudo text or controlled natural language, return a rephrased version of that same text.
:param pseudo_text: The CNL set of statements,
:param extra: Additional context to include as part of the prompt.
:return: Paraphrased text.
"""
return pseudo_text

@property
def cost(self) -> float:
"""
The usage cost so far of the model.
"""
return 0.0

@property
def name(self) -> str:
"""
The name of the model used.
"""
return 'Unknown'


class ChatGptModelParaphrase(ParaphraseLanguageModel):
"""
OpenAI wrapper implementation.
Expand Down Expand Up @@ -138,6 +141,9 @@ class ChatGptModelParaphrase(ParaphraseLanguageModel):
}

def __init__(self, api_key: str, model: str = 'gpt-3.5-turbo-0613', temperature=0.5):
if not OpenAI:
raise ModuleNotFoundError("OpenAI is not installed. Please install it with `pip install openai`")

self.model = model
self.temperature = temperature
self.client = OpenAI(api_key=api_key)
Expand All @@ -156,7 +162,7 @@ def pseudo_to_text(self, pseudo_text: str, extra: str = None) -> str:

@property
def cost(self) -> float:
model_pricing = self.models.get(self.model)
model_pricing = self.models.get(self.model) or {'input': 0.0, 'output': 0.0}

in_tokens = self._in_token_usage / 1000
out_tokens = self._out_token_usage / 1000
Expand All @@ -166,33 +172,3 @@ def cost(self) -> float:
@property
def name(self) -> str:
return self.model


class LlamaModelParaphrase(ParaphraseLanguageModel):
    """
    Llama model wrapper implementation.

    Connects to an OpenAI-compatible endpoint (such as a locally hosted
    server), so no real API key is needed.
    """

    def __init__(self, base_url, model='llama3', temperature=0.5):
        self.model = model
        self.temperature = temperature
        # Local OpenAI-compatible servers accept any key; pass a placeholder.
        self.client = OpenAI(
            base_url=base_url,
            api_key="sk-no-key-required"
        )

    def pseudo_to_text(self, pseudo_text: str, extra: str = None) -> str:
        """Rephrase the given CNL statements via the chat completions API."""
        messages = get_messages(pseudo_text, extra)
        completion = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=self.temperature
        )
        text = completion.choices[0].message.content
        return text.strip()

    @property
    def cost(self) -> float:
        """Local inference incurs no usage cost."""
        return 0.0

    @property
    def name(self) -> str:
        """Identifier of the underlying Llama model."""
        return self.model
39 changes: 35 additions & 4 deletions verbalizer/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,39 @@ def verbalize_with(cls,
namespace: str,
output_dir: Optional[str] = None,
chunk_size: int = 1000,
sampler: Optional[Sampler] = None):
sampler: Optional[Sampler] = None,
as_generator: bool = False):
gen = cls.verbalize_with_stream(
verbalizer,
namespace=namespace,
output_dir=output_dir,
chunk_size=chunk_size,
sampler=sampler,
as_generator=as_generator
)
if as_generator:
return gen

return next(gen)

@classmethod
def verbalize_with_stream(
cls,
verbalizer: Verbalizer,
*,
namespace: str,
output_dir: Optional[str] = None,
chunk_size: int = 1000,
sampler: Optional[Sampler] = None,
as_generator: bool = False):
"""
Start the verbalization process.
:param verbalizer: The verbalizer to use.
:param namespace: Name of the directory to create under the output directory.
:param output_dir: Name of the output directory.
:param chunk_size: Number of entries (rows) per file. default = 1000
:param sampler: A sampling configuration, use to sample large ontologies.
:param as_generator: If True, returns a generator instead of a list.
"""

# current timestamp
Expand Down Expand Up @@ -67,7 +92,7 @@ def verbalize_with(cls,
if stats.statements == 0:
continue

chunk_dataset.append({
element = {
'ontology': namespace,
'root': entry,
'fragment': fragment,
Expand All @@ -79,7 +104,12 @@ def verbalize_with(cls,
'unique_relationships': len(stats.relationship_counter),
'total_relationships': sum(stats.relationship_counter.values()),
**stats.relationship_counter
})
}

chunk_dataset.append(element)

if as_generator:
yield element

if len(chunk_dataset) != chunk_size:
continue
Expand All @@ -104,7 +134,8 @@ def verbalize_with(cls,
if llm:
logger.info(f'LLM usage cost: ${llm.cost}')

return full_dataset
if not as_generator:
yield full_dataset

@staticmethod
def _get_classes(graph):
Expand Down

0 comments on commit 15ae69e

Please sign in to comment.