docs: qwen model
LutingWang committed Jan 19, 2025
1 parent ed6c3dc commit 993c85c
Showing 6 changed files with 194 additions and 21 deletions.
1 change: 1 addition & 0 deletions .vscode/settings.json
@@ -30,6 +30,7 @@
     "ODKD",
     "preds",
     "pretrained",
+    "Qwen",
     "segmentor",
     "segmentors",
     "SGFI",
50 changes: 29 additions & 21 deletions docs/source/pretrained/gemma.py
@@ -3,16 +3,16 @@
 from transformers.modeling_outputs import CausalLMOutputWithPast
 from transformers.models.gemma import GemmaForCausalLM, GemmaTokenizerFast

-from todd.patches.torch import get_device
+import todd

-PRETRAINED = "pretrained/gemma/gemma-1.1-2b-it"
+PRETRAINED = 'pretrained/gemma/gemma-1.1-2b-it'

 tokenizer: GemmaTokenizerFast = AutoTokenizer.from_pretrained(PRETRAINED)
 model: GemmaForCausalLM = AutoModelForCausalLM.from_pretrained(
     PRETRAINED,
-    device_map=get_device(),
-    torch_dtype=torch.float16,
-    revision="float16",
+    device_map='auto',
+    torch_dtype='auto',
+    revision='float16',
 )

 WORD_TOKEN = '<word>'  # nosec B105
@@ -22,41 +22,49 @@
     f": {WORD_TOKEN}"
 )

-prompt = tokenizer.apply_chat_template(
-    [dict(role='user', content=TEMPLATE)],
-    tokenize=False,
+conversation = [dict(role='user', content=TEMPLATE)]
+inputs = tokenizer.apply_chat_template(
+    conversation,
     add_generation_prompt=True,
+    tokenize=False,
 )
-prefix, suffix = prompt.split(WORD_TOKEN)
-prefix_ids = tokenizer.encode(
+prefix, suffix = inputs.split(WORD_TOKEN)
+prefix_ids: torch.Tensor = tokenizer.encode(
     prefix,
-    return_tensors='pt',
     add_special_tokens=False,
-).to('cuda')
-suffix_ids = tokenizer.encode(
-    suffix,
-    return_tensors='pt',
+    return_tensors='pt',
+)
+suffix_ids: torch.Tensor = tokenizer.encode(
+    suffix,
     add_special_tokens=False,
-).to('cuda')
+    return_tensors='pt',
+)
+if todd.Store.cuda:  # pylint: disable=using-constant-test
+    prefix_ids = prefix_ids.cuda()
+    suffix_ids = suffix_ids.cuda()
 prefix_outputs: CausalLMOutputWithPast = model(prefix_ids, use_cache=True)
 prefix_cache = prefix_outputs.past_key_values

 WORD = 'car'

-word_ids = tokenizer.encode(
+word_ids: torch.Tensor = tokenizer.encode(
     WORD,
-    return_tensors='pt',
     add_special_tokens=False,
-).to('cuda')
+    return_tensors='pt',
+)
+if todd.Store.cuda:  # pylint: disable=using-constant-test
+    word_ids = word_ids.cuda()

-input_ids = torch.cat([prefix_ids, word_ids, suffix_ids], dim=-1)
+input_ids = torch.cat([prefix_ids, word_ids, suffix_ids], -1)
 output_ids = model.generate(
     input_ids,
     past_key_values=prefix_cache,
     use_cache=True,
     max_new_tokens=50,
     do_sample=True,
 )
-output_ids = output_ids[0, input_ids.shape[-1]:]
-output_text = tokenizer.decode(output_ids, skip_special_tokens=True)
+
+_, input_length = input_ids.shape
+output_ids = output_ids[0, input_length:]
+output_text = tokenizer.decode(output_ids, True)
 print(output_text)
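The diff above replaces the hard-coded CUDA setup (get_device(), torch.float16, .to('cuda')) with device_map='auto', torch_dtype='auto', and todd.Store.cuda gating, so the script also runs on CPU-only machines. For context, a minimal sketch (not part of the commit) of the prefix KV-cache pattern the script relies on — the shared prompt prefix is encoded once, and each candidate word resumes generation from that cache; 'gpt2' is only a stand-in checkpoint:

# Sketch (assumption: any causal LM works; 'gpt2' is a stand-in checkpoint).
import copy

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2')

# Encode the shared prompt prefix once and keep its key/value cache.
prefix_ids = tokenizer.encode('Describe a', return_tensors='pt')
prefix_cache = model(prefix_ids, use_cache=True).past_key_values

for word in (' car', ' bike'):
    word_ids = tokenizer.encode(word, return_tensors='pt', add_special_tokens=False)
    input_ids = torch.cat([prefix_ids, word_ids], -1)
    output_ids = model.generate(
        input_ids,
        # generate() may extend the cache in place, so hand it a copy per word.
        past_key_values=copy.deepcopy(prefix_cache),
        use_cache=True,
        max_new_tokens=20,
    )
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))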
9 changes: 9 additions & 0 deletions docs/source/pretrained/llama.rst
@@ -0,0 +1,9 @@
LLaMA
=====

.. code-block:: bash

   root=pretrained/llama
   mkdir -p ${root} && cd ${root}
   git clone git@hf.co:meta-llama/Llama-3.2-11B-Vision
   cd ../..
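Note that cloning from hf.co over SSH requires a Hugging Face account with a registered SSH key and git-lfs to fetch the weight files; the meta-llama repositories additionally gate downloads behind an accepted license request.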
142 changes: 142 additions & 0 deletions docs/source/pretrained/qwen.py
@@ -0,0 +1,142 @@
import argparse
import pathlib
from typing import Literal, TextIO, TypedDict

import torch
from bs4 import BeautifulSoup, NavigableString
from ebooklib import ITEM_DOCUMENT, epub
from tqdm import tqdm
from transformers import BatchEncoding
from transformers.cache_utils import DynamicCache
from transformers.models.qwen2 import Qwen2ForCausalLM, Qwen2TokenizerFast

import todd


class Message(TypedDict):
role: Literal['system', 'user', 'assistant']
content: str


class Chatbot:
PRETRAINED = 'pretrained/qwen/Qwen2.5-7B-Instruct'

def __init__(self) -> None:
tokenizer = Qwen2TokenizerFast.from_pretrained(self.PRETRAINED)
self._tokenizer: Qwen2TokenizerFast = tokenizer

model = Qwen2ForCausalLM.from_pretrained(
self.PRETRAINED,
device_map='auto',
torch_dtype='auto',
)
self._model: Qwen2ForCausalLM = model

self._refresh()

def _refresh(self) -> None:
# todd.logger.debug("Refreshing.")

self._cache = DynamicCache()

        message = Message(
            role='system',
            # "You are an excellent translator, currently translating *Brave
            # New Words* by Salman Khan."
            content=(
                "你是一个出色的翻译,正在翻译萨尔曼可汗(Salman Khan)编写的《教育新语》(Brave New Words)。"
            ),
        )
self._conversation = [message]

def __call__(self, text: str) -> str:
message = Message(role='user', content=text)
self._conversation.append(message)

inputs: BatchEncoding = self._tokenizer.apply_chat_template(
self._conversation,
add_generation_prompt=True,
return_tensors='pt',
return_dict=True,
)
if todd.Store.cuda: # pylint: disable=using-constant-test
inputs = inputs.to('cuda')

input_ids: torch.Tensor = inputs['input_ids']
_, input_length = input_ids.shape

        # Reuse the conversation-level KV cache so earlier turns are not
        # re-encoded on every call.
        output_ids = self._model.generate(
            **inputs,
            past_key_values=self._cache,
            use_cache=True,
            max_new_tokens=1024,
            top_p=0.95,
        )

        # Once the cached context exceeds ~32k tokens, reset the cache and the
        # conversation, re-seeding with the latest user turn for the next call.
        while self._cache.get_seq_length() > 32_000:
            self._refresh()
            self._conversation.append(message)

generated_ids = output_ids[0, input_length:]
generated_text = self._tokenizer.decode(generated_ids, True)

message = Message(role='assistant', content=generated_text)
self._conversation.append(message)

return generated_text


class Translator:
    # "Translate the following English text into Chinese verbatim; do not
    # output anything irrelevant or inconsistent with the original."
    PROMPT = "逐字地把下面的英文文本翻译成中文,不要输出不相关或不符合原文的内容:\n"

def __init__(self, f: TextIO) -> None:
self._f = f
self._chatbot = Chatbot()

    def _translate_text(self, text: str) -> str | None:
        # Skip fragments with at most one letter (whitespace, punctuation, etc.).
        if sum(c.isalpha() for c in text) <= 1:
            return None
return self._chatbot(self.PROMPT + text)

def _translate_item(self, item: epub.EpubItem) -> None:
soup = BeautifulSoup(item.content, 'html.parser')
texts: list[NavigableString] = soup.body.find_all(string=True)
for text in tqdm(texts, leave=False):
translation = self._translate_text(text)
if translation is not None and translation.strip():
# todd.logger.debug("\n'%s' -> '%s'", text, translation)
self._f.write(f"'{text}' -> '{translation}'\n")
text.replace_with(translation)
item.set_content(soup.encode())
self._f.flush()

def _translate_book(self, book: epub.EpubBook) -> None:
items: list[epub.EpubItem] = book.items
for item in tqdm(items):
if item.get_type() == ITEM_DOCUMENT:
self._translate_item(item)

def translate(self, book: epub.EpubBook) -> None:
self._translate_book(book)


def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser()
parser.add_argument('input_path', type=pathlib.Path)
parser.add_argument('output_path', type=pathlib.Path)
args = parser.parse_args()
return args


def main() -> None:
args = parse_args()

book = epub.read_epub(args.input_path)

with open('tmp.log', 'w') as f:
translator = Translator(f)
translator.translate(book)

epub.write_epub(args.output_path, book)


if __name__ == '__main__':
main()
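As a usage sketch (the file names are hypothetical), python docs/source/pretrained/qwen.py book.epub book.zh.epub reads book.epub, translates each document item in place, logs every 'original' -> 'translation' pair to tmp.log, and writes the result to book.zh.epub.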
10 changes: 10 additions & 0 deletions docs/source/pretrained/qwen.rst
@@ -0,0 +1,10 @@
Qwen
====

.. code-block:: bash

   root=pretrained/qwen
   mkdir -p ${root} && cd ${root}
   git clone git@hf.co:Qwen/Qwen2-1.5B-Instruct
   git clone git@hf.co:Qwen/Qwen2.5-7B-Instruct
   cd ../..
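A quick local load confirms the clone is complete; this is a sanity-check sketch, not part of the commit:

# Sanity check (assumption): render the chat template from the local clone.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('pretrained/qwen/Qwen2.5-7B-Instruct')
print(tokenizer.apply_chat_template(
    [{'role': 'user', 'content': 'hi'}],
    tokenize=False,
    add_generation_prompt=True,
))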
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -43,6 +43,7 @@ Documentation = 'https://toddai.readthedocs.io/en/latest/'

 [project.optional-dependencies]
 optional = [
+    'beautifulsoup4',
     'bitsandbytes',
     'diffusers',
     'ftfy',
@@ -82,6 +83,7 @@ lint = [
     'pre-commit',
     'pydocstyle',
     'pylint',
+    'types-beautifulsoup4',
     'types-Pillow',
     'types-pycocotools',
     'types-regex',
@@ -158,6 +160,7 @@ plugins = 'numpy.typing.mypy_plugin'
 module = [
     'custom_types.*',
     'datasets.*',
+    'ebooklib.*',
     'ffmpeg.*',
     'h5py.*',
     'ipdb.*',
