docs: qwen model
LutingWang committed Jan 19, 2025
1 parent ed6c3dc commit 993c85c
Showing 6 changed files with 194 additions and 21 deletions.
1 change: 1 addition & 0 deletions .vscode/settings.json
@@ -30,6 +30,7 @@
     "ODKD",
     "preds",
     "pretrained",
+    "Qwen",
     "segmentor",
     "segmentors",
     "SGFI",
50 changes: 29 additions & 21 deletions docs/source/pretrained/gemma.py
@@ -3,16 +3,16 @@
 from transformers.modeling_outputs import CausalLMOutputWithPast
 from transformers.models.gemma import GemmaForCausalLM, GemmaTokenizerFast

-from todd.patches.torch import get_device
+import todd

-PRETRAINED = "pretrained/gemma/gemma-1.1-2b-it"
+PRETRAINED = 'pretrained/gemma/gemma-1.1-2b-it'

 tokenizer: GemmaTokenizerFast = AutoTokenizer.from_pretrained(PRETRAINED)
 model: GemmaForCausalLM = AutoModelForCausalLM.from_pretrained(
     PRETRAINED,
-    device_map=get_device(),
-    torch_dtype=torch.float16,
-    revision="float16",
+    device_map='auto',
+    torch_dtype='auto',
+    revision='float16',
 )

 WORD_TOKEN = '<word>'  # nosec B105
@@ -22,41 +22,49 @@
     f": {WORD_TOKEN}"
 )

-prompt = tokenizer.apply_chat_template(
-    [dict(role='user', content=TEMPLATE)],
-    tokenize=False,
+conversation = [dict(role='user', content=TEMPLATE)]
+inputs = tokenizer.apply_chat_template(
+    conversation,
     add_generation_prompt=True,
+    tokenize=False,
 )
-prefix, suffix = prompt.split(WORD_TOKEN)
-prefix_ids = tokenizer.encode(
+prefix, suffix = inputs.split(WORD_TOKEN)
+prefix_ids: torch.Tensor = tokenizer.encode(
     prefix,
-    return_tensors='pt',
     add_special_tokens=False,
-).to('cuda')
-suffix_ids = tokenizer.encode(
-    suffix,
-    return_tensors='pt',
+    return_tensors='pt',
+)
+suffix_ids: torch.Tensor = tokenizer.encode(
+    suffix,
     add_special_tokens=False,
-).to('cuda')
+    return_tensors='pt',
+)
+if todd.Store.cuda:  # pylint: disable=using-constant-test
+    prefix_ids = prefix_ids.cuda()
+    suffix_ids = suffix_ids.cuda()
 prefix_outputs: CausalLMOutputWithPast = model(prefix_ids, use_cache=True)
 prefix_cache = prefix_outputs.past_key_values

 WORD = 'car'

-word_ids = tokenizer.encode(
+word_ids: torch.Tensor = tokenizer.encode(
     WORD,
-    return_tensors='pt',
     add_special_tokens=False,
-).to('cuda')
+    return_tensors='pt',
+)
+if todd.Store.cuda:  # pylint: disable=using-constant-test
+    word_ids = word_ids.cuda()

-input_ids = torch.cat([prefix_ids, word_ids, suffix_ids], dim=-1)
+input_ids = torch.cat([prefix_ids, word_ids, suffix_ids], -1)
 output_ids = model.generate(
     input_ids,
     past_key_values=prefix_cache,
     use_cache=True,
     max_new_tokens=50,
     do_sample=True,
 )
-output_ids = output_ids[0, input_ids.shape[-1]:]
-output_text = tokenizer.decode(output_ids, skip_special_tokens=True)
+
+_, input_length = input_ids.shape
+output_ids = output_ids[0, input_length:]
+output_text = tokenizer.decode(output_ids, True)
 print(output_text)
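The diff above replaces the hard-coded CUDA setup (get_device(), torch.float16, .to('cuda')) with device_map='auto', torch_dtype='auto', and todd.Store.cuda gating, so the script also runs on CPU-only machines. For context, a minimal sketch (not part of the commit) of the prefix KV-cache pattern the script relies on — the shared prompt prefix is encoded once, and each candidate word resumes generation from that cache; 'gpt2' is only a stand-in checkpoint:

# Sketch (assumption: any causal LM works; 'gpt2' is a stand-in checkpoint).
import copy

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2')

# Encode the shared prompt prefix once and keep its key/value cache.
prefix_ids = tokenizer.encode('Describe a', return_tensors='pt')
prefix_cache = model(prefix_ids, use_cache=True).past_key_values

for word in (' car', ' bike'):
    word_ids = tokenizer.encode(word, return_tensors='pt', add_special_tokens=False)
    input_ids = torch.cat([prefix_ids, word_ids], -1)
    output_ids = model.generate(
        input_ids,
        # generate() may extend the cache in place, so hand it a copy per word.
        past_key_values=copy.deepcopy(prefix_cache),
        use_cache=True,
        max_new_tokens=20,
    )
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))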
9 changes: 9 additions & 0 deletions docs/source/pretrained/llama.rst
@@ -0,0 +1,9 @@
LLaMA
=====

.. code-block:: bash

   root=pretrained/llama
   mkdir -p ${root} && cd ${root}
   git clone git@hf.co:meta-llama/Llama-3.2-11B-Vision
   cd ../..
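Note that cloning from hf.co over SSH requires a Hugging Face account with a registered SSH key and git-lfs to fetch the weight files; the meta-llama repositories additionally gate downloads behind an accepted license request.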
142 changes: 142 additions & 0 deletions docs/source/pretrained/qwen.py
@@ -0,0 +1,142 @@
import argparse
import pathlib
from typing import Literal, TextIO, TypedDict

import torch
from bs4 import BeautifulSoup, NavigableString
from ebooklib import ITEM_DOCUMENT, epub
from tqdm import tqdm
from transformers import BatchEncoding
from transformers.cache_utils import DynamicCache
from transformers.models.qwen2 import Qwen2ForCausalLM, Qwen2TokenizerFast

import todd


class Message(TypedDict):
role: Literal['system', 'user', 'assistant']
content: str


class Chatbot:
PRETRAINED = 'pretrained/qwen/Qwen2.5-7B-Instruct'

def __init__(self) -> None:
tokenizer = Qwen2TokenizerFast.from_pretrained(self.PRETRAINED)
self._tokenizer: Qwen2TokenizerFast = tokenizer

model = Qwen2ForCausalLM.from_pretrained(
self.PRETRAINED,
device_map='auto',
torch_dtype='auto',
)
self._model: Qwen2ForCausalLM = model

self._refresh()

def _refresh(self) -> None:
# todd.logger.debug("Refreshing.")

self._cache = DynamicCache()

        message = Message(
            role='system',
            # "You are an excellent translator, currently translating *Brave
            # New Words* by Salman Khan."
            content=(
                "你是一个出色的翻译,正在翻译萨尔曼可汗(Salman Khan)编写的《教育新语》(Brave New Words)。"
            ),
        )
self._conversation = [message]

def __call__(self, text: str) -> str:
message = Message(role='user', content=text)
self._conversation.append(message)

inputs: BatchEncoding = self._tokenizer.apply_chat_template(
self._conversation,
add_generation_prompt=True,
return_tensors='pt',
return_dict=True,
)
if todd.Store.cuda: # pylint: disable=using-constant-test
inputs = inputs.to('cuda')

input_ids: torch.Tensor = inputs['input_ids']
_, input_length = input_ids.shape

        # Reuse the conversation-level KV cache so earlier turns are not
        # re-encoded on every call.
        output_ids = self._model.generate(
            **inputs,
            past_key_values=self._cache,
            use_cache=True,
            max_new_tokens=1024,
            top_p=0.95,
        )

        # Once the cached context exceeds ~32k tokens, reset the cache and the
        # conversation, re-seeding with the latest user turn for the next call.
        while self._cache.get_seq_length() > 32_000:
            self._refresh()
            self._conversation.append(message)

generated_ids = output_ids[0, input_length:]
generated_text = self._tokenizer.decode(generated_ids, True)

message = Message(role='assistant', content=generated_text)
self._conversation.append(message)

return generated_text


class Translator:
    # "Translate the following English text into Chinese verbatim; do not
    # output anything irrelevant or inconsistent with the original."
    PROMPT = "逐字地把下面的英文文本翻译成中文,不要输出不相关或不符合原文的内容:\n"

def __init__(self, f: TextIO) -> None:
self._f = f
self._chatbot = Chatbot()

    def _translate_text(self, text: str) -> str | None:
        # Skip fragments with at most one letter (whitespace, punctuation, etc.).
        if sum(c.isalpha() for c in text) <= 1:
            return None
return self._chatbot(self.PROMPT + text)

def _translate_item(self, item: epub.EpubItem) -> None:
soup = BeautifulSoup(item.content, 'html.parser')
texts: list[NavigableString] = soup.body.find_all(string=True)
for text in tqdm(texts, leave=False):
translation = self._translate_text(text)
if translation is not None and translation.strip():
# todd.logger.debug("\n'%s' -> '%s'", text, translation)
self._f.write(f"'{text}' -> '{translation}'\n")
text.replace_with(translation)
item.set_content(soup.encode())
self._f.flush()

def _translate_book(self, book: epub.EpubBook) -> None:
items: list[epub.EpubItem] = book.items
for item in tqdm(items):
if item.get_type() == ITEM_DOCUMENT:
self._translate_item(item)

def translate(self, book: epub.EpubBook) -> None:
self._translate_book(book)


def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser()
parser.add_argument('input_path', type=pathlib.Path)
parser.add_argument('output_path', type=pathlib.Path)
args = parser.parse_args()
return args


def main() -> None:
args = parse_args()

book = epub.read_epub(args.input_path)

with open('tmp.log', 'w') as f:
translator = Translator(f)
translator.translate(book)

epub.write_epub(args.output_path, book)


if __name__ == '__main__':
main()
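As a usage sketch (the file names are hypothetical), python docs/source/pretrained/qwen.py book.epub book.zh.epub reads book.epub, translates each document item in place, logs every 'original' -> 'translation' pair to tmp.log, and writes the result to book.zh.epub.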
10 changes: 10 additions & 0 deletions docs/source/pretrained/qwen.rst
@@ -0,0 +1,10 @@
Qwen
====

.. code-block:: bash

   root=pretrained/qwen
   mkdir -p ${root} && cd ${root}
   git clone git@hf.co:Qwen/Qwen2-1.5B-Instruct
   git clone git@hf.co:Qwen/Qwen2.5-7B-Instruct
   cd ../..
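A quick local load confirms the clone is complete; this is a sanity-check sketch, not part of the commit:

# Sanity check (assumption): render the chat template from the local clone.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('pretrained/qwen/Qwen2.5-7B-Instruct')
print(tokenizer.apply_chat_template(
    [{'role': 'user', 'content': 'hi'}],
    tokenize=False,
    add_generation_prompt=True,
))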
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -43,6 +43,7 @@ Documentation = 'https://toddai.readthedocs.io/en/latest/'

 [project.optional-dependencies]
 optional = [
+    'beautifulsoup4',
     'bitsandbytes',
     'diffusers',
     'ftfy',
@@ -82,6 +83,7 @@ lint = [
     'pre-commit',
     'pydocstyle',
     'pylint',
+    'types-beautifulsoup4',
     'types-Pillow',
     'types-pycocotools',
     'types-regex',
@@ -158,6 +160,7 @@ plugins = 'numpy.typing.mypy_plugin'
 module = [
     'custom_types.*',
     'datasets.*',
+    'ebooklib.*',
     'ffmpeg.*',
     'h5py.*',
     'ipdb.*',
