Merge pull request #1 from backend-developers-ltd/init
llama3 prompt gen
andreea-popescu-reef authored Sep 3, 2024
2 parents 7e07957 + 9b2f593 commit 0b75e08
Showing 14 changed files with 3,721 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
.pdm-python
__pycache__
src/compute_horde_prompt_gen/saved_models/
src/compute_horde_prompt_gen/output/
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 Opentensor

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
34 changes: 34 additions & 0 deletions README.md
@@ -0,0 +1,34 @@
# Compute Horde Prompt Gen
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)

Script to generate batches of random, unique prompts to be used in synthetic jobs of the Compute Horde project.
The prompt that generates prompts is inspired by [Bittensor Subnet 18 (Cortex.t)](https://github.com/Datura-ai/cortex.t/blob/276cfcf742e8b442500435a1c1862ac4dffa9e20/cortext/utils.py#L193) (licensed under the MIT License).
The generated prompts are saved to `<output_folder_path>/prompts_<uuid>.txt`, with one prompt per line of the text file.
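For illustration, a minimal sketch of how a caller might collect the generated batches from that output layout (`load_prompt_batches` is a hypothetical helper, not part of this repo):

```python
# Hypothetical helper: read every prompts_<uuid>.txt file in the output
# folder and return the prompts of each batch, keyed by batch uuid.
from pathlib import Path


def load_prompt_batches(output_folder: str = "output") -> dict[str, list[str]]:
    batches = {}
    for path in Path(output_folder).glob("prompts_*.txt"):
        uuid = path.stem.removeprefix("prompts_")
        batches[uuid] = [line for line in path.read_text().splitlines() if line]
    return batches
```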


### build image


```bash
# download the model data from huggingface
python3 download_model.py --huggingface_token <API_KEY>

cd src/compute_horde_prompt_gen
docker build -t compute-horde-prompt-gen .
```


### run image
```bash
docker run -v ./output/:/app/output/ compute-horde-prompt-gen --number_of_batches 3 --number_of_prompts_per_batch 4 --uuids uuid1,uuid2,uuid3
```

### testing
```bash
python3 run.py --mock_model --number_of_batches 3 --number_of_prompts_per_batch 4 --uuids uuid1,uuid2,uuid3
```

---

## License
This repository is licensed under the MIT License.
38 changes: 38 additions & 0 deletions download_model.py
@@ -0,0 +1,38 @@
import argparse
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Save huggingface model")
    parser.add_argument(
        "--huggingface_token",
        type=str,
        required=True,
        help="Huggingface token to use",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        default="meta-llama/Meta-Llama-3.1-8B-Instruct",
        help="Model name to use",
    )
    parser.add_argument(
        "--model_path",
        type=str,
        default="./src/compute_horde_prompt_gen/saved_models/",
        help="Path to save the model and tokenizer to",
    )

    args = parser.parse_args()

    model = AutoModelForCausalLM.from_pretrained(
        args.model_name,
        # either give token directly or assume logged in with huggingface-cli
        token=args.huggingface_token or True,
    )
    model.save_pretrained(args.model_path)

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    tokenizer.save_pretrained(args.model_path)
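As a quick sanity check (a sketch, not part of this diff), the saved directory can be reloaded fully offline, which matches how `model.py` below consumes it:

```python
# Hedged example: reload the snapshot written by download_model.py without
# contacting the Hugging Face Hub (local_files_only=True mirrors model.py).
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "./src/compute_horde_prompt_gen/saved_models/"
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModelForCausalLM.from_pretrained(model_path, local_files_only=True)
```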
827 changes: 827 additions & 0 deletions pdm.lock

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions pyproject.toml
@@ -0,0 +1,17 @@
[project]
name = "compute-horde-prompt-gen"
version = "0.1.0"
description = "Default template for PDM package"
authors = [
{name = "Andreea Popescu", email = "andreea.popescu@reef.pl"},
]
dependencies = [
"transformers[torch]>=4.44.2",
]
requires-python = ">=3.11"
readme = "README.md"
license = {text = "MIT"}


[tool.pdm]
distribution = false
24 changes: 24 additions & 0 deletions src/compute_horde_prompt_gen/Dockerfile
@@ -0,0 +1,24 @@
# Start from a CUDA-enabled base image
FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04

# Set working directory
WORKDIR /app

# Install Python and system dependencies
RUN apt-get update && apt-get install -y \
    python3 \
    python3-pip

# Install PyTorch, Hugging Face Transformers, and other necessary packages
RUN pip3 install torch
RUN pip3 install transformers sentencepiece accelerate bitsandbytes

# Create an output folder
RUN mkdir /app/output

# Copy your Python script into the container
COPY saved_models/ /app/saved_models/
COPY *.py .

# Set the entrypoint to run your script
ENTRYPOINT ["python3", "run.py"]
Empty file.
69 changes: 69 additions & 0 deletions src/compute_horde_prompt_gen/model.py
@@ -0,0 +1,69 @@
import torch
import logging
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
)

from prompt import PROMPT_ENDING

log = logging.getLogger(__name__)


class MockModel:
    def __init__(self):
        pass

    def generate(self, prompts: list[str], num_return_sequences: int, **_kwargs):
        return torch.rand(len(prompts) * num_return_sequences)

    def decode(self, _output):
        return f"COPY PASTE INPUT PROMPT {PROMPT_ENDING} Here is the list of prompts:\nHow are you?\nDescribe something\nCount to ten\n"


class GenerativeModel:
    def __init__(self, model_path: str, quantize: bool = False):
        quantization_config = None
        if quantize:
            from transformers import BitsAndBytesConfig

            quantization_config = BitsAndBytesConfig(
                llm_int8_enable_fp32_cpu_offload=False,
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
            )
            log.info("using quantized model")

        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map="auto",
            quantization_config=quantization_config,
            local_files_only=True,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            local_files_only=True,
        )
        # set default padding token
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def generate(
        self,
        prompts: list[str],
        num_return_sequences: int,
        max_new_tokens: int,
        temperature: float,
    ):
        # encode the prompts
        inputs = self.tokenizer(prompts, return_tensors="pt", padding=True).to("cuda")

        return self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            num_return_sequences=num_return_sequences,
            do_sample=True,  # use sampling-based decoding
        )

    def decode(self, output):
        return self.tokenizer.decode(output, skip_special_tokens=True)
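A rough usage sketch for this class (assuming a CUDA device is available, since `generate` moves its inputs to `"cuda"`, and that the model was saved by `download_model.py`; the real driver lives in `run.py`, which is not rendered in this diff):

```python
# Hedged sketch of driving GenerativeModel end to end; the parameter values
# below are illustrative assumptions, not taken from the diff.
model = GenerativeModel("saved_models/", quantize=True)  # quantize needs bitsandbytes
prompts = ["<formatted prompt built by PromptGeneratingPrompt.generate()>"]
outputs = model.generate(
    prompts,
    num_return_sequences=1,
    max_new_tokens=500,
    temperature=1.0,
)
for sequence in outputs:
    print(model.decode(sequence))
```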
57 changes: 57 additions & 0 deletions src/compute_horde_prompt_gen/prompt.py
@@ -0,0 +1,57 @@
import io
import random
from seeds import THEMES, ABILITIES, FORMATS

PROMPT_ENDING = " }}assistant"


class PromptGeneratingPrompt:
    def random_select(self, arr: list[str], num: int = 5) -> str:
        # note: shuffles the shared seed list in place before taking a sample
        random.shuffle(arr)
        return ", ".join(arr[:num]) + ", etc"

    def generate_prompt(self) -> str:
        num_prompts = random.choice([10, 15, 20, 25, 30])
        relevance_level = random.randint(5, 20)
        complexity_level = random.randint(5, 20)

        themes = self.random_select(THEMES, num=3)
        abilities = self.random_select(ABILITIES, num=4)
        formats = self.random_select(FORMATS, num=5)

        prompt = (
            f"Generate a list of {num_prompts} complex prompts (questions or instruct tasks) that cover a wide range of skills and knowledge areas related to the themes of {themes}. "
            f"Each of these prompts should: "
            f"\n- have a complexity level of {complexity_level} out of 20 and a relevance level to the theme of {relevance_level} out of 20"
            f"\n- test various cognitive abilities ({abilities}) and require different types of writing formats ({formats})"
            f"\n- challenge the model's ability to understand and respond appropriately"
            f"\n- varyingly explore the {themes} in a manner that is consistent with their assigned complexity and relevance levels to the theme"
            f"\nOutput each prompt on a new line without any extra commentary or special characters."
        )
        return prompt

    def generate_role(self) -> str:
        role = "You are a prompt engineer tasked with generating prompts of varying complexity to test the capabilities of a new language model. For each prompt, consider what aspect of the language model's capabilities it is designed to test and ensure that the set of prompts covers a broad spectrum of potential use cases for the language model. Only output the prompts, one per line without any extra commentary. Do not use any special characters or formatting, numbering or styling in the output."
        return role

    def tokenize(self, prompt: str, role: str) -> str:
        role_templates = {
            "system": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>",
            "user": "<|start_header_id|>user<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>",
            "assistant": "<|start_header_id|>assistant<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>",
            "end": "<|start_header_id|>assistant<|end_header_id|>",
        }
        msgs = [
            {"role": "system", "content": role},
            {"role": "user", "content": prompt},
        ]
        full_prompt = io.StringIO()
        for msg in msgs:
            full_prompt.write(role_templates[msg["role"]].format(msg["content"]))
        full_prompt.write(role_templates["end"])
        return full_prompt.getvalue()

    def generate(self):
        prompt = self.generate_prompt()
        role = self.generate_role()
        return self.tokenize(prompt, role)
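Since `run.py` is not rendered in this diff, here is a hedged sketch of how the pieces presumably fit together: format a prompt, run it through a model (the mock one here), then split on `PROMPT_ENDING` to keep only the newly generated text:

```python
# Hypothetical glue code; the actual parsing in run.py may differ.
from model import MockModel
from prompt import PROMPT_ENDING, PromptGeneratingPrompt

generator = PromptGeneratingPrompt()
formatted = generator.generate()

mock = MockModel()
raw = mock.decode(mock.generate([formatted], num_return_sequences=1))

# Everything after PROMPT_ENDING is the model's answer; the real run.py would
# presumably also drop header lines such as "Here is the list of prompts:".
answer = raw.split(PROMPT_ENDING, 1)[1]
new_prompts = [line.strip() for line in answer.splitlines() if line.strip()]
print(new_prompts)
```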