Merge pull request #1 from backend-developers-ltd/init
llama3 prompt gen
andreea-popescu-reef authored Sep 3, 2024
2 parents 7e07957 + 9b2f593 commit 0b75e08
Showing 14 changed files with 3,721 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
.pdm-python
__pycache__
src/compute_horde_prompt_gen/saved_models/
src/compute_horde_prompt_gen/output/
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 Opentensor

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
34 changes: 34 additions & 0 deletions README.md
@@ -0,0 +1,34 @@
# Compute Horde Prompt Gen
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)

Script to generate batches of random, unique prompts to be used in synthetic jobs of the Compute Horde project.
The prompt that generates prompts is inspired by [Bittensor Subnet 18 (Cortex.t)](https://github.com/Datura-ai/cortex.t/blob/276cfcf742e8b442500435a1c1862ac4dffa9e20/cortext/utils.py#L193) (licensed under the MIT License).
The generated prompts are saved to `<output_folder_path>/prompts_<uuid>.txt`, with one prompt per line of the text file.
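For illustration, a minimal sketch of how a caller might collect the generated batches from that output layout (`load_prompt_batches` is a hypothetical helper, not part of this repo):

```python
# Hypothetical helper: read every prompts_<uuid>.txt file in the output
# folder and return the prompts of each batch, keyed by batch uuid.
from pathlib import Path


def load_prompt_batches(output_folder: str = "output") -> dict[str, list[str]]:
    batches = {}
    for path in Path(output_folder).glob("prompts_*.txt"):
        uuid = path.stem.removeprefix("prompts_")
        batches[uuid] = [line for line in path.read_text().splitlines() if line]
    return batches
```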


### build image


```bash
# download the model data from huggingface
python3 download_model.py --huggingface_token <API_KEY>

cd src/compute_horde_prompt_gen
docker build -t compute-horde-prompt-gen .
```


### run image
```bash
docker run -v ./output/:/app/output/ compute-horde-prompt-gen --number_of_batches 3 --number_of_prompts_per_batch 4 --uuids uuid1,uuid2,uuid3
```

### testing
```bash
python3 run.py --mock_model --number_of_batches 3 --number_of_prompts_per_batch 4 --uuids uuid1,uuid2,uuid3
```

---

## License
This repository is licensed under the MIT License.
38 changes: 38 additions & 0 deletions download_model.py
@@ -0,0 +1,38 @@
import argparse
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Save huggingface model")
    parser.add_argument(
        "--huggingface_token",
        type=str,
        required=True,
        help="Huggingface token to use",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        default="meta-llama/Meta-Llama-3.1-8B-Instruct",
        help="Model name to use",
    )
    parser.add_argument(
        "--model_path",
        type=str,
        default="./src/compute_horde_prompt_gen/saved_models/",
        help="Path to save the model and tokenizer to",
    )

    args = parser.parse_args()

    model = AutoModelForCausalLM.from_pretrained(
        args.model_name,
        # either give token directly or assume logged in with huggingface-cli
        token=args.huggingface_token or True,
    )
    model.save_pretrained(args.model_path)

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    tokenizer.save_pretrained(args.model_path)
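As a quick sanity check (a sketch, not part of this diff), the saved directory can be reloaded fully offline, which matches how `model.py` below consumes it:

```python
# Hedged example: reload the snapshot written by download_model.py without
# contacting the Hugging Face Hub (local_files_only=True mirrors model.py).
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "./src/compute_horde_prompt_gen/saved_models/"
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModelForCausalLM.from_pretrained(model_path, local_files_only=True)
```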
827 changes: 827 additions & 0 deletions pdm.lock

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions pyproject.toml
@@ -0,0 +1,17 @@
[project]
name = "compute-horde-prompt-gen"
version = "0.1.0"
description = "Default template for PDM package"
authors = [
{name = "Andreea Popescu", email = "andreea.popescu@reef.pl"},
]
dependencies = [
"transformers[torch]>=4.44.2",
]
requires-python = ">=3.11"
readme = "README.md"
license = {text = "MIT"}


[tool.pdm]
distribution = false
24 changes: 24 additions & 0 deletions src/compute_horde_prompt_gen/Dockerfile
@@ -0,0 +1,24 @@
# Start from a CUDA-enabled base image
FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04

# Set working directory
WORKDIR /app

# Install Python and system dependencies
RUN apt-get update && apt-get install -y \
    python3 \
    python3-pip

# Install PyTorch, Hugging Face Transformers, and other necessary packages
RUN pip3 install torch
RUN pip3 install transformers sentencepiece accelerate bitsandbytes

# Create an output folder
RUN mkdir /app/output

# Copy your Python script into the container
COPY saved_models/ /app/saved_models/
COPY *.py .

# Set the entrypoint to run your script
ENTRYPOINT ["python3", "run.py"]
Empty file.
69 changes: 69 additions & 0 deletions src/compute_horde_prompt_gen/model.py
@@ -0,0 +1,69 @@
import torch
import logging
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
)

from prompt import PROMPT_ENDING

log = logging.getLogger(__name__)


class MockModel:
    def __init__(self):
        pass

    def generate(self, prompts: list[str], num_return_sequences: int, **_kwargs):
        return torch.rand(len(prompts) * num_return_sequences)

    def decode(self, _output):
        return f"COPY PASTE INPUT PROMPT {PROMPT_ENDING} Here is the list of prompts:\nHow are you?\nDescribe something\nCount to ten\n"


class GenerativeModel:
    def __init__(self, model_path: str, quantize: bool = False):
        quantization_config = None
        if quantize:
            from transformers import BitsAndBytesConfig

            quantization_config = BitsAndBytesConfig(
                llm_int8_enable_fp32_cpu_offload=False,
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
            )
            log.info("using quantized model")

        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map="auto",
            quantization_config=quantization_config,
            local_files_only=True,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            local_files_only=True,
        )
        # set default padding token
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def generate(
        self,
        prompts: list[str],
        num_return_sequences: int,
        max_new_tokens: int,
        temperature: float,
    ):
        # encode the prompts
        inputs = self.tokenizer(prompts, return_tensors="pt", padding=True).to("cuda")

        return self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            num_return_sequences=num_return_sequences,
            do_sample=True,  # use sampling-based decoding
        )

    def decode(self, output):
        return self.tokenizer.decode(output, skip_special_tokens=True)
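A rough usage sketch for this class (assuming a CUDA device is available, since `generate` moves its inputs to `"cuda"`, and that the model was saved by `download_model.py`; the real driver lives in `run.py`, which is not rendered in this diff):

```python
# Hedged sketch of driving GenerativeModel end to end; the parameter values
# below are illustrative assumptions, not taken from the diff.
model = GenerativeModel("saved_models/", quantize=True)  # quantize needs bitsandbytes
prompts = ["<formatted prompt built by PromptGeneratingPrompt.generate()>"]
outputs = model.generate(
    prompts,
    num_return_sequences=1,
    max_new_tokens=500,
    temperature=1.0,
)
for sequence in outputs:
    print(model.decode(sequence))
```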
57 changes: 57 additions & 0 deletions src/compute_horde_prompt_gen/prompt.py
@@ -0,0 +1,57 @@
import io
import random
from seeds import THEMES, ABILITIES, FORMATS

PROMPT_ENDING = " }}assistant"


class PromptGeneratingPrompt:
    def random_select(self, arr: list[str], num: int = 5) -> str:
        # note: shuffles the shared seed list in place before taking a sample
        random.shuffle(arr)
        return ", ".join(arr[:num]) + ", etc"

    def generate_prompt(self) -> str:
        num_prompts = random.choice([10, 15, 20, 25, 30])
        relevance_level = random.randint(5, 20)
        complexity_level = random.randint(5, 20)

        themes = self.random_select(THEMES, num=3)
        abilities = self.random_select(ABILITIES, num=4)
        formats = self.random_select(FORMATS, num=5)

        prompt = (
            f"Generate a list of {num_prompts} complex prompts (questions or instruct tasks) that cover a wide range of skills and knowledge areas related to the themes of {themes}. "
            f"Each of these prompts should: "
            f"\n- have a complexity level of {complexity_level} out of 20 and a relevance level to the theme of {relevance_level} out of 20"
            f"\n- test various cognitive abilities ({abilities}) and require different types of writing formats ({formats})"
            f"\n- challenge the model's ability to understand and respond appropriately"
            f"\n- varyingly explore the {themes} in a manner that is consistent with their assigned complexity and relevance levels to the theme"
            f"\nOutput each prompt on a new line without any extra commentary or special characters."
        )
        return prompt

    def generate_role(self) -> str:
        role = "You are a prompt engineer tasked with generating prompts of varying complexity to test the capabilities of a new language model. For each prompt, consider what aspect of the language model's capabilities it is designed to test and ensure that the set of prompts covers a broad spectrum of potential use cases for the language model. Only output the prompts, one per line without any extra commentary. Do not use any special characters or formatting, numbering or styling in the output."
        return role

    def tokenize(self, prompt: str, role: str) -> str:
        role_templates = {
            "system": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>",
            "user": "<|start_header_id|>user<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>",
            "assistant": "<|start_header_id|>assistant<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>",
            "end": "<|start_header_id|>assistant<|end_header_id|>",
        }
        msgs = [
            {"role": "system", "content": role},
            {"role": "user", "content": prompt},
        ]
        full_prompt = io.StringIO()
        for msg in msgs:
            full_prompt.write(role_templates[msg["role"]].format(msg["content"]))
        full_prompt.write(role_templates["end"])
        return full_prompt.getvalue()

    def generate(self):
        prompt = self.generate_prompt()
        role = self.generate_role()
        return self.tokenize(prompt, role)
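Since `run.py` is not rendered in this diff, here is a hedged sketch of how the pieces presumably fit together: format a prompt, run it through a model (the mock one here), then split on `PROMPT_ENDING` to keep only the newly generated text:

```python
# Hypothetical glue code; the actual parsing in run.py may differ.
from model import MockModel
from prompt import PROMPT_ENDING, PromptGeneratingPrompt

generator = PromptGeneratingPrompt()
formatted = generator.generate()

mock = MockModel()
raw = mock.decode(mock.generate([formatted], num_return_sequences=1))

# Everything after PROMPT_ENDING is the model's answer; the real run.py would
# presumably also drop header lines such as "Here is the list of prompts:".
answer = raw.split(PROMPT_ENDING, 1)[1]
new_prompts = [line.strip() for line in answer.splitlines() if line.strip()]
print(new_prompts)
```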