Merge pull request #2 from backend-developers-ltd/init
add github actions
andreea-popescu-reef authored Sep 10, 2024
2 parents 0b75e08 + 0b955d2 commit 4862f46
Showing 9 changed files with 240 additions and 92 deletions.
43 changes: 43 additions & 0 deletions .github/workflows/build_push_image.yml
@@ -0,0 +1,43 @@
name: "CD: build & push image"

on:
push:
branches: [build-image]
workflow_dispatch:

env:
PYTHON_DEFAULT_VERSION: "3.12"
TAG_VERSION: "v0-latest"
DOCKER_REPO_NAME: "backenddevelopersltd/compute-horde-prompt-gen"

jobs:
deploy:
timeout-minutes: 15
runs-on:
group: bulkier
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Set up Python ${{ env.PYTHON_DEFAULT_VERSION }}
uses: actions/setup-python@v5
with:
python-version: ${{ env.PYTHON_DEFAULT_VERSION }}

- name: Login Dockerhub
run: echo "${{ secrets.DOCKERHUB_KEY }}" | docker login -u "${{ secrets.DOCKERHUB_USERNAME }}" --password-stdin

- name: Install dependencies
run: |
python -m pip install transformers torch
- name: Docker build and push
run: |
df -h
IMAGE_NAME="${DOCKER_REPO_NAME}:${TAG_VERSION}"
cd src/compute_horde_prompt_gen
python download_model.py --model_name phi3 --huggingface_token "${{ secrets.HUGGINGFACE_API_KEY }}"
docker build -t $IMAGE_NAME .
45 changes: 45 additions & 0 deletions .github/workflows/smoke_test.yml
@@ -0,0 +1,45 @@
name: Run Smoke Test

on:
push:
branches: [master, main]
pull_request:
branches: [master, main]
workflow_dispatch:

env:
PYTHON_DEFAULT_VERSION: "3.11"

jobs:
test:
timeout-minutes: 10
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0

- name: Set up Python ${{ env.PYTHON_DEFAULT_VERSION }}
uses: actions/setup-python@v4
with:
python-version: ${{ env.PYTHON_DEFAULT_VERSION }}

- name: Run Test
run: |
cd src/compute_horde_prompt_gen
python3 run.py --model_name mock --number_of_batches 5 --number_of_prompts_per_batch 20 --uuids uuid1,uuid2,uuid3,uuid4,uuid5
echo -e "\ngenerated batches:"
ls
ls output/
echo -e "\nchecking if prompts are generated fine"
for i in $(seq 1 5); do
if [ $(cat output/prompts_uuid$i.txt | wc -l) -ne 20 ]; then
echo "Missing prompts: $(cat output/prompts_uuid{$i}.txt)"
exit 1
fi
done
echo "OK"
16 changes: 11 additions & 5 deletions README.md
@@ -2,30 +2,36 @@
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)

Script to generate batches of random, unique prompts to be used in the Compute Horde project's synthetic jobs.

The prompt that generates prompts is inspired by [Bittensor Subnet 18 (Cortex.t)](https://github.com/Datura-ai/cortex.t/blob/276cfcf742e8b442500435a1c1862ac4dffa9e20/cortext/utils.py#L193) (licensed under the MIT License).

The generated prompts will be saved in `<output_folder_path>/prompts_<uuid>.txt`, each line of the text file containing a prompt.

Supports the llama3 (`meta-llama/Meta-Llama-3.1-8B-Instruct`) and phi3 (`microsoft/Phi-3.5-mini-instruct`) models.

### build image


```bash
# download the model data from huggingface
python3 download_model.py --huggingface_token <API_KEY>

cd src/compute_horde_prompt_gen

# download model data
python3 download_model.py --model_name phi3 --huggingface_token <API_KEY>

# build the image
docker build -t compute-horde-prompt-gen .
```


### run image
```bash
docker run -v ./output/:/app/output/ compute-horde-prompt-gen --number_of_batches 3 --number_of_prompts_per_batch 4 --uuids uuid1,uuid2,uuid3
docker run -v ./output/:/app/output/ compute-horde-prompt-gen --model_name phi3 --number_of_prompts_per_batch 4 --uuids uuid1,uuid2,uuid3
```

### testing
```bash
python3 run.py --mock_model --number_of_batches 3 --number_of_prompts_per_batch 4 --uuids uuid1,uuid2,uuid3
cd src/compute_horde_prompt_gen
python3 run.py --model_name mock --number_of_prompts_per_batch 4 --uuids uuid1,uuid2,uuid3
```
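
Both the `docker run` and the mock test above write one prompt per line into `output/prompts_<uuid>.txt`. A minimal, hypothetical Python sketch for loading those batches (file layout assumed from the README; not part of this commit):

```python
# Editorial sketch: load generated prompt batches (output/ layout assumed from the README).
from pathlib import Path


def load_batches(output_dir: str = "output") -> dict[str, list[str]]:
    """Return a mapping of batch uuid -> list of prompts, one prompt per line."""
    batches = {}
    for path in sorted(Path(output_dir).glob("prompts_*.txt")):
        uuid = path.stem.removeprefix("prompts_")
        batches[uuid] = [line for line in path.read_text().splitlines() if line.strip()]
    return batches


if __name__ == "__main__":
    for uuid, prompts in load_batches().items():
        print(f"{uuid}: {len(prompts)} prompts")
```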

---
6 changes: 3 additions & 3 deletions pdm.lock

Some generated files are not rendered by default.

29 changes: 21 additions & 8 deletions download_model.py → ...ompute_horde_prompt_gen/download_model.py
@@ -1,9 +1,15 @@
import os
import argparse
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
)

MODEL_PATHS = {
"llama3": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"phi3": "microsoft/Phi-3.5-mini-instruct",
}

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Save huggingface model")
parser.add_argument(
@@ -15,24 +21,31 @@
parser.add_argument(
"--model_name",
type=str,
default="meta-llama/Meta-Llama-3.1-8B-Instruct",
help="Model name to use",
choices=["llama3", "phi3"],
required=True,
help="Model to use - options are llama3 or phi3",
)
parser.add_argument(
"--model_path",
"--save_path",
type=str,
default="./src/compute_horde_prompt_gen/saved_models/",
default="./saved_models/",
help="Path to save the model and tokenizer to",
)

args = parser.parse_args()
save_path = os.path.join(args.save_path, args.model_name)
model_name = MODEL_PATHS[args.model_name]
print(f"Saving {model_name} model to {save_path}")

model = AutoModelForCausalLM.from_pretrained(
args.model_name,
model_name,
# either give token directly or assume logged in with huggingface-cli
token=args.huggingface_token or True,
)
model.save_pretrained(args.model_path)
model.save_pretrained(save_path)

tokenizer = AutoTokenizer.from_pretrained(args.model_name)
tokenizer.save_pretrained(args.model_path)
tokenizer = AutoTokenizer.from_pretrained(
model_name,
token=args.huggingface_token or True,
)
tokenizer.save_pretrained(save_path)
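
With the renamed arguments, the weights and tokenizer end up under `<save_path>/<model_name>/` (by default `./saved_models/phi3/` or `./saved_models/llama3/`). A hedged sketch of reloading them offline, mirroring the `local_files_only` pattern used in `model.py` (the path below is an assumption based on the defaults above):

```python
# Editorial sketch: reload a model saved by download_model.py (path assumed from the defaults above).
from transformers import AutoModelForCausalLM, AutoTokenizer

save_path = "./saved_models/phi3"  # <save_path>/<model_name>

# local_files_only avoids contacting the Hugging Face Hub at runtime,
# which is the point of baking the weights into the Docker image.
model = AutoModelForCausalLM.from_pretrained(save_path, local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained(save_path, local_files_only=True)
```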
81 changes: 66 additions & 15 deletions src/compute_horde_prompt_gen/model.py
@@ -1,28 +1,34 @@
import torch
import logging
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
)

from prompt import PROMPT_ENDING
import io

log = logging.getLogger(__name__)


def strip_input(output: str, ending: str) -> str:
# input prompt is repeated in the output, so we need to remove it
idx = output.find(ending) + len(ending)
return output[idx:].strip()


class MockModel:
def __init__(self):
pass

def generate(self, prompts: list[str], num_return_sequences: int, **_kwargs):
return torch.rand(len(prompts) * num_return_sequences)

def decode(self, _output):
return f"COPY PASTE INPUT PROMPT {PROMPT_ENDING} Here is the list of prompts:\nHow are you?\nDescribe something\nCount to ten\n"
content = f"Here is the list of prompts:\nHow are you?\nDescribe something\nCount to ten\n"
return [content for _ in range(len(prompts) * num_return_sequences)]


class GenerativeModel:
def __init__(self, model_path: str, quantize: bool = False):
self.input_prompt_ending = None

import torch
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
)

quantization_config = None
if quantize:
from transformers import BitsAndBytesConfig
@@ -44,26 +50,71 @@ def __init__(self, model_path: str, quantize: bool = False):
model_path,
local_files_only=True,
)

def tokenize(self, prompts: list[str], role: str) -> str:
# set default padding token
self.tokenizer.pad_token = self.tokenizer.eos_token

role_templates = {
"system": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>",
"user": "<|start_header_id|>user<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>",
"assistant": "<|start_header_id|>assistant<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>",
"end": "<|start_header_id|>assistant<|end_header_id|>",
}

def tokenize(prompt: str) -> str:
msgs = [
{"role": "system", "content": role},
{"role": "user", "content": prompt},
]
full_prompt = io.StringIO()
for msg in msgs:
full_prompt.write(role_templates[msg["role"]].format(msg["content"]))
full_prompt.write(role_templates["end"])
return full_prompt.getvalue()

inputs = [tokenize(prompt) for prompt in prompts]
inputs = self.tokenizer(inputs, return_tensors="pt", padding=True).to("cuda")
return inputs

def decode(self, output) -> list[str]:
return [
strip_input(
self.tokenizer.decode(x, skip_special_tokens=True),
self.input_prompt_ending,
)
for x in output
]

def generate(
self,
prompts: list[str],
role: str,
num_return_sequences: int,
max_new_tokens: int,
temperature: float,
):
# encode the prompts
inputs = self.tokenizer(prompts, return_tensors="pt", padding=True).to("cuda")
inputs = self.tokenize(prompts, role)

return self.model.generate(
output = self.model.generate(
**inputs,
max_new_tokens=max_new_tokens,
temperature=temperature,
num_return_sequences=num_return_sequences,
do_sample=True, # use sampling-based decoding
)

def decode(self, output):
return self.tokenizer.decode(output, skip_special_tokens=True)
return self.decode(output)


class Phi3(GenerativeModel):
def __init__(self, model_path: str, quantize: bool = False):
super().__init__(model_path, quantize)
self.input_prompt_ending = "assistant<|end_header_id|>"


class Llama3(GenerativeModel):
def __init__(self, model_path: str, quantize: bool = False):
super().__init__(model_path, quantize)
self.input_prompt_ending = " }}assistant"
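
Putting the refactor together: `generate()` now wraps each prompt in the chat template via `tokenize()`, samples, and returns decoded strings with the input prompt stripped by `strip_input()`. A hypothetical usage sketch (the model path, role text, and sampling parameters are assumptions rather than values from this commit, and a CUDA device is required since inputs are moved to `"cuda"`):

```python
# Editorial sketch of how the refactored classes appear to be used (assumed values, CUDA required).
from model import Phi3  # src/compute_horde_prompt_gen/model.py

model = Phi3(model_path="./saved_models/phi3", quantize=False)

generated = model.generate(
    prompts=["Generate a short creative writing prompt."],
    role="You are a prompt generation assistant.",  # system message fed to tokenize()
    num_return_sequences=1,
    max_new_tokens=256,
    temperature=1.0,
)
for text in generated:
    print(text)
```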