Commit 9b2f593: fixes

andreea-popescu-reef committed Sep 3, 2024
1 parent f3cfcb0 commit 9b2f593
Showing 4 changed files with 39 additions and 43 deletions.
30 changes: 6 additions & 24 deletions README.md
@@ -1,16 +1,17 @@
# Compute Horde Prompt Gen
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)

-Script to generate batches of prompts for the Compute Horde project synthetic jobs.
-The prompt that generates prompts is inspired from [Bittensor Subnet 18 (Cortex. t)] (https://github.com/Datura-ai/cortex.t/blob/main/cortext/utils.py#L139)
+Script to generate batches of random unique prompts to be used in the Compute Horde project synthetic jobs.
+The prompt that generates prompts is inspired by [Bittensor Subnet 18 (Cortex.t)](https://github.com/Datura-ai/cortex.t/blob/276cfcf742e8b442500435a1c1862ac4dffa9e20/cortext/utils.py#L193) (licensed under the MIT License).
The generated prompts will be saved in `<output_folder_path>/prompts_<uuid>.txt`, each line of the text file containing a prompt.


### build image


```bash
# download the model data from huggingface
-python3 download_model.py --hugging face_api_key <API_KEY>
+python3 download_model.py --huggingface_token <API_KEY>

cd src/compute_horde_prompt_gen
docker build -t compute-horde-prompt-gen .
@@ -19,34 +20,15 @@ docker build -t compute-horde-prompt-gen .

### run image
```bash
-docker run -v ./output/:/app/output/ compute-horde-prompt-gen --dynamic_number_of_batches_in_a_single_go 3 --dynamic_number_of_prompts_in_a_batch 4 --uui uuid1,uuid2,uuid3
+docker run -v ./output/:/app/output/ compute-horde-prompt-gen --number_of_batches 3 --number_of_prompts_per_batch 4 --uuids uuid1,uuid2,uuid3
```

### testing
```bash
-python3 run.py --mock_model --dynamic_number_of_batches_in_a_single_go 3 --dynamic_number_of_prompts_in_a_batch 4 --uui uuid1,uuid2,uuid3
+python3 run.py --mock_model --number_of_batches 3 --number_of_prompts_per_batch 4 --uuids uuid1,uuid2,uuid3
```

----
-
-## License
-This repository is licensed under the MIT License.
-```text
-# The MIT License (MIT)
-# Copyright © 2023 Yuma Rao
-# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
-# documentation files (the “Software”), to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
-# and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-# The above copyright notice and this permission notice shall be included in all copies or substantial portions of
-# the Software.
-# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
-# THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-# DEALINGS IN THE SOFTWARE.
-```
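After these README changes, either invocation above leaves one `prompts_<uuid>.txt` file per uuid in the output folder. A quick sanity check for the example flags (a minimal sketch; the three uuids and four prompts per batch come from the commands above):

```python
import os

# Values taken from the example commands: 3 batches of 4 prompts each.
uuids = ["uuid1", "uuid2", "uuid3"]
prompts_per_batch = 4

for uuid in uuids:
    path = os.path.join("output", f"prompts_{uuid}.txt")
    with open(path) as f:
        # Each line of a batch file is expected to contain exactly one prompt.
        prompts = [line.strip() for line in f if line.strip()]
    assert len(prompts) == prompts_per_batch, f"{path} holds {len(prompts)} prompts"
    print(f"{path}: {len(prompts)} prompts OK")
```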

10 changes: 5 additions & 5 deletions download_model.py
@@ -7,11 +7,11 @@
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Save huggingface model")
    parser.add_argument(
-"--huggingface_token",
-type=str,
-required=True,
-help="Huggingface token to use",
-)
+        "--huggingface_token",
+        type=str,
+        required=True,
+        help="Huggingface token to use",
+    )
    parser.add_argument(
        "--model_name",
        type=str,
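The diff only touches argument formatting, so the download logic itself is not shown. For orientation, a minimal sketch of what a token-authenticated download can look like, assuming the script uses the `transformers` library; the default model name and the `--save_path` flag are illustrative assumptions, not taken from the repo (only `--huggingface_token` and `--model_name` appear in the diff):

```python
import argparse

from transformers import AutoModelForCausalLM, AutoTokenizer

parser = argparse.ArgumentParser(description="Save huggingface model")
parser.add_argument("--huggingface_token", type=str, required=True, help="Huggingface token to use")
parser.add_argument("--model_name", type=str, default="meta-llama/Meta-Llama-3-8B-Instruct")  # assumed default
parser.add_argument("--save_path", type=str, default="./saved_models/")  # assumed flag
args = parser.parse_args()

# The token authenticates access to gated repositories such as Llama 3.
tokenizer = AutoTokenizer.from_pretrained(args.model_name, token=args.huggingface_token)
model = AutoModelForCausalLM.from_pretrained(args.model_name, token=args.huggingface_token)

# Save locally so run.py can later load the model and tokenizer from a path.
tokenizer.save_pretrained(args.save_path)
model.save_pretrained(args.save_path)
```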
37 changes: 23 additions & 14 deletions src/compute_horde_prompt_gen/run.py
@@ -1,4 +1,5 @@
import datetime
+import os
import logging
import argparse

@@ -10,13 +11,13 @@


def generate_prompts(
-    model: MockModel | GenerativeModel,
+    model,
    total_prompts,
    batch_size: int = 5,
    num_return_sequences: int = 5,
    max_new_tokens: int = 2000,
    temperature: float = 1.0,
-    filename: str = "prompts.txt",
+    filepath: str = "prompts.txt",
):
    prompt_generator = PromptGeneratingPrompt()

@@ -53,7 +54,7 @@ def generate_prompts(
        new_prompts = new_prompts[:total_prompts]

        total_prompts -= len(new_prompts)
-        append_to_file(new_prompts, filename)
+        append_to_file(new_prompts, filepath)

        if total_prompts == 0:
            break
@@ -98,36 +99,44 @@ def generate_prompts(
help="Path to load the model and tokenizer from",
)
parser.add_argument(
"--dynamic_number_of_batches_in_a_single_go",
"--number_of_batches",
type=int,
required=True,
default=None,
help="Number of batches to generate",
)
parser.add_argument(
"--dynamic_number_of_prompts_in_a_batch",
"--number_of_prompts_per_batch",
type=int,
required=True,
help="Number of prompts per batch",
help="Number of prompts per uuid batch",
)
parser.add_argument(
"--uuids",
type=str,
required=True,
help="Comma separated list of uuids to upload batch of prompts for",
help="Comma separated list of uuids, used as file names of output batches, i.e. `output/prompts_{uuid}.txt`",
)
parser.add_argument(
"--mock_model",
action="store_true",
default=False,
help="Mock llama3 model for testing purposes only",
)
parser.add_argument(
"--output_folder_path",
type=str,
default="output/",
help="Folder path to save the generated prompts to",
)

    args = parser.parse_args()

    uuids = args.uuids.split(",")
-    assert (
-        len(uuids) == args.dynamic_number_of_batches_in_a_single_go
-    ), "Number of uuids should be equal to number of batches requested"
+    if args.number_of_batches:
+        assert (
+            len(uuids) == args.number_of_batches
+        ), "Number of uuids should be equal to number of batches requested"

    model = (
        GenerativeModel(model_path=args.model_path, quantize=args.quantize)
@@ -139,14 +148,14 @@ def generate_prompts(
        start_ts = datetime.datetime.now()
        generate_prompts(
            model,
-            total_prompts=args.dynamic_number_of_prompts_in_a_batch,
+            total_prompts=args.number_of_prompts_per_batch,
            batch_size=args.batch_size,
            num_return_sequences=args.num_return_sequences,
            max_new_tokens=args.max_new_tokens,
            temperature=args.temperature,
-            filename=f"output/prompts_{uuid}.txt",
+            filepath=os.path.join(args.output_folder_path, f"prompts_{uuid}.txt"),
        )
        seconds_taken = (datetime.datetime.now() - start_ts).total_seconds()
        log.info(
-            f"Finished generating {uuid} batch with {args.dynamic_number_of_prompts_in_a_batch} prompts in {seconds_taken:.2f} seconds"
+            f"Finished generating {uuid} batch with {args.number_of_prompts_per_batch} prompts in {seconds_taken:.2f} seconds"
        )
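The net effect of the renamed flags: each entry in `--uuids` maps to exactly one output file under `--output_folder_path`. A self-contained sketch of that mapping (the helper name is illustrative, not from the repo):

```python
import os

def batch_filepaths(uuids_csv: str, output_folder_path: str = "output/") -> list[str]:
    """Map the --uuids argument to the per-batch files run.py writes."""
    uuids = uuids_csv.split(",")
    return [os.path.join(output_folder_path, f"prompts_{uuid}.txt") for uuid in uuids]

print(batch_filepaths("uuid1,uuid2,uuid3"))
# ['output/prompts_uuid1.txt', 'output/prompts_uuid2.txt', 'output/prompts_uuid3.txt']
```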
5 changes: 5 additions & 0 deletions src/compute_horde_prompt_gen/utils.py
@@ -1,4 +1,5 @@
import re
+import os
import logging
import collections

@@ -45,6 +46,10 @@ def check_prompts_quality(prompts: list[str]):


def append_to_file(prompts: list[str], filepath: str = "prompts.txt"):
+    folder = os.path.dirname(filepath)
+    if not os.path.exists(folder):
+        os.makedirs(folder)
+
    try:
        with open(filepath, "a") as f:
            for prompt in prompts:
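One caveat about the guard added above: when `filepath` has no directory component (the default `prompts.txt`, for instance), `os.path.dirname` returns an empty string and `os.makedirs("")` raises `FileNotFoundError`. A safer variant (a sketch; the helper name is illustrative, not part of the commit):

```python
import os

def ensure_parent_dir(filepath: str) -> None:
    # os.path.dirname("prompts.txt") == "", so only create when a folder is named;
    # exist_ok=True also avoids a race if two workers create the folder at once.
    folder = os.path.dirname(filepath)
    if folder:
        os.makedirs(folder, exist_ok=True)

ensure_parent_dir("output/prompts_uuid1.txt")  # creates output/ if missing
ensure_parent_dir("prompts.txt")               # no directory part: safe no-op
```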
