Add support to MPS backend
vincenting committed Jan 8, 2024
1 parent 127319e commit c5f0740
Showing 3 changed files with 18 additions and 8 deletions.
15 changes: 9 additions & 6 deletions eval/hf_runner.py
@@ -18,6 +18,8 @@
 import gc
 from peft import PeftModel, PeftConfig
 
+device_map = "mps" if torch.backends.mps.is_available() else "auto"
+
 
 def generate_prompt(prompt_file, question, db_name):
     with open(prompt_file, "r") as f:
@@ -32,7 +34,6 @@ def generate_prompt(prompt_file, question, db_name):
 
 def dynamic_num_beams(prompt: str, tokenizer, max_beams: int = 4) -> int:
     tokens = len(tokenizer.encode(prompt))
-    print(tokens)
     if tokens <= 1024:
         return max_beams
     elif tokens <= 1536:
@@ -55,7 +56,7 @@ def get_tokenizer_model(model_name: Optional[str], adapter_path: Optional[str]):
            torch_dtype=torch.float16,
            trust_remote_code=True,
            use_cache=True,
-           device_map="auto",
+           device_map=device_map,
        )
        print(f"Loading adapter {adapter_path}")
        model = PeftModel.from_pretrained(model, adapter_path)
@@ -69,7 +70,7 @@ def get_tokenizer_model(model_name: Optional[str], adapter_path: Optional[str]):
        model = LlamaForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
-           device_map="auto",
+           device_map=device_map,
            use_cache=True,
            use_flash_attention_2=True,
        )
@@ -80,7 +81,7 @@ def get_tokenizer_model(model_name: Optional[str], adapter_path: Optional[str]):
            model_name,
            torch_dtype=torch.float16,
            trust_remote_code=True,
-           device_map="auto",
+           device_map=device_map,
        )
    return tokenizer, model
 
@@ -149,8 +150,10 @@ def run_hf_eval(args):
                + ";"
            )
            gc.collect()
-           torch.cuda.empty_cache()
-           torch.cuda.synchronize()
+           if torch.cuda.is_available():
+               torch.cuda.empty_cache()
+               torch.cuda.synchronize()
+
            end_time = time()
 
            row["generated_query"] = generated_query
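For context outside the diff, the pattern introduced in eval/hf_runner.py can be exercised on its own. The sketch below is illustrative rather than part of the commit (the helper name free_accelerator_memory is made up for this example); it relies only on stock PyTorch calls: torch.backends.mps.is_available(), torch.cuda.is_available(), torch.cuda.empty_cache(), and torch.cuda.synchronize().

import gc

import torch

# Same module-level selection as the commit: prefer Apple's MPS backend when
# present, otherwise fall back to Hugging Face's "auto" device placement.
device_map = "mps" if torch.backends.mps.is_available() else "auto"


def free_accelerator_memory() -> None:
    # Illustrative helper (not in the commit): release memory between batches.
    gc.collect()
    # The CUDA calls are guarded, so this is a no-op on MPS-only or CPU-only machines.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()


if __name__ == "__main__":
    print(f"device_map resolved to: {device_map}")  # "mps" on Apple Silicon, "auto" elsewhere
    free_accelerator_memory()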
9 changes: 8 additions & 1 deletion main.py
@@ -38,6 +38,13 @@
        args.model = "claude-2"
        run_anthropic_eval(args)
    elif args.model_type == "vllm":
+       import platform
+
+       if platform.system() == "Darwin":
+           raise ValueError(
+               "VLLM is not supported on macOS. Please run on another OS that supports CUDA."
+           )
+
        from eval.vllm_runner import run_vllm_eval
 
        run_vllm_eval(args)
@@ -51,5 +58,5 @@
        run_api_eval(args)
    else:
        raise ValueError(
-           f"Invalid model type: {args.model_type}. Model type must be one of: 'oa', 'hf'"
+           f"Invalid model type: {args.model_type}. Model type must be one of: 'oa', 'hf', 'api', 'anthropic', 'vllm'"
        )
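A note on the guard added above: platform.system() returns "Darwin" on macOS, "Linux" on Linux, and "Windows" on Windows, so the ValueError is raised before eval.vllm_runner (and therefore the vllm package) is ever imported on a Mac. A minimal standalone version of the same check (the helper name vllm_supported is hypothetical, not part of the commit):

import platform


def vllm_supported() -> bool:
    # The commit disallows vLLM on macOS, which platform.system() reports as "Darwin".
    return platform.system() != "Darwin"


if __name__ == "__main__":
    print(platform.system())   # e.g. "Darwin" on macOS
    print(vllm_supported())    # False on macOS, True on Linux/Windows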
2 changes: 1 addition & 1 deletion requirements.txt
@@ -16,4 +16,4 @@ tiktoken
 torch
 tqdm
 transformers
-vllm
+vllm; sys_platform != 'darwin'
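The requirements.txt change uses a PEP 508 environment marker: sys_platform evaluates to 'darwin' on macOS, so pip skips installing vllm there and installs it everywhere else. A quick way to see how the marker resolves on the current machine (a sketch assuming the third-party packaging library is installed):

from packaging.markers import Marker

# The same marker string that now guards the vllm requirement.
marker = Marker("sys_platform != 'darwin'")

# False on macOS (vllm is skipped), True on Linux/Windows (vllm is installed).
print(marker.evaluate())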
