From 95639048ff36edc528370af7797e396cdeec9347 Mon Sep 17 00:00:00 2001
From: "Ma, Guokai"
Date: Thu, 22 Aug 2024 00:33:31 +0800
Subject: [PATCH] Add openai client to deepspeedometer (#913)

* add openai client

* adding openai api support for mii benchmark

* enable openai_api (non-stream) mode

* enable stream mode for openai-api

---------

Co-authored-by: Olatunji Ruwase
---
 .../src/deepspeedometer/clients/__init__.py   |  3 +
 .../deepspeedometer/clients/openai_client.py  | 57 ++++++++++++++
 benchmarks/inference/mii/src/client.py        | 76 ++++++++++++++++++-
 .../mii/src/plot_effective_throughput.py      |  9 ++-
 .../inference/mii/src/postprocess_results.py  | 11 ++-
 benchmarks/inference/mii/src/server.py        |  8 ++
 benchmarks/inference/mii/src/utils.py         | 14 +++-
 7 files changed, 169 insertions(+), 9 deletions(-)
 create mode 100644 benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/openai_client.py

diff --git a/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/__init__.py b/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/__init__.py
index a52c3618b..ac1891112 100644
--- a/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/__init__.py
+++ b/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/__init__.py
@@ -4,16 +4,19 @@
 from .dummy_client import DummyClientConfig, DummyClient
 from .fastgen_client import FastGenClientConfig, FastGenClient
 from .vllm_client import vLLMClientConfig, vLLMClient
+from .openai_client import openaiClientConfig, openaiClient
 
 client_config_classes = {
     "dummy": DummyClientConfig,
     "azure_ml": AzureMLClientConfig,
     "fastgen": FastGenClientConfig,
     "vllm": vLLMClientConfig,
+    "openai": openaiClientConfig
 }
 client_classes = {
     "dummy": DummyClient,
     "azure_ml": AzureMLClient,
     "fastgen": FastGenClient,
     "vllm": vLLMClient,
+    "openai": openaiClient,
 }
diff --git a/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/openai_client.py b/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/openai_client.py
new file mode 100644
index 000000000..76eadfc5c
--- /dev/null
+++ b/benchmarks/inference/deepspeedometer/src/deepspeedometer/clients/openai_client.py
@@ -0,0 +1,57 @@
+import os
+import json
+import requests
+import subprocess
+import time
+from typing import Any, Dict
+
+from loguru import logger
+from pydantic import Field
+
+from .base import BaseClient
+from ..config import BaseConfigModel
+from ..prompt import Prompt
+
+
+# client to test any openai API
+class openaiClientConfig(BaseConfigModel):
+    model: str = Field(..., description="HuggingFace.co model name")
+    url: str = "http://127.0.0.1:26500/v1/completions"
+
+
+class openaiClient(BaseClient):
+    def __init__(self, config: openaiClientConfig):
+        super().__init__(config)
+
+    def start_service(self) -> None:
+        pass
+
+    def stop_service(self) -> None:
+        pass
+
+    def prepare_request(self, prompt: Prompt) -> Dict[str, Any]:
+        api_url = self.config.url
+        headers = {
+            "User-Agent": "Benchmark Client",
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
+        }
+        pload = {
+            "prompt": prompt.text,
+            "model": self.config.model,
+            "n": 1,
+            "use_beam_search": False,
+            "temperature": 1.0,
+            "top_p": 0.9,
+            "max_tokens": prompt.max_new_tokens,
+            "ignore_eos": False,
+        }
+        return {"url": api_url, "headers": headers, "json": pload, "timeout": 180}
+
+    def send_request(self, request_kwargs: Dict[str, Any]) -> Any:
+        response = requests.post(**request_kwargs)
+        output = json.loads(response.content)
+        return output
+
+    def process_response(self, raw_response: Any) -> str:
+        return raw_response["choices"][0]["text"]
diff --git a/benchmarks/inference/mii/src/client.py b/benchmarks/inference/mii/src/client.py
index e8c656ab0..4e20d37c9 100644
--- a/benchmarks/inference/mii/src/client.py
+++ b/benchmarks/inference/mii/src/client.py
@@ -131,6 +131,80 @@ def get_response(response: requests.Response) -> List[str]:
     )
 
 
+# client talks with openai api
+def call_openai(
+    input_tokens: str, max_new_tokens: int, args: argparse.Namespace
+) -> ResponseDetails:
+
+    api_url = args.openai_api_url
+    headers = {
+        "User-Agent": "Benchmark Client",
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {args.openai_api_key}"
+    }
+
+    pload = {
+        "prompt": input_tokens,
+        "model": args.model,
+        "n": 1,
+        "use_beam_search": False,
+        "temperature": 1.0,
+        "top_p": 0.9,
+        "max_tokens": max_new_tokens,
+        "ignore_eos": False,
+        "stream": args.stream,
+    }
+
+    def clear_line(n: int = 1) -> None:
+        LINE_UP = "\033[1A"
+        LINE_CLEAR = "\x1b[2K"
+        for _ in range(n):
+            print(LINE_UP, end=LINE_CLEAR, flush=True)
+
+    def get_streaming_response(
+        response: requests.Response, time_last_token
+    ) -> Iterable[List[str]]:
+        for chunk in response.iter_lines(
+            chunk_size=8192, decode_unicode=False, delimiter=b"data:"
+        ):
+            if chunk:
+                plain = chunk.decode("utf-8")
+                if plain.strip() == "[DONE]":
+                    continue
+                data = json.loads(plain)
+                output = data["choices"][0]["text"]
+                time_now = time.time()
+                yield output, time_now - time_last_token
+                time_last_token = time_now
+
+    # Non-streaming: the full completion is returned in a single response payload
+    def get_response(response: requests.Response) -> List[str]:
+        data = json.loads(response.content)
+        output = data["choices"][0]["text"]
+        return output
+
+    token_gen_time = []
+    start_time = time.time()
+    #response = requests.post(api_url, headers=headers, json=pload, stream=False)
+    response = requests.post(api_url, headers=headers, json=pload, stream=args.stream)
+    if args.stream:
+        output = ""
+        for h, t in get_streaming_response(response, start_time):
+            output += h
+            token_gen_time.append(t)
+    else:
+        output = get_response(response)
+
+    return ResponseDetails(
+        generated_tokens=output,
+        prompt=input_tokens,
+        start_time=start_time,
+        end_time=time.time(),
+        model_time=0,
+        token_gen_time=token_gen_time,
+    )
+
+
 def call_aml(
     input_tokens: str,
     max_new_tokens: int,
@@ -205,7 +279,7 @@ def _run_parallel(
     event_loop = asyncio.new_event_loop()
     asyncio.set_event_loop(event_loop)
 
-    backend_call_fns = {"fastgen": call_fastgen, "vllm": call_vllm, "aml": call_aml}
+    backend_call_fns = {"fastgen": call_fastgen, "vllm": call_vllm, "aml": call_aml, "openai": call_openai}
     call_fn = backend_call_fns[args.backend]
 
     barrier.wait()
diff --git a/benchmarks/inference/mii/src/plot_effective_throughput.py b/benchmarks/inference/mii/src/plot_effective_throughput.py
index 196f70211..2370a2e1e 100644
--- a/benchmarks/inference/mii/src/plot_effective_throughput.py
+++ b/benchmarks/inference/mii/src/plot_effective_throughput.py
@@ -15,9 +15,10 @@ def get_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--backend", type=str, choices=["fastgen", "vllm"], default=["fastgen", "vllm"], \
+    parser.add_argument("--backend", type=str, choices=["fastgen", "vllm", "openai"], default=["fastgen", "vllm"], \
                         nargs="+", help="Specify the backends to generate plots for")
     parser.add_argument("--log_dir", type=Path, default="./results")
+    parser.add_argument("--model", type=str)
     parser.add_argument("--out_dir",
                         type=Path, default="./plots/goodtput")
     parser.add_argument("--sla_prompt_tokens_per_sec", type=int, default=512, help="SLA prompt tokens per second")
     parser.add_argument("--sla_gen_tokens_per_sec", type=int, default=[1, 2, 3, 4, 6, 8], nargs="+", help="SLA generation tokens/s targets")
@@ -76,7 +77,7 @@ def validate_token_ema_latency_SLA(response_detail, sla_token_gen, ema_span):
 
 def validate_prompt_latency_SLA(response_detail, sla_token_gen, f, sla_prompt_tokens_per_sec
     ):
-    tokenizer = get_tokenizer()
+    tokenizer = get_tokenizer(args.model)
     prompt_length = len(tokenizer.tokenize(response_detail.prompt))
     prompt_latency_SLA = prompt_length / sla_prompt_tokens_per_sec
     if prompt_latency_SLA < response_detail.token_gen_time[0]:
@@ -137,7 +138,9 @@ def output_charts(args, model, tp_size, bs, replicas, sla_token_gen, prompt, gen
     ]
 
     plt_cfg = {'vllm': {'label': 'vLLM', 'marker': 'x', 'color': 'orange'},\
-        'fastgen': {'label': 'DeepSpeed-FastGen', 'marker': 'o', 'color': 'blue'}}
+        'fastgen': {'label': 'DeepSpeed-FastGen', 'marker': 'o', 'color': 'blue'}, \
+        'openai': {'label': 'openai-API', 'marker': '+', 'color': 'red'}
+        }
 
     for f in validate_funcs:
         plt.figure()
diff --git a/benchmarks/inference/mii/src/postprocess_results.py b/benchmarks/inference/mii/src/postprocess_results.py
index 4179f44b6..378925027 100644
--- a/benchmarks/inference/mii/src/postprocess_results.py
+++ b/benchmarks/inference/mii/src/postprocess_results.py
@@ -49,10 +49,13 @@ def parse_args():
     return args
 
 
-def get_tokenizer():
+def get_tokenizer(model=None):
     global tokenizer
     if tokenizer is None:
-        tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
+        if model==None:
+            tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(model)
     return tokenizer
 
 
@@ -78,8 +81,8 @@ def get_summary(args, response_details):
 
     tokens_per_sec = mean(
         [
-            (len(get_tokenizer().tokenize(r.prompt)) +
-             len(get_tokenizer().tokenize(r.generated_tokens)) if type(r.generated_tokens) == str
+            (len(get_tokenizer(args["model"]).tokenize(r.prompt)) +
+             len(get_tokenizer(args["model"]).tokenize(r.generated_tokens)) if type(r.generated_tokens) == str
             else len(r.generated_tokens))
             / (r.end_time - r.start_time)
             for r in response_details
diff --git a/benchmarks/inference/mii/src/server.py b/benchmarks/inference/mii/src/server.py
index 56fd7930e..6d3c1cd69 100644
--- a/benchmarks/inference/mii/src/server.py
+++ b/benchmarks/inference/mii/src/server.py
@@ -19,6 +19,7 @@ def start_server(args: argparse.Namespace) -> None:
         "fastgen": start_fastgen_server,
         "vllm": start_vllm_server,
         "aml": start_aml_server,
+        "openai": start_openai_server,
     }
     start_fn = start_server_fns[args.backend]
     start_fn(args)
@@ -90,12 +91,16 @@ def start_aml_server(args: argparse.Namespace) -> None:
         "AML server start not implemented. Please use Azure Portal to start the server."
     )
 
+def start_openai_server(args: argparse.Namespace) -> None:
+    # openai api has no command to start the server
+    pass
 
 def stop_server(args: argparse.Namespace) -> None:
     stop_server_fns = {
         "fastgen": stop_fastgen_server,
         "vllm": stop_vllm_server,
         "aml": stop_aml_server,
+        "openai": stop_openai_server,
     }
     stop_fn = stop_server_fns[args.backend]
     stop_fn(args)
@@ -118,6 +123,9 @@ def stop_aml_server(args: argparse.Namespace) -> None:
         "AML server stop not implemented. Please use Azure Portal to stop the server."
     )
 
+def stop_openai_server(args: argparse.Namespace) -> None:
+    # openai api has no command to stop server
+    pass
 
 if __name__ == "__main__":
     args = parse_args(server_args=True)
diff --git a/benchmarks/inference/mii/src/utils.py b/benchmarks/inference/mii/src/utils.py
index d3c1fee02..ac2065065 100644
--- a/benchmarks/inference/mii/src/utils.py
+++ b/benchmarks/inference/mii/src/utils.py
@@ -122,6 +122,18 @@ def parse_args(
         default="./results/",
         help="Directory to save result JSON files",
     )
+    client_parser.add_argument(
+        "--openai_api_url",
+        type=str,
+        default=None,
+        help="When using the openai API backend, this is the API URL that points to an openai api server",
+    )
+    client_parser.add_argument(
+        "--openai_api_key",
+        type=str,
+        default=None,
+        help="When using the openai API backend, this is the API key for a given openai_api_url",
+    )
     client_parser.add_argument(
         "--aml_api_url",
         type=str,
         default=None,
@@ -156,7 +168,7 @@ def parse_args(
     parser.add_argument(
         "--backend",
         type=str,
-        choices=["aml", "fastgen", "vllm"],
+        choices=["aml", "fastgen", "vllm", "openai"],
         default="fastgen",
         help="Which backend to benchmark",
    )
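
Note: as a quick sanity check before pointing the benchmark at a server, the OpenAI-compatible endpoint can be exercised directly with the same kind of payload that call_openai sends. The sketch below is illustrative only; the endpoint URL, model name, and the OPENAI_API_KEY environment variable are assumptions and should be replaced with whatever the actual server uses.

# Illustrative smoke test for an OpenAI-compatible /v1/completions endpoint.
# It mirrors the core payload built by call_openai above; the URL, model name,
# and OPENAI_API_KEY environment variable are assumptions, not fixed by this patch.
import json
import os

import requests

API_URL = "http://127.0.0.1:26500/v1/completions"  # assumed endpoint
MODEL = "meta-llama/Llama-2-7b-hf"                 # assumed model name

headers = {
    "User-Agent": "Benchmark Client",
    "Content-Type": "application/json",
    "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY', '')}",
}
payload = {
    "prompt": "DeepSpeed is",
    "model": MODEL,
    "n": 1,
    "temperature": 1.0,
    "top_p": 0.9,
    "max_tokens": 32,
    "stream": True,
}

# Stream the completion, splitting the response on the "data:" prefix of each
# server-sent-event chunk, the same way the benchmark's streaming path does.
with requests.post(API_URL, headers=headers, json=payload, stream=True, timeout=180) as response:
    for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"data:"):
        if not chunk:
            continue
        plain = chunk.decode("utf-8").strip()
        if not plain or plain == "[DONE]":
            continue
        data = json.loads(plain)
        print(data["choices"][0]["text"], end="", flush=True)
print()

Once the endpoint responds, the MII benchmark client can target it with --backend openai together with the new --openai_api_url and --openai_api_key arguments added in utils.py above.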