diff --git a/dev/bench/data.js b/dev/bench/data.js index 966f841..32e3954 100644 --- a/dev/bench/data.js +++ b/dev/bench/data.js @@ -1,54 +1,8 @@ window.BENCHMARK_DATA = { - "lastUpdate": 1732419834636, + "lastUpdate": 1732446428581, "repoUrl": "https://github.com/neuralmagic/nm-vllm-ent", "entries": { "smaller_is_better": [ - { - "commit": { - "author": { - "name": "Derek Kozikowski", - "username": "derekk-nm", - "email": "106621615+derekk-nm@users.noreply.github.com" - }, - "committer": { - "name": "GitHub", - "username": "web-flow", - "email": "noreply@github.com" - }, - "id": "69c7d68c68d2733f4456d066ceef852f043df08c", - "message": "skip test_cpu_offload_gptq to get nightly green (#145)\n\ntest_cpu_offload_gptq is the last failing test holding up nightly from\r\nbeing totally green.\r\n\r\nwe're skipping to avoid this error:\r\n\r\n```\r\nE AssertionError: Results for model='Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4' are not the same.\r\nE ref_args=['--quantization', 'gptq', '--load-format', 'auto'] ref_envs=None\r\nE compare_args=['--quantization', 'gptq', '--cpu-offload-gb', '1', '--load-format', 'auto'] compare_envs=None\r\nE ref_result={'test': 'seeded_sampling', 'text': ' Amanda Bunnell.', 'finish_reason': 'length', 'usage': CompletionUsage(completion_tokens=5, prompt_tokens=5, total_tokens=10, completion_tokens_details=None)}\r\nE compare_result={'test': 'seeded_sampling', 'text': ' Ruth Yocom and I', 'finish_reason': 'length', 'usage': CompletionUsage(completion_tokens=5, prompt_tokens=5, total_tokens=10, completion_tokens_details=None)}\r\n```", - "timestamp": "2024-11-08T14:40:28Z", - "url": "https://github.com/neuralmagic/nm-vllm-ent/commit/69c7d68c68d2733f4456d066ceef852f043df08c" - }, - "date": 1731560497149, - "tool": "customSmallerIsBetter", - "benches": [ - { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"vllm_version\": \"0.6.3.0.20241114\", \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\", \"torch_version\": \"2.4.0+cu121\"}", - "value": 131.39319299332178, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.6.3.0.20241114\",\n \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\",\n \"torch_version\": \"2.4.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\",\n \"cuda_device_names\": [\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\"\n ]\n },\n \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"tokenizer\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 4,\n \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-70B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-70B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-11-14 04:39:58 UTC\",\n \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"vllm_version\": \"0.6.3.0.20241114\", \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\", \"torch_version\": \"2.4.0+cu121\"}", - "value": 24.710389965479525, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.6.3.0.20241114\",\n \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\",\n \"torch_version\": \"2.4.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\",\n \"cuda_device_names\": [\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\"\n ]\n },\n \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"tokenizer\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 4,\n \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-70B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-70B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-11-14 04:39:58 UTC\",\n \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"vllm_version\": \"0.6.3.0.20241114\", \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\", \"torch_version\": \"2.4.0+cu121\"}", - "value": 576.423131241075, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.6.3.0.20241114\",\n \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\",\n \"torch_version\": \"2.4.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\",\n \"cuda_device_names\": [\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\"\n ]\n },\n \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"tokenizer\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 4,\n \"server_args\": \"{'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'tokenizer': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-11-14 05:00:18 UTC\",\n \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"vllm_version\": \"0.6.3.0.20241114\", \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\", \"torch_version\": \"2.4.0+cu121\"}", - "value": 19.98965143248847, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.6.3.0.20241114\",\n \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\",\n \"torch_version\": \"2.4.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\",\n \"cuda_device_names\": [\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\"\n ]\n },\n \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"tokenizer\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 4,\n \"server_args\": \"{'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'tokenizer': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-11-14 05:00:18 UTC\",\n \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"dataset\": \"sharegpt\"\n}" - } - ] - }, { "commit": { "author": { @@ -2302,6 +2256,52 @@ window.BENCHMARK_DATA = { "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.6.3.0.20241124\",\n \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\",\n \"torch_version\": \"2.4.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\",\n \"cuda_device_names\": [\n \"NVIDIA H100 80GB HBM3\"\n ]\n },\n \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"tokenizer\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-8B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-8B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-11-24 03:42:39 UTC\",\n \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"dataset\": \"sharegpt\"\n}" } ] + }, + { + "commit": { + "author": { + "name": "Derek Kozikowski", + "username": "derekk-nm", + "email": "106621615+derekk-nm@users.noreply.github.com" + }, + "committer": { + "name": "GitHub", + "username": "web-flow", + "email": "noreply@github.com" + }, + "id": "153cd7643b55c4e769530eaaa8c92454b139f228", + "message": "NeuralMagic testing partitions (#152)\n\nInitial draft of testing partitions.\r\nAttempts to consolidate tests into a smaller number of steps/partitions\r\nthan are used by .buildkite/test-pipeline.yaml\r\n\r\n---------\r\n\r\nCo-authored-by: Domenic Barbuzzi ", + "timestamp": "2024-11-23T14:30:49Z", + "url": "https://github.com/neuralmagic/nm-vllm-ent/commit/153cd7643b55c4e769530eaaa8c92454b139f228" + }, + "date": 1732446427330, + "tool": "customSmallerIsBetter", + "benches": [ + { + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"vllm_version\": \"0.6.3.0.20241124\", \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\", \"torch_version\": \"2.4.0+cu121\"}", + "value": 602.8456981262813, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.6.3.0.20241124\",\n \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\",\n \"torch_version\": \"2.4.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\",\n \"cuda_device_names\": [\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\"\n ]\n },\n \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"tokenizer\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 4,\n \"server_args\": \"{'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'tokenizer': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-11-24 11:05:41 UTC\",\n \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"vllm_version\": \"0.6.3.0.20241124\", \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\", \"torch_version\": \"2.4.0+cu121\"}", + "value": 21.942928564529996, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.6.3.0.20241124\",\n \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\",\n \"torch_version\": \"2.4.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\",\n \"cuda_device_names\": [\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\"\n ]\n },\n \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"tokenizer\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 4,\n \"server_args\": \"{'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'tokenizer': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-11-24 11:05:41 UTC\",\n \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"vllm_version\": \"0.6.3.0.20241124\", \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\", \"torch_version\": \"2.4.0+cu121\"}", + "value": 133.93332347351435, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.6.3.0.20241124\",\n \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\",\n \"torch_version\": \"2.4.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\",\n \"cuda_device_names\": [\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\"\n ]\n },\n \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"tokenizer\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 4,\n \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-70B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-70B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-11-24 10:45:19 UTC\",\n \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"vllm_version\": \"0.6.3.0.20241124\", \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\", \"torch_version\": \"2.4.0+cu121\"}", + "value": 24.805840447777175, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.6.3.0.20241124\",\n \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\",\n \"torch_version\": \"2.4.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\",\n \"cuda_device_names\": [\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\"\n ]\n },\n \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"tokenizer\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 4,\n \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-70B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-70B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-11-24 10:45:19 UTC\",\n \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"dataset\": \"sharegpt\"\n}" + } + ] } ] }