From aafdf199968b09a36b4506684a15cb71d8b4ae99 Mon Sep 17 00:00:00 2001 From: github-action-benchmark Date: Fri, 10 Jan 2025 06:11:22 +0000 Subject: [PATCH] add smaller_is_better (customSmallerIsBetter) benchmark result for 4e51ab3e0aac991d2ded2bb55fd5a957e8b1477a --- dev/bench/data.js | 94 +++++++++++++++++++++++------------------------ 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/dev/bench/data.js b/dev/bench/data.js index 39e16ad..cd70d95 100644 --- a/dev/bench/data.js +++ b/dev/bench/data.js @@ -1,54 +1,8 @@ window.BENCHMARK_DATA = { - "lastUpdate": 1736478128436, + "lastUpdate": 1736489482732, "repoUrl": "https://github.com/neuralmagic/nm-vllm-ent", "entries": { "smaller_is_better": [ - { - "commit": { - "author": { - "name": "dhuangnm", - "username": "dhuangnm", - "email": "74931910+dhuangnm@users.noreply.github.com" - }, - "committer": { - "name": "GitHub", - "username": "web-flow", - "email": "noreply@github.com" - }, - "id": "b5d97744548e7e7e155d3625edaa2927864c62e5", - "message": "limit flashinfer version to <0.2.0 (#173)\n\nCo-authored-by: dhuangnm ", - "timestamp": "2024-12-24T02:49:56Z", - "url": "https://github.com/neuralmagic/nm-vllm-ent/commit/b5d97744548e7e7e155d3625edaa2927864c62e5" - }, - "date": 1735479405590, - "tool": "customSmallerIsBetter", - "benches": [ - { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\"}", - "value": 606.3914225654056, - "unit": "ms", - "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20241229\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\", \"cuda_device_names\": [\"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\"]}, \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"tokenizer\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 4, \"server_args\": \"{'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'tokenizer': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"}, \"date\": \"2024-12-29 13:35:20 UTC\", \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"dataset\": \"sharegpt\"}" - }, - { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\"}", - "value": 21.356676601375852, - "unit": "ms", - "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20241229\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\", \"cuda_device_names\": [\"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\"]}, \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"tokenizer\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 4, \"server_args\": \"{'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'tokenizer': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"}, \"date\": \"2024-12-29 13:35:20 UTC\", \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"dataset\": \"sharegpt\"}" - }, - { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\"}", - "value": 133.46370228876668, - "unit": "ms", - "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20241229\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\", \"cuda_device_names\": [\"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\"]}, \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"tokenizer\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 4, \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-70B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-70B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"}, \"date\": \"2024-12-29 13:14:42 UTC\", \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"dataset\": \"sharegpt\"}" - }, - { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\"}", - "value": 24.721010678340654, - "unit": "ms", - "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20241229\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\", \"cuda_device_names\": [\"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\"]}, \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"tokenizer\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 4, \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-70B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-70B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"}, \"date\": \"2024-12-29 13:14:42 UTC\", \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"dataset\": \"sharegpt\"}" - } - ] - }, { "commit": { "author": { @@ -2302,6 +2256,52 @@ window.BENCHMARK_DATA = { "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20250110\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81049MB, multi_processor_count=108), _CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81049MB, multi_processor_count=108), _CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81049MB, multi_processor_count=108), _CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81049MB, multi_processor_count=108)]\", \"cuda_device_names\": [\"NVIDIA A100-SXM4-80GB\", \"NVIDIA A100-SXM4-80GB\", \"NVIDIA A100-SXM4-80GB\", \"NVIDIA A100-SXM4-80GB\"]}, \"gpu_description\": \"NVIDIA A100-SXM4-80GB x 4\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"tokenizer\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 4, \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-70B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-70B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"}, \"date\": \"2025-01-10 02:54:16 UTC\", \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"dataset\": \"sharegpt\"}" } ] + }, + { + "commit": { + "author": { + "name": "Domenic Barbuzzi", + "username": "dbarbuzzi", + "email": "domenic@neuralmagic.com" + }, + "committer": { + "name": "GitHub", + "username": "web-flow", + "email": "noreply@github.com" + }, + "id": "4e51ab3e0aac991d2ded2bb55fd5a957e8b1477a", + "message": "Use pytest-nm-releng plugin for reporting (#176)\n\nThis PR updates the command-running action/scripts to use the\n[`pytest-nm-releng`](https://github.com/neuralmagic/pytest-nm-releng)\npytest plugin for the creation of JUnit reports and code coverage\nreports (when enabled).\n\nThe previous method had the command runner script checking if the\ncommand being run was a `pytest` command and, if so, it would append the\nappropriate CLI flags based on what was enabled.\n\nThis was problematic if tests were being executed indirectly; namely, if\nthe command runner script was running something like a Bash script which\nin turn ran the pytest commands. This prevented the command runner from\nbeing able to append the CLI flags to create uniquely named report files\nand, as a result, those tests would not have results captured and\nreported outside the action/job.\n\nWith the new plugin, the new method does away with that and simply sets\n2-3 env vars before running the commands (based on what features are\nenabled) and lets the pytest plugin do the heavy lifting of generating\nunique JUnit report names without needing to append any CLI flags to any\ncommands.", + "timestamp": "2025-01-09T16:54:04Z", + "url": "https://github.com/neuralmagic/nm-vllm-ent/commit/4e51ab3e0aac991d2ded2bb55fd5a957e8b1477a" + }, + "date": 1736489481187, + "tool": "customSmallerIsBetter", + "benches": [ + { + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\"}", + "value": 135.06840252317488, + "unit": "ms", + "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20250110\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\", \"cuda_device_names\": [\"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\"]}, \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"tokenizer\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 4, \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-70B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-70B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"}, \"date\": \"2025-01-10 05:49:33 UTC\", \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"dataset\": \"sharegpt\"}" + }, + { + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\"}", + "value": 24.668698152153297, + "unit": "ms", + "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20250110\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\", \"cuda_device_names\": [\"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\"]}, \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"tokenizer\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 4, \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-70B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-70B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"}, \"date\": \"2025-01-10 05:49:33 UTC\", \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"dataset\": \"sharegpt\"}" + }, + { + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\"}", + "value": 612.5878688910356, + "unit": "ms", + "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20250110\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\", \"cuda_device_names\": [\"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\"]}, \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"tokenizer\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 4, \"server_args\": \"{'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'tokenizer': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"}, \"date\": \"2025-01-10 06:09:56 UTC\", \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"dataset\": \"sharegpt\"}" + }, + { + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\"}", + "value": 22.185587385605032, + "unit": "ms", + "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20250110\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\", \"cuda_device_names\": [\"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\"]}, \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"tokenizer\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 4, \"server_args\": \"{'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'tokenizer': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"}, \"date\": \"2025-01-10 06:09:56 UTC\", \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"dataset\": \"sharegpt\"}" + } + ] } ] }