diff --git a/dev/bench/data.js b/dev/bench/data.js index 028178b..62b1f06 100644 --- a/dev/bench/data.js +++ b/dev/bench/data.js @@ -1,54 +1,8 @@ window.BENCHMARK_DATA = { - "lastUpdate": 1736306317903, + "lastUpdate": 1736318483945, "repoUrl": "https://github.com/neuralmagic/nm-vllm-ent", "entries": { "smaller_is_better": [ - { - "commit": { - "author": { - "name": "dhuangnm", - "username": "dhuangnm", - "email": "74931910+dhuangnm@users.noreply.github.com" - }, - "committer": { - "name": "GitHub", - "username": "web-flow", - "email": "noreply@github.com" - }, - "id": "b5d97744548e7e7e155d3625edaa2927864c62e5", - "message": "limit flashinfer version to <0.2.0 (#173)\n\nCo-authored-by: dhuangnm ", - "timestamp": "2024-12-24T02:49:56Z", - "url": "https://github.com/neuralmagic/nm-vllm-ent/commit/b5d97744548e7e7e155d3625edaa2927864c62e5" - }, - "date": 1735280168461, - "tool": "customSmallerIsBetter", - "benches": [ - { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\"}", - "value": 595.4321703563133, - "unit": "ms", - "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20241227\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\", \"cuda_device_names\": [\"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\"]}, \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"tokenizer\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 4, \"server_args\": \"{'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'tokenizer': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"}, \"date\": \"2024-12-27 06:14:40 UTC\", \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"dataset\": \"sharegpt\"}" - }, - { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\"}", - "value": 22.517213675088637, - "unit": "ms", - "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20241227\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\", \"cuda_device_names\": [\"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\"]}, \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"tokenizer\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 4, \"server_args\": \"{'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'tokenizer': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"}, \"date\": \"2024-12-27 06:14:40 UTC\", \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"dataset\": \"sharegpt\"}" - }, - { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\"}", - "value": 137.293341467157, - "unit": "ms", - "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20241227\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\", \"cuda_device_names\": [\"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\"]}, \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"tokenizer\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 4, \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-70B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-70B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"}, \"date\": \"2024-12-27 05:54:18 UTC\", \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"dataset\": \"sharegpt\"}" - }, - { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\"}", - "value": 25.52118534924614, - "unit": "ms", - "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20241227\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\", \"cuda_device_names\": [\"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\"]}, \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"tokenizer\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 4, \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-70B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-70B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"}, \"date\": \"2024-12-27 05:54:18 UTC\", \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"dataset\": \"sharegpt\"}" - } - ] - }, { "commit": { "author": { @@ -2302,6 +2256,52 @@ window.BENCHMARK_DATA = { "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20250108\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81049MB, multi_processor_count=108), _CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81049MB, multi_processor_count=108), _CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81049MB, multi_processor_count=108), _CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81049MB, multi_processor_count=108)]\", \"cuda_device_names\": [\"NVIDIA A100-SXM4-80GB\", \"NVIDIA A100-SXM4-80GB\", \"NVIDIA A100-SXM4-80GB\", \"NVIDIA A100-SXM4-80GB\"]}, \"gpu_description\": \"NVIDIA A100-SXM4-80GB x 4\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"tokenizer\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 4, \"server_args\": \"{'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'tokenizer': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"}, \"date\": \"2025-01-08 03:17:14 UTC\", \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"dataset\": \"sharegpt\"}" } ] + }, + { + "commit": { + "author": { + "name": "Andy Linfoot", + "username": "andy-neuma", + "email": "78757007+andy-neuma@users.noreply.github.com" + }, + "committer": { + "name": "GitHub", + "username": "web-flow", + "email": "noreply@github.com" + }, + "id": "0507e27d49749501ebf2db85d210dee03da59315", + "message": "Remove `magic_wand` (#172)\n\nSUMMARY:\r\n* remove \"magic wand\" from \"nm-vllm\"\r\n* update neural magic docker\r\n* update `collect_env.py`\r\n\r\nNOTE: final run was cancelled, since it was just @derekk-nm disabling a\r\ntest ... changes ran green ...\r\nhttps://github.com/neuralmagic/nm-vllm-ent/actions/runs/12602081688\r\n\r\nTEST PLAN:\r\nruns on remote push\r\n\r\n---------\r\n\r\nCo-authored-by: andy-neuma ", + "timestamp": "2025-01-03T19:52:36Z", + "url": "https://github.com/neuralmagic/nm-vllm-ent/commit/0507e27d49749501ebf2db85d210dee03da59315" + }, + "date": 1736318482512, + "tool": "customSmallerIsBetter", + "benches": [ + { + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\"}", + "value": 137.0293802022934, + "unit": "ms", + "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20250108\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\", \"cuda_device_names\": [\"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\"]}, \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"tokenizer\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 4, \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-70B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-70B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"}, \"date\": \"2025-01-08 06:19:34 UTC\", \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"dataset\": \"sharegpt\"}" + }, + { + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\"}", + "value": 25.2788263601787, + "unit": "ms", + "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20250108\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\", \"cuda_device_names\": [\"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\"]}, \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"tokenizer\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 4, \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-70B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-70B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"}, \"date\": \"2025-01-08 06:19:34 UTC\", \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"dataset\": \"sharegpt\"}" + }, + { + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\"}", + "value": 588.6454928200692, + "unit": "ms", + "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20250108\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\", \"cuda_device_names\": [\"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\"]}, \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"tokenizer\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 4, \"server_args\": \"{'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'tokenizer': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"}, \"date\": \"2025-01-08 06:39:55 UTC\", \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"dataset\": \"sharegpt\"}" + }, + { + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\"}", + "value": 20.957360656216675, + "unit": "ms", + "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20250108\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\", \"cuda_device_names\": [\"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\"]}, \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"tokenizer\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 4, \"server_args\": \"{'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'tokenizer': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"}, \"date\": \"2025-01-08 06:39:55 UTC\", \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"dataset\": \"sharegpt\"}" + } + ] } ] }