diff --git a/dev/bench/data.js b/dev/bench/data.js index 94de1da..a9d088b 100644 --- a/dev/bench/data.js +++ b/dev/bench/data.js @@ -1,6 +1,6 @@ window.BENCHMARK_DATA = { - "lastUpdate": 1721874377285, - "repoUrl": "https://github.com/neuralmagic/nm-vllm-certs", + "lastUpdate": 1721919677041, + "repoUrl": "https://github.com/neuralmagic/nm-vllm-ent", "entries": { "smaller_is_better": [ { @@ -278,6 +278,52 @@ window.BENCHMARK_DATA = { "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.2.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA L4', major=8, minor=9, total_memory=22478MB, multi_processor_count=58)]\",\n \"cuda_device_names\": [\n \"NVIDIA L4\"\n ]\n },\n \"gpu_description\": \"NVIDIA L4 x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"facebook/opt-350m\",\n \"tokenizer\": \"facebook/opt-350m\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'facebook/opt-350m', 'tokenizer': 'facebook/opt-350m', 'max-model-len': 2048, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-07-25 02:17:06 UTC\",\n \"model\": \"facebook/opt-350m\",\n \"dataset\": \"sharegpt\"\n}" } ] + }, + { + "commit": { + "author": { + "name": "Domenic Barbuzzi", + "username": "dbarbuzzi", + "email": "domenic@neuralmagic.com" + }, + "committer": { + "name": "Domenic Barbuzzi", + "username": "dbarbuzzi", + "email": "domenic@neuralmagic.com" + }, + "id": "5bb2eaca884399fa3ae0da3499c4ba5796746e06", + "message": "Slim down build-test for nightly test", + "timestamp": "2024-07-25T14:07:33Z", + "url": "https://github.com/neuralmagic/nm-vllm-ent/commit/5bb2eaca884399fa3ae0da3499c4ba5796746e06" + }, + "date": 1721919676864, + "tool": "customSmallerIsBetter", + "benches": [ + { + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA L4 x 1\", \"vllm_version\": \"0.5.2.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 186.69768197999397, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.2.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA L4', major=8, minor=9, total_memory=22478MB, multi_processor_count=58)]\",\n \"cuda_device_names\": [\n \"NVIDIA L4\"\n ]\n },\n \"gpu_description\": \"NVIDIA L4 x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"tokenizer\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-8B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-8B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-07-25 15:00:13 UTC\",\n \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA L4 x 1\", \"vllm_version\": \"0.5.2.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.98793814020692, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.2.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA L4', major=8, minor=9, total_memory=22478MB, multi_processor_count=58)]\",\n \"cuda_device_names\": [\n \"NVIDIA L4\"\n ]\n },\n \"gpu_description\": \"NVIDIA L4 x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"tokenizer\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-8B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-8B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-07-25 15:00:13 UTC\",\n \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA L4 x 1\", \"vllm_version\": \"0.5.2.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.1914908766665, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.2.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA L4', major=8, minor=9, total_memory=22478MB, multi_processor_count=58)]\",\n \"cuda_device_names\": [\n \"NVIDIA L4\"\n ]\n },\n \"gpu_description\": \"NVIDIA L4 x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"facebook/opt-350m\",\n \"tokenizer\": \"facebook/opt-350m\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'facebook/opt-350m', 'tokenizer': 'facebook/opt-350m', 'max-model-len': 2048, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-07-25 14:52:18 UTC\",\n \"model\": \"facebook/opt-350m\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA L4 x 1\", \"vllm_version\": \"0.5.2.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6.073553366434298, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.2.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA L4', major=8, minor=9, total_memory=22478MB, multi_processor_count=58)]\",\n \"cuda_device_names\": [\n \"NVIDIA L4\"\n ]\n },\n \"gpu_description\": \"NVIDIA L4 x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"facebook/opt-350m\",\n \"tokenizer\": \"facebook/opt-350m\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'facebook/opt-350m', 'tokenizer': 'facebook/opt-350m', 'max-model-len': 2048, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-07-25 14:52:18 UTC\",\n \"model\": \"facebook/opt-350m\",\n \"dataset\": \"sharegpt\"\n}" + } + ] } ] }