From fb236cc447e45856ddbadd25cd6e0f660ea7133c Mon Sep 17 00:00:00 2001 From: github-action-benchmark Date: Tue, 3 Sep 2024 02:14:52 +0000 Subject: [PATCH] add smaller_is_better (customSmallerIsBetter) benchmark result for 207471a9f8fb072d601f308d0c25e8e915fa4ea0 --- dev/bench/data.js | 94 +++++++++++++++++++++++------------------------ 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/dev/bench/data.js b/dev/bench/data.js index 04a0ef0..6c598e4 100644 --- a/dev/bench/data.js +++ b/dev/bench/data.js @@ -1,54 +1,8 @@ window.BENCHMARK_DATA = { - "lastUpdate": 1725329443891, + "lastUpdate": 1725329692784, "repoUrl": "https://github.com/neuralmagic/nm-vllm-ent", "entries": { "smaller_is_better": [ - { - "commit": { - "author": { - "name": "Derek Kozikowski", - "username": "derekk-nm", - "email": "106621615+derekk-nm@users.noreply.github.com" - }, - "committer": { - "name": "GitHub", - "username": "web-flow", - "email": "noreply@github.com" - }, - "id": "9d2ec2b28d6ec008484de343354033d58a28bca3", - "message": "METADATA file \"License\" was updated to Enterprise (#58)\n\nSUMMARY:\r\nupdates the license test to be current with the change to the METADATA\r\nLicense field.\r\n\r\nTEST PLAN:\r\nwatch remote push", - "timestamp": "2024-08-15T13:09:20Z", - "url": "https://github.com/neuralmagic/nm-vllm-ent/commit/9d2ec2b28d6ec008484de343354033d58a28bca3" - }, - "date": 1723774543803, - "tool": "customSmallerIsBetter", - "benches": [ - { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 1\", \"vllm_version\": \"0.5.3.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}", - "value": 42.65716744276384, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.3.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132)]\",\n \"cuda_device_names\": [\n \"NVIDIA H100 80GB HBM3\"\n ]\n },\n \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"facebook/opt-350m\",\n \"tokenizer\": \"facebook/opt-350m\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'facebook/opt-350m', 'tokenizer': 'facebook/opt-350m', 'max-model-len': 2048, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-08-16 02:05:21 UTC\",\n \"model\": \"facebook/opt-350m\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 1\", \"vllm_version\": \"0.5.3.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}", - "value": 7.40419920062122, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.3.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132)]\",\n \"cuda_device_names\": [\n \"NVIDIA H100 80GB HBM3\"\n ]\n },\n \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"facebook/opt-350m\",\n \"tokenizer\": \"facebook/opt-350m\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'facebook/opt-350m', 'tokenizer': 'facebook/opt-350m', 'max-model-len': 2048, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-08-16 02:05:21 UTC\",\n \"model\": \"facebook/opt-350m\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 1\", \"vllm_version\": \"0.5.3.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}", - "value": 37.238513535509504, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.3.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132)]\",\n \"cuda_device_names\": [\n \"NVIDIA H100 80GB HBM3\"\n ]\n },\n \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"tokenizer\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-8B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-8B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-08-16 02:14:26 UTC\",\n \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 1\", \"vllm_version\": \"0.5.3.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}", - "value": 12.297281338667512, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.3.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132)]\",\n \"cuda_device_names\": [\n \"NVIDIA H100 80GB HBM3\"\n ]\n },\n \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"tokenizer\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-8B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-8B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-08-16 02:14:26 UTC\",\n \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"dataset\": \"sharegpt\"\n}" - } - ] - }, { "commit": { "author": { @@ -2206,6 +2160,52 @@ window.BENCHMARK_DATA = { "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.3.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81049MB, multi_processor_count=108), _CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81049MB, multi_processor_count=108), _CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81049MB, multi_processor_count=108), _CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81049MB, multi_processor_count=108)]\",\n \"cuda_device_names\": [\n \"NVIDIA A100-SXM4-80GB\",\n \"NVIDIA A100-SXM4-80GB\",\n \"NVIDIA A100-SXM4-80GB\",\n \"NVIDIA A100-SXM4-80GB\"\n ]\n },\n \"gpu_description\": \"NVIDIA A100-SXM4-80GB x 4\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"tokenizer\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 4,\n \"server_args\": \"{'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'tokenizer': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-09-03 02:09:25 UTC\",\n \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"dataset\": \"sharegpt\"\n}" } ] + }, + { + "commit": { + "author": { + "name": "Derek Kozikowski", + "username": "derekk-nm", + "email": "106621615+derekk-nm@users.noreply.github.com" + }, + "committer": { + "name": "GitHub", + "username": "web-flow", + "email": "noreply@github.com" + }, + "id": "207471a9f8fb072d601f308d0c25e8e915fa4ea0", + "message": "skip test coverage if tests were not run (#63)\n\n# SUMMARY:\r\nA recent nightly run failed because automated tests did not execute.\r\n(https://github.com/neuralmagic/nm-vllm-ent/actions/runs/10500265493/job/29088756696)\r\nThis caused a follow-on error when the test coverage was supposed to be\r\nreported. To avoid the confusion of the additional error, this code\r\nchange conditionally summarizes the code coverage only if there was a\r\nvalid TEST_STATUS.\r\n\r\n# TEST PLAN:\r\nThe [nm-remote-push\r\njob](https://github.com/neuralmagic/nm-vllm-ent/actions/runs/10514817969/job/29134295227#step:18:31)\r\nconveniently has a TEST failure that demonstrates the success of this\r\ncode change. [The\r\nsummary](https://github.com/neuralmagic/nm-vllm-ent/actions/runs/10514817969)\r\nshows the code coverage report for successful TEST jobs, and does not\r\nshow one for the TEST job that didn’t run the tests.\r\n\r\n\"Screenshot\r\n\"Screenshot", + "timestamp": "2024-08-29T17:25:28Z", + "url": "https://github.com/neuralmagic/nm-vllm-ent/commit/207471a9f8fb072d601f308d0c25e8e915fa4ea0" + }, + "date": 1725329692129, + "tool": "customSmallerIsBetter", + "benches": [ + { + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA L4 x 1\", \"vllm_version\": \"0.5.3.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}", + "value": 23.011060759672546, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.3.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA L4', major=8, minor=9, total_memory=22478MB, multi_processor_count=58)]\",\n \"cuda_device_names\": [\n \"NVIDIA L4\"\n ]\n },\n \"gpu_description\": \"NVIDIA L4 x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"facebook/opt-350m\",\n \"tokenizer\": \"facebook/opt-350m\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'facebook/opt-350m', 'tokenizer': 'facebook/opt-350m', 'max-model-len': 2048, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-09-03 02:05:49 UTC\",\n \"model\": \"facebook/opt-350m\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA L4 x 1\", \"vllm_version\": \"0.5.3.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}", + "value": 6.052315288260539, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.3.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA L4', major=8, minor=9, total_memory=22478MB, multi_processor_count=58)]\",\n \"cuda_device_names\": [\n \"NVIDIA L4\"\n ]\n },\n \"gpu_description\": \"NVIDIA L4 x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"facebook/opt-350m\",\n \"tokenizer\": \"facebook/opt-350m\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'facebook/opt-350m', 'tokenizer': 'facebook/opt-350m', 'max-model-len': 2048, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-09-03 02:05:49 UTC\",\n \"model\": \"facebook/opt-350m\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA L4 x 1\", \"vllm_version\": \"0.5.3.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}", + "value": 183.063268619638, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.3.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA L4', major=8, minor=9, total_memory=22478MB, multi_processor_count=58)]\",\n \"cuda_device_names\": [\n \"NVIDIA L4\"\n ]\n },\n \"gpu_description\": \"NVIDIA L4 x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"tokenizer\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-8B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-8B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-09-03 02:13:45 UTC\",\n \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA L4 x 1\", \"vllm_version\": \"0.5.3.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}", + "value": 83.62012220348369, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.3.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA L4', major=8, minor=9, total_memory=22478MB, multi_processor_count=58)]\",\n \"cuda_device_names\": [\n \"NVIDIA L4\"\n ]\n },\n \"gpu_description\": \"NVIDIA L4 x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"tokenizer\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-8B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-8B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-09-03 02:13:45 UTC\",\n \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n \"dataset\": \"sharegpt\"\n}" + } + ] } ] }