From ccf76df58457d0b9161db43b1462f5588882958d Mon Sep 17 00:00:00 2001
From: github-action-benchmark <github@users.noreply.github.com>
Date: Thu, 5 Sep 2024 02:21:16 +0000
Subject: [PATCH] add smaller_is_better (customSmallerIsBetter) benchmark
 result for ea0fbbd5bd1581d4d71ce3e8c151db7c792d3bdd

---
 dev/bench/data.js | 94 +++++++++++++++++++++++------------------------
 1 file changed, 47 insertions(+), 47 deletions(-)

diff --git a/dev/bench/data.js b/dev/bench/data.js
index 8699114..787fbba 100644
--- a/dev/bench/data.js
+++ b/dev/bench/data.js
@@ -1,54 +1,8 @@
 window.BENCHMARK_DATA = {
-  "lastUpdate": 1725502554384,
+  "lastUpdate": 1725502875984,
   "repoUrl": "https://github.com/neuralmagic/nm-vllm-ent",
   "entries": {
     "smaller_is_better": [
-      {
-        "commit": {
-          "author": {
-            "name": "Domenic Barbuzzi",
-            "username": "dbarbuzzi",
-            "email": "domenic@neuralmagic.com"
-          },
-          "committer": {
-            "name": "GitHub",
-            "username": "web-flow",
-            "email": "noreply@github.com"
-          },
-          "id": "2efcd90ff72f4ffe8637f118ad2844d9e44bf789",
-          "message": "Extended server benchmark model list (#40)\n\nThis PR updates the benchmark configs to include some a large model that\r\nrun on the 4x H100 runner.\r\n\r\nTo accomplish this, the various `benchmark_` inputs to `build-test` are\r\nconsolidated into the JSON-based `benchmark_configs` input (which\r\naccepts a list of objects containing keys: `label`, `config_file`,\r\n`timeout`).\r\n\r\nThis allows the existing benchmarking of ‘small’ models to continue\r\nrunning on the runners they already are, while allowing a new config\r\nwith ‘large’ models to run specifically on the 4x H100 machine.\r\n\r\nAdditionally, the config now supports an optionally-present property to\r\ndefine the server startup timeout, which is set on the large model\r\nconfig (60 min). This will override the default (15 min) if present. In\r\nthe course of adding this support, I fixed a handful of incorrect type\r\nhinting about the config object (as it tripped me up when I tried to use\r\nit as the advertised type, which didn’t work).",
-          "timestamp": "2024-08-16T15:16:47Z",
-          "url": "https://github.com/neuralmagic/nm-vllm-ent/commit/2efcd90ff72f4ffe8637f118ad2844d9e44bf789"
-        },
-        "date": 1724035833059,
-        "tool": "customSmallerIsBetter",
-        "benches": [
-          {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 1\", \"vllm_version\": \"0.5.3.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}",
-            "value": 38.38384596630931,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.3.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:43:11) [GCC 11.3.0]\",\n    \"torch_version\": \"2.3.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA H100 80GB HBM3\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"127.0.0.1\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n    \"tokenizer\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-8B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-8B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-08-19 02:49:20 UTC\",\n  \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 1\", \"vllm_version\": \"0.5.3.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}",
-            "value": 12.523310634132399,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.3.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:43:11) [GCC 11.3.0]\",\n    \"torch_version\": \"2.3.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA H100 80GB HBM3\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"127.0.0.1\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n    \"tokenizer\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-8B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-8B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-08-19 02:49:20 UTC\",\n  \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 1\", \"vllm_version\": \"0.5.3.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}",
-            "value": 37.30297331698239,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.3.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:43:11) [GCC 11.3.0]\",\n    \"torch_version\": \"2.3.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA H100 80GB HBM3\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"127.0.0.1\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"facebook/opt-350m\",\n    \"tokenizer\": \"facebook/opt-350m\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'facebook/opt-350m', 'tokenizer': 'facebook/opt-350m', 'max-model-len': 2048, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-08-19 02:40:30 UTC\",\n  \"model\": \"facebook/opt-350m\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 1\", \"vllm_version\": \"0.5.3.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}",
-            "value": 5.935463516049849,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.3.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:43:11) [GCC 11.3.0]\",\n    \"torch_version\": \"2.3.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA H100 80GB HBM3\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"127.0.0.1\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"facebook/opt-350m\",\n    \"tokenizer\": \"facebook/opt-350m\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'facebook/opt-350m', 'tokenizer': 'facebook/opt-350m', 'max-model-len': 2048, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-08-19 02:40:30 UTC\",\n  \"model\": \"facebook/opt-350m\",\n  \"dataset\": \"sharegpt\"\n}"
-          }
-        ]
-      },
       {
         "commit": {
           "author": {
@@ -2242,6 +2196,52 @@ window.BENCHMARK_DATA = {
             "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.3.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:43:11) [GCC 11.3.0]\",\n    \"torch_version\": \"2.3.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81049MB, multi_processor_count=108), _CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81049MB, multi_processor_count=108), _CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81049MB, multi_processor_count=108), _CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81049MB, multi_processor_count=108)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A100-SXM4-80GB\",\n      \"NVIDIA A100-SXM4-80GB\",\n      \"NVIDIA A100-SXM4-80GB\",\n      \"NVIDIA A100-SXM4-80GB\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A100-SXM4-80GB x 4\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"127.0.0.1\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n    \"tokenizer\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 4,\n    \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-70B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-70B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-09-05 02:08:28 UTC\",\n  \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
+      },
+      {
+        "commit": {
+          "author": {
+            "name": "Robert Shaw",
+            "username": "robertgshaw2-neuralmagic",
+            "email": "114415538+robertgshaw2-neuralmagic@users.noreply.github.com"
+          },
+          "committer": {
+            "name": "GitHub",
+            "username": "web-flow",
+            "email": "noreply@github.com"
+          },
+          "id": "ea0fbbd5bd1581d4d71ce3e8c151db7c792d3bdd",
+          "message": "Upstream sync 2024 08 18 (#59)\n\nSUMMARY:\r\n- Upstream sync from\r\nhttps://github.com/vllm-project/vllm/commit/4cf256ae7f8b0be8f06f6b85821e55d4f5bdaa13\r\n(`v0.5.2`) to\r\nhttps://github.com/vllm-project/vllm/commit/38c4b7e863570a045308af814c72f4504297222e\r\n(`v0.5.3.post1`)\r\n- Comprare\r\nhttps://github.com/neuralmagic/nm-vllm-ent/compare/upstream-sync-2024-08-18..upstream-v0.5.3.post1?expand=1\r\n\r\nTEST PLAN:\r\n- Automation\r\n\r\n---------\r\n\r\nSigned-off-by: kevin <kevin@anyscale.com>\r\nSigned-off-by: Thomas Parnell <tpa@zurich.ibm.com>\r\nSigned-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>\r\nSigned-off-by: Rui Qiao <ruisearch42@gmail.com>\r\nSigned-off-by: Travis Johnson <tsjohnso@us.ibm.com>\r\nCo-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>\r\nCo-authored-by: kevin <kevin@anyscale.com>\r\nCo-authored-by: Mor Zusman <mor.zusmann@gmail.com>\r\nCo-authored-by: Mor Zusman <morz@ai21.com>\r\nCo-authored-by: Joe <g-eoj@users.noreply.github.com>\r\nCo-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>\r\nCo-authored-by: sasha0552 <admin@sasha0552.org>\r\nCo-authored-by: Thomas Parnell <tpa@zurich.ibm.com>\r\nCo-authored-by: Peng Guanwen <pg999w@outlook.com>\r\nCo-authored-by: youkaichao <youkaichao@gmail.com>\r\nCo-authored-by: Jiaxin Shan <seedjeffwan@gmail.com>\r\nCo-authored-by: Cody Yu <hao.yu.cody@gmail.com>\r\nCo-authored-by: Michael Goin <michael@neuralmagic.com>\r\nCo-authored-by: Wushi Dong <33078715+wushidonguc@users.noreply.github.com>\r\nCo-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>\r\nCo-authored-by: shangmingc <csmthu@gmail.com>\r\nCo-authored-by: caishangming.csm <caishangming.csm@alibaba-inc.com>\r\nCo-authored-by: Murali Andoorveedu <37849411+andoorve@users.noreply.github.com>\r\nCo-authored-by: milo157 <43028253+milo157@users.noreply.github.com>\r\nCo-authored-by: Antoni Baum <antoni.baum@protonmail.com>\r\nCo-authored-by: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com>\r\nCo-authored-by: Varun Sundar Rabindranath <varunsundar08@gmail.com>\r\nCo-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>\r\nCo-authored-by: Nick Hill <nickhill@us.ibm.com>\r\nCo-authored-by: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>\r\nCo-authored-by: Stephanie Wang <swang@cs.berkeley.edu>\r\nCo-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>\r\nCo-authored-by: Noam Gat <noamgat@gmail.com>\r\nCo-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>\r\nCo-authored-by: Simon Mo <simon.mo@hey.com>\r\nCo-authored-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai>\r\nCo-authored-by: Woo-Yeon Lee <wooyeon0.lee@samsung.com>\r\nCo-authored-by: Daniele <36171005+dtrifiro@users.noreply.github.com>\r\nCo-authored-by: Travis Johnson <tsjohnso@us.ibm.com>\r\nCo-authored-by: Matt Wong <156021403+mawong-amd@users.noreply.github.com>\r\nCo-authored-by: Roger Wang <ywang@roblox.com>\r\nCo-authored-by: sroy745 <142070531+sroy745@users.noreply.github.com>\r\nCo-authored-by: Isotr0py <2037008807@qq.com>\r\nCo-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>\r\nCo-authored-by: Jae-Won Chung <jwnchung@umich.edu>\r\nCo-authored-by: Cheng Li <pistasable@gmail.com>\r\nCo-authored-by: zhaotyer <89376832+zhaotyer@users.noreply.github.com>\r\nCo-authored-by: tianyi.zhao <tianyi.zhao@transwarp.io>\r\nCo-authored-by: youkaichao <youkaichao@126.com>\r\nCo-authored-by: Andy Linfoot <78757007+andy-neuma@users.noreply.github.com>",
+          "timestamp": "2024-09-04T18:05:15Z",
+          "url": "https://github.com/neuralmagic/nm-vllm-ent/commit/ea0fbbd5bd1581d4d71ce3e8c151db7c792d3bdd"
+        },
+        "date": 1725502875665,
+        "tool": "customSmallerIsBetter",
+        "benches": [
+          {
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA L4 x 1\", \"vllm_version\": \"0.5.3.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}",
+            "value": 25.253582443310734,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.3.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:43:11) [GCC 11.3.0]\",\n    \"torch_version\": \"2.3.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA L4', major=8, minor=9, total_memory=22478MB, multi_processor_count=58)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA L4\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA L4 x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"127.0.0.1\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"facebook/opt-350m\",\n    \"tokenizer\": \"facebook/opt-350m\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'facebook/opt-350m', 'tokenizer': 'facebook/opt-350m', 'max-model-len': 2048, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-09-05 02:10:58 UTC\",\n  \"model\": \"facebook/opt-350m\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA L4 x 1\", \"vllm_version\": \"0.5.3.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}",
+            "value": 6.642644770058042,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.3.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:43:11) [GCC 11.3.0]\",\n    \"torch_version\": \"2.3.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA L4', major=8, minor=9, total_memory=22478MB, multi_processor_count=58)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA L4\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA L4 x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"127.0.0.1\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"facebook/opt-350m\",\n    \"tokenizer\": \"facebook/opt-350m\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'facebook/opt-350m', 'tokenizer': 'facebook/opt-350m', 'max-model-len': 2048, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-09-05 02:10:58 UTC\",\n  \"model\": \"facebook/opt-350m\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA L4 x 1\", \"vllm_version\": \"0.5.3.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}",
+            "value": 190.42992976332545,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.3.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:43:11) [GCC 11.3.0]\",\n    \"torch_version\": \"2.3.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA L4', major=8, minor=9, total_memory=22478MB, multi_processor_count=58)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA L4\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA L4 x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"127.0.0.1\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n    \"tokenizer\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-8B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-8B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-09-05 02:19:59 UTC\",\n  \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA L4 x 1\", \"vllm_version\": \"0.5.3.0\", \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}",
+            "value": 87.18913287869015,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.5.3.0\",\n    \"python_version\": \"3.10.12 (main, Jun  7 2023, 13:43:11) [GCC 11.3.0]\",\n    \"torch_version\": \"2.3.1+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA L4', major=8, minor=9, total_memory=22478MB, multi_processor_count=58)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA L4\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA L4 x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"127.0.0.1\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n    \"tokenizer\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-8B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-8B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-09-05 02:19:59 UTC\",\n  \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n  \"dataset\": \"sharegpt\"\n}"
+          }
+        ]
       }
     ]
   }