diff --git a/dev/bench/data.js b/dev/bench/data.js
index f20782b..3ffa047 100644
--- a/dev/bench/data.js
+++ b/dev/bench/data.js
@@ -1,54 +1,8 @@
 window.BENCHMARK_DATA = {
-  "lastUpdate": 1732673167029,
+  "lastUpdate": 1732673302314,
   "repoUrl": "https://github.com/neuralmagic/nm-vllm-ent",
   "entries": {
     "smaller_is_better": [
-      {
-        "commit": {
-          "author": {
-            "name": "Andy Linfoot",
-            "username": "andy-neuma",
-            "email": "78757007+andy-neuma@users.noreply.github.com"
-          },
-          "committer": {
-            "name": "GitHub",
-            "username": "web-flow",
-            "email": "noreply@github.com"
-          },
-          "id": "7c992e25436127a48dfb9a168bbcd28470ddb986",
-          "message": "initial test fan out (#117)\n\nSUMMARY:\r\n* update python script that parses and processes \"buildkite\" yaml\r\n* added `partitions` outputs to \"nm build\" workflow\r\n* add Neural Magic partition files\r\n* update helper scripts\r\n* adjust workflows and add actions for updated style\r\n\r\nNOTES: this generates a json data structure which can be used to iterate\r\nover to run tests. some benefits of this approach are:\r\n* programmatically fanning out\r\n* setting ENV's locally to the partitions where they are necessary\r\n* localizing pip installs to the partitions where they are necessary\r\n* running any other commands necessary for correct test execution\r\n\r\nTESTPLAN:\r\ndogfooding\r\n\r\n---------\r\n\r\nCo-authored-by: andy-neuma <andy@neuralmagic.com>",
-          "timestamp": "2024-11-15T18:31:15Z",
-          "url": "https://github.com/neuralmagic/nm-vllm-ent/commit/7c992e25436127a48dfb9a168bbcd28470ddb986"
-        },
-        "date": 1731810261605,
-        "tool": "customSmallerIsBetter",
-        "benches": [
-          {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A100-SXM4-80GB x 1\", \"vllm_version\": \"0.6.3.0.20241117\", \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\", \"torch_version\": \"2.4.0+cu121\"}",
-            "value": 61.327604623511434,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.6.3.0.20241117\",\n    \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\",\n    \"torch_version\": \"2.4.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81049MB, multi_processor_count=108)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A100-SXM4-80GB\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A100-SXM4-80GB x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"127.0.0.1\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n    \"tokenizer\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-8B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-8B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-11-17 02:23:02 UTC\",\n  \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A100-SXM4-80GB x 1\", \"vllm_version\": \"0.6.3.0.20241117\", \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\", \"torch_version\": \"2.4.0+cu121\"}",
-            "value": 14.410766984397478,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.6.3.0.20241117\",\n    \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\",\n    \"torch_version\": \"2.4.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81049MB, multi_processor_count=108)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A100-SXM4-80GB\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A100-SXM4-80GB x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"127.0.0.1\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n    \"tokenizer\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-8B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-8B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-11-17 02:23:02 UTC\",\n  \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A100-SXM4-80GB x 1\", \"vllm_version\": \"0.6.3.0.20241117\", \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\", \"torch_version\": \"2.4.0+cu121\"}",
-            "value": 33.37820377511283,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.6.3.0.20241117\",\n    \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\",\n    \"torch_version\": \"2.4.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81049MB, multi_processor_count=108)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A100-SXM4-80GB\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A100-SXM4-80GB x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"127.0.0.1\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"facebook/opt-350m\",\n    \"tokenizer\": \"facebook/opt-350m\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'facebook/opt-350m', 'tokenizer': 'facebook/opt-350m', 'max-model-len': 2048, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-11-17 02:16:56 UTC\",\n  \"model\": \"facebook/opt-350m\",\n  \"dataset\": \"sharegpt\"\n}"
-          },
-          {
-            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A100-SXM4-80GB x 1\", \"vllm_version\": \"0.6.3.0.20241117\", \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\", \"torch_version\": \"2.4.0+cu121\"}",
-            "value": 4.131654235661128,
-            "unit": "ms",
-            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.6.3.0.20241117\",\n    \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\",\n    \"torch_version\": \"2.4.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81049MB, multi_processor_count=108)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA A100-SXM4-80GB\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA A100-SXM4-80GB x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"127.0.0.1\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"facebook/opt-350m\",\n    \"tokenizer\": \"facebook/opt-350m\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'facebook/opt-350m', 'tokenizer': 'facebook/opt-350m', 'max-model-len': 2048, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-11-17 02:16:56 UTC\",\n  \"model\": \"facebook/opt-350m\",\n  \"dataset\": \"sharegpt\"\n}"
-          }
-        ]
-      },
       {
         "commit": {
           "author": {
@@ -2302,6 +2256,52 @@ window.BENCHMARK_DATA = {
             "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.6.3.0.20241127\",\n    \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\",\n    \"torch_version\": \"2.4.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA H100 80GB HBM3\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"127.0.0.1\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n    \"tokenizer\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-8B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-8B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-11-27 02:04:51 UTC\",\n  \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n  \"dataset\": \"sharegpt\"\n}"
           }
         ]
+      },
+      {
+        "commit": {
+          "author": {
+            "name": "derekk-nm",
+            "username": "derekk-nm",
+            "email": "derek@neuralmagic.com"
+          },
+          "committer": {
+            "name": "derekk-nm",
+            "username": "derekk-nm",
+            "email": "derek@neuralmagic.com"
+          },
+          "id": "1dd5c0d394263862ffebf2469cfc5493122aa3cf",
+          "message": "fix path to models",
+          "timestamp": "2024-11-25T22:05:13Z",
+          "url": "https://github.com/neuralmagic/nm-vllm-ent/commit/1dd5c0d394263862ffebf2469cfc5493122aa3cf"
+        },
+        "date": 1732673301128,
+        "tool": "customSmallerIsBetter",
+        "benches": [
+          {
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA L4 x 1\", \"vllm_version\": \"0.6.3.0.20241127\", \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\", \"torch_version\": \"2.4.0+cu121\"}",
+            "value": 35.79189232315305,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.6.3.0.20241127\",\n    \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\",\n    \"torch_version\": \"2.4.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA L4', major=8, minor=9, total_memory=22478MB, multi_processor_count=58)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA L4\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA L4 x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"127.0.0.1\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"facebook/opt-350m\",\n    \"tokenizer\": \"facebook/opt-350m\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'facebook/opt-350m', 'tokenizer': 'facebook/opt-350m', 'max-model-len': 2048, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-11-27 01:57:46 UTC\",\n  \"model\": \"facebook/opt-350m\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA L4 x 1\", \"vllm_version\": \"0.6.3.0.20241127\", \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\", \"torch_version\": \"2.4.0+cu121\"}",
+            "value": 5.958700279832005,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.6.3.0.20241127\",\n    \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\",\n    \"torch_version\": \"2.4.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA L4', major=8, minor=9, total_memory=22478MB, multi_processor_count=58)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA L4\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA L4 x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"127.0.0.1\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"facebook/opt-350m\",\n    \"tokenizer\": \"facebook/opt-350m\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'facebook/opt-350m', 'tokenizer': 'facebook/opt-350m', 'max-model-len': 2048, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-11-27 01:57:46 UTC\",\n  \"model\": \"facebook/opt-350m\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA L4 x 1\", \"vllm_version\": \"0.6.3.0.20241127\", \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\", \"torch_version\": \"2.4.0+cu121\"}",
+            "value": 229.6344010636191,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.6.3.0.20241127\",\n    \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\",\n    \"torch_version\": \"2.4.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA L4', major=8, minor=9, total_memory=22478MB, multi_processor_count=58)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA L4\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA L4 x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"127.0.0.1\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n    \"tokenizer\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-8B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-8B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-11-27 02:07:13 UTC\",\n  \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n  \"dataset\": \"sharegpt\"\n}"
+          },
+          {
+            "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA L4 x 1\", \"vllm_version\": \"0.6.3.0.20241127\", \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\", \"torch_version\": \"2.4.0+cu121\"}",
+            "value": 82.30920144258917,
+            "unit": "ms",
+            "extra": "{\n  \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n  \"benchmarking_context\": {\n    \"vllm_version\": \"0.6.3.0.20241127\",\n    \"python_version\": \"3.10.12 (main, Sep 30 2024, 21:31:31) [GCC 11.4.0]\",\n    \"torch_version\": \"2.4.0+cu121\",\n    \"torch_cuda_version\": \"12.1\",\n    \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA L4', major=8, minor=9, total_memory=22478MB, multi_processor_count=58)]\",\n    \"cuda_device_names\": [\n      \"NVIDIA L4\"\n    ]\n  },\n  \"gpu_description\": \"NVIDIA L4 x 1\",\n  \"script_name\": \"benchmark_serving.py\",\n  \"script_args\": {\n    \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n  \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n  \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n    \"backend\": \"vllm\",\n    \"version\": \"N/A\",\n    \"base_url\": null,\n    \"host\": \"127.0.0.1\",\n    \"port\": 9000,\n    \"endpoint\": \"/generate\",\n    \"dataset\": \"sharegpt\",\n    \"num_input_tokens\": null,\n    \"num_output_tokens\": null,\n    \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n    \"tokenizer\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n    \"best_of\": 1,\n    \"use_beam_search\": false,\n    \"log_model_io\": false,\n    \"seed\": 0,\n    \"trust_remote_code\": false,\n    \"disable_tqdm\": false,\n    \"save_directory\": \"benchmark-results\",\n    \"num_prompts_\": null,\n    \"request_rate_\": null,\n    \"nr_qps_pair_\": [\n      300,\n      \"1.0\"\n    ],\n    \"server_tensor_parallel_size\": 1,\n    \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-8B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-8B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n  },\n  \"date\": \"2024-11-27 02:07:13 UTC\",\n  \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\",\n  \"dataset\": \"sharegpt\"\n}"
+          }
+        ]
       }
     ]
   }