diff --git a/dev/bench/data.js b/dev/bench/data.js index de46705..5cb4bf5 100644 --- a/dev/bench/data.js +++ b/dev/bench/data.js @@ -1,42 +1,8 @@ window.BENCHMARK_DATA = { - "lastUpdate": 1725675451741, + "lastUpdate": 1725679390824, "repoUrl": "https://github.com/neuralmagic/nm-vllm-ent", "entries": { "smaller_is_better": [ - { - "commit": { - "author": { - "name": "Derek Kozikowski", - "username": "derekk-nm", - "email": "106621615+derekk-nm@users.noreply.github.com" - }, - "committer": { - "name": "GitHub", - "username": "web-flow", - "email": "noreply@github.com" - }, - "id": "d2ecadd10af5f341a15d275a27a6eb015b7ccf6c", - "message": "add support for the lm-eval task \"ifeval\" (#53)\n\n# SUMMARY:\r\n\r\nUpdates test_lm_eval_correctness to handle reporting of results for all\r\ntasks and their metrics as requested by the model configuration file.\r\nIncorporates the \"ifeval\" task with one model configured as an example.\r\nThe \"ifeval\" task has some package dependencies added to\r\nrequirements-test.txt. Includes a README.md describing the layout of the\r\nmodel configuration files.\r\n\r\nA followup PR will include `ifeval` ground truth values for the\r\nremaining models we are currently using.\r\n\r\n# TEST PLAN:\r\nI tested this locally with a simple `TinyLlama.yaml` file for the\r\nTinyLlama model. in there I configured the `ifeval` task with a\r\ndifferent `rtol`, and then modified the .yaml config file to only\r\nrequest the `gsm8k` task to demonstrate that the code will work with\r\nother models:\r\n\r\n## run results for config with two tasks\r\n\r\n| task | metric | ground_truth | measured | rtol | isclose |\r\n\r\n|:-------|:-----------------------------|---------------:|-----------:|-------:|:----------|\r\n| gsm8k | exact_match,strict-match | 0.023 | 0.023 | 0.025 | True |\r\n| gsm8k | exact_match,flexible-extract | 0.029 | 0.029 | 0.025 | True |\r\n| ifeval | prompt_level_strict_acc,none | 0.036 | 0.0351201 | 0.05 |\r\nTrue |\r\n| ifeval | inst_level_strict_acc,none | 0.078 | 0.0767386 | 0.05 | True\r\n|\r\n| ifeval | prompt_level_loose_acc,none | 0.042 | 0.0425139 | 0.05 | True\r\n|\r\n| ifeval | inst_level_loose_acc,none | 0.099 | 0.0839329 | 0.05 | False\r\n|\r\n\r\n## run results for config with one task\r\n\r\n| task | metric | ground_truth | measured | rtol | isclose |\r\n\r\n|:-------|:-----------------------------|---------------:|-----------:|-------:|:----------|\r\n| gsm8k | exact_match,strict-match | 0.023 | 0.023 | 0.025 | True |\r\n| gsm8k | exact_match,flexible-extract | 0.029 | 0.029 | 0.025 | True |", - "timestamp": "2024-08-19T18:50:34Z", - "url": "https://github.com/neuralmagic/nm-vllm-ent/commit/d2ecadd10af5f341a15d275a27a6eb015b7ccf6c" - }, - "date": 1724296129204, - "tool": "customSmallerIsBetter", - "benches": [ - { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"vllm_version\": \"0.5.3.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}", - "value": 76.14679149972896, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.3.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132)]\",\n \"cuda_device_names\": [\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\"\n ]\n },\n \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"tokenizer\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 4,\n \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-70B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-70B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-08-22 03:07:30 UTC\",\n \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"vllm_version\": \"0.5.3.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}", - "value": 24.669795447726173, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.3.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132)]\",\n \"cuda_device_names\": [\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\"\n ]\n },\n \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"tokenizer\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 4,\n \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-70B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-70B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-08-22 03:07:30 UTC\",\n \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"dataset\": \"sharegpt\"\n}" - } - ] - }, { "commit": { "author": { @@ -2266,6 +2232,52 @@ window.BENCHMARK_DATA = { "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132)]\",\n \"cuda_device_names\": [\n \"NVIDIA H100 80GB HBM3\"\n ]\n },\n \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"facebook/opt-350m\",\n \"tokenizer\": \"facebook/opt-350m\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'facebook/opt-350m', 'tokenizer': 'facebook/opt-350m', 'max-model-len': 2048, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-09-07 02:07:42 UTC\",\n \"model\": \"facebook/opt-350m\",\n \"dataset\": \"sharegpt\"\n}" } ] + }, + { + "commit": { + "author": { + "name": "dhuangnm", + "username": "dhuangnm", + "email": "74931910+dhuangnm@users.noreply.github.com" + }, + "committer": { + "name": "GitHub", + "username": "web-flow", + "email": "noreply@github.com" + }, + "id": "f04d9cfd3acadce89edd6052045dbb41826fda08", + "message": "bump up main to 0.5.4.0 (#70)\n\nSUMMARY:\r\nbump up main to 0.5.4.0\r\n\r\nTEST PLAN:\r\nAll tests\r\n\r\nCo-authored-by: dhuangnm ", + "timestamp": "2024-09-06T16:20:03Z", + "url": "https://github.com/neuralmagic/nm-vllm-ent/commit/f04d9cfd3acadce89edd6052045dbb41826fda08" + }, + "date": 1725679390388, + "tool": "customSmallerIsBetter", + "benches": [ + { + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"vllm_version\": \"0.5.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}", + "value": 219.5814475005803, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132)]\",\n \"cuda_device_names\": [\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\"\n ]\n },\n \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"tokenizer\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 4,\n \"server_args\": \"{'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'tokenizer': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-09-07 03:21:55 UTC\",\n \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"vllm_version\": \"0.5.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}", + "value": 18.037020594543065, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132)]\",\n \"cuda_device_names\": [\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\"\n ]\n },\n \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"tokenizer\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 4,\n \"server_args\": \"{'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'tokenizer': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-09-07 03:21:55 UTC\",\n \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"vllm_version\": \"0.5.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}", + "value": 71.59329962606232, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132)]\",\n \"cuda_device_names\": [\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\"\n ]\n },\n \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"tokenizer\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 4,\n \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-70B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-70B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-09-07 03:01:24 UTC\",\n \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"vllm_version\": \"0.5.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\", \"torch_version\": \"2.3.1+cu121\"}", + "value": 24.08591213568051, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]\",\n \"torch_version\": \"2.3.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=81116MB, multi_processor_count=132)]\",\n \"cuda_device_names\": [\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\",\n \"NVIDIA H100 80GB HBM3\"\n ]\n },\n \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"127.0.0.1\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"tokenizer\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 4,\n \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-70B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-70B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-09-07 03:01:24 UTC\",\n \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n \"dataset\": \"sharegpt\"\n}" + } + ] } ] }