From f069775002cf8fd0f3f91d6232b92de583cbb1af Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Thu, 26 Sep 2024 11:42:38 +0530 Subject: [PATCH 001/111] initial commit for llama2 gh action --- .../test-mlperf-inference-llama2.yml | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 .github/workflows/test-mlperf-inference-llama2.yml diff --git a/.github/workflows/test-mlperf-inference-llama2.yml b/.github/workflows/test-mlperf-inference-llama2.yml new file mode 100644 index 0000000000..b5e65a0a7e --- /dev/null +++ b/.github/workflows/test-mlperf-inference-llama2.yml @@ -0,0 +1,62 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: MLPerf inference GPT-J + +on: + pull_request: + branches: [ "main1", "dev1" ] + paths: + - '.github/workflows/test-mlperf-inference-gptj.yml' + - '**' + - '!**.md' + +jobs: + build_reference: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: [ "3.12" ] + backend: [ "pytorch" ] + device: [ "cpu", "cuda" ] + + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python3 -m pip install cmind + cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} + - name: Test MLPerf Inference LLAMA 2 70B reference implementation + run: | + cm run script --tags=run-mlperf,inference,_find-performance,_full_r4.1-dev --model=llama2-70b-99 --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --target_qps=0.01 --rerun + + build_nvidia: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: [ "3.12" ] + backend: [ "pytorch" ] + device: [ "cuda" ] + path_to_llama2_dataset_pickle_file: [ "/mnt/llama2/inference/inference/open_orca/open_orca_gpt4_tokenized_llama.sampled_24576.pkl" ] + + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python3 -m pip install cmind + cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} + - name: Test MLPerf Inference LLAMA 2 70B NVIDIA implementation + run: | + cm run script --tags=run-mlperf,inference,_find-performance,_submission,_short --submitter="cTuning" --model=llama2-70b-99 --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --tp_size=2 --nvidia_llama2_dataset_file_path=${{ matrix.path_to_llama2_dataset_pickle_file }} --target_qps=0.01 --rerun \ No newline at end of file From 6ecd8b4d01ca785e613aa30fabf55532dd05bd1d Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Thu, 26 Sep 2024 12:12:41 +0530 Subject: [PATCH 002/111] updated run cmds --- .github/workflows/test-mlperf-inference-llama2.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-mlperf-inference-llama2.yml b/.github/workflows/test-mlperf-inference-llama2.yml index b5e65a0a7e..194665e979 100644 --- a/.github/workflows/test-mlperf-inference-llama2.yml +++ b/.github/workflows/test-mlperf-inference-llama2.yml @@ -34,7 +34,7 @@ jobs: cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} - name: Test MLPerf Inference LLAMA 2 70B reference implementation run: | - cm run script --tags=run-mlperf,inference,_find-performance,_full_r4.1-dev --model=llama2-70b-99 --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --target_qps=0.01 --rerun + cm run script --tags=run-mlperf,inference,generate-run-cmds,_submission,_short --submitter="cTuning" --model=llama2-70b-99 --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --target_qps=1 --rerun build_nvidia: @@ -43,7 +43,7 @@ jobs: fail-fast: false matrix: python-version: [ "3.12" ] - backend: [ "pytorch" ] + backend: [ "tensorrt" ] device: [ "cuda" ] path_to_llama2_dataset_pickle_file: [ "/mnt/llama2/inference/inference/open_orca/open_orca_gpt4_tokenized_llama.sampled_24576.pkl" ] @@ -59,4 +59,4 @@ jobs: cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} - name: Test MLPerf Inference LLAMA 2 70B NVIDIA implementation run: | - cm run script --tags=run-mlperf,inference,_find-performance,_submission,_short --submitter="cTuning" --model=llama2-70b-99 --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --tp_size=2 --nvidia_llama2_dataset_file_path=${{ matrix.path_to_llama2_dataset_pickle_file }} --target_qps=0.01 --rerun \ No newline at end of file + cm run script --tags=run-mlperf,inference,generate-run-cmds,_submission,_short --submitter="cTuning" --model=llama2-70b-99 --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --tp_size=2 --nvidia_llama2_dataset_file_path=${{ matrix.path_to_llama2_dataset_pickle_file }} --target_qps=1 --rerun \ No newline at end of file From 62c6d9fccfd5ea7d2c02fca2ff6e59436399a942 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Thu, 26 Sep 2024 12:16:54 +0530 Subject: [PATCH 003/111] added clean tad --- .github/workflows/test-mlperf-inference-llama2.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-mlperf-inference-llama2.yml b/.github/workflows/test-mlperf-inference-llama2.yml index 194665e979..4f048acf0c 100644 --- a/.github/workflows/test-mlperf-inference-llama2.yml +++ b/.github/workflows/test-mlperf-inference-llama2.yml @@ -34,7 +34,7 @@ jobs: cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} - name: Test MLPerf Inference LLAMA 2 70B reference implementation run: | - cm run script --tags=run-mlperf,inference,generate-run-cmds,_submission,_short --submitter="cTuning" --model=llama2-70b-99 --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --target_qps=1 --rerun + cm run script --tags=run-mlperf,inference,generate-run-cmds,_submission,_short --submitter="cTuning" --model=llama2-70b-99 --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --target_qps=1 --clean build_nvidia: @@ -59,4 +59,4 @@ jobs: cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} - name: Test MLPerf Inference LLAMA 2 70B NVIDIA implementation run: | - cm run script --tags=run-mlperf,inference,generate-run-cmds,_submission,_short --submitter="cTuning" --model=llama2-70b-99 --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --tp_size=2 --nvidia_llama2_dataset_file_path=${{ matrix.path_to_llama2_dataset_pickle_file }} --target_qps=1 --rerun \ No newline at end of file + cm run script --tags=run-mlperf,inference,generate-run-cmds,_submission,_short --submitter="cTuning" --model=llama2-70b-99 --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --tp_size=2 --nvidia_llama2_dataset_file_path=${{ matrix.path_to_llama2_dataset_pickle_file }} --target_qps=1 --clean \ No newline at end of file From 1fb631f5bae3497d1b7f9afbe0f92c798261083a Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Thu, 26 Sep 2024 12:37:09 +0530 Subject: [PATCH 004/111] added submission tags --- .../test-mlperf-inference-llama2.yml | 27 +++++++------------ 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/.github/workflows/test-mlperf-inference-llama2.yml b/.github/workflows/test-mlperf-inference-llama2.yml index 4f048acf0c..b8b19b0de6 100644 --- a/.github/workflows/test-mlperf-inference-llama2.yml +++ b/.github/workflows/test-mlperf-inference-llama2.yml @@ -1,20 +1,16 @@ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions -name: MLPerf inference GPT-J +name: MLPerf inference LLAMA 2 70B on: - pull_request: - branches: [ "main1", "dev1" ] - paths: - - '.github/workflows/test-mlperf-inference-gptj.yml' - - '**' - - '!**.md' + schedule: + - cron: "1 2 * * *" jobs: build_reference: - - runs-on: ubuntu-latest + if: github.repository_owner == 'gateoverflow' + runs-on: [ self-hosted, linux, x64 ] strategy: fail-fast: false matrix: @@ -23,18 +19,15 @@ jobs: device: [ "cpu", "cuda" ] steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 - with: - python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python3 -m pip install cmind + source gh_action/bin/deactivate || python3 -m venv gh_action + source gh_action/bin/activate + export CM_REPOS=$HOME/GH_CM cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} - name: Test MLPerf Inference LLAMA 2 70B reference implementation run: | - cm run script --tags=run-mlperf,inference,generate-run-cmds,_submission,_short --submitter="cTuning" --model=llama2-70b-99 --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --target_qps=1 --clean + cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --model=llama2-70b-99 --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --target_qps=1 --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean build_nvidia: @@ -59,4 +52,4 @@ jobs: cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} - name: Test MLPerf Inference LLAMA 2 70B NVIDIA implementation run: | - cm run script --tags=run-mlperf,inference,generate-run-cmds,_submission,_short --submitter="cTuning" --model=llama2-70b-99 --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --tp_size=2 --nvidia_llama2_dataset_file_path=${{ matrix.path_to_llama2_dataset_pickle_file }} --target_qps=1 --clean \ No newline at end of file + cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --model=llama2-70b-99 --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --tp_size=2 --nvidia_llama2_dataset_file_path=${{ matrix.path_to_llama2_dataset_pickle_file }} --target_qps=1 --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean \ No newline at end of file From 909474e4042534a1afa9ea00ce5b3dd221eec25d Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 26 Sep 2024 12:45:20 +0530 Subject: [PATCH 005/111] updated to only run if owner is gateoverflow --- .github/workflows/test-mlperf-inference-llama2.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-mlperf-inference-llama2.yml b/.github/workflows/test-mlperf-inference-llama2.yml index b8b19b0de6..509bab875a 100644 --- a/.github/workflows/test-mlperf-inference-llama2.yml +++ b/.github/workflows/test-mlperf-inference-llama2.yml @@ -30,8 +30,8 @@ jobs: cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --model=llama2-70b-99 --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --target_qps=1 --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean build_nvidia: - - runs-on: ubuntu-latest + if: github.repository_owner == 'gateoverflow' + runs-on: [ self-hosted, linux, x64 ] strategy: fail-fast: false matrix: @@ -52,4 +52,4 @@ jobs: cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} - name: Test MLPerf Inference LLAMA 2 70B NVIDIA implementation run: | - cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --model=llama2-70b-99 --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --tp_size=2 --nvidia_llama2_dataset_file_path=${{ matrix.path_to_llama2_dataset_pickle_file }} --target_qps=1 --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean \ No newline at end of file + cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --model=llama2-70b-99 --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --tp_size=2 --nvidia_llama2_dataset_file_path=${{ matrix.path_to_llama2_dataset_pickle_file }} --target_qps=1 --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean From 71357151b529eb834b9afb0e9fe040e619675668 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 26 Sep 2024 14:50:04 +0530 Subject: [PATCH 006/111] Added additional env key for host dataset download --- script/get-preprocessed-dataset-kits19/customize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/script/get-preprocessed-dataset-kits19/customize.py b/script/get-preprocessed-dataset-kits19/customize.py index 8de0593753..c8a0914d24 100644 --- a/script/get-preprocessed-dataset-kits19/customize.py +++ b/script/get-preprocessed-dataset-kits19/customize.py @@ -17,5 +17,6 @@ def postprocess(i): env = i['env'] if 'CM_DATASET_PREPROCESSED_PATH' not in env: env['CM_DATASET_PREPROCESSED_PATH'] = os.getcwd() + env['CM_DATASET_KITS19_PREPROCESSED_PATH'] = env['CM_DATASET_PREPROCESSED_PATH'] return {'return': 0} From e30dfe0ed4aa4500a81b36465f33bd381eac13a6 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 26 Sep 2024 14:50:49 +0530 Subject: [PATCH 007/111] Updated env key for 3d unet --- script/app-mlperf-inference-mlcommons-python/customize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/app-mlperf-inference-mlcommons-python/customize.py b/script/app-mlperf-inference-mlcommons-python/customize.py index 9d0a649554..aaea9fd96d 100644 --- a/script/app-mlperf-inference-mlcommons-python/customize.py +++ b/script/app-mlperf-inference-mlcommons-python/customize.py @@ -348,7 +348,7 @@ def get_run_cmd_reference(os_info, env, scenario_extra_options, mode_extra_optio cmd = env['CM_PYTHON_BIN_WITH_PATH']+ " run.py --backend=" + backend + " --scenario="+env['CM_MLPERF_LOADGEN_SCENARIO'] + \ env['CM_MLPERF_LOADGEN_EXTRA_OPTIONS'] + \ " --model="+env['CM_ML_MODEL_FILE_WITH_PATH'] + \ - " --preprocessed_data_dir="+env['CM_DATASET_PREPROCESSED_PATH'] + \ + " --preprocessed_data_dir="+env['CM_DATASET_KITS19_PREPROCESSED_PATH'] + \ scenario_extra_options + mode_extra_options + dataset_options env['LOG_PATH'] = env['CM_MLPERF_OUTPUT_DIR'] From b2808ffa3122310e547198d136e4f74f566b469c Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 26 Sep 2024 14:52:45 +0530 Subject: [PATCH 008/111] Modified for host model download - 3d unet --- script/app-mlperf-inference/_cm.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/script/app-mlperf-inference/_cm.yaml b/script/app-mlperf-inference/_cm.yaml index 82e67641d8..5e850eb70d 100644 --- a/script/app-mlperf-inference/_cm.yaml +++ b/script/app-mlperf-inference/_cm.yaml @@ -656,6 +656,14 @@ variations: - 3d-unet-accuracy-script tags: run,accuracy,mlperf,_kits19,_int8 + 3d-unet,reference: + docker: + deps: + - enable_if_env: + CM_MLPERF_DATASET_3DUNET_DOWNLOAD_TO_HOST: + - 'yes' + tags: get,dataset,kits19,preprocessed + sdxl: group: model @@ -1587,6 +1595,7 @@ docker: - "${{ DLRM_DATA_PATH }}:/home/mlperf_inf_dlrmv2" - "${{ CM_NVIDIA_LLAMA_DATASET_FILE_PATH }}:${{ CM_NVIDIA_LLAMA_DATASET_FILE_PATH }}" - "${{ SDXL_CHECKPOINT_PATH }}:${{ SDXL_CHECKPOINT_PATH }}" + - "${{ CM_DATASET_KITS19_PREPROCESSED_PATH }}:${{ CM_DATASET_KITS19_PREPROCESSED_PATH }}" skip_run_cmd: 'no' shm_size: '32gb' interactive: True From 76451bcbcac6c86e22ff5e07384c958287084da7 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 26 Sep 2024 15:03:51 +0530 Subject: [PATCH 009/111] handle model accuracy variants --- script/app-mlperf-inference/_cm.yaml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/script/app-mlperf-inference/_cm.yaml b/script/app-mlperf-inference/_cm.yaml index 5e850eb70d..e881e88b77 100644 --- a/script/app-mlperf-inference/_cm.yaml +++ b/script/app-mlperf-inference/_cm.yaml @@ -656,7 +656,15 @@ variations: - 3d-unet-accuracy-script tags: run,accuracy,mlperf,_kits19,_int8 - 3d-unet,reference: + 3d-unet-99,reference: + docker: + deps: + - enable_if_env: + CM_MLPERF_DATASET_3DUNET_DOWNLOAD_TO_HOST: + - 'yes' + tags: get,dataset,kits19,preprocessed + + 3d-unet-99.9,reference: docker: deps: - enable_if_env: From 46e5a289cdc6b84966d021e5bf08447f4a0be63d Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 26 Sep 2024 16:47:39 +0530 Subject: [PATCH 010/111] 3d unet dataset made as prehook deps --- .../_cm.yaml | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/script/app-mlperf-inference-mlcommons-python/_cm.yaml b/script/app-mlperf-inference-mlcommons-python/_cm.yaml index df7a5a1d7a..cf4c89a886 100644 --- a/script/app-mlperf-inference-mlcommons-python/_cm.yaml +++ b/script/app-mlperf-inference-mlcommons-python/_cm.yaml @@ -558,15 +558,6 @@ deps: CM_MODEL: - mixtral-8x7b - ## Kits19 for 3d-unet - - tags: get,dataset,kits19,preprocessed - names: - - kits19-preprocessed - enable_if_env: - CM_MODEL: - - 3d-unet-99 - - 3d-unet-99.9 - ## Librispeech for rnnt - tags: get,dataset,librispeech,preprocessed names: @@ -620,6 +611,12 @@ deps: - tags: get,generic-python-lib,_package.psutil prehook_deps: + - enable_if_env: + CM_REQUIRE_3DUNET_DATASET_DOWNLOAD: + - 'yes' + names: + - kits19-preprocessed + tags: get,dataset,kits19,preprocessed - names: - remote-run-cmds tags: remote,run,cmds From 6380c8fde800e1274d7f86633f6a3c5734457267 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Thu, 26 Sep 2024 17:36:48 +0530 Subject: [PATCH 011/111] Revert "3d unet dataset made as prehook deps" This reverts commit 46e5a289cdc6b84966d021e5bf08447f4a0be63d. --- .../_cm.yaml | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/script/app-mlperf-inference-mlcommons-python/_cm.yaml b/script/app-mlperf-inference-mlcommons-python/_cm.yaml index cf4c89a886..df7a5a1d7a 100644 --- a/script/app-mlperf-inference-mlcommons-python/_cm.yaml +++ b/script/app-mlperf-inference-mlcommons-python/_cm.yaml @@ -558,6 +558,15 @@ deps: CM_MODEL: - mixtral-8x7b + ## Kits19 for 3d-unet + - tags: get,dataset,kits19,preprocessed + names: + - kits19-preprocessed + enable_if_env: + CM_MODEL: + - 3d-unet-99 + - 3d-unet-99.9 + ## Librispeech for rnnt - tags: get,dataset,librispeech,preprocessed names: @@ -611,12 +620,6 @@ deps: - tags: get,generic-python-lib,_package.psutil prehook_deps: - - enable_if_env: - CM_REQUIRE_3DUNET_DATASET_DOWNLOAD: - - 'yes' - names: - - kits19-preprocessed - tags: get,dataset,kits19,preprocessed - names: - remote-run-cmds tags: remote,run,cmds From 9c6098e879a6817e1581fe3281c0beee37104084 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Thu, 26 Sep 2024 17:39:44 +0530 Subject: [PATCH 012/111] skips model download if model download to host is enabled --- script/app-mlperf-inference-mlcommons-python/_cm.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/script/app-mlperf-inference-mlcommons-python/_cm.yaml b/script/app-mlperf-inference-mlcommons-python/_cm.yaml index df7a5a1d7a..3fa0e7d316 100644 --- a/script/app-mlperf-inference-mlcommons-python/_cm.yaml +++ b/script/app-mlperf-inference-mlcommons-python/_cm.yaml @@ -446,6 +446,9 @@ deps: CM_MODEL: - 3d-unet-99 - 3d-unet-99.9 + skip_if_env: + CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST: + - 'yes' ## Rnnt - tags: get,ml-model,speech-recognition,rnnt From e0a79b14dc23ca0c0568d21be8bd98c7209a589a Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 26 Sep 2024 18:02:42 +0530 Subject: [PATCH 013/111] Changed for running in GO-i9 system --- .../test-mlperf-inference-llama2.yml | 31 ++----------------- 1 file changed, 3 insertions(+), 28 deletions(-) diff --git a/.github/workflows/test-mlperf-inference-llama2.yml b/.github/workflows/test-mlperf-inference-llama2.yml index 509bab875a..ab81f46a95 100644 --- a/.github/workflows/test-mlperf-inference-llama2.yml +++ b/.github/workflows/test-mlperf-inference-llama2.yml @@ -5,18 +5,18 @@ name: MLPerf inference LLAMA 2 70B on: schedule: - - cron: "1 2 * * *" + - cron: "30 19 * * *" jobs: build_reference: if: github.repository_owner == 'gateoverflow' - runs-on: [ self-hosted, linux, x64 ] + runs-on: [ self-hosted, GO-i9, linux, x64 ] strategy: fail-fast: false matrix: python-version: [ "3.12" ] backend: [ "pytorch" ] - device: [ "cpu", "cuda" ] + device: [ "cpu" ] steps: - name: Install dependencies @@ -28,28 +28,3 @@ jobs: - name: Test MLPerf Inference LLAMA 2 70B reference implementation run: | cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --model=llama2-70b-99 --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --target_qps=1 --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean - - build_nvidia: - if: github.repository_owner == 'gateoverflow' - runs-on: [ self-hosted, linux, x64 ] - strategy: - fail-fast: false - matrix: - python-version: [ "3.12" ] - backend: [ "tensorrt" ] - device: [ "cuda" ] - path_to_llama2_dataset_pickle_file: [ "/mnt/llama2/inference/inference/open_orca/open_orca_gpt4_tokenized_llama.sampled_24576.pkl" ] - - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python3 -m pip install cmind - cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} - - name: Test MLPerf Inference LLAMA 2 70B NVIDIA implementation - run: | - cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --model=llama2-70b-99 --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --tp_size=2 --nvidia_llama2_dataset_file_path=${{ matrix.path_to_llama2_dataset_pickle_file }} --target_qps=1 --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean From 7935270301c7035bb15a53dd042c188f3914118f Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 26 Sep 2024 14:16:07 +0100 Subject: [PATCH 014/111] Update and rename check-broken-links.md to check-broken-links.yml --- .github/workflows/check-broken-links.md | 17 ----------------- .github/workflows/check-broken-links.yml | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 17 deletions(-) delete mode 100644 .github/workflows/check-broken-links.md create mode 100644 .github/workflows/check-broken-links.yml diff --git a/.github/workflows/check-broken-links.md b/.github/workflows/check-broken-links.md deleted file mode 100644 index a753ec75ba..0000000000 --- a/.github/workflows/check-broken-links.md +++ /dev/null @@ -1,17 +0,0 @@ -name: Check .md README files for broken links - -on: [pull_request] - -jobs: - markdown-link-check: - runs-on: ubuntu-latest - # check out the latest version of the code - steps: - - uses: actions/checkout@v3 - - # Checks the status of hyperlinks in .md files in verbose mode - - name: Check links - uses: gaurav-nelson/github-action-markdown-link-check@v1 - with: - use-quiet-mode: 'yes' - check-modified-files-only: 'yes' diff --git a/.github/workflows/check-broken-links.yml b/.github/workflows/check-broken-links.yml new file mode 100644 index 0000000000..c297adbb60 --- /dev/null +++ b/.github/workflows/check-broken-links.yml @@ -0,0 +1,14 @@ +name: Linkspector +on: [pull_request] +jobs: + check-links: + name: runner / linkspector + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Run linkspector + uses: umbrelladocs/action-linkspector@v1 + with: + github_token: ${{ secrets.github_token }} + reporter: github-pr-review + fail_on_error: true From e4fc59003148cc0e399b9c49eb85e88243d42111 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 26 Sep 2024 19:09:22 +0530 Subject: [PATCH 015/111] Fix model deps for nvidia mlperf inference sdxl --- script/app-mlperf-inference-nvidia/customize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/script/app-mlperf-inference-nvidia/customize.py b/script/app-mlperf-inference-nvidia/customize.py index 9171027018..f7c116b145 100644 --- a/script/app-mlperf-inference-nvidia/customize.py +++ b/script/app-mlperf-inference-nvidia/customize.py @@ -232,6 +232,7 @@ def preprocess(i): for folder in folders: onnx_model_path = os.path.join(env['MLPERF_SCRATCH_PATH'], 'models', 'SDXL', 'onnx_models', folder, 'model.onnx') if not os.path.exists(onnx_model_path): + env['CM_REQUIRE_SDXL_MODEL_DOWNLOAD'] = 'yes' cmds.append(f"make download_model BENCHMARKS='{model_name}'") break else: From dd50c6e126067427a4b192fdc13223dcec1aff6f Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 26 Sep 2024 19:41:41 +0530 Subject: [PATCH 016/111] Fix tflite dependency for app-mlperf-inference-mlcommons-python --- .../_cm.json | 2 +- .../_cm.yaml | 16 ++++++---------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/script/app-mlperf-inference-ctuning-cpp-tflite/_cm.json b/script/app-mlperf-inference-ctuning-cpp-tflite/_cm.json index 1d00f3c819..03e91596a6 100644 --- a/script/app-mlperf-inference-ctuning-cpp-tflite/_cm.json +++ b/script/app-mlperf-inference-ctuning-cpp-tflite/_cm.json @@ -120,7 +120,7 @@ { "names": [ "tensorflow", - "tflite" + "tflite" ], "tags": "get,tensorflow,lib,_tflite" }, diff --git a/script/app-mlperf-inference-mlcommons-python/_cm.yaml b/script/app-mlperf-inference-mlcommons-python/_cm.yaml index df7a5a1d7a..84fef2ff9d 100644 --- a/script/app-mlperf-inference-mlcommons-python/_cm.yaml +++ b/script/app-mlperf-inference-mlcommons-python/_cm.yaml @@ -295,7 +295,6 @@ deps: enable_if_env: CM_MLPERF_BACKEND: - tf - - tflite ## NCNN - tags: get,generic-python-lib,_package.ncnn @@ -305,15 +304,12 @@ deps: CM_MLPERF_BACKEND: - ncnn - # - tags: get,generic-python-lib - # names: - # - ml-engine-tflite - # enable_if_env: - # CM_MLPERF_BACKEND: - # - tflite - # CM_MLPERF_DEVICE: - # - tpu - + - tags: get,tensorflow,lib,_tflite + names: + - ml-engine-tflite + enable_if_env: + CM_MLPERF_BACKEND: + - tflite ######################################################################## From 07d0a6980e03ef7dbe9c05ce753515db67580448 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 26 Sep 2024 15:41:57 +0100 Subject: [PATCH 017/111] Update check-broken-links.yml --- .github/workflows/check-broken-links.yml | 28 ++++++++++++++---------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/.github/workflows/check-broken-links.yml b/.github/workflows/check-broken-links.yml index c297adbb60..af257ebf4a 100644 --- a/.github/workflows/check-broken-links.yml +++ b/.github/workflows/check-broken-links.yml @@ -1,14 +1,20 @@ -name: Linkspector -on: [pull_request] +name: "Check .md README files for broken links" + +on: + push: + branches: + - master + jobs: - check-links: - name: runner / linkspector + markdown-link-check: runs-on: ubuntu-latest + # check out the latest version of the code steps: - - uses: actions/checkout@v4 - - name: Run linkspector - uses: umbrelladocs/action-linkspector@v1 - with: - github_token: ${{ secrets.github_token }} - reporter: github-pr-review - fail_on_error: true + - uses: actions/checkout@v4 + + # Checks the status of hyperlinks in .md files in verbose mode + - name: Check links + uses: gaurav-nelson/github-action-markdown-link-check@v1 + with: + use-quiet-mode: 'yes' + check-modified-files-only: 'yes' From 9472060f16e310a11e7547b1984c4674372208e9 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 26 Sep 2024 20:13:48 +0530 Subject: [PATCH 018/111] fix typo --- script/app-mlperf-inference-mlcommons-python/_cm.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/app-mlperf-inference-mlcommons-python/_cm.yaml b/script/app-mlperf-inference-mlcommons-python/_cm.yaml index 3fa0e7d316..4b5d873b0a 100644 --- a/script/app-mlperf-inference-mlcommons-python/_cm.yaml +++ b/script/app-mlperf-inference-mlcommons-python/_cm.yaml @@ -447,7 +447,7 @@ deps: - 3d-unet-99 - 3d-unet-99.9 skip_if_env: - CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST: + CM_MLPERF_DATASET_3DUNET_DOWNLOAD_TO_HOST: - 'yes' ## Rnnt From 42bd118bb0a6f9896d99aef2a4f0a7415c057d02 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 26 Sep 2024 23:21:04 +0530 Subject: [PATCH 019/111] bug fix --- script/app-mlperf-inference-mlcommons-python/_cm.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/script/app-mlperf-inference-mlcommons-python/_cm.yaml b/script/app-mlperf-inference-mlcommons-python/_cm.yaml index 4b5d873b0a..f4d69bb710 100644 --- a/script/app-mlperf-inference-mlcommons-python/_cm.yaml +++ b/script/app-mlperf-inference-mlcommons-python/_cm.yaml @@ -446,9 +446,6 @@ deps: CM_MODEL: - 3d-unet-99 - 3d-unet-99.9 - skip_if_env: - CM_MLPERF_DATASET_3DUNET_DOWNLOAD_TO_HOST: - - 'yes' ## Rnnt - tags: get,ml-model,speech-recognition,rnnt @@ -569,6 +566,9 @@ deps: CM_MODEL: - 3d-unet-99 - 3d-unet-99.9 + skip_if_env: + CM_MLPERF_DATASET_3DUNET_DOWNLOAD_TO_HOST: + - 'yes' ## Librispeech for rnnt - tags: get,dataset,librispeech,preprocessed From f56714c3b36c5466d4c5dedabafb6ad05f5b43fc Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Fri, 27 Sep 2024 00:44:45 +0530 Subject: [PATCH 020/111] updated with compatable scipy version --- script/app-mlperf-inference-mlcommons-python/_cm.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/script/app-mlperf-inference-mlcommons-python/_cm.yaml b/script/app-mlperf-inference-mlcommons-python/_cm.yaml index f4d69bb710..367d4fb8aa 100644 --- a/script/app-mlperf-inference-mlcommons-python/_cm.yaml +++ b/script/app-mlperf-inference-mlcommons-python/_cm.yaml @@ -1078,6 +1078,7 @@ variations: CM_MLPERF_MODEL_SKIP_BATCHING: true deps: - tags: get,generic-python-lib,_package.nibabel + - tags: get,generic-python-lib,_package.scipy==1.10.1 dlrm-v2-99.9: group: models From 63d280fbcb97f16ce8108c21c5fb21260593a091 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Fri, 27 Sep 2024 00:50:14 +0530 Subject: [PATCH 021/111] code clean --- script/app-mlperf-inference/_cm.yaml | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/script/app-mlperf-inference/_cm.yaml b/script/app-mlperf-inference/_cm.yaml index e881e88b77..9250ec8780 100644 --- a/script/app-mlperf-inference/_cm.yaml +++ b/script/app-mlperf-inference/_cm.yaml @@ -656,15 +656,7 @@ variations: - 3d-unet-accuracy-script tags: run,accuracy,mlperf,_kits19,_int8 - 3d-unet-99,reference: - docker: - deps: - - enable_if_env: - CM_MLPERF_DATASET_3DUNET_DOWNLOAD_TO_HOST: - - 'yes' - tags: get,dataset,kits19,preprocessed - - 3d-unet-99.9,reference: + 3d-unet_,reference: docker: deps: - enable_if_env: From 462223720c37991e4b359f4cdf745520f1df57a7 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Fri, 27 Sep 2024 01:03:11 +0530 Subject: [PATCH 022/111] Syntax correction for version - scipy --- script/app-mlperf-inference-mlcommons-python/_cm.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/script/app-mlperf-inference-mlcommons-python/_cm.yaml b/script/app-mlperf-inference-mlcommons-python/_cm.yaml index 0694a30f96..c402d6bcc1 100644 --- a/script/app-mlperf-inference-mlcommons-python/_cm.yaml +++ b/script/app-mlperf-inference-mlcommons-python/_cm.yaml @@ -1074,7 +1074,10 @@ variations: CM_MLPERF_MODEL_SKIP_BATCHING: true deps: - tags: get,generic-python-lib,_package.nibabel - - tags: get,generic-python-lib,_package.scipy==1.10.1 + - tags: get,generic-python-lib,_package.scipy + names: + - scipy + version: 1.10.1 dlrm-v2-99.9: group: models From b6f54d72adf84a712e85ad7407e8c84939c2bb74 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 26 Sep 2024 23:40:46 +0100 Subject: [PATCH 023/111] Update code-review.yml --- .github/workflows/code-review.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code-review.yml b/.github/workflows/code-review.yml index 258b305f38..81b3725943 100644 --- a/.github/workflows/code-review.yml +++ b/.github/workflows/code-review.yml @@ -2,7 +2,7 @@ name: OpenAI Code Review on: pull_request_target: - types: [opened, synchronize] + types: [opened] paths: - 'automation/**' - 'script/**' From 6810d7c0afd284cf2cd698b8a3b649b6bc029cc1 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Fri, 27 Sep 2024 10:32:56 +0530 Subject: [PATCH 024/111] Updated - llama2 model download to host --- script/app-mlperf-inference/_cm.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/script/app-mlperf-inference/_cm.yaml b/script/app-mlperf-inference/_cm.yaml index 8e56b16047..2b8186c886 100644 --- a/script/app-mlperf-inference/_cm.yaml +++ b/script/app-mlperf-inference/_cm.yaml @@ -762,6 +762,14 @@ variations: add_deps_recursive: mlperf-inference-implementation: tags: _llama2-70b-99.9 + + llama2-70b_,reference: + docker: + deps: + - enable_if_env: + CM_MLPERF_MODEL_LLAMA2_70B_DOWNLOAD_TO_HOST: + - 'yes' + tags: get,ml-model,llama2 mixtral-8x7b: group: From e7a9293f555ac4aa203dc2f370b792ab419b0bfc Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Fri, 27 Sep 2024 10:35:00 +0530 Subject: [PATCH 025/111] Skip model download if user sets llama 2 download to host --- script/app-mlperf-inference-mlcommons-python/_cm.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/script/app-mlperf-inference-mlcommons-python/_cm.yaml b/script/app-mlperf-inference-mlcommons-python/_cm.yaml index c402d6bcc1..1480a27875 100644 --- a/script/app-mlperf-inference-mlcommons-python/_cm.yaml +++ b/script/app-mlperf-inference-mlcommons-python/_cm.yaml @@ -420,6 +420,8 @@ deps: - "on" CM_MLPERF_INFERENCE_API_SERVER: - "on" + CM_MLPERF_MODEL_LLAMA2_70B_DOWNLOAD_TO_HOST: + - 'yes' ## mixtral-8x7b - tags: get,ml-model,mixtral From 0455eee176e1688ece8f6fc8747b5e081ef7523e Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Fri, 27 Sep 2024 11:00:42 +0530 Subject: [PATCH 026/111] model path env variable updated --- script/app-mlperf-inference-mlcommons-python/customize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/script/app-mlperf-inference-mlcommons-python/customize.py b/script/app-mlperf-inference-mlcommons-python/customize.py index aaea9fd96d..ca97bbf0d0 100644 --- a/script/app-mlperf-inference-mlcommons-python/customize.py +++ b/script/app-mlperf-inference-mlcommons-python/customize.py @@ -75,7 +75,7 @@ def preprocess(i): else: env['CM_MLPERF_LOADGEN_EXTRA_OPTIONS'] += " --mlperf_conf "+ x + env['CM_MLPERF_CONF'] + x - if env.get('CM_NETWORK_LOADGEN', '') != "lon" and env.get('CM_MLPERF_INFERENCE_API_SERVER','')=='': + if env.get('CM_NETWORK_LOADGEN', '') != "lon" and env.get('CM_MLPERF_INFERENCE_API_SERVER','')=='' and "llama2-70b" not in env['CM_MODEL']: env['MODEL_DIR'] = env.get('CM_ML_MODEL_PATH') if not env['MODEL_DIR']: env['MODEL_DIR'] = os.path.dirname(env.get('CM_MLPERF_CUSTOM_MODEL_PATH', env.get('CM_ML_MODEL_FILE_WITH_PATH'))) @@ -318,7 +318,7 @@ def get_run_cmd_reference(os_info, env, scenario_extra_options, mode_extra_optio #env['CM_MLPERF_INFERENCE_API_SERVER'] = "http://localhost:8000" cmd += f" --api-server {env['CM_MLPERF_INFERENCE_API_SERVER']} --model-path {env['CM_VLLM_SERVER_MODEL_NAME']} --api-model-name {env['CM_VLLM_SERVER_MODEL_NAME']} --vllm " else: - cmd += f" --model-path {env['MODEL_DIR']}" + cmd += f" --model-path {env['LLAMA2_CHECKPOINT_PATH']}" if env.get('CM_MLPERF_INFERENCE_NUM_WORKERS', '') != '': cmd += f" --num-workers {env['CM_MLPERF_INFERENCE_NUM_WORKERS']}" From b4e83f431249c644780668d753d93b09e769746f Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 27 Sep 2024 15:47:29 +0530 Subject: [PATCH 027/111] Added numpy dependency for pycuda --- script/get-cuda-devices/_cm.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/script/get-cuda-devices/_cm.yaml b/script/get-cuda-devices/_cm.yaml index 64d49d95be..e0d348b831 100644 --- a/script/get-cuda-devices/_cm.yaml +++ b/script/get-cuda-devices/_cm.yaml @@ -55,3 +55,6 @@ variations: - tags: get,generic-python-lib,_package.pycuda names: - pycuda + - tags: get,generic-python-lib,_package.numpy + names: + - numpy From dd6fe0eb834dcc4e74ac964db6541269df109973 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 27 Sep 2024 11:20:34 +0100 Subject: [PATCH 028/111] Update code-review.yml --- .github/workflows/code-review.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code-review.yml b/.github/workflows/code-review.yml index 81b3725943..3714d8a2a6 100644 --- a/.github/workflows/code-review.yml +++ b/.github/workflows/code-review.yml @@ -15,7 +15,7 @@ permissions: jobs: code_review: runs-on: ubuntu-latest - if: github.repository_owner == 'gateoverflow' && github.event.pull_request.changed_files > 0 + if: github.repository_owner == 'gateoverflow_off' && github.event.pull_request.changed_files > 0 steps: # Run code review via OpenAI # Step to run the OpenAI Code Review using the GATEOverflow action From b3cf801c21befe5b695a3e5f9f401c84495649a7 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 27 Sep 2024 11:53:14 +0100 Subject: [PATCH 029/111] Update test-scc24-sdxl.yaml | added SCC specific result directory --- .github/workflows/test-scc24-sdxl.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-scc24-sdxl.yaml b/.github/workflows/test-scc24-sdxl.yaml index e9a2fa410c..8c866991be 100644 --- a/.github/workflows/test-scc24-sdxl.yaml +++ b/.github/workflows/test-scc24-sdxl.yaml @@ -51,6 +51,6 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} run: | - cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --precision=float16 --clean | - cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons | + cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --clean + cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/cm4mlperf-inference --repo_branch=mlperf-inference-results-scc24 --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet From c52e446878864c20125ebfc65959c4f5a399e36b Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Fri, 27 Sep 2024 16:24:23 +0530 Subject: [PATCH 030/111] added starting weights filename --- script/app-mlperf-inference-mlcommons-python/_cm.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/script/app-mlperf-inference-mlcommons-python/_cm.yaml b/script/app-mlperf-inference-mlcommons-python/_cm.yaml index 1480a27875..6d7f6600b8 100644 --- a/script/app-mlperf-inference-mlcommons-python/_cm.yaml +++ b/script/app-mlperf-inference-mlcommons-python/_cm.yaml @@ -957,6 +957,7 @@ variations: llama2-70b_: env: CM_MLPERF_MODEL_SKIP_BATCHING: false + CM_ML_MODEL_STARTING_WEIGHTS_FILENAME: "https://github.com/mlcommons/cm4mlops/blob/b18ff890ff559e21d2e27a3b54cd26467ac1fd9e/script/get-ml-model-llama2/_cm.json#L51" deps: - tags: get,generic-python-lib,_package.transformers names: From 4d6d73789d1fe6a691c63e6bfb792116875fbce3 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 27 Sep 2024 11:58:49 +0100 Subject: [PATCH 031/111] Update test-scc24-sdxl.yaml --- .github/workflows/test-scc24-sdxl.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-scc24-sdxl.yaml b/.github/workflows/test-scc24-sdxl.yaml index 8c866991be..7b93c87855 100644 --- a/.github/workflows/test-scc24-sdxl.yaml +++ b/.github/workflows/test-scc24-sdxl.yaml @@ -26,8 +26,8 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} run: | - cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --quiet --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --precision=float16 --clean | - cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons | + cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --clean + cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/cm4mlperf-inference --repo_branch=mlperf-inference-results-scc24 --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet build_nvidia: From 91e792d3e044470e2a8f4608bd378d665f7f3c69 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 27 Sep 2024 16:53:35 +0530 Subject: [PATCH 032/111] Improve the detect-sudo script | make it bot compatible --- script/detect-sudo/customize.py | 44 ++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/script/detect-sudo/customize.py b/script/detect-sudo/customize.py index 2f6fa411a4..c963fc23f9 100644 --- a/script/detect-sudo/customize.py +++ b/script/detect-sudo/customize.py @@ -18,12 +18,50 @@ def preprocess(i): return {'return':0} +def reset_terminal(): + """Reset terminal to default settings.""" + subprocess.run(['stty', 'sane']) + +def prompt_retry(timeout=10): + """Prompt the user with a yes/no question to retry the command, with a 10-second timeout.""" + print(f"Timeout occurred. Do you want to try again? (y/n): ", end='', flush=True) + + # Use select to wait for user input with a timeout + ready, _, _ = select.select([sys.stdin], [], [], timeout) + + if ready: + answer = sys.stdin.readline().strip().lower() + if answer in ['y', 'n']: + return answer == 'y' # Return True if 'y', False if 'n' + print("\nInvalid input. Please enter 'y' or 'n'.") + return prompt_retry(timeout) # Re-prompt on invalid input + else: + print("\nNo input received in 10 seconds. Exiting.") + return False # No input within the timeout, so don't retry + def prompt_sudo(): - if os.geteuid() != 0: + if os.geteuid() != 0: # No sudo required for root user msg = "[sudo] password for %u:" - return subprocess.check_call("sudo echo 'Check sudo' -p '%s'" % msg, shell=True) - return -1 + while True: + try: + r = subprocess.check_output(["sudo", "-p", msg, "echo", "Check sudo"], + stderr=subprocess.STDOUT, timeout=5) + print(r.decode('utf-8')) # Decode bytes to string + return 0 + except subprocess.TimeoutExpired: + reset_terminal() # Reset terminal to sane state + if not prompt_retry(): # If the user chooses not to retry or times out + return -1 + except subprocess.CalledProcessError as e: + print(f"Command failed: {e.output.decode('utf-8')}") + reset_terminal() # Reset terminal in case of failure + return -1 + except Exception as e: + print(f"An error occurred: {str(e)}") + reset_terminal() # Always reset terminal after error + return -1 + return -1 def postprocess(i): From 07787c20bde713d870c517ede745eadcc14fb223 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Fri, 27 Sep 2024 17:03:22 +0530 Subject: [PATCH 033/111] added additional tags --- .github/workflows/test-mlperf-inference-llama2.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-mlperf-inference-llama2.yml b/.github/workflows/test-mlperf-inference-llama2.yml index ab81f46a95..0ce410409f 100644 --- a/.github/workflows/test-mlperf-inference-llama2.yml +++ b/.github/workflows/test-mlperf-inference-llama2.yml @@ -27,4 +27,4 @@ jobs: cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} - name: Test MLPerf Inference LLAMA 2 70B reference implementation run: | - cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --model=llama2-70b-99 --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --target_qps=1 --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean + cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --model=llama2-70b-99 --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --target_qps=1 --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --env.CM_MLPERF_MODEL_LLAMA2_70B_DOWNLOAD_TO_HOST=yes --adr.inference-src.env.CM_GIT_URL=https://github.com/anandhu-eng/inference.git --clean From 645ad8f00dddd1c9f7f71a2d3d74a7c873083bb5 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Fri, 27 Sep 2024 17:22:44 +0530 Subject: [PATCH 034/111] Downgraded nltk --- script/app-mlperf-inference-mlcommons-python/_cm.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/script/app-mlperf-inference-mlcommons-python/_cm.yaml b/script/app-mlperf-inference-mlcommons-python/_cm.yaml index 6d7f6600b8..5275e62604 100644 --- a/script/app-mlperf-inference-mlcommons-python/_cm.yaml +++ b/script/app-mlperf-inference-mlcommons-python/_cm.yaml @@ -983,6 +983,7 @@ variations: - tags: get,generic-python-lib,_package.nltk names: - nltk + version: 3.8.1 - tags: get,generic-python-lib,_package.numpy names: - numpy From 19c6367a5e4bda208116cf5d8b6b18cd76c6cacd Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 27 Sep 2024 14:06:44 +0100 Subject: [PATCH 035/111] Update _cm.yaml --- script/app-mlperf-inference-mlcommons-python/_cm.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/script/app-mlperf-inference-mlcommons-python/_cm.yaml b/script/app-mlperf-inference-mlcommons-python/_cm.yaml index 5275e62604..8fa0e9d810 100644 --- a/script/app-mlperf-inference-mlcommons-python/_cm.yaml +++ b/script/app-mlperf-inference-mlcommons-python/_cm.yaml @@ -983,7 +983,8 @@ variations: - tags: get,generic-python-lib,_package.nltk names: - nltk - version: 3.8.1 + version_max: 3.8.1 + version_max_usable: 3.8.1 - tags: get,generic-python-lib,_package.numpy names: - numpy From 41d5934fc6b8f7f7b83bf850f6a0711b8929bea3 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 27 Sep 2024 14:09:52 +0100 Subject: [PATCH 036/111] Update test-mlperf-inference-gptj.yml --- .github/workflows/test-mlperf-inference-gptj.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test-mlperf-inference-gptj.yml b/.github/workflows/test-mlperf-inference-gptj.yml index 5a7ecc7e82..7d695204e8 100644 --- a/.github/workflows/test-mlperf-inference-gptj.yml +++ b/.github/workflows/test-mlperf-inference-gptj.yml @@ -24,7 +24,8 @@ jobs: source gh_action/bin/deactivate || python3 -m venv gh_action source gh_action/bin/activate export CM_REPOS=$HOME/GH_CM - cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} + python3 -m pip install cm4mlops + cm pull repo - name: Test MLPerf Inference GPTJ run: | cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --docker --model=gptj-99 --backend=${{ matrix.backend }} --device=cuda --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --beam_size=1 --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean From fcd586d871013f77ce3da68f0ae7f925a456e22e Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 27 Sep 2024 14:10:37 +0100 Subject: [PATCH 037/111] Update test-mlperf-inference-sdxl.yaml --- .github/workflows/test-mlperf-inference-sdxl.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test-mlperf-inference-sdxl.yaml b/.github/workflows/test-mlperf-inference-sdxl.yaml index c7d693495c..a2024cf4a0 100644 --- a/.github/workflows/test-mlperf-inference-sdxl.yaml +++ b/.github/workflows/test-mlperf-inference-sdxl.yaml @@ -20,7 +20,8 @@ jobs: source gh_action/bin/deactivate || python3 -m venv gh_action source gh_action/bin/activate export CM_REPOS=$HOME/GH_CM - cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} + python3 -m pip install cm4mlops + cm pull repo - name: Test MLPerf Inference SDXL run: | cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --docker --model=sdxl --backend=${{ matrix.backend }} --device=cuda --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean From cf96b5a9f6706cc4d3e208e673992227013e4ac6 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 27 Sep 2024 19:02:10 +0530 Subject: [PATCH 038/111] Removed the version restrictions for dlrmv2, tested on torch 2.4, added set-user-limit CM script --- script/app-mlperf-inference-mlcommons-python/_cm.yaml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/script/app-mlperf-inference-mlcommons-python/_cm.yaml b/script/app-mlperf-inference-mlcommons-python/_cm.yaml index 8fa0e9d810..3cde8ba1f0 100644 --- a/script/app-mlperf-inference-mlcommons-python/_cm.yaml +++ b/script/app-mlperf-inference-mlcommons-python/_cm.yaml @@ -1114,7 +1114,6 @@ variations: - torch - pytorch - ml-engine-pytorch - version: "1.13.1" - tags: get,generic-python-lib,_mlperf_logging - tags: get,generic-python-lib,_opencv-python - tags: get,generic-python-lib,_tensorboard @@ -1128,15 +1127,10 @@ variations: - tags: get,generic-python-lib,_package.pyre-extensions - tags: get,generic-python-lib,_package.torchsnapshot - tags: get,generic-python-lib,_package.torchmetrics - version: "0.11.0" - tags: get,generic-python-lib,_package.torchrec - version: "0.3.2" - tags: get,generic-python-lib,_package.fbgemm-gpu - version: "0.3.2" - tags: get,generic-python-lib,_package.fbgemm-gpu-cpu - version: "0.3.2" - - tags: get,generic-python-lib,_package.torch - version: "1.13.1" + - tags: set,user,limit,_large-nofile rnnt: From 83eb295c32db8a62486aff7314c5f8405b4c11cf Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 27 Sep 2024 19:02:39 +0530 Subject: [PATCH 039/111] Added set-user-limit CM script --- script/set-user-limits/_cm.yaml | 14 ++++++++++++++ script/set-user-limits/customize.py | 29 +++++++++++++++++++++++++++++ script/set-user-limits/run.sh | 17 +++++++++++++++++ 3 files changed, 60 insertions(+) create mode 100644 script/set-user-limits/_cm.yaml create mode 100644 script/set-user-limits/customize.py create mode 100644 script/set-user-limits/run.sh diff --git a/script/set-user-limits/_cm.yaml b/script/set-user-limits/_cm.yaml new file mode 100644 index 0000000000..6097298c29 --- /dev/null +++ b/script/set-user-limits/_cm.yaml @@ -0,0 +1,14 @@ +alias: set-user-limits +automation_alias: script +automation_uid: 5b4e0237da074764 +cache: false +tags: +- set +- user +- limits +- limit +uid: 49dd1856b37342ac +variations: + large-nofile: + env: + CM_ULIMIT_NOFILE: 9999 diff --git a/script/set-user-limits/customize.py b/script/set-user-limits/customize.py new file mode 100644 index 0000000000..3b67e410b3 --- /dev/null +++ b/script/set-user-limits/customize.py @@ -0,0 +1,29 @@ +from cmind import utils +import os + +def preprocess(i): + + os_info = i['os_info'] + + env = i['env'] + + meta = i['meta'] + + automation = i['automation'] + + quiet = (env.get('CM_QUIET', False) == 'yes') + + cmds = [] + + if env.get('CM_ULIMIT_NOFILE', '') != '': + cmds.append(f"ulimit -n {env['CM_ULIMIT_NOFILE']}") + + env['CM_RUN_CMD'] = " && ".join(cmds) + + return {'return':0} + +def postprocess(i): + + env = i['env'] + + return {'return':0} diff --git a/script/set-user-limits/run.sh b/script/set-user-limits/run.sh new file mode 100644 index 0000000000..4c23c380ea --- /dev/null +++ b/script/set-user-limits/run.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +#CM Script location: ${CM_TMP_CURRENT_SCRIPT_PATH} + +#To export any variable +#echo "VARIABLE_NAME=VARIABLE_VALUE" >>tmp-run-env.out + +#${CM_PYTHON_BIN_WITH_PATH} contains the path to python binary if "get,python" is added as a dependency + +echo "Running: " +echo "${CM_RUN_CMD}" +echo "" + +if [[ ${CM_FAKE_RUN} != "yes" ]]; then + eval "${CM_RUN_CMD}" + test $? -eq 0 || exit 1 +fi From 0f98c47754c887060efeaf726ec94593770bca9e Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 27 Sep 2024 14:33:58 +0100 Subject: [PATCH 040/111] Update test-mlperf-inference-llama2.yml --- .github/workflows/test-mlperf-inference-llama2.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test-mlperf-inference-llama2.yml b/.github/workflows/test-mlperf-inference-llama2.yml index 0ce410409f..eae18729c7 100644 --- a/.github/workflows/test-mlperf-inference-llama2.yml +++ b/.github/workflows/test-mlperf-inference-llama2.yml @@ -24,7 +24,8 @@ jobs: source gh_action/bin/deactivate || python3 -m venv gh_action source gh_action/bin/activate export CM_REPOS=$HOME/GH_CM - cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} + python3 -m pip install cm4mlops + cm pull repo - name: Test MLPerf Inference LLAMA 2 70B reference implementation run: | cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --model=llama2-70b-99 --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --target_qps=1 --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --env.CM_MLPERF_MODEL_LLAMA2_70B_DOWNLOAD_TO_HOST=yes --adr.inference-src.env.CM_GIT_URL=https://github.com/anandhu-eng/inference.git --clean From 341f782dd3c969c98ebdb4e5932c0ac8bad9ec53 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Fri, 27 Sep 2024 20:23:46 +0530 Subject: [PATCH 041/111] fix indendation --- script/get-dlrm-data-mlperf-inference/_cm.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/get-dlrm-data-mlperf-inference/_cm.yaml b/script/get-dlrm-data-mlperf-inference/_cm.yaml index ab0e46e8be..f287e37db8 100644 --- a/script/get-dlrm-data-mlperf-inference/_cm.yaml +++ b/script/get-dlrm-data-mlperf-inference/_cm.yaml @@ -17,7 +17,7 @@ new_env_keys: input_mapping: dlrm_data_path: CM_DLRM_DATA_PATH criteo_day23_raw_data_path: CM_CRITEO_DAY23_RAW_DATA_PATH - prehook_deps: +prehook_deps: - tags: get,ml-model,dlrm,_pytorch enable_if_env: CM_DLRM_MODEL_DOWNLOAD: From b4e6427d047c19922d075952bda03eaee9964d82 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 27 Sep 2024 21:05:57 +0530 Subject: [PATCH 042/111] Fix issues with detect-sudo --- script/detect-sudo/customize.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/script/detect-sudo/customize.py b/script/detect-sudo/customize.py index c963fc23f9..b765e875e2 100644 --- a/script/detect-sudo/customize.py +++ b/script/detect-sudo/customize.py @@ -1,5 +1,7 @@ from cmind import utils import os, subprocess +import select +import sys def preprocess(i): @@ -45,7 +47,7 @@ def prompt_sudo(): while True: try: r = subprocess.check_output(["sudo", "-p", msg, "echo", "Check sudo"], - stderr=subprocess.STDOUT, timeout=5) + stderr=subprocess.STDOUT, timeout=20) print(r.decode('utf-8')) # Decode bytes to string return 0 except subprocess.TimeoutExpired: From aee19baa5df8de2de80c8e82f02d44c6e77307d7 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Sat, 28 Sep 2024 09:37:52 +0100 Subject: [PATCH 043/111] Update test-scc24-sdxl.yaml --- .github/workflows/test-scc24-sdxl.yaml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test-scc24-sdxl.yaml b/.github/workflows/test-scc24-sdxl.yaml index 7b93c87855..fb0bb59ffb 100644 --- a/.github/workflows/test-scc24-sdxl.yaml +++ b/.github/workflows/test-scc24-sdxl.yaml @@ -18,10 +18,11 @@ jobs: steps: - name: Install dependencies run: | - source gh_action/bin/deactivate || python3 -m venv gh_action + if [ -f "gh_action/bin/activate" ]; then source gh_action/bin/deactivate; fi + python3 -m venv gh_action source gh_action/bin/activate export CM_REPOS=$HOME/GH_CM - cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} + cm pull repo - name: Test MLPerf Inference reference SDXL SCC env: GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} @@ -43,10 +44,11 @@ jobs: steps: - name: Install dependencies run: | - source gh_action/bin/deactivate || python3 -m venv gh_action + if [ -f "gh_action/bin/activate" ]; then source gh_action/bin/deactivate; fi + python3 -m venv gh_action source gh_action/bin/activate export CM_REPOS=$HOME/GH_CM - cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} + cm pull repo - name: Test MLPerf Inference NVIDIA SDXL SCC env: GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} From 056848f252a78cf36be2507ee61af44c83055953 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Sat, 28 Sep 2024 09:44:35 +0100 Subject: [PATCH 044/111] Update test-scc24-sdxl.yaml | fixes the run --- .github/workflows/test-scc24-sdxl.yaml | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/.github/workflows/test-scc24-sdxl.yaml b/.github/workflows/test-scc24-sdxl.yaml index fb0bb59ffb..0cf0108c95 100644 --- a/.github/workflows/test-scc24-sdxl.yaml +++ b/.github/workflows/test-scc24-sdxl.yaml @@ -8,6 +8,8 @@ jobs: build_reference: if: github.repository_owner == 'gateoverflow' runs-on: [ self-hosted, linux, x64 ] + env: + CM_REPOS: $HOME/GH_CM strategy: fail-fast: false matrix: @@ -16,17 +18,12 @@ jobs: precision: [ "float16" ] device: [ "cuda" ] steps: - - name: Install dependencies + - name: Test MLPerf Inference reference SDXL SCC run: | if [ -f "gh_action/bin/activate" ]; then source gh_action/bin/deactivate; fi python3 -m venv gh_action source gh_action/bin/activate - export CM_REPOS=$HOME/GH_CM cm pull repo - - name: Test MLPerf Inference reference SDXL SCC - env: - GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} - run: | cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --clean cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/cm4mlperf-inference --repo_branch=mlperf-inference-results-scc24 --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet @@ -42,17 +39,12 @@ jobs: precision: [ "float16" ] implementation: [ "nvidia" ] steps: - - name: Install dependencies + - name: Test MLPerf Inference NVIDIA SDXL SCC run: | if [ -f "gh_action/bin/activate" ]; then source gh_action/bin/deactivate; fi python3 -m venv gh_action source gh_action/bin/activate - export CM_REPOS=$HOME/GH_CM cm pull repo - - name: Test MLPerf Inference NVIDIA SDXL SCC - env: - GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} - run: | cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --clean cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/cm4mlperf-inference --repo_branch=mlperf-inference-results-scc24 --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet From 61d0f5ba3142ddc3027e11dacab0ed0e8d5274db Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Sat, 28 Sep 2024 09:46:51 +0100 Subject: [PATCH 045/111] Update test-mlperf-inference-sdxl.yaml | turned off --- .../workflows/test-mlperf-inference-sdxl.yaml | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/.github/workflows/test-mlperf-inference-sdxl.yaml b/.github/workflows/test-mlperf-inference-sdxl.yaml index a2024cf4a0..fd452c3ba6 100644 --- a/.github/workflows/test-mlperf-inference-sdxl.yaml +++ b/.github/workflows/test-mlperf-inference-sdxl.yaml @@ -1,12 +1,12 @@ name: MLPerf inference SDXL - +#off now as we have SCC24 test doing the same on: schedule: - cron: "1 2 * * *" jobs: build_reference: - if: github.repository_owner == 'gateoverflow' + if: github.repository_owner == 'gateoverflow_off' runs-on: [ self-hosted, linux, x64 ] strategy: fail-fast: false @@ -15,19 +15,17 @@ jobs: backend: [ "pytorch" ] precision: [ "float16" ] steps: - - name: Install dependencies + - name: Test MLPerf Inference SDXL Reference run: | source gh_action/bin/deactivate || python3 -m venv gh_action source gh_action/bin/activate export CM_REPOS=$HOME/GH_CM python3 -m pip install cm4mlops cm pull repo - - name: Test MLPerf Inference SDXL - run: | cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --docker --model=sdxl --backend=${{ matrix.backend }} --device=cuda --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean build_nvidia: - if: github.repository_owner == 'gateoverflow' + if: github.repository_owner == 'gateoverflow_off' runs-on: [ self-hosted, linux, x64 ] strategy: fail-fast: false @@ -37,12 +35,10 @@ jobs: precision: [ "float16" ] implementation: [ "nvidia" ] steps: - - name: Install dependencies + - name: Test MLPerf Inference SDXL Nvidia run: | source gh_action/bin/deactivate || python3 -m venv gh_action source gh_action/bin/activate export CM_REPOS=$HOME/GH_CM - cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} - - name: Test MLPerf Inference SDXL - run: | + cm pull repo cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --docker --model=sdxl --implementation=${{ matrix.implementation }} --backend=${{ matrix.backend }} --device=cuda --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean From de13f1ccba4b846cee02e81ef35dde7ebba48d5b Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 27 Sep 2024 23:54:06 +0530 Subject: [PATCH 046/111] Added fvcore dependency for mlperf inference reference dlrmv2 --- script/app-mlperf-inference-mlcommons-python/_cm.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/script/app-mlperf-inference-mlcommons-python/_cm.yaml b/script/app-mlperf-inference-mlcommons-python/_cm.yaml index 3cde8ba1f0..ab4b09a36c 100644 --- a/script/app-mlperf-inference-mlcommons-python/_cm.yaml +++ b/script/app-mlperf-inference-mlcommons-python/_cm.yaml @@ -1130,6 +1130,7 @@ variations: - tags: get,generic-python-lib,_package.torchrec - tags: get,generic-python-lib,_package.fbgemm-gpu - tags: get,generic-python-lib,_package.fbgemm-gpu-cpu + - tags: get,generic-python-lib,_package.fvcore - tags: set,user,limit,_large-nofile From cee2e81cc5950979aca7a112505948c4c686398b Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Sat, 28 Sep 2024 15:23:17 +0530 Subject: [PATCH 047/111] Added variations for pip index and extra-index urls --- script/get-generic-python-lib/_cm.json | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/script/get-generic-python-lib/_cm.json b/script/get-generic-python-lib/_cm.json index fa78d0d966..b3757091c1 100644 --- a/script/get-generic-python-lib/_cm.json +++ b/script/get-generic-python-lib/_cm.json @@ -73,6 +73,16 @@ "tags_help": "get generic-python-lib", "uid": "94b62a682bc44791", "variations": { + "index-url.#": { + "env": { + "CM_GENERIC_PYTHON_PIP_INDEX_URL": "#" + } + }, + "extra-index-url.#": { + "env": { + "CM_GENERIC_PYTHON_PIP_EXTRA_INDEX_URL": "#" + } + }, "Pillow": { "env": { "CM_GENERIC_PYTHON_PACKAGE_NAME": "Pillow" From ba391304b683a794b97a2503d076ce7cac7e7e63 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Sun, 29 Sep 2024 10:04:56 +0100 Subject: [PATCH 048/111] Update test-scc24-sdxl.yaml --- .github/workflows/test-scc24-sdxl.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-scc24-sdxl.yaml b/.github/workflows/test-scc24-sdxl.yaml index 0cf0108c95..2538446ddf 100644 --- a/.github/workflows/test-scc24-sdxl.yaml +++ b/.github/workflows/test-scc24-sdxl.yaml @@ -2,12 +2,12 @@ name: MLPerf inference SDXL on: schedule: - - cron: "43 1 * * *" + - cron: "43 */6 * * *" jobs: build_reference: if: github.repository_owner == 'gateoverflow' - runs-on: [ self-hosted, linux, x64 ] + runs-on: [ self-hosted, linux, x64, GO-spr ] env: CM_REPOS: $HOME/GH_CM strategy: From faa57815ac6175bd416cecdd7aa6889a3a9bc5b2 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Sun, 29 Sep 2024 10:05:37 +0100 Subject: [PATCH 049/111] Update test-mlperf-inference-gptj.yml --- .github/workflows/test-mlperf-inference-gptj.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-mlperf-inference-gptj.yml b/.github/workflows/test-mlperf-inference-gptj.yml index 7d695204e8..c4d896b54c 100644 --- a/.github/workflows/test-mlperf-inference-gptj.yml +++ b/.github/workflows/test-mlperf-inference-gptj.yml @@ -5,12 +5,12 @@ name: MLPerf inference GPT-J on: schedule: - - cron: "1 1 * * */3" + - cron: "1 */6 * * *" jobs: build: if: github.repository_owner == 'gateoverflow' - runs-on: [ self-hosted, linux, x64 ] + runs-on: [ self-hosted, linux, x64, GO-spr ] strategy: fail-fast: false matrix: From e9f9ced06ea1db0dc068d10a092c266a4dfbe498 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Sun, 29 Sep 2024 14:19:59 +0100 Subject: [PATCH 050/111] Update test-scc24-sdxl.yaml --- .github/workflows/test-scc24-sdxl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-scc24-sdxl.yaml b/.github/workflows/test-scc24-sdxl.yaml index 2538446ddf..3ed9ea380b 100644 --- a/.github/workflows/test-scc24-sdxl.yaml +++ b/.github/workflows/test-scc24-sdxl.yaml @@ -41,7 +41,7 @@ jobs: steps: - name: Test MLPerf Inference NVIDIA SDXL SCC run: | - if [ -f "gh_action/bin/activate" ]; then source gh_action/bin/deactivate; fi + if [ -f "gh_action/bin/deactivate" ]; then source gh_action/bin/deactivate; fi python3 -m venv gh_action source gh_action/bin/activate cm pull repo From f0d7f7e12a58b5aab3df004c096cafd825399e0c Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Mon, 30 Sep 2024 00:52:34 +0100 Subject: [PATCH 051/111] Update test-scc24-sdxl.yaml --- .github/workflows/test-scc24-sdxl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-scc24-sdxl.yaml b/.github/workflows/test-scc24-sdxl.yaml index 3ed9ea380b..a0bd964f4d 100644 --- a/.github/workflows/test-scc24-sdxl.yaml +++ b/.github/workflows/test-scc24-sdxl.yaml @@ -30,7 +30,7 @@ jobs: build_nvidia: if: github.repository_owner == 'gateoverflow' - runs-on: [ self-hosted, linux, x64 ] + runs-on: [ self-hosted, linux, x64, GO-spr] strategy: fail-fast: false matrix: From 663f4ebc0493d1d782b15c1c2a1f04eedecb95e5 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Mon, 30 Sep 2024 00:54:54 +0100 Subject: [PATCH 052/111] Update test-scc24-sdxl.yaml --- .github/workflows/test-scc24-sdxl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-scc24-sdxl.yaml b/.github/workflows/test-scc24-sdxl.yaml index a0bd964f4d..a9e129aa1f 100644 --- a/.github/workflows/test-scc24-sdxl.yaml +++ b/.github/workflows/test-scc24-sdxl.yaml @@ -20,7 +20,7 @@ jobs: steps: - name: Test MLPerf Inference reference SDXL SCC run: | - if [ -f "gh_action/bin/activate" ]; then source gh_action/bin/deactivate; fi + if [ -f "gh_action/bin/deactivate" ]; then source gh_action/bin/deactivate; fi python3 -m venv gh_action source gh_action/bin/activate cm pull repo From f1728bfbe73694be66941beabc2a95a58f303e6e Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Mon, 30 Sep 2024 08:35:31 +0100 Subject: [PATCH 053/111] Update test-scc24-sdxl.yaml --- .github/workflows/test-scc24-sdxl.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/test-scc24-sdxl.yaml b/.github/workflows/test-scc24-sdxl.yaml index a9e129aa1f..eb3810671e 100644 --- a/.github/workflows/test-scc24-sdxl.yaml +++ b/.github/workflows/test-scc24-sdxl.yaml @@ -23,6 +23,8 @@ jobs: if [ -f "gh_action/bin/deactivate" ]; then source gh_action/bin/deactivate; fi python3 -m venv gh_action source gh_action/bin/activate + export CM_REPOS=$HOME/GH_CM + pip install cm4mlops cm pull repo cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --clean cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions @@ -44,6 +46,8 @@ jobs: if [ -f "gh_action/bin/deactivate" ]; then source gh_action/bin/deactivate; fi python3 -m venv gh_action source gh_action/bin/activate + export CM_REPOS=$HOME/GH_CM + pip install cm4mlops cm pull repo cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --clean cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions From b3e0caaf17ae6ac2b9e79b83f3ce122cfbcd43ac Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Mon, 30 Sep 2024 13:06:34 +0530 Subject: [PATCH 054/111] commit in reference to https://github.com/mlcommons/cm4mlops/issues/103 --- script/build-dockerfile/customize.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/script/build-dockerfile/customize.py b/script/build-dockerfile/customize.py index 41300cddef..c4eff9a60b 100644 --- a/script/build-dockerfile/customize.py +++ b/script/build-dockerfile/customize.py @@ -2,6 +2,7 @@ import cmind as cm import os import json +import re def preprocess(i): @@ -54,6 +55,17 @@ def preprocess(i): if env.get("CM_MLOPS_REPO", "") != "": cm_mlops_repo = env["CM_MLOPS_REPO"] + # the below pattern matches both the HTTPS and SSH git link formats + git_link_pattern = r'^(https?://github\.com/([^/]+)/([^/]+)\.git|git@github\.com:([^/]+)/([^/]+)\.git)$' + if match := re.match(git_link_pattern, cm_mlops_repo): + if match.group(2) and match.group(3): + repo_owner = match.group(2) + repo_name = match.group(3) + elif match.group(4) and match.group(5): + repo_owner = match.group(4) + repo_name = match.group(5) + cm_mlops_repo = f"{repo_owner}@{repo_name}" + print(f"Converted repo format from {env['CM_MLOPS_REPO']} to {cm_mlops_repo}") else: cm_mlops_repo = "mlcommons@ck" From dd5a1d84f717e9c8ebf946ec58f4b18374533c64 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:25:10 +0530 Subject: [PATCH 055/111] Added env key - docker_not_pull_update --- script/build-dockerfile/_cm.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/script/build-dockerfile/_cm.yaml b/script/build-dockerfile/_cm.yaml index 5436c6ddd3..b2fdf6f09d 100644 --- a/script/build-dockerfile/_cm.yaml +++ b/script/build-dockerfile/_cm.yaml @@ -48,6 +48,7 @@ input_mapping: script_tags: CM_DOCKER_RUN_SCRIPT_TAGS skip_cm_sys_upgrade: CM_DOCKER_SKIP_CM_SYS_UPGRADE push_image: CM_DOCKER_PUSH_IMAGE + docker_not_pull_update: CM_DOCKER_NOT_PULL_UPDATE new_env_keys: - CM_DOCKERFILE_* From 92dbfc781fc6ee88c0fa74f18b4f763250322869 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:26:44 +0530 Subject: [PATCH 056/111] Default value docker_not_pull_update - False --- script/build-dockerfile/_cm.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/script/build-dockerfile/_cm.yaml b/script/build-dockerfile/_cm.yaml index b2fdf6f09d..da00f24de8 100644 --- a/script/build-dockerfile/_cm.yaml +++ b/script/build-dockerfile/_cm.yaml @@ -18,6 +18,7 @@ default_env: ' CM_DOCKER_OS: ubuntu + CM_DOCKER_NOT_PULL_UPDATE: False input_mapping: build: CM_BUILD_DOCKER_IMAGE From 41f17faa99c16d3109e5a449abe3936960712e7e Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Mon, 30 Sep 2024 12:09:59 +0100 Subject: [PATCH 057/111] Update test-scc24-sdxl.yaml --- .github/workflows/test-scc24-sdxl.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-scc24-sdxl.yaml b/.github/workflows/test-scc24-sdxl.yaml index eb3810671e..b251db1cab 100644 --- a/.github/workflows/test-scc24-sdxl.yaml +++ b/.github/workflows/test-scc24-sdxl.yaml @@ -1,8 +1,8 @@ -name: MLPerf inference SDXL +name: MLPerf inference SDXL (SCC) on: schedule: - - cron: "43 */6 * * *" + - cron: "13 * * * *" jobs: build_reference: From 102f77443e66abdbdd186398b1ef3579cf10e673 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Mon, 30 Sep 2024 17:06:25 +0530 Subject: [PATCH 058/111] added cm pull repo before cm run --- script/build-dockerfile/customize.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/script/build-dockerfile/customize.py b/script/build-dockerfile/customize.py index c4eff9a60b..7a548de74d 100644 --- a/script/build-dockerfile/customize.py +++ b/script/build-dockerfile/customize.py @@ -233,12 +233,18 @@ def preprocess(i): skip_extra = False if 'CM_DOCKER_RUN_CMD' not in env: + env['CM_DOCKER_RUN_CMD']="" if 'CM_DOCKER_RUN_SCRIPT_TAGS' not in env: - env['CM_DOCKER_RUN_CMD']="cm version" + env['CM_DOCKER_RUN_CMD']+="cm version" skip_extra = True else: - env['CM_DOCKER_RUN_CMD']="cm run script --tags=" + env['CM_DOCKER_RUN_SCRIPT_TAGS']+ ' --quiet' - + if not env["CM_DOCKER_NOT_PULL_UPDATE"]: + env['CM_DOCKER_RUN_CMD'] += "cm pull repo && " + env['CM_DOCKER_RUN_CMD'] += "cm run script --tags=" + env['CM_DOCKER_RUN_SCRIPT_TAGS']+ ' --quiet' + else: + env['CM_DOCKER_RUN_CMD']="cm pull repo && " + env['CM_DOCKER_RUN_CMD'] + + print(env['CM_DOCKER_RUN_CMD']) fake_run = env.get("CM_DOCKER_FAKE_RUN_OPTION"," --fake_run") + dockerfile_env_input_string fake_run = fake_run + " --fake_deps" if env.get('CM_DOCKER_FAKE_DEPS') else fake_run From 104886b623379f885616fea5e16f6301af85cadc Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Mon, 30 Sep 2024 13:08:35 +0100 Subject: [PATCH 059/111] Update test-scc24-sdxl.yaml --- .github/workflows/test-scc24-sdxl.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-scc24-sdxl.yaml b/.github/workflows/test-scc24-sdxl.yaml index b251db1cab..bac97579d0 100644 --- a/.github/workflows/test-scc24-sdxl.yaml +++ b/.github/workflows/test-scc24-sdxl.yaml @@ -24,7 +24,7 @@ jobs: python3 -m venv gh_action source gh_action/bin/activate export CM_REPOS=$HOME/GH_CM - pip install cm4mlops + pip install --reinstall cm4mlops cm pull repo cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --clean cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions @@ -47,7 +47,7 @@ jobs: python3 -m venv gh_action source gh_action/bin/activate export CM_REPOS=$HOME/GH_CM - pip install cm4mlops + pip install --reinstall cm4mlops cm pull repo cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --clean cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions From 6d503a02bf6fa88bc055cbd9ced80023e97b6824 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Mon, 30 Sep 2024 14:13:50 +0100 Subject: [PATCH 060/111] Update test-scc24-sdxl.yaml --- .github/workflows/test-scc24-sdxl.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-scc24-sdxl.yaml b/.github/workflows/test-scc24-sdxl.yaml index bac97579d0..0809a61854 100644 --- a/.github/workflows/test-scc24-sdxl.yaml +++ b/.github/workflows/test-scc24-sdxl.yaml @@ -24,7 +24,7 @@ jobs: python3 -m venv gh_action source gh_action/bin/activate export CM_REPOS=$HOME/GH_CM - pip install --reinstall cm4mlops + pip install --upgrade --force-reinstall cm4mlops cm pull repo cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --clean cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions @@ -47,7 +47,7 @@ jobs: python3 -m venv gh_action source gh_action/bin/activate export CM_REPOS=$HOME/GH_CM - pip install --reinstall cm4mlops + pip install --upgrade --force-reinstall cm4mlops cm pull repo cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --clean cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions From f0a46996c8c029f79de3d7cb9f9d07d51267e81f Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Mon, 30 Sep 2024 14:30:22 +0100 Subject: [PATCH 061/111] Update test-scc24-sdxl.yaml --- .github/workflows/test-scc24-sdxl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-scc24-sdxl.yaml b/.github/workflows/test-scc24-sdxl.yaml index 0809a61854..fad7a8d89e 100644 --- a/.github/workflows/test-scc24-sdxl.yaml +++ b/.github/workflows/test-scc24-sdxl.yaml @@ -2,7 +2,7 @@ name: MLPerf inference SDXL (SCC) on: schedule: - - cron: "13 * * * *" + - cron: "*/10 * * * *" jobs: build_reference: From 5f6ab95b66443b31ebbf3a5401f3942160d241e7 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Mon, 30 Sep 2024 19:35:13 +0530 Subject: [PATCH 062/111] bug fix --- script/build-dockerfile/customize.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/script/build-dockerfile/customize.py b/script/build-dockerfile/customize.py index 7a548de74d..1b74fa61b9 100644 --- a/script/build-dockerfile/customize.py +++ b/script/build-dockerfile/customize.py @@ -238,11 +238,12 @@ def preprocess(i): env['CM_DOCKER_RUN_CMD']+="cm version" skip_extra = True else: - if not env["CM_DOCKER_NOT_PULL_UPDATE"]: + if str(env.get('CM_DOCKER_NOT_PULL_UPDATE', 'False')).lower() not in ["yes", "1", "true"]: env['CM_DOCKER_RUN_CMD'] += "cm pull repo && " env['CM_DOCKER_RUN_CMD'] += "cm run script --tags=" + env['CM_DOCKER_RUN_SCRIPT_TAGS']+ ' --quiet' else: - env['CM_DOCKER_RUN_CMD']="cm pull repo && " + env['CM_DOCKER_RUN_CMD'] + if str(env.get('CM_DOCKER_NOT_PULL_UPDATE', 'False')).lower() not in ["yes", "1", "true"]: + env['CM_DOCKER_RUN_CMD']="cm pull repo && " + env['CM_DOCKER_RUN_CMD'] print(env['CM_DOCKER_RUN_CMD']) fake_run = env.get("CM_DOCKER_FAKE_RUN_OPTION"," --fake_run") + dockerfile_env_input_string From aa8c8d8f3ce9da2ef7643c0503fa13ac255a7ebe Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Mon, 30 Sep 2024 15:11:20 +0100 Subject: [PATCH 063/111] Update test-scc24-sdxl.yaml --- .github/workflows/test-scc24-sdxl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-scc24-sdxl.yaml b/.github/workflows/test-scc24-sdxl.yaml index fad7a8d89e..f61849ac2e 100644 --- a/.github/workflows/test-scc24-sdxl.yaml +++ b/.github/workflows/test-scc24-sdxl.yaml @@ -26,7 +26,7 @@ jobs: export CM_REPOS=$HOME/GH_CM pip install --upgrade --force-reinstall cm4mlops cm pull repo - cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --clean + cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --clean cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/cm4mlperf-inference --repo_branch=mlperf-inference-results-scc24 --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet From 34f9ccf9a53cef432de53cb0fd746a6899d902b7 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Mon, 30 Sep 2024 15:11:34 +0100 Subject: [PATCH 064/111] Update test-scc24-sdxl.yaml --- .github/workflows/test-scc24-sdxl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-scc24-sdxl.yaml b/.github/workflows/test-scc24-sdxl.yaml index f61849ac2e..0235cb12a8 100644 --- a/.github/workflows/test-scc24-sdxl.yaml +++ b/.github/workflows/test-scc24-sdxl.yaml @@ -2,7 +2,7 @@ name: MLPerf inference SDXL (SCC) on: schedule: - - cron: "*/10 * * * *" + - cron: "* * * * *" jobs: build_reference: From 40461ec55e882d6845ca80bdd7a2fe87088fa781 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Mon, 30 Sep 2024 18:31:42 +0100 Subject: [PATCH 065/111] Update test-scc24-sdxl.yaml --- .github/workflows/test-scc24-sdxl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-scc24-sdxl.yaml b/.github/workflows/test-scc24-sdxl.yaml index 0235cb12a8..5a1dd3eb95 100644 --- a/.github/workflows/test-scc24-sdxl.yaml +++ b/.github/workflows/test-scc24-sdxl.yaml @@ -2,7 +2,7 @@ name: MLPerf inference SDXL (SCC) on: schedule: - - cron: "* * * * *" + - cron: "1 3 * * *" jobs: build_reference: From 332ec61d4f30e5f25ebd5305c57bd608ae41f1b3 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 1 Oct 2024 01:44:28 +0530 Subject: [PATCH 066/111] Improve detect-sudo for non-interactive shells --- script/detect-sudo/customize.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/script/detect-sudo/customize.py b/script/detect-sudo/customize.py index b765e875e2..ad4289efda 100644 --- a/script/detect-sudo/customize.py +++ b/script/detect-sudo/customize.py @@ -2,6 +2,7 @@ import os, subprocess import select import sys +import grp def preprocess(i): @@ -24,8 +25,17 @@ def reset_terminal(): """Reset terminal to default settings.""" subprocess.run(['stty', 'sane']) -def prompt_retry(timeout=10): +def prompt_retry(timeout=10, default_retry=False): """Prompt the user with a yes/no question to retry the command, with a 10-second timeout.""" + + # Check if we're in an interactive terminal + if not sys.stdin.isatty(): + if default_retry: + print(f"Non-interactive environment detected. Automatically retrying.") + else: + print(f"Non-interactive environment detected. Skipping retry.") + return default_retry # Automatically use the default in non-interactive terminals + print(f"Timeout occurred. Do you want to try again? (y/n): ", end='', flush=True) # Use select to wait for user input with a timeout @@ -41,8 +51,20 @@ def prompt_retry(timeout=10): print("\nNo input received in 10 seconds. Exiting.") return False # No input within the timeout, so don't retry +def is_user_in_sudo_group(): + """Check if the current user is in the 'sudo' group.""" + try: + sudo_group = grp.getgrnam('sudo').gr_mem + return os.getlogin() in sudo_group + except KeyError: + # 'sudo' group doesn't exist (might be different on some systems) + return False + except Exception as e: + print(f"Error checking sudo group: {str(e)}") + return False + def prompt_sudo(): - if os.geteuid() != 0: # No sudo required for root user + if os.geteuid() != 0 or is_user_in_sudo_group(): # No sudo required for root user msg = "[sudo] password for %u:" while True: try: From 2f843cbf6f0db1d7d3297a6ad347c19fa2f31c0f Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 1 Oct 2024 15:29:23 +0530 Subject: [PATCH 067/111] Added tabulate install in github action --- .github/workflows/test-scc24-sdxl.yaml | 2 ++ script/run-docker-container/customize.py | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-scc24-sdxl.yaml b/.github/workflows/test-scc24-sdxl.yaml index 5a1dd3eb95..68b963a6b5 100644 --- a/.github/workflows/test-scc24-sdxl.yaml +++ b/.github/workflows/test-scc24-sdxl.yaml @@ -25,6 +25,7 @@ jobs: source gh_action/bin/activate export CM_REPOS=$HOME/GH_CM pip install --upgrade --force-reinstall cm4mlops + pip install tabulate cm pull repo cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --clean cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions @@ -48,6 +49,7 @@ jobs: source gh_action/bin/activate export CM_REPOS=$HOME/GH_CM pip install --upgrade --force-reinstall cm4mlops + pip install tabulate cm pull repo cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --clean cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions diff --git a/script/run-docker-container/customize.py b/script/run-docker-container/customize.py index 9158fde9a2..84114d7af5 100644 --- a/script/run-docker-container/customize.py +++ b/script/run-docker-container/customize.py @@ -221,10 +221,8 @@ def postprocess(i): lines = docker_out.split("\n") for line in lines: - print(f"line = {line}") if line.startswith("ID="): ID = line[3:] - print(f"My id = {ID}") env['CM_DOCKER_CONTAINER_ID'] = ID print(docker_out) From fbf3832ede608facaca37f10b603cbe65f7275eb Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 1 Oct 2024 12:11:04 +0100 Subject: [PATCH 068/111] Update test-scc24-sdxl.yaml | use SDXL model from host --- .github/workflows/test-scc24-sdxl.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-scc24-sdxl.yaml b/.github/workflows/test-scc24-sdxl.yaml index 68b963a6b5..78b3ad4817 100644 --- a/.github/workflows/test-scc24-sdxl.yaml +++ b/.github/workflows/test-scc24-sdxl.yaml @@ -27,7 +27,7 @@ jobs: pip install --upgrade --force-reinstall cm4mlops pip install tabulate cm pull repo - cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --clean + cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --clean cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/cm4mlperf-inference --repo_branch=mlperf-inference-results-scc24 --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet @@ -51,6 +51,6 @@ jobs: pip install --upgrade --force-reinstall cm4mlops pip install tabulate cm pull repo - cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --clean + cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --clean cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/cm4mlperf-inference --repo_branch=mlperf-inference-results-scc24 --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet From 8626640a8e2c0436306c1e55518ab9524ac2153a Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 1 Oct 2024 13:31:13 +0100 Subject: [PATCH 069/111] Update test-scc24-sdxl.yaml | Added the perf+acc run --- .github/workflows/test-scc24-sdxl.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-scc24-sdxl.yaml b/.github/workflows/test-scc24-sdxl.yaml index 78b3ad4817..39909ac491 100644 --- a/.github/workflows/test-scc24-sdxl.yaml +++ b/.github/workflows/test-scc24-sdxl.yaml @@ -28,7 +28,8 @@ jobs: pip install tabulate cm pull repo cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --clean - cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions + cm run script --tags=run-mlperf,inference,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --clean + cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions --results_dir=$HOME/scc_gh_action_results/test_results cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/cm4mlperf-inference --repo_branch=mlperf-inference-results-scc24 --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet build_nvidia: @@ -52,5 +53,6 @@ jobs: pip install tabulate cm pull repo cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --clean - cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions + cm run script --tags=run-mlperf,inference,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --clean + cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions --results_dir=$HOME/scc_gh_action_results/test_results cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/cm4mlperf-inference --repo_branch=mlperf-inference-results-scc24 --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet From 93267a80c54b8a3b6179aa1fad3dd8df5c3f715a Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 1 Oct 2024 18:06:55 +0530 Subject: [PATCH 070/111] Added CHECK_CMD for rsync --- script/get-generic-sys-util/_cm.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/script/get-generic-sys-util/_cm.json b/script/get-generic-sys-util/_cm.json index e8ed21f1f8..90fba4fc98 100644 --- a/script/get-generic-sys-util/_cm.json +++ b/script/get-generic-sys-util/_cm.json @@ -564,7 +564,8 @@ }, "rsync": { "env": { - "CM_SYS_UTIL_NAME": "rsync" + "CM_SYS_UTIL_NAME": "rsync", + "CM_SYS_UTIL_CHECK_CMD": "rsync --version" }, "state": { "rsync": { From 1fc5a53ece13f10637cc06bcddc855a885d8eca0 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 1 Oct 2024 13:37:33 +0100 Subject: [PATCH 071/111] Update test-scc24-sdxl.yaml --- .github/workflows/test-scc24-sdxl.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-scc24-sdxl.yaml b/.github/workflows/test-scc24-sdxl.yaml index 39909ac491..29ee93f8a0 100644 --- a/.github/workflows/test-scc24-sdxl.yaml +++ b/.github/workflows/test-scc24-sdxl.yaml @@ -30,7 +30,7 @@ jobs: cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --clean cm run script --tags=run-mlperf,inference,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --clean cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions --results_dir=$HOME/scc_gh_action_results/test_results - cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/cm4mlperf-inference --repo_branch=mlperf-inference-results-scc24 --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet + cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/cm4mlperf-inference --repo_branch=mlperf-inference-results-scc24 --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet --submission_dir=$HOME/scc_gh_action_submissions build_nvidia: if: github.repository_owner == 'gateoverflow' @@ -55,4 +55,4 @@ jobs: cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --clean cm run script --tags=run-mlperf,inference,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --clean cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions --results_dir=$HOME/scc_gh_action_results/test_results - cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/cm4mlperf-inference --repo_branch=mlperf-inference-results-scc24 --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet + cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/cm4mlperf-inference --repo_branch=mlperf-inference-results-scc24 --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet --submission_dir=$HOME/scc_gh_action_submissions From 166269153ab1835d59184b6be95bf5390ac3d09a Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 1 Oct 2024 13:57:18 +0100 Subject: [PATCH 072/111] Update test-scc24-sdxl.yaml | Dont force reinstall cm4mlops --- .github/workflows/test-scc24-sdxl.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-scc24-sdxl.yaml b/.github/workflows/test-scc24-sdxl.yaml index 29ee93f8a0..49535e6cac 100644 --- a/.github/workflows/test-scc24-sdxl.yaml +++ b/.github/workflows/test-scc24-sdxl.yaml @@ -24,7 +24,7 @@ jobs: python3 -m venv gh_action source gh_action/bin/activate export CM_REPOS=$HOME/GH_CM - pip install --upgrade --force-reinstall cm4mlops + pip install --upgrade cm4mlops pip install tabulate cm pull repo cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --clean @@ -49,7 +49,7 @@ jobs: python3 -m venv gh_action source gh_action/bin/activate export CM_REPOS=$HOME/GH_CM - pip install --upgrade --force-reinstall cm4mlops + pip install --upgrade cm4mlops pip install tabulate cm pull repo cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --clean From 88f56e770a8ca3e5f0a4b829f29bb81477276f5c Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 1 Oct 2024 18:40:58 +0530 Subject: [PATCH 073/111] Support pip install of loadgen --- script/get-mlperf-inference-loadgen/_cm.yaml | 5 +++++ script/get-mlperf-inference-loadgen/customize.py | 8 ++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/script/get-mlperf-inference-loadgen/_cm.yaml b/script/get-mlperf-inference-loadgen/_cm.yaml index 09d66d5444..45e180275f 100644 --- a/script/get-mlperf-inference-loadgen/_cm.yaml +++ b/script/get-mlperf-inference-loadgen/_cm.yaml @@ -94,6 +94,11 @@ tags: - mlcommons variations: + from-pip: + env: + CM_MLPERF_INFERENCE_LOADGEN_INSTALL_FROM_PIP: 'yes' + deps: + - tags: get,generic-python-lib,_package.mlcommons-loadgen copy: add_deps: inference-src-loadgen: diff --git a/script/get-mlperf-inference-loadgen/customize.py b/script/get-mlperf-inference-loadgen/customize.py index 1298e73488..66018867eb 100644 --- a/script/get-mlperf-inference-loadgen/customize.py +++ b/script/get-mlperf-inference-loadgen/customize.py @@ -5,8 +5,8 @@ def preprocess(i): os_info = i['os_info'] -# if os_info['platform'] == 'windows': -# return {'return':1, 'error': 'Windows is not supported in this script yet'} + if env.get('CM_MLPERF_INFERENCE_LOADGEN_INSTALL_FROM_PIP', '') == 'yes': + i['run_script_input']['script_name'] = "donotrun" return {'return':0} @@ -15,6 +15,10 @@ def postprocess(i): os_info = i['os_info'] env = i['env'] + if env.get('CM_MLPERF_INFERENCE_LOADGEN_INSTALL_FROM_PIP', '') == 'yes': + return {'return':0} + + for key in ['+PYTHONPATH', '+C_INCLUDE_PATH', '+CPLUS_INCLUDE_PATH', '+LD_LIBRARY_PATH', '+DYLD_FALLBACK_LIBRARY_PATH']: # 20221024: we save and restore env in the main script and can clean env here for determinism # if key not in env: From 8371f0b43d60da4e790906f06090a2bacaa44c0a Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 1 Oct 2024 19:44:12 +0530 Subject: [PATCH 074/111] Fix bug in get-mlperf-inference-loadgen --- script/app-mlperf-inference-mlcommons-python/_cm.yaml | 5 ----- script/get-mlperf-inference-loadgen/customize.py | 1 + 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/script/app-mlperf-inference-mlcommons-python/_cm.yaml b/script/app-mlperf-inference-mlcommons-python/_cm.yaml index ab4b09a36c..b648dc54e0 100644 --- a/script/app-mlperf-inference-mlcommons-python/_cm.yaml +++ b/script/app-mlperf-inference-mlcommons-python/_cm.yaml @@ -702,11 +702,6 @@ variations: torchvision: tags: _rocm - rocm,sdxl: - add_deps: - mlperf-implementation: - tags: _repo.https://github.com/gateoverflow/inference - ray: group: framework add_deps_recursive: diff --git a/script/get-mlperf-inference-loadgen/customize.py b/script/get-mlperf-inference-loadgen/customize.py index 66018867eb..6b84198bf7 100644 --- a/script/get-mlperf-inference-loadgen/customize.py +++ b/script/get-mlperf-inference-loadgen/customize.py @@ -4,6 +4,7 @@ def preprocess(i): os_info = i['os_info'] + env = i['env'] if env.get('CM_MLPERF_INFERENCE_LOADGEN_INSTALL_FROM_PIP', '') == 'yes': i['run_script_input']['script_name'] = "donotrun" From 6f73b44ae9929e099fc75ade50e5992301f649c3 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 1 Oct 2024 16:54:45 +0100 Subject: [PATCH 075/111] Update test-mlperf-inference-resnet50.yml --- .github/workflows/test-mlperf-inference-resnet50.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/test-mlperf-inference-resnet50.yml b/.github/workflows/test-mlperf-inference-resnet50.yml index 797c6f2a0b..c1ed2aaee2 100644 --- a/.github/workflows/test-mlperf-inference-resnet50.yml +++ b/.github/workflows/test-mlperf-inference-resnet50.yml @@ -27,7 +27,6 @@ jobs: implementation: cpp - os: macos-latest backend: tf - - os: windows-latest steps: - uses: actions/checkout@v4 @@ -41,4 +40,4 @@ jobs: cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} - name: Test MLPerf Inference ResNet50 run: | - cm run script --tags=run-mlperf,inference,_submission,_short --submitter="cTuning" --hw_name=default --model=resnet50 --implementation=${{ matrix.implementation }} --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --target_qps=1 -v --quiet + cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --hw_name=default --model=resnet50 --adr.loadgen.tags=_from-pip --implementation=${{ matrix.implementation }} --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --target_qps=1 -v --quiet From 4427270f6322b6ff1618ad5c2b960359d0d7b642 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 1 Oct 2024 17:14:18 +0100 Subject: [PATCH 076/111] Update test-mlperf-inference-resnet50.yml --- .github/workflows/test-mlperf-inference-resnet50.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test-mlperf-inference-resnet50.yml b/.github/workflows/test-mlperf-inference-resnet50.yml index c1ed2aaee2..483101fbde 100644 --- a/.github/workflows/test-mlperf-inference-resnet50.yml +++ b/.github/workflows/test-mlperf-inference-resnet50.yml @@ -15,6 +15,8 @@ jobs: runs-on: ${{ matrix.os }} env: CM_INDEX: "on" + # Set WINDOWS_ADR only for windows-latest + WINDOWS_ADR: ${{ matrix.os == 'windows-latest' && ' --adr.loadgen.tags=_from-pip ' || '' }} strategy: fail-fast: false matrix: @@ -27,6 +29,8 @@ jobs: implementation: cpp - os: macos-latest backend: tf + - os: windows-latest + implementation: cpp steps: - uses: actions/checkout@v4 @@ -40,4 +44,4 @@ jobs: cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} - name: Test MLPerf Inference ResNet50 run: | - cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --hw_name=default --model=resnet50 --adr.loadgen.tags=_from-pip --implementation=${{ matrix.implementation }} --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --target_qps=1 -v --quiet + cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --hw_name=default --model=resnet50 ${WINDOWS_ADR} --implementation=${{ matrix.implementation }} --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --target_qps=1 -v --quiet From 459a440cc9f5d8efffd539b8286a8e0041025ed9 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 1 Oct 2024 21:55:26 +0530 Subject: [PATCH 077/111] Avoid get,compiler when mlperf run is with python --- script/get-mlperf-inference-sut-description/_cm.json | 8 +++++++- script/get-mlperf-inference-sut-description/customize.py | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/script/get-mlperf-inference-sut-description/_cm.json b/script/get-mlperf-inference-sut-description/_cm.json index f9c1b03454..e5b8723c49 100644 --- a/script/get-mlperf-inference-sut-description/_cm.json +++ b/script/get-mlperf-inference-sut-description/_cm.json @@ -22,7 +22,13 @@ "names": [ "compiler" ], - "tags": "get,compiler" + "tags": "get,compiler", + "skip_if_env": { + "CM_MLPERF_INFERENCE_LOADGEN_INSTALL_FROM_PIP": + [ + "yes" + ] + } }, { "tags": "get,cuda-devices,_with-pycuda", diff --git a/script/get-mlperf-inference-sut-description/customize.py b/script/get-mlperf-inference-sut-description/customize.py index cc36483c64..faf8556b65 100644 --- a/script/get-mlperf-inference-sut-description/customize.py +++ b/script/get-mlperf-inference-sut-description/customize.py @@ -8,7 +8,7 @@ def preprocess(i): state = i['state'] os_info = i['os_info'] - submitter = env.get('CM_MLPERF_SUBMITTER', 'CTuning') + submitter = env.get('CM_MLPERF_SUBMITTER', 'MLCommons') auto_detected_hw_name = False if env.get('CM_HW_NAME', '') == '': From 8e9493eabbb32e2fae61b42b44e25df4e890c82a Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 1 Oct 2024 22:48:55 +0530 Subject: [PATCH 078/111] Fix github test for pip-loadgen --- .github/workflows/test-mlperf-inference-resnet50.yml | 2 +- script/get-mlperf-inference-loadgen/_cm.yaml | 2 +- script/get-mlperf-inference-loadgen/customize.py | 4 ++-- script/run-mlperf-inference-app/_cm.yaml | 1 + 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test-mlperf-inference-resnet50.yml b/.github/workflows/test-mlperf-inference-resnet50.yml index 483101fbde..fff6fa8140 100644 --- a/.github/workflows/test-mlperf-inference-resnet50.yml +++ b/.github/workflows/test-mlperf-inference-resnet50.yml @@ -16,7 +16,7 @@ jobs: env: CM_INDEX: "on" # Set WINDOWS_ADR only for windows-latest - WINDOWS_ADR: ${{ matrix.os == 'windows-latest' && ' --adr.loadgen.tags=_from-pip ' || '' }} + WINDOWS_ADR: ${{ matrix.os == 'windows-latest' && ' --adr.loadgen.tags=_from-pip --from_pip ' || '' }} strategy: fail-fast: false matrix: diff --git a/script/get-mlperf-inference-loadgen/_cm.yaml b/script/get-mlperf-inference-loadgen/_cm.yaml index 45e180275f..50c80a92e6 100644 --- a/script/get-mlperf-inference-loadgen/_cm.yaml +++ b/script/get-mlperf-inference-loadgen/_cm.yaml @@ -96,7 +96,7 @@ tags: variations: from-pip: env: - CM_MLPERF_INFERENCE_LOADGEN_INSTALL_FROM_PIP: 'yes' + CM_TMP_MLPERF_INFERENCE_LOADGEN_INSTALL_FROM_PIP: 'yes' deps: - tags: get,generic-python-lib,_package.mlcommons-loadgen copy: diff --git a/script/get-mlperf-inference-loadgen/customize.py b/script/get-mlperf-inference-loadgen/customize.py index 6b84198bf7..077a6fae2e 100644 --- a/script/get-mlperf-inference-loadgen/customize.py +++ b/script/get-mlperf-inference-loadgen/customize.py @@ -6,7 +6,7 @@ def preprocess(i): os_info = i['os_info'] env = i['env'] - if env.get('CM_MLPERF_INFERENCE_LOADGEN_INSTALL_FROM_PIP', '') == 'yes': + if env.get('CM_TMP_MLPERF_INFERENCE_LOADGEN_INSTALL_FROM_PIP', '') == 'yes': i['run_script_input']['script_name'] = "donotrun" return {'return':0} @@ -16,7 +16,7 @@ def postprocess(i): os_info = i['os_info'] env = i['env'] - if env.get('CM_MLPERF_INFERENCE_LOADGEN_INSTALL_FROM_PIP', '') == 'yes': + if env.get('CM_TMP_MLPERF_INFERENCE_LOADGEN_INSTALL_FROM_PIP', '') == 'yes': return {'return':0} diff --git a/script/run-mlperf-inference-app/_cm.yaml b/script/run-mlperf-inference-app/_cm.yaml index efb6371502..cefdf55d40 100644 --- a/script/run-mlperf-inference-app/_cm.yaml +++ b/script/run-mlperf-inference-app/_cm.yaml @@ -57,6 +57,7 @@ input_mapping: framework: CM_MLPERF_BACKEND gpu_name: CM_NVIDIA_GPU_NAME hw_name: CM_HW_NAME + pip_loadgen: CM_MLPERF_INFERENCE_LOADGEN_INSTALL_FROM_PIP hw_notes_extra: CM_MLPERF_SUT_SW_NOTES_EXTRA imagenet_path: IMAGENET_PATH implementation: CM_MLPERF_IMPLEMENTATION From ea1ae8e7cecc8740f275e90685a04e05f502c382 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 1 Oct 2024 18:38:35 +0100 Subject: [PATCH 079/111] Update test-mlperf-inference-resnet50.yml --- .github/workflows/test-mlperf-inference-resnet50.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-mlperf-inference-resnet50.yml b/.github/workflows/test-mlperf-inference-resnet50.yml index fff6fa8140..1d99cc8324 100644 --- a/.github/workflows/test-mlperf-inference-resnet50.yml +++ b/.github/workflows/test-mlperf-inference-resnet50.yml @@ -16,7 +16,7 @@ jobs: env: CM_INDEX: "on" # Set WINDOWS_ADR only for windows-latest - WINDOWS_ADR: ${{ matrix.os == 'windows-latest' && ' --adr.loadgen.tags=_from-pip --from_pip ' || '' }} + WINDOWS_ADR: ${{ matrix.os == 'windows-latest' && ' --adr.loadgen.tags=_from-pip --pip_loadgen ' || '' }} strategy: fail-fast: false matrix: From e402dbdecb5be9ff2285c565059bc3052b5a4ce2 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 1 Oct 2024 18:42:18 +0100 Subject: [PATCH 080/111] Update test-mlperf-inference-resnet50.yml --- .github/workflows/test-mlperf-inference-resnet50.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-mlperf-inference-resnet50.yml b/.github/workflows/test-mlperf-inference-resnet50.yml index 1d99cc8324..c6fdffb16b 100644 --- a/.github/workflows/test-mlperf-inference-resnet50.yml +++ b/.github/workflows/test-mlperf-inference-resnet50.yml @@ -16,7 +16,7 @@ jobs: env: CM_INDEX: "on" # Set WINDOWS_ADR only for windows-latest - WINDOWS_ADR: ${{ matrix.os == 'windows-latest' && ' --adr.loadgen.tags=_from-pip --pip_loadgen ' || '' }} + WINDOWS_ADR: ${{ matrix.os == 'windows-latest' && ' --adr.loadgen.tags=_from-pip --pip_loadgen=yes ' || '' }} strategy: fail-fast: false matrix: From 6845d0ebb612d1c29650a21d4ba7cae19742c6e4 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 1 Oct 2024 19:06:48 +0100 Subject: [PATCH 081/111] Update test-mlperf-inference-resnet50.yml --- .github/workflows/test-mlperf-inference-resnet50.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-mlperf-inference-resnet50.yml b/.github/workflows/test-mlperf-inference-resnet50.yml index c6fdffb16b..760ee7a009 100644 --- a/.github/workflows/test-mlperf-inference-resnet50.yml +++ b/.github/workflows/test-mlperf-inference-resnet50.yml @@ -15,8 +15,6 @@ jobs: runs-on: ${{ matrix.os }} env: CM_INDEX: "on" - # Set WINDOWS_ADR only for windows-latest - WINDOWS_ADR: ${{ matrix.os == 'windows-latest' && ' --adr.loadgen.tags=_from-pip --pip_loadgen=yes ' || '' }} strategy: fail-fast: false matrix: @@ -44,4 +42,9 @@ jobs: cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} - name: Test MLPerf Inference ResNet50 run: | - cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --hw_name=default --model=resnet50 ${WINDOWS_ADR} --implementation=${{ matrix.implementation }} --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --target_qps=1 -v --quiet + if [[ "${{ matrix.os }}" == "windows-latest" ]]; then + WINDOWS_FLAGS="--adr.loadgen.tags=_from-pip --pip_loadgen=yes" + else + WINDOWS_FLAGS="" + fi + cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --hw_name=default --model=resnet50 ${WINDOWS_FLAGS} --implementation=${{ matrix.implementation }} --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --target_qps=1 -v --quiet From 8c22922975d63d4692e051bc52a7ac37e5aae201 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 1 Oct 2024 19:17:50 +0100 Subject: [PATCH 082/111] Update test-mlperf-inference-resnet50.yml --- .github/workflows/test-mlperf-inference-resnet50.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-mlperf-inference-resnet50.yml b/.github/workflows/test-mlperf-inference-resnet50.yml index 760ee7a009..774bfcd3a8 100644 --- a/.github/workflows/test-mlperf-inference-resnet50.yml +++ b/.github/workflows/test-mlperf-inference-resnet50.yml @@ -42,7 +42,7 @@ jobs: cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} - name: Test MLPerf Inference ResNet50 run: | - if [[ "${{ matrix.os }}" == "windows-latest" ]]; then + if [ "${{ matrix.os }}" = "windows-latest" ]; then WINDOWS_FLAGS="--adr.loadgen.tags=_from-pip --pip_loadgen=yes" else WINDOWS_FLAGS="" From 3a05a1c4319fbf1b53f29b51ff6782fbb282b77f Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 1 Oct 2024 19:26:15 +0100 Subject: [PATCH 083/111] Update test-mlperf-inference-resnet50.yml --- .../workflows/test-mlperf-inference-resnet50.yml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/test-mlperf-inference-resnet50.yml b/.github/workflows/test-mlperf-inference-resnet50.yml index 774bfcd3a8..99bdf7edd1 100644 --- a/.github/workflows/test-mlperf-inference-resnet50.yml +++ b/.github/workflows/test-mlperf-inference-resnet50.yml @@ -40,11 +40,12 @@ jobs: run: | python3 -m pip install cmind cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} - - name: Test MLPerf Inference ResNet50 + - name: Test MLPerf Inference ResNet50 (Windows) run: | - if [ "${{ matrix.os }}" = "windows-latest" ]; then - WINDOWS_FLAGS="--adr.loadgen.tags=_from-pip --pip_loadgen=yes" - else - WINDOWS_FLAGS="" - fi - cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --hw_name=default --model=resnet50 ${WINDOWS_FLAGS} --implementation=${{ matrix.implementation }} --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --target_qps=1 -v --quiet + if: matrix.os == 'windows-latest' + cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --hw_name=default --model=resnet50 --adr.loadgen.tags=_from-pip --pip_loadgen=yes --implementation=${{ matrix.implementation }} --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --target_qps=1 -v --quiet + - name: Test MLPerf Inference ResNet50 (Linux/macOS) + run: | + if: matrix.os != 'windows-latest' + cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --hw_name=default --model=resnet50 --implementation=${{ matrix.implementation }} --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --target_qps=1 -v --quiet + From d676c6369f8d935f3bfebb751d2e293c30e2b7e1 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 1 Oct 2024 19:28:39 +0100 Subject: [PATCH 084/111] Update test-mlperf-inference-resnet50.yml --- .github/workflows/test-mlperf-inference-resnet50.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-mlperf-inference-resnet50.yml b/.github/workflows/test-mlperf-inference-resnet50.yml index 99bdf7edd1..f665945f88 100644 --- a/.github/workflows/test-mlperf-inference-resnet50.yml +++ b/.github/workflows/test-mlperf-inference-resnet50.yml @@ -41,11 +41,11 @@ jobs: python3 -m pip install cmind cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} - name: Test MLPerf Inference ResNet50 (Windows) + if: matrix.os == 'windows-latest' run: | - if: matrix.os == 'windows-latest' cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --hw_name=default --model=resnet50 --adr.loadgen.tags=_from-pip --pip_loadgen=yes --implementation=${{ matrix.implementation }} --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --target_qps=1 -v --quiet - name: Test MLPerf Inference ResNet50 (Linux/macOS) + if: matrix.os != 'windows-latest' run: | - if: matrix.os != 'windows-latest' cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --hw_name=default --model=resnet50 --implementation=${{ matrix.implementation }} --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --target_qps=1 -v --quiet From 1de5a86e48be71c4bdff8de8b155137a8f0c6286 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 2 Oct 2024 00:07:17 +0530 Subject: [PATCH 085/111] Skip get,compiler for mlperf-loadgen when using pypi --- script/get-mlperf-inference-loadgen/_cm.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/script/get-mlperf-inference-loadgen/_cm.yaml b/script/get-mlperf-inference-loadgen/_cm.yaml index 50c80a92e6..a097a1edde 100644 --- a/script/get-mlperf-inference-loadgen/_cm.yaml +++ b/script/get-mlperf-inference-loadgen/_cm.yaml @@ -40,13 +40,18 @@ deps: - CM_MLPERF_INFERENCE_LOADGEN_DOWNLOAD_URL - names: - compiler - skip_if_env: + skip_if_any_env: CM_HOST_OS_TYPE: - windows + CM_TMP_MLPERF_INFERENCE_LOADGEN_INSTALL_FROM_PIP: + - 'yes' tags: get,compiler - enable_if_env: CM_HOST_OS_TYPE: - windows + skip_if_env: + CM_TMP_MLPERF_INFERENCE_LOADGEN_INSTALL_FROM_PIP: + - 'yes' names: - compiler tags: get,cl From dcad163e96d9d3cf76aa905fa61146cc9257ace2 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 2 Oct 2024 00:19:02 +0530 Subject: [PATCH 086/111] Make mlperf-inference submission checker windows compatible --- .../run-mlperf-inference-submission-checker/customize.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/script/run-mlperf-inference-submission-checker/customize.py b/script/run-mlperf-inference-submission-checker/customize.py index 51bb38cfa3..af0ee5a4d9 100644 --- a/script/run-mlperf-inference-submission-checker/customize.py +++ b/script/run-mlperf-inference-submission-checker/customize.py @@ -7,6 +7,8 @@ def preprocess(i): os_info = i['os_info'] env = i['env'] + q = '"' if os_info['platform'] == 'windows' else "'" + submission_dir = env.get("CM_MLPERF_INFERENCE_SUBMISSION_DIR", "") version = env.get('CM_MLPERF_SUBMISSION_CHECKER_VERSION','') @@ -49,11 +51,11 @@ def preprocess(i): extra_args = ' ' + env.get('CM_MLPERF_SUBMISSION_CHECKER_EXTRA_ARGS','') - x_submitter = ' --submitter "' + submitter + '" ' if submitter!='' else '' + x_submitter = ' --submitter ' + q + submitter + q if submitter!='' else '' x_version = ' --version ' + version +' ' if version!='' else '' - CMD = env['CM_PYTHON_BIN_WITH_PATH'] + ' \'' + submission_checker_file + '\' --input \'' + submission_dir + '\'' + \ + CMD = env['CM_PYTHON_BIN_WITH_PATH'] + q + submission_checker_file + q +' --input ' + q + submission_dir + q + \ x_submitter + \ x_version + \ skip_compliance + extra_map + power_check + extra_args @@ -65,7 +67,7 @@ def preprocess(i): "generate_final_report.py") env['CM_RUN_CMD'] = CMD print(CMD) - env['CM_POST_RUN_CMD'] = env['CM_PYTHON_BIN_WITH_PATH'] + ' \'' + report_generator_file + '\' --input summary.csv' + \ + env['CM_POST_RUN_CMD'] = env['CM_PYTHON_BIN_WITH_PATH'] + q + report_generator_file + q + ' --input summary.csv ' + \ x_version + \ x_submission_repository From 0e3f58bcbca0ecd67edf9d43a145f7133a902ce2 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 2 Oct 2024 00:27:02 +0530 Subject: [PATCH 087/111] Make mlperf-inference submission checker windows compatible --- script/run-mlperf-inference-submission-checker/customize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/script/run-mlperf-inference-submission-checker/customize.py b/script/run-mlperf-inference-submission-checker/customize.py index af0ee5a4d9..5c863bfca3 100644 --- a/script/run-mlperf-inference-submission-checker/customize.py +++ b/script/run-mlperf-inference-submission-checker/customize.py @@ -55,7 +55,7 @@ def preprocess(i): x_version = ' --version ' + version +' ' if version!='' else '' - CMD = env['CM_PYTHON_BIN_WITH_PATH'] + q + submission_checker_file + q +' --input ' + q + submission_dir + q + \ + CMD = env['CM_PYTHON_BIN_WITH_PATH'] + ' '+ q + submission_checker_file + q +' --input ' + q + submission_dir + q + \ x_submitter + \ x_version + \ skip_compliance + extra_map + power_check + extra_args @@ -67,7 +67,7 @@ def preprocess(i): "generate_final_report.py") env['CM_RUN_CMD'] = CMD print(CMD) - env['CM_POST_RUN_CMD'] = env['CM_PYTHON_BIN_WITH_PATH'] + q + report_generator_file + q + ' --input summary.csv ' + \ + env['CM_POST_RUN_CMD'] = env['CM_PYTHON_BIN_WITH_PATH'] +' ' + q + report_generator_file + q + ' --input summary.csv ' + \ x_version + \ x_submission_repository From a38837fb1a031eb97dbb3e5ff1ccf698681cc8ae Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 1 Oct 2024 23:21:29 +0100 Subject: [PATCH 088/111] Update test-mlperf-inference-llama2.yml --- .github/workflows/test-mlperf-inference-llama2.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-mlperf-inference-llama2.yml b/.github/workflows/test-mlperf-inference-llama2.yml index eae18729c7..b73f3fd875 100644 --- a/.github/workflows/test-mlperf-inference-llama2.yml +++ b/.github/workflows/test-mlperf-inference-llama2.yml @@ -5,7 +5,7 @@ name: MLPerf inference LLAMA 2 70B on: schedule: - - cron: "30 19 * * *" + - cron: "30 19 * * 4" jobs: build_reference: From 5dbadf59bbf53e619ddb61b4dba5cca51f0de97f Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 2 Oct 2024 20:48:27 +0530 Subject: [PATCH 089/111] Handled some more cases for detect-sudo --- script/detect-sudo/customize.py | 33 +++++++++++++++++-- .../detect_memory.sh | 2 +- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/script/detect-sudo/customize.py b/script/detect-sudo/customize.py index ad4289efda..dbc9b89705 100644 --- a/script/detect-sudo/customize.py +++ b/script/detect-sudo/customize.py @@ -18,9 +18,38 @@ def preprocess(i): if prompt_sudo() == 0: env['CM_SUDO_USER'] = "yes" + if os.geteuid() == 0: + env['CM_SUDO'] = '' #root user does not need sudo + else: + if can_execute_sudo_without_password(): + env['CM_SUDO_USER'] = "yes" + env['CM_SUDO'] = 'sudo' + else: + env['CM_SUDO_USER'] = "no" + env['CM_SUDO'] = '' return {'return':0} +def can_execute_sudo_without_password(): + try: + # Run a harmless command using sudo + result = subprocess.run( + ['sudo', '-n', 'true'], # -n prevents sudo from prompting for a password + stdout=subprocess.PIPE, + stderr=subprocess.PIPE + ) + + # Check the return code; if it's 0, sudo executed without needing a password + if result.returncode == 0: + return True + else: + return False + except Exception as e: + print(f"An error occurred: {e}") + return False + + + def reset_terminal(): """Reset terminal to default settings.""" subprocess.run(['stty', 'sane']) @@ -64,7 +93,7 @@ def is_user_in_sudo_group(): return False def prompt_sudo(): - if os.geteuid() != 0 or is_user_in_sudo_group(): # No sudo required for root user + if os.geteuid() != 0 or not is_user_in_sudo_group(): # No sudo required for root user msg = "[sudo] password for %u:" while True: try: @@ -85,7 +114,7 @@ def prompt_sudo(): reset_terminal() # Always reset terminal after error return -1 - return -1 + return 0 def postprocess(i): diff --git a/script/get-mlperf-inference-sut-description/detect_memory.sh b/script/get-mlperf-inference-sut-description/detect_memory.sh index edc338c799..8a65daa139 100644 --- a/script/get-mlperf-inference-sut-description/detect_memory.sh +++ b/script/get-mlperf-inference-sut-description/detect_memory.sh @@ -1,7 +1,7 @@ #!/bin/bash if [[ ${CM_SUDO_USER} == "yes" ]]; then - sudo dmidecode -t memory > meminfo.out + ${CM_SUDO} dmidecode -t memory > meminfo.out ${CM_PYTHON_BIN_WITH_PATH} ${CM_TMP_CURRENT_SCRIPT_PATH}/get_memory_info.py fi test $? -eq 0 || return $? From b374e90b06cd7ed9f6ff480d83997af366301390 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 2 Oct 2024 17:01:26 +0100 Subject: [PATCH 090/111] Update test-mlperf-inference-gptj.yml --- .github/workflows/test-mlperf-inference-gptj.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test-mlperf-inference-gptj.yml b/.github/workflows/test-mlperf-inference-gptj.yml index c4d896b54c..690435b3e9 100644 --- a/.github/workflows/test-mlperf-inference-gptj.yml +++ b/.github/workflows/test-mlperf-inference-gptj.yml @@ -29,3 +29,5 @@ jobs: - name: Test MLPerf Inference GPTJ run: | cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --docker --model=gptj-99 --backend=${{ matrix.backend }} --device=cuda --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --beam_size=1 --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean + cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/mlperf_inference_test_submissions_v5.0 --repo_branch=main --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet --submission_dir=$HOME/scc_gh_action_submissions + From 6de86775cce378bc0c32dce0c60b8784748ec3e3 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 3 Oct 2024 13:56:37 +0530 Subject: [PATCH 091/111] added hf token - to prevent user interaction if model is absent --- .github/workflows/test-mlperf-inference-llama2.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test-mlperf-inference-llama2.yml b/.github/workflows/test-mlperf-inference-llama2.yml index b73f3fd875..2ca2fec29d 100644 --- a/.github/workflows/test-mlperf-inference-llama2.yml +++ b/.github/workflows/test-mlperf-inference-llama2.yml @@ -26,6 +26,8 @@ jobs: export CM_REPOS=$HOME/GH_CM python3 -m pip install cm4mlops cm pull repo + pip install -U "huggingface_hub[cli]" + huggingface-cli login --token ${{ secrets.HF_TOKEN }} --add-to-git-credential - name: Test MLPerf Inference LLAMA 2 70B reference implementation run: | cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --model=llama2-70b-99 --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --target_qps=1 --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --env.CM_MLPERF_MODEL_LLAMA2_70B_DOWNLOAD_TO_HOST=yes --adr.inference-src.env.CM_GIT_URL=https://github.com/anandhu-eng/inference.git --clean From 30751f00441b97147b69cc11e134c6f253618ef2 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 3 Oct 2024 14:00:30 +0530 Subject: [PATCH 092/111] installation of hf cli library limited to local virtual env --- .github/workflows/test-mlperf-inference-llama2.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-mlperf-inference-llama2.yml b/.github/workflows/test-mlperf-inference-llama2.yml index 2ca2fec29d..b26990608b 100644 --- a/.github/workflows/test-mlperf-inference-llama2.yml +++ b/.github/workflows/test-mlperf-inference-llama2.yml @@ -26,7 +26,7 @@ jobs: export CM_REPOS=$HOME/GH_CM python3 -m pip install cm4mlops cm pull repo - pip install -U "huggingface_hub[cli]" + pip install "huggingface_hub[cli]" huggingface-cli login --token ${{ secrets.HF_TOKEN }} --add-to-git-credential - name: Test MLPerf Inference LLAMA 2 70B reference implementation run: | From 1d177dd5fd33d312699116ae0df70b1666df7169 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 3 Oct 2024 14:02:46 +0530 Subject: [PATCH 093/111] pip run through python interpreter --- .github/workflows/test-mlperf-inference-llama2.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-mlperf-inference-llama2.yml b/.github/workflows/test-mlperf-inference-llama2.yml index b26990608b..d974ee5b4e 100644 --- a/.github/workflows/test-mlperf-inference-llama2.yml +++ b/.github/workflows/test-mlperf-inference-llama2.yml @@ -26,7 +26,7 @@ jobs: export CM_REPOS=$HOME/GH_CM python3 -m pip install cm4mlops cm pull repo - pip install "huggingface_hub[cli]" + python3 -m pip install "huggingface_hub[cli]" huggingface-cli login --token ${{ secrets.HF_TOKEN }} --add-to-git-credential - name: Test MLPerf Inference LLAMA 2 70B reference implementation run: | From 84a333fe7ee117e33d2ba8402d9760e5d8ea23d2 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 3 Oct 2024 09:38:03 +0100 Subject: [PATCH 094/111] Update test-scc24-sdxl.yaml --- .github/workflows/test-scc24-sdxl.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-scc24-sdxl.yaml b/.github/workflows/test-scc24-sdxl.yaml index 49535e6cac..680d0f5f43 100644 --- a/.github/workflows/test-scc24-sdxl.yaml +++ b/.github/workflows/test-scc24-sdxl.yaml @@ -29,7 +29,7 @@ jobs: cm pull repo cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --clean cm run script --tags=run-mlperf,inference,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --clean - cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions --results_dir=$HOME/scc_gh_action_results/test_results + cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions --results_dir=$HOME/scc_gh_action_results/test_results cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/cm4mlperf-inference --repo_branch=mlperf-inference-results-scc24 --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet --submission_dir=$HOME/scc_gh_action_submissions build_nvidia: @@ -52,7 +52,7 @@ jobs: pip install --upgrade cm4mlops pip install tabulate cm pull repo - cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --clean + cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --hw_name=go-spr --clean cm run script --tags=run-mlperf,inference,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --clean - cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions --results_dir=$HOME/scc_gh_action_results/test_results + cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions --results_dir=$HOME/scc_gh_action_results/test_results cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/cm4mlperf-inference --repo_branch=mlperf-inference-results-scc24 --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet --submission_dir=$HOME/scc_gh_action_submissions From 3e5855c2224f79f1d767f51f4db156e6bacca719 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 3 Oct 2024 10:02:51 +0100 Subject: [PATCH 095/111] Update test-mlperf-inference-llama2.yml --- .github/workflows/test-mlperf-inference-llama2.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-mlperf-inference-llama2.yml b/.github/workflows/test-mlperf-inference-llama2.yml index d974ee5b4e..97bd1bc6fc 100644 --- a/.github/workflows/test-mlperf-inference-llama2.yml +++ b/.github/workflows/test-mlperf-inference-llama2.yml @@ -30,4 +30,4 @@ jobs: huggingface-cli login --token ${{ secrets.HF_TOKEN }} --add-to-git-credential - name: Test MLPerf Inference LLAMA 2 70B reference implementation run: | - cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --model=llama2-70b-99 --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --target_qps=1 --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --env.CM_MLPERF_MODEL_LLAMA2_70B_DOWNLOAD_TO_HOST=yes --adr.inference-src.env.CM_GIT_URL=https://github.com/anandhu-eng/inference.git --clean + cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --model=llama2-70b-99 --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --target_qps=1 --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --env.CM_MLPERF_MODEL_LLAMA2_70B_DOWNLOAD_TO_HOST=yes --adr.inference-src.tags=_repo.https://github.com/anandhu-eng/inference.git --clean From dc92673ad27e5c95208698d2ce1ff51f501216dd Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 3 Oct 2024 10:05:04 +0100 Subject: [PATCH 096/111] Update test-mlperf-inference-gptj.yml --- .github/workflows/test-mlperf-inference-gptj.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-mlperf-inference-gptj.yml b/.github/workflows/test-mlperf-inference-gptj.yml index 690435b3e9..a2da8e4fe7 100644 --- a/.github/workflows/test-mlperf-inference-gptj.yml +++ b/.github/workflows/test-mlperf-inference-gptj.yml @@ -5,7 +5,7 @@ name: MLPerf inference GPT-J on: schedule: - - cron: "1 */6 * * *" + - cron: "1 2 * * *" jobs: build: From 2eefa7eee5b3fcebaa01eccd0b6590be77a980b3 Mon Sep 17 00:00:00 2001 From: Grigori Fursin Date: Thu, 3 Oct 2024 14:35:39 +0200 Subject: [PATCH 097/111] clean up --- script/app-image-corner-detection/_cm.json | 34 ---------------------- 1 file changed, 34 deletions(-) delete mode 100644 script/app-image-corner-detection/_cm.json diff --git a/script/app-image-corner-detection/_cm.json b/script/app-image-corner-detection/_cm.json deleted file mode 100644 index 405654f5ee..0000000000 --- a/script/app-image-corner-detection/_cm.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "alias": "app-image-corner-detection", - "automation_alias": "script", - "automation_uid": "5b4e0237da074764", - "category": "Modular application pipeline", - "deps": [ - {"tags":"detect,os"}, - {"tags":"detect,cpu"} - ], - "posthook_deps": [ - { - "skip_if_env": { - "CM_SKIP_COMPILE": [ - "on" - ] - }, - "tags": "compile,cpp-program" - }, - { - "skip_if_env": { - "CM_SKIP_RUN": [ - "on" - ] - }, - "tags": "benchmark-program" - } - ], - "tags": [ - "app", - "image", - "corner-detection" - ], - "uid": "998ffee0bc534d0a" -} From d5f275ad39320079813abcba74a68d3a41f9ed46 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 3 Oct 2024 20:51:06 +0530 Subject: [PATCH 098/111] Fix numpy version for SDXL accuracy --- script/process-mlperf-accuracy/_cm.json | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/script/process-mlperf-accuracy/_cm.json b/script/process-mlperf-accuracy/_cm.json index 7acbd4adf5..cd4028a533 100644 --- a/script/process-mlperf-accuracy/_cm.json +++ b/script/process-mlperf-accuracy/_cm.json @@ -369,6 +369,15 @@ }, { "tags": "get,generic-python-lib,_package.ijson" + }, + { + "tags": "get,generic-python-lib,_package.numpy", + "version_max": "1.22", + "version_max_usable": "1.22", + "names": [ + "pip-package", + "numpy" + ] } ], "env": { From 4631186cdc60c0ecdc7e4dee607147d1fcd53210 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 4 Oct 2024 11:21:22 +0100 Subject: [PATCH 099/111] Update test-mlperf-inference-gptj.yml --- .github/workflows/test-mlperf-inference-gptj.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-mlperf-inference-gptj.yml b/.github/workflows/test-mlperf-inference-gptj.yml index a2da8e4fe7..1c59dea469 100644 --- a/.github/workflows/test-mlperf-inference-gptj.yml +++ b/.github/workflows/test-mlperf-inference-gptj.yml @@ -29,5 +29,5 @@ jobs: - name: Test MLPerf Inference GPTJ run: | cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --docker --model=gptj-99 --backend=${{ matrix.backend }} --device=cuda --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --beam_size=1 --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean - cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/mlperf_inference_test_submissions_v5.0 --repo_branch=main --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet --submission_dir=$HOME/scc_gh_action_submissions + cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/mlperf_inference_test_submissions_v5.0 --repo_branch=main --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet --submission_dir=$HOME/gh_action_submissions From 4acebf32ad21e42ee1df71ef7ebd9a6e22d72849 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 4 Oct 2024 13:49:06 +0100 Subject: [PATCH 100/111] Update test-mlperf-inference-resnet50.yml --- .github/workflows/test-mlperf-inference-resnet50.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-mlperf-inference-resnet50.yml b/.github/workflows/test-mlperf-inference-resnet50.yml index f665945f88..ef3033aa86 100644 --- a/.github/workflows/test-mlperf-inference-resnet50.yml +++ b/.github/workflows/test-mlperf-inference-resnet50.yml @@ -43,9 +43,12 @@ jobs: - name: Test MLPerf Inference ResNet50 (Windows) if: matrix.os == 'windows-latest' run: | - cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --hw_name=default --model=resnet50 --adr.loadgen.tags=_from-pip --pip_loadgen=yes --implementation=${{ matrix.implementation }} --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --target_qps=1 -v --quiet + cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --hw_name=gh_windows --model=resnet50 --adr.loadgen.tags=_from-pip --pip_loadgen=yes --implementation=${{ matrix.implementation }} --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --target_qps=1 -v --quiet - name: Test MLPerf Inference ResNet50 (Linux/macOS) if: matrix.os != 'windows-latest' run: | - cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --hw_name=default --model=resnet50 --implementation=${{ matrix.implementation }} --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --target_qps=1 -v --quiet - + cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --hw_name=gh_${{ matrix.os }}_x86 --model=resnet50 --implementation=${{ matrix.implementation }} --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --target_qps=1 -v --quiet + - name: Push Results + if: github.repository_owner == 'gateoverflow' + run: | + cm run script --tags=push,github,mlperf,inference,submission,_short-run --repo_url=https://github.com/gateoverflow/mlperf_inference_test_submissions_v5.0 --repo_branch=main --commit_message="Results from R50 GH action" --quiet From cd1e44748cd5951d0dcb56f41a64725cbbb88e1c Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 4 Oct 2024 14:43:44 +0100 Subject: [PATCH 101/111] Update test-mlperf-inference-resnet50.yml --- .github/workflows/test-mlperf-inference-resnet50.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-mlperf-inference-resnet50.yml b/.github/workflows/test-mlperf-inference-resnet50.yml index ef3033aa86..0c6d2935e0 100644 --- a/.github/workflows/test-mlperf-inference-resnet50.yml +++ b/.github/workflows/test-mlperf-inference-resnet50.yml @@ -51,4 +51,4 @@ jobs: - name: Push Results if: github.repository_owner == 'gateoverflow' run: | - cm run script --tags=push,github,mlperf,inference,submission,_short-run --repo_url=https://github.com/gateoverflow/mlperf_inference_test_submissions_v5.0 --repo_branch=main --commit_message="Results from R50 GH action" --quiet + cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/mlperf_inference_test_submissions_v5.0 --repo_branch=main --commit_message="Results from R50 GH action" --quiet From ea8bb70759a563dfbc3dc60d526960c7bd835b83 Mon Sep 17 00:00:00 2001 From: Grigori Fursin Date: Fri, 4 Oct 2024 16:02:12 +0200 Subject: [PATCH 102/111] link to zenodo --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 06c2cd2bcf..40d09dbc10 100644 --- a/README.md +++ b/README.md @@ -141,8 +141,7 @@ cm run script \ ## CM concepts -* https://doi.org/10.5281/zenodo.8105339 -* https://arxiv.org/abs/2406.16791 +Check our [ACM REP'23 keynote](https://doi.org/10.5281/zenodo.8105339). ## Authors From e446ae11d8021da408cd45f5c045d2a5ad67788a Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 4 Oct 2024 15:12:27 +0100 Subject: [PATCH 103/111] Update test-mlperf-inference-resnet50.yml --- .github/workflows/test-mlperf-inference-resnet50.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/test-mlperf-inference-resnet50.yml b/.github/workflows/test-mlperf-inference-resnet50.yml index 0c6d2935e0..e4d08e7eea 100644 --- a/.github/workflows/test-mlperf-inference-resnet50.yml +++ b/.github/workflows/test-mlperf-inference-resnet50.yml @@ -51,4 +51,8 @@ jobs: - name: Push Results if: github.repository_owner == 'gateoverflow' run: | + USER="GitHub Action" + EMAIL=admin@gateoverflow.com + git config --global user.name "$USER" + git config --global user.email "$EMAIL" cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/mlperf_inference_test_submissions_v5.0 --repo_branch=main --commit_message="Results from R50 GH action" --quiet From 54c9fb9a86d1af6e3ba7689473cf0a176d7ef779 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 4 Oct 2024 15:45:54 +0100 Subject: [PATCH 104/111] Update test-mlperf-inference-resnet50.yml --- .github/workflows/test-mlperf-inference-resnet50.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test-mlperf-inference-resnet50.yml b/.github/workflows/test-mlperf-inference-resnet50.yml index e4d08e7eea..639fd29e01 100644 --- a/.github/workflows/test-mlperf-inference-resnet50.yml +++ b/.github/workflows/test-mlperf-inference-resnet50.yml @@ -55,4 +55,5 @@ jobs: EMAIL=admin@gateoverflow.com git config --global user.name "$USER" git config --global user.email "$EMAIL" + cm run script --tags=auth,gh,cli --env.CM_GH_AUTH_TOKEN=${{ secrets.TEST_RESULTS_GITHUB_TOKEN }} cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/mlperf_inference_test_submissions_v5.0 --repo_branch=main --commit_message="Results from R50 GH action" --quiet From c516986cf1aeda95af8992be8b07e2740695268b Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 4 Oct 2024 15:54:52 +0100 Subject: [PATCH 105/111] Update test-mlperf-inference-resnet50.yml --- .github/workflows/test-mlperf-inference-resnet50.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-mlperf-inference-resnet50.yml b/.github/workflows/test-mlperf-inference-resnet50.yml index 639fd29e01..750b8c3a3b 100644 --- a/.github/workflows/test-mlperf-inference-resnet50.yml +++ b/.github/workflows/test-mlperf-inference-resnet50.yml @@ -55,5 +55,5 @@ jobs: EMAIL=admin@gateoverflow.com git config --global user.name "$USER" git config --global user.email "$EMAIL" - cm run script --tags=auth,gh,cli --env.CM_GH_AUTH_TOKEN=${{ secrets.TEST_RESULTS_GITHUB_TOKEN }} + cm run script --tags=auth,gh,cli --with_token=${{ secrets. TEST_RESULTS_GITHUB_TOKEN }} cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/mlperf_inference_test_submissions_v5.0 --repo_branch=main --commit_message="Results from R50 GH action" --quiet From b1151a6f470745deea9a1b60300e4891c1baa7a0 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 4 Oct 2024 15:59:32 +0100 Subject: [PATCH 106/111] Update test-mlperf-inference-resnet50.yml --- .github/workflows/test-mlperf-inference-resnet50.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-mlperf-inference-resnet50.yml b/.github/workflows/test-mlperf-inference-resnet50.yml index 750b8c3a3b..eede2a7569 100644 --- a/.github/workflows/test-mlperf-inference-resnet50.yml +++ b/.github/workflows/test-mlperf-inference-resnet50.yml @@ -55,5 +55,5 @@ jobs: EMAIL=admin@gateoverflow.com git config --global user.name "$USER" git config --global user.email "$EMAIL" - cm run script --tags=auth,gh,cli --with_token=${{ secrets. TEST_RESULTS_GITHUB_TOKEN }} + cm run script --tags=auth,gh,cli --with_token=${{ secrets.TEST_RESULTS_GITHUB_TOKEN }} cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/mlperf_inference_test_submissions_v5.0 --repo_branch=main --commit_message="Results from R50 GH action" --quiet From d2d187f3d6e9ffb72e57b8858e36905513bc19d5 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 4 Oct 2024 16:25:47 +0100 Subject: [PATCH 107/111] Update test-mlperf-inference-resnet50.yml --- .github/workflows/test-mlperf-inference-resnet50.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-mlperf-inference-resnet50.yml b/.github/workflows/test-mlperf-inference-resnet50.yml index eede2a7569..8fffc5e623 100644 --- a/.github/workflows/test-mlperf-inference-resnet50.yml +++ b/.github/workflows/test-mlperf-inference-resnet50.yml @@ -50,10 +50,11 @@ jobs: cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --hw_name=gh_${{ matrix.os }}_x86 --model=resnet50 --implementation=${{ matrix.implementation }} --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --target_qps=1 -v --quiet - name: Push Results if: github.repository_owner == 'gateoverflow' + env: + USER: "GitHub Action" + EMAIL: "admin@gateoverflow.com" run: | - USER="GitHub Action" - EMAIL=admin@gateoverflow.com git config --global user.name "$USER" git config --global user.email "$EMAIL" - cm run script --tags=auth,gh,cli --with_token=${{ secrets.TEST_RESULTS_GITHUB_TOKEN }} + cm run script --tags=auth,gh,cli --with_token="${{ secrets.TEST_RESULTS_GITHUB_TOKEN }}" cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/mlperf_inference_test_submissions_v5.0 --repo_branch=main --commit_message="Results from R50 GH action" --quiet From 1e355b7216503c9794d5abb97e52f822cc4bd2fc Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 4 Oct 2024 16:31:13 +0100 Subject: [PATCH 108/111] Update test-mlperf-inference-resnet50.yml --- .github/workflows/test-mlperf-inference-resnet50.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-mlperf-inference-resnet50.yml b/.github/workflows/test-mlperf-inference-resnet50.yml index 8fffc5e623..4b7e00fe10 100644 --- a/.github/workflows/test-mlperf-inference-resnet50.yml +++ b/.github/workflows/test-mlperf-inference-resnet50.yml @@ -4,7 +4,7 @@ name: MLPerf inference ResNet50 on: - pull_request: + pull_request_target: branches: [ "main", "dev", "mlperf-inference" ] paths: - '.github/workflows/test-mlperf-inference-resnet50.yml' From c00de07dc5675573c97380f867a045d4f572836d Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 4 Oct 2024 17:09:33 +0100 Subject: [PATCH 109/111] Update test-mlperf-inference-resnet50.yml --- .github/workflows/test-mlperf-inference-resnet50.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/test-mlperf-inference-resnet50.yml b/.github/workflows/test-mlperf-inference-resnet50.yml index 4b7e00fe10..ddbad981ff 100644 --- a/.github/workflows/test-mlperf-inference-resnet50.yml +++ b/.github/workflows/test-mlperf-inference-resnet50.yml @@ -56,5 +56,10 @@ jobs: run: | git config --global user.name "$USER" git config --global user.email "$EMAIL" + git config --global credential.https://github.com.helper "" + git config --global credential.https://github.com.helper "!/usr/bin/gh auth git-credential" + git config --global credential.https://gist.github.com.helper "" + git config --global credential.https://gist.github.com.helper "!/usr/bin/gh auth git-credential" + cm run script --tags=auth,gh,cli --with_token="${{ secrets.TEST_RESULTS_GITHUB_TOKEN }}" cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/mlperf_inference_test_submissions_v5.0 --repo_branch=main --commit_message="Results from R50 GH action" --quiet From 0edaa6e9f978a12d40be20ade3604610ed0a9d6e Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 4 Oct 2024 17:25:22 +0100 Subject: [PATCH 110/111] Update test-mlperf-inference-resnet50.yml --- .github/workflows/test-mlperf-inference-resnet50.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-mlperf-inference-resnet50.yml b/.github/workflows/test-mlperf-inference-resnet50.yml index ddbad981ff..7211a7cf5b 100644 --- a/.github/workflows/test-mlperf-inference-resnet50.yml +++ b/.github/workflows/test-mlperf-inference-resnet50.yml @@ -57,9 +57,9 @@ jobs: git config --global user.name "$USER" git config --global user.email "$EMAIL" git config --global credential.https://github.com.helper "" - git config --global credential.https://github.com.helper "!/usr/bin/gh auth git-credential" + git config --global credential.https://github.com.helper "!gh auth git-credential" git config --global credential.https://gist.github.com.helper "" - git config --global credential.https://gist.github.com.helper "!/usr/bin/gh auth git-credential" + git config --global credential.https://gist.github.com.helper "!gh auth git-credential" cm run script --tags=auth,gh,cli --with_token="${{ secrets.TEST_RESULTS_GITHUB_TOKEN }}" cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/mlperf_inference_test_submissions_v5.0 --repo_branch=main --commit_message="Results from R50 GH action" --quiet From 238325fe9cfab624d567ccd56778d6fac02c791d Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 4 Oct 2024 18:58:45 +0100 Subject: [PATCH 111/111] Update README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 40d09dbc10..deff5458ce 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,5 @@ ## Unified and cross-platform CM interface for DevOps, MLOps and MLPerf -[![arXiv](https://img.shields.io/badge/arXiv-2406.16791-b31b1b.svg)](https://arxiv.org/abs/2406.16791) [![License](https://img.shields.io/badge/License-Apache%202.0-green)](LICENSE.md) [![Python Version](https://img.shields.io/badge/python-3+-blue.svg)](https://github.com/mlcommons/ck/tree/master/cm/cmind) [![Powered by CM](https://img.shields.io/badge/Powered_by-MLCommons%20CM-blue)](https://github.com/mlcommons/ck).