Skip to content

Refactor and parallelize MaxText test runner #10

Refactor and parallelize MaxText test runner

Refactor and parallelize MaxText test runner #10

Workflow file for this run

# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
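# This workflow checks that gsutil is available, calls the build_upload_internal.yml reusable workflow
# for TPU and GPU, runs the unit and integration tests in per-run containers on self-hosted runners,
# and finally deletes the per-run container images.
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python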
name: Tests

# Triggers: every pull request, pushes to main, manual dispatch, and a cron schedule.
on:
  pull_request:
  push:
    branches:
      - main
  workflow_dispatch:
  schedule:
    # Run the job every 6 hours
    - cron: '0 */6 * * *'
jobs:
# Pre-flight job: verifies that gsutil is on the self-hosted runner's PATH.
check_dependencies:
  runs-on:
    - self-hosted
  steps:
    - name: Test gsutil installation
      # Exit with code 24 and a message on stderr when gsutil cannot be found.
      run: |
        if ! which gsutil >/dev/null 2>&1; then
          echo >&2 "gsutil is required but not installed. Aborting"
          exit 24
        fi
# Calls the build_upload_internal.yml reusable workflow with the TPU settings.
# NOTE(review): presumably this produces the maxtext_<run_id>:tpu image used by
# later jobs - confirm in build_upload_internal.yml.
tpu_image:
  needs: check_dependencies
  uses: ./.github/workflows/build_upload_internal.yml
  with:
    device_type: tpu
    device_name: v4-8
    build_mode: stable
  # No matrix is declared on this job in this file.
  strategy:
    fail-fast: false
# Calls the build_upload_internal.yml reusable workflow with the GPU settings.
# NOTE(review): presumably this produces the maxtext_<run_id>:gpu image used by
# later jobs - confirm in build_upload_internal.yml.
gpu_image:
  needs: check_dependencies
  uses: ./.github/workflows/build_upload_internal.yml
  with:
    device_type: gpu
    device_name: a100-40gb-4
    build_mode: pinned
  # No matrix is declared on this job in this file.
  strategy:
    fail-fast: false
# Runs the unit tests and then the integration tests once per device in the
# matrix, inside the per-run container image tagged with the device type.
common:
  name: Common test (${{ matrix.device.name }})
  # Both image jobs must finish before any matrix entry starts.
  needs:
    - tpu_image
    - gpu_image
  strategy:
    # Let the remaining matrix entry keep running when the other one fails.
    fail-fast: false
    matrix:
      device:
        - type: tpu
          name: v4-8
          pytest_marker: 'not gpu_only'  # exclude tests marked gpu_only
          container_env:
            XLA_PYTHON_CLIENT_MEM_FRACTION: 0.75
            TF_FORCE_GPU_ALLOW_GROWTH: false
          container_resource_option: "--privileged"
        - type: gpu
          name: a100-40gb-4
          # image_suffix is not referenced by any expression visible in this job.
          image_suffix: gpu_jax_pinned
          pytest_marker: 'not tpu_only'  # exclude tests marked tpu_only
          container_env:
            XLA_PYTHON_CLIENT_MEM_FRACTION: 0.65
            TF_FORCE_GPU_ALLOW_GROWTH: true
          container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
  # Runner labels: self-hosted plus the device type and device name of this entry.
  runs-on:
    - self-hosted
    - "${{ matrix.device.type }}"
    - "${{ matrix.device.name }}"
  container:
    # The image name embeds the run id; the tag is the device type.
    image: "gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ matrix.device.type }}"
    volumes:
      - /home/runner/actions-runner/_work/maxtext/maxtext:/deps
    env:
      XLA_PYTHON_CLIENT_MEM_FRACTION: "${{ matrix.device.container_env.XLA_PYTHON_CLIENT_MEM_FRACTION }}"
      TF_FORCE_GPU_ALLOW_GROWTH: "${{ matrix.device.container_env.TF_FORCE_GPU_ALLOW_GROWTH }}"
    options: "${{ matrix.device.container_resource_option }}"
  steps:
    - uses: actions/checkout@v4
    - name: Unit Tests
      # Everything matching the device marker that is not an integration test.
      run: |
        cd MaxText
        python3 -m pytest tests -m "${{ matrix.device.pytest_marker }} and not integration_test"
    - name: Integration Tests
      # Only the integration tests matching the device marker.
      run: |
        cd MaxText
        python3 -m pytest tests/integration_tests -m "${{ matrix.device.pytest_marker }} and integration_test"
# tpu_tests:
# needs: tpu_image
# uses: ./.github/workflows/run_tests_internal.yml
# with:
# device_type: tpu
# device_name: v4-8
# pytest_marker: 'not gpu_only' # exclude tests marked gpu_only
# xla_python_client_mem_fraction: 0.75
# tf_force_gpu_allow_growth: false
# container_resource_option: "--privileged"
# gpu_tests:
# needs: gpu_image
# uses: ./.github/workflows/run_tests_internal.yml
# with:
# device_type: gpu
# device_name: a100-40gb-4
# pytest_marker: 'not tpu_only' # exclude tests marked tpu_only
# xla_python_client_mem_fraction: 0.65
# tf_force_gpu_allow_growth: true
# container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
# Deletes the per-run GPU and TPU container images once the test job is done.
clean_up:
  name: "Clean up"
  # Always execute, regardless of the outcome of previous jobs.
  if: ${{ always() }}
  needs: common
  runs-on:
    - self-hosted
  steps:
    # Each delete step carries its own always() condition. Steps otherwise
    # default to success(), so a failed GPU delete (for example, the image was
    # never built) would skip the TPU delete and leak that image. The job still
    # reports failure when either delete fails.
    - name: Delete GPU image
      if: ${{ always() }}
      run: >-
        gcloud container images delete
        gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:gpu
        --force-delete-tags --quiet
    - name: Delete TPU image
      if: ${{ always() }}
      run: >-
        gcloud container images delete
        gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:tpu
        --force-delete-tags --quiet