-
Notifications
You must be signed in to change notification settings - Fork 321
126 lines (115 loc) · 4.49 KB
/
RunTests.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This workflow builds the TPU and GPU test images, runs the unit and integration tests on self-hosted runners, and then deletes the images.
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
name: Tests

# Events that start this workflow.
on:
  # Every pull request; no branch filter is applied.
  pull_request:
  # Pushes to the main branch only.
  push:
    branches: ["main"]
  # Allows the workflow to be started manually.
  workflow_dispatch:
  schedule:
    # Run the job every 6 hours
    - cron: '0 */6 * * *'
jobs:
  # Gate for everything else: fail early on the self-hosted runner when
  # gsutil is not on PATH, so no image build is attempted without it.
  check_dependencies:
    runs-on: ["self-hosted"]
    steps:
      - name: Test gsutil installation
        run: |
          if ! which gsutil >/dev/null 2>&1; then
            echo >&2 "gsutil is required but not installed. Aborting"
            exit 24
          fi

  # Calls the reusable build_upload_internal.yml workflow with the TPU
  # settings. Presumably it publishes the maxtext_<run_id>:tpu image that
  # the `common` job pulls -- confirm in that workflow.
  tpu_image:
    needs: check_dependencies
    strategy:
      fail-fast: false
    uses: ./.github/workflows/build_upload_internal.yml
    with:
      device_type: tpu
      device_name: v4-8
      build_mode: stable

  # Same reusable workflow with the GPU settings. Presumably it publishes
  # the maxtext_<run_id>:gpu image -- confirm in that workflow.
  gpu_image:
    needs: check_dependencies
    strategy:
      fail-fast: false
    uses: ./.github/workflows/build_upload_internal.yml
    with:
      device_type: gpu
      device_name: a100-40gb-4
      build_mode: pinned

  # Runs the unit and integration tests once per device type, inside the
  # image tagged with this run's id and the device type.
  common:
    needs: [tpu_image, gpu_image]
    strategy:
      # Keep the other device's tests running when one matrix entry fails.
      fail-fast: false
      matrix:
        device:
          - type: tpu
            name: v4-8
            pytest_marker: 'not gpu_only'  # exclude tests marked gpu_only
            # Quoted so the values reach the container env as exact strings.
            container_env:
              XLA_PYTHON_CLIENT_MEM_FRACTION: '0.75'
              TF_FORCE_GPU_ALLOW_GROWTH: 'false'
            container_resource_option: "--privileged"
          - type: gpu
            name: a100-40gb-4
            # NOTE(review): image_suffix is not referenced anywhere in this
            # job; kept in case something outside this file depends on it.
            image_suffix: gpu_jax_pinned
            pytest_marker: 'not tpu_only'  # exclude tests marked tpu_only
            # Quoted so the values reach the container env as exact strings.
            container_env:
              XLA_PYTHON_CLIENT_MEM_FRACTION: '0.65'
              TF_FORCE_GPU_ALLOW_GROWTH: 'true'
            container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
    name: Common test (${{ matrix.device.name }})
    # Select a self-hosted runner labelled with the device type and name.
    runs-on: ["self-hosted", "${{ matrix.device.type }}", "${{ matrix.device.name }}"]
    container:
      image: gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ matrix.device.type }}
      volumes:
        - /home/runner/actions-runner/_work/maxtext/maxtext:/deps
      env:
        XLA_PYTHON_CLIENT_MEM_FRACTION: ${{ matrix.device.container_env.XLA_PYTHON_CLIENT_MEM_FRACTION }}
        TF_FORCE_GPU_ALLOW_GROWTH: ${{ matrix.device.container_env.TF_FORCE_GPU_ALLOW_GROWTH }}
      options: ${{ matrix.device.container_resource_option }}
    steps:
      - uses: actions/checkout@v4
      # Everything matching the device marker except integration tests.
      - name: Unit Tests
        working-directory: MaxText
        run: python3 -m pytest tests -m "${{ matrix.device.pytest_marker }} and not integration_test"
      # Only tests under tests/integration_tests marked integration_test.
      - name: Integration Tests
        working-directory: MaxText
        run: python3 -m pytest tests/integration_tests -m "${{ matrix.device.pytest_marker }} and integration_test"

  # Disabled alternative: delegate the tests to run_tests_internal.yml
  # instead of the `common` matrix job above.
  # tpu_tests:
  #   needs: tpu_image
  #   uses: ./.github/workflows/run_tests_internal.yml
  #   with:
  #     device_type: tpu
  #     device_name: v4-8
  #     pytest_marker: 'not gpu_only' # exclude tests marked gpu_only
  #     xla_python_client_mem_fraction: 0.75
  #     tf_force_gpu_allow_growth: false
  #     container_resource_option: "--privileged"
  # gpu_tests:
  #   needs: gpu_image
  #   uses: ./.github/workflows/run_tests_internal.yml
  #   with:
  #     device_type: gpu
  #     device_name: a100-40gb-4
  #     pytest_marker: 'not tpu_only' # exclude tests marked tpu_only
  #     xla_python_client_mem_fraction: 0.65
  #     tf_force_gpu_allow_growth: true
  #     container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"

  # Removes the per-run images from the registry once the tests are done.
  clean_up:
    if: ${{ always() }}  # always execute, regardless of previous jobs or steps.
    needs: common
    name: "Clean up"
    runs-on: ["self-hosted"]
    steps:
      - name: Delete GPU image
        run: >-
          gcloud container images delete
          gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:gpu
          --force-delete-tags --quiet
      - name: Delete TPU image
        # Run even if the GPU delete failed (for example, that image was
        # never built); otherwise the TPU image would be left behind.
        if: ${{ always() }}
        run: >-
          gcloud container images delete
          gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:tpu
          --force-delete-tags --quiet