forked from NVIDIA/Megatron-LM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
.gitlab-ci.yml
138 lines (133 loc) · 4.04 KB
/
.gitlab-ci.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# Pipeline-level gate: decides whether a pipeline is created at all and which
# variable overrides apply to it. Rules are evaluated top to bottom and the
# first matching rule wins, so the order below is significant.
workflow:
  rules:
    # Never create pipelines outside the "ADLR" namespace.
    - if: $CI_PROJECT_NAMESPACE != "ADLR"
      when: never

    # Branches whose name matches /ci-/ only get pipelines from schedules.
    - if: $CI_COMMIT_BRANCH =~ /ci-/ && $CI_PIPELINE_SOURCE != "schedule"
      when: never

    # Scheduled pipelines keep running when new commits arrive.
    - if: $CI_PIPELINE_SOURCE == "schedule"
      auto_cancel:
        on_new_commit: none

    # Pipelines started with source "web": no overrides, global variable
    # values apply.
    - if: $CI_PIPELINE_SOURCE == "web"

    # Protected refs: functional tests are switched off.
    - if: $CI_COMMIT_REF_PROTECTED == "true"
      variables:
        FUNCTIONAL_TEST: 'no'

    # Merge requests labelled "Run tests": functional tests with scope "mr".
    # NOTE(review): the TARGET_BRANCH_SHA check presumably limits this to
    # merged-results pipelines - confirm.
    # Variable values are quoted so they stay strings, matching the
    # declarations in the global `variables:` section.
    - if: $CI_MERGE_REQUEST_LABELS =~ /Run tests/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
      variables:
        UNIT_TEST_REPEAT: '1'
        UNIT_TEST_TIMEOUT: '10'
        FUNCTIONAL_TEST: 'yes'
        FUNCTIONAL_TEST_SCOPE: mr
        FUNCTIONAL_TEST_REPEAT: '5'
        FUNCTIONAL_TEST_TIME_LIMIT: '2700'
        # Cluster selections are overridden with empty strings here.
        FUNCTIONAL_TEST_CLUSTER_A100: ''
        FUNCTIONAL_TEST_CLUSTER_H100: ''
        PUBLISH: 'no'

    # Merge requests labelled "Run nightly": functional tests with scope
    # "nightly". Only reached when the "Run tests" rule above did not match.
    - if: $CI_MERGE_REQUEST_LABELS =~ /Run nightly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
      variables:
        UNIT_TEST_REPEAT: '1'
        UNIT_TEST_TIMEOUT: '10'
        FUNCTIONAL_TEST: 'yes'
        FUNCTIONAL_TEST_SCOPE: nightly
        FUNCTIONAL_TEST_REPEAT: '5'
        FUNCTIONAL_TEST_TIME_LIMIT: '2700'
        FUNCTIONAL_TEST_CLUSTER_A100: ''
        FUNCTIONAL_TEST_CLUSTER_H100: ''
        PUBLISH: 'no'

    # Merge requests labelled "Run weekly": functional tests with scope
    # "weekly", a single repetition and a larger time limit.
    - if: $CI_MERGE_REQUEST_LABELS =~ /Run weekly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
      variables:
        UNIT_TEST_REPEAT: '1'
        UNIT_TEST_TIMEOUT: '10'
        FUNCTIONAL_TEST: 'yes'
        FUNCTIONAL_TEST_SCOPE: weekly
        FUNCTIONAL_TEST_REPEAT: '1'
        FUNCTIONAL_TEST_TIME_LIMIT: '9000'
        FUNCTIONAL_TEST_CLUSTER_A100: ''
        FUNCTIONAL_TEST_CLUSTER_H100: ''
        PUBLISH: 'no'

    # Any other merge request pipeline: no functional tests, no publishing.
    - if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
      variables:
        FUNCTIONAL_TEST: 'no'
        PUBLISH: 'no'

    # Everything else creates no pipeline.
    - when: never

  # Default cancellation policy (the schedule rule above overrides it): a new
  # commit cancels the jobs of the running pipeline that are interruptible.
  auto_cancel:
    on_new_commit: interruptible
    # on_job_failure: all
# Pipeline stages, listed in execution order.
stages:
  - test
  - functional_tests
  - publish
# Job defaults: every job is interruptible unless it overrides this itself.
default:
  interruptible: true
# Global pipeline variables. Entries with `value` / `options` / `description`
# are user-selectable parameters; `workflow:rules` may override them per
# pipeline type. The plain key/value entries at the bottom are CI-wide
# constants.
variables:
  # --- Unit tests ---
  UNIT_TEST:
    value: 'yes'
    options:
      - 'yes'
      - 'no'
    description: 'To run the unit test suite'
  UNIT_TEST_REPEAT:
    value: '1'
    description: 'Number of repetitions'
  UNIT_TEST_TIMEOUT:
    value: '10'
    description: 'Timeout (minutes) for Unit tests (all repeats)'

  # --- Functional tests ---
  FUNCTIONAL_TEST:
    value: 'yes'
    options:
      - 'yes'
      - 'no'
    description: 'To run the functional test suite'
  FUNCTIONAL_TEST_SCOPE:
    value: 'mr'
    options:
      - 'mr'
      - 'nightly'
      - 'weekly'
      - 'pre-release'
      - 'release'
    description: 'Testsuite to run (only for FUNCTIONAL_TEST=yes)'
  FUNCTIONAL_TEST_REPEAT:
    value: '5'
    description: 'Number of repetitions per test'
  FUNCTIONAL_TEST_TIME_LIMIT:
    value: '2700'
    description: 'Timeout in seconds per test'
  FUNCTIONAL_TEST_CASES:
    value: 'all'
    description: "Comma-separated list of test_cases to run. Use 'all' to run the full suite."
  FUNCTIONAL_TEST_CLUSTER_A100:
    value: 'dgxa100_dracooci'
    options:
      - 'dgxa100_dracooci'
      - 'dgxa100_dracooci-ord'
    description: 'Cluster for A100 workloads'
  FUNCTIONAL_TEST_CLUSTER_H100:
    value: 'dgxh100_eos'
    options:
      - 'dgxh100_coreweave'
      - 'dgxh100_eos'
    description: 'Cluster for H100 workloads'
  # Declared without a default value; only a description is provided.
  FUNCTIONAL_TEST_NAME:
    description: 'Name of functional test run (only for pre-release and release)'

  # --- Publishing ---
  PUBLISH:
    value: 'no'
    options:
      - 'yes'
      - 'no'
    description: 'Build and publish a wheel to PyPi'
  PUBLISH_SCOPE:
    value: 'code-freeze'
    options:
      - 'code-freeze'
      - 'release'
    description: 'Type of publish (freeze or final release)'

  # CI wide variables
  # Image references built from ${GITLAB_ENDPOINT} on port 5005.
  CI_MCORE_LTS_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_lts
  CI_MCORE_DEV_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_dev
  CI_NEMO_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/nemo_ci
  UTILITY_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_utility
# Per-stage configuration files pulled into this pipeline definition.
include:
  - .gitlab/stages/00.pre.yml
  - .gitlab/stages/01.test.yml
  - .gitlab/stages/02.functional-tests.yml
  - .gitlab/stages/03.publish.yml