From 244a382e7211d3da258a4bcac3f8b0f13120d566 Mon Sep 17 00:00:00 2001
From: Alex Shraer <shralex@gmail.com>
Date: Fri, 20 Dec 2024 07:40:04 +0000
Subject: [PATCH] Move tests from UnitTests.yml to tests files.

This change moves out train and decode tests into separate files,
so they can be tested using pytest rather than explicitly being
invoked from UnitTests.yml. This will allow us, in a subsequent
PR, to parallelize the execution of these tests by utilizing
pytest parallelization support.

Several end-to-end tests are still being invoked from UnitTests.yml.
This does not seem like the right place, and will hopefully be
addressed in the future.
---
 .github/workflows/RunTests.yml                | 113 +++++++++++
 .github/workflows/UnitTests.yml               | 177 -----------------
 .github/workflows/build_upload_internal.yml   |  47 +++++
 .github/workflows/run_tests_internal.yml      |  60 ++++++
 MaxText/decode.py                             |  22 ++-
 MaxText/pytest.ini                            |  12 +-
 MaxText/tests/attention_test.py               |  16 +-
 MaxText/tests/decode_tests.py                 |  98 ++++++++++
 MaxText/tests/gpt3_test.py                    |   9 +-
 MaxText/tests/gradient_accumulation_test.py   |   2 +-
 .../inference_microbenchmark_smoke_test.py    |   2 +-
 .../checkpoint_compatibility_test.py          |  48 +++++
 .../integration_tests/checkpointing_test.py   |  51 +++++
 .../generate_param_only_checkpoint_test.py    |  53 +++++
 .../shmap_collective_matmul_test.py           |  32 +++
 MaxText/tests/kernels_test.py                 |   6 +-
 MaxText/tests/model_test.py                   |   2 +-
 MaxText/tests/multihost_dataloading_test.py   |   2 +-
 MaxText/tests/pipeline_parallelism_test.py    |  12 +-
 MaxText/tests/simple_decoder_layer_test.py    |   4 +-
 MaxText/tests/standalone_dl_ckpt_test.py      |   4 +-
 MaxText/tests/tokenizer_test.py               |   8 +-
 MaxText/tests/train_compile_test.py           |  24 +--
 MaxText/tests/train_tests.py                  | 184 ++++++++++++++++++
 24 files changed, 755 insertions(+), 233 deletions(-)
 create mode 100644 .github/workflows/RunTests.yml
 delete mode 100644 .github/workflows/UnitTests.yml
 create mode 100644 .github/workflows/build_upload_internal.yml
 create mode 100644 .github/workflows/run_tests_internal.yml
 create mode 100644 MaxText/tests/decode_tests.py
 create mode 100644 MaxText/tests/integration_tests/checkpoint_compatibility_test.py
 create mode 100644 MaxText/tests/integration_tests/checkpointing_test.py
 create mode 100644 MaxText/tests/integration_tests/generate_param_only_checkpoint_test.py
 create mode 100644 MaxText/tests/integration_tests/shmap_collective_matmul_test.py
 create mode 100644 MaxText/tests/train_tests.py

diff --git a/.github/workflows/RunTests.yml b/.github/workflows/RunTests.yml
new file mode 100644
index 000000000..1c8b53af7
--- /dev/null
+++ b/.github/workflows/RunTests.yml
@@ -0,0 +1,113 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+name: Tests
+
+on:
+  pull_request:
+  push:
+    branches: [ "main" ]
+  workflow_dispatch:
+  schedule:
+    # Run the job every 4 hours
+    - cron:  '0 */4 * * *'
+
+jobs:
+  prelim:
+    runs-on: ["self-hosted"]
+    steps:
+    - name: Test gsutil installation
+      run: which gsutil >/dev/null 2>&1 || { echo >&2 "gsutil is required but not installed. Aborting"; exit 24;}
+    - name: Cleanup old docker images
+      run: docker system prune --all --force
+
+  tpu_image:
+    needs: prelim
+    uses: ./.github/workflows/build_upload_internal.yml
+    with:
+      device_type: tpu
+      device_name: v4-8
+      build_mode: stable
+
+  gpu_image:
+    needs: prelim
+    uses: ./.github/workflows/build_upload_internal.yml
+    with:
+      device_type: gpu
+      device_name: a100-40gb-4
+      build_mode: pinned
+
+  tpu_unit_tests:
+    needs: tpu_image
+    uses: ./.github/workflows/run_tests_internal.yml
+    with:
+      device_type: tpu
+      device_name: v4-8
+      pytest_marker: 'not gpu_only and not integration_test'
+      test_directory: 'tests'
+      xla_python_client_mem_fraction: 0.75
+      tf_force_gpu_allow_growth: false
+      container_resource_option: "--privileged"
+
+  tpu_integration_tests:
+    needs: tpu_image
+    uses: ./.github/workflows/run_tests_internal.yml
+    with:
+      device_type: tpu
+      device_name: v4-8
+      pytest_marker: 'not gpu_only and integration_test'
+      test_directory: 'tests/integration_tests'
+      xla_python_client_mem_fraction: 0.75
+      tf_force_gpu_allow_growth: false
+      container_resource_option: "--privileged"
+
+  gpu_unit_tests:
+    needs: gpu_image
+    uses: ./.github/workflows/run_tests_internal.yml
+    with:
+      device_type: gpu
+      device_name: a100-40gb-4
+      pytest_marker: 'not tpu_only and not integration_test'
+      test_directory: 'tests'
+      xla_python_client_mem_fraction: 0.65
+      tf_force_gpu_allow_growth: true
+      container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
+
+  gpu_integration_tests:
+    needs: gpu_image
+    uses: ./.github/workflows/run_tests_internal.yml
+    with:
+      device_type: gpu
+      device_name: a100-40gb-4
+      pytest_marker: 'not tpu_only and integration_test'
+      test_directory: 'tests/integration_tests'
+      xla_python_client_mem_fraction: 0.65
+      tf_force_gpu_allow_growth: true
+      container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
+
+
+  clean_up:
+    if: ${{ always() }}  # always execute, regardless of previous jobs or steps.
+    needs: [gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests]
+    name: "Clean up"
+    runs-on: ["self-hosted"]
+    steps:
+    - name: Delete GPU image
+      run: gcloud container images delete gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:gpu --force-delete-tags --quiet
+    - name: Delete TPU image
+      run: gcloud container images delete gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:tpu --force-delete-tags --quiet
+
diff --git a/.github/workflows/UnitTests.yml b/.github/workflows/UnitTests.yml
deleted file mode 100644
index 91a62efc2..000000000
--- a/.github/workflows/UnitTests.yml
+++ /dev/null
@@ -1,177 +0,0 @@
-# Copyright 2023 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
-# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
-
-name: Unit Test
-
-on:
-  pull_request:
-  push:
-    branches: [ "main" ]
-  workflow_dispatch:
-  schedule:
-    # Run the job every 2 hours
-    - cron:  '0 */2 * * *'
-
-jobs:
-  build_and_upload_image:
-    strategy:
-        fail-fast: false
-        matrix:
-          device:
-          - type: tpu
-            name: v4-8
-            mode: stable
-          - type: gpu
-            name: a100-40gb-4
-            mode: pinned
-    name: Build and upload image (${{ matrix.device.name }})
-    runs-on: ["self-hosted", "${{ matrix.device.type }}", "${{ matrix.device.name }}"]
-    steps:
-    - uses: actions/checkout@v4
-    - name: Cleanup old docker images
-      run: docker system prune --all --force
-    - name: Build an image
-      run: |
-        bash docker_build_dependency_image.sh MODE=${{ matrix.device.mode }} DEVICE=${{ matrix.device.type }} 
-    - name: Tag the image
-      run: |
-        docker tag maxtext_base_image gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ matrix.device.type }}
-    - name: Upload the image
-      run: |
-        docker push gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ matrix.device.type }}
-
-  common:
-    needs: build_and_upload_image
-    strategy:
-      fail-fast: False
-      matrix:
-        device:
-        - type: tpu
-          name: v4-8
-          attention: autoselected
-          pytest_marker: ''
-          container_env:
-            XLA_PYTHON_CLIENT_MEM_FRACTION: 0.75
-            TF_FORCE_GPU_ALLOW_GROWTH: false
-          container_resource_option: "--privileged"
-        - type: gpu
-          name: a100-40gb-4
-          image_suffix: gpu_jax_pinned
-          attention: dot_product
-          pytest_marker: -m 'not tpu'
-          container_env:
-            XLA_PYTHON_CLIENT_MEM_FRACTION: 0.65
-            TF_FORCE_GPU_ALLOW_GROWTH: true
-          container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
-    name: Common test (${{ matrix.device.name }})
-    runs-on: ["self-hosted", "${{ matrix.device.type }}", "${{ matrix.device.name }}"]
-    container:
-      image: gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ matrix.device.type }}
-      volumes: 
-        - /home/runner/actions-runner/_work/maxtext/maxtext:/deps
-      env:
-        XLA_PYTHON_CLIENT_MEM_FRACTION: ${{ matrix.device.container_env.XLA_PYTHON_CLIENT_MEM_FRACTION }}
-        TF_FORCE_GPU_ALLOW_GROWTH: ${{ matrix.device.container_env.TF_FORCE_GPU_ALLOW_GROWTH }}
-      options: ${{ matrix.device.container_resource_option }}
-    steps:
-    - uses: actions/checkout@v4
-    - name: Test gsutil installation
-      run: which gsutil >/dev/null 2>&1 || { echo >&2 "gsutil is required but not installed. Aborting"; exit 24;}
-    - name: Test with pytest
-      run: cd MaxText;python3 -m pytest ${{ matrix.device.pytest_marker }}
-    - name: Test train.py with TFDS c4
-      run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 enable_checkpointing=false attention=${{ matrix.device.attention }}
-    - name: Test train.py with HF c4
-      run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs hf_train_files=gs://maxtext-dataset/hf/c4/c4-train-00000-of-01637.parquet hf_path=parquet dataset_type=hf steps=2 tokenizer_path=google-t5/t5-large attention=${{ matrix.device.attention }} enable_checkpointing=false
-    - name: Test train.py with synthetic data
-      run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 enable_checkpointing=false attention=${{ matrix.device.attention }} dataset_type=synthetic
-    - name: Test train.py with per_device_batch_size < 1
-      run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 per_device_batch_size=0.25 ici_tensor_parallelism=4 enable_checkpointing=false attention=${{ matrix.device.attention }}
-    - name: Test decode.py
-      run: python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4 attention=${{ matrix.device.attention }} enable_checkpointing=false max_target_length=128 per_device_batch_size=1
-    - name: Test int8_decode
-      run: python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4 attention=${{ matrix.device.attention }} enable_checkpointing=false max_target_length=128 per_device_batch_size=1 quantization=int8 quantize_kvcache=True
-    - name: Test decode.py with per_device_batch_size < 1
-      run: python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4 attention=${{ matrix.device.attention }} enable_checkpointing=false max_target_length=128 per_device_batch_size=.25
-    - name: Test int8_training
-      run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset quantization=int8 steps=2 enable_checkpointing=false attention=${{ matrix.device.attention }}
-    - name: Test fp8_training
-      run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset quantization=fp8 steps=2 enable_checkpointing=false attention=${{ matrix.device.attention }}
-    - name: Test train.py with dropout
-      run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 enable_checkpointing=false attention=${{ matrix.device.attention }} max_target_length=128 per_device_batch_size=1 dropout_rate=0.02
-    - name: Test generate_param_only_checkpoint
-      run: bash end_to_end/test_generate_param_only_checkpoint.sh -r runner_$(date +%Y-%m-%d-%H-%M-%S) -o gs://runner-maxtext-logs -d gs://maxtext-dataset -i 4 -a ${{ matrix.device.attention }}
-    - name: Test generate_param_only_checkpoint with int8 quantization
-      run: bash end_to_end/test_generate_param_only_checkpoint.sh -r runner_$(date +%Y-%m-%d-%H-%M-%S) -o gs://runner-maxtext-logs -d gs://maxtext-dataset -i 4 -q int8 -a ${{ matrix.device.attention }}
-    - name: Test grain checkpoint determinism
-      run: bash end_to_end/test_checkpointing.sh runner_$(date +%Y-%m-%d-%H-%M-%S) gs://runner-maxtext-logs gs://maxtext-dataset False grain ${{ matrix.device.attention }}
-    - name: Test checkpoint compatibility
-      run: bash end_to_end/test_checkpoint_compatibility.sh runner_$(date +%Y-%m-%d-%H-%M-%S) gs://runner-maxtext-logs gs://maxtext-dataset ${{ matrix.device.attention }}
-
-  tpu:
-    needs: build_and_upload_image
-    strategy:
-      fail-fast: false
-      matrix:
-        device-type: ["v4-8"]
-    name: "TPU test (${{ matrix.device-type }})"
-    runs-on: ["self-hosted", "tpu", "${{ matrix.device-type }}"]
-    container:
-      image: gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:tpu
-      volumes: 
-      - /home/runner/actions-runner/_work/maxtext/maxtext:/deps
-      options: "--privileged"
-    steps:
-    - uses: actions/checkout@v4
-    - name: Validate Pedagogical Example, Shmap_collective_matmul
-      run: python3 pedagogical_examples/shmap_collective_matmul.py
-
-  gpu:
-    needs: build_and_upload_image
-    strategy:
-      fail-fast: false
-      matrix:
-        device-type: ["a100-40gb-4"]
-        build-mode: ["pinned"]
-    name: "GPU test (${{ matrix.device-type }}, ${{ matrix.build-mode }})"
-    runs-on: ["self-hosted", "gpu", "${{ matrix.device-type }}"]
-    container:
-      image: gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:gpu
-      volumes: 
-      - /home/runner/actions-runner/_work/maxtext/maxtext:/deps
-      env:
-        XLA_PYTHON_CLIENT_MEM_FRACTION: 0.65
-        TF_FORCE_GPU_ALLOW_GROWTH: true
-      options: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
-    steps:
-    - uses: actions/checkout@v4
-    - name: Set up Docker Buildx
-      uses: docker/setup-buildx-action@v3
-    - name: Test train.py with flash attention
-      run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 enable_checkpointing=false attention=cudnn_flash_te
-
-  clean_up:
-    if: ${{ always() }}
-    needs: [common, gpu, tpu]
-    name: "Clean up"
-    runs-on: ["self-hosted"]
-    steps:
-    - name: Delete GPU image
-      run: gcloud container images delete gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:gpu --force-delete-tags --quiet
-    - name: Delete TPU image
-      run: gcloud container images delete gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:tpu --force-delete-tags --quiet
- 
diff --git a/.github/workflows/build_upload_internal.yml b/.github/workflows/build_upload_internal.yml
new file mode 100644
index 000000000..3df658a2e
--- /dev/null
+++ b/.github/workflows/build_upload_internal.yml
@@ -0,0 +1,47 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file defines a module for building and uploading an image used in UnitTests.yml
+
+name: Build and Upload Image
+
+on:
+  workflow_call:
+    inputs:
+      device_type:
+        required: true
+        type: string
+      device_name:
+        required: true
+        type: string
+      build_mode:
+        required: true
+        type: string
+
+jobs:
+  build_and_upload:
+    name: Build and upload image (${{ inputs.device_name }})
+    runs-on: ["self-hosted", "${{ inputs.device_type }}", "${{ inputs.device_name }}"]
+    steps:
+      - uses: actions/checkout@v4
+      - name: Build an image
+        run: |
+          bash docker_build_dependency_image.sh MODE=${{ inputs.build_mode }} DEVICE=${{ inputs.device_type }}
+      - name: Tag the image
+        run: |
+          docker tag maxtext_base_image gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ inputs.device_type }}
+      - name: Upload the image
+        run: |
+          docker push gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ inputs.device_type }}
+
diff --git a/.github/workflows/run_tests_internal.yml b/.github/workflows/run_tests_internal.yml
new file mode 100644
index 000000000..03cb84c56
--- /dev/null
+++ b/.github/workflows/run_tests_internal.yml
@@ -0,0 +1,60 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file defines a module for running tests used in UnitTests.yml
+
+name: Run Tests
+
+on:
+  workflow_call:
+    inputs:
+      device_type:
+        required: true
+        type: string
+      device_name:
+        required: true
+        type: string
+      pytest_marker:
+        required: true
+        type: string
+      test_directory:
+        required: true
+        type: string
+      xla_python_client_mem_fraction:
+        required: true
+        type: string
+      tf_force_gpu_allow_growth:
+        required: true
+        type: string
+      container_resource_option:
+        required: true
+        type: string
+
+jobs:
+  run:
+    runs-on: ["self-hosted", "${{ inputs.device_type }}", "${{ inputs.device_name }}"]
+    container:
+      image: gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ inputs.device_type }}
+      volumes:
+        - /home/runner/actions-runner/_work/maxtext/maxtext:/deps
+      env:
+        XLA_PYTHON_CLIENT_MEM_FRACTION: ${{ inputs.xla_python_client_mem_fraction }}
+        TF_FORCE_GPU_ALLOW_GROWTH: ${{ inputs.tf_force_gpu_allow_growth }}
+      options: ${{ inputs.container_resource_option }}
+    steps:
+      - uses: actions/checkout@v4
+      - name: Run Tests
+        run: |
+          cd MaxText
+          python3 -m pytest ${{ inputs.test_directory }} -m "${{ inputs.pytest_marker }}"
diff --git a/MaxText/decode.py b/MaxText/decode.py
index b74385207..ef2f2fc79 100644
--- a/MaxText/decode.py
+++ b/MaxText/decode.py
@@ -21,10 +21,20 @@
 
 import os
 import pyconfig
-import sys
 
+from typing import Sequence
+from absl import app
+
+
+def main(argv: Sequence[str]) -> None:
+  jax.config.update("jax_default_prng_impl", "unsafe_rbg")
+  os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0"
+
+  pyconfig.initialize(argv)
+  config = pyconfig.config
+  validate_config(config)
+  max_utils.print_system_information()
 
-def main(config):
   engine = maxengine.MaxEngine(config)
   rng = jax.random.PRNGKey(1234)
   rng, rng_load_params = jax.random.split(rng)
@@ -71,10 +81,4 @@ def validate_config(config):
 
 
 if __name__ == "__main__":
-  jax.config.update("jax_default_prng_impl", "unsafe_rbg")
-  os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0"
-  pyconfig.initialize(sys.argv)
-  cfg = pyconfig.config
-  validate_config(cfg)
-  max_utils.print_system_information()
-  main(cfg)
+  app.run(main)
diff --git a/MaxText/pytest.ini b/MaxText/pytest.ini
index fa8e8142b..fc6c896a9 100644
--- a/MaxText/pytest.ini
+++ b/MaxText/pytest.ini
@@ -3,11 +3,15 @@
 testpaths =
     tests
 python_files = *_test.py
-addopts = 
-    -rf --import-mode=importlib 
+addopts =
+    -rf --import-mode=importlib --strict-markers
     --ignore=tests/profiler_test.py
     --ignore=tests/train_smoke_test.py
     --ignore=tests/train_int8_smoke_test.py
     --ignore=tests/train_gpu_smoke_test.py
-markers = 
-    tpu: marks tests to be run on TPU
\ No newline at end of file
+markers =
+    tpu_only: marks tests to be run on TPUs only
+    gpu_only: marks tests to be run on GPUs only
+    integration_test: tests exercising larger portions of the system,
+                      including interactions with other systems like GCS,
+                      e.g., end_to_end tests
diff --git a/MaxText/tests/attention_test.py b/MaxText/tests/attention_test.py
index f04ebe190..2cf47169e 100644
--- a/MaxText/tests/attention_test.py
+++ b/MaxText/tests/attention_test.py
@@ -118,7 +118,7 @@ def get_structured_data(self, dtype):
 
     return lnx, decoder_segment_ids, decoder_positions
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_autoregression(self):
     prefill_length = self.cfg.max_prefill_predict_length
     decode_total_length = self.cfg.max_target_length
@@ -174,11 +174,11 @@ def test_autoregression(self):
       self.assertTrue(mha_full_this_idx.shape == mha_idx.shape)
       self.assertTrue(jax.numpy.allclose(mha_full_this_idx, mha_idx, rtol=1e-02, atol=1e-02, equal_nan=False))
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_model_mode_prefill_dtype_float32(self):
     self._test_model_mode_prefill_dtype(jnp.float32)
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_model_mode_prefill_dtype_bfloat16(self):
     self._test_model_mode_prefill_dtype(jnp.bfloat16)
 
@@ -224,15 +224,15 @@ def _test_model_mode_prefill_dtype(self, dtype):
 
     self.assertEqual(dtype, mha_prefill.dtype)
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_tpu_kernel_attention_mha(self):
     self.tpu_kernel_attention_helper(self.num_kv_heads)
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_tpu_kernel_attention_gqa(self):
     self.tpu_kernel_attention_helper(self.num_kv_heads // 2)
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_tpu_kernel_attention_mqa(self):
     self.tpu_kernel_attention_helper(1)
 
@@ -309,7 +309,7 @@ def tpu_kernel_attention_helper(self, num_kv_heads):
         jax.numpy.allclose(mha_generic_output, mha_generic_flash_output, rtol=1e-01, atol=1e-01, equal_nan=False)
     )
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_dot_product_cache_axis_order(self):
     all_axis_orders = [axis_order for axis_order in itertools.permutations(range(4))]
     for axis_order in random.choices(all_axis_orders, k=4):
@@ -423,7 +423,7 @@ def _dot_product_attention(
           jax.numpy.allclose(attention_w_layout_full_this_idx, attention_w_layout_idx, rtol=rtol, atol=atol, equal_nan=False)
       )
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_dot_product_reshape_q(self):
     for compute_axis_order in [(0, 1, 2, 3), (0, 2, 1, 3)]:
       self._dot_product_attention_reshape_q(
diff --git a/MaxText/tests/decode_tests.py b/MaxText/tests/decode_tests.py
new file mode 100644
index 000000000..c86f47e6a
--- /dev/null
+++ b/MaxText/tests/decode_tests.py
@@ -0,0 +1,98 @@
+"""
+Copyright 2023 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""Tests for decode with various configs"""
+import os
+import unittest
+import pytest
+from decode import main as decode_main
+from absl.testing import absltest
+
+
+class DecodeTests(unittest.TestCase):
+  """Tests decode with various configs"""
+
+  CONFIGS = {
+      "base": [  # tests decode
+          None,
+          "configs/base.yml",
+          r"base_output_directory=gs://runner-maxtext-logs",
+          "run_name=runner_test",
+          r"dataset_path=gs://maxtext-dataset",
+          "steps=2",
+          "enable_checkpointing=False",
+          "ici_tensor_parallelism=4",
+          "max_target_length=128",
+          "per_device_batch_size=1",
+          r"tokenizer_path=../assets/tokenizer.llama2",
+      ],
+      "int8": [  # tests decode with int8 quantization
+          None,
+          "configs/base.yml",
+          r"base_output_directory=gs://runner-maxtext-logs",
+          "run_name=runner_test",
+          r"dataset_path=gs://maxtext-dataset",
+          "steps=2",
+          "enable_checkpointing=False",
+          "ici_tensor_parallelism=4",
+          "max_target_length=128",
+          "per_device_batch_size=1",
+          "quantization=int8",
+          "quantize_kvcache=True",
+          r"tokenizer_path=../assets/tokenizer.llama2",
+      ],
+      "pdb_lt_1": [  # tests decode with per_device_batch_size < 1
+          None,
+          "configs/base.yml",
+          r"base_output_directory=gs://runner-maxtext-logs",
+          "run_name=runner_test",
+          r"dataset_path=gs://maxtext-dataset",
+          "steps=2",
+          "enable_checkpointing=False",
+          "ici_tensor_parallelism=4",
+          "max_target_length=128",
+          "per_device_batch_size=.25",
+          r"tokenizer_path=../assets/tokenizer.llama2",
+      ],
+  }
+
+  @pytest.mark.tpu_only
+  def test_tpu_base(self):
+    decode_main(DecodeTests.CONFIGS["base"])
+
+  @pytest.mark.gpu_only
+  def test_gpu_base(self):
+    decode_main(DecodeTests.CONFIGS["base"] + ["attention=dot_product"])
+
+  @pytest.mark.tpu_only
+  def test_tpu_int8(self):
+    decode_main(DecodeTests.CONFIGS["int8"])
+
+  @pytest.mark.gpu_only
+  def test_gpu_int8(self):
+    decode_main(DecodeTests.CONFIGS["int8"] + ["attention=dot_product"])
+
+  @pytest.mark.tpu_only
+  def test_tpu_pdb_lt_1(self):
+    decode_main(DecodeTests.CONFIGS["pdb_lt_1"])
+
+  @pytest.mark.gpu_only
+  def test_gpu_pdb_lt_1(self):
+    decode_main(DecodeTests.CONFIGS["pdb_lt_1"] + ["attention=dot_product"])
+
+
+if __name__ == "__main__":
+  absltest.main()
diff --git a/MaxText/tests/gpt3_test.py b/MaxText/tests/gpt3_test.py
index b1f0bed52..4428d7b91 100644
--- a/MaxText/tests/gpt3_test.py
+++ b/MaxText/tests/gpt3_test.py
@@ -55,6 +55,8 @@ def _replace_initialization(key, value):
   return model_vars
 
 
+# TODO(b/386317358)
+@pytest.mark.skip(reason="Test started failing with pull/1113, skipping for now.")
 class GPT3(unittest.TestCase):
   """numerical tests for GPT3."""
 
@@ -85,7 +87,7 @@ def setUp(self):
     }
     self.model_vars = init_random_model_vars(self.model, self.rng, self.example_batch)
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_logits_numerically(self):
     # ground truth values are calculated from paxml after loading above model_vars
     # note we expect all xents are the same except the padding one since:
@@ -108,4 +110,7 @@ def test_logits_numerically(self):
     # Mask out paddings at the end of each example.
     per_example_xent = per_example_xent * (self.example_batch["targets_segmentation"] != 0)
 
-    self.assertTrue(jax.numpy.allclose(per_example_xent, per_example_xent_truth, rtol=1e-03, atol=1e-03))
+    self.assertTrue(
+        jax.numpy.allclose(per_example_xent, per_example_xent_truth, rtol=1e-03, atol=1e-03),
+        msg=f"per_example_xent:\n{per_example_xent}\n\nper_example_xent_truth:\n{per_example_xent_truth}",
+    )
diff --git a/MaxText/tests/gradient_accumulation_test.py b/MaxText/tests/gradient_accumulation_test.py
index e2730fe97..29c0ab087 100644
--- a/MaxText/tests/gradient_accumulation_test.py
+++ b/MaxText/tests/gradient_accumulation_test.py
@@ -28,7 +28,7 @@ def generate_random_string(length=10):
 
 class GradientAccumulationTest(unittest.TestCase):
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_grad_accumulate_same_loss(self):
     random_suffix = generate_random_string()
     run_accumulate_metrics_file = f"/tmp/runner_grad_accumulate_{random_suffix}.txt"
diff --git a/MaxText/tests/inference_microbenchmark_smoke_test.py b/MaxText/tests/inference_microbenchmark_smoke_test.py
index c28de3dcc..43ceb82a1 100644
--- a/MaxText/tests/inference_microbenchmark_smoke_test.py
+++ b/MaxText/tests/inference_microbenchmark_smoke_test.py
@@ -23,7 +23,7 @@
 
 class Inference_Microbenchmark(unittest.TestCase):
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test(self):
     pyconfig.initialize(
         [
diff --git a/MaxText/tests/integration_tests/checkpoint_compatibility_test.py b/MaxText/tests/integration_tests/checkpoint_compatibility_test.py
new file mode 100644
index 000000000..0470d7093
--- /dev/null
+++ b/MaxText/tests/integration_tests/checkpoint_compatibility_test.py
@@ -0,0 +1,48 @@
+"""
+Copyright 2024 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""Integraion tests for test_checkpointing.sh"""
+from datetime import datetime
+import subprocess
+import pytest
+
+
+def run_checkpoint_compatibility(attention_type):
+  """Tests checkpoint compatibility."""
+
+  run_date = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+  command = [
+      "bash",
+      "end_to_end/test_checkpoint_compatibility.sh",
+      f"runner_{run_date}",  # run_name
+      r"gs://runner-maxtext-logs",  # output_path
+      r"gs://maxtext-dataset",  # dataset_path
+      attention_type,
+  ]
+
+  subprocess.run(command, check=True, cwd="..")
+
+
+@pytest.mark.integration_test
+@pytest.mark.tpu_only
+def test_autoselected_attention():
+  run_checkpoint_compatibility("autoselected")
+
+
+@pytest.mark.integration_test
+@pytest.mark.gpu_only
+def test_with_dot_product():
+  run_checkpoint_compatibility("dot_product")
diff --git a/MaxText/tests/integration_tests/checkpointing_test.py b/MaxText/tests/integration_tests/checkpointing_test.py
new file mode 100644
index 000000000..01db18cba
--- /dev/null
+++ b/MaxText/tests/integration_tests/checkpointing_test.py
@@ -0,0 +1,51 @@
+"""
+Copyright 2024 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""Integraion tests for test_checkpointing.sh"""
+from datetime import datetime
+import subprocess
+import pytest
+
+
+def run_checkpointing(attention_type):
+  """Tests grain checkpoint determinism."""
+
+  run_date = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+
+  command = [
+      "bash",
+      "end_to_end/test_checkpointing.sh",
+      f"runner_{run_date}",  # run_name
+      r"gs://runner-maxtext-logs",  # output_path
+      r"gs://maxtext-dataset",  # dataset_path
+      "False",  # collect_stack_trace
+      "grain",  # dataset_type
+      attention_type,
+  ]
+
+  subprocess.run(command, check=True, cwd="..")
+
+
+@pytest.mark.integration_test
+@pytest.mark.tpu_only
+def test_autoselected_attention():
+  run_checkpointing("autoselected")
+
+
+@pytest.mark.integration_test
+@pytest.mark.gpu_only
+def test_with_dot_product():
+  run_checkpointing("dot_product")
diff --git a/MaxText/tests/integration_tests/generate_param_only_checkpoint_test.py b/MaxText/tests/integration_tests/generate_param_only_checkpoint_test.py
new file mode 100644
index 000000000..6afb389c7
--- /dev/null
+++ b/MaxText/tests/integration_tests/generate_param_only_checkpoint_test.py
@@ -0,0 +1,53 @@
+"""
+Copyright 2024 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""Integraion tests for test_generate_param_only_checkpoint.sh"""
+from datetime import datetime
+import subprocess
+import pytest
+
+
+def run_generate_param_only_checkpoint(attention_type, quantization):
+  """Tests generating a parameter-only checkpoint."""
+
+  run_date = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+  # fmt: off
+  command = [
+      "bash",
+      "end_to_end/test_generate_param_only_checkpoint.sh",
+      "-r", f"runner_{run_date}",
+      "-o", r"gs://runner-maxtext-logs",
+      "-d", r"gs://maxtext-dataset",
+      "-i", "4",
+      "-a", attention_type,
+      "-q", quantization,
+  ]
+
+  subprocess.run(command, check=True, cwd="..")
+
+
+@pytest.mark.integration_test
+@pytest.mark.tpu_only
+@pytest.mark.parametrize("quantization", [(""), ("int8")])
+def test_autoselected_attention(quantization):
+  run_generate_param_only_checkpoint("autoselected", quantization)
+
+
+@pytest.mark.integration_test
+@pytest.mark.gpu_only
+@pytest.mark.parametrize("quantization", [(""), ("int8")])
+def test_with_dot_product(quantization):
+  run_generate_param_only_checkpoint("dot_product", quantization)
diff --git a/MaxText/tests/integration_tests/shmap_collective_matmul_test.py b/MaxText/tests/integration_tests/shmap_collective_matmul_test.py
new file mode 100644
index 000000000..55658e414
--- /dev/null
+++ b/MaxText/tests/integration_tests/shmap_collective_matmul_test.py
@@ -0,0 +1,32 @@
+"""
+Copyright 2024 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""Integraion test for pedagogical_examples/shmap_collective_matmul.py"""
+import subprocess
+import pytest
+
+
+@pytest.mark.integration_test
+@pytest.mark.tpu_only
+def test_shmap_collective_matmul_example():
+  """Validate Pedagogical Example, Shmap_collective_matmul."""
+
+  command = [
+      "python3",
+      "pedagogical_examples/shmap_collective_matmul.py",
+  ]
+
+  subprocess.run(command, check=True, cwd="..")
diff --git a/MaxText/tests/kernels_test.py b/MaxText/tests/kernels_test.py
index 5ec2d1c17..6313aa884 100644
--- a/MaxText/tests/kernels_test.py
+++ b/MaxText/tests/kernels_test.py
@@ -38,7 +38,7 @@ class RaggedAttentionTest(unittest.TestCase):
   key = jax.random.key(0)
   k1, k2, k3 = jax.random.split(key, 3)
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_ragged_mqa(self):
     q = jax.random.normal(self.k1, (self.batch_size, 1, self.head_dim), dtype=self.dtype)
     k = jax.random.normal(self.k2, (self.batch_size, self.max_target_length, self.head_dim), dtype=self.dtype)
@@ -56,7 +56,7 @@ def test_ragged_mqa(self):
         msg=f"Avg difference: {jnp.average(abs(ragged_out - reference_out))} > 1e-2",
     )
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_ragged_mha(self):
     q = jax.random.normal(self.k1, (self.batch_size, 1, self.num_query_heads, self.head_dim), dtype=self.dtype)
     k = jax.random.normal(
@@ -79,7 +79,7 @@ def test_ragged_mha(self):
         msg=f"Avg difference: {jnp.average(abs(ragged_out - reference_out))} > 1e-2",
     )
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_ragged_gqa(self):
     q = jax.random.normal(self.k1, (self.batch_size, 1, self.num_query_heads, self.head_dim), dtype=self.dtype)
     k = jax.random.normal(
diff --git a/MaxText/tests/model_test.py b/MaxText/tests/model_test.py
index 9791af93b..ed1eecdb6 100644
--- a/MaxText/tests/model_test.py
+++ b/MaxText/tests/model_test.py
@@ -105,7 +105,7 @@ def test_logits_dtype_with_cast_to_fp32(self):
   def test_logits_dtype_without_cast(self):
     self._test_logits_cast_driver(cast_logits_to_fp32=False, expected_dtype=jnp.bfloat16)
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_train_vs_prefill_and_autoregress(self):
     PREFILL_RANGE = MAX_PREFILL_PREDICT_LENGTH
 
diff --git a/MaxText/tests/multihost_dataloading_test.py b/MaxText/tests/multihost_dataloading_test.py
index ba289c040..297d75370 100644
--- a/MaxText/tests/multihost_dataloading_test.py
+++ b/MaxText/tests/multihost_dataloading_test.py
@@ -62,7 +62,7 @@ def setUp(self):
     dataset = dataset.batch(batch_size)
     self.multihost_gen = multihost_dataloading.MultiHostDataLoadIterator(dataset, self.mesh)
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_batch_sharded_data_pipeline(self):
     first_batch = next(self.multihost_gen)
     sec_batch = next(self.multihost_gen)
diff --git a/MaxText/tests/pipeline_parallelism_test.py b/MaxText/tests/pipeline_parallelism_test.py
index 193c677fa..0e1e18d25 100644
--- a/MaxText/tests/pipeline_parallelism_test.py
+++ b/MaxText/tests/pipeline_parallelism_test.py
@@ -150,7 +150,7 @@ def regular_sequential_layers_dummy_loss(
         dummy_targets,
     )
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_circular_minimum_microbatches_same_output_and_grad(self):
     # 4 stages, 8 layers (2 repeats, 1 layer per stage), 4 microbatches
     pyconfig.initialize(
@@ -167,7 +167,7 @@ def test_circular_minimum_microbatches_same_output_and_grad(self):
     config = pyconfig.config
     self.assert_pipeline_same_output_and_grad(config)
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_circular_extra_microbatches_same_output_and_grad(self):
     # 4 stages, 8 layers (2 repeats, 1 layer per stage), 8 microbatches
     pyconfig.initialize(
@@ -184,7 +184,7 @@ def test_circular_extra_microbatches_same_output_and_grad(self):
     config = pyconfig.config
     self.assert_pipeline_same_output_and_grad(config)
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_non_circular_same_output_and_grad(self):
     # 4 stages, 4 layers (no circular repeats, 1 layer per stage), 4 microbatches
     pyconfig.initialize(
@@ -201,7 +201,7 @@ def test_non_circular_same_output_and_grad(self):
     config = pyconfig.config
     self.assert_pipeline_same_output_and_grad(config)
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_full_train_circular(self):
     # Run a full train.py call with 4 stages, 32 layers (2 layers per stage, 4 circular repeats), 8 microbatches
     train_main(
@@ -231,7 +231,7 @@ def test_full_train_circular(self):
         ]
     )
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_delay_activation_forwarding_same_output_and_grad(self):
     # 4 stages, delayed activation forwarding, 8 layers (2 repeats, 1 layer per stage), 8 microbatches
     pyconfig.initialize(
@@ -249,7 +249,7 @@ def test_delay_activation_forwarding_same_output_and_grad(self):
     config = pyconfig.config
     self.assert_pipeline_same_output_and_grad(config)
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_full_train_non_circular(self):
     # Run a full train.py call with 4 stages, 32 layers (8 layers per stage), 8 microbatches
     train_main(
diff --git a/MaxText/tests/simple_decoder_layer_test.py b/MaxText/tests/simple_decoder_layer_test.py
index afa2d0aeb..ba6fa7c3c 100644
--- a/MaxText/tests/simple_decoder_layer_test.py
+++ b/MaxText/tests/simple_decoder_layer_test.py
@@ -18,7 +18,7 @@
 
 class SimpleDecoderLayerTest(unittest.TestCase):
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_simple_decoder_layer(self):
     train_main(
         [
@@ -34,7 +34,7 @@ def test_simple_decoder_layer(self):
         ]
     )
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_mlp_decoder_layer(self):
     train_main(
         [
diff --git a/MaxText/tests/standalone_dl_ckpt_test.py b/MaxText/tests/standalone_dl_ckpt_test.py
index 1bd774946..652b652e1 100644
--- a/MaxText/tests/standalone_dl_ckpt_test.py
+++ b/MaxText/tests/standalone_dl_ckpt_test.py
@@ -34,7 +34,7 @@ def _get_random_test_name(self, test_name):
     random_run_name = test_name + date_time + random_string
     return random_run_name
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_standalone_dataloader(self):
     random_run_name = self._get_random_test_name("standalone_dataloader")
     sdl_main(
@@ -50,7 +50,7 @@ def test_standalone_dataloader(self):
         )
     )  # need to pass relative path to tokenizer
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_standalone_checkpointer(self):
     random_run_name = self._get_random_test_name("standalone_checkpointer")
     # checkpoint at 50
diff --git a/MaxText/tests/tokenizer_test.py b/MaxText/tests/tokenizer_test.py
index c5222f0de..a64888e42 100644
--- a/MaxText/tests/tokenizer_test.py
+++ b/MaxText/tests/tokenizer_test.py
@@ -58,12 +58,12 @@ def tearDownClass(cls):
     os.remove(cls.tokenizer_path)
 
   @pytest.mark.skip(reason="mohitkhatwani@ will fix this")
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_tokenize(self):
     text = "This is a test"
     self.assertTrue(np.array_equal(self.source_tokenizer.encode(text).numpy(), self.test_tokenizer.encode(text).numpy()))
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_detokenize(self):
     tokens = [66, 12, 10, 698]
     self.assertEqual(np.asarray(self.source_tokenizer.decode(tokens)), np.asarray(self.test_tokenizer.decode(tokens)))
@@ -86,13 +86,13 @@ def setUpClass(cls):
     train_ds_builder = tfds.builder(dataset_name)
     cls.dataset = train_ds_builder.as_dataset(split="train", read_config=read_config, shuffle_files=True)
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_tokenize(self):
     text = "This is a test"
     tokens = [2028, 374, 264, 1296]
     self.assertTrue(np.array_equal(self.source_tokenizer.encode(text), tokens))
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_detokenize(self):
     tokens = [2028, 374, 264, 1296]
     text = "This is a test"
diff --git a/MaxText/tests/train_compile_test.py b/MaxText/tests/train_compile_test.py
index 87977079c..880a7a30c 100644
--- a/MaxText/tests/train_compile_test.py
+++ b/MaxText/tests/train_compile_test.py
@@ -24,7 +24,7 @@
 class TrainCompile(unittest.TestCase):
   """Tests for the Ahead of Time Compilation functionality, train_compile.py"""
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_save_compiled_v4(self):
     compiled_trainstep_file = "/tmp/test_compiled_v4.pickle"
     train_compile_main(
@@ -40,7 +40,7 @@ def test_save_compiled_v4(self):
         )
     )
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_save_compiled_v5e(self):
     compiled_trainstep_file = "/tmp/test_compiled_v5e.pickle"
     train_compile_main(
@@ -79,7 +79,7 @@ def test_minimal_offloaded_v5e(self):
         )
     )
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_save_compiled_v5p_two_slices(self):
     compiled_trainstep_file = "/tmp/test_compiled_v5p_two_slices.pickle"
     train_compile_main(
@@ -97,7 +97,7 @@ def test_save_compiled_v5p_two_slices(self):
 
   # TODO (b/374764692) : Enable when v6e AOT test when stable Jax supports v6e AOT.
   @pytest.mark.skip(reason="Enable when downstream v6e AOT support reaches stable Jax.")
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_save_compiled_v6e(self):
     compiled_trainstep_file = "/tmp/test_compiled_v6e.pickle"
     train_compile_main(
@@ -113,7 +113,7 @@ def test_save_compiled_v6e(self):
         )
     )
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_sequence_parallelism(self):
     compiled_trainstep_file = "/tmp/test_compiled.pickle"
     train_compile_main(
@@ -131,7 +131,7 @@ def test_sequence_parallelism(self):
         )
     )
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_remat_save_dot_except_mlpwi(self):
     compiled_trainstep_file = "/tmp/test_remat_save_dot_except_mlpwi.pickle"
     train_compile_main(
@@ -153,7 +153,7 @@ def test_remat_save_dot_except_mlpwi(self):
         )
     )
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_remat_save_dot_except_mlp(self):
     compiled_trainstep_file = "/tmp/test_remat_save_dot_except_mlp.pickle"
     train_compile_main(
@@ -175,7 +175,7 @@ def test_remat_save_dot_except_mlp(self):
         )
     )
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_remat_save_qkv_proj(self):
     compiled_trainstep_file = "/tmp/test_remat_save_qkv_proj.pickle"
     train_compile_main(
@@ -197,7 +197,7 @@ def test_remat_save_qkv_proj(self):
         )
     )
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_remat_full(self):
     compiled_trainstep_file = "/tmp/test_remat_full.pickle"
     train_compile_main(
@@ -219,7 +219,7 @@ def test_remat_full(self):
         )
     )
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_custom_64x4_mesh(self):
     compiled_trainstep_file = "/tmp/test_custom_64x4_mesh.pickle"
     train_compile_main(
@@ -241,7 +241,7 @@ def test_custom_64x4_mesh(self):
 
   # TODO (b/376470419) : Enable when AOT test work with host offloading.
   @pytest.mark.skip(reason="Enable when AOT test work with host offloading.")
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_llama3_1_70b_opt_offload(self):
     compiled_trainstep_file = "/tmp/test_llama3_1_70b_opt_offload.pickle"
     train_compile_main(
@@ -259,7 +259,7 @@ def test_llama3_1_70b_opt_offload(self):
         )
     )
 
-  @pytest.mark.tpu
+  @pytest.mark.tpu_only
   def test_custom_32x8_mesh(self):
     compiled_trainstep_file = "/tmp/test_custom_32x8_mesh.pickle"
     train_compile_main(
diff --git a/MaxText/tests/train_tests.py b/MaxText/tests/train_tests.py
new file mode 100644
index 000000000..d09e01d05
--- /dev/null
+++ b/MaxText/tests/train_tests.py
@@ -0,0 +1,184 @@
+"""
+Copyright 2023 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""Tests for train.py with various configs"""
+import os
+import unittest
+import pytest
+from train import main as train_main
+from absl.testing import absltest
+
+
+class TrainTests(unittest.TestCase):
+  """Tests train.py with various configs"""
+
+  CONFIGS = {
+      "base": [  # short test for train.py with TFDS c4
+          None,
+          "configs/base.yml",
+          r"base_output_directory=gs://runner-maxtext-logs",
+          "run_name=runner_test",
+          r"dataset_path=gs://maxtext-dataset",
+          "steps=2",
+          "enable_checkpointing=False",
+          r"tokenizer_path=../assets/tokenizer.llama2",
+      ],
+      "synthetic": [  # tests base config with synthtic dataset
+          None,
+          "configs/base.yml",
+          r"base_output_directory=gs://runner-maxtext-logs",
+          "run_name=runner_test",
+          r"dataset_path=gs://maxtext-dataset",
+          "steps=2",
+          "enable_checkpointing=False",
+          "dataset_type=synthetic",
+          r"tokenizer_path=../assets/tokenizer.llama2",
+      ],
+      "pdb_lt_1": [  # tests base config with per_device_batch_size < 1
+          None,
+          "configs/base.yml",
+          r"base_output_directory=gs://runner-maxtext-logs",
+          "run_name=runner_test",
+          r"dataset_path=gs://maxtext-dataset",
+          "steps=2",
+          "enable_checkpointing=False",
+          "per_device_batch_size=0.25",
+          "ici_tensor_parallelism=4",
+          r"tokenizer_path=../assets/tokenizer.llama2",
+      ],
+      "int8": [  # tests base config with int8
+          None,
+          "configs/base.yml",
+          r"base_output_directory=gs://runner-maxtext-logs",
+          "run_name=runner_test",
+          r"dataset_path=gs://maxtext-dataset",
+          "quantization=int8",
+          "steps=2",
+          "enable_checkpointing=False",
+          r"tokenizer_path=../assets/tokenizer.llama2",
+      ],
+      "fp8": [  # tests base config with fp8
+          None,
+          "configs/base.yml",
+          r"base_output_directory=gs://runner-maxtext-logs",
+          "run_name=runner_test",
+          r"dataset_path=gs://maxtext-dataset",
+          "quantization=fp8",
+          "steps=2",
+          "enable_checkpointing=False",
+          r"tokenizer_path=../assets/tokenizer.llama2",
+      ],
+      "dropout": [  # tests base config with dropout
+          None,
+          "configs/base.yml",
+          r"base_output_directory=gs://runner-maxtext-logs",
+          "run_name=runner_test",
+          r"dataset_path=gs://maxtext-dataset",
+          "steps=2",
+          "enable_checkpointing=False",
+          "max_target_length=128",
+          "per_device_batch_size=1",
+          "dropout_rate=0.02",
+          r"tokenizer_path=../assets/tokenizer.llama2",
+      ],
+      "hf_input_pipeline": [  # test for train.py with TFDS c4, using HF input pipeline
+          None,
+          "configs/base.yml",
+          r"base_output_directory=gs://runner-maxtext-logs",
+          "run_name=runner_test",
+          "steps=2",
+          "enable_checkpointing=False",
+          "dataset_type=hf",
+          "hf_path=parquet",
+          r"hf_train_files=gs://maxtext-dataset/hf/c4/c4-train-00000-of-01637.parquet",
+          r"tokenizer_path=google-t5/t5-large",
+      ],
+  }
+
+  @pytest.mark.tpu_only
+  def test_tpu_base(self):
+    train_main(TrainTests.CONFIGS["base"])
+
+  @pytest.mark.gpu_only
+  def test_gpu_base(self):
+    train_main(TrainTests.CONFIGS["base"] + ["attention=dot_product"])
+
+  @pytest.mark.tpu_only
+  def test_tpu_synthetic(self):
+    train_main(TrainTests.CONFIGS["synthetic"])
+
+  @pytest.mark.gpu_only
+  def test_gpu_synthetic(self):
+    train_main(TrainTests.CONFIGS["synthetic"] + ["attention=dot_product"])
+
+  @pytest.mark.tpu_only
+  def test_tpu_pdb_lt_1(self):
+    train_main(TrainTests.CONFIGS["pdb_lt_1"])
+
+  @pytest.mark.gpu_only
+  def test_gpu_pdb_lt_1(self):
+    train_main(TrainTests.CONFIGS["pdb_lt_1"] + ["attention=dot_product"])
+
+  @pytest.mark.tpu_only
+  def test_tpu_int8(self):
+    train_main(TrainTests.CONFIGS["int8"])
+
+  @pytest.mark.gpu_only
+  def test_gpu_int8(self):
+    train_main(TrainTests.CONFIGS["int8"] + ["attention=dot_product"])
+
+  @pytest.mark.tpu_only
+  def test_tpu_fp8(self):
+    train_main(TrainTests.CONFIGS["fp8"])
+
+  @pytest.mark.gpu_only
+  def test_gpu_fp8(self):
+    train_main(TrainTests.CONFIGS["fp8"] + ["attention=dot_product"])
+
+  @pytest.mark.tpu_only
+  def test_tpu_dropout(self):
+    train_main(TrainTests.CONFIGS["dropout"])
+
+  @pytest.mark.gpu_only
+  def test_gpu_dropout(self):
+    train_main(TrainTests.CONFIGS["dropout"] + ["attention=dot_product"])
+
+  @pytest.mark.tpu_only
+  def test_tpu_hf_input_pipeline(self):
+    train_main(TrainTests.CONFIGS["hf_input_pipeline"])
+
+  @pytest.mark.gpu_only
+  def test_gpu_hf_input_pipeline(self):
+    train_main(TrainTests.CONFIGS["hf_input_pipeline"] + ["attention=dot_product"])
+
+  @pytest.mark.gpu_only
+  def test_gpu_cudnn_flash_te(self):
+    cudnn_flash_te = [  # tests base config on GPU with flash attention"""
+        None,
+        "configs/base.yml",
+        r"base_output_directory=gs://runner-maxtext-logs",
+        "run_name=runner_test",
+        r"dataset_path=gs://maxtext-dataset",
+        "steps=2",
+        "enable_checkpointing=False",
+        "attention=cudnn_flash_te",
+        r"tokenizer_path=../assets/tokenizer.llama2",
+    ]
+    train_main(cudnn_flash_te)
+
+
+if __name__ == "__main__":
+  absltest.main()