diff --git a/models/demos/llama3/demo/simple_text_demo.py b/models/demos/llama3/demo/simple_text_demo.py
index 67cb1e35e18..4cf6a04f57c 100644
--- a/models/demos/llama3/demo/simple_text_demo.py
+++ b/models/demos/llama3/demo/simple_text_demo.py
@@ -293,6 +293,10 @@ def test_llama_demo_text(
     if is_ci_env and (optimizations == LlamaOptimizations.accuracy or not ci_only):
         pytest.skip("CI only runs the CI-only tests")
 
+    # TODO This can be tackled by reducing the number of iterations we run on CI on N150/N300 machines
+    if is_ci_env and mesh_device.get_num_devices() < 4 and batch_size == 32:
+        pytest.skip("Some llama3 models may run out of memory with CI settings when batch_size=32")
+
     # TODO: Remove this once all batch sizes are supported on TG
     if os.environ.get("FAKE_DEVICE") == "TG" and batch_size not in [1, 32]:
         pytest.skip("TG only supports batch 1 and 32")
diff --git a/models/demos/llama3/lt b/models/demos/llama3/lt
index 31fea098e0e..0283e5d74be 100644
--- a/models/demos/llama3/lt
+++ b/models/demos/llama3/lt
@@ -839,6 +839,8 @@ def run_entry_command(entry, screen_lock, output_entries, screen_needs_update):
         "demo-acc": "pytest models/demos/llama3/demo/simple_text_demo.py -k accuracy-batch-1",
         "demo-32": "pytest models/demos/llama3/demo/simple_text_demo.py -k performance-batch-32",
         "demo-long": "pytest models/demos/llama3/demo/simple_text_demo.py -k performance-long",
+        "demo-ci-1": "pytest models/demos/llama3/demo/simple_text_demo.py -k performance-ci-1",
+        "demo-ci-32": "pytest models/demos/llama3/demo/simple_text_demo.py -k performance-ci-32",
         "attention": "pytest models/demos/llama3/tests/test_llama_attention.py",
         "attention-prefill": "pytest models/demos/llama3/tests/test_llama_attention_prefill.py",
         "mlp": "pytest models/demos/llama3/tests/test_llama_mlp.py",
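
For context, the new `demo-ci-1`/`demo-ci-32` shortcuts rely on pytest's `-k` expression matching the test ids that `@pytest.mark.parametrize` generates in `simple_text_demo.py`. The ids `performance-ci-1` and `performance-ci-32` are taken from the diff, but the parameter signature below is a hypothetical, minimal sketch of how such ids can be declared; the real parametrization in `simple_text_demo.py` may differ:

```python
# Hypothetical sketch: declaring "performance-ci-*" test ids so that
# `pytest -k performance-ci-32` selects only that variant. Parameter names
# here are illustrative, not the actual test's signature.
import pytest


@pytest.mark.parametrize(
    "batch_size, ci_only",
    [
        pytest.param(1, True, id="performance-ci-1"),
        pytest.param(32, True, id="performance-ci-32"),
    ],
)
def test_llama_demo_text(batch_size, ci_only):
    # `-k performance-ci-1` matches the id substring, so only the batch-1
    # CI variant is collected and run.
    assert batch_size in (1, 32)
```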