Support run trainer locally #111

Open: wants to merge 3 commits into `main`
51 changes: 32 additions & 19 deletions README.md
@@ -121,7 +121,26 @@
and attributes where this model code came from, if any. This also helps to
showcase the changes we have made to make it performant on TPU. The original
version is not expected to be run.

## Contributing

Contributions are welcome! Please feel free to submit a pull request.

When developing, use `pip install -e '.[dev]'` to install dev dependencies such
as the linter and formatter.
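
For example, from the root of a clone of this repository:

```sh
# Editable install including the dev extras (linter, formatter, test tools).
pip install -e '.[dev]'
```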

### How to run tests:

```sh
pytest
```
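
To run a single test file or filter tests by name, the usual pytest selectors
apply (the path and keyword below are hypothetical):

```sh
# Run one test file, keep only tests matching "llama",
# and stop at the first failure.
pytest tests/test_train.py -k "llama" -x
```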

### How to run some of the tests, and re-run them whenever you change a file:

```sh
tp -i test ... # replace `...` with paths to test files or directories
```
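
For example, to watch a single test directory (the path here is hypothetical):

```sh
tp -i test torchprime/torch_xla_models/tests
```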

### How to run HuggingFace transformer models
TorchPrime supports running HuggingFace models via `tp run`. To use
HuggingFace models, you can clone
[huggingface/transformers](https://github.com/huggingface/transformers) under
@@ -137,32 +156,26 @@
add the `--use-hf` flag to the `tp run` command:
```sh
tp run --use-hf torchprime/hf_models/train.py
```
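
Putting it together, a typical flow might look like this (the clone
destination shown is an assumption; the exact directory is specified in the
collapsed README lines above):

```sh
# Clone transformers (destination directory is an assumption), then train.
git clone https://github.com/huggingface/transformers.git
tp run --use-hf torchprime/hf_models/train.py
```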

### How to run inside the docker container locally

You can also run locally with docker, without XPK. Running inside the docker
container uses the same dependencies and build process as the XPK approach,
which improves hermeticity and reliability.

```sh
tp docker-run torchprime/torch_xla_models/train.py
```

This runs the TorchPrime docker image locally. You can also add `--use-hf` to
run a HuggingFace model locally:

```sh
tp docker-run --use-hf torchprime/hf_models/train.py
```

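`tp docker-run` forwards a small set of environment variables (`HF_TOKEN`,
`XLA_IR_DEBUG`, `XLA_HLO_DEBUG`, `LIBTPU_INIT_ARGS`; see
`_DOCKER_ENV_FORWARD_LIST` in `torchprime/launcher/cli.py` below) into the
container, so credentials and debug flags set in your shell carry over. For
example:

```sh
# Forward a HuggingFace token and enable torch_xla IR debugging
# inside the container.
export HF_TOKEN=<your-token>
export XLA_IR_DEBUG=1
tp docker-run --use-hf torchprime/hf_models/train.py
```
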
### How to format:

```sh
ruff format
```

### How to lint:

```sh
ruff check [--fix]
3 changes: 2 additions & 1 deletion torchprime/launcher/buildpush.py
@@ -61,7 +61,8 @@ def buildpush(
    _run(
      f"{sudo_cmd} docker tag {docker_tag} {docker_url}",
    )
    if torchprime_docker_tag != "local_run":
      _run(f"{sudo_cmd} docker push {docker_url}")
  except subprocess.CalledProcessError as e:
    print(f"Error running command: {e}")
    exit(e.returncode)

> **Collaborator** (inline comment on the `local_run` check): This introduces
> a magical constant. I think it's simpler if we add a `push=True` function
> argument, and have the other file call this with `push=False`.
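
For reference, a rough sketch of the reviewer's suggestion (hypothetical, not
the code in this PR): `buildpush` would take a `push` argument instead of
comparing the tag against a magic string, and the local `docker-run` path
would pass `push=False`:

```python
def buildpush(docker_project, torchprime_docker_tag, build_arg=None, *, push=True):
  ...  # build and tag the image exactly as before
  if push:  # replaces the `torchprime_docker_tag != "local_run"` check
    _run(f"{sudo_cmd} docker push {docker_url}")
  return docker_url


# cli.py's docker_run would then call:
# buildpush(docker_project, torchprime_docker_tag="local_run",
#           build_arg=build_arg, push=False)
```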
62 changes: 57 additions & 5 deletions torchprime/launcher/cli.py
@@ -23,6 +23,13 @@
import torchprime.launcher.doctor
from torchprime.launcher.buildpush import buildpush

# Environment variables forwarded into the docker container by
# `tp run` and `tp docker-run`.
_DOCKER_ENV_FORWARD_LIST = [
  "HF_TOKEN",  # HuggingFace token
  "XLA_IR_DEBUG",  # torch_xla debugging flag
  "XLA_HLO_DEBUG",  # torch_xla debugging flag
  "LIBTPU_INIT_ARGS",  # XLA flags
]


@dataclass_json
@dataclass
@@ -194,6 +201,55 @@ def create_and_activate_gcloud(gcloud_config_name, config: Config):
)


@cli.command(
  name="docker-run",
  context_settings=dict(
    ignore_unknown_options=True,
  ),
)
@click.argument("args", nargs=-1, type=click.UNPROCESSED)
@click.option("--use-hf", is_flag=True, help="Use HuggingFace transformer")
def docker_run(args, use_hf: bool):
  """
  Runs the provided training command locally for quick testing.
  """
  config = read_config()

  click.echo(get_project_dir().absolute())

  # Build the docker image, tagging it for local use only (not pushed).
  build_arg = "USE_TRANSFORMERS=true" if use_hf else None
  docker_project = config.docker_project
  if docker_project is None:
    docker_project = config.project
  docker_url = buildpush(
    docker_project, torchprime_docker_tag="local_run", build_arg=build_arg
  )

  # Forward a bunch of important env vars.
  env_forwarding = [
    arg for env_var in _DOCKER_ENV_FORWARD_LIST for arg in forward_env(env_var)
  ]

  # Run the training command inside the container, mounting the current
  # working directory as /workspace.
  command = [
    "python",
  ] + list(args)
  docker_command = [
    "docker",
    "run",
    "-i",
    *env_forwarding,
    "--privileged",
    "--net",
    "host",
    "--rm",
    "-v",
    f"{os.getcwd()}:/workspace",
    "-w",
    "/workspace",
    docker_url,
  ] + command
  subprocess.run(docker_command, check=True)


@cli.command(
  context_settings=dict(
    ignore_unknown_options=True,
@@ -235,12 +291,8 @@ def run(args, name: str | None, use_hf: bool):

  # Forward a bunch of important env vars.
  env_forwarding = [
    arg for env_var in _DOCKER_ENV_FORWARD_LIST for arg in forward_env(env_var)
  ]

  # Pass artifact dir and jobset name as env vars.
  artifact_arg = [
    "--env",