From 32ba1013afb827afeaf1fbb1498d6f521b820033 Mon Sep 17 00:00:00 2001
From: jjallaire
Date: Thu, 17 Oct 2024 17:15:47 +0100
Subject: [PATCH] Use NamedTuple for SandboxEnvironmentSpec (#703)

* Use a NamedTuple for SandboxEnvironmentSpec
* change order of sandbox type
* update json schema
* cwd_relative_path for sandbox config
* support Dockerfile as sandbox config
* Revert "support Dockerfile as sandbox config"

This reverts commit 6e617a6c6d451a95e520c5857babe35aac7c3ee2.

---------

Co-authored-by: jjallaire-aisi
Co-authored-by: Charles Teague
---
 src/inspect_ai/_cli/util.py                 |  8 +-
 src/inspect_ai/_eval/eval.py                | 16 ++--
 src/inspect_ai/_eval/evalset.py             |  8 +-
 src/inspect_ai/_eval/loader.py              | 35 ++++----
 src/inspect_ai/_eval/run.py                 | 12 +--
 src/inspect_ai/_eval/task/log.py            | 15 ++--
 src/inspect_ai/_eval/task/run.py            |  5 +-
 src/inspect_ai/_eval/task/sandbox.py        | 50 ++++++-----
 src/inspect_ai/_eval/task/task.py           | 17 ++--
 src/inspect_ai/_view/www/log-schema.json    | 93 +++++++-------------
 src/inspect_ai/_view/www/src/types/log.d.ts | 32 ++-----
 src/inspect_ai/dataset/_dataset.py          | 56 +++++++++----
 src/inspect_ai/dataset/_sources/util.py     |  7 +-
 src/inspect_ai/dataset/_util.py             |  4 +-
 src/inspect_ai/log/_log.py                  |  5 +-
 src/inspect_ai/util/__init__.py             |  2 +
 src/inspect_ai/util/_sandbox/__init__.py    |  8 +-
 src/inspect_ai/util/_sandbox/environment.py | 31 ++++++-
 tests/tools/test_web_browser.py             |  3 +-
 19 files changed, 214 insertions(+), 193 deletions(-)

diff --git a/src/inspect_ai/_cli/util.py b/src/inspect_ai/_cli/util.py
index e7f0bc08a..4beaad3f7 100644
--- a/src/inspect_ai/_cli/util.py
+++ b/src/inspect_ai/_cli/util.py
@@ -2,6 +2,8 @@
 
 import yaml
 
+from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
+
 
 def parse_cli_args(args: tuple[str] | list[str] | None) -> dict[str, Any]:
     params: dict[str, Any] = dict()
@@ -18,12 +20,12 @@ def parse_cli_args(args: tuple[str] | list[str] | None) -> dict[str, Any]:
     return params
 
 
-def parse_sandbox(sandbox: str | None) -> str | tuple[str, str] | None:
+def parse_sandbox(sandbox: str | None) -> SandboxEnvironmentSpec | None:
     if sandbox is not None:
         parts = sandbox.split(":", maxsplit=1)
         if len(parts) == 1:
-            return sandbox
+            return SandboxEnvironmentSpec(sandbox)
         else:
-            return (parts[0], parts[1])
+            return SandboxEnvironmentSpec(parts[0], parts[1])
     else:
         return None
diff --git a/src/inspect_ai/_eval/eval.py b/src/inspect_ai/_eval/eval.py
index 242bc4a12..372e82045 100644
--- a/src/inspect_ai/_eval/eval.py
+++ b/src/inspect_ai/_eval/eval.py
@@ -29,7 +29,7 @@
 from inspect_ai.scorer._reducer import reducer_log_names
 from inspect_ai.solver._chain import chain
 from inspect_ai.solver._solver import Solver, SolverSpec
-from inspect_ai.util import SandboxEnvironmentSpec
+from inspect_ai.util import SandboxEnvironmentType
 
 from .context import init_eval_context
 from .loader import ResolvedTask, resolve_tasks
@@ -45,7 +45,7 @@ def eval(
     model_base_url: str | None = None,
     model_args: dict[str, Any] = dict(),
     task_args: dict[str, Any] = dict(),
-    sandbox: SandboxEnvironmentSpec | None = None,
+    sandbox: SandboxEnvironmentType | None = None,
     sandbox_cleanup: bool | None = None,
     solver: Solver | list[Solver] | SolverSpec | None = None,
     trace: bool | None = None,
@@ -80,8 +80,8 @@ def eval(
             with the model API.
model_args (dict[str,Any]): Model creation parameters task_args (dict[str,Any]): Task arguments - sandbox (SandboxEnvironmentSpec | None): Sandbox - environment type (or optionally a tuple with type and config file) + sandbox (SandboxEnvironmentType | None): Sandbox environment type + (or optionally a str or tuple with a shorthand spec) sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes (defaults to True) solver (Solver | list[Solver] | SolverSpec | None): Alternative solver for task(s). @@ -166,7 +166,7 @@ async def eval_async( model_base_url: str | None = None, model_args: dict[str, Any] = dict(), task_args: dict[str, Any] = dict(), - sandbox: SandboxEnvironmentSpec | None = None, + sandbox: SandboxEnvironmentType | None = None, sandbox_cleanup: bool | None = None, solver: Solver | list[Solver] | SolverSpec | None = None, trace: bool | None = None, @@ -201,8 +201,8 @@ async def eval_async( with the model API. model_args (dict[str,Any]): Model creation parameters task_args (dict[str,Any]): Task arguments - sandbox (SandboxEnvironentSpec | None): Sandbox - environment type (or optionally a tuple with type and config file) + sandbox (SandboxEnvironmentType | None): Sandbox environment type + (or optionally a str or tuple with a shorthand spec) sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes (defaults to True) solver (Solver | list[Solver] | SolverSpec | None): Alternative solver for task(s). @@ -676,7 +676,7 @@ def eval_init( model_base_url: str | None = None, model_args: dict[str, Any] = dict(), task_args: dict[str, Any] = dict(), - sandbox: SandboxEnvironmentSpec | None = None, + sandbox: SandboxEnvironmentType | None = None, trace: bool | None = None, approval: str | list[ApprovalPolicy] | ApprovalPolicyConfig | None = None, max_subprocesses: int | None = None, diff --git a/src/inspect_ai/_eval/evalset.py b/src/inspect_ai/_eval/evalset.py index b4d3068cc..d3d168262 100644 --- a/src/inspect_ai/_eval/evalset.py +++ b/src/inspect_ai/_eval/evalset.py @@ -33,7 +33,7 @@ ) from inspect_ai.model._generate_config import GenerateConfig from inspect_ai.solver._solver import Solver, SolverSpec -from inspect_ai.util import SandboxEnvironmentSpec +from inspect_ai.util import SandboxEnvironmentType from .eval import eval, eval_init from .loader import ResolvedTask, resolve_task_args @@ -54,7 +54,7 @@ def eval_set( model_base_url: str | None = None, model_args: dict[str, Any] = dict(), task_args: dict[str, Any] = dict(), - sandbox: SandboxEnvironmentSpec | None = None, + sandbox: SandboxEnvironmentType | None = None, sandbox_cleanup: bool | None = None, solver: Solver | list[Solver] | SolverSpec | None = None, trace: bool | None = None, @@ -101,8 +101,8 @@ def eval_set( with the model API. 
model_args (dict[str,Any]): Model creation parameters task_args (dict[str,Any]): Task arguments - sandbox (SandboxEnvironmentSpec | None): Sandbox - environment type (or optionally a tuple with type and config file) + sandbox (SandboxEnvironmentType | None): Sandbox environment type + (or optionally a str or tuple with a shorthand spec) sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes (defaults to True) solver (Solver | list[Solver] | SolverSpec | None): Alternative solver(s) for diff --git a/src/inspect_ai/_eval/loader.py b/src/inspect_ai/_eval/loader.py index b7b2c64ac..2504d9be2 100644 --- a/src/inspect_ai/_eval/loader.py +++ b/src/inspect_ai/_eval/loader.py @@ -24,7 +24,8 @@ ) from inspect_ai.model import Model, ModelName from inspect_ai.solver._solver import Solver, SolverSpec -from inspect_ai.util import SandboxEnvironmentSpec +from inspect_ai.util import SandboxEnvironmentSpec, SandboxEnvironmentType +from inspect_ai.util._sandbox.environment import resolve_sandbox_environment from inspect_ai.util._sandbox.registry import registry_find_sandboxenv from .list import task_files @@ -42,7 +43,7 @@ class ResolvedTask: task_args: dict[str, Any] task_file: str | None model: Model - sandbox: tuple[str, str | None] | None + sandbox: SandboxEnvironmentSpec | None sequence: int id: str | None = field(default=None) sample_source: EvalSampleSource | None = field(default=None) @@ -61,7 +62,7 @@ def resolve_tasks( tasks: Tasks, task_args: dict[str, Any], model: Model, - sandbox: SandboxEnvironmentSpec | None, + sandbox: SandboxEnvironmentType | None, ) -> list[ResolvedTask]: def as_resolved_tasks(tasks: list[Task]) -> list[ResolvedTask]: return [ @@ -169,24 +170,18 @@ def resolve_task_args(task: Task) -> dict[str, Any]: def resolve_task_sandbox( - task: Task, sandbox: SandboxEnvironmentSpec | None -) -> tuple[str, str | None] | None: + task: Task, sandbox: SandboxEnvironmentType | None +) -> SandboxEnvironmentSpec | None: # do the resolution - resolved_sandbox = ( - (sandbox, None) - if isinstance(sandbox, str) - else sandbox - if sandbox is not None - else task.sandbox - ) + resolved_sandbox = resolve_sandbox_environment(sandbox) or task.sandbox # if we have a sandbox with no config, see if there are implcit # config files available for the provider if resolved_sandbox is not None: # look for default - if resolved_sandbox[1] is None: + if resolved_sandbox.config is None: # get config files for this type - sandboxenv_type = registry_find_sandboxenv(resolved_sandbox[0]) + sandboxenv_type = registry_find_sandboxenv(resolved_sandbox.type) config_files_fn = cast( Callable[..., list[str]], getattr(sandboxenv_type, "config_files") ) @@ -197,15 +192,19 @@ def resolve_task_sandbox( for config_file in config_files: config_file_path = os.path.join(src_dir, config_file) if os.path.isfile(config_file_path): - resolved_sandbox = (resolved_sandbox[0], config_file) + resolved_sandbox = SandboxEnvironmentSpec( + resolved_sandbox.type, config_file + ) break # resolve relative paths - if resolved_sandbox[1] is not None: - file_path = Path(resolved_sandbox[1]) + if resolved_sandbox.config is not None: + file_path = Path(resolved_sandbox.config) if not file_path.is_absolute(): file_path = Path(task_run_dir(task)) / file_path - resolved_sandbox = (resolved_sandbox[0], file_path.as_posix()) + resolved_sandbox = SandboxEnvironmentSpec( + resolved_sandbox.type, file_path.as_posix() + ) # return resolved sandbox return resolved_sandbox diff --git a/src/inspect_ai/_eval/run.py 
b/src/inspect_ai/_eval/run.py index fbad6fa93..1ad604347 100644 --- a/src/inspect_ai/_eval/run.py +++ b/src/inspect_ai/_eval/run.py @@ -27,7 +27,7 @@ from .task.log import TaskLogger from .task.run import TaskRunOptions, create_sample_semaphore, task_run from .task.rundir import task_run_dir_switching -from .task.sandbox import resolve_sandbox_for_task +from .task.sandbox import TaskSandboxEnvironment, resolve_sandbox_for_task from .task.util import task_run_dir log = logging.getLogger(__name__) @@ -310,7 +310,7 @@ async def startup_sandbox_environments( tasks: list[ResolvedTask], cleanup: bool ) -> Callable[[], Awaitable[None]]: # find unique sandboxenvs - sandboxenvs: Set[tuple[str, str | None, str]] = set() + sandboxenvs: Set[TaskSandboxEnvironment] = set() for task in tasks: # resolve each sample and add to sandboxenvs for sample in task.task.dataset: @@ -322,16 +322,16 @@ async def startup_sandbox_environments( cleanups: list[tuple[TaskCleanup, str | None, str]] = [] for sandboxenv in sandboxenvs: # find type - sandboxenv_type = registry_find_sandboxenv(sandboxenv[0]) + sandboxenv_type = registry_find_sandboxenv(sandboxenv.sandbox.type) # run startup task_init = cast(TaskInit, getattr(sandboxenv_type, "task_init")) - with chdir(sandboxenv[2]): - await task_init("startup", sandboxenv[1]) + with chdir(sandboxenv.run_dir): + await task_init("startup", sandboxenv.sandbox.config) # append cleanup method task_cleanup = cast(TaskCleanup, getattr(sandboxenv_type, "task_cleanup")) - cleanups.append((task_cleanup, sandboxenv[1], sandboxenv[2])) + cleanups.append((task_cleanup, sandboxenv.sandbox.config, sandboxenv.run_dir)) # return shutdown method async def shutdown() -> None: diff --git a/src/inspect_ai/_eval/task/log.py b/src/inspect_ai/_eval/task/log.py index 06d677a5d..2113f03e2 100644 --- a/src/inspect_ai/_eval/task/log.py +++ b/src/inspect_ai/_eval/task/log.py @@ -37,6 +37,7 @@ from inspect_ai.scorer._metric import SampleScore from inspect_ai.solver import Plan, Solver, TaskState from inspect_ai.solver._solver import SolverSpec +from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec class TaskLogger: @@ -50,7 +51,7 @@ def __init__( solver: SolverSpec | None, model: Model, dataset: Dataset, - sandbox: tuple[str, str | None] | None, + sandbox: SandboxEnvironmentSpec | None, task_attribs: dict[str, Any], task_args: dict[str, Any], model_args: dict[str, Any], @@ -72,6 +73,12 @@ def __init__( if "api_key" in model_args: del model_args["api_key"] + # cwd_relative_path for sandbox config + if sandbox and sandbox.config: + sandbox = SandboxEnvironmentSpec( + sandbox.type, cwd_relative_path(sandbox.config) + ) + # create eval spec self.eval = EvalSpec( run_id=run_id, @@ -155,11 +162,7 @@ def log_sample( choices=sample.choices, target=sample.target, metadata=state.metadata if state.metadata else {}, - sandbox=( - (sample.sandbox, None) - if isinstance(sample.sandbox, str) - else sample.sandbox - ), + sandbox=sample.sandbox, files=list(sample.files.keys()) if sample.files else None, setup=sample.setup, messages=state.messages, diff --git a/src/inspect_ai/_eval/task/run.py b/src/inspect_ai/_eval/task/run.py index 9f768741a..0eb2c0a81 100644 --- a/src/inspect_ai/_eval/task/run.py +++ b/src/inspect_ai/_eval/task/run.py @@ -64,6 +64,7 @@ from inspect_ai.solver._fork import set_task_generate from inspect_ai.solver._solver import Solver from inspect_ai.solver._task_state import set_sample_state, state_jsonable +from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec from 
inspect_ai.util._subtask import init_subtask from ..context import init_task_context @@ -92,7 +93,7 @@ class TaskRunOptions: task: Task model: Model - sandbox: tuple[str, str | None] | None + sandbox: SandboxEnvironmentSpec | None logger: TaskLogger eval_wd: str config: EvalConfig = field(default_factory=EvalConfig) @@ -343,7 +344,7 @@ async def task_run_sample( task_name: str, sample: Sample, state: TaskState, - sandbox: tuple[str, str | None] | None, + sandbox: SandboxEnvironmentSpec | None, sandbox_cleanup: bool, plan: Plan, scorers: list[Scorer] | None, diff --git a/src/inspect_ai/_eval/task/sandbox.py b/src/inspect_ai/_eval/task/sandbox.py index a0dcf871c..d15eaafb4 100644 --- a/src/inspect_ai/_eval/task/sandbox.py +++ b/src/inspect_ai/_eval/task/sandbox.py @@ -1,7 +1,7 @@ import asyncio import base64 import contextlib -from typing import AsyncGenerator +from typing import AsyncGenerator, NamedTuple from inspect_ai._eval.task.task import Task from inspect_ai._eval.task.util import task_run_dir @@ -12,13 +12,16 @@ cleanup_sandbox_environments_sample, init_sandbox_environments_sample, ) -from inspect_ai.util._sandbox.environment import SandboxEnvironment +from inspect_ai.util._sandbox.environment import ( + SandboxEnvironment, + SandboxEnvironmentSpec, +) @contextlib.asynccontextmanager async def sandboxenv_context( task_name: str, - sandbox: tuple[str, str | None] | None, + sandbox: SandboxEnvironmentSpec | None, cleanup: bool, sample: Sample, ) -> AsyncGenerator[None, None]: @@ -47,9 +50,9 @@ async def sandboxenv_context( try: # initialize sandbox environment, environments = await init_sandbox_environments_sample( - type=sandbox[0], + type=sandbox.type, task_name=task_name, - config=sandbox[1], + config=sandbox.config, files=files, setup=setup, metadata=sample.metadata if sample.metadata else {}, @@ -66,9 +69,9 @@ async def sandboxenv_context( # cleanup sandbox environment if environments and cleanup: await cleanup_sandbox_environments_sample( - type=sandbox[0], + type=sandbox.type, task_name=task_name, - config=sandbox[1], + config=sandbox.config, environments=environments, interrupted=interrupted, ) @@ -94,39 +97,40 @@ def read_sandboxenv_file(contents: str) -> bytes: return file_bytes +class TaskSandboxEnvironment(NamedTuple): + sandbox: SandboxEnvironmentSpec + run_dir: str + + def resolve_sandbox_for_task( task: Task, sample: Sample, -) -> tuple[str, str | None, str] | None: +) -> TaskSandboxEnvironment | None: sandbox = resolve_sandbox(task.sandbox, sample) if sandbox is not None: - return sandbox + (task_run_dir(task),) + return TaskSandboxEnvironment(sandbox, task_run_dir(task)) else: return None def resolve_sandbox( - sandbox: tuple[str, str | None] | None, + sandbox: SandboxEnvironmentSpec | None, sample: Sample, -) -> tuple[str, str | None] | None: +) -> SandboxEnvironmentSpec | None: # resolve sandbox (task type overrides sample type, but sample config # file overrides task config file if they have the same type) - sample_sandbox = ( - (sample.sandbox, None) if isinstance(sample.sandbox, str) else sample.sandbox - ) task_sandbox = sandbox if task_sandbox is not None: - sandbox_type = task_sandbox[0] if ( - sample_sandbox - and sample_sandbox[0] == sandbox_type - and isinstance(sample_sandbox[1], str) + sample.sandbox + and sample.sandbox.type == task_sandbox.type + and sample.sandbox.config is not None ): - sandbox_config: str | None = sample_sandbox[1] + sandbox_config: str | None = sample.sandbox.config else: - sandbox_config = task_sandbox[1] - return (sandbox_type, 
sandbox_config) - elif sample_sandbox is not None: - return sample_sandbox + sandbox_config = task_sandbox.config + return SandboxEnvironmentSpec(task_sandbox.type, sandbox_config) + elif sample.sandbox is not None: + return sample.sandbox else: return None diff --git a/src/inspect_ai/_eval/task/task.py b/src/inspect_ai/_eval/task/task.py index a295f90cf..9e337cb73 100644 --- a/src/inspect_ai/_eval/task/task.py +++ b/src/inspect_ai/_eval/task/task.py @@ -14,6 +14,11 @@ from inspect_ai.scorer._reducer import ScoreReducers, create_reducers from inspect_ai.solver import Plan, Solver, generate from inspect_ai.solver._chain import chain +from inspect_ai.util._sandbox.environment import ( + SandboxEnvironmentSpec, + SandboxEnvironmentType, + resolve_sandbox_environment, +) from .epochs import Epochs @@ -22,7 +27,7 @@ class TaskDeprecatedArgs(TypedDict, total=False): plan: Plan | Solver | list[Solver] - tool_environment: str | tuple[str, str] | None + tool_environment: str | SandboxEnvironmentSpec | None epochs_reducer: ScoreReducers | None max_messages: int | None @@ -40,8 +45,8 @@ class Task: metrics (list[Metric] | dict[str, list[Metric]] | None): Alternative metrics (overrides the metrics provided by the specified scorer). config (GenerateConfig): Model generation config. - sandbox (str | tuple[str,str] | None): Sandbox - environment type (or optionally a tuple with type and config file) + sandbox (SandboxEnvironmentType | None): Sandbox environment type + (or optionally a str or tuple with a shorthand spec) epochs (int | Epochs | None): Epochs to repeat samples for and optional score reducer function(s) used to combine sample scores (defaults to "mean") fail_on_error (bool | float | None): `True` to fail on first sample error @@ -66,7 +71,7 @@ def __init__( scorer: Scorer | list[Scorer] | None = None, metrics: list[Metric] | dict[str, list[Metric]] | None = None, config: GenerateConfig = GenerateConfig(), - sandbox: str | tuple[str, str] | None = None, + sandbox: SandboxEnvironmentType | None = None, epochs: int | Epochs | None = None, fail_on_error: bool | float | None = None, message_limit: int | None = None, @@ -81,7 +86,7 @@ def __init__( newarg = "" if arg == "tool_environment": newarg = "sandbox" - sandbox = cast(str | tuple[str, str] | None, value) + sandbox = cast(str | SandboxEnvironmentSpec | None, value) elif arg == "epochs_reducer": newarg = "epochs" if isinstance(epochs, int): @@ -122,7 +127,7 @@ def __init__( ) self.metrics = metrics self.config = config - self.sandbox = (sandbox, None) if isinstance(sandbox, str) else sandbox + self.sandbox = resolve_sandbox_environment(sandbox) self.epochs = epochs.epochs if epochs else None self.epochs_reducer = epochs.reducer if epochs else None self.fail_on_error = fail_on_error diff --git a/src/inspect_ai/_view/www/log-schema.json b/src/inspect_ai/_view/www/log-schema.json index b0907bc66..a21f02484 100644 --- a/src/inspect_ai/_view/www/log-schema.json +++ b/src/inspect_ai/_view/www/log-schema.json @@ -1218,31 +1218,13 @@ "sandbox": { "anyOf": [ { - "maxItems": 2, - "minItems": 2, - "prefixItems": [ - { - "type": "string" - }, - { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ] - } - ], - "type": "array" + "$ref": "#/$defs/SandboxEnvironmentSpec" }, { "type": "null" } ], - "default": null, - "title": "Sandbox" + "default": null }, "files": { "anyOf": [ @@ -1526,31 +1508,13 @@ "sandbox": { "anyOf": [ { - "maxItems": 2, - "minItems": 2, - "prefixItems": [ - { - "type": "string" - }, - { - "anyOf": [ - { - "type": 
"string" - }, - { - "type": "null" - } - ] - } - ], - "type": "array" + "$ref": "#/$defs/SandboxEnvironmentSpec" }, { "type": "null" } ], - "default": null, - "title": "Sandbox" + "default": null }, "model": { "title": "Model", @@ -2484,7 +2448,6 @@ "additionalProperties": false }, "Sample": { - "description": "Sample to be used in an evaluation task.\n\nArgs:\n input (str | list[ChatMessage]): The input to be submitted to the model.\n choices (list[str] | None): Optional. List of available answer choices\n (used only for multiple-choice evals).\n target (str | list[str]): Optional. Ideal target output. May be a literal value\n or narrative text to be used by a model grader.\n id (int | str | None): Optional. Unique identifier for sample.\n metadata (dict[str,Any] | None): Optional. Arbitrary metadata associated with the sample.\n sandbox (SandboxEnvironmentSpec | None): Optional. Sandbox environment\n type and optional config file.\n files (dict[str, str] | None): Optional. Files that go along with the sample (copied to\n SandboxEnvironment). Files can be paths, inline text, or inline binary (base64 encoded data URL).\n setup (str | None): Optional. Setup script to run for sample (run\n within default SandboxEnvironment).", "properties": { "input": { "anyOf": [ @@ -2573,34 +2536,13 @@ "sandbox": { "anyOf": [ { - "type": "string" - }, - { - "maxItems": 2, - "minItems": 2, - "prefixItems": [ - { - "type": "string" - }, - { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ] - } - ], - "type": "array" + "$ref": "#/$defs/SandboxEnvironmentSpec" }, { "type": "null" } ], - "default": null, - "title": "Sandbox" + "default": null }, "files": { "anyOf": [ @@ -2802,6 +2744,29 @@ "type": "object", "additionalProperties": false }, + "SandboxEnvironmentSpec": { + "maxItems": 2, + "minItems": 1, + "prefixItems": [ + { + "title": "Type", + "type": "string" + }, + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Config" + } + ], + "type": "array" + }, "Score": { "description": "Score generated by a scorer.\n\nArgs:\n value (Value): Score value.\n answer (str | None): Answer extracted from model output (optional).\n explanation (str | None): Explanation of score (optional).\n metadata (dict[str,Any]): Additional metadata related to the score.", "properties": { diff --git a/src/inspect_ai/_view/www/src/types/log.d.ts b/src/inspect_ai/_view/www/src/types/log.d.ts index 307797532..d8b06cf87 100644 --- a/src/inspect_ai/_view/www/src/types/log.d.ts +++ b/src/inspect_ai/_view/www/src/types/log.d.ts @@ -19,7 +19,11 @@ export type Name = string | null; export type Location = string | null; export type Samples = number | null; export type Shuffled = boolean | null; -export type Sandbox = [unknown, unknown] | null; +/** + * @minItems 1 + * @maxItems 2 + */ +export type SandboxEnvironmentSpec = [unknown] | [unknown, unknown]; export type Model = string; export type ModelBaseUrl = string | null; export type Limit = number | [unknown, unknown] | null; @@ -153,7 +157,6 @@ export type Type4 = export type Message1 = string; export type Choices = string[] | null; export type Target = string | string[]; -export type Sandbox1 = [unknown, unknown] | null; export type Files = string[] | null; export type Setup = string | null; export type Messages = ( @@ -208,7 +211,6 @@ export type Choices2 = string[] | null; export type Target1 = string | string[]; export type Id2 = number | string | null; export type Metadata7 = {} | null; -export type Sandbox2 = 
string | [unknown, unknown] | null; export type Files1 = { [k: string]: string; } | null; @@ -380,7 +382,7 @@ export interface EvalSpec { solver: Solver; solver_args: SolverArgs; dataset: EvalDataset; - sandbox: Sandbox; + sandbox: SandboxEnvironmentSpec | null; model: Model; model_base_url: ModelBaseUrl; model_args: ModelArgs; @@ -555,7 +557,7 @@ export interface EvalSample { input: Input; choices: Choices; target: Target; - sandbox: Sandbox1; + sandbox: SandboxEnvironmentSpec | null; files: Files; setup: Setup; messages: Messages; @@ -675,31 +677,13 @@ export interface SampleInitEvent { sample: Sample; state: JsonValue; } -/** - * Sample to be used in an evaluation task. - * - * Args: - * input (str | list[ChatMessage]): The input to be submitted to the model. - * choices (list[str] | None): Optional. List of available answer choices - * (used only for multiple-choice evals). - * target (str | list[str]): Optional. Ideal target output. May be a literal value - * or narrative text to be used by a model grader. - * id (int | str | None): Optional. Unique identifier for sample. - * metadata (dict[str,Any] | None): Optional. Arbitrary metadata associated with the sample. - * sandbox (SandboxEnvironmentSpec | None): Optional. Sandbox environment - * type and optional config file. - * files (dict[str, str] | None): Optional. Files that go along with the sample (copied to - * SandboxEnvironment). Files can be paths, inline text, or inline binary (base64 encoded data URL). - * setup (str | None): Optional. Setup script to run for sample (run - * within default SandboxEnvironment). - */ export interface Sample { input: Input1; choices: Choices2; target: Target1; id: Id2; metadata: Metadata7; - sandbox: Sandbox2; + sandbox: SandboxEnvironmentSpec | null; files: Files1; setup: Setup1; } diff --git a/src/inspect_ai/dataset/_dataset.py b/src/inspect_ai/dataset/_dataset.py index b1e543580..12808948a 100644 --- a/src/inspect_ai/dataset/_dataset.py +++ b/src/inspect_ai/dataset/_dataset.py @@ -14,30 +14,52 @@ from typing_extensions import override from inspect_ai.model import ChatMessage -from inspect_ai.util import SandboxEnvironmentSpec +from inspect_ai.util import SandboxEnvironmentSpec, SandboxEnvironmentType +from inspect_ai.util._sandbox.environment import resolve_sandbox_environment if TYPE_CHECKING: from _typeshed import SupportsRichComparison class Sample(BaseModel): - r"""Sample to be used in an evaluation task. + def __init__( + self, + input: str | list[ChatMessage], + choices: list[str] | None = None, + target: str | list[str] = "", + id: int | str | None = None, + metadata: dict[str, Any] | None = None, + sandbox: SandboxEnvironmentType | None = None, + files: dict[str, str] | None = None, + setup: str | None = None, + ) -> None: + r"""Sample to be used in an evaluation task. - Args: - input (str | list[ChatMessage]): The input to be submitted to the model. - choices (list[str] | None): Optional. List of available answer choices - (used only for multiple-choice evals). - target (str | list[str]): Optional. Ideal target output. May be a literal value - or narrative text to be used by a model grader. - id (int | str | None): Optional. Unique identifier for sample. - metadata (dict[str,Any] | None): Optional. Arbitrary metadata associated with the sample. - sandbox (SandboxEnvironmentSpec | None): Optional. Sandbox environment - type and optional config file. - files (dict[str, str] | None): Optional. Files that go along with the sample (copied to - SandboxEnvironment). 
Files can be paths, inline text, or inline binary (base64 encoded data URL). - setup (str | None): Optional. Setup script to run for sample (run - within default SandboxEnvironment). - """ + Args: + input (str | list[ChatMessage]): The input to be submitted to the model. + choices (list[str] | None): Optional. List of available answer choices + (used only for multiple-choice evals). + target (str | list[str]): Optional. Ideal target output. May be a literal value + or narrative text to be used by a model grader. + id (int | str | None): Optional. Unique identifier for sample. + metadata (dict[str,Any] | None): Optional. Arbitrary metadata associated with the sample. + sandbox (SandboxEnvironmentType | None): Sandbox environment type + (or optionally a str or tuple with a shorthand spec) + files (dict[str, str] | None): Optional. Files that go along with the sample (copied to + SandboxEnvironment). Files can be paths, inline text, or inline binary (base64 encoded data URL). + setup (str | None): Optional. Setup script to run for sample (run + within default SandboxEnvironment). + """ + super().__init__( + input=input, + choices=choices, + target=target, + id=id, + metadata=metadata, + sandbox=resolve_sandbox_environment(sandbox), + files=files, + setup=setup, + ) input: str | list[ChatMessage] """The input to be submitted to the model.""" diff --git a/src/inspect_ai/dataset/_sources/util.py b/src/inspect_ai/dataset/_sources/util.py index 4a02b345b..4085e3c0d 100644 --- a/src/inspect_ai/dataset/_sources/util.py +++ b/src/inspect_ai/dataset/_sources/util.py @@ -3,6 +3,7 @@ from inspect_ai._util.content import Content, ContentImage from inspect_ai._util.file import filesystem from inspect_ai.model._chat_message import ChatMessage, ChatMessageUser +from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec from .._dataset import Dataset @@ -33,8 +34,10 @@ def resolve_file(file: str) -> str: # for each sample for sample in dataset: # check for sandbox config file - if isinstance(sample.sandbox, tuple) and sample.sandbox[1] is not None: - sample.sandbox = (sample.sandbox[0], resolve_file(sample.sandbox[1])) + if sample.sandbox and sample.sandbox.config is not None: + sample.sandbox = SandboxEnvironmentSpec( + sample.sandbox.type, resolve_file(sample.sandbox.config) + ) # check for files if sample.files is not None: diff --git a/src/inspect_ai/dataset/_util.py b/src/inspect_ai/dataset/_util.py index e0b41d5c5..df4c59b6a 100644 --- a/src/inspect_ai/dataset/_util.py +++ b/src/inspect_ai/dataset/_util.py @@ -169,11 +169,11 @@ def read_sandbox(sandbox: Any | None) -> SandboxEnvironmentSpec | None: if sandbox.strip().startswith("["): sandbox = json.loads(sandbox) else: - return (sandbox, None) + return SandboxEnvironmentSpec(sandbox) if isinstance(sandbox, list): if len(sandbox) == 2: - return str(sandbox[0]), str(sandbox[1]) + return SandboxEnvironmentSpec(str(sandbox[0]), str(sandbox[1])) else: raise ValueError( f"Invalid 'sandbox' value: '{str(sandbox)}'. 
Sandbox must be string or 2-item list"
diff --git a/src/inspect_ai/log/_log.py b/src/inspect_ai/log/_log.py
index 066b54803..2eed96ccc 100644
--- a/src/inspect_ai/log/_log.py
+++ b/src/inspect_ai/log/_log.py
@@ -25,6 +25,7 @@
 )
 from inspect_ai.scorer import Score
 from inspect_ai.scorer._metric import SampleScore
+from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
 
 from ._transcript import EvalEvents
 
@@ -118,7 +119,7 @@ class EvalSample(BaseModel):
     target: str | list[str]
     """Sample target value(s)"""
 
-    sandbox: tuple[str, str | None] | None = Field(default=None)
+    sandbox: SandboxEnvironmentSpec | None = Field(default=None)
     """Sandbox environment type and optional config file."""
 
     files: list[str] | None = Field(default=None)
@@ -372,7 +373,7 @@ class EvalSpec(BaseModel):
     dataset: EvalDataset
     """Dataset used for eval."""
 
-    sandbox: tuple[str, str | None] | None = Field(default=None)
+    sandbox: SandboxEnvironmentSpec | None = Field(default=None)
     """Sandbox environment type and optional config file."""
 
     model: str
diff --git a/src/inspect_ai/util/__init__.py b/src/inspect_ai/util/__init__.py
index e942603be..5152836e0 100644
--- a/src/inspect_ai/util/__init__.py
+++ b/src/inspect_ai/util/__init__.py
@@ -5,6 +5,7 @@
     SandboxEnvironment,
     SandboxEnvironments,
     SandboxEnvironmentSpec,
+    SandboxEnvironmentType,
     sandbox,
     sandbox_with,
     sandboxenv,
@@ -26,6 +27,7 @@
     "SandboxEnvironment",
     "SandboxEnvironments",
     "SandboxEnvironmentSpec",
+    "SandboxEnvironmentType",
     "sandboxenv",
     "sandbox",
     "sandbox_with",
diff --git a/src/inspect_ai/util/_sandbox/__init__.py b/src/inspect_ai/util/_sandbox/__init__.py
index 94b2d6681..e0984be86 100644
--- a/src/inspect_ai/util/_sandbox/__init__.py
+++ b/src/inspect_ai/util/_sandbox/__init__.py
@@ -2,7 +2,12 @@
 
 from .context import sandbox, sandbox_with
 from .docker.docker import DockerSandboxEnvironment  # noqa: F401
-from .environment import SandboxEnvironment, SandboxEnvironments, SandboxEnvironmentSpec
+from .environment import (
+    SandboxEnvironment,
+    SandboxEnvironments,
+    SandboxEnvironmentSpec,
+    SandboxEnvironmentType,
+)
 from .local import LocalSandboxEnvironment  # noqa: F401
 from .registry import sandboxenv
 
@@ -10,6 +15,7 @@
     "SandboxEnvironment",
     "SandboxEnvironments",
     "SandboxEnvironmentSpec",
+    "SandboxEnvironmentType",
     "sandboxenv",
     "sandbox",
     "sandbox_with",
diff --git a/src/inspect_ai/util/_sandbox/environment.py b/src/inspect_ai/util/_sandbox/environment.py
index c5308254f..a35a5e37b 100644
--- a/src/inspect_ai/util/_sandbox/environment.py
+++ b/src/inspect_ai/util/_sandbox/environment.py
@@ -1,6 +1,6 @@
 import abc
 from dataclasses import dataclass, field
-from typing import Awaitable, Callable, Literal, Union, overload
+from typing import Awaitable, Callable, Literal, NamedTuple, Union, overload
 
 from .._subprocess import ExecResult
 
@@ -194,5 +194,30 @@ class SandboxEnvironments:
     """
 
 
-SandboxEnvironmentSpec = str | tuple[str, str | None]
-"""Specification of a SandboxEnvironment (type or tuple with type and config file)."""
+class SandboxEnvironmentSpec(NamedTuple):
+    """Specification of a SandboxEnvironment."""
+
+    type: str
+    config: str | None = None
+
+
+SandboxEnvironmentType = SandboxEnvironmentSpec | str | tuple[str, str]
+"""SandboxEnvironmentSpec and str and tuple shorthands for it.
+
+A plain str, e.g. "docker", is equivalent to SandboxEnvironmentSpec("docker")
+A tuple, e.g. ("docker", "compose.yaml"), is equivalent to SandboxEnvironmentSpec("docker", "compose.yaml")
+"""
+
+
+def resolve_sandbox_environment(
+    sandbox: SandboxEnvironmentType | None,
+) -> SandboxEnvironmentSpec | None:
+    # do the resolution
+    if isinstance(sandbox, str):
+        return SandboxEnvironmentSpec(type=sandbox)
+    elif isinstance(sandbox, SandboxEnvironmentSpec):
+        return sandbox
+    elif isinstance(sandbox, tuple):
+        return SandboxEnvironmentSpec(sandbox[0], sandbox[1])
+    else:
+        return None
diff --git a/tests/tools/test_web_browser.py b/tests/tools/test_web_browser.py
index 13dc42b3e..e72f0169c 100644
--- a/tests/tools/test_web_browser.py
+++ b/tests/tools/test_web_browser.py
@@ -10,7 +10,6 @@
 from inspect_ai.model import ModelOutput, get_model
 from inspect_ai.solver import generate, use_tools
 from inspect_ai.tool import web_browser
-from inspect_ai.util import SandboxEnvironmentSpec
 
 
 @skip_if_no_docker
@@ -146,7 +145,7 @@ def test_web_browser_input():
     assert type_call
 
 
-def web_browser_sandbox() -> SandboxEnvironmentSpec:
+def web_browser_sandbox() -> tuple[str, str]:
     return (
         "docker",
         (Path(__file__).parent / "test_web_browser_compose.yaml").as_posix(),