From 32ba1013afb827afeaf1fbb1498d6f521b820033 Mon Sep 17 00:00:00 2001
From: jjallaire
Date: Thu, 17 Oct 2024 17:15:47 +0100
Subject: [PATCH] Use NamedTuple for SandboxEnvironmentSpec (#703)

* Use a NamedTuple for SandboxEnvironmentSpec
* change order of sandbox type
* update json schema
* cwd_relative_path for sandbox config
* support Dockerfile as sandbox config
* Revert "support Dockerfile as sandbox config"

This reverts commit 6e617a6c6d451a95e520c5857babe35aac7c3ee2.

---------

Co-authored-by: jjallaire-aisi
Co-authored-by: Charles Teague
---
 src/inspect_ai/_cli/util.py                 |  8 +-
 src/inspect_ai/_eval/eval.py                | 16 ++--
 src/inspect_ai/_eval/evalset.py             |  8 +-
 src/inspect_ai/_eval/loader.py              | 35 ++++----
 src/inspect_ai/_eval/run.py                 | 12 +--
 src/inspect_ai/_eval/task/log.py            | 15 ++--
 src/inspect_ai/_eval/task/run.py            |  5 +-
 src/inspect_ai/_eval/task/sandbox.py        | 50 ++++++-----
 src/inspect_ai/_eval/task/task.py           | 17 ++--
 src/inspect_ai/_view/www/log-schema.json    | 93 +++++++-------------
 src/inspect_ai/_view/www/src/types/log.d.ts | 32 ++-----
 src/inspect_ai/dataset/_dataset.py          | 56 +++++++++----
 src/inspect_ai/dataset/_sources/util.py     |  7 +-
 src/inspect_ai/dataset/_util.py             |  4 +-
 src/inspect_ai/log/_log.py                  |  5 +-
 src/inspect_ai/util/__init__.py             |  2 +
 src/inspect_ai/util/_sandbox/__init__.py    |  8 +-
 src/inspect_ai/util/_sandbox/environment.py | 31 ++++++-
 tests/tools/test_web_browser.py             |  3 +-
 19 files changed, 214 insertions(+), 193 deletions(-)

diff --git a/src/inspect_ai/_cli/util.py b/src/inspect_ai/_cli/util.py
index e7f0bc08a..4beaad3f7 100644
--- a/src/inspect_ai/_cli/util.py
+++ b/src/inspect_ai/_cli/util.py
@@ -2,6 +2,8 @@
 
 import yaml
 
+from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
+
 
 def parse_cli_args(args: tuple[str] | list[str] | None) -> dict[str, Any]:
     params: dict[str, Any] = dict()
@@ -18,12 +20,12 @@ def parse_cli_args(args: tuple[str] | list[str] | None) -> dict[str, Any]:
     return params
 
 
-def parse_sandbox(sandbox: str | None) -> str | tuple[str, str] | None:
+def parse_sandbox(sandbox: str | None) -> SandboxEnvironmentSpec | None:
     if sandbox is not None:
         parts = sandbox.split(":", maxsplit=1)
         if len(parts) == 1:
-            return sandbox
+            return SandboxEnvironmentSpec(sandbox)
         else:
-            return (parts[0], parts[1])
+            return SandboxEnvironmentSpec(parts[0], parts[1])
     else:
         return None
diff --git a/src/inspect_ai/_eval/eval.py b/src/inspect_ai/_eval/eval.py
index 242bc4a12..372e82045 100644
--- a/src/inspect_ai/_eval/eval.py
+++ b/src/inspect_ai/_eval/eval.py
@@ -29,7 +29,7 @@
 from inspect_ai.scorer._reducer import reducer_log_names
 from inspect_ai.solver._chain import chain
 from inspect_ai.solver._solver import Solver, SolverSpec
-from inspect_ai.util import SandboxEnvironmentSpec
+from inspect_ai.util import SandboxEnvironmentType
 
 from .context import init_eval_context
 from .loader import ResolvedTask, resolve_tasks
@@ -45,7 +45,7 @@ def eval(
     model_base_url: str | None = None,
     model_args: dict[str, Any] = dict(),
     task_args: dict[str, Any] = dict(),
-    sandbox: SandboxEnvironmentSpec | None = None,
+    sandbox: SandboxEnvironmentType | None = None,
     sandbox_cleanup: bool | None = None,
     solver: Solver | list[Solver] | SolverSpec | None = None,
     trace: bool | None = None,
@@ -80,8 +80,8 @@ def eval(
             with the model API.
model_args (dict[str,Any]): Model creation parameters task_args (dict[str,Any]): Task arguments - sandbox (SandboxEnvironmentSpec | None): Sandbox - environment type (or optionally a tuple with type and config file) + sandbox (SandboxEnvironmentType | None): Sandbox environment type + (or optionally a str or tuple with a shorthand spec) sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes (defaults to True) solver (Solver | list[Solver] | SolverSpec | None): Alternative solver for task(s). @@ -166,7 +166,7 @@ async def eval_async( model_base_url: str | None = None, model_args: dict[str, Any] = dict(), task_args: dict[str, Any] = dict(), - sandbox: SandboxEnvironmentSpec | None = None, + sandbox: SandboxEnvironmentType | None = None, sandbox_cleanup: bool | None = None, solver: Solver | list[Solver] | SolverSpec | None = None, trace: bool | None = None, @@ -201,8 +201,8 @@ async def eval_async( with the model API. model_args (dict[str,Any]): Model creation parameters task_args (dict[str,Any]): Task arguments - sandbox (SandboxEnvironentSpec | None): Sandbox - environment type (or optionally a tuple with type and config file) + sandbox (SandboxEnvironmentType | None): Sandbox environment type + (or optionally a str or tuple with a shorthand spec) sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes (defaults to True) solver (Solver | list[Solver] | SolverSpec | None): Alternative solver for task(s). @@ -676,7 +676,7 @@ def eval_init( model_base_url: str | None = None, model_args: dict[str, Any] = dict(), task_args: dict[str, Any] = dict(), - sandbox: SandboxEnvironmentSpec | None = None, + sandbox: SandboxEnvironmentType | None = None, trace: bool | None = None, approval: str | list[ApprovalPolicy] | ApprovalPolicyConfig | None = None, max_subprocesses: int | None = None, diff --git a/src/inspect_ai/_eval/evalset.py b/src/inspect_ai/_eval/evalset.py index b4d3068cc..d3d168262 100644 --- a/src/inspect_ai/_eval/evalset.py +++ b/src/inspect_ai/_eval/evalset.py @@ -33,7 +33,7 @@ ) from inspect_ai.model._generate_config import GenerateConfig from inspect_ai.solver._solver import Solver, SolverSpec -from inspect_ai.util import SandboxEnvironmentSpec +from inspect_ai.util import SandboxEnvironmentType from .eval import eval, eval_init from .loader import ResolvedTask, resolve_task_args @@ -54,7 +54,7 @@ def eval_set( model_base_url: str | None = None, model_args: dict[str, Any] = dict(), task_args: dict[str, Any] = dict(), - sandbox: SandboxEnvironmentSpec | None = None, + sandbox: SandboxEnvironmentType | None = None, sandbox_cleanup: bool | None = None, solver: Solver | list[Solver] | SolverSpec | None = None, trace: bool | None = None, @@ -101,8 +101,8 @@ def eval_set( with the model API. 
model_args (dict[str,Any]): Model creation parameters task_args (dict[str,Any]): Task arguments - sandbox (SandboxEnvironmentSpec | None): Sandbox - environment type (or optionally a tuple with type and config file) + sandbox (SandboxEnvironmentType | None): Sandbox environment type + (or optionally a str or tuple with a shorthand spec) sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes (defaults to True) solver (Solver | list[Solver] | SolverSpec | None): Alternative solver(s) for diff --git a/src/inspect_ai/_eval/loader.py b/src/inspect_ai/_eval/loader.py index b7b2c64ac..2504d9be2 100644 --- a/src/inspect_ai/_eval/loader.py +++ b/src/inspect_ai/_eval/loader.py @@ -24,7 +24,8 @@ ) from inspect_ai.model import Model, ModelName from inspect_ai.solver._solver import Solver, SolverSpec -from inspect_ai.util import SandboxEnvironmentSpec +from inspect_ai.util import SandboxEnvironmentSpec, SandboxEnvironmentType +from inspect_ai.util._sandbox.environment import resolve_sandbox_environment from inspect_ai.util._sandbox.registry import registry_find_sandboxenv from .list import task_files @@ -42,7 +43,7 @@ class ResolvedTask: task_args: dict[str, Any] task_file: str | None model: Model - sandbox: tuple[str, str | None] | None + sandbox: SandboxEnvironmentSpec | None sequence: int id: str | None = field(default=None) sample_source: EvalSampleSource | None = field(default=None) @@ -61,7 +62,7 @@ def resolve_tasks( tasks: Tasks, task_args: dict[str, Any], model: Model, - sandbox: SandboxEnvironmentSpec | None, + sandbox: SandboxEnvironmentType | None, ) -> list[ResolvedTask]: def as_resolved_tasks(tasks: list[Task]) -> list[ResolvedTask]: return [ @@ -169,24 +170,18 @@ def resolve_task_args(task: Task) -> dict[str, Any]: def resolve_task_sandbox( - task: Task, sandbox: SandboxEnvironmentSpec | None -) -> tuple[str, str | None] | None: + task: Task, sandbox: SandboxEnvironmentType | None +) -> SandboxEnvironmentSpec | None: # do the resolution - resolved_sandbox = ( - (sandbox, None) - if isinstance(sandbox, str) - else sandbox - if sandbox is not None - else task.sandbox - ) + resolved_sandbox = resolve_sandbox_environment(sandbox) or task.sandbox # if we have a sandbox with no config, see if there are implcit # config files available for the provider if resolved_sandbox is not None: # look for default - if resolved_sandbox[1] is None: + if resolved_sandbox.config is None: # get config files for this type - sandboxenv_type = registry_find_sandboxenv(resolved_sandbox[0]) + sandboxenv_type = registry_find_sandboxenv(resolved_sandbox.type) config_files_fn = cast( Callable[..., list[str]], getattr(sandboxenv_type, "config_files") ) @@ -197,15 +192,19 @@ def resolve_task_sandbox( for config_file in config_files: config_file_path = os.path.join(src_dir, config_file) if os.path.isfile(config_file_path): - resolved_sandbox = (resolved_sandbox[0], config_file) + resolved_sandbox = SandboxEnvironmentSpec( + resolved_sandbox.type, config_file + ) break # resolve relative paths - if resolved_sandbox[1] is not None: - file_path = Path(resolved_sandbox[1]) + if resolved_sandbox.config is not None: + file_path = Path(resolved_sandbox.config) if not file_path.is_absolute(): file_path = Path(task_run_dir(task)) / file_path - resolved_sandbox = (resolved_sandbox[0], file_path.as_posix()) + resolved_sandbox = SandboxEnvironmentSpec( + resolved_sandbox.type, file_path.as_posix() + ) # return resolved sandbox return resolved_sandbox diff --git a/src/inspect_ai/_eval/run.py 
b/src/inspect_ai/_eval/run.py index fbad6fa93..1ad604347 100644 --- a/src/inspect_ai/_eval/run.py +++ b/src/inspect_ai/_eval/run.py @@ -27,7 +27,7 @@ from .task.log import TaskLogger from .task.run import TaskRunOptions, create_sample_semaphore, task_run from .task.rundir import task_run_dir_switching -from .task.sandbox import resolve_sandbox_for_task +from .task.sandbox import TaskSandboxEnvironment, resolve_sandbox_for_task from .task.util import task_run_dir log = logging.getLogger(__name__) @@ -310,7 +310,7 @@ async def startup_sandbox_environments( tasks: list[ResolvedTask], cleanup: bool ) -> Callable[[], Awaitable[None]]: # find unique sandboxenvs - sandboxenvs: Set[tuple[str, str | None, str]] = set() + sandboxenvs: Set[TaskSandboxEnvironment] = set() for task in tasks: # resolve each sample and add to sandboxenvs for sample in task.task.dataset: @@ -322,16 +322,16 @@ async def startup_sandbox_environments( cleanups: list[tuple[TaskCleanup, str | None, str]] = [] for sandboxenv in sandboxenvs: # find type - sandboxenv_type = registry_find_sandboxenv(sandboxenv[0]) + sandboxenv_type = registry_find_sandboxenv(sandboxenv.sandbox.type) # run startup task_init = cast(TaskInit, getattr(sandboxenv_type, "task_init")) - with chdir(sandboxenv[2]): - await task_init("startup", sandboxenv[1]) + with chdir(sandboxenv.run_dir): + await task_init("startup", sandboxenv.sandbox.config) # append cleanup method task_cleanup = cast(TaskCleanup, getattr(sandboxenv_type, "task_cleanup")) - cleanups.append((task_cleanup, sandboxenv[1], sandboxenv[2])) + cleanups.append((task_cleanup, sandboxenv.sandbox.config, sandboxenv.run_dir)) # return shutdown method async def shutdown() -> None: diff --git a/src/inspect_ai/_eval/task/log.py b/src/inspect_ai/_eval/task/log.py index 06d677a5d..2113f03e2 100644 --- a/src/inspect_ai/_eval/task/log.py +++ b/src/inspect_ai/_eval/task/log.py @@ -37,6 +37,7 @@ from inspect_ai.scorer._metric import SampleScore from inspect_ai.solver import Plan, Solver, TaskState from inspect_ai.solver._solver import SolverSpec +from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec class TaskLogger: @@ -50,7 +51,7 @@ def __init__( solver: SolverSpec | None, model: Model, dataset: Dataset, - sandbox: tuple[str, str | None] | None, + sandbox: SandboxEnvironmentSpec | None, task_attribs: dict[str, Any], task_args: dict[str, Any], model_args: dict[str, Any], @@ -72,6 +73,12 @@ def __init__( if "api_key" in model_args: del model_args["api_key"] + # cwd_relative_path for sandbox config + if sandbox and sandbox.config: + sandbox = SandboxEnvironmentSpec( + sandbox.type, cwd_relative_path(sandbox.config) + ) + # create eval spec self.eval = EvalSpec( run_id=run_id, @@ -155,11 +162,7 @@ def log_sample( choices=sample.choices, target=sample.target, metadata=state.metadata if state.metadata else {}, - sandbox=( - (sample.sandbox, None) - if isinstance(sample.sandbox, str) - else sample.sandbox - ), + sandbox=sample.sandbox, files=list(sample.files.keys()) if sample.files else None, setup=sample.setup, messages=state.messages, diff --git a/src/inspect_ai/_eval/task/run.py b/src/inspect_ai/_eval/task/run.py index 9f768741a..0eb2c0a81 100644 --- a/src/inspect_ai/_eval/task/run.py +++ b/src/inspect_ai/_eval/task/run.py @@ -64,6 +64,7 @@ from inspect_ai.solver._fork import set_task_generate from inspect_ai.solver._solver import Solver from inspect_ai.solver._task_state import set_sample_state, state_jsonable +from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec from 
inspect_ai.util._subtask import init_subtask from ..context import init_task_context @@ -92,7 +93,7 @@ class TaskRunOptions: task: Task model: Model - sandbox: tuple[str, str | None] | None + sandbox: SandboxEnvironmentSpec | None logger: TaskLogger eval_wd: str config: EvalConfig = field(default_factory=EvalConfig) @@ -343,7 +344,7 @@ async def task_run_sample( task_name: str, sample: Sample, state: TaskState, - sandbox: tuple[str, str | None] | None, + sandbox: SandboxEnvironmentSpec | None, sandbox_cleanup: bool, plan: Plan, scorers: list[Scorer] | None, diff --git a/src/inspect_ai/_eval/task/sandbox.py b/src/inspect_ai/_eval/task/sandbox.py index a0dcf871c..d15eaafb4 100644 --- a/src/inspect_ai/_eval/task/sandbox.py +++ b/src/inspect_ai/_eval/task/sandbox.py @@ -1,7 +1,7 @@ import asyncio import base64 import contextlib -from typing import AsyncGenerator +from typing import AsyncGenerator, NamedTuple from inspect_ai._eval.task.task import Task from inspect_ai._eval.task.util import task_run_dir @@ -12,13 +12,16 @@ cleanup_sandbox_environments_sample, init_sandbox_environments_sample, ) -from inspect_ai.util._sandbox.environment import SandboxEnvironment +from inspect_ai.util._sandbox.environment import ( + SandboxEnvironment, + SandboxEnvironmentSpec, +) @contextlib.asynccontextmanager async def sandboxenv_context( task_name: str, - sandbox: tuple[str, str | None] | None, + sandbox: SandboxEnvironmentSpec | None, cleanup: bool, sample: Sample, ) -> AsyncGenerator[None, None]: @@ -47,9 +50,9 @@ async def sandboxenv_context( try: # initialize sandbox environment, environments = await init_sandbox_environments_sample( - type=sandbox[0], + type=sandbox.type, task_name=task_name, - config=sandbox[1], + config=sandbox.config, files=files, setup=setup, metadata=sample.metadata if sample.metadata else {}, @@ -66,9 +69,9 @@ async def sandboxenv_context( # cleanup sandbox environment if environments and cleanup: await cleanup_sandbox_environments_sample( - type=sandbox[0], + type=sandbox.type, task_name=task_name, - config=sandbox[1], + config=sandbox.config, environments=environments, interrupted=interrupted, ) @@ -94,39 +97,40 @@ def read_sandboxenv_file(contents: str) -> bytes: return file_bytes +class TaskSandboxEnvironment(NamedTuple): + sandbox: SandboxEnvironmentSpec + run_dir: str + + def resolve_sandbox_for_task( task: Task, sample: Sample, -) -> tuple[str, str | None, str] | None: +) -> TaskSandboxEnvironment | None: sandbox = resolve_sandbox(task.sandbox, sample) if sandbox is not None: - return sandbox + (task_run_dir(task),) + return TaskSandboxEnvironment(sandbox, task_run_dir(task)) else: return None def resolve_sandbox( - sandbox: tuple[str, str | None] | None, + sandbox: SandboxEnvironmentSpec | None, sample: Sample, -) -> tuple[str, str | None] | None: +) -> SandboxEnvironmentSpec | None: # resolve sandbox (task type overrides sample type, but sample config # file overrides task config file if they have the same type) - sample_sandbox = ( - (sample.sandbox, None) if isinstance(sample.sandbox, str) else sample.sandbox - ) task_sandbox = sandbox if task_sandbox is not None: - sandbox_type = task_sandbox[0] if ( - sample_sandbox - and sample_sandbox[0] == sandbox_type - and isinstance(sample_sandbox[1], str) + sample.sandbox + and sample.sandbox.type == task_sandbox.type + and sample.sandbox.config is not None ): - sandbox_config: str | None = sample_sandbox[1] + sandbox_config: str | None = sample.sandbox.config else: - sandbox_config = task_sandbox[1] - return (sandbox_type, 
sandbox_config) - elif sample_sandbox is not None: - return sample_sandbox + sandbox_config = task_sandbox.config + return SandboxEnvironmentSpec(task_sandbox.type, sandbox_config) + elif sample.sandbox is not None: + return sample.sandbox else: return None diff --git a/src/inspect_ai/_eval/task/task.py b/src/inspect_ai/_eval/task/task.py index a295f90cf..9e337cb73 100644 --- a/src/inspect_ai/_eval/task/task.py +++ b/src/inspect_ai/_eval/task/task.py @@ -14,6 +14,11 @@ from inspect_ai.scorer._reducer import ScoreReducers, create_reducers from inspect_ai.solver import Plan, Solver, generate from inspect_ai.solver._chain import chain +from inspect_ai.util._sandbox.environment import ( + SandboxEnvironmentSpec, + SandboxEnvironmentType, + resolve_sandbox_environment, +) from .epochs import Epochs @@ -22,7 +27,7 @@ class TaskDeprecatedArgs(TypedDict, total=False): plan: Plan | Solver | list[Solver] - tool_environment: str | tuple[str, str] | None + tool_environment: str | SandboxEnvironmentSpec | None epochs_reducer: ScoreReducers | None max_messages: int | None @@ -40,8 +45,8 @@ class Task: metrics (list[Metric] | dict[str, list[Metric]] | None): Alternative metrics (overrides the metrics provided by the specified scorer). config (GenerateConfig): Model generation config. - sandbox (str | tuple[str,str] | None): Sandbox - environment type (or optionally a tuple with type and config file) + sandbox (SandboxEnvironmentType | None): Sandbox environment type + (or optionally a str or tuple with a shorthand spec) epochs (int | Epochs | None): Epochs to repeat samples for and optional score reducer function(s) used to combine sample scores (defaults to "mean") fail_on_error (bool | float | None): `True` to fail on first sample error @@ -66,7 +71,7 @@ def __init__( scorer: Scorer | list[Scorer] | None = None, metrics: list[Metric] | dict[str, list[Metric]] | None = None, config: GenerateConfig = GenerateConfig(), - sandbox: str | tuple[str, str] | None = None, + sandbox: SandboxEnvironmentType | None = None, epochs: int | Epochs | None = None, fail_on_error: bool | float | None = None, message_limit: int | None = None, @@ -81,7 +86,7 @@ def __init__( newarg = "" if arg == "tool_environment": newarg = "sandbox" - sandbox = cast(str | tuple[str, str] | None, value) + sandbox = cast(str | SandboxEnvironmentSpec | None, value) elif arg == "epochs_reducer": newarg = "epochs" if isinstance(epochs, int): @@ -122,7 +127,7 @@ def __init__( ) self.metrics = metrics self.config = config - self.sandbox = (sandbox, None) if isinstance(sandbox, str) else sandbox + self.sandbox = resolve_sandbox_environment(sandbox) self.epochs = epochs.epochs if epochs else None self.epochs_reducer = epochs.reducer if epochs else None self.fail_on_error = fail_on_error diff --git a/src/inspect_ai/_view/www/log-schema.json b/src/inspect_ai/_view/www/log-schema.json index b0907bc66..a21f02484 100644 --- a/src/inspect_ai/_view/www/log-schema.json +++ b/src/inspect_ai/_view/www/log-schema.json @@ -1218,31 +1218,13 @@ "sandbox": { "anyOf": [ { - "maxItems": 2, - "minItems": 2, - "prefixItems": [ - { - "type": "string" - }, - { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ] - } - ], - "type": "array" + "$ref": "#/$defs/SandboxEnvironmentSpec" }, { "type": "null" } ], - "default": null, - "title": "Sandbox" + "default": null }, "files": { "anyOf": [ @@ -1526,31 +1508,13 @@ "sandbox": { "anyOf": [ { - "maxItems": 2, - "minItems": 2, - "prefixItems": [ - { - "type": "string" - }, - { - "anyOf": [ - { - "type": 
"string" - }, - { - "type": "null" - } - ] - } - ], - "type": "array" + "$ref": "#/$defs/SandboxEnvironmentSpec" }, { "type": "null" } ], - "default": null, - "title": "Sandbox" + "default": null }, "model": { "title": "Model", @@ -2484,7 +2448,6 @@ "additionalProperties": false }, "Sample": { - "description": "Sample to be used in an evaluation task.\n\nArgs:\n input (str | list[ChatMessage]): The input to be submitted to the model.\n choices (list[str] | None): Optional. List of available answer choices\n (used only for multiple-choice evals).\n target (str | list[str]): Optional. Ideal target output. May be a literal value\n or narrative text to be used by a model grader.\n id (int | str | None): Optional. Unique identifier for sample.\n metadata (dict[str,Any] | None): Optional. Arbitrary metadata associated with the sample.\n sandbox (SandboxEnvironmentSpec | None): Optional. Sandbox environment\n type and optional config file.\n files (dict[str, str] | None): Optional. Files that go along with the sample (copied to\n SandboxEnvironment). Files can be paths, inline text, or inline binary (base64 encoded data URL).\n setup (str | None): Optional. Setup script to run for sample (run\n within default SandboxEnvironment).", "properties": { "input": { "anyOf": [ @@ -2573,34 +2536,13 @@ "sandbox": { "anyOf": [ { - "type": "string" - }, - { - "maxItems": 2, - "minItems": 2, - "prefixItems": [ - { - "type": "string" - }, - { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ] - } - ], - "type": "array" + "$ref": "#/$defs/SandboxEnvironmentSpec" }, { "type": "null" } ], - "default": null, - "title": "Sandbox" + "default": null }, "files": { "anyOf": [ @@ -2802,6 +2744,29 @@ "type": "object", "additionalProperties": false }, + "SandboxEnvironmentSpec": { + "maxItems": 2, + "minItems": 1, + "prefixItems": [ + { + "title": "Type", + "type": "string" + }, + { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Config" + } + ], + "type": "array" + }, "Score": { "description": "Score generated by a scorer.\n\nArgs:\n value (Value): Score value.\n answer (str | None): Answer extracted from model output (optional).\n explanation (str | None): Explanation of score (optional).\n metadata (dict[str,Any]): Additional metadata related to the score.", "properties": { diff --git a/src/inspect_ai/_view/www/src/types/log.d.ts b/src/inspect_ai/_view/www/src/types/log.d.ts index 307797532..d8b06cf87 100644 --- a/src/inspect_ai/_view/www/src/types/log.d.ts +++ b/src/inspect_ai/_view/www/src/types/log.d.ts @@ -19,7 +19,11 @@ export type Name = string | null; export type Location = string | null; export type Samples = number | null; export type Shuffled = boolean | null; -export type Sandbox = [unknown, unknown] | null; +/** + * @minItems 1 + * @maxItems 2 + */ +export type SandboxEnvironmentSpec = [unknown] | [unknown, unknown]; export type Model = string; export type ModelBaseUrl = string | null; export type Limit = number | [unknown, unknown] | null; @@ -153,7 +157,6 @@ export type Type4 = export type Message1 = string; export type Choices = string[] | null; export type Target = string | string[]; -export type Sandbox1 = [unknown, unknown] | null; export type Files = string[] | null; export type Setup = string | null; export type Messages = ( @@ -208,7 +211,6 @@ export type Choices2 = string[] | null; export type Target1 = string | string[]; export type Id2 = number | string | null; export type Metadata7 = {} | null; -export type Sandbox2 = 
string | [unknown, unknown] | null; export type Files1 = { [k: string]: string; } | null; @@ -380,7 +382,7 @@ export interface EvalSpec { solver: Solver; solver_args: SolverArgs; dataset: EvalDataset; - sandbox: Sandbox; + sandbox: SandboxEnvironmentSpec | null; model: Model; model_base_url: ModelBaseUrl; model_args: ModelArgs; @@ -555,7 +557,7 @@ export interface EvalSample { input: Input; choices: Choices; target: Target; - sandbox: Sandbox1; + sandbox: SandboxEnvironmentSpec | null; files: Files; setup: Setup; messages: Messages; @@ -675,31 +677,13 @@ export interface SampleInitEvent { sample: Sample; state: JsonValue; } -/** - * Sample to be used in an evaluation task. - * - * Args: - * input (str | list[ChatMessage]): The input to be submitted to the model. - * choices (list[str] | None): Optional. List of available answer choices - * (used only for multiple-choice evals). - * target (str | list[str]): Optional. Ideal target output. May be a literal value - * or narrative text to be used by a model grader. - * id (int | str | None): Optional. Unique identifier for sample. - * metadata (dict[str,Any] | None): Optional. Arbitrary metadata associated with the sample. - * sandbox (SandboxEnvironmentSpec | None): Optional. Sandbox environment - * type and optional config file. - * files (dict[str, str] | None): Optional. Files that go along with the sample (copied to - * SandboxEnvironment). Files can be paths, inline text, or inline binary (base64 encoded data URL). - * setup (str | None): Optional. Setup script to run for sample (run - * within default SandboxEnvironment). - */ export interface Sample { input: Input1; choices: Choices2; target: Target1; id: Id2; metadata: Metadata7; - sandbox: Sandbox2; + sandbox: SandboxEnvironmentSpec | null; files: Files1; setup: Setup1; } diff --git a/src/inspect_ai/dataset/_dataset.py b/src/inspect_ai/dataset/_dataset.py index b1e543580..12808948a 100644 --- a/src/inspect_ai/dataset/_dataset.py +++ b/src/inspect_ai/dataset/_dataset.py @@ -14,30 +14,52 @@ from typing_extensions import override from inspect_ai.model import ChatMessage -from inspect_ai.util import SandboxEnvironmentSpec +from inspect_ai.util import SandboxEnvironmentSpec, SandboxEnvironmentType +from inspect_ai.util._sandbox.environment import resolve_sandbox_environment if TYPE_CHECKING: from _typeshed import SupportsRichComparison class Sample(BaseModel): - r"""Sample to be used in an evaluation task. + def __init__( + self, + input: str | list[ChatMessage], + choices: list[str] | None = None, + target: str | list[str] = "", + id: int | str | None = None, + metadata: dict[str, Any] | None = None, + sandbox: SandboxEnvironmentType | None = None, + files: dict[str, str] | None = None, + setup: str | None = None, + ) -> None: + r"""Sample to be used in an evaluation task. - Args: - input (str | list[ChatMessage]): The input to be submitted to the model. - choices (list[str] | None): Optional. List of available answer choices - (used only for multiple-choice evals). - target (str | list[str]): Optional. Ideal target output. May be a literal value - or narrative text to be used by a model grader. - id (int | str | None): Optional. Unique identifier for sample. - metadata (dict[str,Any] | None): Optional. Arbitrary metadata associated with the sample. - sandbox (SandboxEnvironmentSpec | None): Optional. Sandbox environment - type and optional config file. - files (dict[str, str] | None): Optional. Files that go along with the sample (copied to - SandboxEnvironment). 
Files can be paths, inline text, or inline binary (base64 encoded data URL). - setup (str | None): Optional. Setup script to run for sample (run - within default SandboxEnvironment). - """ + Args: + input (str | list[ChatMessage]): The input to be submitted to the model. + choices (list[str] | None): Optional. List of available answer choices + (used only for multiple-choice evals). + target (str | list[str]): Optional. Ideal target output. May be a literal value + or narrative text to be used by a model grader. + id (int | str | None): Optional. Unique identifier for sample. + metadata (dict[str,Any] | None): Optional. Arbitrary metadata associated with the sample. + sandbox (SandboxEnvironmentType | None): Sandbox environment type + (or optionally a str or tuple with a shorthand spec) + files (dict[str, str] | None): Optional. Files that go along with the sample (copied to + SandboxEnvironment). Files can be paths, inline text, or inline binary (base64 encoded data URL). + setup (str | None): Optional. Setup script to run for sample (run + within default SandboxEnvironment). + """ + super().__init__( + input=input, + choices=choices, + target=target, + id=id, + metadata=metadata, + sandbox=resolve_sandbox_environment(sandbox), + files=files, + setup=setup, + ) input: str | list[ChatMessage] """The input to be submitted to the model.""" diff --git a/src/inspect_ai/dataset/_sources/util.py b/src/inspect_ai/dataset/_sources/util.py index 4a02b345b..4085e3c0d 100644 --- a/src/inspect_ai/dataset/_sources/util.py +++ b/src/inspect_ai/dataset/_sources/util.py @@ -3,6 +3,7 @@ from inspect_ai._util.content import Content, ContentImage from inspect_ai._util.file import filesystem from inspect_ai.model._chat_message import ChatMessage, ChatMessageUser +from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec from .._dataset import Dataset @@ -33,8 +34,10 @@ def resolve_file(file: str) -> str: # for each sample for sample in dataset: # check for sandbox config file - if isinstance(sample.sandbox, tuple) and sample.sandbox[1] is not None: - sample.sandbox = (sample.sandbox[0], resolve_file(sample.sandbox[1])) + if sample.sandbox and sample.sandbox.config is not None: + sample.sandbox = SandboxEnvironmentSpec( + sample.sandbox.type, resolve_file(sample.sandbox.config) + ) # check for files if sample.files is not None: diff --git a/src/inspect_ai/dataset/_util.py b/src/inspect_ai/dataset/_util.py index e0b41d5c5..df4c59b6a 100644 --- a/src/inspect_ai/dataset/_util.py +++ b/src/inspect_ai/dataset/_util.py @@ -169,11 +169,11 @@ def read_sandbox(sandbox: Any | None) -> SandboxEnvironmentSpec | None: if sandbox.strip().startswith("["): sandbox = json.loads(sandbox) else: - return (sandbox, None) + return SandboxEnvironmentSpec(sandbox) if isinstance(sandbox, list): if len(sandbox) == 2: - return str(sandbox[0]), str(sandbox[1]) + return SandboxEnvironmentSpec(str(sandbox[0]), str(sandbox[1])) else: raise ValueError( f"Invalid 'sandbox' value: '{str(sandbox)}'. 
Sandbox must be string or 2-item list"
diff --git a/src/inspect_ai/log/_log.py b/src/inspect_ai/log/_log.py
index 066b54803..2eed96ccc 100644
--- a/src/inspect_ai/log/_log.py
+++ b/src/inspect_ai/log/_log.py
@@ -25,6 +25,7 @@
 )
 from inspect_ai.scorer import Score
 from inspect_ai.scorer._metric import SampleScore
+from inspect_ai.util._sandbox.environment import SandboxEnvironmentSpec
 
 from ._transcript import EvalEvents
 
@@ -118,7 +119,7 @@ class EvalSample(BaseModel):
     target: str | list[str]
     """Sample target value(s)"""
 
-    sandbox: tuple[str, str | None] | None = Field(default=None)
+    sandbox: SandboxEnvironmentSpec | None = Field(default=None)
     """Sandbox environment type and optional config file."""
 
     files: list[str] | None = Field(default=None)
@@ -372,7 +373,7 @@ class EvalSpec(BaseModel):
     dataset: EvalDataset
     """Dataset used for eval."""
 
-    sandbox: tuple[str, str | None] | None = Field(default=None)
+    sandbox: SandboxEnvironmentSpec | None = Field(default=None)
     """Sandbox environment type and optional config file."""
 
     model: str
diff --git a/src/inspect_ai/util/__init__.py b/src/inspect_ai/util/__init__.py
index e942603be..5152836e0 100644
--- a/src/inspect_ai/util/__init__.py
+++ b/src/inspect_ai/util/__init__.py
@@ -5,6 +5,7 @@
     SandboxEnvironment,
     SandboxEnvironments,
     SandboxEnvironmentSpec,
+    SandboxEnvironmentType,
     sandbox,
     sandbox_with,
     sandboxenv,
@@ -26,6 +27,7 @@
     "SandboxEnvironment",
     "SandboxEnvironments",
     "SandboxEnvironmentSpec",
+    "SandboxEnvironmentType",
     "sandboxenv",
     "sandbox",
     "sandbox_with",
diff --git a/src/inspect_ai/util/_sandbox/__init__.py b/src/inspect_ai/util/_sandbox/__init__.py
index 94b2d6681..e0984be86 100644
--- a/src/inspect_ai/util/_sandbox/__init__.py
+++ b/src/inspect_ai/util/_sandbox/__init__.py
@@ -2,7 +2,12 @@
 
 from .context import sandbox, sandbox_with
 from .docker.docker import DockerSandboxEnvironment  # noqa: F401
-from .environment import SandboxEnvironment, SandboxEnvironments, SandboxEnvironmentSpec
+from .environment import (
+    SandboxEnvironment,
+    SandboxEnvironments,
+    SandboxEnvironmentSpec,
+    SandboxEnvironmentType,
+)
 from .local import LocalSandboxEnvironment  # noqa: F401
 from .registry import sandboxenv
 
@@ -10,6 +15,7 @@
     "SandboxEnvironment",
     "SandboxEnvironments",
     "SandboxEnvironmentSpec",
+    "SandboxEnvironmentType",
     "sandboxenv",
     "sandbox",
     "sandbox_with",
diff --git a/src/inspect_ai/util/_sandbox/environment.py b/src/inspect_ai/util/_sandbox/environment.py
index c5308254f..a35a5e37b 100644
--- a/src/inspect_ai/util/_sandbox/environment.py
+++ b/src/inspect_ai/util/_sandbox/environment.py
@@ -1,6 +1,6 @@
 import abc
 from dataclasses import dataclass, field
-from typing import Awaitable, Callable, Literal, Union, overload
+from typing import Awaitable, Callable, Literal, NamedTuple, Union, overload
 
 from .._subprocess import ExecResult
 
@@ -194,5 +194,30 @@ class SandboxEnvironments:
     """
 
 
-SandboxEnvironmentSpec = str | tuple[str, str | None]
-"""Specification of a SandboxEnvironment (type or tuple with type and config file)."""
+class SandboxEnvironmentSpec(NamedTuple):
+    """Specification of a SandboxEnvironment."""
+
+    type: str
+    config: str | None = None
+
+
+SandboxEnvironmentType = SandboxEnvironmentSpec | str | tuple[str, str]
+"""SandboxEnvironmentSpec and str and tuple shorthands for it.
+
+A plain str, e.g. "docker", is equivalent to SandboxEnvironmentSpec("docker")
+A tuple, e.g. ("docker", "compose.yaml"), is equivalent to SandboxEnvironmentSpec("docker", "compose.yaml")
+"""
+
+
+def resolve_sandbox_environment(
+    sandbox: SandboxEnvironmentType | None,
+) -> SandboxEnvironmentSpec | None:
+    # do the resolution
+    if isinstance(sandbox, str):
+        return SandboxEnvironmentSpec(type=sandbox)
+    elif isinstance(sandbox, SandboxEnvironmentSpec):
+        return sandbox
+    elif isinstance(sandbox, tuple):
+        return SandboxEnvironmentSpec(sandbox[0], sandbox[1])
+    else:
+        return None
diff --git a/tests/tools/test_web_browser.py b/tests/tools/test_web_browser.py
index 13dc42b3e..e72f0169c 100644
--- a/tests/tools/test_web_browser.py
+++ b/tests/tools/test_web_browser.py
@@ -10,7 +10,6 @@
 from inspect_ai.model import ModelOutput, get_model
 from inspect_ai.solver import generate, use_tools
 from inspect_ai.tool import web_browser
-from inspect_ai.util import SandboxEnvironmentSpec
 
 
 @skip_if_no_docker
@@ -146,7 +145,7 @@ def test_web_browser_input():
     assert type_call
 
 
-def web_browser_sandbox() -> SandboxEnvironmentSpec:
+def web_browser_sandbox() -> tuple[str, str]:
     return (
         "docker",
         (Path(__file__).parent / "test_web_browser_compose.yaml").as_posix(),