diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 88bb03b1a..3ff60222f 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -1,11 +1,16 @@ version: 2 updates: -- package-ecosystem: pip - directory: "/" - schedule: - interval: daily - time: "13:00" - groups: - python-packages: - patterns: - - "*" + - package-ecosystem: pip + directory: "/" + schedule: + interval: daily + time: "13:00" + groups: + python-packages: + patterns: + - "*" + ignore: + - dependency-name: "quarto-cli" + versions: [">=1.6.0"] + - dependency-name: "ruff" + versions: [">=0.9.5"] diff --git a/CHANGELOG.md b/CHANGELOG.md index fe7c7e88e..765440aa9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,12 +1,22 @@ -# Changelog ## Unreleased +- [Reference documentation](https://inspect.ai-safety-institute.org.uk/reference/) for Python API and CLI commands. - Add support for [clustered standard errors](https://inspect.ai-safety-institute.org.uk/scorers.html#clustered-standard-errors) via a new `cluster` parameter for the `stderr()` metric. - Metrics now take `list[SampleScore]` rather than `list[Score]` (previous signature is deprecated but still works with a warning). - Use a sample adjustment for the `var()` metric. +- OpenAI: Native tool calling for o1-mini (upon initial release it required emulated tool calling like o1-preview). +- Google: Speculative fix for completion candidates not being returned as a list. +- Python and Bash tools: Add `sandbox` argument for running in non-default sandboxes. +- Transcript: Log `ScoreEvent` (with `intermediate=True`) when the `score()` function is called. +- Transcript: Add `source` field to `InfoEvent` and use it for events logged by the human agent. - Docker: Support Dockerfiles with `.Dockerfile` extension. +- Docker: Raise error when there is an explicitly configured `container_name` (incompatible with epochs > 1). +- Docker: Dynamically set `compose up` timeout when there are `healthcheck` entries for services. +- Log: Validate that `log_dir` is writeable at startup. +- Log: Write eval config defaults into log file (rather than `None`). - Bugfix: Always honor level-level-transcript setting for transcript logging. +- Bugfix: Fix some dynamic layout issues for sample sandbox view. 
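The changelog additions above introduce two API surface changes: a `cluster` parameter for the `stderr()` metric and a `sandbox` argument for the Python and Bash tools. The sketch below is illustrative only (not code from this PR): the dataset path, the metadata key `task_family`, and the sandbox name `victim` are assumptions, and it assumes `metrics` can be supplied at the task level to override the scorer's defaults.

```python
# Illustrative sketch of the new `cluster` and `sandbox` arguments described
# in the changelog above; names marked as assumed are not from this PR.
from inspect_ai import Task, task
from inspect_ai.dataset import json_dataset
from inspect_ai.scorer import accuracy, includes, stderr
from inspect_ai.solver import generate, use_tools
from inspect_ai.tool import bash, python


@task
def ctf_demo() -> Task:
    return Task(
        dataset=json_dataset("challenges.jsonl"),  # placeholder dataset file
        solver=[
            use_tools([
                # run tool calls in a non-default sandbox (assumed name "victim")
                bash(timeout=180, sandbox="victim"),
                python(timeout=180, sandbox="victim"),
            ]),
            generate(),
        ],
        scorer=includes(),
        # clustered standard errors, keyed on an assumed sample metadata field
        metrics=[accuracy(), stderr(cluster="task_family")],
        sandbox="docker",
    )
```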
## v0.3.63 (07 February 2025) diff --git a/docs/.gitignore b/docs/.gitignore index d86a66004..4bc621e9d 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,3 +1,3 @@ /.quarto/ /_book/ -/_site/ +/_site/ \ No newline at end of file diff --git a/docs/_quarto.yml b/docs/_quarto.yml index 0e4eaf9c8..06ab50e67 100644 --- a/docs/_quarto.yml +++ b/docs/_quarto.yml @@ -3,8 +3,18 @@ project: resources: - CNAME - llms.txt - post-render: scripts/post-render.sh - + pre-render: + - reference/filter/sidebar.py + post-render: + - scripts/post-render.sh + +metadata-files: + - reference/_sidebar.yml + +filters: + - at: pre-quarto + path: reference/filter/interlink.lua + website: title: "Inspect" bread-crumbs: true @@ -21,66 +31,74 @@ website: title: "Inspect" description: "Open-source framework for large language model evaluations" image: /images/inspect.png - sidebar: - style: floating + navbar: + title: "Inspect AI" + background: light search: true - header: > - [![](/images/aisi-logo.png){fig-alt="UK AI Safety Institute Website"}](https://aisi.gov.uk/) - tools: - - icon: github - href: https://github.com/UKGovernmentBEIS/inspect_ai - text: "Source Code" - contents: - - text: Welcome - href: index.qmd - - section: "Basics" - contents: - - tutorial.qmd - - options.qmd - - text: "Evals" - href: evals/index.qmd - - log-viewer.qmd - - text: "VS Code" - href: vscode.qmd - - - section: "Components" - contents: - - tasks.qmd - - datasets.qmd - - solvers.qmd - - tools.qmd - - scorers.qmd + logo: images/aisi-logo.png + logo-href: https://www.aisi.gov.uk/ + left: + - text: "User Guide" + href: index.qmd + - text: "Reference" + href: reference/index.qmd + - text: "Evals" + href: evals/index.qmd + right: + - icon: github + href: https://github.com/UKGovernmentBEIS/inspect_ai + + sidebar: + - title: Guide + style: docked + contents: + - section: "Basics" + contents: + - text: "Welcome" + href: index.qmd + - tutorial.qmd + - options.qmd + - log-viewer.qmd + - text: "VS Code" + href: vscode.qmd - - section: "Models" - contents: - - models.qmd - - text: "Providers" - href: providers.qmd - - caching.qmd - - multimodal.qmd - - reasoning.qmd + - section: "Components" + contents: + - tasks.qmd + - datasets.qmd + - solvers.qmd + - tools.qmd + - scorers.qmd - - section: "Agents" - contents: - - agents.qmd - - sandboxing.qmd - - agents-api.qmd - - agent-bridge.qmd - - human-agent.qmd - - approval.qmd + - section: "Models" + contents: + - models.qmd + - text: "Providers" + href: providers.qmd + - caching.qmd + - multimodal.qmd + - reasoning.qmd - - section: "Advanced" - contents: - - eval-logs.qmd - - eval-sets.qmd - - text: "Errors & Limits" - href: errors-and-limits.qmd - - typing.qmd - - tracing.qmd - - parallelism.qmd - - interactivity.qmd - - extensions.qmd + - section: "Agents" + contents: + - agents.qmd + - sandboxing.qmd + - agents-api.qmd + - agent-bridge.qmd + - human-agent.qmd + - approval.qmd + - section: "Advanced" + contents: + - eval-logs.qmd + - eval-sets.qmd + - text: "Errors & Limits" + href: errors-and-limits.qmd + - typing.qmd + - tracing.qmd + - parallelism.qmd + - interactivity.qmd + - extensions.qmd page-footer: left: diff --git a/docs/evals/evals.yml b/docs/evals/evals.yml index 3806f0827..e27c1a295 100644 --- a/docs/evals/evals.yml +++ b/docs/evals/evals.yml @@ -3,8 +3,8 @@ - Coding contributors: - adil-a - description: 'Evaluating correctness for synthesizing Python programs from docstrings. - Demonstrates custom scorers and sandboxing untrusted model code. 
+ description: 'Measures the functional correctness of synthesizing programs from + docstrings. ' group: Coding @@ -17,9 +17,8 @@ - Coding contributors: - jddantes - description: 'Measuring the ability of these models to synthesize short Python programs - from natural language descriptions. Demonstrates custom scorers and sandboxing - untrusted model code. + description: 'Measures the ability to synthesize short Python programs from natural + language descriptions. ' group: Coding @@ -35,8 +34,7 @@ - max-kaufmann dependency: swe_bench description: 'Software engineering problems drawn from real GitHub issues and corresponding - pull requests across 12 popular Python repositories. Demonstrates sandboxing untrusted - model code. + pull requests across 12 popular Python repositories. ' group: Coding @@ -45,14 +43,14 @@ - Agent tasks: - swe_bench - title: 'SWE-Bench: Resolving Real-World GitHub Issues' + title: 'SWE-bench Verified: Resolving Real-World GitHub Issues' - arxiv: https://arxiv.org/abs/2211.11501 categories: - Coding contributors: - bienehito - description: DS-1000 is a code generation benchmark with a thousand data science - problems spanning seven Python libraries. + description: Code generation benchmark with a thousand data science problems spanning + seven Python libraries. group: Coding path: src/inspect_evals/ds1000 tasks: @@ -80,10 +78,7 @@ - zhenningdavidliu description: 'Evaluates LLMs on class-level code generation with 100 tasks constructed over 500 person-hours. The study shows that LLMs perform worse on class-level - tasks compared to method-level tasks. GPT-4 and GPT-3.5 outperform other models, - with holistic generation being the best strategy for them, while incremental generation - works better for other models. The study also shows that the performance of LLMs - is highly correlated with the number of tokens in the prompt. + tasks compared to method-level tasks. ' group: Coding @@ -98,10 +93,10 @@ - Agent contributors: - max-kaufmann - description: 'GAIA proposes real-world questions that require a set of fundamental - abilities such as reasoning, multi-modality handling, web browsing, and generally - tool-use proficiency. GAIA questions are conceptually simple for humans yet challenging - for most advanced AIs + description: 'Proposes real-world questions that require a set of fundamental abilities + such as reasoning, multi-modality handling, web browsing, and generally tool-use + proficiency. GAIA questions are conceptually simple for humans yet challenging + for most advanced AIs. ' group: Assistants @@ -114,6 +109,24 @@ - gaia_level2 - gaia_level3 title: 'GAIA: A Benchmark for General AI Assistants' +- arxiv: https://arxiv.org/abs/2404.07972 + categories: + - Assistants + - Agent + contributors: + - epatey + description: 'Benchmark for multimodal agents for open-ended tasks in real computer + environments. + + ' + group: Assistants + path: src/inspect_evals/osworld + tags: + - Agent + tasks: + - osworld + - osworld_small + title: OSWorld - arxiv: https://arxiv.org/abs/2407.15711 categories: - Assistants @@ -121,7 +134,7 @@ contributors: - nlpet description: 'Tests whether AI agents can perform real-world time-consuming tasks - on the web. Includes 214 realistic tasks covering a variety scenarios and domains. + on the web. 
' group: Assistants @@ -156,6 +169,7 @@ - arxiv: https://arxiv.org/abs/2402.07688 categories: - Cybersecurity + - Agent contributors: - neilshaabi description: 'Datasets containing 80, 500, 2000 and 10000 multiple-choice questions, @@ -164,6 +178,8 @@ ' group: Cybersecurity path: src/inspect_evals/cybermetric + tags: + - Agent tasks: - cybermetric_80 - cybermetric_500 @@ -182,9 +198,9 @@ group: Cybersecurity path: src/inspect_evals/cyberseceval_2 tasks: - - interpreter_abuse - - prompt_injection - - vulnerability_exploit + - cse2_interpreter_abuse + - cse2_prompt_injection + - cse2_vulnerability_exploit title: 'CyberSecEval_2: A Wide-Ranging Cybersecurity Evaluation Suite for Large Language Models' - arxiv: https://arxiv.org/abs/2306.14898 @@ -248,12 +264,13 @@ - Cybersecurity contributors: - matthewreed26 - description: "\"Security Question Answering\" dataset to assess LLMs' understanding\ - \ and application of security principles.\nSecQA has \u201Cv1\u201D and \u201C\ - v2\u201D datasets of multiple-choice questions that aim to provide two levels\ - \ of cybersecurity evaluation criteria. \nThe questions were generated by GPT-4\ - \ based on the \"Computer Systems Security: Planning for Success\" textbook and\ - \ vetted by humans.\n" + description: '"Security Question Answering" dataset to assess LLMs'' understanding + and application of security principles. SecQA has "v1" and "v2" datasets of multiple-choice + questions that aim to provide two levels of cybersecurity evaluation criteria. The + questions were generated by GPT-4 based on the "Computer Systems Security: Planning + for Success" textbook and vetted by humans. + + ' group: Cybersecurity path: src/inspect_evals/sec_qa tasks: @@ -322,7 +339,7 @@ contributors: - jjallaire description: 'Dataset of 8.5K high quality linguistically diverse grade school math - word problems. Demonstrates fewshot prompting. + word problems. ' group: Mathematics @@ -378,14 +395,15 @@ tags: - Multimodal tasks: - - vstar_bench_ar - - vstar_bench_srr + - vstar_bench_attribute_recognition + - vstar_bench_spatial_relationship_reasoning title: 'V*Bench: A Visual QA Benchmark with Detailed High-resolution Images' - arxiv: https://arxiv.org/abs/1803.05457 categories: - Reasoning contributors: - jjallaire + dependency: math description: Dataset of natural, grade-school science multiple-choice questions (authored for human tests). group: Reasoning @@ -579,6 +597,7 @@ - Reasoning contributors: - adil-a + dependency: ifeval description: 'Evaluates the ability to follow a set of "verifiable instructions" such as "write in more than 400 words" and "mention the keyword of AI at least 3 times. Demonstrates custom scoring. @@ -606,6 +625,8 @@ - arxiv: https://arxiv.org/abs/2407.01437 categories: - Reasoning + contributors: + - owenparsons description: 'NIAH evaluates in-context retrieval ability of long context LLMs by testing a model''s ability to extract factual information from long-context inputs. @@ -755,3 +776,36 @@ - agie_sat_en_without_passage - agie_sat_math title: 'AGIEval: A Human-Centric Benchmark for Evaluating Foundation Models' +- arxiv: https://arxiv.org/abs/2407.13168 + categories: + - Coding + contributors: + - xantheocracy + dependency: scicode + description: 'SciCode tests the ability of language models to generate code to solve + scientific research problems. It assesses models on 65 problems from mathematics, + physics, chemistry, biology, and materials science. 
+ + ' + group: Coding + path: src/inspect_evals/scicode + tasks: + - scicode + title: 'SciCode: A Research Coding Benchmark Curated by Scientists' +- categories: + - Knowledge + contributors: + - bact + description: 'Questions and answers from the Ordinary National Educational Test + (O-NET), administered annually by the National Institute of Educational Testing + Service to Matthayom 6 (Grade 12 / ISCED 3) students in Thailand. The exam contains + six subjects: English language, math, science, social knowledge, and Thai language. + There are questions with multiple-choice and true/false answers. Questions can + be in either English or Thai. + + ' + group: Knowledge + path: src/inspect_evals/onet + tasks: + - onet_m6 + title: O-NET diff --git a/docs/evals/import.py b/docs/evals/import.py index 28ad2bde0..360be95c8 100644 --- a/docs/evals/import.py +++ b/docs/evals/import.py @@ -12,6 +12,8 @@ record["categories"] = [record["group"]] if "tags" in record: record["categories"].extend(record["tags"]) + record["tasks"] = [task["name"] for task in record["tasks"]] + with open(PATH / "evals.yml", "w") as f: yaml.safe_dump(records, f) diff --git a/docs/evals/listing.yml b/docs/evals/listing.yml new file mode 100644 index 000000000..179ce81f2 --- /dev/null +++ b/docs/evals/listing.yml @@ -0,0 +1,826 @@ +# Groups: Coding Assistants Cybersecurity Safeguards Mathematics Reasoning Knowledge + +- title: "HumanEval: Evaluating Large Language Models Trained on Code" + description: | + Measures the functional correctness of synthesizing programs from docstrings. + path: src/inspect_evals/humaneval + arxiv: https://arxiv.org/abs/2107.03374 + group: Coding + contributors: ["adil-a"] + tasks: + - name: humaneval + dataset_samples: 164 + baselines: + - name: openai/gpt-4o + metric: pass@1 + score: 90.2 + parameters: ["0-shot"] + source: https://www.anthropic.com/news/claude-3-5-sonnet + - name: openai/gpt-4 + metric: pass@1 + score: 85.4 + parameters: ["0-shot"] + source: https://arxiv.org/abs/2308.01861 + - name: anthropic/claude-3-5-sonnet-latest + metric: pass@1 + score: 92 + parameters: ["0-shot"] + source: https://www.anthropic.com/news/claude-3-5-sonnet + - name: anthropic/claude-3-sonnet-latest + metric: pass@1 + score: 73 + parameters: ["0-shot"] + source: https://www.anthropic.com/news/claude-3-5-sonnet + - name: anthropic/claude-3-5-opus-latest + metric: pass@1 + score: 84.9 + parameters: ["0-shot"] + source: https://www.anthropic.com/news/claude-3-5-sonnet + - name: google/gemini-1.5-pro + metric: pass@1 + score: 84.1 + parameters: ["0-shot"] + source: https://www.anthropic.com/news/claude-3-5-sonnet + - name: hf/meta-llama/Llama-3.1-405B + metric: pass@1 + score: 54.9 + parameters: ["0-shot"] + source: https://arxiv.org/abs/2412.19437v1 + - name: hf/deepseek-ai/DeepSeek-V3-Base + metric: pass@1 + score: 65.2 + parameters: ["0-shot"] + source: https://arxiv.org/abs/2412.19437v1 + - name: hf/Qwen/Qwen2.5-72B + metric: pass@1 + score: 53.0 + parameters: ["0-shot"] + source: https://arxiv.org/abs/2412.19437v1 + +- title: "MBPP: Mostly Basic Python Problems" + description: | + Measures the ability to synthesize short Python programs from natural language descriptions. 
+ path: src/inspect_evals/mbpp + arxiv: https://arxiv.org/abs/2108.07732 + group: Coding + contributors: ["jddantes"] + tasks: + - name: mbpp + dataset_samples: 257 + baselines: + - name: hf/meta-llama/Llama-3.1-405B + metric: pass@1 + score: 68.4 + parameters: ["3-shot"] + source: https://arxiv.org/abs/2412.19437v1 + - name: hf/deepseek-ai/DeepSeek-V3-Base + metric: pass@1 + score: 75.4 + parameters: ["3-shot"] + source: https://arxiv.org/abs/2412.19437v1 + - name: hf/Qwen/Qwen2.5-72B + metric: pass@1 + score: 72.6 + parameters: ["3-shot"] + source: https://arxiv.org/abs/2412.19437v1 + +- title: "SWE-bench Verified: Resolving Real-World GitHub Issues" + description: | + Software engineering problems drawn from real GitHub issues and corresponding pull requests across 12 popular Python repositories. + path: src/inspect_evals/swe_bench + arxiv: https://arxiv.org/abs/2310.06770 + group: Coding + contributors: ["max-kaufmann"] + tasks: + - name: swe_bench + dataset_samples: 500 + baselines: + - name: openai/gpt-4 + metric: pass + score: 33.2 + source: https://openai.com/index/introducing-swe-bench-verified + dependency: "swe_bench" + tags: ["Agent"] + +- title: "DS-1000: A Natural and Reliable Benchmark for Data Science Code Generation" + description: Code generation benchmark with a thousand data science problems spanning seven Python libraries. + path: src/inspect_evals/ds1000 + arxiv: https://arxiv.org/abs/2211.11501 + group: Coding + contributors: ["bienehito"] + tasks: + - name: ds1000 + dataset_samples: 1000 + +- title: "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions" + description: | + Python coding benchmark with 1,140 diverse questions drawing on numerous python libraries. + path: src/inspect_evals/bigcodebench + arxiv: https://arxiv.org/abs/2406.15877 + group: Coding + contributors: ["tim-hua-01"] + tasks: + - name: bigcodebench + dataset_samples: 1140 + +- title: "ClassEval: A Manually-Crafted Benchmark for Evaluating LLMs on Class-level Code Generation" + description: | + Evaluates LLMs on class-level code generation with 100 tasks constructed over 500 person-hours. The study shows that LLMs perform worse on class-level tasks compared to method-level tasks. + path: src/inspect_evals/class_eval + arxiv: https://arxiv.org/abs/2308.01861 + group: Coding + contributors: ["zhenningdavidliu"] + tasks: + - name: class_eval + dataset_samples: 100 + baselines: + - name: openai/gpt-4 + metric: pass@1 + score: 37.6 + parameters: ["class-level"] + source: https://arxiv.org/abs/2308.01861 + - name: openai/gpt-4 + metric: pass@3 + score: 41.3 + parameters: ["class-level"] + source: https://arxiv.org/abs/2308.01861 + - name: openai/gpt-4 + metric: pass@5 + score: 42.0 + parameters: ["class-level"] + source: https://arxiv.org/abs/2308.01861 + +- title: "GAIA: A Benchmark for General AI Assistants" + description: | + Proposes real-world questions that require a set of fundamental abilities such as reasoning, multi-modality handling, web browsing, and generally tool-use proficiency. GAIA questions are conceptually simple for humans yet challenging for most advanced AIs. 
+ path: src/inspect_evals/gaia + arxiv: https://arxiv.org/abs/2311.12983 + group: Assistants + contributors: ["max-kaufmann"] + tasks: + - name: gaia + dataset_samples: 165 + - name: gaia_level1 + dataset_samples: 53 + - name: gaia_level2 + dataset_samples: 86 + - name: gaia_level3 + dataset_samples: 26 + tags: ["Agent"] + +- title: "OSWorld" + description: | + Benchmark for multimodal agents for open-ended tasks in real computer environments. + path: src/inspect_evals/osworld + arxiv: https://arxiv.org/abs/2404.07972 + group: Assistants + contributors: ["epatey"] + tasks: + - name: osworld + dataset_samples: 369 # This is the count in the OSWorld corpus. Many samples are excluded because they are not yet supported + - name: osworld_small + dataset_samples: 39 # This is the count in the OSWorld corpus. Many samples are excluded because they are not yet supported + tags: ["Agent"] + +- title: "AssistantBench: Can Web Agents Solve Realistic and Time-Consuming Tasks?" + description: | + Tests whether AI agents can perform real-world time-consuming tasks on the web. + path: src/inspect_evals/assistant_bench + arxiv: https://arxiv.org/abs/2407.15711 + group: Assistants + contributors: ["nlpet"] + tasks: + - name: assistant_bench + dataset_samples: 33 + tags: ["Agent"] + +- title: "Cybench: A Framework for Evaluating Cybersecurity Capabilities and Risks of Language Models" + description: | + 40 professional-level Capture the Flag (CTF) tasks from 4 distinct CTF competitions, chosen to be recent, meaningful, and spanning a wide range of difficulties. + path: src/inspect_evals/cybench + group: Cybersecurity + contributors: ["sinman-aisi", "sam-deverett-dsit", "kola-aisi", "pgiav"] + arxiv: https://arxiv.org/abs/2408.08926 + tasks: + - name: cybench + dataset_samples: 40 + baselines: + - name: openai/gpt-4o + metric: accuracy + score: 12.5 + parameters: ["unguided"] + source: https://arxiv.org/abs/2408.08926 + - name: openai/o1-preview + metric: accuracy + score: 10 + parameters: ["unguided"] + source: https://arxiv.org/abs/2408.08926 + - name: anthropic/claude-3-5-sonnet-latest + metric: accuracy + score: 17.5 + parameters: ["unguided"] + source: https://arxiv.org/abs/2408.08926 + - name: anthropic/claude-3-opus-latest + metric: accuracy + score: 10 + parameters: ["unguided"] + source: https://arxiv.org/abs/2408.08926 + - name: hf/meta-llama/Llama-3.1-405B-Instruct + metric: accuracy + score: 7.5 + parameters: ["unguided"] + source: https://arxiv.org/abs/2408.08926 + - name: hf/meta-llama/Llama-3-70B + metric: accuracy + score: 5 + parameters: ["unguided"] + source: https://arxiv.org/abs/2408.08926 + - name: google/gemini-1.5-pro + metric: accuracy + score: 7.5 + parameters: ["unguided"] + source: https://arxiv.org/abs/2408.08926 + - name: mistral/open-mixtral-8x22b + metric: accuracy + score: 7.5 + parameters: ["unguided"] + source: https://arxiv.org/abs/2408.08926 + tags: ["Agent"] + +- title: "CyberMetric: A Benchmark Dataset based on Retrieval-Augmented Generation for Evaluating LLMs in Cybersecurity Knowledge" + description: | + Datasets containing 80, 500, 2000 and 10000 multiple-choice questions, designed to evaluate understanding across nine domains within cybersecurity + path: src/inspect_evals/cybermetric + arxiv: https://arxiv.org/abs/2402.07688 + group: Cybersecurity + contributors: ["neilshaabi"] + tasks: + - name: cybermetric_80 + dataset_samples: 80 + - name: cybermetric_500 + dataset_samples: 500 + - name: cybermetric_2000 + dataset_samples: 2000 + - name: cybermetric_10000 + 
dataset_samples: 10000 + tags: ["Agent"] + +- title: "CyberSecEval_2: A Wide-Ranging Cybersecurity Evaluation Suite for Large Language Models" + description: | + Evaluates Large Language Models for risky capabilities in cybersecurity. + path: src/inspect_evals/cyberseceval_2 + arxiv: https://arxiv.org/pdf/2404.13161 + group: Cybersecurity + contributors: ["its-emile"] + tasks: + - name: cse2_interpreter_abuse + dataset_samples: 500 + - name: cse2_prompt_injection + dataset_samples: 251 + - name: cse2_vulnerability_exploit + dataset_samples: 585 + +- title: "InterCode: Capture the Flag" + description: | + Measure expertise in coding, cryptography (i.e. binary exploitation, forensics), reverse engineering, and recognizing security vulnerabilities. Demonstrates tool use and sandboxing untrusted model code. + path: src/inspect_evals/gdm_capabilities/intercode_ctf + arxiv: https://arxiv.org/abs/2306.14898 + group: Cybersecurity + contributors: ["jjallaire"] + tasks: + - name: gdm_intercode_ctf + dataset_samples: 79 + tags: ["Agent"] + +- title: "GDM Dangerous Capabilities: Capture the Flag" + description: | + CTF challenges covering web app vulnerabilities, off-the-shelf exploits, databases, Linux privilege escalation, password cracking and spraying. Demonstrates tool use and sandboxing untrusted model code. + path: src/inspect_evals/gdm_capabilities/in_house_ctf + arxiv: https://arxiv.org/abs/2403.13793 + group: Cybersecurity + contributors: ["XkunW"] + tasks: + - name: gdm_in_house_ctf + dataset_samples: 13 + tags: ["Agent"] + +- title: "SEvenLLM: A benchmark to elicit, and improve cybersecurity incident analysis and response abilities in LLMs for Security Events." + description: | + Designed for analyzing cybersecurity incidents, which is comprised of two primary task categories: understanding and generation, with a further breakdown into 28 subcategories of tasks. + path: src/inspect_evals/sevenllm + group: Cybersecurity + contributors: ["kingroryg"] + arxiv: https://arxiv.org/abs/2405.03446 + tasks: + - name: sevenllm_mcq_zh + dataset_samples: 50 + - name: sevenllm_mcq_en + dataset_samples: 50 + - name: sevenllm_qa_zh + dataset_samples: 600 + - name: sevenllm_qa_en + dataset_samples: 600 + dependency: "sevenllm" + +- title: "SecQA: A Concise Question-Answering Dataset for Evaluating Large Language Models in Computer Security" + description: > + "Security Question Answering" dataset to assess LLMs' understanding and application of security principles. + SecQA has "v1" and "v2" datasets of multiple-choice questions that aim to provide two levels of cybersecurity evaluation criteria. + The questions were generated by GPT-4 based on the "Computer Systems Security: Planning for Success" textbook and vetted by humans. + path: src/inspect_evals/sec_qa + group: Cybersecurity + contributors: ["matthewreed26"] + arxiv: https://arxiv.org/abs/2312.15838 + tasks: + - name: sec_qa_v1 + dataset_samples: 110 + - name: sec_qa_v1_5_shot + dataset_samples: 110 + - name: sec_qa_v2 + dataset_samples: 100 + - name: sec_qa_v2_5_shot + dataset_samples: 100 + +- title: "AgentHarm: A Benchmark for Measuring Harmfulness of LLM Agents" + description: | + Diverse set of 110 explicitly malicious agent tasks (440 with augmentations), covering 11 harm categories including fraud, cybercrime, and harassment. 
+ path: src/inspect_evals/agentharm + arxiv: https://arxiv.org/abs/2410.09024 + group: Safeguards + contributors: + ["alexandrasouly-aisi", "EricWinsorDSIT", "max-andr", "xanderdavies"] + tasks: + - name: agentharm + dataset_samples: 176 + - name: agentharm_benign + dataset_samples: 176 + tags: ["Agent"] + +- title: "WMDP: Measuring and Reducing Malicious Use With Unlearning" + description: | + A dataset of 3,668 multiple-choice questions developed by a consortium of academics and technical consultants that serve as a proxy measurement of hazardous knowledge in biosecurity, cybersecurity, and chemical security. + path: src/inspect_evals/wmdp + arxiv: https://arxiv.org/abs/2403.03218 + group: Safeguards + contributors: ["alexandraabbas"] + tasks: + - name: wmdp_bio + dataset_samples: 1273 + - name: wmdp_chem + dataset_samples: 408 + - name: wmdp_cyber + dataset_samples: 1987 + +- title: "MATH: Measuring Mathematical Problem Solving" + description: | + Dataset of 12,500 challenging competition mathematics problems. Demonstrates fewshot prompting and custom scorers. + path: src/inspect_evals/mathematics + arxiv: https://arxiv.org/abs/2103.03874 + group: Mathematics + contributors: ["xeon27"] + tasks: + - name: math + dataset_samples: 12500 + dependency: "math" + +- title: "GSM8K: Training Verifiers to Solve Math Word Problems" + description: | + Dataset of 8.5K high quality linguistically diverse grade school math word problems. + path: src/inspect_evals/gsm8k + arxiv: https://arxiv.org/abs/2110.14168 + group: Mathematics + contributors: ["jjallaire"] + tasks: + - name: gsm8k + dataset_samples: 1319 + baselines: + - name: hf/meta-llama/Llama-3.1-405B + metric: accuracy + score: 83.5 + parameters: ["8-shot"] + source: https://arxiv.org/abs/2412.19437v1 + - name: hf/deepseek-ai/DeepSeek-V3-Base + metric: accuracy + score: 89.3 + parameters: ["8-shot"] + source: https://arxiv.org/abs/2412.19437v1 + - name: hf/Qwen/Qwen2.5-72B + metric: accuracy + score: 88.3 + parameters: ["8-shot"] + source: https://arxiv.org/abs/2412.19437v1 + +- title: "MathVista: Evaluating Mathematical Reasoning in Visual Contexts" + path: src/inspect_evals/mathvista + description: | + Diverse mathematical and visual tasks that require fine-grained, deep visual understanding and compositional reasoning. Demonstrates multimodal inputs and custom scorers. + arxiv: https://arxiv.org/abs/2310.02255 + group: Mathematics + contributors: ["ShivMunagala"] + tasks: + - name: mathvista + dataset_samples: 1000 + tags: ["Multimodal"] + +- title: "MGSM: Multilingual Grade School Math" + description: | + Extends the original GSM8K dataset by translating 250 of its problems into 10 typologically diverse languages. 
+ path: src/inspect_evals/mgsm + arxiv: https://arxiv.org/abs/2210.03057 + group: Mathematics + contributors: ["manifoldhiker"] + tasks: + - name: mgsm + dataset_samples: 2750 + baselines: + - name: hf/meta-llama/Llama-3.1-405B + metric: pass@1 + score: 69.9 + parameters: ["8-shot"] + source: https://arxiv.org/abs/2412.19437v1 + - name: hf/deepseek-ai/DeepSeek-V3-Base + metric: pass@1 + score: 79.8 + parameters: ["8-shot"] + source: https://arxiv.org/abs/2412.19437v1 + - name: hf/Qwen/Qwen2.5-72B + metric: pass@1 + score: 76.2 + parameters: ["8-shot"] + source: https://arxiv.org/abs/2412.19437v1 + +- title: "V*Bench: A Visual QA Benchmark with Detailed High-resolution Images" + description: | + V*Bench is a visual question & answer benchmark that evaluates MLLMs in their ability to process high-resolution and visually crowded images to find and focus on small details. + path: src/inspect_evals/vstar_bench + arxiv: https://arxiv.org/abs/2312.14135 + group: Reasoning + tags: ["Multimodal"] + contributors: ["bienehito"] + tasks: + - name: vstar_bench_attribute_recognition + dataset_samples: 115 + - name: vstar_bench_spatial_relationship_reasoning + dataset_samples: 76 + +- title: "ARC: AI2 Reasoning Challenge" + description: Dataset of natural, grade-school science multiple-choice questions (authored for human tests). + path: src/inspect_evals/arc + arxiv: https://arxiv.org/abs/1803.05457 + group: Reasoning + contributors: ["jjallaire"] + tasks: + - name: arc_easy + dataset_samples: 2376 + - name: arc_challenge + dataset_samples: 1172 + dependency: "math" + +- title: "HellaSwag: Can a Machine Really Finish Your Sentence?" + description: | + Evaluting commonsense natural language inference: given an event description such as "A woman sits at a piano," a machine must select the most likely followup. + path: src/inspect_evals/hellaswag + arxiv: https://arxiv.org/abs/1905.07830 + group: Reasoning + contributors: ["jjallaire"] + tasks: + - name: hellaswag + dataset_samples: 10042 + baselines: + - name: hf/meta-llama/Llama-3.1-405B + metric: accuracy + score: 89.2 + parameters: ["10-shot"] + source: https://arxiv.org/abs/2412.19437v1 + - name: hf/deepseek-ai/DeepSeek-V3-Base + metric: accuracy + score: 79.8 + parameters: ["10-shot"] + source: https://arxiv.org/abs/2412.19437v1 + - name: hf/Qwen/Qwen2.5-72B + metric: accuracy + score: 88.9 + parameters: ["10-shot"] + source: https://arxiv.org/abs/2412.19437v1 + +- title: "PIQA: Reasoning about Physical Commonsense in Natural Language" + description: | + Measure physical commonsense reasoning (e.g. "To apply eyeshadow without a brush, should I use a cotton swab or a toothpick?") + path: src/inspect_evals/piqa + arxiv: https://arxiv.org/abs/1911.11641 + group: Reasoning + contributors: ["seddy-aisi"] + tasks: + - name: piqa + dataset_samples: 1838 + +- title: "∞Bench: Extending Long Context Evaluation Beyond 100K Tokens" + description: | + LLM benchmark featuring an average data length surpassing 100K tokens. Comprises synthetic and realistic tasks spanning diverse domains in English and Chinese. 
+ path: src/inspect_evals/infinite_bench + arxiv: https://arxiv.org/abs/2402.13718 + group: Reasoning + contributors: ["celiawaggoner"] + tasks: + - name: infinite_bench_code_debug + dataset_samples: 394 + - name: infinite_bench_code_run + dataset_samples: 400 + - name: infinite_bench_kv_retrieval + dataset_samples: 500 + - name: infinite_bench_longbook_choice_eng + dataset_samples: 229 + - name: infinite_bench_longdialogue_qa_eng + dataset_samples: 200 + - name: infinite_bench_math_calc + dataset_samples: 50 + - name: infinite_bench_math_find + dataset_samples: 350 + - name: infinite_bench_number_string + dataset_samples: 590 + - name: infinite_bench_passkey + dataset_samples: 590 + +- title: "BBH: Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them" + description: | + Suite of 23 challenging BIG-Bench tasks for which prior language model evaluations did not outperform the average human-rater. + path: src/inspect_evals/bbh + arxiv: https://arxiv.org/abs/2210.09261 + group: Reasoning + contributors: ["JoschkaCBraun"] + tasks: + - name: bbh + dataset_samples: 250 + baselines: + - name: human + metric: accuracy + score: 87.5 + source: https://arxiv.org/abs/2210.09261 + - name: hf/meta-llama/Llama-3.1-405B + metric: accuracy + score: 82.9 + parameters: ["3-shot"] + source: https://arxiv.org/abs/2412.19437v1 + - name: hf/deepseek-ai/DeepSeek-V3-Base + metric: accuracy + score: 79.8 + parameters: ["3-shot"] + source: https://arxiv.org/abs/2412.19437v1 + - name: hf/Qwen/Qwen2.5-72B + metric: accuracy + score: 79.8 + parameters: ["3-shot"] + source: https://arxiv.org/abs/2412.19437v1 + +- title: "BoolQ: Exploring the Surprising Difficulty of Natural Yes/No Questions" + description: | + Reading comprehension dataset that queries for complex, non-factoid information, and require difficult entailment-like inference to solve. + path: src/inspect_evals/boolq + arxiv: https://arxiv.org/abs/1905.10044 + group: Reasoning + contributors: ["seddy-aisi"] + tasks: + - name: boolq + dataset_samples: 3270 + +- title: "DocVQA: A Dataset for VQA on Document Images" + description: | + DocVQA is a Visual Question Answering benchmark that consists of 50,000 questions covering 12,000+ document images. This implementation solves and scores the "validation" split. + path: src/inspect_evals/docvqa + arxiv: https://arxiv.org/abs/2007.00398 + group: Reasoning + tags: ["Multimodal"] + contributors: ["evanmiller-anthropic"] + tasks: + - name: docvqa + dataset_samples: 5349 + +- title: "DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs" + description: | + Evaluates reading comprehension where models must resolve references in a question, perhaps to multiple input positions, and perform discrete operations over them (such as addition, counting, or sorting). + path: src/inspect_evals/drop + arxiv: https://arxiv.org/abs/1903.00161 + group: Reasoning + contributors: ["xeon27"] + tasks: + - name: drop + dataset_samples: 9535 + +- title: "WINOGRANDE: An Adversarial Winograd Schema Challenge at Scale" + description: | + Set of 273 expert-crafted pronoun resolution problems originally designed to be unsolvable for statistical models that rely on selectional preferences or word associations. 
+ path: src/inspect_evals/winogrande + arxiv: https://arxiv.org/abs/1907.10641 + group: Reasoning + contributors: ["xeon27"] + tasks: + - name: winogrande + dataset_samples: 1267 + +- title: "RACE-H: A benchmark for testing reading comprehension and reasoning abilities of neural models" + description: | + Reading comprehension tasks collected from the English exams for middle and high school Chinese students in the age range between 12 to 18. + path: src/inspect_evals/race_h + arxiv: https://arxiv.org/abs/1704.04683 + group: Reasoning + contributors: ["mdrpanwar"] + tasks: + - name: race_h + dataset_samples: 3498 + +- title: "MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark" + description: | + Multimodal questions from college exams, quizzes, and textbooks, covering six core disciplinestasks, demanding college-level subject knowledge and deliberate reasoning. Demonstrates multimodel inputs. + path: src/inspect_evals/mmmu + arxiv: https://arxiv.org/abs/2311.16502 + group: Reasoning + contributors: ["shaheenahmedc"] + tasks: + - name: mmmu_multiple_choice + dataset_samples: 847 + - name: mmmu_open + dataset_samples: 53 + tags: ["Multimodal"] + +- title: "SQuAD: A Reading Comprehension Benchmark requiring reasoning over Wikipedia articles" + description: | + Set of 100,000+ questions posed by crowdworkers on a set of Wikipedia articles, where the answer to each question is a segment of text from the corresponding reading passage. + path: src/inspect_evals/squad + arxiv: https://arxiv.org/abs/1606.05250 + group: Reasoning + contributors: ["tknasir"] + tasks: + - name: squad + dataset_samples: 11873 + +- title: "IFEval: Instruction-Following Evaluation for Large Language Models" + description: | + Evaluates the ability to follow a set of "verifiable instructions" such as "write in more than 400 words" and "mention the keyword of AI at least 3 times. Demonstrates custom scoring. + path: src/inspect_evals/ifeval + arxiv: https://arxiv.org/abs/2311.07911 + group: Reasoning + contributors: ["adil-a"] + dependency: "ifeval" + tasks: + - name: ifeval + dataset_samples: 541 + +- title: "MuSR: Testing the Limits of Chain-of-thought with Multistep Soft Reasoning" + description: | + Evaluating models on multistep soft reasoning tasks in the form of free text narratives. + path: src/inspect_evals/musr + arxiv: https://arxiv.org/abs/2310.16049 + group: Reasoning + contributors: ["farrelmahaztra"] + tasks: + - name: musr + dataset_samples: 250 + +- title: "Needle in a Haystack (NIAH): In-Context Retrieval Benchmark for Long Context LLMs" + description: | + NIAH evaluates in-context retrieval ability of long context LLMs by testing a model's ability to extract factual information from long-context inputs. + path: src/inspect_evals/niah + arxiv: https://arxiv.org/abs/2407.01437 + group: Reasoning + contributors: ["owenparsons"] + tasks: + - name: niah + dataset_samples: 225 + +- title: "PAWS: Paraphrase Adversaries from Word Scrambling" + description: | + Evaluating models on the task of paraphrase detection by providing pairs of sentences that are either paraphrases or not. + path: src/inspect_evals/paws + arxiv: https://arxiv.org/abs/1904.01130 + group: Reasoning + contributors: ["meltemkenis"] + tasks: + - name: paws + dataset_samples: 8000 + +- title: "MMLU: Measuring Massive Multitask Language Understanding" + description: | + Evaluate models on 57 tasks including elementary mathematics, US history, computer science, law, and more. 
+ path: src/inspect_evals/mmlu + arxiv: https://arxiv.org/abs/2009.03300 + group: Knowledge + contributors: ["jjallaire", "domdomegg"] + tasks: + - name: mmlu_0_shot + dataset_samples: 14042 + - name: mmlu_5_shot + dataset_samples: 14042 + +- title: "MMLU-Pro: A More Robust and Challenging Multi-Task Language Understanding Benchmark" + description: | + An enhanced dataset designed to extend the mostly knowledge-driven MMLU benchmark by integrating more challenging, reasoning-focused questions and expanding the choice set from four to ten options. + path: src/inspect_evals/mmlu_pro + arxiv: https://arxiv.org/abs/2406.01574 + group: Knowledge + contributors: ["xeon27"] + tasks: + - name: mmlu_pro + dataset_samples: 12032 + +- title: "GPQA: A Graduate-Level Google-Proof Q&A Benchmark" + description: | + Challenging dataset of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry (experts at PhD level in the corresponding domains reach 65% accuracy). + path: src/inspect_evals/gpqa + arxiv: https://arxiv.org/abs/2311.12022 + group: Knowledge + contributors: ["jjallaire"] + tasks: + - name: gpqa_diamond + dataset_samples: 198 + +- title: "CommonsenseQA: A Question Answering Challenge Targeting Commonsense Knowledge" + description: | + Measure question answering with commonsense prior knowledge. + path: src/inspect_evals/commonsense_qa + arxiv: https://arxiv.org/abs/1811.00937 + group: Knowledge + contributors: ["jjallaire"] + tasks: + - name: commonsense_qa + dataset_samples: 1221 + +- title: "TruthfulQA: Measuring How Models Mimic Human Falsehoods" + description: | + Measure whether a language model is truthful in generating answers to questions using questions that some humans would answer falsely due to a false belief or misconception. + path: src/inspect_evals/truthfulqa + arxiv: https://arxiv.org/abs/2109.07958v2 + group: Knowledge + contributors: ["seddy-aisi"] + tasks: + - name: truthfulqa + dataset_samples: 817 + +- title: "XSTest: A benchmark for identifying exaggerated safety behaviours in LLM's" + description: | + Dataset with 250 safe prompts across ten prompt types that well-calibrated models should not refuse, and 200 unsafe prompts as contrasts that models, for most applications, should refuse. + path: src/inspect_evals/xstest + arxiv: https://arxiv.org/abs/2308.01263 + group: Knowledge + contributors: ["NelsonG-C"] + tasks: + - name: xstest + dataset_samples: 250 + +- title: "PubMedQA: A Dataset for Biomedical Research Question Answering" + description: | + Novel biomedical question answering (QA) dataset collected from PubMed abstracts. + path: src/inspect_evals/pubmedqa + arxiv: https://arxiv.org/abs/1909.06146 + group: Knowledge + contributors: ["MattFisher"] + tasks: + - name: pubmedqa + dataset_samples: 500 + +- title: "AGIEval: A Human-Centric Benchmark for Evaluating Foundation Models" + description: | + AGIEval is a human-centric benchmark specifically designed to evaluate the general abilities of foundation models in tasks pertinent to human cognition and problem-solving. 
+ path: src/inspect_evals/agieval + arxiv: https://arxiv.org/abs/2304.06364 + group: Knowledge + tasks: + - name: agie_aqua_rat + dataset_samples: 254 + - name: agie_logiqa_en + dataset_samples: 651 + - name: agie_lsat_ar + dataset_samples: 230 + - name: agie_lsat_lr + dataset_samples: 510 + - name: agie_lsat_rc + dataset_samples: 269 + - name: agie_math + dataset_samples: 1000 + - name: agie_sat_en + dataset_samples: 206 + - name: agie_sat_en_without_passage + dataset_samples: 206 + - name: agie_sat_math + dataset_samples: 220 + +- title: "SciCode: A Research Coding Benchmark Curated by Scientists" + description: | + SciCode tests the ability of language models to generate code to solve scientific research problems. It assesses models on 65 problems from mathematics, physics, chemistry, biology, and materials science. + path: src/inspect_evals/scicode + arxiv: https://arxiv.org/abs/2407.13168 + group: Coding + contributors: ["xantheocracy"] + dependency: "scicode" + tasks: + - name: scicode + dataset_samples: 65 + +- title: "O-NET" + description: | + Questions and answers from the Ordinary National Educational Test (O-NET), administered annually by the National Institute of Educational Testing Service to Matthayom 6 (Grade 12 / ISCED 3) students in Thailand. The exam contains six subjects: English language, math, science, social knowledge, and Thai language. There are questions with multiple-choice and true/false answers. Questions can be in either English or Thai. + path: src/inspect_evals/onet + group: Knowledge + contributors: ["bact"] + tasks: + - name: "onet_m6" + dataset_samples: 397 diff --git a/docs/llms.txt b/docs/llms.txt index 530588c70..4cfce7a1e 100644 --- a/docs/llms.txt +++ b/docs/llms.txt @@ -33,4 +33,29 @@ - [Interactivity](https://inspect.ai-safety-institute.org.uk/interactivity.html.md): Covers various ways to introduce user interaction into the implementation of tasks (for example, confirming consequential actions or prompting the model dynamically based on the trajectory of the evaluation). - [Extensions](https://inspect.ai-safety-institute.org.uk/extensions.html.md) describes the various ways you can extend Inspect, including adding support for new Model APIs, tool execution environments, and storage platforms (for datasets, prompts, and logs). +## Reference: Python API + +- [inspect_ai](https://inspect.ai-safety-institute.org.uk/reference/inspect_ai.html.md) describes the core types used to create tasks and run evaluations. +- [inspect_ai.solver](https://inspect.ai-safety-institute.org.uk/reference/inspect_ai.solver.html.md) describes built in solvers as well as the types used to create custom solvers. +- [inspect_ai.tool](https://inspect.ai-safety-institute.org.uk/reference/inspect_ai.tool.html.md) describes built in tools as well as the types used to create custom tools. +- [inspect_ai.scorer](https://inspect.ai-safety-institute.org.uk/reference/inspect_ai.scorer.html.md) describes built in scorers as well as the types used to create custom scorers. +- [inspect_ai.model](https://inspect.ai-safety-institute.org.uk/reference/inspect_ai.model.html.md) covers using the Inspect model API for accessing various language models. +- [inspect_ai.dataset](https://inspect.ai-safety-institute.org.uk/reference/inspect_ai.dataset.html.md) describes the types used to read and manipulate datasets and samples. 
+- [inspect_ai.approval](https://inspect.ai-safety-institute.org.uk/reference/inspect_ai.approval.html.md) covers using built in approvers as well as the types used to create custom approvers and approval policies. +- [inspect_ai.log](https://inspect.ai-safety-institute.org.uk/reference/inspect_ai.log.html.md) describes the types used to list, read, write, and traverse the contents of eval log files. +- [inspect_ai.util](https://inspect.ai-safety-institute.org.uk/reference/inspect_ai.util.html.md) covers various utility functions for concurrency, sandboxes, the store, and more. + +## Reference: Command Line + +- [inspect_eval](https://inspect.ai-safety-institute.org.uk/reference/inspect_eval.html.md): Evaluate one or more tasks. +- [inspect_eval-retry](https://inspect.ai-safety-institute.org.uk/reference/inspect_eval-retry.html.md): Retry an evaluation task. +- [inspect_eval-set](https://inspect.ai-safety-institute.org.uk/reference/inspect_eval-set.html.md): Evaluate a set of tasks with retries. +- [inspect_score](https://inspect.ai-safety-institute.org.uk/reference/inspect_score.html.md): Score a previous evaluation run. +- [inspect_view](https://inspect.ai-safety-institute.org.uk/reference/inspect_view.html.md): Inspect log viewer. +- [inspect_log](https://inspect.ai-safety-institute.org.uk/reference/inspect_log.html.md): Query, read, write, and convert logs. +- [inspect_trace](https://inspect.ai-safety-institute.org.uk/reference/inspect_trace.html.md): List and read execution traces. +- [inspect_sandbox](https://inspect.ai-safety-institute.org.uk/reference/inspect_sandbox.html.md): Manage sandbox environments. +- [inspect_cache](https://inspect.ai-safety-institute.org.uk/reference/inspect_cache.html.md): Manage the Inspect model cache. +- [inspect_list](https://inspect.ai-safety-institute.org.uk/reference/inspect_list.html.md): List tasks on the filesystem. +- [inspect_info](https://inspect.ai-safety-institute.org.uk/reference/inspect_info.html.md): Read version and configuration. 
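The llms.txt entries above catalogue the new Python API and CLI reference pages. As a rough illustration of the API surface those pages cover (function and attribute names are taken from the sidebar added in this PR; the task file, model, and log directory are placeholders), evaluations can be run and their logs read programmatically, mirroring `inspect eval` and `inspect log` on the command line:

```python
# Sketch of the documented API; paths and model name below are placeholders.
from inspect_ai import eval
from inspect_ai.log import list_eval_logs, read_eval_log

# run a task (roughly equivalent to `inspect eval examples/hello.py`)
eval("examples/hello.py", model="openai/gpt-4o", log_dir="./logs")

# enumerate and read the resulting logs programmatically
for info in list_eval_logs("./logs"):
    log = read_eval_log(info)
    print(log.eval.task, log.status, log.results.scores if log.results else None)
```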
diff --git a/docs/reference/.gitignore b/docs/reference/.gitignore new file mode 100644 index 000000000..e181afc7d --- /dev/null +++ b/docs/reference/.gitignore @@ -0,0 +1 @@ +refs.json diff --git a/docs/reference/_metadata.yml b/docs/reference/_metadata.yml new file mode 100644 index 000000000..5d7698298 --- /dev/null +++ b/docs/reference/_metadata.yml @@ -0,0 +1,10 @@ +toc-depth: 3 +toc-expand: true +filters: + - at: pre-ast + type: json + path: filter/filter.py + - at: pre-ast + path: filter/post.lua + + diff --git a/docs/reference/_sidebar.yml b/docs/reference/_sidebar.yml new file mode 100644 index 000000000..03a5f3d7a --- /dev/null +++ b/docs/reference/_sidebar.yml @@ -0,0 +1,442 @@ +website: + sidebar: + - title: Reference + style: docked + collapse-level: 2 + contents: + - reference/index.qmd + - section: Python API + href: reference/inspect_ai.qmd + contents: + - section: inspect_ai + href: reference/inspect_ai.qmd + contents: + - text: eval + href: reference/inspect_ai.qmd#eval + - text: eval_retry + href: reference/inspect_ai.qmd#eval_retry + - text: eval_set + href: reference/inspect_ai.qmd#eval_set + - text: score + href: reference/inspect_ai.qmd#score + - text: Task + href: reference/inspect_ai.qmd#task + - text: task_with + href: reference/inspect_ai.qmd#task_with + - text: Epochs + href: reference/inspect_ai.qmd#epochs + - text: TaskInfo + href: reference/inspect_ai.qmd#taskinfo + - text: Tasks + href: reference/inspect_ai.qmd#tasks + - text: task + href: reference/inspect_ai.qmd#task + - section: inspect_ai.solver + href: reference/inspect_ai.solver.qmd + contents: + - text: generate + href: reference/inspect_ai.solver.qmd#generate + - text: use_tools + href: reference/inspect_ai.solver.qmd#use_tools + - text: prompt_template + href: reference/inspect_ai.solver.qmd#prompt_template + - text: system_message + href: reference/inspect_ai.solver.qmd#system_message + - text: user_message + href: reference/inspect_ai.solver.qmd#user_message + - text: chain_of_thought + href: reference/inspect_ai.solver.qmd#chain_of_thought + - text: self_critique + href: reference/inspect_ai.solver.qmd#self_critique + - text: multiple_choice + href: reference/inspect_ai.solver.qmd#multiple_choice + - text: basic_agent + href: reference/inspect_ai.solver.qmd#basic_agent + - text: human_agent + href: reference/inspect_ai.solver.qmd#human_agent + - text: bridge + href: reference/inspect_ai.solver.qmd#bridge + - text: chain + href: reference/inspect_ai.solver.qmd#chain + - text: fork + href: reference/inspect_ai.solver.qmd#fork + - text: Solver + href: reference/inspect_ai.solver.qmd#solver + - text: SolverSpec + href: reference/inspect_ai.solver.qmd#solverspec + - text: TaskState + href: reference/inspect_ai.solver.qmd#taskstate + - text: Generate + href: reference/inspect_ai.solver.qmd#generate + - text: solver + href: reference/inspect_ai.solver.qmd#solver + - section: inspect_ai.tool + href: reference/inspect_ai.tool.qmd + contents: + - text: bash + href: reference/inspect_ai.tool.qmd#bash + - text: python + href: reference/inspect_ai.tool.qmd#python + - text: web_search + href: reference/inspect_ai.tool.qmd#web_search + - text: web_browser + href: reference/inspect_ai.tool.qmd#web_browser + - text: computer + href: reference/inspect_ai.tool.qmd#computer + - text: tool_with + href: reference/inspect_ai.tool.qmd#tool_with + - text: ToolDef + href: reference/inspect_ai.tool.qmd#tooldef + - text: Tool + href: reference/inspect_ai.tool.qmd#tool + - text: ToolResult + href: 
reference/inspect_ai.tool.qmd#toolresult + - text: ToolError + href: reference/inspect_ai.tool.qmd#toolerror + - text: ToolCallError + href: reference/inspect_ai.tool.qmd#toolcallerror + - text: ToolChoice + href: reference/inspect_ai.tool.qmd#toolchoice + - text: ToolFunction + href: reference/inspect_ai.tool.qmd#toolfunction + - text: ToolInfo + href: reference/inspect_ai.tool.qmd#toolinfo + - text: ToolParams + href: reference/inspect_ai.tool.qmd#toolparams + - text: ToolParam + href: reference/inspect_ai.tool.qmd#toolparam + - text: JSONType + href: reference/inspect_ai.tool.qmd#jsontype + - text: tool + href: reference/inspect_ai.tool.qmd#tool + - section: inspect_ai.scorer + href: reference/inspect_ai.scorer.qmd + contents: + - text: match + href: reference/inspect_ai.scorer.qmd#match + - text: includes + href: reference/inspect_ai.scorer.qmd#includes + - text: pattern + href: reference/inspect_ai.scorer.qmd#pattern + - text: answer + href: reference/inspect_ai.scorer.qmd#answer + - text: choice + href: reference/inspect_ai.scorer.qmd#choice + - text: f1 + href: reference/inspect_ai.scorer.qmd#f1 + - text: exact + href: reference/inspect_ai.scorer.qmd#exact + - text: model_graded_qa + href: reference/inspect_ai.scorer.qmd#model_graded_qa + - text: model_graded_fact + href: reference/inspect_ai.scorer.qmd#model_graded_fact + - text: multi_scorer + href: reference/inspect_ai.scorer.qmd#multi_scorer + - text: accuracy + href: reference/inspect_ai.scorer.qmd#accuracy + - text: mean + href: reference/inspect_ai.scorer.qmd#mean + - text: std + href: reference/inspect_ai.scorer.qmd#std + - text: stderr + href: reference/inspect_ai.scorer.qmd#stderr + - text: bootstrap_stderr + href: reference/inspect_ai.scorer.qmd#bootstrap_stderr + - text: at_least + href: reference/inspect_ai.scorer.qmd#at_least + - text: pass_at + href: reference/inspect_ai.scorer.qmd#pass_at + - text: max_score + href: reference/inspect_ai.scorer.qmd#max_score + - text: mean_score + href: reference/inspect_ai.scorer.qmd#mean_score + - text: median_score + href: reference/inspect_ai.scorer.qmd#median_score + - text: mode_score + href: reference/inspect_ai.scorer.qmd#mode_score + - text: Scorer + href: reference/inspect_ai.scorer.qmd#scorer + - text: Target + href: reference/inspect_ai.scorer.qmd#target + - text: Score + href: reference/inspect_ai.scorer.qmd#score + - text: Value + href: reference/inspect_ai.scorer.qmd#value + - text: ScoreReducer + href: reference/inspect_ai.scorer.qmd#scorereducer + - text: Metric + href: reference/inspect_ai.scorer.qmd#metric + - text: MetricProtocol + href: reference/inspect_ai.scorer.qmd#metricprotocol + - text: SampleScore + href: reference/inspect_ai.scorer.qmd#samplescore + - text: scorer + href: reference/inspect_ai.scorer.qmd#scorer + - text: metric + href: reference/inspect_ai.scorer.qmd#metric + - text: score_reducer + href: reference/inspect_ai.scorer.qmd#score_reducer + - section: inspect_ai.model + href: reference/inspect_ai.model.qmd + contents: + - text: get_model + href: reference/inspect_ai.model.qmd#get_model + - text: Model + href: reference/inspect_ai.model.qmd#model + - text: GenerateConfig + href: reference/inspect_ai.model.qmd#generateconfig + - text: GenerateConfigArgs + href: reference/inspect_ai.model.qmd#generateconfigargs + - text: ModelOutput + href: reference/inspect_ai.model.qmd#modeloutput + - text: ModelUsage + href: reference/inspect_ai.model.qmd#modelusage + - text: StopReason + href: reference/inspect_ai.model.qmd#stopreason + - text: 
ChatCompletionChoice + href: reference/inspect_ai.model.qmd#chatcompletionchoice + - text: ChatMessage + href: reference/inspect_ai.model.qmd#chatmessage + - text: ChatMessageBase + href: reference/inspect_ai.model.qmd#chatmessagebase + - text: ChatMessageSystem + href: reference/inspect_ai.model.qmd#chatmessagesystem + - text: ChatMessageUser + href: reference/inspect_ai.model.qmd#chatmessageuser + - text: ChatMessageAssistant + href: reference/inspect_ai.model.qmd#chatmessageassistant + - text: ChatMessageTool + href: reference/inspect_ai.model.qmd#chatmessagetool + - text: Content + href: reference/inspect_ai.model.qmd#content + - text: ContentText + href: reference/inspect_ai.model.qmd#contenttext + - text: ContentImage + href: reference/inspect_ai.model.qmd#contentimage + - text: ContentAudio + href: reference/inspect_ai.model.qmd#contentaudio + - text: ContentVideo + href: reference/inspect_ai.model.qmd#contentvideo + - text: Logprob + href: reference/inspect_ai.model.qmd#logprob + - text: Logprobs + href: reference/inspect_ai.model.qmd#logprobs + - text: TopLogprob + href: reference/inspect_ai.model.qmd#toplogprob + - text: CachePolicy + href: reference/inspect_ai.model.qmd#cachepolicy + - text: cache_size + href: reference/inspect_ai.model.qmd#cache_size + - text: cache_clear + href: reference/inspect_ai.model.qmd#cache_clear + - text: cache_list_expired + href: reference/inspect_ai.model.qmd#cache_list_expired + - text: cache_prune + href: reference/inspect_ai.model.qmd#cache_prune + - text: cache_path + href: reference/inspect_ai.model.qmd#cache_path + - text: modelapi + href: reference/inspect_ai.model.qmd#modelapi + - text: ModelAPI + href: reference/inspect_ai.model.qmd#modelapi + - section: inspect_ai.dataset + href: reference/inspect_ai.dataset.qmd + contents: + - text: csv_dataset + href: reference/inspect_ai.dataset.qmd#csv_dataset + - text: json_dataset + href: reference/inspect_ai.dataset.qmd#json_dataset + - text: hf_dataset + href: reference/inspect_ai.dataset.qmd#hf_dataset + - text: Sample + href: reference/inspect_ai.dataset.qmd#sample + - text: FieldSpec + href: reference/inspect_ai.dataset.qmd#fieldspec + - text: RecordToSample + href: reference/inspect_ai.dataset.qmd#recordtosample + - text: Dataset + href: reference/inspect_ai.dataset.qmd#dataset + - text: MemoryDataset + href: reference/inspect_ai.dataset.qmd#memorydataset + - section: inspect_ai.approval + href: reference/inspect_ai.approval.qmd + contents: + - text: auto_approver + href: reference/inspect_ai.approval.qmd#auto_approver + - text: human_approver + href: reference/inspect_ai.approval.qmd#human_approver + - text: Approver + href: reference/inspect_ai.approval.qmd#approver + - text: Approval + href: reference/inspect_ai.approval.qmd#approval + - text: ApprovalDecision + href: reference/inspect_ai.approval.qmd#approvaldecision + - text: ApprovalPolicy + href: reference/inspect_ai.approval.qmd#approvalpolicy + - text: approver + href: reference/inspect_ai.approval.qmd#approver + - section: inspect_ai.log + href: reference/inspect_ai.log.qmd + contents: + - text: list_eval_logs + href: reference/inspect_ai.log.qmd#list_eval_logs + - text: write_eval_log + href: reference/inspect_ai.log.qmd#write_eval_log + - text: read_eval_log + href: reference/inspect_ai.log.qmd#read_eval_log + - text: read_eval_log_sample + href: reference/inspect_ai.log.qmd#read_eval_log_sample + - text: read_eval_log_samples + href: reference/inspect_ai.log.qmd#read_eval_log_samples + - text: convert_eval_logs + href: 
reference/inspect_ai.log.qmd#convert_eval_logs + - text: bundle_log_dir + href: reference/inspect_ai.log.qmd#bundle_log_dir + - text: write_log_dir_manifest + href: reference/inspect_ai.log.qmd#write_log_dir_manifest + - text: retryable_eval_logs + href: reference/inspect_ai.log.qmd#retryable_eval_logs + - text: EvalLogInfo + href: reference/inspect_ai.log.qmd#evalloginfo + - text: EvalLog + href: reference/inspect_ai.log.qmd#evallog + - text: EvalSpec + href: reference/inspect_ai.log.qmd#evalspec + - text: EvalDataset + href: reference/inspect_ai.log.qmd#evaldataset + - text: EvalConfig + href: reference/inspect_ai.log.qmd#evalconfig + - text: EvalRevision + href: reference/inspect_ai.log.qmd#evalrevision + - text: EvalPlan + href: reference/inspect_ai.log.qmd#evalplan + - text: EvalPlanStep + href: reference/inspect_ai.log.qmd#evalplanstep + - text: EvalResults + href: reference/inspect_ai.log.qmd#evalresults + - text: EvalScore + href: reference/inspect_ai.log.qmd#evalscore + - text: EvalMetric + href: reference/inspect_ai.log.qmd#evalmetric + - text: EvalSampleReductions + href: reference/inspect_ai.log.qmd#evalsamplereductions + - text: EvalStats + href: reference/inspect_ai.log.qmd#evalstats + - text: EvalError + href: reference/inspect_ai.log.qmd#evalerror + - text: EvalSample + href: reference/inspect_ai.log.qmd#evalsample + - text: EvalSampleLimit + href: reference/inspect_ai.log.qmd#evalsamplelimit + - text: EvalSampleReductions + href: reference/inspect_ai.log.qmd#evalsamplereductions + - text: EvalSampleScore + href: reference/inspect_ai.log.qmd#evalsamplescore + - text: transcript + href: reference/inspect_ai.log.qmd#transcript + - text: Transcript + href: reference/inspect_ai.log.qmd#transcript + - text: Event + href: reference/inspect_ai.log.qmd#event + - text: SampleInitEvent + href: reference/inspect_ai.log.qmd#sampleinitevent + - text: SampleLimitEvent + href: reference/inspect_ai.log.qmd#samplelimitevent + - text: StateEvent + href: reference/inspect_ai.log.qmd#stateevent + - text: StoreEvent + href: reference/inspect_ai.log.qmd#storeevent + - text: ModelEvent + href: reference/inspect_ai.log.qmd#modelevent + - text: ToolEvent + href: reference/inspect_ai.log.qmd#toolevent + - text: ApprovalEvent + href: reference/inspect_ai.log.qmd#approvalevent + - text: InputEvent + href: reference/inspect_ai.log.qmd#inputevent + - text: StoreEvent + href: reference/inspect_ai.log.qmd#storeevent + - text: ErrorEvent + href: reference/inspect_ai.log.qmd#errorevent + - text: LoggerEvent + href: reference/inspect_ai.log.qmd#loggerevent + - text: LoggingLevel + href: reference/inspect_ai.log.qmd#logginglevel + - text: LoggingMessage + href: reference/inspect_ai.log.qmd#loggingmessage + - text: InfoEvent + href: reference/inspect_ai.log.qmd#infoevent + - text: StepEvent + href: reference/inspect_ai.log.qmd#stepevent + - text: SubtaskEvent + href: reference/inspect_ai.log.qmd#subtaskevent + - section: inspect_ai.util + href: reference/inspect_ai.util.qmd + contents: + - text: Store + href: reference/inspect_ai.util.qmd#store + - text: store + href: reference/inspect_ai.util.qmd#store + - text: store_as + href: reference/inspect_ai.util.qmd#store_as + - text: StoreModel + href: reference/inspect_ai.util.qmd#storemodel + - text: concurrency + href: reference/inspect_ai.util.qmd#concurrency + - text: subprocess + href: reference/inspect_ai.util.qmd#subprocess + - text: ExecResult + href: reference/inspect_ai.util.qmd#execresult + - text: display_type + href: 
reference/inspect_ai.util.qmd#display_type + - text: DisplayType + href: reference/inspect_ai.util.qmd#displaytype + - text: input_screen + href: reference/inspect_ai.util.qmd#input_screen + - text: subtask + href: reference/inspect_ai.util.qmd#subtask + - text: Subtask + href: reference/inspect_ai.util.qmd#subtask + - text: resource + href: reference/inspect_ai.util.qmd#resource + - text: throttle + href: reference/inspect_ai.util.qmd#throttle + - text: trace_action + href: reference/inspect_ai.util.qmd#trace_action + - text: trace_message + href: reference/inspect_ai.util.qmd#trace_message + - text: sandbox + href: reference/inspect_ai.util.qmd#sandbox + - text: sandbox_with + href: reference/inspect_ai.util.qmd#sandbox_with + - text: SandboxEnvironment + href: reference/inspect_ai.util.qmd#sandboxenvironment + - text: SandboxConnection + href: reference/inspect_ai.util.qmd#sandboxconnection + - text: sandboxenv + href: reference/inspect_ai.util.qmd#sandboxenv + - section: Inspect CLI + href: reference/inspect_eval.qmd + contents: + - text: inspect eval + href: reference/inspect_eval.qmd + - text: inspect eval-retry + href: reference/inspect_eval-retry.qmd + - text: inspect eval-set + href: reference/inspect_eval-set.qmd + - text: inspect score + href: reference/inspect_score.qmd + - text: inspect view + href: reference/inspect_view.qmd + - text: inspect log + href: reference/inspect_log.qmd + - text: inspect trace + href: reference/inspect_trace.qmd + - text: inspect sandbox + href: reference/inspect_sandbox.qmd + - text: inspect cache + href: reference/inspect_cache.qmd + - text: inspect list + href: reference/inspect_list.qmd + - text: inspect info + href: reference/inspect_info.qmd \ No newline at end of file diff --git a/docs/reference/filter/commands.py b/docs/reference/filter/commands.py new file mode 100644 index 000000000..c35a87660 --- /dev/null +++ b/docs/reference/filter/commands.py @@ -0,0 +1,393 @@ +# (C) Datadog, Inc. 
2020-present +# All rights reserved +# Licensed under the Apache license (see LICENSE) +# from https://github.com/mkdocs/mkdocs-click/blob/master/mkdocs_click/_docs.py + +from __future__ import annotations + +import importlib +import inspect +from contextlib import ExitStack, contextmanager +from typing import Any, Iterator, cast + +import click +from markdown.extensions.toc import slugify + + +def make_command_docs( + command: str, + depth: int = 0, + style: str = "table", + remove_ascii_art: bool = False, + show_hidden: bool = False, + list_subcommands: bool = True, + has_attr_list: bool = True, +) -> Iterator[str]: + """Create the Markdown lines for a command and its sub-commands.""" + command = command.replace("-", "_") + module = "eval" if command.startswith("eval") else command + for line in _recursively_make_command_docs( + f"inspect {command}", + load_command(f"inspect_ai._cli.{module}", f"{command}_command"), + depth=depth, + style=style, + remove_ascii_art=remove_ascii_art, + show_hidden=show_hidden, + list_subcommands=list_subcommands, + has_attr_list=has_attr_list, + ): + if line.strip() == "\b": + continue + + yield line + + +def _recursively_make_command_docs( + prog_name: str, + command: click.BaseCommand, + parent: click.Context | None = None, + depth: int = 0, + style: str = "plain", + remove_ascii_art: bool = False, + show_hidden: bool = False, + list_subcommands: bool = False, + has_attr_list: bool = False, +) -> Iterator[str]: + """Create the raw Markdown lines for a command and its sub-commands.""" + ctx = _build_command_context(prog_name=prog_name, command=command, parent=parent) + + if ctx.command.hidden and not show_hidden: + return + + subcommands = _get_sub_commands(ctx.command, ctx) + + if parent is not None: + yield from _make_title(ctx, depth, has_attr_list=has_attr_list) + yield from _make_description(ctx, remove_ascii_art=remove_ascii_art) + yield from _make_usage(ctx) + if len(subcommands) == 0: + yield from _make_options(ctx, style, show_hidden=show_hidden) + return + + if list_subcommands: + yield from _make_subcommands_links( + subcommands, + ctx, + has_attr_list=has_attr_list, + show_hidden=show_hidden, + ) + + for command in subcommands: + yield from _recursively_make_command_docs( + cast(str, command.name), + command, + parent=ctx, + depth=depth + 1, + style=style, + show_hidden=show_hidden, + list_subcommands=list_subcommands, + has_attr_list=has_attr_list, + ) + + +def _build_command_context( + prog_name: str, command: click.BaseCommand, parent: click.Context | None +) -> click.Context: + return click.Context(cast(click.Command, command), info_name=prog_name, parent=parent) + + +def _get_sub_commands(command: click.Command, ctx: click.Context) -> list[click.Command]: + """Return subcommands of a Click command.""" + subcommands = getattr(command, "commands", {}) + if subcommands: + return list(subcommands.values()) + + if not isinstance(command, click.MultiCommand): + return [] + + subcommands = [] + + for name in command.list_commands(ctx): + subcommand = command.get_command(ctx, name) + assert subcommand is not None + subcommands.append(subcommand) + + return subcommands + + +def _make_title(ctx: click.Context, depth: int, *, has_attr_list: bool) -> Iterator[str]: + """Create the Markdown heading for a command.""" + if has_attr_list: + yield from _make_title_full_command_path(ctx, depth) + else: + yield from _make_title_basic(ctx, depth) + + +def _make_title_basic(ctx: click.Context, depth: int) -> Iterator[str]: + """Create a basic Markdown heading 
for a command.""" + yield f"{'#' * (depth + 1)} {ctx.info_name}" + yield "" + + +def _make_title_full_command_path(ctx: click.Context, depth: int) -> Iterator[str]: + """Create the markdown heading for a command, showing the full command path. + + This style accomodates nested commands by showing: + * The full command path for headers and permalinks (eg `# git commit` and `http://localhost:8000/#git-commit`) + * The command leaf name only for TOC entries (eg `* commit`). + + We do this because a TOC naturally conveys the hierarchy, whereas headings and permalinks should be namespaced to + convey the hierarchy. + + See: https://github.com/mkdocs/mkdocs-click/issues/35 + """ + text = ctx.command_path # 'git commit' + permalink = slugify(ctx.command_path, "-") # 'git-commit' + toc_label = ctx.info_name # 'commit' + + # Requires `attr_list` extension, see: https://python-markdown.github.io/extensions/toc/#custom-labels + attributes = f"#{permalink} data-toc-label='{toc_label}'" + + yield f"{'#' * (depth + 1)} {text} {{{attributes}}}" + yield "" + + +def _make_description(ctx: click.Context, remove_ascii_art: bool = False) -> Iterator[str]: + """Create markdown lines based on the command's own description.""" + help_string = ctx.command.help or ctx.command.short_help + + if not help_string: + return + + # https://github.com/pallets/click/pull/2151 + help_string = inspect.cleandoc(help_string) + + if not remove_ascii_art: + yield from help_string.splitlines() + yield "" + return + + skipped_ascii_art = True + for i, line in enumerate(help_string.splitlines()): + if skipped_ascii_art is False: + if not line.strip(): + skipped_ascii_art = True + continue + elif i == 0 and line.strip() == "\b": + skipped_ascii_art = False + + if skipped_ascii_art: + yield line + yield "" + + +def _make_usage(ctx: click.Context) -> Iterator[str]: + """Create the Markdown lines from the command usage string.""" + + # Gets the usual 'Usage' string without the prefix. + formatter = ctx.make_formatter() + pieces = ctx.command.collect_usage_pieces(ctx) + formatter.write_usage(ctx.command_path, " ".join(pieces), prefix="") + usage = formatter.getvalue().rstrip("\n") + + yield "#### Usage" + yield "" + yield "```text" + yield usage + yield "```" + yield "" + + +def _make_options( + ctx: click.Context, style: str = "plain", show_hidden: bool = False +) -> Iterator[str]: + """Create the Markdown lines describing the options for the command.""" + + if style == "plain": + return _make_plain_options(ctx, show_hidden=show_hidden) + elif style == "table": + return _make_table_options(ctx, show_hidden=show_hidden) + else: + raise RuntimeError( + f"{style} is not a valid option style, which must be either `plain` or `table`." 
+ ) + + +@contextmanager +def _show_options(ctx: click.Context) -> Iterator[None]: + """Context manager that temporarily shows all hidden options.""" + options = [ + opt for opt in ctx.command.get_params(ctx) if isinstance(opt, click.Option) and opt.hidden + ] + + try: + for option in options: + option.hidden = False + yield + finally: + for option in options: + option.hidden = True + + +def _make_plain_options(ctx: click.Context, show_hidden: bool = False) -> Iterator[str]: + """Create the plain style options description.""" + with ExitStack() as stack: + if show_hidden: + stack.enter_context(_show_options(ctx)) + + formatter = ctx.make_formatter() + click.Command.format_options(ctx.command, ctx, formatter) + + option_lines = formatter.getvalue().splitlines() + + # First line is redundant "Options" + option_lines = option_lines[1:] + + if not option_lines: # pragma: no cover + # We expect at least `--help` to be present. + raise RuntimeError("Expected at least one option") + + yield "#### Options" + yield "" + yield "```text" + yield from option_lines + yield "```" + yield "" + + +# Unicode "Vertical Line" character (U+007C), HTML-compatible. +# "\|" (escaped pipe) would work, too, but linters don't like it in literals. +# https://stackoverflow.com/questions/23723396/how-to-show-the-pipe-symbol-in-markdown-table +_HTML_PIPE = "|" + + +def _format_table_option_type(option: click.Option) -> str: + typename = option.type.name + + + if isinstance(option.type, click.Choice): + # @click.option(..., type=click.Choice(["A", "B", "C"])) + # -> choices (`A` | `B` | `C`) + choices = f" {_HTML_PIPE} ".join(f"`{choice}`" for choice in option.type.choices) + return f"{typename} ({choices})" + + if isinstance(option.type, click.DateTime): + # @click.option(..., type=click.DateTime(["A", "B", "C"])) + # -> datetime (`%Y-%m-%d` | `%Y-%m-%dT%H:%M:%S` | `%Y-%m-%d %H:%M:%S`) + formats = f" {_HTML_PIPE} ".join(f"`{fmt}`" for fmt in option.type.formats) + return f"{typename} ({formats})" + + if isinstance(option.type, (click.IntRange, click.FloatRange)): + if option.type.min is not None and option.type.max is not None: + # @click.option(..., type=click.IntRange(min=0, max=10)) + # -> integer range (between `0` and `10`) + return f"{typename} (between `{option.type.min}` and `{option.type.max}`)" + elif option.type.min is not None: + # @click.option(..., type=click.IntRange(min=0)) + # -> integer range (`0` and above) + return f"{typename} (`{option.type.min}` and above)" + else: + # @click.option(..., type=click.IntRange(max=10)) + # -> integer range (`10` and below) + return f"{typename} (`{option.type.max}` and below)" + + # -> "boolean", "text", etc. + return typename + + +def _format_table_option_row(option: click.Option) -> str: + # Example: @click.option("-V, --version/--show-version", is_flag=True, help="Show version info.") + + # -> "`-V`, `--version`" + names = ", ".join(f"`{opt}`" for opt in option.opts) + + if option.secondary_opts: + # -> "`-V`, `--version` / `--show-info`" + names += " / " + names += ", ".join(f"`{opt}`" for opt in option.secondary_opts) + + # -> "boolean" + value_type = _format_table_option_type(option) + + # -> "Show version info." + description = option.help if option.help is not None else "N/A" + + # -> `False` + none_default_msg = "_required" if option.required else "None" + default = f"`{option.default}`" if option.default is not None else none_default_msg + + # -> "| `-V`, `--version` / `--show-version` | boolean | Show version info. 
| `False` |" + return f"| {names} | {value_type} | {description} | {default} |" + + +def _make_table_options(ctx: click.Context, show_hidden: bool = False) -> Iterator[str]: + """Create the table style options description.""" + + options = [param for param in ctx.command.get_params(ctx) if isinstance(param, click.Option)] + options = [option for option in options if not option.hidden or show_hidden] + option_rows = [_format_table_option_row(option) for option in options] + + yield "#### Options" + yield "" + yield "| Name | Type | Description | Default |" + yield "| ---- | ---- | ----------- | ------- |" + yield from option_rows + yield ": {.sm .borderless tbl-colwidths=[25,15,50,10]}" + yield "" + + +def _make_subcommands_links( + subcommands: list[click.Command], + parent: click.Context, + has_attr_list: bool, + show_hidden: bool, +) -> Iterator[str]: + yield "#### Subcommands" + yield "" + yield "| | |" + yield "| ---- | ----------- |" + for command in subcommands: + command_name = cast(str, command.name) + ctx = _build_command_context(command_name, command, parent) + if ctx.command.hidden and not show_hidden: + continue + command_bullet = ( + command_name + if not has_attr_list + else f"[{command_name}](#{slugify(ctx.command_path, '-')})" + ) + help_string = ctx.command.short_help or ctx.command.help + if help_string is not None: + help_string = help_string.splitlines()[0] + else: + help_string = "*No description was provided with this command.*" + + yield f"| {command_bullet} | {help_string} |" + yield ": {.borderless tbl-colwidths=[35,65]}" + yield "" + + +def load_command(module: str, attribute: str) -> click.BaseCommand: + """ + Load and return the Click command object located at '<module>:<attribute>'. + """ + command = _load_obj(module, attribute) + + if not isinstance(command, click.BaseCommand): + raise RuntimeError( + f"{attribute!r} must be a 'click.BaseCommand' object, got {type(command)}" + ) + + return command + + +def _load_obj(module: str, attribute: str) -> Any: + try: + mod = importlib.import_module(module) + except SystemExit: + raise RuntimeError("the module appeared to call sys.exit()") # pragma: no cover + + try: + return getattr(mod, attribute) + except AttributeError: + raise RuntimeError(f"Module {module!r} has no attribute {attribute!r}") \ No newline at end of file diff --git a/docs/reference/filter/filter.py b/docs/reference/filter/filter.py new file mode 100644 index 000000000..14490afc6 --- /dev/null +++ b/docs/reference/filter/filter.py @@ -0,0 +1,57 @@ +import subprocess +import sys +from typing import cast + +from griffe import Module +import griffe +import panflute as pf # type: ignore + +from parse import DocParseOptions, parse_docs +from render import render_docs +from commands import make_command_docs + + +def main(): + # create options + module = cast(Module, griffe.load("inspect_ai")) + sha = ( + subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True) + .stdout.decode() + .strip() + ) + source_url = f"https://github.com/UKGovernmentBEIS/inspect_ai/blob/{sha}/src" + parse_options = DocParseOptions(module=module, source_url=source_url) + + # python api -- convert h3 into reference + def python_api(elem: pf.Element, doc: pf.Doc): + if isinstance(elem, pf.Header) and elem.level == 3: + title = pf.stringify(doc.metadata["title"]) + if title.startswith("inspect_ai"): + if title.startswith("inspect_ai."): + # get target object + module = title.removeprefix("inspect_ai.") + object = f"{module}.{pf.stringify(elem.content)}" + else: + object = 
pf.stringify(elem.content) + + # parse docs + docs = parse_docs(object, parse_options) + + # render docs + return render_docs(elem, docs) + + # click cli + def click_cli(elem: pf.Element, doc: pf.Doc): + if isinstance(elem, pf.Doc): + title = pf.stringify(doc.metadata["title"]) + if title.startswith("inspect "): + command = title.split(" ")[1] + docs = "\n".join(list(make_command_docs(command))) + doc.content.append(pf.RawBlock(docs, "markdown")) + + + return pf.run_filters([python_api, click_cli]) + + +if __name__ == "__main__": + main() diff --git a/docs/reference/filter/interlink.lua b/docs/reference/filter/interlink.lua new file mode 100644 index 000000000..2511d2549 --- /dev/null +++ b/docs/reference/filter/interlink.lua @@ -0,0 +1,57 @@ +-- interlink filter for inline code +local pandoc = require('pandoc') + +-- read the refs index +refs = {} +is_reference = string.find(quarto.doc.input_file, "reference/") ~= nil +if is_reference then + refs_path = "refs.json" +else + refs_path = "reference/refs.json" +end + +local refs_file = io.open(refs_path, "r") + +if refs_file ~= nil then + refs = pandoc.json.decode(refs_file:read("a")) + refs_file:close() +end + +local function is_class_name(str) + -- ^[A-Z] checks for capital letter at start + -- %S+ checks for one or more non-space characters + -- $ ensures we reach the end of string + return string.find(str, "^[A-Z]%S+$") ~= nil +end + +local function is_function_call(str) + -- ^%S+ checks for one or more non-space characters from start + -- %(%)$ checks for literal () at the end + return string.find(str, "^%S+%(%)$") ~= nil +end + +local function create_interlink(text, ref) + if is_reference then + prefix = "" + else + prefix = "reference/" + end + return pandoc.Span(pandoc.Link(pandoc.Str(text), prefix .. 
ref), + { class = "element-type-name ref-interlink" }) +end + + +function Code(el) + if is_class_name(el.text) then + ref = refs[el.text] + if ref ~= nil then + return create_interlink(el.text, ref) + end + elseif is_function_call(el.text) then + func = string.sub(el.text, 1, -3) + ref = refs[func] + if ref ~= nil then + return create_interlink(el.text, ref) + end + end +end diff --git a/docs/reference/filter/parse.py b/docs/reference/filter/parse.py new file mode 100644 index 000000000..34924f510 --- /dev/null +++ b/docs/reference/filter/parse.py @@ -0,0 +1,307 @@ +from dataclasses import dataclass +from itertools import islice +from pathlib import Path +from typing import Any, NamedTuple, cast +from griffe import ( + Alias, + Attribute, + Class, + DocstringSection, + DocstringSectionExamples, + DocstringSectionParameters, + DocstringSectionRaises, + Expr, + Function, + Module, + Object, + ParameterKind, +) + + +@dataclass +class DocParseOptions: + module: Module + source_url: str + + +@dataclass +class DocParameter: + name: str + type: str + required: bool + default: Any + description: str + + +@dataclass +class DocRaises: + type: str + description: str + + +@dataclass +class DocAttribute: + name: str + type: str + description: str + + +@dataclass +class DocObject: + name: str + description: str + source: str + declaration: str + examples: str | None + text_sections: list[str] + + +@dataclass +class DocFunction(DocObject): + parameters: list[DocParameter] + raises: list[DocRaises] + + +@dataclass +class DocClass(DocObject): + attributes: list[DocAttribute] + methods: list[DocFunction] + + +def parse_docs(path: str, options: DocParseOptions) -> DocObject: + # lookup object + object: Object | Alias = options.module + for segment in path.split("."): + object = object.members[segment] + + # resolve aliases + if isinstance(object, Alias): + object = object.final_target + + # type-specific parsing + if isinstance(object, Function): + return parse_function_docs(object, options) + elif isinstance(object, Class): + return parse_class_docs(object, options) + elif isinstance(object, Attribute): + return parse_attribute_docs(object, options) + else: + raise ValueError( + f"Reference object type ({type(object)}) for {path} is unsupported." 
+ ) + + +def parse_attribute_docs(attrib: Attribute, options: DocParseOptions) -> DocObject: + source, declaration, docstrings = read_source(attrib, options) + + return DocObject( + name=attrib.name, + description=docstrings[0].value, + source=source, + declaration=declaration, + examples=None, + text_sections=[], + ) + + +def parse_class_docs(clz: Class, options: DocParseOptions) -> DocObject: + # if this is a protocol then ammend the declaration w/ the __call__ + is_protocol = clz.bases and str(clz.bases[0]) == "Protocol" + if is_protocol: + # read from call (substituting the protocol name) + call = cast(Function, clz.members["__call__"]) + call_docs = parse_function_docs(call, options) + call_docs.name = clz.name + call_docs.declaration = f"class {clz.name}(Protocol):\n{call_docs.declaration}" + return call_docs + else: + # read source + source, declaration, docstrings = read_source(clz, options) + + # read attributes and methods + attributes: list[DocAttribute] = [] + methods: list[DocFunction] = [] + for member in clz.members.values(): + if member.docstring is None: + continue + + if isinstance(member, Attribute): + if not isinstance(member.annotation, Expr): + continue + if member.name.startswith("_"): + continue + if "deprecated" in member.docstring.value.lower(): + continue + attributes.append( + DocAttribute( + name=member.name, + type=str(member.annotation.modernize()), + description=member.docstring.value, + ) + ) + elif isinstance(member, Function) and include_function(member): + methods.append(parse_function_docs(member, options)) + + # return as a class + return DocClass( + name=clz.name, + description=docstrings[0].value, + source=source, + declaration=declaration, + examples=None, + text_sections=[], + attributes=attributes, + methods=methods, + ) + + +def include_function(function: Function) -> bool: + # skip private + if function.name.startswith("_") and not function.name.startswith("__init__"): + return False + + # skip pydantic validators + if "classmethod" in function.labels: + if any(["model_" in str(dec.value) for dec in function.decorators]): + return False + + return True + + +class DocstringContent(NamedTuple): + description: str + parameter_descriptions: dict[str, str] + raises: dict[str, str] + examples: str | None + text_sections: list[str] + + +def parse_function_docs(function: Function, options: DocParseOptions) -> DocFunction: + # read source + source, declaration, docstrings = read_source(function, options) + + # read docstring sections + docstring_content = read_docstring_sections(docstrings) + + # extract params + params = read_params(function, docstring_content.parameter_descriptions) + + # extract raises + raises = [ + DocRaises(type=k, description=v) for k, v in docstring_content.raises.items() + ] + + # return function + return DocFunction( + name=function.name, + description=docstring_content.description, + source=source, + declaration=declaration, + examples=docstring_content.examples, + text_sections=docstring_content.text_sections, + parameters=params, + raises=raises, + ) + + +def read_params( + function: Function, parameter_descriptions: dict[str, str] +) -> list[DocParameter]: + # extract params + params: list[DocParameter] = [] + for p in function.parameters: + # skip self + if p.name == "self" or p.name == "cls": + continue + + # param name w/ varargs prefix + name = p.name + if p.kind == ParameterKind.var_positional: + name = f"*{name}" + elif p.kind == ParameterKind.var_keyword: + name = f"**{name}" + + params.append( + DocParameter( + 
name=name, + type=str(p.annotation.modernize()) + if isinstance(p.annotation, Expr) + else str(p.annotation), + required=p.required, + default=str(p.default) if p.required else "", + description=parameter_descriptions[name], + ) + ) + + return params + + +def read_docstring_sections(docstrings: list[DocstringSection]) -> DocstringContent: + # main text + description = docstrings[0].value + + examples: str | None = None + text_sections: list[str] = [] + parameter_descriptions: dict[str, str] = {} + raises: dict[str, str] = {} + for doc_section in docstrings[1:]: + if isinstance(doc_section, DocstringSectionParameters): + for p in doc_section.value: + desc = p.description.strip() + parameter_descriptions[p.name] = desc + elif isinstance(doc_section, DocstringSectionExamples): + examples = "\n\n".join(value[1] for value in doc_section.value) + elif isinstance(doc_section, DocstringSectionRaises): + for r in doc_section.value: + raises[str(r.annotation)] = r.description + + return DocstringContent( + description=description, + parameter_descriptions=parameter_descriptions, + raises=raises, + examples=examples, + text_sections=text_sections, + ) + + +def read_source( + object: Object, options: DocParseOptions +) -> tuple[str, str, list[DocstringSection]]: + # assert preconditions + assert isinstance(object.filepath, Path) + assert object.lineno is not None + assert object.docstring is not None + assert object.docstring.lineno is not None + + # url to code + source = f"{options.source_url}/{object.relative_package_filepath}#L{object.lineno}" + + # read function source code + declaration = format_declaration( + read_lines(object.filepath, object.lineno, object.docstring.lineno - 1) + ) + + # read docstrings + docstrings = object.docstring.parse("google") + + # return + return source, declaration, docstrings + + +def read_declaration(object: Object | Alias) -> str: + assert isinstance(object.filepath, Path) + assert object.lineno + assert object.endlineno + return format_declaration( + read_lines(object.filepath, object.lineno, object.endlineno) + ) + + +def read_lines(filename: Path, start_line: int, end_line: int) -> list[str]: + with open(filename, "r") as file: + return list(islice(file, start_line - 1, end_line)) + + +def format_declaration(lines: list[str]) -> str: + code = "".join(lines) + return code.removesuffix(":\n") diff --git a/docs/reference/filter/post.lua b/docs/reference/filter/post.lua new file mode 100644 index 000000000..4a8f2b80c --- /dev/null +++ b/docs/reference/filter/post.lua @@ -0,0 +1,54 @@ +--- filter for post-processing the main ref generation filter (filter.py) +--- does interlinks for reference links and renders raw markdown blocks + +local pandoc = require('pandoc') + +-- read refs index +refs_file = io.open("refs.json", "r") +refs = pandoc.json.decode(refs_file:read("a")) +refs_file:close() + +function Span(el) + if el.classes:includes("element-type-name") and not el.classes:includes("ref-interlink") then + type = pandoc.utils.stringify(el) + type_ref = refs[type] + if type_ref ~= nil then + el.content = pandoc.Link(el.content:clone(), type_ref) + return el + end + end +end + +function RawBlock(raw) + -- Only process markdown raw blocks + if raw.format ~= 'markdown' then + return nil + end + + -- Parse the markdown content into pandoc AST + -- Note: pandoc.read returns a Pandoc document, we want its blocks + local doc = pandoc.read(raw.text, 'markdown+autolink_bare_uris') + if doc and doc.blocks then + return doc.blocks + end + return nil +end + +function 
RawInline(raw) + -- Only process markdown raw inlines + if raw.format ~= 'markdown' then + return nil + end + + -- Parse the markdown content into pandoc AST + local doc = pandoc.read(raw.text, 'markdown+autolink_bare_uris') + if doc and doc.blocks then + -- For inline content, we want the inlines from the first block + -- (typically a Para or Plain block) + local first_block = doc.blocks[1] + if first_block then + return first_block.content + end + end + return nil +end diff --git a/docs/reference/filter/render.py b/docs/reference/filter/render.py new file mode 100644 index 000000000..a5ae429f9 --- /dev/null +++ b/docs/reference/filter/render.py @@ -0,0 +1,221 @@ +from textwrap import dedent +import panflute as pf # type: ignore +from parse import DocAttribute, DocClass, DocFunction, DocObject, DocParameter + + +# render reference elements +def render_docs(elem: pf.Header, docs: DocObject) -> list[pf.Element]: + # remove 'beta' + title = pf.stringify(elem) + if title.startswith("beta."): + title = title.removeprefix("beta.") + elem.content = [pf.Str(title)] + + elements: list[pf.Element] = [elem] + elements.append(pf.RawBlock(docs.description, "markdown")) + + # source link + elements.append(render_source_link(docs)) + + # declaration + elements.append( + pf.CodeBlock(docs.declaration, classes=["python", "doc-declaration"]) + ) + + # type specific rendering + if isinstance(docs, DocFunction): + if docs.parameters: + elements.append(render_params(docs.parameters)) + elif isinstance(docs, DocClass): + if docs.attributes: + elements.append(pf.Header(pf.Str("Attributes"), level=4)) + elements.append(render_attributes(docs.attributes)) + if docs.methods: + elements.append( + pf.Header(pf.Str("Methods"), level=4, classes=["class-methods"]) + ) + elements.append(render_methods(docs.methods)) + + # other sections + for section in docs.text_sections: + elements.append(pf.RawBlock(section, "markdown")) + + # examples + if docs.examples is not None: + elements.append(pf.Header(pf.Str("Examples"), level=4)) + elements.append(pf.RawBlock(docs.examples, "markdown")) + + # return elements + return elements + + +def render_attributes(attribs: list[DocAttribute]) -> pf.Table: + return render_element_list(attribs) + + +def render_methods(methods: list[DocFunction]) -> pf.DefinitionList: + return pf.DefinitionList( + *[render_method_definition_item(method) for method in methods] + ) + + +def render_source_link(object: DocObject) -> pf.Div: + return pf.Div( + pf.Plain(pf.Link(pf.Str("Source"), url=object.source)), classes=["source-link"] + ) + + +def render_method_definition_item(method: DocFunction) -> pf.DefinitionItem: + return pf.DefinitionItem( + [pf.Str(method.name)], + [ + pf.Definition( + pf.RawBlock(method.description, format="markdown"), + render_source_link(method), + pf.CodeBlock(dedent(method.declaration), classes=["python"]), + render_params(method.parameters), + ) + ], + ) + + +def render_params(params: list[DocParameter]) -> pf.Table | pf.Div: + if len(params) > 0: + return render_element_list(params) + else: + return pf.Div() + + +def render_element_list( + elements: list[DocAttribute] | list[DocParameter], +) -> pf.DefinitionList: + return pf.DefinitionList( + *[render_element_definition_item(element) for element in elements] + ) + + +def render_element_definition_item( + element: DocAttribute | DocParameter, +) -> pf.DefinitionItem: + return pf.DefinitionItem( + [pf.Code(element.name, classes=["ref-definition"]), pf.Space(), render_element_type(element.type)], + 
[pf.Definition(pf.RawBlock(element.description, format="markdown"))], + ) + + +def render_element_type(type: str) -> pf.Span: + element_type: list[pf.Inline] = [] + for token, token_type in tokenize_type_declaration(type): + if token_type == "text": + element_type.append(pf.Str(token)) + else: + element_type.append(pf.Span(pf.Str(token), classes=["element-type-name"])) + + return pf.Span(*element_type, classes=["element-type"]) + + +def render_params_header() -> pf.TableHead: + return pf.TableHead( + pf.TableRow( + pf.TableCell(pf.Plain(pf.Str("Argument"))), + pf.TableCell(pf.Plain(pf.Str("Description"))), + ) + ) + + +def render_header(col1: str, col2: str) -> pf.TableHead: + return pf.TableHead( + pf.TableRow( + pf.TableCell(pf.Plain(pf.Str(col1))), pf.TableCell(pf.Plain(pf.Str(col2))) + ) + ) + + +def tokenize_type_declaration(type_str: str) -> list[tuple[str, str]]: + common_types = { + "Any", + "Dict", + "List", + "Set", + "Tuple", + "Optional", + "Union", + "Callable", + "Iterator", + "Iterable", + "Generator", + "Type", + "TypeVar", + "Generic", + "Protocol", + "NamedTuple", + "TypedDict", + "Literal", + "Final", + "ClassVar", + "NoReturn", + "Never", + "Self", + "int", + "str", + "float", + "bool", + "bytes", + "object", + "None", + "Sequence", + "Mapping", + "MutableMapping", + "Awaitable", + "Coroutine", + "AsyncIterator", + "AsyncIterable", + "ContextManager", + "AsyncContextManager", + "Pattern", + "Match", + } + + tokens = [] + current_token = "" + + def add_token(token: str, force_type: str | None = None) -> None: + """Helper function to add a token with its classified type.""" + if not token: + return + + if force_type: + token_type = force_type + elif token in common_types or (token[0].isupper() and token.isalnum()): + token_type = "type" + else: + token_type = "text" + + tokens.append((token, token_type)) + + i = 0 + while i < len(type_str): + char = type_str[i] + + # Handle whitespace + if char.isspace(): + add_token(current_token) + add_token(char, "text") + current_token = "" + + # Handle special characters + elif char in "[](),|": + add_token(current_token) + add_token(char, "text") + current_token = "" + + # Build up identifier + else: + current_token += char + + i += 1 + + # Add any remaining token + add_token(current_token) + + return tokens diff --git a/docs/reference/filter/sidebar.py b/docs/reference/filter/sidebar.py new file mode 100644 index 000000000..fe4500340 --- /dev/null +++ b/docs/reference/filter/sidebar.py @@ -0,0 +1,115 @@ +import json +import os +import yaml + + +# only execute if a reference doc is in the inputs +input_files = os.getenv("QUARTO_PROJECT_INPUT_FILES", "") +if "reference/inspect_ai" not in input_files: + exit(0) + +# register reference docs (this defines their sidebar order) +reference_docs = ["reference/inspect_ai.qmd"] + [ + f"reference/inspect_ai.{doc}" + for doc in [ + "solver.qmd", + "tool.qmd", + "scorer.qmd", + "model.qmd", + "dataset.qmd", + "approval.qmd", + "log.qmd", + "util.qmd", + ] +] + +# build sidebar yaml +sidebar = yaml.safe_load(""" +website: + sidebar: + - title: Reference + style: docked + collapse-level: 2 + contents: + - reference/index.qmd + - section: Python API + href: reference/inspect_ai.qmd + contents: [] + - section: Inspect CLI + href: reference/inspect_eval.qmd + contents: + - text: inspect eval + href: reference/inspect_eval.qmd + - text: inspect eval-retry + href: reference/inspect_eval-retry.qmd + - text: inspect eval-set + href: reference/inspect_eval-set.qmd + - text: inspect score + href: 
reference/inspect_score.qmd + - text: inspect view + href: reference/inspect_view.qmd + - text: inspect log + href: reference/inspect_log.qmd + - text: inspect trace + href: reference/inspect_trace.qmd + - text: inspect sandbox + href: reference/inspect_sandbox.qmd + - text: inspect cache + href: reference/inspect_cache.qmd + - text: inspect list + href: reference/inspect_list.qmd + - text: inspect info + href: reference/inspect_info.qmd +""") +contents_yaml = sidebar["website"]["sidebar"][0]["contents"][1]["contents"] + +# build index (for cross linking) +index_json: dict[str, str] = {} + + +# helper to parse reference objects from qmd +def parse_reference_objects(markdown: str) -> list[str]: + objects: list[str] = [] + for line in markdown.splitlines(): + if line.startswith("### "): + line = line.removeprefix("### ").removeprefix("beta.") + objects.append(line.removeprefix("### ")) + + return objects + + +# build for each reference doc +for doc in reference_docs: + with open(doc, "r") as f: + objects = parse_reference_objects(f.read()) + refs = [dict(text=o, href=f"{doc}#{o.lower()}") for o in objects] + for ref in refs: + index_json[ref["text"]] = ref["href"].removeprefix("reference/") + + # add section to sidebar + section = doc.removeprefix("reference/").removesuffix(".qmd") + record = dict(section=section, href=doc, contents=refs) + contents_yaml.append(record) + + +# write ref index +index_file = "reference/refs.json" +with open(index_file, "w") as f: + json.dump(index_json, f, indent=2) + +# dump as yaml +sidebar_yaml = yaml.dump(sidebar, sort_keys=False).strip() + +# read previous sidebar +sidebar_file = "reference/_sidebar.yml" +if os.path.exists(sidebar_file): + with open(sidebar_file, "r") as f: + previous_sidebar_yaml = f.read().strip() +else: + previous_sidebar_yaml = "" + +# only write the file if the sidebar has changed +# (prevents infinite preview render) +if sidebar_yaml != previous_sidebar_yaml: + with open(sidebar_file, "w") as f: + f.write(sidebar_yaml) diff --git a/docs/reference/index.qmd b/docs/reference/index.qmd new file mode 100644 index 000000000..0e9d33eec --- /dev/null +++ b/docs/reference/index.qmd @@ -0,0 +1,55 @@ +--- +title: Reference +tbl-colwidths: [35,66] +toc-depth: 4 +--- + + +#### Python API + +| | | +|----|----| +| [inspect_ai](inspect_ai.qmd) | Tasks, evaluation, and scoring. | +| [inspect_ai.solver](inspect_ai.solver.qmd) | Prompting, elicitation and agents. | +| [inspect_ai.tool](inspect_ai.tool.qmd) | Built-in and custom tool functions. | +| [inspect_ai.scorer](inspect_ai.scorer.qmd) | Task scoring and metrics. | +| [inspect_ai.model](inspect_ai.model.qmd) | Model interface and providers. | +| [inspect_ai.dataset](inspect_ai.dataset.qmd) | Reading samples from datasets. | +| [inspect_ai.approval](inspect_ai.approval.qmd) | Approvers and approval policies. | +| [inspect_ai.log](inspect_ai.log.qmd) | List, read, write, and analyse logs. | +| [inspect_ai.util](inspect_ai.util.qmd) | Miscellaneous utility functions. | + +: {.borderless} + +#### Inspect CLI + +| | | +|------------------------------------|------------------------------------| +| [inspect eval](inspect_eval.qmd) | Evaluate one or more tasks. | +| [inspect eval-retry](inspect_eval-retry.qmd) | Retry an evaluation task. | +| [inspect eval-set](inspect_eval-set.qmd) | Evaluate a set of tasks with retries. | +| [inspect score](inspect_score.qmd) | Score a previous evaluation run. 
| +| [inspect view](inspect_view.qmd) | Inspect log viewer | +| [inspect_log](inspect_log.qmd) | Query, read, write, and convert logs. | +| [inspect trace](inspect_trace.qmd) | List and read execution traces. | +| [inspect sandbox](inspect_sandbox.qmd) | Manage sandbox environments. | +| [inspect cache](inspect_cache.qmd) | Manage the Inspect model cache. | +| [inspect list](inspect_list.qmd) | List tasks on the filesystem. | +| [inspect info](inspect_info.qmd) | Read version and configuration. | + +: {.borderless} + + +```{=html} + +``` diff --git a/docs/reference/inspect_ai.approval.qmd b/docs/reference/inspect_ai.approval.qmd new file mode 100644 index 000000000..cb565978c --- /dev/null +++ b/docs/reference/inspect_ai.approval.qmd @@ -0,0 +1,20 @@ +--- +title: "inspect_ai.approval" +--- + +## Approvers + +### auto_approver +### human_approver + +## Types + +### Approver +### Approval +### ApprovalDecision +### ApprovalPolicy + +## Decorator + +### approver + diff --git a/docs/reference/inspect_ai.dataset.qmd b/docs/reference/inspect_ai.dataset.qmd new file mode 100644 index 000000000..e1c5c04e0 --- /dev/null +++ b/docs/reference/inspect_ai.dataset.qmd @@ -0,0 +1,18 @@ +--- +title: "inspect_ai.dataset" +--- + +## Readers + +### csv_dataset +### json_dataset +### hf_dataset + +## Types + +### Sample +### FieldSpec +### RecordToSample +### Dataset +### MemoryDataset + diff --git a/docs/reference/inspect_ai.log.qmd b/docs/reference/inspect_ai.log.qmd new file mode 100644 index 000000000..5f0ba70a4 --- /dev/null +++ b/docs/reference/inspect_ai.log.qmd @@ -0,0 +1,67 @@ +--- +title: "inspect_ai.log" +--- + +## Eval Log Files + +### list_eval_logs +### write_eval_log +### read_eval_log +### read_eval_log_sample +### read_eval_log_samples +### convert_eval_logs +### bundle_log_dir +### write_log_dir_manifest +### retryable_eval_logs +### EvalLogInfo + +## Eval Log API + +### EvalLog +### EvalSpec +### EvalDataset +### EvalConfig +### EvalRevision +### EvalPlan +### EvalPlanStep +### EvalResults +### EvalScore +### EvalMetric +### EvalSampleReductions +### EvalStats +### EvalError +### EvalSample +### EvalSampleLimit +### EvalSampleReductions +### EvalSampleScore + +## Transcript API + +### transcript +### Transcript +### Event +### SampleInitEvent +### SampleLimitEvent +### StateEvent +### StoreEvent +### ModelEvent +### ToolEvent +### ApprovalEvent +### InputEvent +### StoreEvent +### ErrorEvent +### LoggerEvent +### LoggingLevel +### LoggingMessage +### InfoEvent +### StepEvent +### SubtaskEvent + + + + + + + + + diff --git a/docs/reference/inspect_ai.model.qmd b/docs/reference/inspect_ai.model.qmd new file mode 100644 index 000000000..87b184cfc --- /dev/null +++ b/docs/reference/inspect_ai.model.qmd @@ -0,0 +1,54 @@ +--- +title: "inspect_ai.model" +--- + +## Generation + +### get_model +### Model +### GenerateConfig +### GenerateConfigArgs +### ModelOutput +### ModelUsage +### StopReason +### ChatCompletionChoice + +## Messages + +### ChatMessage +### ChatMessageBase +### ChatMessageSystem +### ChatMessageUser +### ChatMessageAssistant +### ChatMessageTool + +## Content + +### Content +### ContentText +### ContentImage +### ContentAudio +### ContentVideo + +## Logprobs + +### Logprob +### Logprobs +### TopLogprob + +## Caching + +### CachePolicy +### cache_size +### cache_clear +### cache_list_expired +### cache_prune +### cache_path + +## Provider + +### modelapi +### ModelAPI + + + diff --git a/docs/reference/inspect_ai.qmd b/docs/reference/inspect_ai.qmd new file mode 100644 index 
000000000..cc040e73d --- /dev/null +++ b/docs/reference/inspect_ai.qmd @@ -0,0 +1,25 @@ +--- +title: inspect_ai +--- + + + + +## Evaluation + +### eval +### eval_retry +### eval_set +### score + +## Tasks + +### Task +### task_with +### Epochs +### TaskInfo +### Tasks + +## Decorators + +### task diff --git a/docs/reference/inspect_ai.scorer.qmd b/docs/reference/inspect_ai.scorer.qmd new file mode 100644 index 000000000..228745dec --- /dev/null +++ b/docs/reference/inspect_ai.scorer.qmd @@ -0,0 +1,52 @@ +--- +title: "inspect_ai.scorer" +--- + +## Scorers + +### match +### includes +### pattern +### answer +### choice +### f1 +### exact +### model_graded_qa +### model_graded_fact +### multi_scorer + +## Metrics + +### accuracy +### mean +### std +### stderr +### bootstrap_stderr + +## Reducers + +### at_least +### pass_at +### max_score +### mean_score +### median_score +### mode_score + +## Types + +### Scorer +### Target +### Score +### Value +### ScoreReducer +### Metric +### MetricProtocol +### SampleScore + +## Decorators + +### scorer +### metric +### score_reducer + + diff --git a/docs/reference/inspect_ai.solver.qmd b/docs/reference/inspect_ai.solver.qmd new file mode 100644 index 000000000..8a3dc6565 --- /dev/null +++ b/docs/reference/inspect_ai.solver.qmd @@ -0,0 +1,39 @@ +--- +title: "inspect_ai.solver" +--- + +## Generation + +### generate +### use_tools + +## Prompting + +### prompt_template +### system_message +### user_message +### chain_of_thought +### self_critique +### multiple_choice + +## Agents + +### basic_agent +### human_agent +### bridge + +## Composition + +### chain +### fork + +## Types + +### Solver +### SolverSpec +### TaskState +### Generate + +## Decorators + +### solver diff --git a/docs/reference/inspect_ai.tool.qmd b/docs/reference/inspect_ai.tool.qmd new file mode 100644 index 000000000..3b191833b --- /dev/null +++ b/docs/reference/inspect_ai.tool.qmd @@ -0,0 +1,34 @@ +--- +title: "inspect_ai.tool" +--- + +## Tools + +### bash +### python +### web_search +### web_browser +### computer + +## Dynamic + +### tool_with +### ToolDef + +## Types + +### Tool +### ToolResult +### ToolError +### ToolCallError +### ToolChoice +### ToolFunction +### ToolInfo +### ToolParams +### ToolParam +### JSONType + +## Decorator + +### tool + diff --git a/docs/reference/inspect_ai.util.qmd b/docs/reference/inspect_ai.util.qmd new file mode 100644 index 000000000..c0cd4ab6c --- /dev/null +++ b/docs/reference/inspect_ai.util.qmd @@ -0,0 +1,44 @@ +--- +title: "inspect_ai.util" +--- + + +## Store + +### Store +### store +### store_as +### StoreModel + +## Concurrency + +### concurrency +### subprocess +### ExecResult + +## Display + +### display_type +### DisplayType +### input_screen + +## Subtasks + +### subtask +### Subtask + +## Utilities + +### resource +### throttle +### trace_action +### trace_message + +## Sandbox + +### sandbox +### sandbox_with +### SandboxEnvironment +### SandboxConnection +### sandboxenv + diff --git a/docs/reference/inspect_cache.qmd b/docs/reference/inspect_cache.qmd new file mode 100644 index 000000000..86910bdfc --- /dev/null +++ b/docs/reference/inspect_cache.qmd @@ -0,0 +1,10 @@ +--- +title: inspect cache +--- + + + + + + + diff --git a/docs/reference/inspect_eval-retry.qmd b/docs/reference/inspect_eval-retry.qmd new file mode 100644 index 000000000..4d5c10f7e --- /dev/null +++ b/docs/reference/inspect_eval-retry.qmd @@ -0,0 +1,6 @@ +--- +title: inspect eval-retry +--- + + + diff --git a/docs/reference/inspect_eval-set.qmd 
b/docs/reference/inspect_eval-set.qmd new file mode 100644 index 000000000..51f539ae1 --- /dev/null +++ b/docs/reference/inspect_eval-set.qmd @@ -0,0 +1,6 @@ +--- +title: inspect eval-set +--- + + + diff --git a/docs/reference/inspect_eval.qmd b/docs/reference/inspect_eval.qmd new file mode 100644 index 000000000..8247c33b2 --- /dev/null +++ b/docs/reference/inspect_eval.qmd @@ -0,0 +1,4 @@ +--- +title: inspect eval +--- + diff --git a/docs/reference/inspect_info.qmd b/docs/reference/inspect_info.qmd new file mode 100644 index 000000000..6548e384b --- /dev/null +++ b/docs/reference/inspect_info.qmd @@ -0,0 +1,12 @@ +--- +title: inspect info +--- + + + + + + + + + diff --git a/docs/reference/inspect_list.qmd b/docs/reference/inspect_list.qmd new file mode 100644 index 000000000..bfdbf9f40 --- /dev/null +++ b/docs/reference/inspect_list.qmd @@ -0,0 +1,10 @@ +--- +title: inspect list +--- + + + + + + + diff --git a/docs/reference/inspect_log.qmd b/docs/reference/inspect_log.qmd new file mode 100644 index 000000000..cfc67e1da --- /dev/null +++ b/docs/reference/inspect_log.qmd @@ -0,0 +1,7 @@ +--- +title: inspect log +--- + + + + diff --git a/docs/reference/inspect_sandbox.qmd b/docs/reference/inspect_sandbox.qmd new file mode 100644 index 000000000..ea77d83ee --- /dev/null +++ b/docs/reference/inspect_sandbox.qmd @@ -0,0 +1,9 @@ +--- +title: inspect sandbox +--- + + + + + + diff --git a/docs/reference/inspect_score.qmd b/docs/reference/inspect_score.qmd new file mode 100644 index 000000000..0786a2576 --- /dev/null +++ b/docs/reference/inspect_score.qmd @@ -0,0 +1,6 @@ +--- +title: inspect score +--- + + + diff --git a/docs/reference/inspect_trace.qmd b/docs/reference/inspect_trace.qmd new file mode 100644 index 000000000..93278707b --- /dev/null +++ b/docs/reference/inspect_trace.qmd @@ -0,0 +1,9 @@ +--- +title: inspect trace +--- + + + + + + diff --git a/docs/reference/inspect_view.qmd b/docs/reference/inspect_view.qmd new file mode 100644 index 000000000..31be283eb --- /dev/null +++ b/docs/reference/inspect_view.qmd @@ -0,0 +1,4 @@ +--- +title: "inspect view" +--- + diff --git a/docs/sandboxing.qmd b/docs/sandboxing.qmd index df8854504..f6e61cf4a 100644 --- a/docs/sandboxing.qmd +++ b/docs/sandboxing.qmd @@ -126,7 +126,7 @@ The `Sample` class includes `sandbox`, `files` and `setup` fields that are used You can either define a default `sandbox` for an entire `Task` as illustrated above, or alternatively define a per-sample `sandbox`. For example, you might want to do this if each sample has its own Dockerfile and/or custom compose configuration file. (Note, each sample gets its own sandbox *instance*, even if the sandbox is defined at Task level. So samples do not interfere with each other's sandboxes.) -The `sandbox` can be specified as a string (e.g. `"docker`") or a list of sandbox type and config file (e.g. `["docker", "compose.yaml"]`). +The `sandbox` can be specified as a string (e.g. `"docker`") or a tuple of sandbox type and config file (e.g. `("docker", "compose.yaml")`). 
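As a rough illustration of the string and tuple forms mentioned in the paragraph above, here is a minimal sketch (the task name, sample input, and target are placeholders; `compose.yaml` is the example config file name from the text):

```python
from inspect_ai import Task, task
from inspect_ai.dataset import Sample
from inspect_ai.solver import generate, use_tools
from inspect_ai.tool import bash

@task
def sandbox_demo():
    return Task(
        dataset=[
            Sample(
                input="Print the contents of flag.txt",
                target="flag{placeholder}",
                # per-sample sandbox as a (type, config file) tuple
                sandbox=("docker", "compose.yaml"),
            )
        ],
        solver=[use_tools([bash()]), generate()],
        # task-level default sandbox (string form), as described above
        sandbox="docker",
    )
```

The same string/tuple forms apply wherever a sandbox is specified, whether at the task or the sample level.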
### Files diff --git a/docs/scripts/post-render.sh b/docs/scripts/post-render.sh index e59142503..05afc3779 100755 --- a/docs/scripts/post-render.sh +++ b/docs/scripts/post-render.sh @@ -1,6 +1,6 @@ #!/bin/bash -files=("index" "tutorial" "options" "log-viewer" "vscode" "tasks" "datasets" "solvers" "tools" "scorers" "models" "providers" "caching" "multimodal" "reasoning" "agents" "sandboxing" "agents-api" "agent-bridge" "human-agent" "approval" "eval-logs" "eval-sets" "errors-and-limits" "typing" "tracing" "parallelism" "interactivity" "extensions") +files=("index" "tutorial" "options" "log-viewer" "vscode" "tasks" "datasets" "solvers" "tools" "scorers" "models" "providers" "caching" "multimodal" "reasoning" "agents" "sandboxing" "agents-api" "agent-bridge" "human-agent" "approval" "eval-logs" "eval-sets" "errors-and-limits" "typing" "tracing" "parallelism" "interactivity" "extensions" "reference/inspect_ai" "reference/inspect_ai.solver" "reference/inspect_ai.tool" "reference/inspect_ai.scorer" "reference/inspect_ai.model" "reference/inspect_ai.dataset" "reference/inspect_ai.approval" "reference/inspect_ai.log" "reference/inspect_ai.util" "reference/inspect_eval" "reference/inspect_eval-set" "reference/inspect_eval-retry" "reference/inspect_score" "reference/inspect_view" "reference/inspect_log" "reference/inspect_trace" "reference/inspect_sandbox" "reference/inspect_cache" "reference/inspect_list" "reference/inspect_info") if [ "$QUARTO_PROJECT_RENDER_ALL" = "1" ]; then @@ -9,7 +9,7 @@ if [ "$QUARTO_PROJECT_RENDER_ALL" = "1" ]; then mv _quarto.yml _quarto.yml.bak for file in "${files[@]}"; do echo "llms: ${file}.qmd" - quarto render "${file}.qmd" --to gfm --quiet --no-execute + quarto render "${file}.qmd" --to gfm-raw_html --quiet --no-execute output_file="${file}.md" cat "${output_file}" >> "${llms_full}" echo "" >> "${llms_full}" diff --git a/docs/theme.scss b/docs/theme.scss index 1e1504571..8edca945d 100644 --- a/docs/theme.scss +++ b/docs/theme.scss @@ -4,6 +4,10 @@ margin-top: 18px; } +img.navbar-logo { + padding-right: 15px; +} + .level1 h1 { margin-top: 0; } @@ -36,6 +40,11 @@ margin-bottom: 0; } +.sidebar-item-section .sidebar-item-section { + margin-bottom: 0 !important; +} + + .sidebar-tools-main .quarto-navigation-tool[title="Source Code"] { padding-top: 2.5px; } @@ -76,4 +85,57 @@ .blockquote { color: #505a62; -} \ No newline at end of file +} + +.source-link { + position: relative; +} + +.source-link > a { + color: $btn-code-copy-color; + position: absolute; + right: 5px; + top: 0; + font-size: 0.8em; + z-index: 1001; +} + +.source-link a:hover { + color: $btn-code-copy-color-active; +} + +.element-type { + font-size: 0.8em; + font-weight: 400; +} + +.element-type-name > a { + border-bottom: 1px dotted currentcolor !important; + color: currentColor; + text-decoration: none; +} + +.element-type-name > a:hover { + text-decoration: underline; + cursor: pointer; +} + +.ref-interlink { + color: $code-color; + background-color: $code-bg; + font-family: $font-family-monospace; + font-size: 0.875em; +} + +.doc-declaration .code-copy-button { + display: none; +} + +.class-methods > dl > dt { + font-size: 1.3em; +} + +.ref-definition { + font-weight: normal; +} + diff --git a/pyproject.toml b/pyproject.toml index c1a53423b..d027291ab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -117,6 +117,7 @@ dev = [ "google-cloud-aiplatform", "google-generativeai", "goodfire", + "griffe", "groq", "ipython", "mistralai", @@ -133,6 +134,7 @@ dev = [ "pytest-xdist", "ruff==0.9.5", # match 
version specified in .pre-commit-config.yaml "textual-dev>=0.86.2", + "types-Markdown", "types-PyYAML", "types-beautifulsoup4", "types-aioboto3", @@ -144,5 +146,5 @@ dev = [ "types-psutil", "types-python-dateutil" ] -doc = ["quarto-cli", "jupyter"] +doc = ["quarto-cli==1.5.57", "jupyter", "panflute", "markdown"] dist = ["twine", "build"] diff --git a/src/inspect_ai/_cli/cache.py b/src/inspect_ai/_cli/cache.py index b6c2d50ca..1514a620a 100644 --- a/src/inspect_ai/_cli/cache.py +++ b/src/inspect_ai/_cli/cache.py @@ -42,7 +42,10 @@ def _print_table(title: str, paths: list[tuple[str, int]]) -> None: @click.group("cache") def cache_command() -> None: - """Manage the inspect cache.""" + """Manage the inspect model output cache. + + Learn more about model output caching at https://inspect.ai-safety-institute.org.uk/caching.html. + """ return None @@ -62,11 +65,9 @@ def cache_command() -> None: type=str, help="Clear the cache for a specific model (e.g. --model=openai/gpt-4). Can be passed multiple times.", ) -def clear( - all: bool, model: tuple[str, ...], log_level: str, log_level_transcript: str -) -> None: +def clear(all: bool, model: tuple[str, ...], log_level: str) -> None: """Clear all cache files. Requires either --all or --model flags.""" - init_logger(log_level, log_level_transcript) + init_logger(log_level) if model: _print_table( @@ -119,14 +120,14 @@ def list_caches(pruneable: bool) -> None: type=str, help="Only prune a specific model (e.g. --model=openai/gpt-4). Can be passed multiple times.", ) -def prune(log_level: str, log_level_transcript: str, model: tuple[str, ...]) -> None: +def prune(log_level: str, model: tuple[str, ...]) -> None: """Prune all expired cache entries Over time the cache directory can grow, but many cache entries will be expired. This command will remove all expired cache entries for ease of maintenance. 
""" - init_logger(log_level, log_level_transcript) + init_logger(log_level) expired_cache_entries = cache_list_expired(list(model)) diff --git a/src/inspect_ai/_cli/common.py b/src/inspect_ai/_cli/common.py index 192871a45..671354c8a 100644 --- a/src/inspect_ai/_cli/common.py +++ b/src/inspect_ai/_cli/common.py @@ -9,14 +9,12 @@ ALL_LOG_LEVELS, DEFAULT_DISPLAY, DEFAULT_LOG_LEVEL, - DEFAULT_LOG_LEVEL_TRANSCRIPT, ) from inspect_ai.util._display import init_display_type class CommonOptions(TypedDict): log_level: str - log_level_transcript: str log_dir: str display: Literal["full", "conversation", "rich", "plain", "none"] no_ansi: bool | None @@ -36,16 +34,6 @@ def log_level_options(func: Callable[..., Any]) -> Callable[..., click.Context]: envvar="INSPECT_LOG_LEVEL", help=f"Set the log level (defaults to '{DEFAULT_LOG_LEVEL}')", ) - @click.option( - "--log-level-transcript", - type=click.Choice( - [level.lower() for level in ALL_LOG_LEVELS], - case_sensitive=False, - ), - default=DEFAULT_LOG_LEVEL_TRANSCRIPT, - envvar="INSPECT_LOG_LEVEL_TRANSCRIPT", - help=f"Set the log level of the transcript (defaults to '{DEFAULT_LOG_LEVEL_TRANSCRIPT}')", - ) @functools.wraps(func) def wrapper(*args: Any, **kwargs: Any) -> click.Context: return cast(click.Context, func(*args, **kwargs)) diff --git a/src/inspect_ai/_cli/eval.py b/src/inspect_ai/_cli/eval.py index 23f6acd8a..ea3ebd377 100644 --- a/src/inspect_ai/_cli/eval.py +++ b/src/inspect_ai/_cli/eval.py @@ -7,7 +7,9 @@ from inspect_ai import Epochs, eval, eval_retry from inspect_ai._eval.evalset import eval_set from inspect_ai._util.constants import ( + ALL_LOG_LEVELS, DEFAULT_EPOCHS, + DEFAULT_LOG_LEVEL_TRANSCRIPT, DEFAULT_MAX_CONNECTIONS, DEFAULT_MAX_RETRIES, ) @@ -399,6 +401,16 @@ def eval_options(func: Callable[..., Any]) -> Callable[..., click.Context]: envvar=["INSPECT_LOG_FORMAT", "INSPECT_EVAL_LOG_FORMAT"], help="Format for writing log files.", ) + @click.option( + "--log-level-transcript", + type=click.Choice( + [level.lower() for level in ALL_LOG_LEVELS], + case_sensitive=False, + ), + default=DEFAULT_LOG_LEVEL_TRANSCRIPT, + envvar="INSPECT_LOG_LEVEL_TRANSCRIPT", + help=f"Set the log level of the transcript (defaults to '{DEFAULT_LOG_LEVEL_TRANSCRIPT}')", + ) @common_options @functools.wraps(func) def wrapper(*args: Any, **kwargs: Any) -> click.Context: @@ -468,6 +480,7 @@ def eval_command( no_score: bool | None, no_score_display: bool | None, log_format: Literal["eval", "json"] | None, + log_level_transcript: str, **common: Unpack[CommonOptions], ) -> None: """Evaluate tasks.""" @@ -482,7 +495,7 @@ def eval_command( tasks=tasks, solver=solver, log_level=common["log_level"], - log_level_transcript=common["log_level_transcript"], + log_level_transcript=log_level_transcript, log_dir=common["log_dir"], log_format=log_format, model=model, @@ -630,9 +643,13 @@ def eval_set_command( bundle_dir: str | None, bundle_overwrite: bool | None, log_format: Literal["eval", "json"] | None, + log_level_transcript: str, **common: Unpack[CommonOptions], ) -> int: - """Evaluate a set of tasks.""" + """Evaluate a set of tasks with retries. + + Learn more about eval sets at https://inspect.ai-safety-institute.org.uk/eval-sets.html. 
+ """ # read config config = config_from_locals(dict(locals())) @@ -644,7 +661,7 @@ def eval_set_command( tasks=tasks, solver=solver, log_level=common["log_level"], - log_level_transcript=common["log_level_transcript"], + log_level_transcript=log_level_transcript, log_dir=common["log_dir"], log_format=log_format, model=model, @@ -967,6 +984,16 @@ def parse_comma_separated(value: str | None) -> list[str] | None: "--max-retries", type=int, help=MAX_RETRIES_HELP, envvar="INSPECT_EVAL_MAX_RETRIES" ) @click.option("--timeout", type=int, help=TIMEOUT_HELP, envvar="INSPECT_EVAL_TIMEOUT") +@click.option( + "--log-level-transcript", + type=click.Choice( + [level.lower() for level in ALL_LOG_LEVELS], + case_sensitive=False, + ), + default=DEFAULT_LOG_LEVEL_TRANSCRIPT, + envvar="INSPECT_LOG_LEVEL_TRANSCRIPT", + help=f"Set the log level of the transcript (defaults to '{DEFAULT_LOG_LEVEL_TRANSCRIPT}')", +) @common_options def eval_retry_command( log_files: tuple[str], @@ -986,6 +1013,7 @@ def eval_retry_command( max_connections: int | None, max_retries: int | None, timeout: int | None, + log_level_transcript: str, **common: Unpack[CommonOptions], ) -> None: """Retry failed evaluation(s)""" @@ -1014,7 +1042,7 @@ def eval_retry_command( eval_retry( retry_log_files, log_level=common["log_level"], - log_level_transcript=common["log_level_transcript"], + log_level_transcript=log_level_transcript, log_dir=common["log_dir"], max_samples=max_samples, max_tasks=max_tasks, diff --git a/src/inspect_ai/_cli/info.py b/src/inspect_ai/_cli/info.py index 4d8027041..fd63aa1a4 100644 --- a/src/inspect_ai/_cli/info.py +++ b/src/inspect_ai/_cli/info.py @@ -25,6 +25,7 @@ def info_command() -> None: help="Output version and path info as JSON", ) def version(json: bool) -> None: + """Output version and path info.""" if json: print(dumps(dict(version=__version__, path=PKG_PATH.as_posix()), indent=2)) else: diff --git a/src/inspect_ai/_cli/list.py b/src/inspect_ai/_cli/list.py index d3282d6fe..c27f92020 100644 --- a/src/inspect_ai/_cli/list.py +++ b/src/inspect_ai/_cli/list.py @@ -14,7 +14,7 @@ @click.group("list") def list_command() -> None: - """List tasks or eval logs.""" + """List tasks on the filesystem.""" return None diff --git a/src/inspect_ai/_cli/log.py b/src/inspect_ai/_cli/log.py index caea8e9c2..e3af26537 100644 --- a/src/inspect_ai/_cli/log.py +++ b/src/inspect_ai/_cli/log.py @@ -29,6 +29,8 @@ def log_command() -> None: The default format is 'eval'. You can change this by setting the INSPECT_LOG_FORMAT environment variable or using the --log-format command line option. The 'log' commands enable you to read Inspect logs uniformly as JSON no matter their physical storage format, and also enable you to read only the headers (everything but the samples) from log files, which is useful for very large logs. + + Learn more about managing log files at https://inspect.ai-safety-institute.org.uk/eval-logs.html. """ return None diff --git a/src/inspect_ai/_cli/sandbox.py b/src/inspect_ai/_cli/sandbox.py index d60bbe5f9..ceb91cb86 100644 --- a/src/inspect_ai/_cli/sandbox.py +++ b/src/inspect_ai/_cli/sandbox.py @@ -7,7 +7,10 @@ @click.group("sandbox") def sandbox_command() -> None: - """Manage Sandbox Environments.""" + """Manage Sandbox Environments. + + Learn more about sandboxing at https://inspect.ai-safety-institute.org.uk/sandboxing.html. 
+ """ return None diff --git a/src/inspect_ai/_cli/score.py b/src/inspect_ai/_cli/score.py index 0bdd3df69..54c4473c3 100644 --- a/src/inspect_ai/_cli/score.py +++ b/src/inspect_ai/_cli/score.py @@ -43,7 +43,6 @@ def score_command( log_file, False if no_overwrite else True, common["log_level"], - common["log_level_transcript"], ) ) @@ -54,10 +53,9 @@ async def score( log_file: str, overwrite: bool, log_level: str | None, - log_level_transcript: str | None, ) -> None: # init eval context - init_eval_context(log_level, log_level_transcript) + init_eval_context(log_level, None) # read the eval log recorder = create_recorder_for_location(log_file, log_dir) diff --git a/src/inspect_ai/_cli/trace.py b/src/inspect_ai/_cli/trace.py index 933d40bac..7e6ed947d 100644 --- a/src/inspect_ai/_cli/trace.py +++ b/src/inspect_ai/_cli/trace.py @@ -26,6 +26,8 @@ def trace_command() -> None: """List and read execution traces. Inspect includes a TRACE log-level which is right below the HTTP and INFO log levels (so not written to the console by default). However, TRACE logs are always recorded to a separate file, and the last 10 TRACE logs are preserved. The 'trace' command provides ways to list and read these traces. + + Learn more about execution traces at https://inspect.ai-safety-institute.org.uk/tracing.html. """ return None diff --git a/src/inspect_ai/_cli/view.py b/src/inspect_ai/_cli/view.py index f53a59731..e4024646f 100644 --- a/src/inspect_ai/_cli/view.py +++ b/src/inspect_ai/_cli/view.py @@ -39,7 +39,10 @@ def wrapper(*args: Any, **kwargs: Any) -> click.Context: @common_options @click.pass_context def view_command(ctx: click.Context, **kwargs: Unpack[CommonOptions]) -> None: - """View command group.""" + """Inspect log viewer. + + Learn more about using the log viewer at https://inspect.ai-safety-institute.org.uk/log-viewer.html. 
+ """ if ctx.invoked_subcommand is None: ctx.invoke(start, **kwargs) else: @@ -78,7 +81,6 @@ def start( port=port, authorization=authorization, log_level=common["log_level"], - log_level_transcript=common["log_level_transcript"], ) diff --git a/src/inspect_ai/_display/core/config.py b/src/inspect_ai/_display/core/config.py index 2796753bf..e8a2fb254 100644 --- a/src/inspect_ai/_display/core/config.py +++ b/src/inspect_ai/_display/core/config.py @@ -1,4 +1,5 @@ from inspect_ai._util.registry import is_registry_dict +from inspect_ai.log._log import eval_config_defaults from .display import TaskProfile @@ -13,7 +14,12 @@ def task_config( value = task_args[key] if is_registry_dict(value): task_args[key] = value["name"] - config = dict(profile.eval_config.model_dump(exclude_none=True)) | task_args + # get eval_config overrides + eval_config = dict(profile.eval_config.model_dump(exclude_none=True)) + for name, default_value in eval_config_defaults().items(): + if eval_config.get(name, None) == default_value: + del eval_config[name] + config = eval_config | task_args if generate_config: config = dict(profile.generate_config.model_dump(exclude_none=True)) | config if profile.tags: diff --git a/src/inspect_ai/_display/textual/widgets/samples.py b/src/inspect_ai/_display/textual/widgets/samples.py index fd6f9d80a..2711f681e 100644 --- a/src/inspect_ai/_display/textual/widgets/samples.py +++ b/src/inspect_ai/_display/textual/widgets/samples.py @@ -347,7 +347,7 @@ async def sync_sample(self, sample: ActiveSample) -> None: class SandboxesView(Vertical): DEFAULT_CSS = """ SandboxesView { - padding: 1 0 1 0; + padding: 1 0 0 0; background: transparent; height: auto; } @@ -358,6 +358,7 @@ class SandboxesView(Vertical): background: transparent; } .clipboard-message { + height: auto; margin-top: 1; } """ @@ -372,7 +373,6 @@ def compose(self) -> ComposeResult: async def sync_sample(self, sample: ActiveSample) -> None: if len(sample.sandboxes) > 0: multiple_sandboxes = len(sample.sandboxes) > 1 - self.display = True sandboxes_caption = cast(Static, self.query_one("#sandboxes-caption")) sandboxes_caption.update( f"[bold]sandbox container{'s' if multiple_sandboxes else ''}:[/bold]" @@ -395,6 +395,7 @@ async def sync_sample(self, sample: ActiveSample) -> None: markup=True, ) ) + self.display = True else: self.display = False @@ -473,7 +474,7 @@ def on_button_pressed(self, event: Button.Pressed) -> None: else None ) if isinstance(last_event, ToolEvent): - last_event.cancel() + last_event._cancel() elif event.button.id == self.CANCEL_SCORE_OUTPUT: self.sample.interrupt("score") elif event.button.id == self.CANCEL_RAISE_ERROR: diff --git a/src/inspect_ai/_display/textual/widgets/sandbox.py b/src/inspect_ai/_display/textual/widgets/sandbox.py index b7cd49dad..0b4981380 100644 --- a/src/inspect_ai/_display/textual/widgets/sandbox.py +++ b/src/inspect_ai/_display/textual/widgets/sandbox.py @@ -9,6 +9,12 @@ class SandboxView(Vertical): DEFAULT_CSS = """ + SandboxView { + height: auto; + } + SandboxView * { + height: auto; + } .indent { width: 2; } diff --git a/src/inspect_ai/_eval/__init__.py b/src/inspect_ai/_eval/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/inspect_ai/_eval/eval.py b/src/inspect_ai/_eval/eval.py index f7b25c361..de57e776e 100644 --- a/src/inspect_ai/_eval/eval.py +++ b/src/inspect_ai/_eval/eval.py @@ -89,67 +89,67 @@ def eval( r"""Evaluate tasks using a Model. Args: - tasks: (Tasks): Task(s) to evaluate. If None, attempt + tasks: Task(s) to evaluate. 
If None, attempt to evaluate a task in the current working directory - model (str | Model | list[str] | list[Model] | None): Model(s) for + model: Model(s) for evaluation. If not specified use the value of the INSPECT_EVAL_MODEL environment variable. - model_base_url: (str | None): Base URL for communicating + model_base_url: Base URL for communicating with the model API. - model_args (dict[str,Any] | str): Model creation args + model_args: Model creation args (as a dictionary or as a path to a JSON or YAML config file) - task_args (dict[str,Any] | str): Task creation arguments + task_args: Task creation arguments (as a dictionary or as a path to a JSON or YAML config file) - sandbox (SandboxEnvironmentType | None): Sandbox environment type - (or optionally a str or tuple with a shorthand spec) - sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes - (defaults to True) - solver (Solver | list[Solver] | SolverSpec | None): Alternative solver for task(s). - Optional (uses task solver by default). - tags (list[str] | None): Tags to associate with this evaluation run. - trace (bool | None): Trace message interactions with evaluated model to terminal. - display (DisplayType | None): Task display type (defaults to 'full'). - approval: (str | list[ApprovalPolicy] | None): Tool use approval policies. - Either a path to an approval policy config file or a list of approval policies. - Defaults to no approval policy. - log_level (str | None): Level for logging to the console: "debug", "http", "sandbox", - "info", "warning", "error", or "critical" (defaults to "warning") - log_level_transcript (str | None): Level for logging to the log file (defaults to "info") - log_dir (str | None): Output path for logging results - (defaults to file log in ./logs directory). - log_format (Literal["eval", "json"] | None): Format for writing log files (defaults - to "eval", the native high-performance format). - limit (int | tuple[int, int] | None): Limit evaluated samples - (defaults to all samples). - sample_id (str | int | list[str | int] | None): Evaluate specific sample(s) from the dataset. - epochs (int | Epochs | None): Epochs to repeat samples for and optional score - reducer function(s) used to combine sample scores (defaults to "mean") - fail_on_error (bool | float | None): `True` to fail on first sample error - (default); `False` to never fail on sample errors; Value between 0 and 1 - to fail if a proportion of total samples fails. Value greater than 1 to fail - eval if a count of samples fails. - debug_errors (bool | None): Raise task errors (rather than logging them) - so they can be debugged (defaults to False). - message_limit (int | None): Limit on total messages used for each sample. - token_limit (int | None): Limit on total tokens used for each sample. - time_limit (int | None): Limit on time (in seconds) for execution of each sample. - max_samples (int | None): Maximum number of samples to run in parallel - (default is max_connections) - max_tasks (int | None): Maximum number of tasks to run in parallel - (default is 1) - max_subprocesses (int | None): Maximum number of subprocesses to - run in parallel (default is os.cpu_count()) - max_sandboxes (int | None): Maximum number of sandboxes (per-provider) - to run in parallel. 
- log_samples: (bool | None): Log detailed samples and scores (defaults to True) - log_images: (bool | None): Log base64 encoded version of images, - even if specified as a filename or URL (defaults to False) - log_buffer: (int | None): Number of samples to buffer before writing log file. - If not specified, an appropriate default for the format and filesystem is - chosen (10 for most all cases, 100 for JSON logs on remote filesystems). - score (bool): Score output (defaults to True) - score_display (bool | None): Show scoring metrics in realtime (defaults to True) - **kwargs (GenerateConfigArgs): Model generation options. + sandbox: Sandbox environment type + (or optionally a str or tuple with a shorthand spec) + sandbox_cleanup: Cleanup sandbox environments after task completes + (defaults to True) + solver: Alternative solver for task(s). + Optional (uses task solver by default). + tags: Tags to associate with this evaluation run. + trace: Trace message interactions with evaluated model to terminal. + display: Task display type (defaults to 'full'). + approval: Tool use approval policies. + Either a path to an approval policy config file or a list of approval policies. + Defaults to no approval policy. + log_level: Level for logging to the console: "debug", "http", "sandbox", + "info", "warning", "error", or "critical" (defaults to "warning") + log_level_transcript: Level for logging to the log file (defaults to "info") + log_dir: Output path for logging results + (defaults to file log in ./logs directory). + log_format: Format for writing log files (defaults + to "eval", the native high-performance format). + limit: Limit evaluated samples + (defaults to all samples). + sample_id: Evaluate specific sample(s) from the dataset. + epochs: Epochs to repeat samples for and optional score + reducer function(s) used to combine sample scores (defaults to "mean") + fail_on_error: `True` to fail on first sample error + (default); `False` to never fail on sample errors; Value between 0 and 1 + to fail if a proportion of total samples fails. Value greater than 1 to fail + eval if a count of samples fails. + debug_errors: Raise task errors (rather than logging them) + so they can be debugged (defaults to False). + message_limit: Limit on total messages used for each sample. + token_limit: Limit on total tokens used for each sample. + time_limit: Limit on time (in seconds) for execution of each sample. + max_samples: Maximum number of samples to run in parallel + (default is max_connections) + max_tasks: Maximum number of tasks to run in parallel + (default is 1) + max_subprocesses: Maximum number of subprocesses to + run in parallel (default is os.cpu_count()) + max_sandboxes: Maximum number of sandboxes (per-provider) + to run in parallel. + log_samples: Log detailed samples and scores (defaults to True) + log_images: Log base64 encoded version of images, + even if specified as a filename or URL (defaults to False) + log_buffer: Number of samples to buffer before writing log file. + If not specified, an appropriate default for the format and filesystem is + chosen (10 for most all cases, 100 for JSON logs on remote filesystems). + score: Score output (defaults to True) + score_display: Show scoring metrics in realtime (defaults to True) + **kwargs: Model generation options. Returns: List of EvalLog (one for each task) @@ -359,10 +359,14 @@ async def eval_async( "Trace mode cannot be used when evaluating multiple models." 
) - # resolve recorder + # resolve recorder (confirm writeable) log_dir = log_dir if log_dir else os.environ.get("INSPECT_LOG_DIR", "./logs") log_dir = absolute_file_path(log_dir) recorder = create_recorder_for_format(log_format or DEFAULT_LOG_FORMAT, log_dir) + if not recorder.is_writeable(): + raise PrerequisiteError( + f"ERROR: You do not have write permission for the log_dir '{log_dir}'" + ) # resolve solver solver = chain(solver) if isinstance(solver, list) else solver @@ -492,47 +496,46 @@ def eval_retry( """Retry a previously failed evaluation task. Args: - tasks: (str | EvalLogInfo | EvalLog | list[str] | list[EvalLogInfo] | list[EvalLog]): - Log files for task(s) to retry. - log_level (str | None): Level for logging to the console: "debug", "http", "sandbox", - "info", "warning", "error", or "critical" (defaults to "warning") - log_level_transcript (str | None): Level for logging to the log file (defaults to "info") - log_dir (str | None): Output path for logging results - (defaults to file log in ./logs directory). - log_format (Literal["eval", "json"] | None): Format for writing log files (defaults - to "eval", the native high-performance format). - max_samples (int | None): Maximum number of samples to run in parallel - (default is max_connections) - max_tasks (int | None): Maximum number of tasks to run in parallel - (default is 1) - max_subprocesses (int | None): Maximum number of subprocesses to - run in parallel (default is os.cpu_count()) - max_sandboxes (int | None): Maximum number of sandboxes (per-provider) - to run in parallel. - sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes - (defaults to True) - trace (bool | None): Trace message interactions with evaluated model to terminal. - display (DisplayType | None): Task display type (defaults to 'full'). - fail_on_error (bool | float | None): `True` to fail on first sample error - (default); `False` to never fail on sample errors; Value between 0 and 1 - to fail if a proportion of total samples fails. Value greater than 1 to fail - eval if a count of samples fails. - debug_errors (bool | None): Raise task errors (rather than logging them) - so they can be debugged (defaults to False). - log_samples: (bool | None): Log detailed samples and scores (defaults to True) - log_images: (bool | None): Log base64 encoded version of images, - even if specified as a filename or URL (defaults to False) - log_buffer: (int | None): Number of samples to buffer before writing log file. - If not specified, an appropriate default for the format and filesystem is - chosen (10 for most all cases, 100 for JSON logs on remote filesystems). - score (bool): Score output (defaults to True) - score_display (bool | None): Show scoring metrics in realtime (defaults to True) - max_retries (int | None): - Maximum number of times to retry request. - timeout: (int | None): - Request timeout (in seconds) - max_connections (int | None): - Maximum number of concurrent connections to Model API (default is per Model API) + tasks: Log files for task(s) to retry. + log_level: Level for logging to the console: "debug", "http", "sandbox", + "info", "warning", "error", or "critical" (defaults to "warning") + log_level_transcript: Level for logging to the log file (defaults to "info") + log_dir: Output path for logging results + (defaults to file log in ./logs directory). + log_format: Format for writing log files (defaults + to "eval", the native high-performance format). 
+ max_samples: Maximum number of samples to run in parallel + (default is max_connections) + max_tasks: Maximum number of tasks to run in parallel + (default is 1) + max_subprocesses: Maximum number of subprocesses to + run in parallel (default is os.cpu_count()) + max_sandboxes: Maximum number of sandboxes (per-provider) + to run in parallel. + sandbox_cleanup: Cleanup sandbox environments after task completes + (defaults to True) + trace: Trace message interactions with evaluated model to terminal. + display: Task display type (defaults to 'full'). + fail_on_error: `True` to fail on first sample error + (default); `False` to never fail on sample errors; Value between 0 and 1 + to fail if a proportion of total samples fails. Value greater than 1 to fail + eval if a count of samples fails. + debug_errors: Raise task errors (rather than logging them) + so they can be debugged (defaults to False). + log_samples: Log detailed samples and scores (defaults to True) + log_images: Log base64 encoded version of images, + even if specified as a filename or URL (defaults to False) + log_buffer: Number of samples to buffer before writing log file. + If not specified, an appropriate default for the format and filesystem is + chosen (10 for most all cases, 100 for JSON logs on remote filesystems). + score: Score output (defaults to True) + score_display: Show scoring metrics in realtime (defaults to True) + max_retries: + Maximum number of times to retry request. + timeout: + Request timeout (in seconds) + max_connections: + Maximum number of concurrent connections to Model API (default is per Model API) Returns: List of EvalLog (one for each task) diff --git a/src/inspect_ai/_eval/evalset.py b/src/inspect_ai/_eval/evalset.py index c172ab204..5fb91892c 100644 --- a/src/inspect_ai/_eval/evalset.py +++ b/src/inspect_ai/_eval/evalset.py @@ -93,79 +93,79 @@ def eval_set( r"""Evaluate a set of tasks. Args: - tasks: (Tasks): Task(s) to evaluate. If None, attempt + tasks: Task(s) to evaluate. If None, attempt to evaluate a task in the current working directory - log_dir (str): Output path for logging results - (required to ensure that a unique storage scope is assigned for the set). - retry_attempts: (int | None): Maximum number of retry attempts before giving up - (defaults to 10). - retry_wait (float | None): Time to wait between attempts, increased exponentially. - (defaults to 30, resulting in waits of 30, 60, 120, 240, etc.). Wait time - per-retry will in no case by longer than 1 hour. - retry_connections (float | None): Reduce max_connections at this rate with each retry - (defaults to 0.5) - retry_cleanup (bool | None): Cleanup failed log files after retries - (defaults to True) - model (str | Model | list[str] | list[Model] | None): Model(s) for - evaluation. If not specified use the value of the INSPECT_EVAL_MODEL - environment variable. - model_base_url: (str | None): Base URL for communicating - with the model API. - model_args (dict[str,Any] | str): Model creation args - (as a dictionary or as a path to a JSON or YAML config file) - task_args (dict[str,Any] | str): Task creation arguments - (as a dictionary or as a path to a JSON or YAML config file) - sandbox (SandboxEnvironmentType | None): Sandbox environment type - (or optionally a str or tuple with a shorthand spec) - sandbox_cleanup (bool | None): Cleanup sandbox environments after task completes - (defaults to True) - solver (Solver | list[Solver] | SolverSpec | None): Alternative solver(s) for - evaluating task(s). 
ptional (uses task solver by default). - tags (list[str] | None): Tags to associate with this evaluation run. - trace: (bool | None): Trace message interactions with evaluated model to terminal. - display (DisplayType | None): Task display type (defaults to 'full'). - approval: (str | list[ApprovalPolicy] | None): Tool use approval policies. - Either a path to an approval policy config file or a list of approval policies. - Defaults to no approval policy. - score (bool): Score output (defaults to True) - log_level (str | None): Level for logging to the console: "debug", "http", "sandbox", - "info", "warning", "error", or "critical" (defaults to "warning") - log_level_transcript (str | None): Level for logging to the log file (defaults to "info") - log_format (Literal["eval", "json"] | None): Format for writing - log files (defaults to "eval", the native high-performance format). - limit (int | tuple[int, int] | None): Limit evaluated samples - (defaults to all samples). - sample_id (str | int | list[str | int] | None): Evaluate specific sample(s) from the dataset. - epochs (int | Epochs | None): Epochs to repeat samples for and optional score - reducer function(s) used to combine sample scores (defaults to "mean") - fail_on_error (bool | float | None): `True` to fail on first sample error - (default); `False` to never fail on sample errors; Value between 0 and 1 - to fail if a proportion of total samples fails. Value greater than 1 to fail - eval if a count of samples fails. - debug_errors (bool | None): Raise task errors (rather than logging them) - so they can be debugged (defaults to False). - message_limit (int | None): Limit on total messages used for each sample. - token_limit (int | None): Limit on total tokens used for each sample. - time_limit (int | None): Limit on time (in seconds) for execution of each sample. - max_samples (int | None): Maximum number of samples to run in parallel - (default is max_connections) - max_tasks (int | None): Maximum number of tasks to run in parallel - (default is 1) - max_subprocesses (int | None): Maximum number of subprocesses to - run in parallel (default is os.cpu_count()) - max_sandboxes (int | None): Maximum number of sandboxes (per-provider) - to run in parallel. - log_samples: (bool | None): Log detailed samples and scores (defaults to True) - log_images: (bool | None): Log base64 encoded version of images, + log_dir: Output path for logging results + (required to ensure that a unique storage scope is assigned for the set). + retry_attempts: Maximum number of retry attempts before giving up + (defaults to 10). + retry_wait: Time to wait between attempts, increased exponentially. + (defaults to 30, resulting in waits of 30, 60, 120, 240, etc.). Wait time + per-retry will in no case by longer than 1 hour. + retry_connections: Reduce max_connections at this rate with each retry + (defaults to 0.5) + retry_cleanup: Cleanup failed log files after retries + (defaults to True) + model: Model(s) for + evaluation. If not specified use the value of the INSPECT_EVAL_MODEL + environment variable. + model_base_url: Base URL for communicating + with the model API. 
+ model_args: Model creation args + (as a dictionary or as a path to a JSON or YAML config file) + task_args: Task creation arguments + (as a dictionary or as a path to a JSON or YAML config file) + sandbox: Sandbox environment type + (or optionally a str or tuple with a shorthand spec) + sandbox_cleanup: Cleanup sandbox environments after task completes + (defaults to True) + solver: Alternative solver(s) for + evaluating task(s). ptional (uses task solver by default). + tags: Tags to associate with this evaluation run. + trace: Trace message interactions with evaluated model to terminal. + display: Task display type (defaults to 'full'). + approval: Tool use approval policies. + Either a path to an approval policy config file or a list of approval policies. + Defaults to no approval policy. + score: Score output (defaults to True) + log_level: Level for logging to the console: "debug", "http", "sandbox", + "info", "warning", "error", or "critical" (defaults to "warning") + log_level_transcript: Level for logging to the log file (defaults to "info") + log_format: Format for writing + log files (defaults to "eval", the native high-performance format). + limit: Limit evaluated samples + (defaults to all samples). + sample_id: Evaluate specific sample(s) from the dataset. + epochs: Epochs to repeat samples for and optional score + reducer function(s) used to combine sample scores (defaults to "mean") + fail_on_error: `True` to fail on first sample error + (default); `False` to never fail on sample errors; Value between 0 and 1 + to fail if a proportion of total samples fails. Value greater than 1 to fail + eval if a count of samples fails. + debug_errors: Raise task errors (rather than logging them) + so they can be debugged (defaults to False). + message_limit: Limit on total messages used for each sample. + token_limit: Limit on total tokens used for each sample. + time_limit: Limit on time (in seconds) for execution of each sample. + max_samples: Maximum number of samples to run in parallel + (default is max_connections) + max_tasks: Maximum number of tasks to run in parallel + (default is 1) + max_subprocesses: Maximum number of subprocesses to + run in parallel (default is os.cpu_count()) + max_sandboxes: Maximum number of sandboxes (per-provider) + to run in parallel. + log_samples: Log detailed samples and scores (defaults to True) + log_images: Log base64 encoded version of images, even if specified as a filename or URL (defaults to False) - log_buffer: (int | None): Number of samples to buffer before writing log file. - If not specified, an appropriate default for the format and filesystem is - chosen (10 for most all cases, 100 for JSON logs on remote filesystems). - bundle_dir: (str | None): If specified, the log viewer and logs generated + log_buffer: Number of samples to buffer before writing log file. + If not specified, an appropriate default for the format and filesystem is + chosen (10 for most all cases, 100 for JSON logs on remote filesystems). + bundle_dir: If specified, the log viewer and logs generated by this eval set will be bundled into this directory. - bundle_overwrite (bool): Whether to overwrite files in the bundle_dir. + bundle_overwrite: Whether to overwrite files in the bundle_dir. (defaults to False). - **kwargs (GenerateConfigArgs): Model generation options. + **kwargs: Model generation options. 
Returns: Tuple of bool (whether all tasks completed successfully) and list of EvalLog diff --git a/src/inspect_ai/_eval/loader.py b/src/inspect_ai/_eval/loader.py index b84898be9..d5904da41 100644 --- a/src/inspect_ai/_eval/loader.py +++ b/src/inspect_ai/_eval/loader.py @@ -26,7 +26,7 @@ registry_params, ) from inspect_ai.model import Model, ModelName -from inspect_ai.solver._bridge import bridge +from inspect_ai.solver._bridge.bridge import bridge from inspect_ai.solver._solver import Solver, SolverSpec from inspect_ai.util import SandboxEnvironmentSpec, SandboxEnvironmentType from inspect_ai.util._sandbox.environment import resolve_sandbox_environment diff --git a/src/inspect_ai/_eval/task/log.py b/src/inspect_ai/_eval/task/log.py index 4c05b200b..c901aa24a 100644 --- a/src/inspect_ai/_eval/task/log.py +++ b/src/inspect_ai/_eval/task/log.py @@ -4,9 +4,7 @@ from shortuuid import uuid from inspect_ai._eval.task.util import slice_dataset -from inspect_ai._util.constants import ( - PKG_NAME, -) +from inspect_ai._util.constants import PKG_NAME from inspect_ai._util.datetime import iso_now from inspect_ai._util.git import git_context from inspect_ai._util.path import cwd_relative_path @@ -27,7 +25,11 @@ EvalSpec, EvalStats, ) -from inspect_ai.log._log import EvalLog, EvalSampleReductions +from inspect_ai.log._log import ( + EvalLog, + EvalSampleReductions, + eval_config_defaults, +) from inspect_ai.log._recorders import Recorder from inspect_ai.model import ( GenerateConfig, @@ -92,6 +94,11 @@ def __init__( ], ) + # write defaults for unspecified config + for name, value in eval_config_defaults().items(): + if getattr(eval_config, name, None) is None: + setattr(eval_config, name, value) + # create eval spec self.eval = EvalSpec( run_id=run_id, diff --git a/src/inspect_ai/_eval/task/sandbox.py b/src/inspect_ai/_eval/task/sandbox.py index 9f0bfebd9..9f9bd89ae 100644 --- a/src/inspect_ai/_eval/task/sandbox.py +++ b/src/inspect_ai/_eval/task/sandbox.py @@ -5,11 +5,20 @@ from typing import AsyncGenerator, Callable, NamedTuple, cast import httpx +from tenacity import ( + retry, + retry_if_exception, + stop_after_attempt, + stop_after_delay, + wait_exponential_jitter, +) from inspect_ai._eval.task.task import Task from inspect_ai._eval.task.util import task_run_dir +from inspect_ai._util.constants import DEFAULT_MAX_RETRIES, DEFAULT_TIMEOUT from inspect_ai._util.file import file, filesystem from inspect_ai._util.registry import registry_unqualified_name +from inspect_ai._util.retry import httpx_should_retry, log_retry_attempt from inspect_ai._util.url import data_uri_to_base64, is_data_uri, is_http_url from inspect_ai.dataset import Sample from inspect_ai.util._concurrency import concurrency @@ -115,8 +124,7 @@ async def read_sandboxenv_file(contents: str) -> bytes: contents_base64 = data_uri_to_base64(contents) file_bytes = base64.b64decode(contents_base64) elif is_http_url(contents): - client = httpx.AsyncClient() - file_bytes = (await client.get(contents, follow_redirects=True)).content + file_bytes = await _retrying_httpx_get(contents) else: # try to read as a file (if it doesn't exist or has a path not cool w/ # the filesystem then we fall back to contents) @@ -172,3 +180,28 @@ def resolve_sandbox( return sample.sandbox else: return None + + +async def _retrying_httpx_get( + url: str, + client: httpx.AsyncClient = httpx.AsyncClient(), + timeout: int = 30, # per-attempt timeout + max_retries: int = DEFAULT_MAX_RETRIES, + total_timeout: int = DEFAULT_TIMEOUT, # timeout for the whole retry loop. 
not for an individual attempt +) -> bytes: + @retry( + wait=wait_exponential_jitter(), + stop=(stop_after_attempt(max_retries) | stop_after_delay(total_timeout)), + retry=retry_if_exception(httpx_should_retry), + before_sleep=log_retry_attempt(url), + ) + async def do_get() -> bytes: + response = await client.get( + url=url, + follow_redirects=True, + timeout=(timeout, timeout, timeout, timeout), + ) + response.raise_for_status() + return response.content + + return await do_get() diff --git a/src/inspect_ai/_eval/task/task.py b/src/inspect_ai/_eval/task/task.py index d5b81ea12..6030941ca 100644 --- a/src/inspect_ai/_eval/task/task.py +++ b/src/inspect_ai/_eval/task/task.py @@ -39,38 +39,6 @@ class Task: r"""Evaluation task. Tasks are the basis for defining and running evaluations. - - Args: - dataset (Dataset | Sequence[Sample]): Dataset to evaluate - setup: (Solver | list[Solver] | None): Setup step (always run - even when the main `solver` is replaced). - solver: (Solver | list[Solver]): Solver or list of solvers. - Defaults to generate(), a normal call to the model. - scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output. - metrics (list[Metric] | dict[str, list[Metric]] | None): - Alternative metrics (overrides the metrics provided by the specified scorer). - config (GenerateConfig): Model generation config. - sandbox (SandboxEnvironmentType | None): Sandbox environment type - (or optionally a str or tuple with a shorthand spec) - approval: (str | list[ApprovalPolicy] | None): Tool use approval policies. - Either a path to an approval policy config file or a list of approval policies. - Defaults to no approval policy. - epochs (int | Epochs | None): Epochs to repeat samples for and optional score - reducer function(s) used to combine sample scores (defaults to "mean") - fail_on_error (bool | float | None): `True` to fail on first sample error - (default); `False` to never fail on sample errors; Value between 0 and 1 - to fail if a proportion of total samples fails. Value greater than 1 to fail - eval if a count of samples fails. - message_limit (int | None): Limit on total messages used for each sample. - token_limit (int | None): Limit on total tokens used for each sample. - time_limit (int | None): Limit on time (in seconds) for execution of each sample. - name: (str | None): Task name. If not specified is automatically - determined based on the name of the task directory (or "task") - if its anonymous task (e.g. created in a notebook and passed to - eval() directly) - version: (int): Version of task (to distinguish evolutions - of the task spec or breaking changes to it) - metadata: (dict[str, Any] | None): Additional metadata to associate with the task. """ def __init__( @@ -93,6 +61,41 @@ def __init__( metadata: dict[str, Any] | None = None, **kwargs: Unpack[TaskDeprecatedArgs], ) -> None: + """Create a task. + + Args: + dataset (Dataset | Sequence[Sample]): Dataset to evaluate + setup: (Solver | list[Solver] | None): Setup step (always run + even when the main `solver` is replaced). + solver: (Solver | list[Solver]): Solver or list of solvers. + Defaults to generate(), a normal call to the model. + scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output. + metrics (list[Metric] | dict[str, list[Metric]] | None): + Alternative metrics (overrides the metrics provided by the specified scorer). + config (GenerateConfig): Model generation config. 
+ sandbox (SandboxEnvironmentType | None): Sandbox environment type + (or optionally a str or tuple with a shorthand spec) + approval: (str | list[ApprovalPolicy] | None): Tool use approval policies. + Either a path to an approval policy config file or a list of approval policies. + Defaults to no approval policy. + epochs (int | Epochs | None): Epochs to repeat samples for and optional score + reducer function(s) used to combine sample scores (defaults to "mean") + fail_on_error (bool | float | None): `True` to fail on first sample error + (default); `False` to never fail on sample errors; Value between 0 and 1 + to fail if a proportion of total samples fails. Value greater than 1 to fail + eval if a count of samples fails. + message_limit (int | None): Limit on total messages used for each sample. + token_limit (int | None): Limit on total tokens used for each sample. + time_limit (int | None): Limit on time (in seconds) for execution of each sample. + name: (str | None): Task name. If not specified is automatically + determined based on the name of the task directory (or "task") + if its anonymous task (e.g. created in a notebook and passed to + eval() directly) + version: (int): Version of task (to distinguish evolutions + of the task spec or breaking changes to it) + metadata: (dict[str, Any] | None): Additional metadata to associate with the task. + **kwargs: Deprecated arguments. + """ # handle deprecated args for arg, value in kwargs.items(): newarg = "" @@ -179,33 +182,33 @@ def task_with( task (Task): Task to adapt (it is deep copied prior to mutating options) dataset (Dataset | Sequence[Sample]): Dataset to evaluate setup: (Solver | list[Solver] | None): Setup step (always run - even when the main `solver` is replaced). + even when the main `solver` is replaced). solver: (Solver | list[Solver]): Solver or list of solvers. - Defaults to generate(), a normal call to the model. + Defaults to generate(), a normal call to the model. scorer: (Scorer | list[Scorer] | None): Scorer used to evaluate model output. metrics (list[Metric] | dict[str, list[Metric]] | None): - Alternative metrics (overrides the metrics provided by the specified scorer). + Alternative metrics (overrides the metrics provided by the specified scorer). config (GenerateConfig): Model generation config. sandbox (SandboxEnvironmentType | None): Sandbox environment type - (or optionally a str or tuple with a shorthand spec) + (or optionally a str or tuple with a shorthand spec) approval: (str | list[ApprovalPolicy] | None): Tool use approval policies. - Either a path to an approval policy config file or a list of approval policies. - Defaults to no approval policy. + Either a path to an approval policy config file or a list of approval policies. + Defaults to no approval policy. epochs (int | Epochs | None): Epochs to repeat samples for and optional score - reducer function(s) used to combine sample scores (defaults to "mean") + reducer function(s) used to combine sample scores (defaults to "mean") fail_on_error (bool | float | None): `True` to fail on first sample error - (default); `False` to never fail on sample errors; Value between 0 and 1 - to fail if a proportion of total samples fails. Value greater than 1 to fail - eval if a count of samples fails. + (default); `False` to never fail on sample errors; Value between 0 and 1 + to fail if a proportion of total samples fails. Value greater than 1 to fail + eval if a count of samples fails. message_limit (int | None): Limit on total messages used for each sample. 
token_limit (int | None): Limit on total tokens used for each sample. time_limit (int | None): Limit on time (in seconds) for execution of each sample. name: (str | None): Task name. If not specified is automatically - determined based on the name of the task directory (or "task") - if its anonymous task (e.g. created in a notebook and passed to - eval() directly) + determined based on the name of the task directory (or "task") + if its anonymous task (e.g. created in a notebook and passed to + eval() directly) version: (int): Version of task (to distinguish evolutions - of the task spec or breaking changes to it) + of the task spec or breaking changes to it) metadata: (dict[str, Any] | None): Additional metadata to associate with the task. Returns: diff --git a/src/inspect_ai/_util/__init__.py b/src/inspect_ai/_util/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/inspect_ai/_util/content.py b/src/inspect_ai/_util/content.py index bb925e447..d4b87abfc 100644 --- a/src/inspect_ai/_util/content.py +++ b/src/inspect_ai/_util/content.py @@ -4,6 +4,8 @@ class ContentText(BaseModel): + """Text content.""" + type: Literal["text"] = Field(default="text") """Type.""" @@ -12,6 +14,8 @@ class ContentText(BaseModel): class ContentImage(BaseModel): + """Image content.""" + type: Literal["image"] = Field(default="image") """Type.""" @@ -26,6 +30,8 @@ class ContentImage(BaseModel): class ContentAudio(BaseModel): + """Audio content.""" + type: Literal["audio"] = Field(default="audio") """Type.""" @@ -37,6 +43,8 @@ class ContentAudio(BaseModel): class ContentVideo(BaseModel): + """Video content.""" + type: Literal["video"] = Field(default="video") """Type.""" diff --git a/src/inspect_ai/_util/error.py b/src/inspect_ai/_util/error.py index 386d9322e..5064913ac 100644 --- a/src/inspect_ai/_util/error.py +++ b/src/inspect_ai/_util/error.py @@ -9,6 +9,8 @@ class EvalError(BaseModel): + """Eval error details.""" + message: str """Error message.""" diff --git a/src/inspect_ai/_util/file.py b/src/inspect_ai/_util/file.py index 8de32d9cc..883034c15 100644 --- a/src/inspect_ai/_util/file.py +++ b/src/inspect_ai/_util/file.py @@ -18,6 +18,7 @@ from fsspec.implementations.local import make_path_posix # type: ignore from pydantic import BaseModel from s3fs import S3FileSystem # type: ignore +from shortuuid import uuid # https://filesystem-spec.readthedocs.io/en/latest/_modules/fsspec/spec.html#AbstractFileSystem # https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.generic.GenericFileSystem @@ -169,6 +170,9 @@ def sep(self) -> str: def exists(self, path: str) -> bool: return self.fs.exists(path) is True + def touch(self, path: str) -> None: + self.fs.touch(path) + def rm( self, path: str, recursive: bool = False, maxdepth: int | None = None ) -> None: @@ -218,6 +222,16 @@ def ls( def is_local(self) -> bool: return isinstance(self.fs, fsspec.implementations.local.LocalFileSystem) + def is_writeable(self, path: str) -> bool: + try: + path = path.rstrip("/\\") + touch_file = f"{path}{self.fs.sep}{uuid()}" + self.touch(touch_file) + self.rm(touch_file) + return True + except PermissionError: + return False + def is_async(self) -> bool: return isinstance(self.fs, fsspec.asyn.AsyncFileSystem) @@ -354,7 +368,7 @@ def safe_filename(s: str, max_length: int = 255) -> str: Returns: str: A safe filename string - Example: + Examples: >>> safe_filename("Hello/World?.txt") 'Hello_World.txt' """ diff --git a/src/inspect_ai/_util/logger.py b/src/inspect_ai/_util/logger.py index 
527321766..b86a0b6ee 100644 --- a/src/inspect_ai/_util/logger.py +++ b/src/inspect_ai/_util/logger.py @@ -181,7 +181,9 @@ def notify_logger_record(record: LogRecord, write: bool) -> None: from inspect_ai.log._transcript import LoggerEvent, transcript if write: - transcript()._event(LoggerEvent(message=LoggingMessage.from_log_record(record))) + transcript()._event( + LoggerEvent(message=LoggingMessage._from_log_record(record)) + ) global _rate_limit_count if (record.levelno <= INFO and re.search(r"\b429\b", record.getMessage())) or ( record.levelno == DEBUG diff --git a/src/inspect_ai/_view/view.py b/src/inspect_ai/_view/view.py index 22b04aec1..073b558a3 100644 --- a/src/inspect_ai/_view/view.py +++ b/src/inspect_ai/_view/view.py @@ -28,11 +28,10 @@ def view( port: int = DEFAULT_VIEW_PORT, authorization: str | None = None, log_level: str | None = None, - log_level_transcript: str | None = None, fs_options: dict[str, Any] = {}, ) -> None: init_dotenv() - init_logger(log_level, log_level_transcript) + init_logger(log_level) # initialize the log_dir log_dir = log_dir if log_dir else os.getenv("INSPECT_LOG_DIR", "./logs") diff --git a/src/inspect_ai/_view/www/App.css b/src/inspect_ai/_view/www/App.css index 1c0e97bba..385e0c3fd 100644 --- a/src/inspect_ai/_view/www/App.css +++ b/src/inspect_ai/_view/www/App.css @@ -25,6 +25,7 @@ /* Inspect Font Sizes */ --inspect-font-size-title: 1.5rem; --inspect-font-size-title-secondary: 1.3rem; + --inspect-font-size-largest: 1.2rem; --inspect-font-size-larger: 1.1rem; --inspect-font-size-large: 1rem; --inspect-font-size-base: 0.9rem; @@ -84,6 +85,10 @@ body[class^="vscode-"] .app-main-grid { font-size: var(--inspect-font-size-title-secondary); } +.text-size-largest { + font-size: var(--inspect-font-size-largest); +} + .text-size-larger { font-size: var(--inspect-font-size-larger); } diff --git a/src/inspect_ai/_view/www/dist/assets/index.css b/src/inspect_ai/_view/www/dist/assets/index.css index 091f8a1e8..0aaf777ae 100644 --- a/src/inspect_ai/_view/www/dist/assets/index.css +++ b/src/inspect_ai/_view/www/dist/assets/index.css @@ -14298,6 +14298,7 @@ pre[class*="language-"] { /* Inspect Font Sizes */ --inspect-font-size-title: 1.5rem; --inspect-font-size-title-secondary: 1.3rem; + --inspect-font-size-largest: 1.2rem; --inspect-font-size-larger: 1.1rem; --inspect-font-size-large: 1rem; --inspect-font-size-base: 0.9rem; @@ -14357,6 +14358,10 @@ body[class^="vscode-"] .app-main-grid { font-size: var(--inspect-font-size-title-secondary); } +.text-size-largest { + font-size: var(--inspect-font-size-largest); +} + .text-size-larger { font-size: var(--inspect-font-size-larger); } @@ -19490,7 +19495,7 @@ span.ap-marker-container:hover span.ap-marker { display: grid; grid-template-columns: minmax(0, max-content) max-content; } -._simpleMetricsRows_13pa9_1 { +._simpleMetricsRows_tnqkm_1 { display: flex; flex-direction: row; flex-wrap: wrap; @@ -19501,28 +19506,28 @@ span.ap-marker-container:hover span.ap-marker { overflow: scroll; } -._multiMetricsRows_13pa9_12 { +._multiMetricsRows_tnqkm_12 { display: flex; flex-direction: row; flex-wrap: wrap; justify-content: end; - height: 100%; align-items: center; margin-top: 0.2rem; padding-bottom: 0.4rem; row-gap: 1em; max-height: 15em; overflow: scroll; + align-items: baseline; } -._verticalMetricReducer_13pa9_26 { +._verticalMetricReducer_tnqkm_26 { font-size: var(--inspect-font-size-smaller); text-align: center; padding-top: 0.3rem; margin-bottom: -0.3rem; } -._verticalMetricName_13pa9_33 { +._verticalMetricName_tnqkm_33 
{ font-size: var(--inspect-font-size-smaller); text-align: center; padding-top: 0.3rem; @@ -19530,32 +19535,55 @@ span.ap-marker-container:hover span.ap-marker { border-bottom: solid var(--bs-border-color) 1px; } -._verticalMetricValue_13pa9_41 { - font-size: var(--inspect-font-size-larger); +._verticalMetricValue_tnqkm_41 { font-weight: 500; text-align: center; } -._multiScorerReducer_13pa9_47 { +._multiScorer_tnqkm_46 { + padding-left: 0; + height: 100%; + display: flex; + flex-direction: column; + padding: 0.5em 1em; +} + +._multiScorerIndent_tnqkm_54 { + padding-left: 1.5em; +} + +._multiScorerReducer_tnqkm_58 { text-align: center; margin-bottom: -0.3rem; + margin-top: 0.2em; } -._multiScorerLabel_13pa9_52 { +._multiScorerLabel_tnqkm_64 { text-align: center; border-bottom: solid var(--bs-border-color) 1px; margin-bottom: -0.1rem; } -._multiScorerValue_13pa9_58 { +._multiScorerValue_tnqkm_70 { display: grid; grid-template-columns: auto auto; + grid-auto-rows: auto; grid-column-gap: 0.3rem; grid-row-gap: 0; + padding-top: 0.3em; } -._multiScorerValueContent_13pa9_65 { +._multiScorerValueContent_tnqkm_79 { font-weight: 600; + text-align: center; +} + +._multiScoreMetricGrid_tnqkm_84 { + display: grid; + grid-template-rows: auto auto; + column-gap: 1em; + padding: 0 0.2em; + justify-content: center; } ._statusPanel_1fzh4_1 { padding: 1em; diff --git a/src/inspect_ai/_view/www/dist/assets/index.js b/src/inspect_ai/_view/www/dist/assets/index.js index 5ecdbf782..6c776d95c 100644 --- a/src/inspect_ai/_view/www/dist/assets/index.js +++ b/src/inspect_ai/_view/www/dist/assets/index.js @@ -15908,17 +15908,19 @@ var require_assets = __commonJS({ title: title2, activeItem, setActiveItem - } + }, + `nav-pill-contents-${idx}` ); }); - const navBodies = children2.map((child) => { + const navBodies = children2.map((child, idx) => { var _a2; return /* @__PURE__ */ jsxRuntimeExports.jsx( "div", { className: ((_a2 = child["props"]) == null ? void 0 : _a2.title) === activeItem ? 
styles$11.visible : styles$11.hidden, children: child - } + }, + `nav-pill-container-${idx}` ); }); return /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { children: [ @@ -21347,21 +21349,31 @@ var require_assets = __commonJS({ } const outputs = []; if (Array.isArray(output2)) { - output2.forEach((out) => { + output2.forEach((out, idx) => { + const key2 = `tool-output-${idx}`; if (out.type === "text") { - outputs.push(/* @__PURE__ */ jsxRuntimeExports.jsx(ToolTextOutput, { text: out.text })); + outputs.push(/* @__PURE__ */ jsxRuntimeExports.jsx(ToolTextOutput, { text: out.text }, key2)); } else { if (out.image.startsWith("data:")) { outputs.push( - /* @__PURE__ */ jsxRuntimeExports.jsx("img", { className: clsx(styles$_.toolImage), src: out.image }) + /* @__PURE__ */ jsxRuntimeExports.jsx( + "img", + { + className: clsx(styles$_.toolImage), + src: out.image + }, + key2 + ) ); } else { - outputs.push(/* @__PURE__ */ jsxRuntimeExports.jsx(ToolTextOutput, { text: String(out.image) })); + outputs.push(/* @__PURE__ */ jsxRuntimeExports.jsx(ToolTextOutput, { text: String(out.image) }, key2)); } } }); } else { - outputs.push(/* @__PURE__ */ jsxRuntimeExports.jsx(ToolTextOutput, { text: String(output2) })); + outputs.push( + /* @__PURE__ */ jsxRuntimeExports.jsx(ToolTextOutput, { text: String(output2) }, "tool-output-single") + ); } return /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: clsx(styles$_.output), children: outputs }); }; @@ -21373,6 +21385,7 @@ var require_assets = __commonJS({ return contents2.map((content2, index2) => { if (typeof content2 === "string") { return messageRenderers["text"].render( + `text-content-${index2}`, { type: "text", text: content2 @@ -21383,7 +21396,11 @@ var require_assets = __commonJS({ if (content2) { const renderer = messageRenderers[content2.type]; if (renderer) { - return renderer.render(content2, index2 === contents2.length - 1); + return renderer.render( + `text-${content2.type}-${index2}`, + content2, + index2 === contents2.length - 1 + ); } else { console.error(`Unknown message content type '${content2.type}'`); } @@ -21395,48 +21412,53 @@ var require_assets = __commonJS({ type: "text", text: contents2 }; - return messageRenderers["text"].render(contentText, true); + return messageRenderers["text"].render( + "text-message-content", + contentText, + true + ); } }; const messageRenderers = { text: { - render: (content2, isLast) => { + render: (key2, content2, isLast) => { const c2 = content2; return /* @__PURE__ */ jsxRuntimeExports.jsx( MarkdownDiv, { markdown: c2.text, className: isLast ? 
"no-last-para-padding" : "" - } + }, + key2 ); } }, image: { - render: (content2) => { + render: (key2, content2) => { const c2 = content2; if (c2.image.startsWith("data:")) { - return /* @__PURE__ */ jsxRuntimeExports.jsx("img", { src: c2.image, className: styles$$.contentImage }); + return /* @__PURE__ */ jsxRuntimeExports.jsx("img", { src: c2.image, className: styles$$.contentImage }, key2); } else { - return /* @__PURE__ */ jsxRuntimeExports.jsx("code", { children: c2.image }); + return /* @__PURE__ */ jsxRuntimeExports.jsx("code", { children: c2.image }, key2); } } }, audio: { - render: (content2) => { + render: (key2, content2) => { const c2 = content2; - return /* @__PURE__ */ jsxRuntimeExports.jsx("audio", { controls: true, children: /* @__PURE__ */ jsxRuntimeExports.jsx("source", { src: c2.audio, type: mimeTypeForFormat(c2.format) }) }); + return /* @__PURE__ */ jsxRuntimeExports.jsx("audio", { controls: true, children: /* @__PURE__ */ jsxRuntimeExports.jsx("source", { src: c2.audio, type: mimeTypeForFormat(c2.format) }) }, key2); } }, video: { - render: (content2) => { + render: (key2, content2) => { const c2 = content2; - return /* @__PURE__ */ jsxRuntimeExports.jsx("video", { width: "500", height: "375", controls: true, children: /* @__PURE__ */ jsxRuntimeExports.jsx("source", { src: c2.video, type: mimeTypeForFormat(c2.format) }) }); + return /* @__PURE__ */ jsxRuntimeExports.jsx("video", { width: "500", height: "375", controls: true, children: /* @__PURE__ */ jsxRuntimeExports.jsx("source", { src: c2.video, type: mimeTypeForFormat(c2.format) }) }, key2); } }, tool: { - render: (content2) => { + render: (key2, content2) => { const c2 = content2; - return /* @__PURE__ */ jsxRuntimeExports.jsx(ToolOutput, { output: c2.content }); + return /* @__PURE__ */ jsxRuntimeExports.jsx(ToolOutput, { output: c2.content }, key2); } } }; @@ -21456,23 +21478,17 @@ var require_assets = __commonJS({ }; const resolveToolInput = (fn, toolArgs) => { const toolName = fn; - const [inputKey, inputType] = extractInputMetadata(toolName); - if (inputKey) { - const { input: input2, args } = extractInput( - inputKey, - toolArgs - ); - const functionCall = args.length > 0 ? `${toolName}(${args.join(",")})` : toolName; - return { - functionCall, - input: input2, - inputType - }; - } else { - return { - functionCall: toolName - }; - } + const [inputKey, highlightLanguage] = extractInputMetadata(toolName); + const { input: input2, args } = extractInput( + toolArgs, + inputKey + ); + const functionCall = args.length > 0 ? `${toolName}(${args.join(", ")})` : toolName; + return { + functionCall, + input: input2, + highlightLanguage + }; }; const extractInputMetadata = (toolName) => { if (toolName === "bash") { @@ -21485,25 +21501,13 @@ var require_assets = __commonJS({ return [void 0, void 0]; } }; - const extractInput = (inputKey, args) => { + const extractInput = (args, inputKey) => { const formatArg = (key2, value2) => { - const quotedValue = typeof value2 === "string" ? `"${value2}"` : value2; + const quotedValue = typeof value2 === "string" ? `"${value2}"` : typeof value2 === "object" || Array.isArray(value2) ? 
JSON.stringify(value2, void 0, 2) : String(value2); return `${key2}: ${quotedValue}`; }; if (args) { - if (Object.keys(args).length === 1) { - const inputRaw = args[Object.keys(args)[0]]; - let input2; - if (Array.isArray(inputRaw) || typeof inputRaw === "object") { - input2 = JSON.stringify(inputRaw, void 0, 2); - } else { - input2 = String(inputRaw); - } - return { - input: input2, - args: [] - }; - } else if (args[inputKey]) { + if (inputKey && args[inputKey]) { const input2 = args[inputKey]; const filteredArgs = Object.keys(args).filter((key2) => { return key2 !== inputKey; @@ -21529,87 +21533,6 @@ var require_assets = __commonJS({ args: [] }; }; - var murmurhash$1 = { exports: {} }; - (function(module2) { - (function() { - const createBuffer = (val) => new TextEncoder().encode(val); - function MurmurHashV2(str2, seed) { - if (typeof str2 === "string") str2 = createBuffer(str2); - let l = str2.length, h = seed ^ l, i2 = 0, k; - while (l >= 4) { - k = str2[i2] & 255 | (str2[++i2] & 255) << 8 | (str2[++i2] & 255) << 16 | (str2[++i2] & 255) << 24; - k = (k & 65535) * 1540483477 + (((k >>> 16) * 1540483477 & 65535) << 16); - k ^= k >>> 24; - k = (k & 65535) * 1540483477 + (((k >>> 16) * 1540483477 & 65535) << 16); - h = (h & 65535) * 1540483477 + (((h >>> 16) * 1540483477 & 65535) << 16) ^ k; - l -= 4; - ++i2; - } - switch (l) { - case 3: - h ^= (str2[i2 + 2] & 255) << 16; - case 2: - h ^= (str2[i2 + 1] & 255) << 8; - case 1: - h ^= str2[i2] & 255; - h = (h & 65535) * 1540483477 + (((h >>> 16) * 1540483477 & 65535) << 16); - } - h ^= h >>> 13; - h = (h & 65535) * 1540483477 + (((h >>> 16) * 1540483477 & 65535) << 16); - h ^= h >>> 15; - return h >>> 0; - } - function MurmurHashV3(key2, seed) { - if (typeof key2 === "string") key2 = createBuffer(key2); - let remainder, bytes, h1, h1b, c1, c2, k1, i2; - remainder = key2.length & 3; - bytes = key2.length - remainder; - h1 = seed; - c1 = 3432918353; - c2 = 461845907; - i2 = 0; - while (i2 < bytes) { - k1 = key2[i2] & 255 | (key2[++i2] & 255) << 8 | (key2[++i2] & 255) << 16 | (key2[++i2] & 255) << 24; - ++i2; - k1 = (k1 & 65535) * c1 + (((k1 >>> 16) * c1 & 65535) << 16) & 4294967295; - k1 = k1 << 15 | k1 >>> 17; - k1 = (k1 & 65535) * c2 + (((k1 >>> 16) * c2 & 65535) << 16) & 4294967295; - h1 ^= k1; - h1 = h1 << 13 | h1 >>> 19; - h1b = (h1 & 65535) * 5 + (((h1 >>> 16) * 5 & 65535) << 16) & 4294967295; - h1 = (h1b & 65535) + 27492 + (((h1b >>> 16) + 58964 & 65535) << 16); - } - k1 = 0; - switch (remainder) { - case 3: - k1 ^= (key2[i2 + 2] & 255) << 16; - case 2: - k1 ^= (key2[i2 + 1] & 255) << 8; - case 1: - k1 ^= key2[i2] & 255; - k1 = (k1 & 65535) * c1 + (((k1 >>> 16) * c1 & 65535) << 16) & 4294967295; - k1 = k1 << 15 | k1 >>> 17; - k1 = (k1 & 65535) * c2 + (((k1 >>> 16) * c2 & 65535) << 16) & 4294967295; - h1 ^= k1; - } - h1 ^= key2.length; - h1 ^= h1 >>> 16; - h1 = (h1 & 65535) * 2246822507 + (((h1 >>> 16) * 2246822507 & 65535) << 16) & 4294967295; - h1 ^= h1 >>> 13; - h1 = (h1 & 65535) * 3266489909 + (((h1 >>> 16) * 3266489909 & 65535) << 16) & 4294967295; - h1 ^= h1 >>> 16; - return h1 >>> 0; - } - const murmur = MurmurHashV3; - murmur.v2 = MurmurHashV2; - murmur.v3 = MurmurHashV3; - { - module2.exports = murmur; - } - })(); - })(murmurhash$1); - var murmurhashExports = murmurhash$1.exports; - const murmurhash = /* @__PURE__ */ getDefaultExportFromCjs(murmurhashExports); const outputPre = "_outputPre_18agr_1"; const outputCode = "_outputCode_18agr_7"; const bottomMargin = "_bottomMargin_18agr_12"; @@ -21618,71 +21541,58 @@ var 
require_assets = __commonJS({ outputCode, bottomMargin }; - const ToolInput = ({ - type, - contents: contents2, - view - }) => { - if (!contents2 && !(view == null ? void 0 : view.content)) { - return null; - } - if (view) { + const useCodeHighlight = (language2) => { + const codeRef = reactExports.useRef(null); + reactExports.useEffect(() => { + if (codeRef.current && language2) { + prismExports.highlightElement(codeRef.current); + } + }, [language2]); + return codeRef; + }; + const ToolInput = reactExports.memo((props) => { + const { highlightLanguage, contents: contents2, toolCallView } = props; + const codeRef = useCodeHighlight(highlightLanguage); + if (!contents2 && !(toolCallView == null ? void 0 : toolCallView.content)) return null; + if (toolCallView) { const toolViewRef = reactExports.useRef(null); reactExports.useEffect(() => { - if (toolViewRef.current) { - for (const child of toolViewRef.current.children) { - if (child.tagName === "PRE") { - const childChild = child.firstElementChild; - if (childChild && childChild.tagName === "CODE") { - const hasLanguageClass = Array.from(childChild.classList).some( - (className2) => className2.startsWith("language-") - ); - if (hasLanguageClass) { - child.classList.add("tool-output"); - prismExports.highlightElement(childChild); - } + if ((toolCallView == null ? void 0 : toolCallView.content) && toolViewRef.current) { + requestAnimationFrame(() => { + const codeBlocks = toolViewRef.current.querySelectorAll("pre code"); + codeBlocks.forEach((block2) => { + if (block2.className.includes("language-")) { + block2.classList.add("sourceCode"); + prismExports.highlightElement(block2); } - } - } + }); + }); } - }, [contents2, view]); + }, [toolCallView == null ? void 0 : toolCallView.content]); return /* @__PURE__ */ jsxRuntimeExports.jsx( MarkdownDiv, { - markdown: view.content, + markdown: toolCallView.content, ref: toolViewRef, - className: clsx(styles$Z.bottomMargin) - } - ); - } else { - const toolInputRef = reactExports.useRef(null); - reactExports.useEffect(() => { - if (type) { - const tokens = prismExports.languages[type]; - if (toolInputRef.current && tokens) { - prismExports.highlightElement(toolInputRef.current); - } - } - }, [contents2, type, view]); - contents2 = typeof contents2 === "object" || Array.isArray(contents2) ? JSON.stringify(contents2) : contents2; - const key2 = murmurhash.v3(contents2 || ""); - return /* @__PURE__ */ jsxRuntimeExports.jsx( - "pre", - { - className: clsx("tool-output", styles$Z.outputPre, styles$Z.bottomMargin), - children: /* @__PURE__ */ jsxRuntimeExports.jsx( - "code", - { - ref: toolInputRef, - className: clsx("source-code", `language-${type}`, styles$Z.outputCode), - children: contents2 - }, - key2 - ) + className: clsx(styles$Z.bottomMargin, "text-size-small") } ); } - }; + const formattedContent = typeof contents2 === "object" ? 
JSON.stringify(contents2) : contents2; + return /* @__PURE__ */ jsxRuntimeExports.jsx("pre", { className: clsx("tool-output", styles$Z.outputPre, styles$Z.bottomMargin), children: /* @__PURE__ */ jsxRuntimeExports.jsx( + "code", + { + ref: codeRef, + className: clsx( + "source-code", + "sourceCode", + `language-${highlightLanguage}`, + styles$Z.outputCode + ), + children: formattedContent + } + ) }); + }); const image = "_image_10saa_1"; const styles$Y = { image @@ -21696,7 +21606,7 @@ var require_assets = __commonJS({ const ToolCallView = ({ functionCall, input: input2, - inputType, + highlightLanguage, view, output: output2, mode @@ -21714,11 +21624,19 @@ var require_assets = __commonJS({ return false; } const collapse = Array.isArray(output2) ? output2.every((item2) => !isContentImage(item2)) : !isContentImage(output2); + const normalizedContent = reactExports.useMemo(() => normalizeContent$1(output2), [output2]); return /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { children: [ mode !== "compact" && (!view || view.title) ? /* @__PURE__ */ jsxRuntimeExports.jsx(ToolTitle, { title: (view == null ? void 0 : view.title) || functionCall }) : "", /* @__PURE__ */ jsxRuntimeExports.jsx("div", { children: /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { children: [ - /* @__PURE__ */ jsxRuntimeExports.jsx(ToolInput, { type: inputType, contents: input2, view }), - output2 ? /* @__PURE__ */ jsxRuntimeExports.jsx(ExpandablePanel, { collapse, border: true, lines: 15, children: /* @__PURE__ */ jsxRuntimeExports.jsx(MessageContent, { contents: normalizeContent$1(output2) }) }) : "" + /* @__PURE__ */ jsxRuntimeExports.jsx( + ToolInput, + { + highlightLanguage, + contents: input2, + toolCallView: view + } + ), + /* @__PURE__ */ jsxRuntimeExports.jsx(ExpandablePanel, { collapse, border: true, lines: 15, children: /* @__PURE__ */ jsxRuntimeExports.jsx(MessageContent, { contents: normalizedContent }) }) ] }) }) ] }); }; @@ -21749,14 +21667,8 @@ var require_assets = __commonJS({ toolCallStyle }) => { if (message2.role === "assistant" && message2.tool_calls && message2.tool_calls.length) { - const result = []; - if (message2.content) { - result.push( - /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: styles$X.content, children: /* @__PURE__ */ jsxRuntimeExports.jsx(MessageContent, { contents: message2.content }) }) - ); - } const toolCalls = message2.tool_calls.map((tool_call, idx) => { - const { input: input2, functionCall, inputType } = resolveToolInput( + const { input: input2, functionCall, highlightLanguage } = resolveToolInput( tool_call.function, tool_call.arguments ); @@ -21770,26 +21682,27 @@ var require_assets = __commonJS({ } const resolvedToolOutput = resolveToolMessage(toolMessage); if (toolCallStyle === "compact") { - return /* @__PURE__ */ jsxRuntimeExports.jsxs("code", { children: [ + return /* @__PURE__ */ jsxRuntimeExports.jsx("div", { children: /* @__PURE__ */ jsxRuntimeExports.jsxs("code", { children: [ "tool: ", functionCall - ] }); + ] }) }, `tool-call-${idx}`); } else { return /* @__PURE__ */ jsxRuntimeExports.jsx( ToolCallView, { functionCall, input: input2, - inputType, + highlightLanguage, output: resolvedToolOutput - } + }, + `tool-call-${idx}` ); } }); - if (toolCalls) { - result.push(...toolCalls); - } - return result; + return /* @__PURE__ */ jsxRuntimeExports.jsxs(reactExports.Fragment, { children: [ + /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: styles$X.content, children: message2.content ? 
/* @__PURE__ */ jsxRuntimeExports.jsx(MessageContent, { contents: message2.content }) : void 0 }), + toolCalls + ] }); } else { return /* @__PURE__ */ jsxRuntimeExports.jsx(MessageContent, { contents: message2.content }); } @@ -21923,7 +21836,7 @@ var require_assets = __commonJS({ message2.role === "assistant" && message2.reasoning ? /* @__PURE__ */ jsxRuntimeExports.jsxs(reactExports.Fragment, { children: [ /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: clsx("text-style-label", "text-style-secondary"), children: "Reasoning" }), /* @__PURE__ */ jsxRuntimeExports.jsx(ExpandablePanel, { collapse: true, children: /* @__PURE__ */ jsxRuntimeExports.jsx(MarkdownDiv, { markdown: message2.reasoning }) }) - ] }) : void 0, + ] }, `${id}-response-label`) : void 0, /* @__PURE__ */ jsxRuntimeExports.jsxs( "div", { @@ -22018,7 +21931,8 @@ var require_assets = __commonJS({ resolvedMessage: msg, indented: indented2, toolCallStyle - } + }, + `${id}-msg-${index2}` ); }) }); return result; @@ -22093,7 +22007,7 @@ var require_assets = __commonJS({ } ), /* @__PURE__ */ jsxRuntimeExports.jsx("td", { className: clsx(styles$V.cell, styles$V.cellValue, "text-size-small"), children: /* @__PURE__ */ jsxRuntimeExports.jsx(RenderedContent, { id: id2, entry: entry2 }) }) - ] }); + ] }, id2); }); return /* @__PURE__ */ jsxRuntimeExports.jsxs( "table", @@ -25410,7 +25324,7 @@ categories: ${categories.join(" ")}`; setSort(sel.value); }, children: options2.map((option) => { - return /* @__PURE__ */ jsxRuntimeExports.jsx("option", { value: option.val, children: option.label }); + return /* @__PURE__ */ jsxRuntimeExports.jsx("option", { value: option.val, children: option.label }, option.val); }) } ) @@ -25617,6 +25531,28 @@ categories: ${categories.join(" ")}`; running, cancelled }; + const metricDisplayName = (metric2) => { + let modifier = void 0; + for (const metricModifier of metricModifiers) { + modifier = metricModifier(metric2); + if (modifier) { + break; + } + } + const metricName2 = !modifier ? metric2.name : `${metric2.name}[${modifier}]`; + return metricName2; + }; + const clusterMetricModifier = (metric2) => { + if (metric2.name !== "stderr") { + return void 0; + } + const clusterValue = (metric2.params || {})["cluster"]; + if (clusterValue === void 0 || typeof clusterValue !== "string") { + return void 0; + } + return clusterValue; + }; + const metricModifiers = [clusterMetricModifier]; const container$c = "_container_1frsg_1"; const metric = "_metric_1frsg_8"; const metricName$1 = "_metricName_1frsg_17"; @@ -25628,6 +25564,7 @@ categories: ${categories.join(" ")}`; metricReducer: metricReducer$1 }; const SidebarScoreView = ({ scorer }) => { + const showReducer = !!scorer.reducer; return /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: styles$O.container, children: Object.keys(scorer.metrics).map((metric2) => { return /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { className: styles$O.metric, children: [ /* @__PURE__ */ jsxRuntimeExports.jsx( @@ -25639,15 +25576,12 @@ categories: ${categories.join(" ")}`; "text-size-small", styles$O.metricName ), - children: scorer.metrics[metric2].name + children: metricDisplayName(scorer.metrics[metric2]) } ), - scorer.reducer ? /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { className: clsx("text-size-small", styles$O.metricReducer), children: [ - "$", - scorer.reducer - ] }) : "", + showReducer ? 
/* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: clsx("text-size-small", styles$O.metricReducer), children: scorer.reducer || "default" }) : "", /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: "text-size-title-secondary", children: formatPrettyDecimal(scorer.metrics[metric2].value) }) - ] }); + ] }, metric2); }) }); }; const container$b = "_container_5kpg1_1"; @@ -25665,7 +25599,8 @@ categories: ${categories.join(" ")}`; metricValue }; const SidebarScoresView = ({ scores: scores2 }) => { - return /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: styles$N.container, children: scores2.map((score2) => { + const showReducer = scores2.findIndex((score2) => !!score2.reducer) !== -1; + return /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: styles$N.container, children: scores2.map((score2, idx) => { const name2 = score2.name; const reducer = score2.reducer; return /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { className: styles$N.scoreWrapper, children: [ @@ -25674,31 +25609,33 @@ categories: ${categories.join(" ")}`; { className: clsx( "text-style-secondary", - "text-label", + "text-style-label", "text-size-small", styles$N.metricName ), children: name2 } ), - reducer ? /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: clsx("text-size-small", styles$N.metricReducer), children: reducer }) : "", + showReducer ? /* @__PURE__ */ jsxRuntimeExports.jsx( + "div", + { + className: clsx( + "text-size-small", + "text-style-label", + "text-style-secondary", + styles$N.metricReducer + ), + children: reducer || "default" + } + ) : "", /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: clsx("text-size-small", styles$N.metricValues), children: Object.keys(score2.metrics).map((key2) => { const metric2 = score2.metrics[key2]; return /* @__PURE__ */ jsxRuntimeExports.jsxs(reactExports.Fragment, { children: [ - /* @__PURE__ */ jsxRuntimeExports.jsx( - "div", - { - className: clsx( - "text-style-secondary", - "text-style-label" - ), - children: metric2.name - } - ), + /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: clsx(), children: metricDisplayName(metric2) }), /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: styles$N.metricValue, children: formatPrettyDecimal(metric2.value) }) ] }, key2); }) }) - ] }); + ] }, `scorer-${name2}-${idx}`); }) }); }; const EvalStatus = ({ logHeader }) => { @@ -46359,33 +46296,6 @@ Supported expressions: const scorerScores = scores2.filter((sc) => { return score2 && sc.scorer === score2.scorer; }); - const selectors = [ - /* @__PURE__ */ jsxRuntimeExports.jsx( - ScorerSelector, - { - scorers, - selectedIndex: scorerIndex(scorers, score2), - setSelectedIndex: (index2) => { - setScore(scorers[index2]); - } - } - ) - ]; - if (scorerScores.length > 1) { - selectors.push( - /* @__PURE__ */ jsxRuntimeExports.jsx( - ScoreSelector, - { - className: clsx(styles$J.secondSel), - scores: scorerScores, - selectedIndex: scoreIndex(scorerScores, score2), - setSelectedIndex: (index2) => { - setScore(scorerScores[index2]); - } - } - ) - ); - } return /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { className: styles$J.flex, children: [ /* @__PURE__ */ jsxRuntimeExports.jsx( "span", @@ -46401,7 +46311,27 @@ Supported expressions: children: "Scorer:" } ), - selectors + /* @__PURE__ */ jsxRuntimeExports.jsx( + ScorerSelector, + { + scorers, + selectedIndex: scorerIndex(scorers, score2), + setSelectedIndex: (index2) => { + setScore(scorers[index2]); + } + } + ), + scorerScores.length > 1 ? 
/* @__PURE__ */ jsxRuntimeExports.jsx( + ScoreSelector, + { + className: clsx(styles$J.secondSel), + scores: scorerScores, + selectedIndex: scoreIndex(scorerScores, score2), + setSelectedIndex: (index2) => { + setScore(scorerScores[index2]); + } + } + ) : void 0 ] }); } }; @@ -46427,7 +46357,7 @@ Supported expressions: setSelectedIndex(sel.selectedIndex); }, children: scores2.map((score2) => { - return /* @__PURE__ */ jsxRuntimeExports.jsx("option", { value: score2.name, children: score2.name }); + return /* @__PURE__ */ jsxRuntimeExports.jsx("option", { value: score2.name, children: score2.name }, score2.name); }) } ); @@ -46448,7 +46378,7 @@ Supported expressions: setSelectedIndex(sel.selectedIndex); }, children: scorers.map((scorer) => { - return /* @__PURE__ */ jsxRuntimeExports.jsx("option", { value: scorer.scorer, children: scorer.scorer }); + return /* @__PURE__ */ jsxRuntimeExports.jsx("option", { value: scorer.scorer, children: scorer.scorer }, scorer.scorer); }) } ); @@ -46472,8 +46402,7 @@ Supported expressions: scores: scores2, sampleDescriptor }) => { - const tools2 = []; - tools2.push( + return /* @__PURE__ */ jsxRuntimeExports.jsxs(jsxRuntimeExports.Fragment, { children: [ /* @__PURE__ */ jsxRuntimeExports.jsx( SampleFilter, { @@ -46481,20 +46410,11 @@ Supported expressions: scoreFilter, setScoreFilter } - ) - ); - if (scores2.length > 1) { - tools2.push( - /* @__PURE__ */ jsxRuntimeExports.jsx(SelectScorer, { scores: scores2, score: score2, setScore }) - ); - } - if (epochs > 1) { - tools2.push( - /* @__PURE__ */ jsxRuntimeExports.jsx(EpochFilter, { epoch, setEpoch, epochs }) - ); - } - tools2.push(/* @__PURE__ */ jsxRuntimeExports.jsx(SortFilter, { sort, setSort, epochs })); - return tools2; + ), + scores2.length > 1 ? /* @__PURE__ */ jsxRuntimeExports.jsx(SelectScorer, { scores: scores2, score: score2, setScore }) : void 0, + epochs > 1 ? /* @__PURE__ */ jsxRuntimeExports.jsx(EpochFilter, { epoch, setEpoch, epochs }) : void 0, + /* @__PURE__ */ jsxRuntimeExports.jsx(SortFilter, { sort, setSort, epochs }) + ] }); }; const filename = (path) => { const pathparts = path.split("/"); @@ -48822,7 +48742,6 @@ self.onmessage = function (e) { if (remoteZipFile.centralDirectory.has(sampleFile)) { return await readJSONFile(sampleFile, MAX_BYTES); } else { - console.log({ dir: remoteZipFile.centralDirectory }); throw new Error( `Unable to read sample file ${sampleFile} - it is not present in the manifest.` ); @@ -49297,7 +49216,7 @@ self.onmessage = function (e) { tools: tools2, children: children2 }) => { - const validTabs = Array.isArray(children2) ? 
children2.filter(Boolean) : [children2]; + const validTabs = flattenChildren(children2); if (validTabs.length === 0) return null; return /* @__PURE__ */ jsxRuntimeExports.jsxs(reactExports.Fragment, { children: [ /* @__PURE__ */ jsxRuntimeExports.jsxs( @@ -49411,6 +49330,18 @@ self.onmessage = function (e) { const TabTools = ({ tools: tools2 }) => /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: clsx("tab-tools", moduleStyles.tabTools), children: tools2 }); const computeTabId = (id, index2) => `${id}-${index2}`; const computeTabContentsId = (id) => `${id}-contents`; + const flattenChildren = (children2) => { + return reactExports.Children.toArray(children2).flatMap((child) => { + if (reactExports.isValidElement(child)) { + const element = child; + if (element.type === reactExports.Fragment) { + return flattenChildren(element.props.children); + } + return element; + } + return []; + }); + }; function escapeSelector(id) { return id.replace(/([ #.;,?!+*~'":^$[\]()=>|/\\])/g, "\\$1"); } @@ -49489,7 +49420,7 @@ self.onmessage = function (e) { children: /* @__PURE__ */ jsxRuntimeExports.jsx(RenderedContent, { id: id2, entry: entry2 }) } ) - ] }); + ] }, `${baseId}-record-${index2}`); }); return /* @__PURE__ */ jsxRuntimeExports.jsx("div", { id, className: clsx(className2, styles$H.grid), style: style2, children: entryEls }); }; @@ -49753,9 +49684,9 @@ self.onmessage = function (e) { value: usage.total_tokens, secondary: false }); - return /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: clsx("text-size-small", styles$E.wrapper), children: rows.map((row2) => { + return /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: clsx("text-size-small", styles$E.wrapper), children: rows.map((row2, idx) => { if (row2.label === "---") { - return /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: styles$E.separator }); + return /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: styles$E.separator }, `$usage-sep-${idx}`); } else { return /* @__PURE__ */ jsxRuntimeExports.jsxs(reactExports.Fragment, { children: [ /* @__PURE__ */ jsxRuntimeExports.jsx( @@ -49770,7 +49701,7 @@ self.onmessage = function (e) { } ), /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: styles$E.col3, children: row2.value ? formatNumber(row2.value) : "" }) - ] }); + ] }, `$usage-row-${idx}`); } }) }); }; @@ -49863,7 +49794,14 @@ self.onmessage = function (e) { return /* @__PURE__ */ jsxRuntimeExports.jsxs(TokenTable, { className: className2, children: [ /* @__PURE__ */ jsxRuntimeExports.jsx(TokenHeader, {}), /* @__PURE__ */ jsxRuntimeExports.jsx("tbody", { children: Object.keys(model_usage).map((key2) => { - return /* @__PURE__ */ jsxRuntimeExports.jsx(TokenRow, { model: key2, usage: model_usage[key2] }); + return /* @__PURE__ */ jsxRuntimeExports.jsx( + TokenRow, + { + model: `${key2}-token-row`, + usage: model_usage[key2] + }, + key2 + ); }) }) ] }); }; @@ -50307,7 +50245,7 @@ self.onmessage = function (e) { }).join(" ")}` }, children: [ - columns.map((col) => { + columns.map((col, idx) => { return /* @__PURE__ */ jsxRuntimeExports.jsx( "div", { @@ -50318,10 +50256,11 @@ self.onmessage = function (e) { col.center ? styles$y.centerLabel : void 0 ), children: col.label - } + }, + `sample-summ-lbl-${idx}` ); }), - columns.map((col) => { + columns.map((col, idx) => { return /* @__PURE__ */ jsxRuntimeExports.jsx( "div", { @@ -50331,7 +50270,8 @@ self.onmessage = function (e) { col.center ? 
styles$y.centerLabel : void 0 ), children: col.value - } + }, + `sample-summ-val-${idx}` ); }) ] @@ -50459,7 +50399,8 @@ self.onmessage = function (e) { title: nav2.title, selectedNav, setSelectedNav - } + }, + nav2.title ); }) } @@ -50611,7 +50552,8 @@ self.onmessage = function (e) { id: id2, className: clsx("tab-pane", "show", isSelected ? "active" : ""), children: child - } + }, + `children-${id2}-${index2}` ); }) } @@ -50676,7 +50618,7 @@ self.onmessage = function (e) { EventPanel, { id, - title: "Info", + title: "Info" + (event.source ? ": " + event.source : ""), className: className2, subTitle: formatDateTime(new Date(event.timestamp)), icon: ApplicationIcons.info, @@ -50918,11 +50860,11 @@ self.onmessage = function (e) { ) }) }); }; const ToolsConfig = ({ tools: tools2 }) => { - const toolEls = tools2.map((tool2) => { + const toolEls = tools2.map((tool2, idx) => { return /* @__PURE__ */ jsxRuntimeExports.jsxs(reactExports.Fragment, { children: [ /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: clsx("text-style-label", "text-style-secondary"), children: tool2.name }), /* @__PURE__ */ jsxRuntimeExports.jsx("div", { children: tool2.description }) - ] }); + ] }, `${tool2.name}-${idx}`); }); return /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: styles$q.toolConfig, children: toolEls }); }; @@ -50950,13 +50892,13 @@ self.onmessage = function (e) { if (event.sample.files && Object.keys(event.sample.files).length > 0) { sections.push( /* @__PURE__ */ jsxRuntimeExports.jsx(EventSection, { title: "Files", children: Object.keys(event.sample.files).map((file) => { - return /* @__PURE__ */ jsxRuntimeExports.jsx("pre", { className: styles$p.noMargin, children: file }); - }) }) + return /* @__PURE__ */ jsxRuntimeExports.jsx("pre", { className: styles$p.noMargin, children: file }, `sample-init-file-${file}`); + }) }, `sample-${id}-init-files`) ); } if (event.sample.setup) { sections.push( - /* @__PURE__ */ jsxRuntimeExports.jsx(EventSection, { title: "Setup", children: /* @__PURE__ */ jsxRuntimeExports.jsx("pre", { className: styles$p.code, children: /* @__PURE__ */ jsxRuntimeExports.jsx("code", { className: "sourceCode", children: event.sample.setup }) }) }) + /* @__PURE__ */ jsxRuntimeExports.jsx(EventSection, { title: "Setup", children: /* @__PURE__ */ jsxRuntimeExports.jsx("pre", { className: styles$p.code, children: /* @__PURE__ */ jsxRuntimeExports.jsx("code", { className: "sourceCode", children: event.sample.setup }) }) }, `sample-${id}-init-setup`) ); } return /* @__PURE__ */ jsxRuntimeExports.jsxs( @@ -50984,11 +50926,11 @@ self.onmessage = function (e) { String.fromCharCode(65 + index2), ") ", choice - ] }); + ] }, `$choice-{choice}`); }) : "", sections.length > 0 ? /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: styles$p.section, children: sections }) : "", /* @__PURE__ */ jsxRuntimeExports.jsx(EventSection, { title: "Target", children: toArray(event.sample.target).map((target2) => { - return /* @__PURE__ */ jsxRuntimeExports.jsx("div", { children: target2 }); + return /* @__PURE__ */ jsxRuntimeExports.jsx("div", { children: target2 }, target2); }) }) ] }) ] }), @@ -51080,7 +51022,7 @@ self.onmessage = function (e) { EventPanel, { id, - title: "Score", + title: (event.intermediate ? 
"Intermediate " : "") + "Score", className: clsx(className2, "text-size-small"), subTitle: formatDateTime(new Date(event.timestamp)), icon: ApplicationIcons.scorer, @@ -57658,7 +57600,8 @@ ${events} { id: "system_msg_event_preview", messages: [message2] - } + }, + "system_msg_event_preview" ); } }; @@ -57740,7 +57683,8 @@ ${events} answer: answer2, runtime, sessionLogs: Object.values(sessions) - } + }, + "human_baseline_view" ); } }; @@ -57791,8 +57735,8 @@ ${events} } ), /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: clsx("text-size-base"), children: toolsInfo[key2] }) - ] }); - }) }); + ] }, key2); + }) }, "state-diff-tools"); }; const RenderableChangeTypes = [ system_msg_added_sig, @@ -57803,11 +57747,18 @@ ${events} human_baseline_session ]; const Tools = ({ toolDefinitions }) => { - return toolDefinitions.map((toolDefinition) => { + return toolDefinitions.map((toolDefinition, idx) => { var _a2; const toolName = toolDefinition.name; const toolArgs = ((_a2 = toolDefinition.parameters) == null ? void 0 : _a2.properties) ? Object.keys(toolDefinition.parameters.properties) : []; - return /* @__PURE__ */ jsxRuntimeExports.jsx(Tool, { toolName, toolArgs }); + return /* @__PURE__ */ jsxRuntimeExports.jsx( + Tool, + { + toolName, + toolArgs + }, + `${toolName}-${idx}` + ); }); }; const Tool = ({ toolName, toolArgs }) => { @@ -57830,36 +57781,20 @@ ${events} }) => { const summary2 = summarizeChanges(event.changes); const [before, after] = synthesizeComparable(event.changes); - const tabs2 = [ - /* @__PURE__ */ jsxRuntimeExports.jsx( - StateDiffView, - { - before, - after, - "data-name": "Diff", - className: clsx(styles$m.diff) - } - ) - ]; const changePreview = generatePreview( event.changes, structuredClone(after), isStore ); - if (changePreview) { - tabs2.unshift( - /* @__PURE__ */ jsxRuntimeExports.jsx("div", { "data-name": "Summary", className: clsx(styles$m.summary), children: changePreview }) - ); - } const title2 = event.event === "state" ? "State Updated" : "Store Updated"; - return /* @__PURE__ */ jsxRuntimeExports.jsx( + return /* @__PURE__ */ jsxRuntimeExports.jsxs( EventPanel, { id, title: title2, className: className2, subTitle: formatDateTime(new Date(event.timestamp)), - text: tabs2.length === 1 ? summary2 : void 0, + text: !changePreview ? summary2 : void 0, collapse: changePreview === void 0 ? true : void 0, selectedNav: eventState.selectedNav || "", setSelectedNav: (selectedNav) => { @@ -57869,7 +57804,18 @@ ${events} setCollapsed: (collapsed) => { setEventState({ ...eventState, collapsed }); }, - children: tabs2 + children: [ + changePreview ? 
/* @__PURE__ */ jsxRuntimeExports.jsx("div", { "data-name": "Summary", className: clsx(styles$m.summary), children: changePreview }) : void 0, + /* @__PURE__ */ jsxRuntimeExports.jsx( + StateDiffView, + { + before, + after, + "data-name": "Diff", + className: clsx(styles$m.diff) + } + ) + ] } ); }; @@ -57914,7 +57860,8 @@ ${events} } } if (matchingOps === requiredMatchCount) { - results.push(changeType.render(changes, resolvedState)); + const el = changeType.render(changes, resolvedState); + results.push(el); break; } } @@ -58286,9 +58233,9 @@ ${events} className: className2 }) => { var _a2, _b2; - const { input: input2, functionCall, inputType } = resolveToolInput( - event.function, - event.arguments + const { input: input2, functionCall, highlightLanguage } = reactExports.useMemo( + () => resolveToolInput(event.function, event.arguments), + [event.function, event.arguments] ); const approvalEvent = event.events.find((e) => { return e.event === "approval"; @@ -58317,7 +58264,7 @@ ${events} { functionCall, input: input2, - inputType, + highlightLanguage, output: ((_b2 = event.error) == null ? void 0 : _b2.message) || event.result, mode: "compact", view: event.view ? event.view : void 0 @@ -58758,7 +58705,8 @@ ${events} onClick: () => { printSample(id, targetId); } - } + }, + "sample-print-tool" ) ); } @@ -58797,7 +58745,8 @@ ${events} }, `${baseId}-transcript-display-${id}` ) - } + }, + kSampleTranscriptTabId ) : null, /* @__PURE__ */ jsxRuntimeExports.jsx( TabPanel, @@ -58819,7 +58768,8 @@ ${events} }, `${baseId}-chat-${id}` ) - } + }, + kSampleMessagesTabId ), scorerNames.length === 1 ? /* @__PURE__ */ jsxRuntimeExports.jsx( TabPanel, @@ -58837,7 +58787,8 @@ ${events} scorer: scorerNames[0] } ) - } + }, + kSampleScoringTabId ) : /* @__PURE__ */ jsxRuntimeExports.jsx(jsxRuntimeExports.Fragment, { children: Object.keys(sample2.scores || {}).map((scorer) => { const tabId = `score-${scorer}`; return /* @__PURE__ */ jsxRuntimeExports.jsx( @@ -58856,7 +58807,8 @@ ${events} scorer } ) - } + }, + tabId ); }) }), sampleMetadatas.length > 0 ? /* @__PURE__ */ jsxRuntimeExports.jsx( @@ -58903,7 +58855,7 @@ ${events} ) ] }); }; - const metadataViewsForSample = (_id, sample2) => { + const metadataViewsForSample = (id, sample2) => { const sampleMetadatas = []; if (sample2.model_usage && Object.keys(sample2.model_usage).length > 0) { sampleMetadatas.push( @@ -58916,7 +58868,7 @@ ${events} className: clsx(styles$A.noTop) } ) }) - ] }) + ] }, `sample-usage-${id}`) ); } if (Object.keys(sample2 == null ? void 0 : sample2.metadata).length > 0) { @@ -58931,7 +58883,7 @@ ${events} className: clsx("tab-pane", styles$A.noTop) } ) }) - ] }) + ] }, `sample-metadata-${id}`) ); } if (Object.keys(sample2 == null ? void 0 : sample2.store).length > 0) { @@ -58946,7 +58898,7 @@ ${events} className: clsx("tab-pane", styles$A.noTop) } ) }) - ] }) + ] }, `sample-store-${id}`) ); } return sampleMetadatas; @@ -59090,40 +59042,6 @@ ${events} }, [setInitialScrollPosition] ); - const headerEls = []; - headerEls.push( - /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: clsx("modal-title", "text-size-smaller", styles$h.title), children: title2 || "" }) - ); - if (detail2) { - headerEls.push( - /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { className: styles$h.detail, children: [ - (detailTools == null ? void 0 : detailTools.left) ? 
detailTools.left.map((tool2) => { - return /* @__PURE__ */ jsxRuntimeExports.jsx(TitleTool, { ...tool2 }); - }) : "", - /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: clsx("text-size-smaller", styles$h.detailText), children: /* @__PURE__ */ jsxRuntimeExports.jsx("div", { children: detail2 }) }), - (detailTools == null ? void 0 : detailTools.right) ? detailTools.right.map((tool2) => { - return /* @__PURE__ */ jsxRuntimeExports.jsx(TitleTool, { ...tool2 }); - }) : "" - ] }) - ); - } - headerEls.push( - /* @__PURE__ */ jsxRuntimeExports.jsx( - "button", - { - type: "button", - className: clsx( - "btn", - "btn-close-large-dialog", - "text-size-larger", - styles$h.close - ), - onClick: onHide, - "aria-label": "Close", - children: /* @__PURE__ */ jsxRuntimeExports.jsx(HtmlEntity, { html: "×" }) - } - ) - ); return /* @__PURE__ */ jsxRuntimeExports.jsx( "div", { @@ -59146,7 +59064,39 @@ ${events} ), role: "document", children: /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { className: clsx("modal-content", styles$h.content), children: [ - /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: clsx("modal-header", styles$h.header), children: headerEls }), + /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { className: clsx("modal-header", styles$h.header), children: [ + /* @__PURE__ */ jsxRuntimeExports.jsx( + "div", + { + className: clsx("modal-title", "text-size-smaller", styles$h.title), + children: title2 || "" + } + ), + detail2 ? /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { className: styles$h.detail, children: [ + (detailTools == null ? void 0 : detailTools.left) ? detailTools.left.map((tool2, idx) => { + return /* @__PURE__ */ jsxRuntimeExports.jsx(TitleTool, { ...tool2 }, `tool-left-${idx}`); + }) : "", + /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: clsx("text-size-smaller", styles$h.detailText), children: /* @__PURE__ */ jsxRuntimeExports.jsx("div", { children: detail2 }) }), + (detailTools == null ? void 0 : detailTools.right) ? detailTools.right.map((tool2, idx) => { + return /* @__PURE__ */ jsxRuntimeExports.jsx(TitleTool, { ...tool2 }, `tool-right-${idx}`); + }) : "" + ] }) : void 0, + /* @__PURE__ */ jsxRuntimeExports.jsx( + "button", + { + type: "button", + className: clsx( + "btn", + "btn-close-large-dialog", + "text-size-larger", + styles$h.close + ), + onClick: onHide, + "aria-label": "Close", + children: /* @__PURE__ */ jsxRuntimeExports.jsx(HtmlEntity, { html: "×" }) + } + ) + ] }), /* @__PURE__ */ jsxRuntimeExports.jsx(ProgressBar, { animating: showProgress }), /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: "modal-body", ref: scrollRef, onScroll, children: children2 }), modalFooter @@ -60050,7 +60000,7 @@ ${events} } ), index2 < steps.length - 1 ? 
separator2 : "" - ] }); + ] }, `solver-step-${index2}`); }); return /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: styles$a.container, children: details }); }; @@ -60164,7 +60114,8 @@ ${events} name: key2, scores: scorers[key2].scores, params: scorers[key2].params - } + }, + key2 ); }); taskColumns.push({ @@ -60191,7 +60142,8 @@ ${events} className: "text-size-small", entries: taskInformation, tableOptions: "sm" - } + }, + `plan-md-task` ) }); if (task_args && Object.keys(task_args).length > 0) { @@ -60204,7 +60156,8 @@ ${events} className: "text-size-small", entries: task_args, tableOptions: "sm" - } + }, + `plan-md-task-args` ) }); } @@ -60218,7 +60171,8 @@ ${events} className: "text-size-small", entries: model_args, tableOptions: "sm" - } + }, + `plan-md-model-args` ) }); } @@ -60232,7 +60186,8 @@ ${events} className: "text-size-small", entries: config2, tableOptions: "sm" - } + }, + `plan-md-config` ) }); } @@ -60249,7 +60204,8 @@ ${events} className: "text-size-small", entries: generate_record, tableOptions: "sm" - } + }, + `plan-md-generate-config` ) }); } @@ -60263,7 +60219,8 @@ ${events} className: "text-size-small", entries: metadata2, tableOptions: "sm" - } + }, + `plan-md-metadata` ) }); } @@ -60276,12 +60233,28 @@ ${events} gridTemplateColumns: `repeat(${taskColumns.length}, auto)` }, children: taskColumns.map((col) => { - return /* @__PURE__ */ jsxRuntimeExports.jsx(PlanColumn, { title: col.title, className: col.className, children: col.contents }); + return /* @__PURE__ */ jsxRuntimeExports.jsx( + PlanColumn, + { + title: col.title, + className: col.className, + children: col.contents + }, + `plan-col-${col.title}` + ); }) } ), /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: clsx(styles$9.row), children: metadataColumns.map((col) => { - return /* @__PURE__ */ jsxRuntimeExports.jsx(PlanColumn, { title: col.title, className: col.className, children: col.contents }); + return /* @__PURE__ */ jsxRuntimeExports.jsx( + PlanColumn, + { + title: col.title, + className: col.className, + children: col.contents + }, + `plan-col-${col.title}` + ); }) }) ] }); }; @@ -60412,23 +60385,6 @@ ${events} reactExports.useEffect(() => { setHidden(false); }, [evalSpec, evalPlan, evalResults, evalStats, samples]); - const infoCards = []; - infoCards.push([ - /* @__PURE__ */ jsxRuntimeExports.jsx( - PlanCard, - { - evalSpec, - evalPlan, - scores: evalResults == null ? void 0 : evalResults.scores - } - ) - ]); - if (evalStatus !== "started") { - infoCards.push(/* @__PURE__ */ jsxRuntimeExports.jsx(UsageCard, { stats: evalStats })); - } - if (evalStatus === "error" && evalError) { - infoCards.unshift(/* @__PURE__ */ jsxRuntimeExports.jsx(TaskErrorCard, { error: evalError })); - } const showWarning = (!samples || samples.length === 0) && evalStatus === "success" && (evalSpec == null ? void 0 : evalSpec.dataset.samples) && evalSpec.dataset.samples > 0; return /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { style: { width: "100%" }, children: [ showWarning ? /* @__PURE__ */ jsxRuntimeExports.jsx( @@ -60440,7 +60396,18 @@ ${events} type: "warning" } ) : "", - /* @__PURE__ */ jsxRuntimeExports.jsx("div", { style: { padding: "0.5em 1em 0 1em", width: "100%" }, children: infoCards }) + /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { style: { padding: "0.5em 1em 0 1em", width: "100%" }, children: [ + /* @__PURE__ */ jsxRuntimeExports.jsx( + PlanCard, + { + evalSpec, + evalPlan, + scores: evalResults == null ? void 0 : evalResults.scores + } + ), + evalStatus !== "started" ? 
/* @__PURE__ */ jsxRuntimeExports.jsx(UsageCard, { stats: evalStats }) : void 0, + evalStatus === "error" && evalError ? /* @__PURE__ */ jsxRuntimeExports.jsx(TaskErrorCard, { error: evalError }) : void 0 + ] }) ] }); }; const navbarContainer = "_navbarContainer_838qu_1"; @@ -60529,25 +60496,31 @@ ${events} taskStatus, secondaryContainer }; - const simpleMetricsRows = "_simpleMetricsRows_13pa9_1"; - const multiMetricsRows = "_multiMetricsRows_13pa9_12"; - const verticalMetricReducer = "_verticalMetricReducer_13pa9_26"; - const verticalMetricName = "_verticalMetricName_13pa9_33"; - const verticalMetricValue = "_verticalMetricValue_13pa9_41"; - const multiScorerReducer = "_multiScorerReducer_13pa9_47"; - const multiScorerLabel = "_multiScorerLabel_13pa9_52"; - const multiScorerValue = "_multiScorerValue_13pa9_58"; - const multiScorerValueContent = "_multiScorerValueContent_13pa9_65"; + const simpleMetricsRows = "_simpleMetricsRows_tnqkm_1"; + const multiMetricsRows = "_multiMetricsRows_tnqkm_12"; + const verticalMetricReducer = "_verticalMetricReducer_tnqkm_26"; + const verticalMetricName = "_verticalMetricName_tnqkm_33"; + const verticalMetricValue = "_verticalMetricValue_tnqkm_41"; + const multiScorer = "_multiScorer_tnqkm_46"; + const multiScorerIndent = "_multiScorerIndent_tnqkm_54"; + const multiScorerReducer = "_multiScorerReducer_tnqkm_58"; + const multiScorerLabel = "_multiScorerLabel_tnqkm_64"; + const multiScorerValue = "_multiScorerValue_tnqkm_70"; + const multiScorerValueContent = "_multiScorerValueContent_tnqkm_79"; + const multiScoreMetricGrid = "_multiScoreMetricGrid_tnqkm_84"; const styles$3 = { simpleMetricsRows, multiMetricsRows, verticalMetricReducer, verticalMetricName, verticalMetricValue, + multiScorer, + multiScorerIndent, multiScorerReducer, multiScorerLabel, multiScorerValue, - multiScorerValueContent + multiScorerValueContent, + multiScoreMetricGrid }; const ResultsPanel = ({ results }) => { var _a2, _b2; @@ -60560,37 +60533,45 @@ ${events} metric: { name: key2, value: score2.metrics[key2].value, - options: {}, + params: score2.metrics[key2].params, metadata: {} } }; }); }); const metrics = Object.values(scorers)[0]; + const showReducer = !!metrics[0].reducer; return /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: styles$3.simpleMetricsRows, children: metrics.map((metric2, i2) => { - return /* @__PURE__ */ jsxRuntimeExports.jsx(VerticalMetric, { metricSummary: metric2, isFirst: i2 === 0 }); + return /* @__PURE__ */ jsxRuntimeExports.jsx( + VerticalMetric, + { + metricSummary: metric2, + isFirst: i2 === 0, + showReducer + }, + `simple-metric-${i2}` + ); }) }); } else { + const showReducer = (results == null ? void 0 : results.scores.findIndex((score2) => !!score2.reducer)) !== -1; return /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: styles$3.multiMetricsRows, children: (_b2 = results == null ? void 0 : results.scores) == null ? void 0 : _b2.map((score2, index2) => { - return /* @__PURE__ */ jsxRuntimeExports.jsx(MultiScorerMetric, { scorer: score2, isFirst: index2 === 0 }); + return /* @__PURE__ */ jsxRuntimeExports.jsx( + MultiScorerMetric, + { + scorer: score2, + isFirst: index2 === 0, + showReducer + }, + `multi-metric-${index2}` + ); }) }); } }; const VerticalMetric = ({ metricSummary, - isFirst + isFirst, + showReducer }) => { - const reducer_component = metricSummary.reducer ? 
/* @__PURE__ */ jsxRuntimeExports.jsx( - "div", - { - className: clsx( - "text-style-label", - "text-style-secondary", - styles$3.verticalMetricReducer - ), - children: metricSummary.reducer - } - ) : ""; return /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { style: { paddingLeft: isFirst ? "0" : "1em" }, children: [ /* @__PURE__ */ jsxRuntimeExports.jsx( "div", @@ -60601,14 +60582,28 @@ ${events} "text-style-secondary", styles$3.verticalMetricName ), - children: metricSummary.metric.name + children: metricDisplayName(metricSummary.metric) } ), - reducer_component, + showReducer ? /* @__PURE__ */ jsxRuntimeExports.jsx( + "div", + { + className: clsx( + "text-style-label", + "text-style-secondary", + styles$3.verticalMetricReducer + ), + children: metricSummary.reducer || "default" + } + ) : void 0, /* @__PURE__ */ jsxRuntimeExports.jsx( "div", { - className: clsx("vertical-metric-value", styles$3.verticalMetricValue), + className: clsx( + "vertical-metric-value", + "text-size-largest", + styles$3.verticalMetricValue + ), children: formatPrettyDecimal(metricSummary.metric.value) } ) @@ -60616,46 +60611,55 @@ ${events} }; const MultiScorerMetric = ({ scorer, - isFirst + isFirst, + showReducer }) => { const titleFontClz = "text-size-base"; const reducerFontClz = "text-size-smaller"; const valueFontClz = "text-size-base"; - const reducer_component = scorer.reducer ? /* @__PURE__ */ jsxRuntimeExports.jsx( + return /* @__PURE__ */ jsxRuntimeExports.jsxs( "div", { className: clsx( - reducerFontClz, - "text-style-label", - "text-style-secondary", - styles$3.multiScorerReducer + styles$3.multiScorer, + isFirst ? styles$3.multiScorerIndent : void 0 ), - children: scorer.reducer - } - ) : ""; - return /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { style: { paddingLeft: isFirst ? "0" : "1.5em" }, children: [ - /* @__PURE__ */ jsxRuntimeExports.jsx( - "div", - { - className: clsx( - titleFontClz, - "text-style-label", - "text-style-secondary", - "multi-score-label", - styles$3.multiScorerLabel + children: [ + /* @__PURE__ */ jsxRuntimeExports.jsx( + "div", + { + className: clsx( + titleFontClz, + "text-style-label", + "text-style-secondary", + "multi-score-label", + styles$3.multiScorerLabel + ), + children: scorer.name + } ), - children: scorer.name - } - ), - reducer_component, - /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: clsx(valueFontClz, styles$3.multiScorerValue), children: Object.keys(scorer.metrics).map((key2) => { - const metric2 = scorer.metrics[key2]; - return /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { children: [ - /* @__PURE__ */ jsxRuntimeExports.jsx("div", { children: metric2.name }), - /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: styles$3.multiScorerValueContent, children: formatPrettyDecimal(metric2.value) }) - ] }); - }) }) - ] }); + showReducer ? 
/* @__PURE__ */ jsxRuntimeExports.jsx( + "div", + { + className: clsx( + reducerFontClz, + "text-style-label", + "text-style-secondary", + styles$3.multiScorerReducer + ), + children: scorer.reducer || "default" + } + ) : void 0, + /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: clsx(valueFontClz, styles$3.multiScorerValue), children: Object.keys(scorer.metrics).map((key2) => { + const metric2 = scorer.metrics[key2]; + return /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { className: styles$3.multiScoreMetricGrid, children: [ + /* @__PURE__ */ jsxRuntimeExports.jsx("div", { children: metricDisplayName(metric2) }), + /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: styles$3.multiScorerValueContent, children: formatPrettyDecimal(metric2.value) }) + ] }, key2); + }) }) + ] + } + ); }; const statusPanel = "_statusPanel_1fzh4_1"; const statusIcon = "_statusIcon_1fzh4_10"; @@ -60725,16 +60729,6 @@ ${events} evalSpec, setOffcanvas }) => { - let statusPanel2; - if (status === "success") { - statusPanel2 = /* @__PURE__ */ jsxRuntimeExports.jsx(ResultsPanel, { results: evalResults }); - } else if (status === "cancelled") { - statusPanel2 = /* @__PURE__ */ jsxRuntimeExports.jsx(CancelledPanel, { sampleCount: (samples == null ? void 0 : samples.length) || 0 }); - } else if (status === "started") { - statusPanel2 = /* @__PURE__ */ jsxRuntimeExports.jsx(RunningPanel, { sampleCount: (samples == null ? void 0 : samples.length) || 0 }); - } else if (status === "error") { - statusPanel2 = /* @__PURE__ */ jsxRuntimeExports.jsx(ErroredPanel, { sampleCount: (samples == null ? void 0 : samples.length) || 0 }); - } const logFileName = file ? filename(file) : ""; const handleToggle = reactExports.useCallback(() => { setOffcanvas(!offcanvas); @@ -60798,7 +60792,12 @@ ${events} ] } ), - /* @__PURE__ */ jsxRuntimeExports.jsx("div", { className: clsx(styles$4.taskStatus, "navbar-text"), children: statusPanel2 }), + /* @__PURE__ */ jsxRuntimeExports.jsxs("div", { className: clsx(styles$4.taskStatus, "navbar-text"), children: [ + status === "success" ? /* @__PURE__ */ jsxRuntimeExports.jsx(ResultsPanel, { results: evalResults }) : void 0, + status === "cancelled" ? /* @__PURE__ */ jsxRuntimeExports.jsx(CancelledPanel, { sampleCount: (samples == null ? void 0 : samples.length) || 0 }) : void 0, + status === "started" ? /* @__PURE__ */ jsxRuntimeExports.jsx(RunningPanel, { sampleCount: (samples == null ? void 0 : samples.length) || 0 }) : void 0, + status === "error" ? /* @__PURE__ */ jsxRuntimeExports.jsx(ErroredPanel, { sampleCount: (samples == null ? void 0 : samples.length) || 0 }) : void 0 + ] }), /* @__PURE__ */ jsxRuntimeExports.jsx("div", { id: "task-created", style: { display: "none" }, children: evalSpec == null ? void 0 : evalSpec.created }) ] }); }; @@ -60882,7 +60881,8 @@ ${events} epochs } ) - } + }, + "sb-dataset" ) }); const label2 = (evalResults == null ? void 0 : evalResults.scores) && evalResults.scores.length > 1 ? 
"Scorers" : "Scorer"; @@ -60898,7 +60898,8 @@ ${events} "text-size-small" ), children: /* @__PURE__ */ jsxRuntimeExports.jsx(ScorerSummary, { evalDescriptor }) - } + }, + "sb-scorer" ) }); if (hasConfig) { @@ -60910,7 +60911,8 @@ ${events} label: "Config", className: clsx(styles$1.justifyRight, "text-size-small"), children: /* @__PURE__ */ jsxRuntimeExports.jsx(ParamSummary, { params: hyperparameters }) - } + }, + "sb-params" ) }); } @@ -60927,7 +60929,8 @@ ${events} label: "Duration", className: clsx(styles$1.justifyRight, "text-size-small"), children: totalDuration - } + }, + "sb-duration" ) }); } @@ -61085,31 +61088,6 @@ ${events} }, [setSelectedTab] ); - const tabPanels2 = reactExports.useMemo(() => { - return Object.keys(tabs2).map((key2) => { - var _a2; - const tab2 = tabs2[key2]; - return /* @__PURE__ */ jsxRuntimeExports.jsx( - TabPanel, - { - id: tab2.id, - title: tab2.label, - onSelected, - selected: selectedTab === tab2.id, - scrollable: !!tab2.scrollable, - scrollRef: tab2.scrollRef, - scrollPosition: (_a2 = workspaceTabScrollPositionRef.current) == null ? void 0 : _a2[tab2.id], - setScrollPosition: reactExports.useCallback( - (position) => { - onScroll(tab2.id, position); - }, - [onScroll] - ), - children: tab2.content() - } - ); - }); - }, [tabs2, selectedTab]); if (evalSpec === void 0) { return /* @__PURE__ */ jsxRuntimeExports.jsx(EmptyPanel, {}); } else { @@ -61152,7 +61130,30 @@ ${events} className: clsx(styles.tabSet, "text-size-smaller"), tabControlsClassName: clsx(styles.tabs, "text-size-smaller"), tabPanelsClassName: clsx(styles.tabPanels), - children: tabPanels2 + children: Object.keys(tabs2).map((key2) => { + var _a2; + const tab2 = tabs2[key2]; + return /* @__PURE__ */ jsxRuntimeExports.jsx( + TabPanel, + { + id: tab2.id, + title: tab2.label, + onSelected, + selected: selectedTab === tab2.id, + scrollable: !!tab2.scrollable, + scrollRef: tab2.scrollRef, + scrollPosition: (_a2 = workspaceTabScrollPositionRef.current) == null ? 
void 0 : _a2[tab2.id], + setScrollPosition: reactExports.useCallback( + (position) => { + onScroll(tab2.id, position); + }, + [onScroll] + ), + children: tab2.content() + }, + tab2.id + ); + }) } ) }) }) ] }); diff --git a/src/inspect_ai/_view/www/log-schema.json b/src/inspect_ai/_view/www/log-schema.json index a3815b2af..90a5a2fc2 100644 --- a/src/inspect_ai/_view/www/log-schema.json +++ b/src/inspect_ai/_view/www/log-schema.json @@ -157,6 +157,7 @@ "type": "object" }, "ChatCompletionChoice": { + "description": "Choice generated for completion.", "properties": { "message": { "$ref": "#/$defs/ChatMessageAssistant" @@ -196,7 +197,14 @@ "additionalProperties": false }, "ChatMessageAssistant": { + "description": "Assistant chat message.", "properties": { + "role": { + "const": "assistant", + "default": "assistant", + "title": "Role", + "type": "string" + }, "content": { "anyOf": [ { @@ -240,12 +248,6 @@ "default": null, "title": "Source" }, - "role": { - "const": "assistant", - "default": "assistant", - "title": "Role", - "type": "string" - }, "tool_calls": { "anyOf": [ { @@ -275,9 +277,9 @@ } }, "required": [ + "role", "content", "source", - "role", "tool_calls", "reasoning" ], @@ -286,7 +288,14 @@ "additionalProperties": false }, "ChatMessageSystem": { + "description": "System chat message.", "properties": { + "role": { + "const": "system", + "default": "system", + "title": "Role", + "type": "string" + }, "content": { "anyOf": [ { @@ -329,25 +338,26 @@ ], "default": null, "title": "Source" - }, - "role": { - "const": "system", - "default": "system", - "title": "Role", - "type": "string" } }, "required": [ + "role", "content", - "source", - "role" + "source" ], "title": "ChatMessageSystem", "type": "object", "additionalProperties": false }, "ChatMessageTool": { + "description": "Tool chat message.", "properties": { + "role": { + "const": "tool", + "default": "tool", + "title": "Role", + "type": "string" + }, "content": { "anyOf": [ { @@ -391,12 +401,6 @@ "default": null, "title": "Source" }, - "role": { - "const": "tool", - "default": "tool", - "title": "Role", - "type": "string" - }, "tool_call_id": { "anyOf": [ { @@ -434,9 +438,9 @@ } }, "required": [ + "role", "content", "source", - "role", "tool_call_id", "function", "error" @@ -446,7 +450,14 @@ "additionalProperties": false }, "ChatMessageUser": { + "description": "User chat message.", "properties": { + "role": { + "const": "user", + "default": "user", + "title": "Role", + "type": "string" + }, "content": { "anyOf": [ { @@ -490,12 +501,6 @@ "default": null, "title": "Source" }, - "role": { - "const": "user", - "default": "user", - "title": "Role", - "type": "string" - }, "tool_call_id": { "anyOf": [ { @@ -513,9 +518,9 @@ } }, "required": [ + "role", "content", "source", - "role", "tool_call_id" ], "title": "ChatMessageUser", @@ -523,6 +528,7 @@ "additionalProperties": false }, "ContentAudio": { + "description": "Audio content.", "properties": { "type": { "const": "audio", @@ -553,6 +559,7 @@ "additionalProperties": false }, "ContentImage": { + "description": "Image content.", "properties": { "type": { "const": "image", @@ -585,6 +592,7 @@ "additionalProperties": false }, "ContentText": { + "description": "Text content.", "properties": { "type": { "const": "text", @@ -606,6 +614,7 @@ "additionalProperties": false }, "ContentVideo": { + "description": "Video content.", "properties": { "type": { "const": "video", @@ -677,6 +686,7 @@ "additionalProperties": false }, "EvalConfig": { + "description": "Configuration used for evaluation.", 
"properties": { "limit": { "anyOf": [ @@ -954,6 +964,7 @@ "additionalProperties": false }, "EvalDataset": { + "description": "Dataset used for evaluation.", "properties": { "name": { "anyOf": [ @@ -1038,6 +1049,7 @@ "additionalProperties": false }, "EvalError": { + "description": "Eval error details.", "properties": { "message": { "title": "Message", @@ -1062,6 +1074,7 @@ "additionalProperties": false }, "EvalMetric": { + "description": "Metric for evaluation score.", "properties": { "name": { "title": "Name", @@ -1078,8 +1091,8 @@ ], "title": "Value" }, - "options": { - "title": "Options", + "params": { + "title": "Params", "type": "object" }, "metadata": { @@ -1098,7 +1111,7 @@ "required": [ "name", "value", - "options", + "params", "metadata" ], "title": "EvalMetric", @@ -1106,6 +1119,7 @@ "additionalProperties": false }, "EvalPlan": { + "description": "Plan (solvers) used in evaluation.", "properties": { "name": { "default": "plan", @@ -1171,6 +1185,7 @@ "additionalProperties": false }, "EvalPlanStep": { + "description": "Solver step.", "properties": { "solver": { "title": "Solver", @@ -1190,6 +1205,7 @@ "additionalProperties": false }, "EvalResults": { + "description": "Scoring results from evaluation.", "properties": { "total_samples": { "default": 0, @@ -1233,6 +1249,7 @@ "additionalProperties": false }, "EvalRevision": { + "description": "Git revision for evaluation.", "properties": { "type": { "const": "git", @@ -1258,6 +1275,7 @@ "additionalProperties": false }, "EvalSample": { + "description": "Sample from evaluation task.", "properties": { "id": { "anyOf": [ @@ -1526,6 +1544,7 @@ "additionalProperties": false }, "EvalSampleLimit": { + "description": "Limit encontered by sample.", "properties": { "type": { "enum": [ @@ -1553,6 +1572,7 @@ "additionalProperties": false }, "EvalSampleReductions": { + "description": "Score reductions.", "properties": { "scorer": { "title": "Scorer", @@ -1588,6 +1608,7 @@ "additionalProperties": false }, "EvalSampleScore": { + "description": "Score and sample_id scored.", "properties": { "value": { "anyOf": [ @@ -1711,6 +1732,7 @@ "additionalProperties": false }, "EvalScore": { + "description": "Score for evaluation task.", "properties": { "name": { "title": "Name", @@ -1769,6 +1791,7 @@ "additionalProperties": false }, "EvalSpec": { + "description": "Eval target and configuration.", "properties": { "run_id": { "title": "Run Id", @@ -1945,6 +1968,7 @@ "additionalProperties": false }, "EvalStats": { + "description": "Timing and usage statistics.", "properties": { "started_at": { "title": "Started At", @@ -1972,7 +1996,7 @@ "additionalProperties": false }, "GenerateConfig": { - "description": "Base class for model generation configs.", + "description": "Model generation options.", "properties": { "max_retries": { "anyOf": [ @@ -2321,6 +2345,18 @@ "title": "Event", "type": "string" }, + "source": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Source" + }, "data": { "$ref": "#/$defs/JsonValue" } @@ -2329,6 +2365,7 @@ "timestamp", "pending", "event", + "source", "data" ], "title": "InfoEvent", @@ -2474,6 +2511,7 @@ "additionalProperties": false }, "LoggingMessage": { + "description": "Message written to Python log.", "properties": { "name": { "anyOf": [ @@ -2772,6 +2810,7 @@ "additionalProperties": false }, "ModelOutput": { + "description": "Output from model generation.", "properties": { "model": { "title": "Model", @@ -2846,6 +2885,7 @@ "additionalProperties": false }, "ModelUsage": { + 
"description": "Token usage for completion.", "properties": { "input_tokens": { "default": 0, @@ -2899,6 +2939,7 @@ "additionalProperties": false }, "Sample": { + "description": "Sample for an evaluation task.", "properties": { "input": { "anyOf": [ @@ -3173,7 +3214,7 @@ "type": "array" }, "Score": { - "description": "Score generated by a scorer.\n\nArgs:\n value (Value): Score value.\n answer (str | None): Answer extracted from model output (optional).\n explanation (str | None): Explanation of score (optional).\n metadata (dict[str,Any]): Additional metadata related to the score.", + "description": "Score generated by a scorer.", "properties": { "value": { "anyOf": [ @@ -3281,7 +3322,7 @@ "additionalProperties": false }, "ScoreEvent": { - "description": "Event with sample score.", + "description": "Event with score.\n\nCan be the final score for a `Sample`, or can be an intermediate score\nresulting from a call to `score`.", "properties": { "timestamp": { "format": "date-time", @@ -3326,6 +3367,11 @@ ], "default": null, "title": "Target" + }, + "intermediate": { + "default": false, + "title": "Intermediate", + "type": "boolean" } }, "required": [ @@ -3333,7 +3379,8 @@ "pending", "event", "score", - "target" + "target", + "intermediate" ], "title": "ScoreEvent", "type": "object", @@ -4223,6 +4270,7 @@ "additionalProperties": false } }, + "description": "Evaluation log.", "properties": { "version": { "default": 2, @@ -4244,37 +4292,7 @@ "$ref": "#/$defs/EvalSpec" }, "plan": { - "$ref": "#/$defs/EvalPlan", - "default": { - "name": "plan", - "steps": [], - "finish": null, - "config": { - "best_of": null, - "cache_prompt": null, - "frequency_penalty": null, - "internal_tools": null, - "logit_bias": null, - "logprobs": null, - "max_connections": null, - "max_retries": null, - "max_tokens": null, - "max_tool_output": null, - "num_choices": null, - "parallel_tool_calls": null, - "presence_penalty": null, - "reasoning_effort": null, - "reasoning_history": null, - "seed": null, - "stop_seqs": null, - "system_message": null, - "temperature": null, - "timeout": null, - "top_k": null, - "top_logprobs": null, - "top_p": null - } - } + "$ref": "#/$defs/EvalPlan" }, "results": { "anyOf": [ @@ -4288,12 +4306,7 @@ "default": null }, "stats": { - "$ref": "#/$defs/EvalStats", - "default": { - "started_at": "", - "completed_at": "", - "model_usage": {} - } + "$ref": "#/$defs/EvalStats" }, "error": { "anyOf": [ diff --git a/src/inspect_ai/_view/www/package.json b/src/inspect_ai/_view/www/package.json index 2bb6c0330..d40592500 100644 --- a/src/inspect_ai/_view/www/package.json +++ b/src/inspect_ai/_view/www/package.json @@ -8,7 +8,7 @@ "scripts": { "build": "vite build", "watch": "vite build --watch", - "dev-watch": "vite build --mode development --watch", + "dev-watch": "NODE_ENV=development vite build --mode development --watch", "dev": "vite", "prettier:check": "prettier --check src", "prettier:write": "prettier --write src", diff --git a/src/inspect_ai/_view/www/src/App.tsx b/src/inspect_ai/_view/www/src/App.tsx index a908c1ef8..f2e2a2595 100644 --- a/src/inspect_ai/_view/www/src/App.tsx +++ b/src/inspect_ai/_view/www/src/App.tsx @@ -990,6 +990,7 @@ const defaultScorers = (log: EvalSummary): Array => { }, [] as Array); } else if (log.sampleSummaries && log.sampleSummaries.length > 0) { const scores = log.sampleSummaries[0].scores; + if (scores !== null) { return Object.keys(scores).map((key) => { return { diff --git a/src/inspect_ai/_view/www/src/components/AnsiDisplay.tsx 
b/src/inspect_ai/_view/www/src/components/AnsiDisplay.tsx index 5b189f7f4..39c2ff0b6 100644 --- a/src/inspect_ai/_view/www/src/components/AnsiDisplay.tsx +++ b/src/inspect_ai/_view/www/src/components/AnsiDisplay.tsx @@ -1,6 +1,6 @@ import { ANSIColor, ANSIOutput, ANSIOutputRun, ANSIStyle } from "ansi-output"; import clsx from "clsx"; -import "./ANSIDisplay.css"; +import "./AnsiDisplay.css"; interface ANSIDisplayProps { output: string; diff --git a/src/inspect_ai/_view/www/src/components/JsonPanel.tsx b/src/inspect_ai/_view/www/src/components/JsonPanel.tsx index 3b57c8699..09633f19b 100644 --- a/src/inspect_ai/_view/www/src/components/JsonPanel.tsx +++ b/src/inspect_ai/_view/www/src/components/JsonPanel.tsx @@ -1,7 +1,7 @@ import clsx from "clsx"; import { highlightElement } from "prismjs"; import React, { useEffect, useMemo, useRef } from "react"; -import "./JSONPanel.css"; +import "./JsonPanel.css"; const kPrismRenderMaxSize = 250000; diff --git a/src/inspect_ai/_view/www/src/components/LargeModal.tsx b/src/inspect_ai/_view/www/src/components/LargeModal.tsx index d7ce00305..1c38862c9 100644 --- a/src/inspect_ai/_view/www/src/components/LargeModal.tsx +++ b/src/inspect_ai/_view/www/src/components/LargeModal.tsx @@ -78,54 +78,6 @@ export const LargeModal: React.FC = ({ [setInitialScrollPosition], ); - // Capture header elements - const headerEls = []; - // The title - headerEls.push( -
- {title || ""} -
, - ); - - // A centered text element with tools to the left and right - if (detail) { - headerEls.push( -
- {detailTools?.left - ? detailTools.left.map((tool) => { - return ; - }) - : ""} -
-
{detail}
-
- - {detailTools?.right - ? detailTools.right.map((tool) => { - return ; - }) - : ""} -
, - ); - } - - // The close 'x' - headerEls.push( - , - ); - return (
= ({ role="document" >
-
{headerEls}
+
+
+ {title || ""} +
+ + {detail ? ( +
+ {detailTools?.left + ? detailTools.left.map((tool, idx) => { + return ; + }) + : ""} +
+
{detail}
+
+ + {detailTools?.right + ? detailTools.right.map((tool, idx) => { + return ; + }) + : ""} +
+ ) : undefined} + +
{children} diff --git a/src/inspect_ai/_view/www/src/components/NavPills.tsx b/src/inspect_ai/_view/www/src/components/NavPills.tsx index 7bd501552..c545ab7cf 100644 --- a/src/inspect_ai/_view/www/src/components/NavPills.tsx +++ b/src/inspect_ai/_view/www/src/components/NavPills.tsx @@ -26,6 +26,7 @@ export const NavPills: React.FC = ({ children }) => { : `Tab ${idx}`; return ( = ({ children }) => { }); // Wrap each of the children in a 'body' to control its visibility - const navBodies = children.map((child) => { + const navBodies = children.map((child, idx) => { return (
= ({ tools, children, }) => { - const validTabs: ReactElement[] = Array.isArray(children) - ? (children.filter(Boolean) as ReactElement[]) - : [children]; - + const validTabs = flattenChildren(children); if (validTabs.length === 0) return null; return ( @@ -198,3 +197,19 @@ const TabTools: React.FC<{ tools?: React.ReactNode }> = ({ tools }) => ( // Utility functions const computeTabId = (id: string, index: number) => `${id}-${index}`; const computeTabContentsId = (id: string) => `${id}-contents`; + +const flattenChildren = ( + children: React.ReactNode, +): ReactElement[] => { + return Children.toArray(children).flatMap((child) => { + if (isValidElement(child)) { + const element = child as React.ReactElement; + + if (element.type === Fragment) { + return flattenChildren(element.props.children); + } + return element; + } + return []; + }); +}; diff --git a/src/inspect_ai/_view/www/src/logfile/remoteLogFile.ts b/src/inspect_ai/_view/www/src/logfile/remoteLogFile.ts index 047f98afb..30d80d09a 100644 --- a/src/inspect_ai/_view/www/src/logfile/remoteLogFile.ts +++ b/src/inspect_ai/_view/www/src/logfile/remoteLogFile.ts @@ -101,7 +101,6 @@ export const openRemoteLogFile = async ( if (remoteZipFile.centralDirectory.has(sampleFile)) { return (await readJSONFile(sampleFile, MAX_BYTES)) as EvalSample; } else { - console.log({ dir: remoteZipFile.centralDirectory }); throw new Error( `Unable to read sample file ${sampleFile} - it is not present in the manifest.`, ); diff --git a/src/inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx b/src/inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx index b6fc6811b..4c2c18f66 100644 --- a/src/inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx +++ b/src/inspect_ai/_view/www/src/metadata/MetaDataGrid.tsx @@ -26,7 +26,7 @@ export const MetaDataGrid: React.FC = ({ const entryEls = entryRecords(entries).map((entry, index) => { const id = `${baseId}-value-${index}`; return ( - +
= ({ const entryEls = (coercedEntries || []).map((entry, index) => { const id = `${baseId}-value-${index}`; return ( - + = ({ const scorerPanels = Object.keys(scorers).map((key) => { return ( } @@ -159,6 +160,7 @@ export const PlanDetailView: React.FC = ({ className: cols === 1 ? styles.oneCol : styles.twoCol, contents: ( = ({ className: cols === 1 ? styles.oneCol : styles.twoCol, contents: ( } tableOptions="sm" @@ -185,6 +188,7 @@ export const PlanDetailView: React.FC = ({ className: cols === 1 ? styles.oneCol : styles.twoCol, contents: ( } tableOptions="sm" @@ -199,6 +203,7 @@ export const PlanDetailView: React.FC = ({ className: cols === 1 ? styles.oneCol : styles.twoCol, contents: ( = ({ className: cols === 1 ? styles.oneCol : styles.twoCol, contents: ( = ({ className: cols === 1 ? styles.oneCol : styles.twoCol, contents: ( = ({ > {taskColumns.map((col) => { return ( - + {col.contents} ); @@ -259,7 +270,11 @@ export const PlanDetailView: React.FC = ({
{metadataColumns.map((col) => { return ( - + {col.contents} ); diff --git a/src/inspect_ai/_view/www/src/plan/SolverDetailView.tsx b/src/inspect_ai/_view/www/src/plan/SolverDetailView.tsx index 1cc8622ff..48d86d256 100644 --- a/src/inspect_ai/_view/www/src/plan/SolverDetailView.tsx +++ b/src/inspect_ai/_view/www/src/plan/SolverDetailView.tsx @@ -18,7 +18,7 @@ export const SolversDetailView: React.FC = ({ steps }) => { const details = steps?.map((step, index) => { return ( - + = ({ if (!isVscode()) { tools.push( { @@ -101,6 +102,7 @@ export const SampleDisplay: React.FC = ({ > {sample.events && sample.events.length > 0 ? ( = ({ ) : null} = ({ {scorerNames.length === 1 ? ( = ({ const tabId = `score-${scorer}`; return ( = ({ ); }; -const metadataViewsForSample = (_id: string, sample: EvalSample) => { +const metadataViewsForSample = (id: string, sample: EvalSample) => { const sampleMetadatas = []; if (sample.model_usage && Object.keys(sample.model_usage).length > 0) { sampleMetadatas.push( - + { if (Object.keys(sample?.metadata).length > 0) { sampleMetadatas.push( - + { if (Object.keys(sample?.store).length > 0) { sampleMetadatas.push( - + = ({ .join(" ")}`, }} > - {columns.map((col) => { + {columns.map((col, idx) => { return (
= ({
); })} - {columns.map((col) => { + {columns.map((col, idx) => { return (
= ({ scores, sampleDescriptor, }) => { - const tools = []; - - tools.push( - , + return ( + + + {scores.length > 1 ? ( + + ) : undefined} + {epochs > 1 ? ( + + ) : undefined} + + ); - - if (scores.length > 1) { - tools.push( - , - ); - } - - if (epochs > 1) { - tools.push( - , - ); - } - - tools.push(); - - return tools; }; diff --git a/src/inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx b/src/inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx index ac343eb48..6229a517a 100644 --- a/src/inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx +++ b/src/inspect_ai/_view/www/src/samples/chat/ChatMessage.tsx @@ -42,7 +42,7 @@ export const ChatMessage: React.FC = ({ {message.role}
{message.role === "assistant" && message.reasoning ? ( - +
Reasoning
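
The `TabSet` change earlier in this diff replaces the ad-hoc `Array.isArray(children)` check with a `flattenChildren` helper, so tabs wrapped in a React `<Fragment>` are still discovered as valid tabs. A self-contained sketch of that helper (types simplified, no viewer-specific imports) looks roughly like this:

```typescript
import {
  Children,
  Fragment,
  isValidElement,
  ReactElement,
  ReactNode,
} from "react";

// Recursively unwrap <Fragment> wrappers so children passed as <>…</>
// are treated the same as a plain array of elements.
export const flattenChildren = (children: ReactNode): ReactElement[] =>
  Children.toArray(children).flatMap((child) => {
    if (isValidElement(child)) {
      if (child.type === Fragment) {
        return flattenChildren(
          (child.props as { children?: ReactNode }).children,
        );
      }
      return child;
    }
    // Strings, numbers, null, etc. are not tab elements; drop them.
    return [];
  });
```

Recursing into fragments keeps conditionally rendered groups of tabs (emitted as `<>…</>`) from collapsing to zero valid tabs.
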
diff --git a/src/inspect_ai/_view/www/src/samples/chat/ChatView.tsx b/src/inspect_ai/_view/www/src/samples/chat/ChatView.tsx index 9f0e4a790..9fc2f9207 100644 --- a/src/inspect_ai/_view/www/src/samples/chat/ChatView.tsx +++ b/src/inspect_ai/_view/www/src/samples/chat/ChatView.tsx @@ -32,6 +32,7 @@ export const ChatView: React.FC = ({ collapsedMessages.length > 1 && numbered ? index + 1 : undefined; return ( = ({ contents }) => { return contents.map((content, index) => { if (typeof content === "string") { return messageRenderers["text"].render( + `text-content-${index}`, { type: "text", text: content, @@ -52,7 +53,11 @@ export const MessageContent: React.FC = ({ contents }) => { if (content) { const renderer = messageRenderers[content.type]; if (renderer) { - return renderer.render(content, index === contents.length - 1); + return renderer.render( + `text-${content.type}-${index}`, + content, + index === contents.length - 1, + ); } else { console.error(`Unknown message content type '${content.type}'`); } @@ -65,20 +70,29 @@ export const MessageContent: React.FC = ({ contents }) => { type: "text", text: contents, }; - return messageRenderers["text"].render(contentText, true); + return messageRenderers["text"].render( + "text-message-content", + contentText, + true, + ); } }; interface MessageRenderer { - render: (content: ContentType, isLast: boolean) => React.ReactNode; + render: ( + key: string, + content: ContentType, + isLast: boolean, + ) => React.ReactNode; } const messageRenderers: Record = { text: { - render: (content, isLast) => { + render: (key, content, isLast) => { const c = content as ContentText; return ( @@ -86,39 +100,39 @@ const messageRenderers: Record = { }, }, image: { - render: (content) => { + render: (key, content) => { const c = content as ContentImage; if (c.image.startsWith("data:")) { - return ; + return ; } else { - return {c.image}; + return {c.image}; } }, }, audio: { - render: (content) => { + render: (key, content) => { const c = content as ContentAudio; return ( -
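
The `MessageContent` refactor above changes every renderer so it receives the React `key` for its root element as the first argument of `render(key, content, isLast)`. A minimal sketch of that registry shape, with a simplified content model assumed purely for illustration (only a text renderer, not the viewer's full `ContentText`/`ContentImage`/`ContentAudio`/`ContentVideo` union):

```typescript
import React from "react";

// Simplified content model assumed for this sketch only.
interface ContentText {
  type: "text";
  text: string;
}
type Content = ContentText;

// Each renderer receives the React key to apply to its root element,
// mirroring the render(key, content, isLast) signature used above.
interface MessageRenderer {
  render: (key: string, content: Content, isLast: boolean) => React.ReactNode;
}

const messageRenderers: Record<string, MessageRenderer> = {
  text: {
    render: (key, content, isLast) => (
      <div key={key} style={{ marginBottom: isLast ? 0 : "0.5em" }}>
        {content.text}
      </div>
    ),
  },
};

export const renderContents = (contents: Content[]): React.ReactNode[] =>
  contents.map((content, index) =>
    messageRenderers[content.type].render(
      `content-${content.type}-${index}`,
      content,
      index === contents.length - 1,
    ),
  );
```
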
diff --git a/src/inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx b/src/inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx index f64c31d00..2a806aea4 100644 --- a/src/inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx +++ b/src/inspect_ai/_view/www/src/samples/transcript/InfoEventView.tsx @@ -35,7 +35,7 @@ export const InfoEventView: React.FC = ({ return ( = ({ tools }) => { - const toolEls = tools.map((tool) => { + const toolEls = tools.map((tool, idx) => { return ( - +
{tool.name}
diff --git a/src/inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx b/src/inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx index 09ecdc7e7..f6b5b6df8 100644 --- a/src/inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx +++ b/src/inspect_ai/_view/www/src/samples/transcript/SampleInitEventView.tsx @@ -35,9 +35,13 @@ export const SampleInitEventView: React.FC = ({ if (event.sample.files && Object.keys(event.sample.files).length > 0) { sections.push( - + {Object.keys(event.sample.files).map((file) => { - return
{file}
; + return ( +
+              {file}
+            
+ ); })}
, ); @@ -45,7 +49,7 @@ export const SampleInitEventView: React.FC = ({ if (event.sample.setup) { sections.push( - +
           {event.sample.setup}
         
@@ -75,7 +79,7 @@ export const SampleInitEventView: React.FC = ({ {event.sample.choices ? event.sample.choices.map((choice, index) => { return ( -
+
{String.fromCharCode(65 + index)}) {choice}
); @@ -88,7 +92,7 @@ export const SampleInitEventView: React.FC = ({ )} {toArray(event.sample.target).map((target) => { - return
{target}
; + return
{target}
; })}
diff --git a/src/inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx b/src/inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx index 3b3b326f4..13bd73d65 100644 --- a/src/inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx +++ b/src/inspect_ai/_view/www/src/samples/transcript/ScoreEventView.tsx @@ -37,7 +37,7 @@ export const ScoreEventView: React.FC = ({ return ( = ({ className, }) => { // Extract tool input - const { input, functionCall, inputType } = resolveToolInput( - event.function, - event.arguments, + const { input, functionCall, highlightLanguage } = useMemo( + () => resolveToolInput(event.function, event.arguments), + [event.function, event.arguments], ); // Find an approval if there is one @@ -62,7 +63,7 @@ export const ToolEventView: React.FC = ({ = ({ {navs.map((nav) => { return ( = ({ return (
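
The `ToolEventView` change above wraps `resolveToolInput` in `useMemo`, keyed on the event's function and arguments, so the derived input is only recomputed when those values change. The sketch below illustrates the pattern with a stand-in resolver (the resolver body and the `useToolInput` hook name are assumptions for the example, not the viewer's actual implementation):

```typescript
import { useMemo } from "react";

// Assumed, simplified stand-in for a tool-input resolver.
interface ResolvedToolInput {
  functionCall: string;
  input: string;
}

const resolveToolInput = (
  fn: string,
  args: Record<string, unknown>,
): ResolvedToolInput => ({
  functionCall: `${fn}(${Object.keys(args).join(", ")})`,
  input: JSON.stringify(args, undefined, 2),
});

// Wrapping the derivation in useMemo means the (potentially large) tool
// input is only re-resolved when the function or arguments change,
// rather than on every render of the event view.
export const useToolInput = (fn: string, args: Record<string, unknown>) =>
  useMemo(() => resolveToolInput(fn, args), [fn, args]);
```
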
diff --git a/src/inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx b/src/inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx index 9d255c6ba..4112096bc 100644 --- a/src/inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx +++ b/src/inspect_ai/_view/www/src/samples/transcript/state/StateEventRenderers.tsx @@ -1,5 +1,5 @@ import clsx from "clsx"; -import { Fragment, ReactNode } from "react"; +import { Fragment, JSX, ReactNode } from "react"; import { HumanBaselineView, SessionLog, @@ -18,7 +18,10 @@ interface Signature { interface ChangeType { type: string; signature: Signature; - render: (changes: JsonChange[], state: Record) => ReactNode; + render: ( + changes: JsonChange[], + state: Record, + ) => JSX.Element; } const system_msg_added_sig: ChangeType = { @@ -33,6 +36,7 @@ const system_msg_added_sig: ChangeType = { const message = messages[0]; return ( @@ -123,6 +127,7 @@ const human_baseline_session: ChangeType = { return ( +
{Object.keys(toolsInfo).map((key) => { return ( - +
= ({ toolDefinitions }) => { - return toolDefinitions.map((toolDefinition) => { + return toolDefinitions.map((toolDefinition, idx) => { const toolName = toolDefinition.name; const toolArgs = toolDefinition.parameters?.properties ? Object.keys(toolDefinition.parameters.properties) : []; - return ; + return ( + + ); }); }; diff --git a/src/inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx b/src/inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx index 70651a622..d43b6b52b 100644 --- a/src/inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx +++ b/src/inspect_ai/_view/www/src/samples/transcript/state/StateEventView.tsx @@ -44,14 +44,6 @@ export const StateEventView: React.FC = ({ // Synthesize objects for comparison const [before, after] = synthesizeComparable(event.changes); - const tabs = [ - , - ]; // This clone is important since the state is used by react as potential values that are rendered // and as a result may be decorated with additional properties, etc..., resulting in DOM elements // appearing attached to state. @@ -60,14 +52,6 @@ export const StateEventView: React.FC = ({ structuredClone(after), isStore, ); - if (changePreview) { - tabs.unshift( -
- {changePreview} -
, - ); - } - // Compute the title const title = event.event === "state" ? "State Updated" : "Store Updated"; @@ -77,7 +61,7 @@ export const StateEventView: React.FC = ({ title={title} className={className} subTitle={formatDateTime(new Date(event.timestamp))} - text={tabs.length === 1 ? summary : undefined} + text={!changePreview ? summary : undefined} collapse={changePreview === undefined ? true : undefined} selectedNav={eventState.selectedNav || ""} setSelectedNav={(selectedNav) => { @@ -88,7 +72,17 @@ export const StateEventView: React.FC = ({ setEventState({ ...eventState, collapsed }); }} > - {tabs} + {changePreview ? ( +
+ {changePreview} +
+ ) : undefined} + ); }; @@ -153,7 +147,8 @@ const generatePreview = ( } } if (matchingOps === requiredMatchCount) { - results.push(changeType.render(changes, resolvedState)); + const el = changeType.render(changes, resolvedState); + results.push(el); // Only one renderer can process a change // TODO: consider changing this to allow many handlers to render (though then we sort of need // to match the renderer to the key (e.g. a rendered for `tool_choice` a renderer for `tools` etc..)) diff --git a/src/inspect_ai/_view/www/src/types/log.d.ts b/src/inspect_ai/_view/www/src/types/log.d.ts index e816254fd..d9c9a4df7 100644 --- a/src/inspect_ai/_view/www/src/types/log.d.ts +++ b/src/inspect_ai/_view/www/src/types/log.d.ts @@ -112,6 +112,7 @@ export type Input = | ChatMessageAssistant | ChatMessageTool )[]; +export type Role = "system"; export type Content = | string | (ContentText | ContentImage | ContentAudio | ContentVideo)[]; @@ -127,18 +128,17 @@ export type Type4 = "video"; export type Video = string; export type Format1 = "mp4" | "mpeg" | "mov"; export type Source = ("input" | "generate") | null; -export type Role = "system"; +export type Role1 = "user"; export type Content1 = | string | (ContentText | ContentImage | ContentAudio | ContentVideo)[]; export type Source1 = ("input" | "generate") | null; -export type Role1 = "user"; export type ToolCallId = string[] | null; +export type Role2 = "assistant"; export type Content2 = | string | (ContentText | ContentImage | ContentAudio | ContentVideo)[]; export type Source2 = ("input" | "generate") | null; -export type Role2 = "assistant"; export type ToolCalls = ToolCall[] | null; export type Id1 = string; export type Function = string; @@ -148,11 +148,11 @@ export type Title = string | null; export type Format2 = "text" | "markdown"; export type Content3 = string; export type Reasoning = string | null; +export type Role3 = "tool"; export type Content4 = | string | (ContentText | ContentImage | ContentAudio | ContentVideo)[]; export type Source3 = ("input" | "generate") | null; -export type Role3 = "tool"; export type ToolCallId1 = string | null; export type Function1 = string | null; export type Type6 = @@ -315,6 +315,7 @@ export type Timestamp8 = string; export type Pending8 = boolean | null; export type Event8 = "score"; export type Target2 = string | string[] | null; +export type Intermediate = boolean; export type Timestamp9 = string; export type Pending9 = boolean | null; export type Event9 = "error"; @@ -339,6 +340,7 @@ export type Lineno = number; export type Timestamp11 = string; export type Pending11 = boolean | null; export type Event11 = "info"; +export type Source4 = string | null; export type Timestamp12 = string; export type Pending12 = boolean | null; export type Event12 = "step"; @@ -424,6 +426,9 @@ export type SampleId1 = string | number | null; export type Samples2 = EvalSampleScore[]; export type Location1 = string; +/** + * Evaluation log. + */ export interface EvalLog { version?: Version; status?: Status; @@ -436,6 +441,9 @@ export interface EvalLog { reductions?: Reductions; location?: Location1; } +/** + * Eval target and configuration. + */ export interface EvalSpec { run_id: RunId; created: Created; @@ -460,6 +468,9 @@ export interface EvalSpec { } export interface TaskAttribs {} export interface TaskArgs {} +/** + * Dataset used for evaluation. 
+ */ export interface EvalDataset { name: Name; location: Location; @@ -468,6 +479,9 @@ export interface EvalDataset { shuffled: Shuffled; } export interface ModelArgs {} +/** + * Configuration used for evaluation. + */ export interface EvalConfig { limit: Limit; sample_id: SampleId; @@ -513,6 +527,9 @@ export interface ApproverPolicyConfig { params: Params; } export interface Params {} +/** + * Git revision for evaluation. + */ export interface EvalRevision { type: Type; origin: Origin; @@ -521,19 +538,25 @@ export interface EvalRevision { export interface Packages { [k: string]: string; } +/** + * Plan (solvers) used in evaluation. + */ export interface EvalPlan { name: Name2; steps: Steps; finish: EvalPlanStep | null; config: GenerateConfig; } +/** + * Solver step. + */ export interface EvalPlanStep { solver: Solver1; params: Params1; } export interface Params1 {} /** - * Base class for model generation configs. + * Model generation options. */ export interface GenerateConfig { max_retries: MaxRetries; @@ -560,12 +583,18 @@ export interface GenerateConfig { reasoning_effort: ReasoningEffort; reasoning_history: ReasoningHistory; } +/** + * Scoring results from evaluation. + */ export interface EvalResults { total_samples: TotalSamples; completed_samples: CompletedSamples; scores: Scores; metadata: Metadata3; } +/** + * Score for evaluation task. + */ export interface EvalScore { name: Name3; scorer: Scorer; @@ -578,13 +607,19 @@ export interface Params2 {} export interface Metrics { [k: string]: EvalMetric; } +/** + * Metric for evaluation score. + */ export interface EvalMetric { name: Name4; value: Value; - options: Options; + params: Params3; metadata: Metadata1; } -export interface Options {} +export interface Params3 {} +/** + * Timing and usage statistics. + */ export interface EvalStats { started_at: StartedAt; completed_at: CompletedAt; @@ -593,6 +628,9 @@ export interface EvalStats { export interface ModelUsage { [k: string]: ModelUsage1; } +/** + * Token usage for completion. + */ export interface ModelUsage1 { input_tokens: InputTokens; output_tokens: OutputTokens; @@ -600,11 +638,17 @@ export interface ModelUsage1 { input_tokens_cache_write: InputTokensCacheWrite; input_tokens_cache_read: InputTokensCacheRead; } +/** + * Eval error details. + */ export interface EvalError { message: Message; traceback: Traceback; traceback_ansi: TracebackAnsi; } +/** + * Sample from evaluation task. + */ export interface EvalSample { id: Id; epoch: Epoch; @@ -625,40 +669,61 @@ export interface EvalSample { attachments: Attachments; limit: EvalSampleLimit | null; } +/** + * System chat message. + */ export interface ChatMessageSystem { + role: Role; content: Content; source: Source; - role: Role; } +/** + * Text content. + */ export interface ContentText { type: Type1; text: Text; } +/** + * Image content. + */ export interface ContentImage { type: Type2; image: Image; detail: Detail; } +/** + * Audio content. + */ export interface ContentAudio { type: Type3; audio: Audio; format: Format; } +/** + * Video content. + */ export interface ContentVideo { type: Type4; video: Video; format: Format1; } +/** + * User chat message. + */ export interface ChatMessageUser { + role: Role1; content: Content1; source: Source1; - role: Role1; tool_call_id: ToolCallId; } +/** + * Assistant chat message. 
+ */ export interface ChatMessageAssistant { + role: Role2; content: Content2; source: Source2; - role: Role2; tool_calls: ToolCalls; reasoning: Reasoning; } @@ -679,10 +744,13 @@ export interface ToolCallContent { format: Format2; content: Content3; } +/** + * Tool chat message. + */ export interface ChatMessageTool { + role: Role3; content: Content4; source: Source3; - role: Role3; tool_call_id: ToolCallId1; function: Function1; error: ToolCallError | null; @@ -691,6 +759,9 @@ export interface ToolCallError { type: Type6; message: Message1; } +/** + * Output from model generation. + */ export interface ModelOutput { model: Model1; choices: Choices1; @@ -699,6 +770,9 @@ export interface ModelOutput { metadata: Metadata4; error: Error; } +/** + * Choice generated for completion. + */ export interface ChatCompletionChoice { message: ChatMessageAssistant; stop_reason: StopReason; @@ -729,12 +803,6 @@ export interface TopLogprob { } /** * Score generated by a scorer. - * - * Args: - * value (Value): Score value. - * answer (str | None): Answer extracted from model output (optional). - * explanation (str | None): Explanation of score (optional). - * metadata (dict[str,Any]): Additional metadata related to the score. */ export interface Score { value: Value1; @@ -754,6 +822,9 @@ export interface SampleInitEvent { sample: Sample; state: JsonValue; } +/** + * Sample for an evaluation task. + */ export interface Sample { input: Input1; choices: Choices2; @@ -888,7 +959,7 @@ export interface ToolFunction { name: Name6; } /** - * Base class for model generation configs. + * Model generation options. */ export interface GenerateConfig1 { max_retries: MaxRetries; @@ -984,7 +1055,10 @@ export interface InputEvent { input_ansi: InputAnsi; } /** - * Event with sample score. + * Event with score. + * + * Can be the final score for a `Sample`, or can be an intermediate score + * resulting from a call to `score`. */ export interface ScoreEvent { timestamp: Timestamp8; @@ -992,6 +1066,7 @@ export interface ScoreEvent { event: Event8; score: Score; target: Target2; + intermediate: Intermediate; } /** * Event with sample error. @@ -1011,6 +1086,9 @@ export interface LoggerEvent { event: Event10; message: LoggingMessage; } +/** + * Message written to Python log. + */ export interface LoggingMessage { name: Name7; level: Level; @@ -1027,6 +1105,7 @@ export interface InfoEvent { timestamp: Timestamp11; pending: Pending11; event: Event11; + source: Source4; data: JsonValue; } /** @@ -1063,15 +1142,24 @@ export interface ModelUsage2 { export interface Attachments { [k: string]: string; } +/** + * Limit encontered by sample. + */ export interface EvalSampleLimit { type: Type13; limit: Limit2; } +/** + * Score reductions. + */ export interface EvalSampleReductions { scorer: Scorer1; reducer: Reducer1; samples: Samples2; } +/** + * Score and sample_id scored. 
+ */ export interface EvalSampleScore { value: Value2; answer: Answer1; diff --git a/src/inspect_ai/_view/www/src/usage/ModelTokenTable.tsx b/src/inspect_ai/_view/www/src/usage/ModelTokenTable.tsx index b55ab07a4..00abf9a75 100644 --- a/src/inspect_ai/_view/www/src/usage/ModelTokenTable.tsx +++ b/src/inspect_ai/_view/www/src/usage/ModelTokenTable.tsx @@ -14,7 +14,13 @@ export const ModelTokenTable: React.FC = ({ {Object.keys(model_usage).map((key) => { - return ; + return ( + + ); })} diff --git a/src/inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx b/src/inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx index 76b323a65..d6f098588 100644 --- a/src/inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx +++ b/src/inspect_ai/_view/www/src/usage/ModelUsagePanel.tsx @@ -68,12 +68,14 @@ export const ModelUsagePanel: React.FC = ({ usage }) => { return (
- {rows.map((row) => { + {rows.map((row, idx) => { if (row.label === "---") { - return
; + return ( +
+ ); } else { return ( - +
= ({ [setSelectedTab], ); - // Compute tab panels anytime the tabs change - const tabPanels = useMemo(() => { - return Object.keys(tabs).map((key) => { - const tab = tabs[key]; - return ( - { - onScroll(tab.id, position); - }, - [onScroll], - )} - > - {tab.content()} - - ); - }); - }, [tabs, selectedTab]); - if (evalSpec === undefined) { return ; } else { @@ -150,7 +124,31 @@ export const WorkSpaceView: React.FC = ({ tabControlsClassName={clsx(styles.tabs, "text-size-smaller")} tabPanelsClassName={clsx(styles.tabPanels)} > - {tabPanels} + {Object.keys(tabs).map((key) => { + const tab = tabs[key]; + return ( + { + onScroll(tab.id, position); + }, + [onScroll], + )} + > + {tab.content()} + + ); + })}
diff --git a/src/inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx b/src/inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx index 5c6590fe1..0c9b85205 100644 --- a/src/inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx +++ b/src/inspect_ai/_view/www/src/workspace/navbar/PrimaryBar.tsx @@ -30,16 +30,6 @@ export const PrimaryBar: React.FC = ({ evalSpec, setOffcanvas, }) => { - let statusPanel; - if (status === "success") { - statusPanel = ; - } else if (status === "cancelled") { - statusPanel = ; - } else if (status === "started") { - statusPanel = ; - } else if (status === "error") { - statusPanel = ; - } const logFileName = file ? filename(file) : ""; const handleToggle = useCallback(() => { @@ -103,7 +93,18 @@ export const PrimaryBar: React.FC = ({
- {statusPanel} + {status === "success" ? ( + + ) : undefined} + {status === "cancelled" ? ( + + ) : undefined} + {status === "started" ? ( + + ) : undefined} + {status === "error" ? ( + + ) : undefined}
{evalSpec?.created} diff --git a/src/inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css b/src/inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css index 7f2d323e0..d57dbbe2f 100644 --- a/src/inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css +++ b/src/inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.module.css @@ -14,13 +14,13 @@ flex-direction: row; flex-wrap: wrap; justify-content: end; - height: 100%; align-items: center; margin-top: 0.2rem; padding-bottom: 0.4rem; row-gap: 1em; max-height: 15em; overflow: scroll; + align-items: baseline; } .verticalMetricReducer { @@ -39,14 +39,26 @@ } .verticalMetricValue { - font-size: var(--inspect-font-size-larger); font-weight: 500; text-align: center; } +.multiScorer { + padding-left: 0; + height: 100%; + display: flex; + flex-direction: column; + padding: 0.5em 1em; +} + +.multiScorerIndent { + padding-left: 1.5em; +} + .multiScorerReducer { text-align: center; margin-bottom: -0.3rem; + margin-top: 0.2em; } .multiScorerLabel { @@ -58,10 +70,21 @@ .multiScorerValue { display: grid; grid-template-columns: auto auto; + grid-auto-rows: auto; grid-column-gap: 0.3rem; grid-row-gap: 0; + padding-top: 0.3em; } .multiScorerValueContent { font-weight: 600; + text-align: center; +} + +.multiScoreMetricGrid { + display: grid; + grid-template-rows: auto auto; + column-gap: 1em; + padding: 0 0.2em; + justify-content: center; } diff --git a/src/inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx b/src/inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx index 1f0aafae0..7819336f6 100644 --- a/src/inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx +++ b/src/inspect_ai/_view/www/src/workspace/navbar/ResultsPanel.tsx @@ -1,6 +1,7 @@ import clsx from "clsx"; import { EvalMetric, EvalResults, EvalScore, Reducer } from "../../types/log"; import { formatPrettyDecimal } from "../../utils/format"; +import { metricDisplayName } from "../utils"; import styles from "./ResultsPanel.module.css"; interface ResultsPanelProps { @@ -23,7 +24,7 @@ export const ResultsPanel: React.FC = ({ results }) => { metric: { name: key, value: score.metrics[key].value, - options: {}, + params: score.metrics[key].params, metadata: {}, }, }; @@ -31,18 +32,35 @@ export const ResultsPanel: React.FC = ({ results }) => { }); const metrics = Object.values(scorers)[0]; + const showReducer = !!metrics[0].reducer; return (
{metrics.map((metric, i) => { - return ; + return ( + + ); })}
); } else { + const showReducer = + results?.scores.findIndex((score) => !!score.reducer) !== -1; return (
{results?.scores?.map((score, index) => { - return ; + return ( + + ); })}
); @@ -52,6 +70,7 @@ export const ResultsPanel: React.FC = ({ results }) => { interface VerticalMetricProps { metricSummary: MetricSummary; isFirst: boolean; + showReducer: boolean; } /** Renders a Vertical Metric @@ -59,21 +78,8 @@ interface VerticalMetricProps { const VerticalMetric: React.FC = ({ metricSummary, isFirst, + showReducer, }) => { - const reducer_component = metricSummary.reducer ? ( -
- {metricSummary.reducer} -
- ) : ( - "" - ); - return (
= ({ styles.verticalMetricName, )} > - {metricSummary.metric.name} + {metricDisplayName(metricSummary.metric)}
- {reducer_component} + {showReducer ? ( +
+ {metricSummary.reducer || "default"} +
+ ) : undefined} +
{formatPrettyDecimal(metricSummary.metric.value)}
@@ -99,33 +120,25 @@ const VerticalMetric: React.FC = ({ interface MultiScorerMetricProps { scorer: EvalScore; isFirst: boolean; + showReducer: boolean; } const MultiScorerMetric: React.FC = ({ scorer, isFirst, + showReducer, }) => { const titleFontClz = "text-size-base"; const reducerFontClz = "text-size-smaller"; const valueFontClz = "text-size-base"; - const reducer_component = scorer.reducer ? ( + return (
- {scorer.reducer} -
- ) : ( - "" - ); - - return ( -
= ({ > {scorer.name}
- {reducer_component} + {showReducer ? ( +
+ {scorer.reducer || "default"} +
+ ) : undefined}
{Object.keys(scorer.metrics).map((key) => { const metric = scorer.metrics[key]; return ( -
-
{metric.name}
+
+
{metricDisplayName(metric)}
{formatPrettyDecimal(metric.value)}
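
`ResultsPanel` now labels metrics with `metricDisplayName` and carries metric `params` instead of the old `options` field. A simplified, standalone version of that helper, mirroring the `workspace/utils.ts` addition later in this diff (the `"school"` cluster value below is a hypothetical example):

```typescript
// Simplified metric shape; the full EvalMetric type also carries value
// and metadata fields.
interface MetricInfo {
  name: string;
  params: Record<string, unknown>;
}

// A clustered stderr metric is labelled with its cluster parameter;
// every other metric keeps its plain name.
const metricDisplayName = (metric: MetricInfo): string => {
  const cluster = metric.params["cluster"];
  if (metric.name === "stderr" && typeof cluster === "string") {
    return `${metric.name}[${cluster}]`;
  }
  return metric.name;
};

console.log(
  metricDisplayName({ name: "stderr", params: { cluster: "school" } }),
); // -> "stderr[school]"
console.log(metricDisplayName({ name: "accuracy", params: {} }));
// -> "accuracy"
```
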
diff --git a/src/inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx b/src/inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx index 5eee8b6c8..7b562348c 100644 --- a/src/inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx +++ b/src/inspect_ai/_view/www/src/workspace/navbar/SecondaryBar.tsx @@ -53,6 +53,7 @@ export const SecondaryBar: React.FC = ({ size: "minmax(12%, auto)", value: ( @@ -71,6 +72,7 @@ export const SecondaryBar: React.FC = ({ size: "minmax(12%, auto)", value: ( = ({ size: "minmax(12%, auto)", value: ( @@ -106,6 +109,7 @@ export const SecondaryBar: React.FC = ({ size: "minmax(12%, auto)", value: ( diff --git a/src/inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx b/src/inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx index 7792494eb..50fd4519a 100644 --- a/src/inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx +++ b/src/inspect_ai/_view/www/src/workspace/sidebar/SidebarScoreView.tsx @@ -2,17 +2,19 @@ import clsx from "clsx"; import { EvalScore } from "../../types/log"; import { formatPrettyDecimal } from "../../utils/format"; +import { metricDisplayName } from "../utils"; import styles from "./SidebarScoreView.module.css"; interface SidebarScoreProps { scorer: EvalScore; } export const SidebarScoreView: React.FC = ({ scorer }) => { + const showReducer = !!scorer.reducer; return (
{Object.keys(scorer.metrics).map((metric) => { return ( -
+
= ({ scorer }) => { styles.metricName, )} > - {scorer.metrics[metric].name} + {metricDisplayName(scorer.metrics[metric])}
- {scorer.reducer ? ( + {showReducer ? (
- ${scorer.reducer} + {scorer.reducer || "default"}
) : ( "" diff --git a/src/inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx b/src/inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx index 124049de1..01edcb546 100644 --- a/src/inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx +++ b/src/inspect_ai/_view/www/src/workspace/sidebar/SidebarScoresView.tsx @@ -2,6 +2,7 @@ import clsx from "clsx"; import { Fragment } from "react"; import { Scores } from "../../types/log"; import { formatPrettyDecimal } from "../../utils/format"; +import { metricDisplayName } from "../utils"; import styles from "./SidebarScoresView.module.css"; interface SidebarScoresProps { @@ -9,26 +10,34 @@ interface SidebarScoresProps { } export const SidebarScoresView: React.FC = ({ scores }) => { + const showReducer = scores.findIndex((score) => !!score.reducer) !== -1; return (
- {scores.map((score) => { + {scores.map((score, idx) => { const name = score.name; const reducer = score.reducer; return ( -
+
{name}
- {reducer ? ( -
- {reducer} + {showReducer ? ( +
+ {reducer || "default"}
) : ( "" @@ -38,14 +47,7 @@ export const SidebarScoresView: React.FC = ({ scores }) => { const metric = score.metrics[key]; return ( -
- {metric.name} -
+
{metricDisplayName(metric)}
{formatPrettyDecimal(metric.value)}
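
The sidebar views above apply the same reducer handling as the results panel: the reducer row is rendered only when at least one score defines a reducer, and scores without one fall back to a "default" label so the column stays aligned. A small sketch of that derivation (the `ScoreInfo` shape and `reducerLabels` helper are simplified stand-ins for illustration):

```typescript
// Simplified score shape assumed for this sketch.
interface ScoreInfo {
  name: string;
  reducer: string | null;
}

// Show reducer labels only when at least one score has an explicit
// reducer; otherwise omit the row entirely.
const reducerLabels = (scores: ScoreInfo[]): string[] | undefined => {
  const showReducer = scores.findIndex((score) => !!score.reducer) !== -1;
  return showReducer
    ? scores.map((score) => score.reducer || "default")
    : undefined;
};

console.log(
  reducerLabels([
    { name: "match", reducer: "mean" },
    { name: "f1", reducer: null },
  ]),
); // -> ["mean", "default"]
console.log(reducerLabels([{ name: "match", reducer: null }]));
// -> undefined
```
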
diff --git a/src/inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx b/src/inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx index 21c30af8f..bf163180d 100644 --- a/src/inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx +++ b/src/inspect_ai/_view/www/src/workspace/tabs/InfoTab.tsx @@ -36,24 +36,6 @@ export const InfoTab: React.FC = ({ setHidden(false); }, [evalSpec, evalPlan, evalResults, evalStats, samples]); - const infoCards = []; - infoCards.push([ - , - ]); - - if (evalStatus !== "started") { - infoCards.push(); - } - - // If there is error or progress, includes those within info - if (evalStatus === "error" && evalError) { - infoCards.unshift(); - } - const showWarning = (!samples || samples.length === 0) && evalStatus === "success" && @@ -73,7 +55,15 @@ export const InfoTab: React.FC = ({ "" )}
- {infoCards} + + {evalStatus !== "started" ? : undefined} + {evalStatus === "error" && evalError ? ( + + ) : undefined}
); diff --git a/src/inspect_ai/_view/www/src/workspace/utils.ts b/src/inspect_ai/_view/www/src/workspace/utils.ts new file mode 100644 index 000000000..f36919952 --- /dev/null +++ b/src/inspect_ai/_view/www/src/workspace/utils.ts @@ -0,0 +1,34 @@ +import { EvalMetric } from "../types/log"; + +export const metricDisplayName = (metric: EvalMetric): string => { + let modifier = undefined; + for (const metricModifier of metricModifiers) { + modifier = metricModifier(metric); + if (modifier) { + break; + } + } + const metricName = !modifier ? metric.name : `${metric.name}[${modifier}]`; + + return metricName; +}; + +type MetricModifier = (metric: EvalMetric) => string | undefined; + +const clusterMetricModifier: MetricModifier = ( + metric: EvalMetric, +): string | undefined => { + if (metric.name !== "stderr") { + return undefined; + } + + const clusterValue = ((metric.params || {}) as Record)[ + "cluster" + ]; + if (clusterValue === undefined || typeof clusterValue !== "string") { + return undefined; + } + return clusterValue; +}; + +const metricModifiers: MetricModifier[] = [clusterMetricModifier]; diff --git a/src/inspect_ai/approval/_approval.py b/src/inspect_ai/approval/_approval.py index f51b6b469..d757bd779 100644 --- a/src/inspect_ai/approval/_approval.py +++ b/src/inspect_ai/approval/_approval.py @@ -17,6 +17,8 @@ class Approval(BaseModel): + """Approval details (decision, explanation, etc.)""" + decision: ApprovalDecision """Approval decision.""" diff --git a/src/inspect_ai/approval/_approver.py b/src/inspect_ai/approval/_approver.py index 894732c04..4033263bf 100644 --- a/src/inspect_ai/approval/_approver.py +++ b/src/inspect_ai/approval/_approver.py @@ -20,10 +20,10 @@ async def __call__( Approve or reject a tool call. Args: - message (str): Message genreated by the model along with the tool call. - call (ToolCall): The tool call to be approved. - view (ToolCallView): Custom rendering of tool context and call. - state (state | None): The current task state, if available. + message: Message genreated by the model along with the tool call. + call: The tool call to be approved. + view: Custom rendering of tool context and call. + state: The current task state, if available. Returns: Approval: An Approval object containing the decision and explanation. diff --git a/src/inspect_ai/approval/_auto.py b/src/inspect_ai/approval/_auto.py index 229986af8..92ef38e5d 100644 --- a/src/inspect_ai/approval/_auto.py +++ b/src/inspect_ai/approval/_auto.py @@ -11,7 +11,7 @@ def auto_approver(decision: ApprovalDecision = "approve") -> Approver: """Automatically apply a decision to tool calls. Args: - decision (ApprovalDecision): Decision to apply. + decision: Decision to apply. Returns: Approver: Auto approver. diff --git a/src/inspect_ai/approval/_human/approver.py b/src/inspect_ai/approval/_human/approver.py index 4e5ac5b8d..99dcaf337 100644 --- a/src/inspect_ai/approval/_human/approver.py +++ b/src/inspect_ai/approval/_human/approver.py @@ -14,6 +14,9 @@ def human_approver( ) -> Approver: """Interactive human approver. + Args: + choices: Choices to present to human. + Returns: Approver: Interactive human approver. 
""" diff --git a/src/inspect_ai/approval/_policy.py b/src/inspect_ai/approval/_policy.py index b4625a352..7dc4c5e6b 100644 --- a/src/inspect_ai/approval/_policy.py +++ b/src/inspect_ai/approval/_policy.py @@ -20,8 +20,13 @@ @dataclass class ApprovalPolicy: + """Policy mapping approvers to tools.""" + approver: Approver + """Approver for policy.""" + tools: str | list[str] + """Tools to use this approver for (can be full tool names or globs).""" def policy_approver(policies: str | list[ApprovalPolicy]) -> Approver: diff --git a/src/inspect_ai/approval/_registry.py b/src/inspect_ai/approval/_registry.py index 6ed8c8712..c4264c142 100644 --- a/src/inspect_ai/approval/_registry.py +++ b/src/inspect_ai/approval/_registry.py @@ -31,11 +31,11 @@ def approver(*args: Any, name: str | None = None, **attribs: Any) -> Any: Args: *args: Function returning `Approver` targeted by plain approver decorator without attributes (e.g. `@approver`) - name (str | None): + name: Optional name for approver. If the decorator has no name argument then the name of the function will be used to automatically assign a name. - **attribs: (dict[str,Any]): Additional approver attributes. + **attribs: Additional approver attributes. Returns: Approver with registry attributes. diff --git a/src/inspect_ai/dataset/_dataset.py b/src/inspect_ai/dataset/_dataset.py index adbb88cbd..478866a9f 100644 --- a/src/inspect_ai/dataset/_dataset.py +++ b/src/inspect_ai/dataset/_dataset.py @@ -27,6 +27,8 @@ class Sample(BaseModel): + r"""Sample for an evaluation task.""" + def __init__( self, input: str | list[ChatMessage], @@ -38,22 +40,22 @@ def __init__( files: dict[str, str] | None = None, setup: str | None = None, ) -> None: - r"""Sample to be used in an evaluation task. + r"""Create a Sample. Args: - input (str | list[ChatMessage]): The input to be submitted to the model. - choices (list[str] | None): Optional. List of available answer choices - (used only for multiple-choice evals). - target (str | list[str]): Optional. Ideal target output. May be a literal value + input: The input to be submitted to the model. + choices: Optional. List of available answer choices + (used only for multiple-choice evals). + target: Optional. Ideal target output. May be a literal value or narrative text to be used by a model grader. - id (int | str | None): Optional. Unique identifier for sample. - metadata (dict[str,Any] | None): Optional. Arbitrary metadata associated with the sample. - sandbox (SandboxEnvironmentType | None): Sandbox environment type - (or optionally a str or tuple with a shorthand spec) - files (dict[str, str] | None): Optional. Files that go along with the sample (copied to - SandboxEnvironment). Files can be paths, inline text, or inline binary (base64 encoded data URL). - setup (str | None): Optional. Setup script to run for sample (run - within default SandboxEnvironment). + id: Optional. Unique identifier for sample. + metadata: Optional. Arbitrary metadata associated with the sample. + sandbox (SandboxEnvironmentType | None): Sandbox environment type (or optionally a str or tuple with a shorthand spec) + sandbox: Optional. Sandbox specification for this sample. + files: Optional. Files that go along with the sample (copied to + SandboxEnvironment). Files can be paths, inline text, or inline binary (base64 encoded data URL). + setup: Optional. Setup script to run for sample (run + within default SandboxEnvironment). """ super().__init__( input=input, @@ -144,14 +146,6 @@ def location(self) -> str | None: ... 
@abc.abstractmethod def shuffled(self) -> bool: ... - @abc.abstractmethod - def shuffle_choices(self, seed: int | None = None) -> None: - """Shuffle the order of the choices with each sample. - - Args: - seed: (int | None): Random seed for shuffling (optional). - """ - @overload def __getitem__(self, index: int) -> Sample: ... @@ -164,14 +158,6 @@ def __getitem__(self, index: Union[int, slice]) -> Union[Sample, "Dataset"]: ... @abc.abstractmethod def __len__(self) -> int: ... - @abc.abstractmethod - def shuffle(self, seed: int | None = None) -> None: - """Shuffle the order of the dataset (in place). - - Args: - seed: (int | None): Random seed for shuffling (optional). - """ - @abc.abstractmethod def sort( self, @@ -185,8 +171,8 @@ def sort( The key function defaults to measuring the length of the sample's input field. Args: - reverse (bool): if true, sort in descending order. Defaults to False. - key (Callable[[Any], Any]): a callable mapping each item to a numeric value (optional, defaults to sample_input_len). + reverse: If `Treu`, sort in descending order. Defaults to False. + key: a callable mapping each item to a numeric value (optional, defaults to sample_input_len). """ @abc.abstractmethod @@ -196,28 +182,33 @@ def filter( """Filter the dataset using a predicate. Args: - predicate (Callable[[Sample], bool]): Filtering function. - name (str | None): Name for filtered dataset (optional). + predicate: Filtering function. + name: Name for filtered dataset (optional). Returns: Filtered dataset. """ + @abc.abstractmethod + def shuffle(self, seed: int | None = None) -> None: + """Shuffle the order of the dataset (in place). + + Args: + seed: Random seed for shuffling (optional). + """ + + @abc.abstractmethod + def shuffle_choices(self, seed: int | None = None) -> None: + """Shuffle the order of the choices with each sample. + + Args: + seed: Random seed for shuffling (optional). + """ + @dataclass class FieldSpec: - r"""Specification for mapping data source fields to sample fields. - - Args: - input (str): Name of the field containing the sample input. - target (str): Name of the field containing the sample target. - choices (str): Optional. Name of field containing the list of answer choices. - id (str): Optional. Unique identifier for the sample. - metadata (list[str] | None): List of additional field names that should be read as metadata. - sandbox (str): Optional. Sandbox type along with optional config file - files (str): Optional. Files that go along with the sample. - setup (str): Optional. Setup script to run for sample . - """ + r"""Specification for mapping data source fields to sample fields.""" input: str = field(default="input") """Name of the field containing the sample input.""" diff --git a/src/inspect_ai/dataset/_sources/__init__.py b/src/inspect_ai/dataset/_sources/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/inspect_ai/dataset/_sources/csv.py b/src/inspect_ai/dataset/_sources/csv.py index 04bcfef58..28522109f 100644 --- a/src/inspect_ai/dataset/_sources/csv.py +++ b/src/inspect_ai/dataset/_sources/csv.py @@ -35,30 +35,30 @@ def csv_dataset( r"""Read dataset from CSV file. Args: - csv_file (str): Path to CSV file. Can be a local filesystem path, + csv_file: Path to CSV file. Can be a local filesystem path, a path to an S3 bucket (e.g. "s3://my-bucket"), or an HTTPS URL. Use `fs_options` to pass arguments through to the `S3FileSystem` constructor. 
- sample_fields (FieldSpec | RecordToSample): Method of mapping underlying + sample_fields: Method of mapping underlying fields in the data source to Sample objects. Pass `None` if the data is already stored in `Sample` form (i.e. has "input" and "target" columns.); Pass a `FieldSpec` to specify mapping fields by name; Pass a `RecordToSample` to handle mapping with a custom function that returns one or more samples. - auto_id (bool): Assign an auto-incrementing ID for each sample. - shuffle (bool): Randomly shuffle the dataset order. - seed: (int | None): Seed used for random shuffle. - shuffle_choices: (bool | int | None): Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling. - limit (int | None): Limit the number of records to read. - dialect (str): CSV dialect ("unix", "excel" or"excel-tab"). Defaults to "unix". See https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters for more details - encoding (str): Text encoding for file (defaults to "utf-8"). - name (str): Optional name for dataset (for logging). If not specified, + auto_id: Assign an auto-incrementing ID for each sample. + shuffle: Randomly shuffle the dataset order. + seed: Seed used for random shuffle. + shuffle_choices: Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling. + limit: Limit the number of records to read. + dialect: CSV dialect ("unix", "excel" or"excel-tab"). Defaults to "unix". See https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters for more details + encoding: Text encoding for file (defaults to "utf-8"). + name: Optional name for dataset (for logging). If not specified, defaults to the stem of the filename - fs_options (dict[str, Any]): Optional. Additional arguments to pass through + fs_options: Optional. Additional arguments to pass through to the filesystem provider (e.g. `S3FileSystem`). Use `{"anon": True }` if you are accessing a public S3 bucket with no credentials. - fieldnames (list[str] | None): Optional. A list of fieldnames to use for the CSV. + fieldnames: Optional. A list of fieldnames to use for the CSV. If None, the values in the first row of the file will be used as the fieldnames. Useful for files without a header. - delimiter (str): Optional. The delimiter to use when parsing the file. Defaults to ",". + delimiter: Optional. The delimiter to use when parsing the file. Defaults to ",". Returns: Dataset read from CSV file. diff --git a/src/inspect_ai/dataset/_sources/hf.py b/src/inspect_ai/dataset/_sources/hf.py index b310ff6ac..f2ef5dbe2 100644 --- a/src/inspect_ai/dataset/_sources/hf.py +++ b/src/inspect_ai/dataset/_sources/hf.py @@ -41,36 +41,36 @@ def hf_dataset( `datasets` package, including remote datasets on Hugging Face Hub. Args: - path (str): Path or name of the dataset. Depending on path, the dataset - builder that is used comes from a generic dataset script (JSON, CSV, - Parquet, text etc.) or from the dataset script (a python file) inside - the dataset directory. - split (str): Which split of the data to load. - name (str | None): Name of the dataset configuration. - data_dir (str | None): data_dir of the dataset configuration - to read data from. - revision (str | None): Specific revision to load (e.g. "main", a branch - name, or a specific commit SHA). When using `revision` the `cached` option - is ignored and datasets are revalidated on Hugging Face before loading. 
- sample_fields (FieldSpec | RecordToSample): Method of mapping underlying - fields in the data source to Sample objects. Pass `None` if the data is already - stored in `Sample` form (i.e. has "input" and "target" columns.); Pass a - `FieldSpec` to specify mapping fields by name; Pass a `RecordToSample` to + path: Path or name of the dataset. Depending on path, the dataset + builder that is used comes from a generic dataset script (JSON, CSV, + Parquet, text etc.) or from the dataset script (a python file) inside + the dataset directory. + split: Which split of the data to load. + name: Name of the dataset configuration. + data_dir: data_dir of the dataset configuration + to read data from. + revision: Specific revision to load (e.g. "main", a branch + name, or a specific commit SHA). When using `revision` the `cached` option + is ignored and datasets are revalidated on Hugging Face before loading. + sample_fields: Method of mapping underlying + fields in the data source to Sample objects. Pass `None` if the data is already + stored in `Sample` form (i.e. has "input" and "target" columns.); Pass a + `FieldSpec` to specify mapping fields by name; Pass a `RecordToSample` to handle mapping with a custom function that returns one or more samples. - auto_id (bool): Assign an auto-incrementing ID for each sample. - shuffle (bool): Randomly shuffle the dataset order. - seed: (int | None): Seed used for random shuffle. - shuffle_choices: (bool | int | None): Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling. - limit (int | None): Limit the number of records to read. - trust (bool): Whether or not to allow for datasets defined on the Hub - using a dataset script. This option should only be set to True for - repositories you trust and in which you have read the code, as it - will execute code present on the Hub on your local machine. - cached (bool): By default, datasets are read once from HuggingFace - Hub and then cached for future reads. Pass `cached=False` to force - re-reading the dataset from Hugging Face. Ignored when the `revision` - option is specified. - **kwargs (dict[str, Any]): Additional arguments to pass through to the + auto_id: Assign an auto-incrementing ID for each sample. + shuffle: Randomly shuffle the dataset order. + seed: Seed used for random shuffle. + shuffle_choices: Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling. + limit: Limit the number of records to read. + trust: Whether or not to allow for datasets defined on the Hub + using a dataset script. This option should only be set to True for + repositories you trust and in which you have read the code, as it + will execute code present on the Hub on your local machine. + cached: By default, datasets are read once from HuggingFace + Hub and then cached for future reads. Pass `cached=False` to force + re-reading the dataset from Hugging Face. Ignored when the `revision` + option is specified. + **kwargs (dict[str, Any]): Additional arguments to pass through to the `load_dataset` function of the `datasets` package. Returns: diff --git a/src/inspect_ai/dataset/_sources/json.py b/src/inspect_ai/dataset/_sources/json.py index 1e7491acd..f10fcac1f 100644 --- a/src/inspect_ai/dataset/_sources/json.py +++ b/src/inspect_ai/dataset/_sources/json.py @@ -39,23 +39,23 @@ def json_dataset( the `sample_fields` argument. Args: - json_file (str): Path to JSON file. Can be a local filesystem path or + json_file: Path to JSON file. 
Can be a local filesystem path or a path to an S3 bucket (e.g. "s3://my-bucket"). Use `fs_options` to pass arguments through to the `S3FileSystem` constructor. - sample_fields (FieldSpec | RecordToSample): Method of mapping underlying + sample_fields: Method of mapping underlying fields in the data source to `Sample` objects. Pass `None` if the data is already stored in `Sample` form (i.e. object with "input" and "target" fields); Pass a `FieldSpec` to specify mapping fields by name; Pass a `RecordToSample` to handle mapping with a custom function that returns one or more samples. - auto_id (bool): Assign an auto-incrementing ID for each sample. - shuffle (bool): Randomly shuffle the dataset order. - seed: (int | None): Seed used for random shuffle. - shuffle_choices: (bool | int | None): Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling. - limit (int | None): Limit the number of records to read. - encoding (str): Text encoding for file (defaults to "utf-8"). - name (str): Optional name for dataset (for logging). If not specified, + auto_id: Assign an auto-incrementing ID for each sample. + shuffle: Randomly shuffle the dataset order. + seed: Seed used for random shuffle. + shuffle_choices: Whether to shuffle the choices. If an int is passed, this will be used as the seed when shuffling. + limit: Limit the number of records to read. + encoding: Text encoding for file (defaults to "utf-8"). + name: Optional name for dataset (for logging). If not specified, defaults to the stem of the filename. - fs_options (dict[str, Any]): Optional. Additional arguments to pass through + fs_options: Optional. Additional arguments to pass through to the filesystem provider (e.g. `S3FileSystem`). Use `{"anon": True }` if you are accessing a public S3 bucket with no credentials. diff --git a/src/inspect_ai/log/__init__.py b/src/inspect_ai/log/__init__.py index e523205d0..49f872b4e 100644 --- a/src/inspect_ai/log/__init__.py +++ b/src/inspect_ai/log/__init__.py @@ -22,6 +22,7 @@ EvalResults, EvalRevision, EvalSample, + EvalSampleLimit, EvalSampleReductions, EvalSampleScore, EvalScore, @@ -61,6 +62,7 @@ "EvalResults", "EvalRevision", "EvalSample", + "EvalSampleLimit", "EvalSampleScore", "EvalSampleReductions", "EvalScore", diff --git a/src/inspect_ai/log/_convert.py b/src/inspect_ai/log/_convert.py index bd0d57abf..05d63f8c7 100644 --- a/src/inspect_ai/log/_convert.py +++ b/src/inspect_ai/log/_convert.py @@ -20,12 +20,12 @@ def convert_eval_logs( Args: path (str): Path to source log file(s). Should be either a single - log file or a directory containing log files. + log file or a directory containing log files. to (Literal["eval", "json"]): Format to convert to. If a file is - already in the target format it will just be copied to the output dir. + already in the target format it will just be copied to the output dir. output_dir (str): Output directory to write converted log file(s) to. overwrite (bool): Overwrite existing log files (defaults to `False`, - raising an error if the output file path already exists). + raising an error if the output file path already exists). 
""" from inspect_ai._display import display diff --git a/src/inspect_ai/log/_file.py b/src/inspect_ai/log/_file.py index 491cd231e..f6bfaf90a 100644 --- a/src/inspect_ai/log/_file.py +++ b/src/inspect_ai/log/_file.py @@ -3,6 +3,7 @@ from logging import getLogger from typing import Any, Callable, Generator, Literal, cast +from pydantic import BaseModel from pydantic_core import to_json from inspect_ai._util._async import run_coroutine @@ -22,7 +23,21 @@ logger = getLogger(__name__) -class EvalLogInfo(FileInfo): +class EvalLogInfo(BaseModel): + """File info and task identifiers for eval log.""" + + name: str + """Name of file.""" + + type: str + """Type of file (file or directory)""" + + size: int + """File size in bytes.""" + + mtime: float | None + """File modification time (None if the file is a directory on S3).""" + task: str """Task name.""" @@ -231,7 +246,7 @@ def write_log_dir_manifest( def read_eval_log( - log_file: str | FileInfo, + log_file: str | EvalLogInfo, header_only: bool = False, resolve_attachments: bool = False, format: Literal["eval", "json", "auto"] = "auto", @@ -241,7 +256,7 @@ def read_eval_log( Args: log_file (str | FileInfo): Log file to read. header_only (bool): Read only the header (i.e. exclude - the "samples" and "logging" fields). Defaults to False. + the "samples" and "logging" fields). Defaults to False. resolve_attachments (bool): Resolve attachments (e.g. images) to their full content. format (Literal["eval", "json", "auto"]): Read from format @@ -256,7 +271,7 @@ def read_eval_log( async def read_eval_log_async( - log_file: str | FileInfo, + log_file: str | EvalLogInfo, header_only: bool = False, resolve_attachments: bool = False, format: Literal["eval", "json", "auto"] = "auto", @@ -304,13 +319,13 @@ async def read_eval_log_async( def read_eval_log_headers( - log_files: list[str] | list[FileInfo] | list[EvalLogInfo], + log_files: list[str] | list[EvalLogInfo], ) -> list[EvalLog]: return run_coroutine(read_eval_log_headers_async(log_files)) async def read_eval_log_headers_async( - log_files: list[str] | list[FileInfo] | list[EvalLogInfo], + log_files: list[str] | list[EvalLogInfo], ) -> list[EvalLog]: return [ await read_eval_log_async(log_file, header_only=True) for log_file in log_files @@ -318,7 +333,7 @@ async def read_eval_log_headers_async( def read_eval_log_sample( - log_file: str | FileInfo, + log_file: str | EvalLogInfo, id: int | str, epoch: int = 1, resolve_attachments: bool = False, @@ -347,7 +362,7 @@ def read_eval_log_sample( async def read_eval_log_sample_async( - log_file: str | FileInfo, + log_file: str | EvalLogInfo, id: int | str, epoch: int = 1, resolve_attachments: bool = False, @@ -386,7 +401,7 @@ async def read_eval_log_sample_async( def read_eval_log_samples( - log_file: str | FileInfo, + log_file: str | EvalLogInfo, all_samples_required: bool = True, resolve_attachments: bool = False, format: Literal["eval", "json", "auto"] = "auto", diff --git a/src/inspect_ai/log/_log.py b/src/inspect_ai/log/_log.py index b50877660..c3296aa33 100644 --- a/src/inspect_ai/log/_log.py +++ b/src/inspect_ai/log/_log.py @@ -4,11 +4,17 @@ import traceback from logging import getLogger from types import TracebackType -from typing import Any, Literal, Type +from typing import Any, Literal, Type, TypedDict import click import tenacity -from pydantic import BaseModel, ConfigDict, Field, PrivateAttr, model_validator +from pydantic import ( + BaseModel, + ConfigDict, + Field, + PrivateAttr, + model_validator, +) from rich.console import Console, RenderableType 
from rich.traceback import Traceback @@ -30,7 +36,31 @@ SCORER_PLACEHOLDER = "88F74D2C" +class EvalConfigDefaults(TypedDict): + epochs: int + epochs_reducer: list[str] + fail_on_error: bool + sandbox_cleanup: bool + log_samples: bool + log_images: bool + score_display: bool + + +def eval_config_defaults() -> EvalConfigDefaults: + return { + "epochs": 1, + "epochs_reducer": ["mean"], + "fail_on_error": True, + "sandbox_cleanup": True, + "log_samples": True, + "log_images": True, + "score_display": True, + } + + class EvalConfig(BaseModel): + """Configuration used for evaluation.""" + limit: int | tuple[int, int] | None = Field(default=None) """Sample limit (number of samples or range of samples).""" @@ -109,6 +139,8 @@ def convert_max_messages_to_message_limit( class EvalSampleLimit(BaseModel): + """Limit encontered by sample.""" + type: Literal["context", "time", "message", "token", "operator", "custom"] """The type of limit""" @@ -117,6 +149,8 @@ class EvalSampleLimit(BaseModel): class EvalSample(BaseModel): + """Sample from evaluation task.""" + id: int | str """Unique id for sample.""" @@ -191,7 +225,7 @@ def store_as(self, model_cls: Type[SMT]) -> SMT: """Attachments referenced from messages and events. Resolve attachments for a sample (replacing attachment://* references with - attachment content) with the resolve_sample_attachments() function. + attachment content) by passing `resolve_attachments=True` to log reading functions. """ limit: EvalSampleLimit | None = Field(default=None) @@ -262,6 +296,8 @@ class EvalEvents(BaseModel): class EvalPlanStep(BaseModel): + """Solver step.""" + solver: str """Name of solver.""" @@ -270,6 +306,8 @@ class EvalPlanStep(BaseModel): class EvalPlan(BaseModel): + """Plan (solvers) used in evaluation.""" + name: str = Field(default="plan") """Plan name.""" @@ -284,6 +322,8 @@ class EvalPlan(BaseModel): class EvalMetric(BaseModel): + """Metric for evaluation score.""" + name: str """Metric name.""" @@ -298,6 +338,8 @@ class EvalMetric(BaseModel): class EvalScore(BaseModel): + """Score for evaluation task.""" + name: str """Score name.""" @@ -318,11 +360,15 @@ class EvalScore(BaseModel): class EvalSampleScore(Score): + """Score and sample_id scored.""" + sample_id: str | int | None = Field(default=None) """Sample ID.""" class EvalSampleReductions(BaseModel): + """Score reductions.""" + scorer: str """Name the of scorer""" @@ -334,6 +380,8 @@ class EvalSampleReductions(BaseModel): class EvalResults(BaseModel): + """Scoring results from evaluation.""" + total_samples: int = Field(default=0) """Total samples in eval (dataset samples * epochs)""" @@ -416,6 +464,8 @@ def convert_scorer_to_scorers( class EvalDataset(BaseModel): + """Dataset used for evaluation.""" + name: str | None = Field(default=None) """Dataset name.""" @@ -433,6 +483,8 @@ class EvalDataset(BaseModel): class EvalRevision(BaseModel): + """Git revision for evaluation.""" + type: Literal["git"] """Type of revision (currently only "git")""" @@ -444,6 +496,8 @@ class EvalRevision(BaseModel): class EvalSpec(BaseModel): + """Eval target and configuration.""" + run_id: str = Field(default_factory=str) """Unique run id""" @@ -547,6 +601,8 @@ def rich_traceback( class EvalStats(BaseModel): + """Timing and usage statistics.""" + started_at: str = Field(default_factory=str) """Evaluation start time.""" @@ -561,6 +617,8 @@ class EvalStats(BaseModel): class EvalLog(BaseModel): + """Evaluation log.""" + # WARNING: The order of these fields is important for the log file format. 
# Do not change the order of these fields without incrementing the version number, # updating the log file read/write functionality (such as read_eval_log), @@ -576,13 +634,13 @@ class EvalLog(BaseModel): eval: EvalSpec """Eval identity and configuration.""" - plan: EvalPlan = Field(default=EvalPlan()) + plan: EvalPlan = Field(default_factory=EvalPlan) """Eval plan (solvers and config)""" results: EvalResults | None = None """Eval results (scores and metrics).""" - stats: EvalStats = Field(default=EvalStats()) + stats: EvalStats = Field(default_factory=EvalStats) """Eval stats (runtime, model usage)""" error: EvalError | None = Field(default=None) diff --git a/src/inspect_ai/log/_message.py b/src/inspect_ai/log/_message.py index f68f007b4..10a65362c 100644 --- a/src/inspect_ai/log/_message.py +++ b/src/inspect_ai/log/_message.py @@ -11,6 +11,8 @@ class LoggingMessage(BaseModel): + """Message written to Python log.""" + name: str | None = Field(default=None) """Logger name (e.g. 'httpx')""" @@ -33,7 +35,7 @@ class LoggingMessage(BaseModel): """Logged from line number.""" @staticmethod - def from_log_record(record: LogRecord) -> "LoggingMessage": + def _from_log_record(record: LogRecord) -> "LoggingMessage": """Create a LoggingMesssage from a LogRecord. Args: diff --git a/src/inspect_ai/log/_recorders/file.py b/src/inspect_ai/log/_recorders/file.py index 742f0ea9d..e11aa58cf 100644 --- a/src/inspect_ai/log/_recorders/file.py +++ b/src/inspect_ai/log/_recorders/file.py @@ -28,6 +28,10 @@ def __init__( def is_local(self) -> bool: return self.fs.is_local() + @override + def is_writeable(self) -> bool: + return self.fs.is_writeable(self.log_dir) + @override @classmethod async def read_log_sample( diff --git a/src/inspect_ai/log/_recorders/recorder.py b/src/inspect_ai/log/_recorders/recorder.py index fa347a93d..baf802dfd 100644 --- a/src/inspect_ai/log/_recorders/recorder.py +++ b/src/inspect_ai/log/_recorders/recorder.py @@ -21,6 +21,9 @@ def handles_location(cls, location: str) -> bool: ... @abc.abstractmethod def default_log_buffer(self) -> int: ... + @abc.abstractmethod + def is_writeable(self) -> bool: ... + @abc.abstractmethod async def log_init(self, eval: EvalSpec, location: str | None = None) -> str: ... diff --git a/src/inspect_ai/log/_transcript.py b/src/inspect_ai/log/_transcript.py index 03de33234..b61932563 100644 --- a/src/inspect_ai/log/_transcript.py +++ b/src/inspect_ai/log/_transcript.py @@ -167,7 +167,7 @@ class ToolEvent(BaseEvent): events: list["Event"] = Field(default_factory=list) """Transcript of events for tool.""" - def set_result( + def _set_result( self, result: ToolResult, truncated: tuple[int, int] | None, @@ -182,11 +182,11 @@ def set_result( # mechanism for operator to cancel the tool call - def set_task(self, task: asyncio.Task[Any]) -> None: + def _set_task(self, task: asyncio.Task[Any]) -> None: """Set the tool task (for possible cancellation)""" self._task = task - def cancel(self) -> None: + def _cancel(self) -> None: """Cancel the tool task.""" if self._task: self._cancelled = True @@ -264,6 +264,9 @@ class InfoEvent(BaseEvent): event: Literal["info"] = Field(default="info") """Event type.""" + source: str | None = Field(default=None) + """Optional source for info event.""" + data: JsonValue """Data provided with event.""" @@ -279,17 +282,24 @@ class ErrorEvent(BaseEvent): class ScoreEvent(BaseEvent): - """Event with sample score.""" + """Event with score. 
+ + Can be the final score for a `Sample`, or can be an intermediate score + resulting from a call to `score`. + """ event: Literal["score"] = Field(default="score") """Event type.""" score: Score - """Sample score.""" + """Score value.""" target: str | list[str] | None = Field(default=None) """"Sample target.""" + intermediate: bool = Field(default=False) + """Was this an intermediate scoring?""" + class StepEvent(BaseEvent): """Step within current sample or subtask.""" @@ -355,13 +365,14 @@ def __init__(self, name: str = "") -> None: self.name = name self._events: list[Event] = [] - def info(self, data: JsonValue) -> None: + def info(self, data: JsonValue, *, source: str | None = None) -> None: """Add an `InfoEvent` to the transcript. Args: - data (JsonValue): Data associated with the event. + data: Data associated with the event. + source: Optional event source. """ - self._event(InfoEvent(data=data)) + self._event(InfoEvent(source=source, data=data)) @contextlib.contextmanager def step(self, name: str, type: str | None = None) -> Iterator[None]: diff --git a/src/inspect_ai/model/__init__.py b/src/inspect_ai/model/__init__.py index ece60b7b6..3d3826bf7 100644 --- a/src/inspect_ai/model/__init__.py +++ b/src/inspect_ai/model/__init__.py @@ -21,6 +21,7 @@ from ._chat_message import ( ChatMessage, ChatMessageAssistant, + ChatMessageBase, ChatMessageSystem, ChatMessageTool, ChatMessageUser, @@ -54,6 +55,7 @@ "ContentVideo", "Content", "ChatMessage", + "ChatMessageBase", "ChatMessageSystem", "ChatMessageUser", "ChatMessageAssistant", diff --git a/src/inspect_ai/model/_cache.py b/src/inspect_ai/model/_cache.py index 11bfd5e64..b5dd0b891 100644 --- a/src/inspect_ai/model/_cache.py +++ b/src/inspect_ai/model/_cache.py @@ -58,22 +58,23 @@ def _parse_expiry(period: str) -> int: class CachePolicy: """The `CachePolicy` is used to define various criteria that impact how model calls are cached. - Attributes: - expiry(str | None): Default "24h". The expiry time for the cache entry. - This is a string of the format "12h" for 12 hours or "1W" for a week, - etc. This is how long we will keep the cache entry, if we access it - after this point we'll clear it. Setting to `None` will cache - indefinitely. - per_epoch(bool): Default True. By default we cache responses separately - for different epochs. The general use case is that if there are - multiple epochs, we should cache each response separately because - scorers will aggregate across epochs. However, sometimes a response - can be cached regardless of epoch if the call being made isn't under - test as part of the evaluation. If False, this option allows you to - bypass that and cache independently of the epoch. - scopes(dict[str, str]): A dictionary of additional metadata that should - be included in the cache key. This allows for more fine-grained - control over the cache key generation. + `expiry`: Default "24h". The expiry time for the cache entry. + This is a string of the format "12h" for 12 hours or "1W" for a week, + etc. This is how long we will keep the cache entry, if we access it + after this point we'll clear it. Setting to `None` will cache + indefinitely. + + `per_epoch`: Default True. By default we cache responses separately + for different epochs. The general use case is that if there are + multiple epochs, we should cache each response separately because + scorers will aggregate across epochs. However, sometimes a response + can be cached regardless of epoch if the call being made isn't under + test as part of the evaluation. 
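A small sketch (not part of this patch) of the new `source` argument to `Transcript.info()`, mirroring how the human-agent commands later in this diff tag their events; the solver name is hypothetical.

```python
from inspect_ai.log import transcript
from inspect_ai.solver import Generate, TaskState, solver

@solver
def noisy_solver():  # hypothetical solver, for illustration only
    async def solve(state: TaskState, generate: Generate) -> TaskState:
        # the InfoEvent written here now records where it came from
        transcript().info({"action": "start"}, source="noisy_solver")
        return await generate(state)

    return solve
```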
If False, this option allows you to + bypass that and cache independently of the epoch. + + `scopes`: A dictionary of additional metadata that should + be included in the cache key. This allows for more fine-grained + control over the cache key generation. """ def __init__( @@ -82,6 +83,14 @@ def __init__( per_epoch: bool = True, scopes: dict[str, str] = {}, ) -> None: + """Create a CachePolicy. + + Args: + expiry: Expiry. + per_epoch: Per epoch + scopes: Scopes + + """ self.per_epoch = per_epoch self.scopes = scopes @@ -236,7 +245,11 @@ def cache_fetch(entry: CacheEntry) -> ModelOutput | None: def cache_clear(model: str = "") -> bool: - """Clear the cache directory.""" + """Clear the cache directory. + + Args: + model: Model to clear cache for. + """ try: path = cache_path(model) @@ -252,6 +265,11 @@ def cache_clear(model: str = "") -> bool: def cache_path(model: str = "") -> Path: + """Path to cache directory. + + Args: + model: Path to cache directory for specific model. + """ env_cache_dir = os.environ.get("INSPECT_CACHE_DIR", None) if env_cache_dir: generate_cache = Path(env_cache_dir) / "generate" @@ -320,9 +338,9 @@ def cache_size( will be calculated. Args: - subdirs(list[str]): List of folders to filter by, which are generally + subdirs: List of folders to filter by, which are generally model names. Empty directories will be ignored. - files(list[str]): List of files to filter by explicitly. Note that + files: List of files to filter by explicitly. Note that return value group these up by their parent directory Returns: @@ -344,7 +362,7 @@ def cache_list_expired(filter_by: list[str] = []) -> list[Path]: """Returns a list of all the cached files that have passed their expiry time. Args: - filter_by(list[str]): Default []. List of model names to filter by. If + filter_by: Default []. List of model names to filter by. If an empty list, this will search the entire cache. """ expired_cache_entries = [] @@ -384,7 +402,7 @@ def cache_prune(files: list[Path] = []) -> None: """Delete all expired cache entries. Args: - files(list[Path]): Default []. List of files to prune. If empty, this + files: List of files to prune. If empty, this will search the entire cache. """ if not files: diff --git a/src/inspect_ai/model/_call_tools.py b/src/inspect_ai/model/_call_tools.py index 1b3f5c140..143b312dc 100644 --- a/src/inspect_ai/model/_call_tools.py +++ b/src/inspect_ai/model/_call_tools.py @@ -187,7 +187,7 @@ async def call_tool_task(call: ToolCall) -> tuple[ChatMessageTool, ToolEvent]: view=call.view, pending=True, ) - event.set_task(task) + event._set_task(task) transcript()._event(event) # execute the tool call. 
if the operator cancelled the @@ -227,7 +227,7 @@ async def call_tool_task(call: ToolCall) -> tuple[ChatMessageTool, ToolEvent]: conversation_tool_mesage(tool_message) # update the event with the results - event.set_result( + event._set_result( result=result_event.result, truncated=result_event.truncated, error=result_event.error, diff --git a/src/inspect_ai/model/_chat_message.py b/src/inspect_ai/model/_chat_message.py index 696f0f972..74ade5c6d 100644 --- a/src/inspect_ai/model/_chat_message.py +++ b/src/inspect_ai/model/_chat_message.py @@ -13,8 +13,13 @@ class ChatMessageBase(BaseModel): + """Base class for chat messages.""" + + role: Literal["system", "user", "assistant", "tool"] + """Conversation role""" + content: str | list[Content] - """Content (simple string or list of string|image content)""" + """Content (simple string or list of content objects)""" source: Literal["input", "generate"] | None = Field(default=None) """Source of message.""" @@ -31,9 +36,6 @@ def text(self) -> str: property returns either the plain str content, or if the content is a list of text and images, the text items concatenated together (separated by newline) - - Returns: Text content of `ChatMessage` If this message does - not have text content then "" is returned. """ if isinstance(self.content, str): return self.content @@ -66,11 +68,15 @@ def text(self, text: str) -> None: class ChatMessageSystem(ChatMessageBase): + """System chat message.""" + role: Literal["system"] = Field(default="system") """Conversation role.""" class ChatMessageUser(ChatMessageBase): + """User chat message.""" + role: Literal["user"] = Field(default="user") """Conversation role.""" @@ -79,6 +85,8 @@ class ChatMessageUser(ChatMessageBase): class ChatMessageAssistant(ChatMessageBase): + """Assistant chat message.""" + role: Literal["assistant"] = Field(default="assistant") """Conversation role.""" @@ -112,6 +120,8 @@ def extract_reasoning(cls, data: Any) -> Any: class ChatMessageTool(ChatMessageBase): + """Tool chat message.""" + role: Literal["tool"] = Field(default="tool") """Conversation role.""" diff --git a/src/inspect_ai/model/_generate_config.py b/src/inspect_ai/model/_generate_config.py index 51588c213..1e4f3b47d 100644 --- a/src/inspect_ai/model/_generate_config.py +++ b/src/inspect_ai/model/_generate_config.py @@ -80,7 +80,7 @@ class GenerateConfigArgs(TypedDict, total=False): class GenerateConfig(BaseModel): - """Base class for model generation configs.""" + """Model generation options.""" max_retries: int | None = Field(default=None) """Maximum number of times to retry request (defaults to 5).""" diff --git a/src/inspect_ai/model/_model.py b/src/inspect_ai/model/_model.py index 4bad0d1ff..1fec9fa1c 100644 --- a/src/inspect_ai/model/_model.py +++ b/src/inspect_ai/model/_model.py @@ -149,7 +149,11 @@ def connection_key(self) -> str: return "default" def is_rate_limit(self, ex: BaseException) -> bool: - """Is this exception a rate limit error.""" + """Is this exception a rate limit error. + + Args: + ex: Exception to check for rate limit. + """ return False def collapse_user_messages(self) -> bool: @@ -176,12 +180,18 @@ def has_reasoning_history(self) -> bool: class Model: """Model interface.""" + api: ModelAPI + """Model API.""" + + config: GenerateConfig + """Generation config.""" + def __init__(self, api: ModelAPI, config: GenerateConfig) -> None: """Create a model. Args: - api (ModelAPI): Model API provider. - config (GenerateConfig): Model configuration. + api: Model API provider. + config: Model configuration. 
""" self.api = api self.config = config @@ -212,16 +222,12 @@ async def generate( """Generate output from the model. Args: - input (str | list[ChatMessage]): Chat message - input (if a `str` is passed it is converted + input: Chat message input (if a `str` is passed it is converted to a `ChatMessageUser`). - tools (list[Tool] | list[ToolDef] | list[ToolInfo]): Tools available for the - model to call. - tool_choice (ToolChoice): Directives to the model - as to which tools to prefer. - cache (bool | CachePolicy): Caching behavior for - generate responses (defaults to no caching). - config (GenerateConfig): Model configuration. + tools: Tools available for the model to call. + tool_choice: Directives to the model as to which tools to prefer. + config: Model configuration. + cache: Caching behavior for generate responses (defaults to no caching). Returns: ModelOutput @@ -517,7 +523,8 @@ def complete( ) -> None: # trace if isinstance(result, ModelOutput): - conversation_assistant_message(input, result.choices[0].message) + if result.choices: + conversation_assistant_message(input, result.choices[0].message) event.output = result else: conversation_assistant_error(result) @@ -550,7 +557,7 @@ def __init__(self, model: str | Model) -> None: """Create a ModelName. Args: - model: (str | Model): Model to create name for. + model: Model to create name for. """ if isinstance(model, str): (api, name) = self._parse_model(model) @@ -596,16 +603,16 @@ def get_model( """Get an instance of a model. Args: - model (str | Model | None): Model specification. - If `Model` is passed it is returned unmodified, - if `None` is passed then the model currently being - evaluated is returned (or if there is no evaluation - then the model referred to by `INSPECT_EVAL_MODEL`). - config (GenerateConfig): Configuration for model. - base_url (str | None): Optional. Alternate base URL for model. - api_key (str | None): Optional. API key for model. - **model_args (dict[str,Any]): Additional args to - pass to model constructor. + model: Model specification. + If `Model` is passed it is returned unmodified, + if `None` is passed then the model currently being + evaluated is returned (or if there is no evaluation + then the model referred to by `INSPECT_EVAL_MODEL`). + config: Configuration for model. + base_url: Optional. Alternate base URL for model. + api_key: Optional. API key for model. + **model_args: Additional args to + pass to model constructor. Returns: Model instance. diff --git a/src/inspect_ai/model/_model_output.py b/src/inspect_ai/model/_model_output.py index e187e16fd..f79330555 100644 --- a/src/inspect_ai/model/_model_output.py +++ b/src/inspect_ai/model/_model_output.py @@ -9,6 +9,8 @@ class ModelUsage(BaseModel): + """Token usage for completion.""" + input_tokens: int = Field(default=0) """Total input tokens used.""" @@ -73,6 +75,8 @@ class Logprobs(BaseModel): class ChatCompletionChoice(BaseModel): + """Choice generated for completion.""" + message: ChatMessageAssistant """Assistant message.""" @@ -96,6 +100,8 @@ def migrate_stop_reason( class ModelOutput(BaseModel): + """Output from model generation.""" + model: str = Field(default_factory=str) """Model used for generation.""" @@ -155,7 +161,14 @@ def from_content( stop_reason: StopReason = "stop", error: str | None = None, ) -> "ModelOutput": - """Convenient method to create ModelOutput from simple text content.""" + """Create ModelOutput from simple text content. + + Args: + model: Model name. + content: Text content from generation. 
+ stop_reason: Stop reason for generation. + error: Error message. + """ return ModelOutput( model=model, choices=[ diff --git a/src/inspect_ai/model/_openai.py b/src/inspect_ai/model/_openai.py index d6581d245..22734641b 100644 --- a/src/inspect_ai/model/_openai.py +++ b/src/inspect_ai/model/_openai.py @@ -1,4 +1,5 @@ import json +import re from typing import Literal from openai.types.chat import ( @@ -44,29 +45,13 @@ def is_o_series(name: str) -> bool: - return is_o1(name) or is_o3(name) - - -def is_o1(name: str) -> bool: - return name.startswith("o1") - - -def is_o3(name: str) -> bool: - return name.startswith("o3") - - -def is_o1_full(name: str) -> bool: - return is_o1(name) and not is_o1_mini(name) and not is_o1_preview(name) + return bool(re.match(r"^o\d+", name)) def is_o1_mini(name: str) -> bool: return name.startswith("o1-mini") -def is_o3_mini(name: str) -> bool: - return name.startswith("o3-mini") - - def is_o1_preview(name: str) -> bool: return name.startswith("o1-preview") @@ -132,10 +117,17 @@ async def openai_chat_message( message: ChatMessage, model: str ) -> ChatCompletionMessageParam: if message.role == "system": - if is_o1(model): + # o1-mini does not support developer or system messages + # (see Dec 17, 2024 changelog: https://platform.openai.com/docs/changelog) + if is_o1_mini(model): + return ChatCompletionUserMessageParam(role="user", content=message.text) + # other o-series models use 'developer' rather than 'system' messages + # https://platform.openai.com/docs/guides/reasoning#advice-on-prompting + elif is_o_series(model): return ChatCompletionDeveloperMessageParam( role="developer", content=message.text ) + # gpt models use standard 'system' messages else: return ChatCompletionSystemMessageParam( role=message.role, content=message.text diff --git a/src/inspect_ai/model/_providers/google.py b/src/inspect_ai/model/_providers/google.py index 16bb2717f..d87e252f5 100644 --- a/src/inspect_ai/model/_providers/google.py +++ b/src/inspect_ai/model/_providers/google.py @@ -5,7 +5,7 @@ from copy import copy from io import BytesIO from logging import getLogger -from typing import Any, cast +from typing import Any, MutableSequence, cast import proto # type: ignore from google.ai.generativelanguage import ( @@ -553,11 +553,15 @@ def completion_choice_from_candidate(candidate: Candidate) -> ChatCompletionChoi def completion_choices_from_candidates( - candidates: list[Candidate], + candidates: MutableSequence[Candidate], ) -> list[ChatCompletionChoice]: - candidates = copy(candidates) - candidates.sort(key=lambda c: c.index) - return [completion_choice_from_candidate(candidate) for candidate in candidates] + if candidates: + candidates_list = sorted(candidates, key=lambda c: c.index) + return [ + completion_choice_from_candidate(candidate) for candidate in candidates_list + ] + else: + return [] # google doesn't export FinishReason (it's in a sub-namespace with a beta diff --git a/src/inspect_ai/model/_providers/openai.py b/src/inspect_ai/model/_providers/openai.py index e09a893a9..675285621 100644 --- a/src/inspect_ai/model/_providers/openai.py +++ b/src/inspect_ai/model/_providers/openai.py @@ -36,10 +36,8 @@ ) from .._openai import ( is_gpt, - is_o1_full, is_o1_mini, is_o1_preview, - is_o3, is_o_series, openai_chat_messages, openai_chat_tool_choice, @@ -145,15 +143,9 @@ def is_azure(self) -> bool: def is_o_series(self) -> bool: return is_o_series(self.model_name) - def is_o1_full(self) -> bool: - return is_o1_full(self.model_name) - def is_o1_mini(self) -> bool: return 
is_o1_mini(self.model_name) - def is_o3(self) -> bool: - return is_o3(self.model_name) - def is_o1_preview(self) -> bool: return is_o1_preview(self.model_name) @@ -167,8 +159,8 @@ async def generate( tool_choice: ToolChoice, config: GenerateConfig, ) -> ModelOutput | tuple[ModelOutput | Exception, ModelCall]: - # short-circuit to call o1- models that are text only - if self.is_o1_preview() or self.is_o1_mini(): + # short-circuit to call o1-preview, which doesn't support standard OAI message syntax and tool calling + if self.is_o1_preview(): return await generate_o1( client=self.client, input=input, @@ -303,7 +295,11 @@ def completion_params(self, config: GenerateConfig, tools: bool) -> dict[str, An params["top_logprobs"] = config.top_logprobs if tools and config.parallel_tool_calls is not None and not self.is_o_series(): params["parallel_tool_calls"] = config.parallel_tool_calls - if config.reasoning_effort is not None and not self.is_gpt(): + if ( + config.reasoning_effort is not None + and not self.is_gpt() + and not self.is_o1_mini() + ): params["reasoning_effort"] = config.reasoning_effort return params diff --git a/src/inspect_ai/scorer/__init__.py b/src/inspect_ai/scorer/__init__.py index 3384af82b..62e738182 100644 --- a/src/inspect_ai/scorer/__init__.py +++ b/src/inspect_ai/scorer/__init__.py @@ -10,6 +10,8 @@ NOANSWER, PARTIAL, Metric, + MetricProtocol, + SampleScore, Score, Value, ValueToFloat, @@ -58,8 +60,10 @@ "mean", "var", "Metric", + "MetricProtocol", "metric", "Score", + "SampleScore", "score", "Value", "ValueToFloat", diff --git a/src/inspect_ai/scorer/_answer.py b/src/inspect_ai/scorer/_answer.py index 795ba609c..2f4887737 100644 --- a/src/inspect_ai/scorer/_answer.py +++ b/src/inspect_ai/scorer/_answer.py @@ -43,7 +43,7 @@ def answer(pattern: Literal["letter", "word", "line"]) -> Scorer: Note that you must specify a `type` for the answer scorer. Args: - pattern: (Literal["letter", "word", "line"]): Type of answer + pattern: Type of answer to extract. "letter" is used with multiple choice and extracts a single letter; "word" will extract the next word (often used for yes/no answers); "line" will take diff --git a/src/inspect_ai/scorer/_classification.py b/src/inspect_ai/scorer/_classification.py index 0600e4d3b..c9775dac5 100644 --- a/src/inspect_ai/scorer/_classification.py +++ b/src/inspect_ai/scorer/_classification.py @@ -17,6 +17,10 @@ def f1( """Scorer which produces an F1 score Computes the `F1` score for the answer (which balances recall precision by taking the harmonic mean between recall and precision). + + Args: + answer_fn: Custom function to extract the answer from the completion (defaults to using the completion). + stop_words: Stop words to include in answer tokenization. """ async def score(state: TaskState, target: Target) -> Score: diff --git a/src/inspect_ai/scorer/_match.py b/src/inspect_ai/scorer/_match.py index 98d5d34cd..77a70ab6b 100644 --- a/src/inspect_ai/scorer/_match.py +++ b/src/inspect_ai/scorer/_match.py @@ -15,12 +15,11 @@ def match( """Scorer which matches text or a number. Args: - location (Literal["begin", "end", "any", "exact"]): - Location to match at. "any" matches anywhere in the + location: Location to match at. "any" matches anywhere in the output; "exact" requires the output be exactly equal to the target (module whitespace, etc.) - ignore_case (bool): Do case insensitive comparison. - numeric (bool): Is this a numeric match? (in this + ignore_case: Do case insensitive comparison. + numeric: Is this a numeric match? 
(in this case different punctuation removal rules are used and numbers are normalized before comparison). """ @@ -42,7 +41,7 @@ def includes(ignore_case: bool = True) -> Scorer: """Check whether the specified text is included in the model output. Args: - ignore_case (bool): Use a case insensitive comparison. + ignore_case: Use a case insensitive comparison. """ diff --git a/src/inspect_ai/scorer/_metric.py b/src/inspect_ai/scorer/_metric.py index cbc4d8ee0..7774921a4 100644 --- a/src/inspect_ai/scorer/_metric.py +++ b/src/inspect_ai/scorer/_metric.py @@ -43,19 +43,12 @@ """Value provided by a score. Use the methods of `Score` to easily treat -the Value as a simple scalar of various types. +the `Value` as a simple scalar of various types. """ class Score(BaseModel): - """Score generated by a scorer. - - Args: - value (Value): Score value. - answer (str | None): Answer extracted from model output (optional). - explanation (str | None): Explanation of score (optional). - metadata (dict[str,Any]): Additional metadata related to the score. - """ + """Score generated by a scorer.""" value: Value """Score value.""" @@ -112,12 +105,7 @@ def _as_scalar(self) -> str | int | float | bool: class SampleScore(BaseModel): - """Score for a Sample - - Args: - score: Score - sample_id: (str | int | None) Unique id of a sample - """ + """Score for a Sample.""" score: Score """A score""" @@ -192,33 +180,40 @@ def to_float(value: Value) -> float: @runtime_checkable class MetricDeprecated(Protocol): - r"""Evaluate scores using a metric. - - Args: - scores (list[SampleScore]): List of sample scores. - - Returns: - Metric value - """ - def __call__(self, scores: list[Score]) -> Value: ... @runtime_checkable class MetricProtocol(Protocol): - r"""Evaluate sample scores using a metric. - - Args: - scores (list[SampleScore]): List of scores. - - Returns: - Metric value - """ - - def __call__(self, scores: list[SampleScore]) -> Value: ... - - -Metric = MetricDeprecated | MetricProtocol + def __call__(self, scores: list[SampleScore]) -> Value: + r"""Compute a metric on a list of scores. + + Args: + scores: List of scores. + + Returns: + Metric value + + Examples: + ```python + @metric + def mean() -> Metric: + def metric(scores: list[SampleScore]) -> Value: + return np.mean([score.score.as_float() for score in scores]).item() + return metric + ``` + """ + ... + + +Metric = MetricProtocol | MetricDeprecated +"""Metric protocol. + +The Metric signature changed in release v0.3.64. Both +the previous and new signatures are supported -- you +should use `MetricProtocol` for new code as the +depreacated signature will eventually be removed. +""" P = ParamSpec("P") @@ -272,10 +267,18 @@ def metric( r"""Decorator for registering metrics. Args: - name: (str | MetricType): - Optional name for metric. If the decorator has no name - argument then the name of the underlying MetricType - will be used to automatically assign a name. + name: Optional name for metric. If the decorator has no name + argument then the name of the underlying MetricType + will be used to automatically assign a name. 
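A hedged sketch (not part of this patch) of the new `Metric` signature: metrics now receive `list[SampleScore]`, so they can see per-sample identity rather than bare values. The metric below is illustrative only.

```python
from inspect_ai.scorer import Metric, SampleScore, Value, metric

@metric
def samples_correct() -> Metric:
    """Number of distinct samples scored correct (illustrative)."""

    def compute(scores: list[SampleScore]) -> Value:
        # sample_id is available on SampleScore under the new signature
        correct = {s.sample_id for s in scores if s.score.as_float() == 1.0}
        return len(correct)

    return compute
```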
+ + Examples: + ```python + @metric + def mean() -> Metric: + def metric(scores: list[SampleScore]) -> Value: + return np.mean([score.score.as_float() for score in scores]).item() + return metric + ``` """ # create_metric_wrapper: diff --git a/src/inspect_ai/scorer/_metrics/accuracy.py b/src/inspect_ai/scorer/_metrics/accuracy.py index 2f63fa12d..1d6135cd2 100644 --- a/src/inspect_ai/scorer/_metrics/accuracy.py +++ b/src/inspect_ai/scorer/_metrics/accuracy.py @@ -16,13 +16,11 @@ def accuracy(to_float: ValueToFloat = value_to_float()) -> Metric: r"""Compute proportion of total answers which are correct. Args: - to_float (ValueToFloat): Function for mapping - Value to float for computing metrics. The default - `value_to_float()` maps CORRECT ("C") to 1.0, - INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and - NOANSWER ("N") to 0, casts numeric values to - float directly, and prints a warning and returns - 0 if the Value is a complex object (list or dict). + to_float: Function for mapping `Value` to float for computing + metrics. The default `value_to_float()` maps CORRECT ("C") to 1.0, + INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and NOANSWER ("N") to 0, + casts numeric values to float directly, and prints a warning and returns + 0 if the Value is a complex object (list or dict). Returns: Accuracy metric diff --git a/src/inspect_ai/scorer/_metrics/std.py b/src/inspect_ai/scorer/_metrics/std.py index d7082e890..79ced775d 100644 --- a/src/inspect_ai/scorer/_metrics/std.py +++ b/src/inspect_ai/scorer/_metrics/std.py @@ -21,14 +21,14 @@ def bootstrap_stderr( """Standard error of the mean using bootstrap. Args: - num_samples (int): Number of bootstrap samples to take. - to_float (ValueToFloat): Function for mapping - Value to float for computing metrics. The default - `value_to_float()` maps CORRECT ("C") to 1.0, - INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and - NOANSWER ("N") to 0, casts numeric values to - float directly, and prints a warning and returns - 0 if the Value is a complex object (list or dict). + num_samples: Number of bootstrap samples to take. + to_float: Function for mapping + Value to float for computing metrics. The default + `value_to_float()` maps CORRECT ("C") to 1.0, + INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and + NOANSWER ("N") to 0, casts numeric values to + float directly, and prints a warning and returns + 0 if the Value is a complex object (list or dict). Returns: bootstrap_stderr metric @@ -54,20 +54,17 @@ def stderr( """Standard error of the mean using Central Limit Theorem. Args: - to_float (ValueToFloat): Function for mapping - Value to float for computing metrics. The default - `value_to_float()` maps CORRECT ("C") to 1.0, - INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and - NOANSWER ("N") to 0, casts numeric values to - float directly, and prints a warning and returns - 0 if the Value is a complex object (list or dict). - - cluster (str | None): The key from the Sample metadata - corresponding to a cluster identifier for computing - [clustered standard errors](https://en.wikipedia.org/wiki/Clustered_standard_errors). + to_float: Function for mapping `Value` to float for computing + metrics. The default `value_to_float()` maps CORRECT ("C") to 1.0, + INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and NOANSWER ("N") to 0, + casts numeric values to float directly, and prints a warning and returns + 0 if the Value is a complex object (list or dict). 
+ cluster (str | None): The key from the Sample metadata + corresponding to a cluster identifier for computing + [clustered standard errors](https://en.wikipedia.org/wiki/Clustered_standard_errors). Returns: - stderr metric + stderr metric """ def clustered_metric(scores: list[SampleScore]) -> float: @@ -142,13 +139,12 @@ def std(to_float: ValueToFloat = value_to_float()) -> Metric: """Calculates the sample standard deviation of a list of scores. Args: - to_float (ValueToFloat): Function for mapping - Value to float for computing metrics. The default - `value_to_float()` maps CORRECT ("C") to 1.0, - INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and - NOANSWER ("N") to 0, casts numeric values to - float directly, and prints a warning and returns - 0 if the Value is a complex object (list or dict). + to_float: Function for mapping `Value` to float for computing + metrics. The default `value_to_float()` maps CORRECT ("C") to 1.0, + INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and NOANSWER ("N") to 0, + casts numeric values to float directly, and prints a warning and returns + 0 if the Value is a complex object (list or dict). + Returns: std metric diff --git a/src/inspect_ai/scorer/_model.py b/src/inspect_ai/scorer/_model.py index 2fa30cbde..c49aa0914 100644 --- a/src/inspect_ai/scorer/_model.py +++ b/src/inspect_ai/scorer/_model.py @@ -35,31 +35,31 @@ def model_graded_fact( """Score a question/answer task with a fact response using a model. Args: - template (str): Template for grading prompt. This template uses + template: Template for grading prompt. This template uses four variables: `question`, `criterion`, `answer`, and `instructions` (which is fed from the `instructions` parameter). Variables from sample `metadata` are also available in the template. - instructions (str): Grading instructions. This should + instructions: Grading instructions. This should include a prompt for the model to answer (e.g. with with chain of thought reasoning) in a way that matches the specified `grade_pattern`, for example, the default `grade_pattern` looks for one of GRADE: C, GRADE: P, or GRADE: I). - grade_pattern (str): Regex to extract the grade from the + grade_pattern: Regex to extract the grade from the model response. Defaults to looking for e.g. GRADE: C The regex should have a single capture group that extracts exactly the letter C, P, or I. - include_history (bool | Callable[[TaskState], str]): + include_history: Whether to include the full chat history in the presented question. Defaults to `False`, which presents only the original sample input. Optionally provide a function to customise how the chat history is presented. - partial_credit (bool): Whether to allow for "partial" credit for + partial_credit: Whether to allow for "partial" credit for answers (by default assigned a score of 0.5). Defaults to `False`. Note that this parameter is only used with the default `instructions` (as custom instructions provide their own prompts for grades). - model (list[str | Model] | str | Model | None): Model or Models to use for grading. If multiple models are passed, a majority vote of their grade will be returned. By default the model being evaluated is used. + model: Model or Models to use for grading. If multiple models are passed, a majority vote of their grade will be returned. By default the model being evaluated is used. 
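A sketch (not part of this patch) of the clustered standard error attached to a scorer; the `"category"` metadata key is an assumption for illustration.

```python
from inspect_ai.scorer import Score, Target, accuracy, scorer, stderr
from inspect_ai.solver import TaskState

@scorer(metrics=[accuracy(), stderr(cluster="category")])
def clustered_match():
    # simple containment check, reported with accuracy plus a standard
    # error clustered on the (assumed) "category" sample metadata key
    async def score(state: TaskState, target: Target) -> Score:
        answer = state.output.completion
        return Score(
            value="C" if target.text.strip() in answer else "I",
            answer=answer,
        )

    return score
```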
""" return model_graded_qa( template=template if template else DEFAULT_MODEL_GRADED_FACT_TEMPLATE, @@ -83,32 +83,32 @@ def model_graded_qa( """Score a question/answer task using a model. Args: - template (str): Template for grading prompt. This template has + template: Template for grading prompt. This template has four variables: - `question`, `criterion`, `answer`, and `instructions` (which is fed from the `instructions` parameter). Variables from sample `metadata` are also available in the template. - instructions (str): Grading instructions. This should + instructions: Grading instructions. This should include a prompt for the model to answer (e.g. with with chain of thought reasoning) in a way that matches the specified `grade_pattern`, for example, the default `grade_pattern` looks for one of GRADE: C, GRADE: P, or GRADE: I. - grade_pattern (str): Regex to extract the grade from the + grade_pattern: Regex to extract the grade from the model response. Defaults to looking for e.g. GRADE: C The regex should have a single capture group that extracts exactly the letter C, P, I. - include_history (bool | Callable[[TaskState], str]): + include_history: Whether to include the full chat history in the presented question. Defaults to `False`, which presents only the original sample input. Optionally provide a function to customise how the chat history is presented. - partial_credit (bool): Whether to allow for "partial" credit for + partial_credit: Whether to allow for "partial" credit for answers (by default assigned a score of 0.5). Defaults to `False`. Note that this parameter is only used with the default `instructions` (as custom instructions provide their own prompts for grades). - model (list[str | Model] | str | Model | None): Model or Models to use for grading. If multiple models are passed, a majority vote of their grade will be returned. By default the model being evaluated is used. + model: Model or Models to use for grading. If multiple models are passed, a majority vote of their grade will be returned. By default the model being evaluated is used. """ # bind variables get_scorer = partial( diff --git a/src/inspect_ai/scorer/_pattern.py b/src/inspect_ai/scorer/_pattern.py index 493170e0b..f8fe1e28d 100644 --- a/src/inspect_ai/scorer/_pattern.py +++ b/src/inspect_ai/scorer/_pattern.py @@ -55,11 +55,11 @@ def pattern(pattern: str, ignore_case: bool = True, match_all: bool = False) -> to match either one or all of the extracted groups Args: - pattern (str): Regular expression for extracting the + pattern: Regular expression for extracting the answer from model output. - ignore_case (bool): Ignore case when comparing + ignore_case: Ignore case when comparing the extract answer to the targets. (Default: True) - match_all (bool): With multiple captures, do all captured + match_all: With multiple captures, do all captured values need to match the target? (Default: False) """ diff --git a/src/inspect_ai/scorer/_reducer/reducer.py b/src/inspect_ai/scorer/_reducer/reducer.py index 4d95af08d..908e4a0e1 100644 --- a/src/inspect_ai/scorer/_reducer/reducer.py +++ b/src/inspect_ai/scorer/_reducer/reducer.py @@ -12,6 +12,8 @@ @score_reducer(name="mode") def mode_score() -> ScoreReducer: + r"""Take the mode from a list of scores.""" + def reduce(scores: list[Score]) -> Score: r"""A utility function for the most common score in a list of scores. 
@@ -36,12 +38,13 @@ def most_common( @score_reducer(name="mean") def mean_score(value_to_float: ValueToFloat = value_to_float()) -> ScoreReducer: - def reduce(scores: list[Score]) -> Score: - r"""A utility function for taking a mean value over a list of scores. + r"""Take the mean of a list of scores. - Args: - scores: a list of Scores. - """ + Args: + value_to_float: Function to convert the value to a float + """ + + def reduce(scores: list[Score]) -> Score: if isinstance(scores[0].value, dict): return _compute_dict_stat(scores, value_to_float, statistics.mean) elif isinstance(scores[0].value, list): @@ -54,12 +57,13 @@ def reduce(scores: list[Score]) -> Score: @score_reducer(name="median") def median_score(value_to_float: ValueToFloat = value_to_float()) -> ScoreReducer: - def reduce(scores: list[Score]) -> Score: - r"""A utility function for taking a median value over a list of scores. + r"""Take the median value from a list of scores. - Args: - scores: a list of Scores. - """ + Args: + value_to_float: Function to convert the value to a float + """ + + def reduce(scores: list[Score]) -> Score: if isinstance(scores[0].value, dict): return _compute_dict_stat(scores, value_to_float, statistics.median) elif isinstance(scores[0].value, list): @@ -74,13 +78,15 @@ def reduce(scores: list[Score]) -> Score: def at_least( k: int, value: float = 1.0, value_to_float: ValueToFloat = value_to_float() ) -> ScoreReducer: - def reduce(scores: list[Score]) -> Score: - r"""A utility function for scoring a value as correct if there are at least n score values greater than or equal to the value + r"""Score correct if there are at least k score values greater than or equal to the value. - Args: - scores: a list of Scores. - """ + Args: + k: Number of score values that must exceed `value`. + value: Score value threshold. + value_to_float: Function to convert score values to float. + """ + def reduce(scores: list[Score]) -> Score: def gte_n( counter: Counter[str | int | float | bool], ) -> str | int | float | bool: @@ -104,6 +110,14 @@ def gte_n( def pass_at( k: int, value: float = 1.0, value_to_float: ValueToFloat = value_to_float() ) -> ScoreReducer: + r"""Probability of at least 1 correct sample given `k` epochs (). + + Args: + k: Epochs to compute probability for. + value: Score value threshold. + value_to_float: Function to convert score values to float. + """ + def reduce(scores: list[Score]) -> Score: def pass_at_k(values: list[float]) -> float: total = len(scores) @@ -129,12 +143,13 @@ def pass_at_k(values: list[float]) -> float: @score_reducer(name="max") def max_score(value_to_float: ValueToFloat = value_to_float()) -> ScoreReducer: - def reduce(scores: list[Score]) -> Score: - r"""A utility function for taking the maximum value from a list of scores + r"""Take the maximum value from a list of scores. - Args: - scores: a list of Scores. - """ + Args: + value_to_float: Function to convert the value to a float + """ + + def reduce(scores: list[Score]) -> Score: if isinstance(scores[0].value, dict): dict_result: dict[str, str | int | float | bool | None] = {} keys = scores[0].value.keys() # type: ignore @@ -238,7 +253,7 @@ def _compute_dict_stat( Args: scores: a list of Scores. 
- value_to_float: function to convert the value to a float + value_to_float: Function to convert the value to a float statistic: the statistic to apply """ # Make sure these are all dictionaries be we proceed diff --git a/src/inspect_ai/scorer/_reducer/registry.py b/src/inspect_ai/scorer/_reducer/registry.py index 94e43ebdb..e98b7aff4 100644 --- a/src/inspect_ai/scorer/_reducer/registry.py +++ b/src/inspect_ai/scorer/_reducer/registry.py @@ -40,9 +40,9 @@ def score_reducer( """Decorator for registering Score Reducers. Args: - func (ScoreReducerType | None): Function returning `ScoreReducer` targeted by + func: Function returning `ScoreReducer` targeted by plain task decorator without attributes (e.g. `@score_reducer`) - name (str | None): Optional name for reducer. If the decorator has no name + name: Optional name for reducer. If the decorator has no name argument then the name of the function will be used to automatically assign a name. Returns: diff --git a/src/inspect_ai/scorer/_reducer/types.py b/src/inspect_ai/scorer/_reducer/types.py index 1b71572ec..02daf47be 100644 --- a/src/inspect_ai/scorer/_reducer/types.py +++ b/src/inspect_ai/scorer/_reducer/types.py @@ -5,7 +5,13 @@ @runtime_checkable class ScoreReducer(Protocol): - def __call__(self, scores: list[Score]) -> Score: ... + def __call__(self, scores: list[Score]) -> Score: + """Reduce a set of scores to a single score. + + Args: + scores: List of scores. + """ + ... @property def __name__(self) -> str: ... diff --git a/src/inspect_ai/scorer/_score.py b/src/inspect_ai/scorer/_score.py index f02ec7f66..0c1913158 100644 --- a/src/inspect_ai/scorer/_score.py +++ b/src/inspect_ai/scorer/_score.py @@ -23,6 +23,8 @@ async def score(state: TaskState) -> list[Score]: a task that does not have a scorer. """ + from inspect_ai.log._transcript import ScoreEvent, transcript + scorers = _scorers.get(None) target = _target.get(None) if scorers is None or target is None: @@ -30,7 +32,15 @@ async def score(state: TaskState) -> list[Score]: "The score() function can only be called while executing a task with a scorer." ) - return [await scorer(state, target) for scorer in scorers] + scores: list[Score] = [] + for scorer in scorers: + score = await scorer(state, target) + scores.append(score) + transcript()._event( + ScoreEvent(score=score, target=target.target, intermediate=True) + ) + + return scores def init_scoring_context(scorers: list[Scorer], target: Target) -> None: diff --git a/src/inspect_ai/scorer/_scorer.py b/src/inspect_ai/scorer/_scorer.py index 418bafc03..a3ab7a53f 100644 --- a/src/inspect_ai/scorer/_scorer.py +++ b/src/inspect_ai/scorer/_scorer.py @@ -26,21 +26,33 @@ @runtime_checkable class Scorer(Protocol): - r"""Score model outputs. - - Evaluate the passed outputs and targets and return a - dictionary with scoring outcomes and context. - - Args: - state (TaskState): Task state - target (Target): Ideal target for the output. - """ - async def __call__( self, state: TaskState, target: Target, - ) -> Score: ... + ) -> Score: + r"""Score model outputs. + + Evaluate the passed outputs and targets and return a + dictionary with scoring outcomes and context. + + Args: + state: Task state + target: Ideal target for the output. + + Examples: + ```python + @scorer + def custom_scorer() -> Scorer: + async def score(state: TaskState, target: Target) -> Score: + # Compare state / model output with target + # to yield a score + return Score(value=...) + + return score + ```` + """ + ... 
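The reducers above are typically selected by name via `Epochs`; a sketch (not part of this patch), with the dataset and task name assumed.

```python
from inspect_ai import Epochs, Task, task
from inspect_ai.dataset import json_dataset
from inspect_ai.scorer import includes
from inspect_ai.solver import generate

@task
def repeated_eval():  # hypothetical task, for illustration only
    return Task(
        dataset=json_dataset("samples.jsonl"),
        solver=generate(),
        scorer=includes(),
        # run 5 epochs per sample, reporting both the mean and pass@2
        epochs=Epochs(5, ["mean", "pass_at_2"]),
    )
```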
P = ParamSpec("P") @@ -90,17 +102,28 @@ def scorer( r"""Decorator for registering scorers. Args: - metrics (list[Metric] | dict[str, list[Metric]]): One or more metrics to calculate + metrics: One or more metrics to calculate over the scores. - name (str | None): - Optional name for scorer. If the decorator has no name + name: Optional name for scorer. If the decorator has no name argument then the name of the underlying ScorerType object will be used to automatically assign a name. - **metadata (dict[str,Any]): Additional values to serialize + **metadata: Additional values to serialize in metadata. Returns: Scorer with registry attributes. + + Examples: + ```python + @scorer + def custom_scorer() -> Scorer: + async def score(state: TaskState, target: Target) -> Score: + # Compare state / model output with target + # to yield a score + return Score(value=...) + + return score + ```` """ def wrapper(scorer_type: Callable[P, Scorer]) -> Callable[P, Scorer]: diff --git a/src/inspect_ai/solver/__init__.py b/src/inspect_ai/solver/__init__.py index 4a1546b07..22abec61d 100644 --- a/src/inspect_ai/solver/__init__.py +++ b/src/inspect_ai/solver/__init__.py @@ -1,7 +1,7 @@ from inspect_ai._util.deprecation import relocated_module_attribute from ._basic_agent import basic_agent -from ._bridge import bridge +from ._bridge.bridge import bridge from ._chain import chain from ._critique import self_critique from ._fork import fork diff --git a/src/inspect_ai/solver/_basic_agent.py b/src/inspect_ai/solver/_basic_agent.py index e6199b0e5..be7a9b46b 100644 --- a/src/inspect_ai/solver/_basic_agent.py +++ b/src/inspect_ai/solver/_basic_agent.py @@ -81,31 +81,28 @@ def basic_agent( alternate conversion scheme as required via `score_value`. Args: - init: (Solver | list[Solver] | None): Agent initialisation - (defaults to system_message with basic ReAct prompt) - tools (list[Tool | ToolDef] | Solver | None): Tools available for the agent. Either a - list of tools or a Solver that can yield dynamic tools per-sample. - cache: (bool | CachePolicy): Caching behaviour for generate responses - (defaults to no caching). - max_attempts (int): Maximum number of submissions to accept before terminating. - message_limit (int | None): Limit on messages in sample before terminating agent. + init: Agent initialisation (defaults to system_message with basic ReAct prompt) + tools: Tools available for the agent. Either a list of tools or a Solver that + can yield dynamic tools per-sample. + cache: Caching behaviour for generate responses (defaults to no caching). + max_attempts: Maximum number of submissions to accept before terminating. + message_limit: Limit on messages in sample before terminating agent. If not specified, will use limit_messages defined for the task. If there is none defined for the task, 50 will be used as a default. - token_limit (int | None): Limit on tokens used in sample before terminating agent. - max_tool_output (int | None): Maximum output length (in bytes). + token_limit: Limit on tokens used in sample before terminating agent. + max_tool_output: Maximum output length (in bytes). Defaults to max_tool_output from active GenerateConfig. - score_value (ValueToFloat): Function used to extract float from scores (defaults - to standard value_to_float()) - incorrect_message (str | Callable[[TaskState, list[Score]], str | Awaitable[str]]): - User message reply for an incorrect submission from the model. 
Alternatively, - a function which returns a message (function may optionally be async) - continue_message (str): User message to urge the model to continue when it - doesn't make a tool call. - submit_name (str): Name for tool used to make submissions - (defaults to 'submit') - submit_description (str): Description of submit tool (defaults to - 'Submit an answer for evaluation') - **kwargs (Any): Deprecated arguments for backward compatibility. + score_value: Function used to extract float from scores (defaults + to standard value_to_float()) + incorrect_message: User message reply for an incorrect submission from the model. + Alternatively, a function which returns a message (function may optionally be async) + continue_message: User message to urge the model to continue when it + doesn't make a tool call. + submit_name: Name for tool used to make submissions + (defaults to 'submit') + submit_description: Description of submit tool (defaults to + 'Submit an answer for evaluation') + **kwargs: Deprecated arguments for backward compatibility. Returns: Plan for agent. diff --git a/src/inspect_ai/solver/_bridge/__init__.py b/src/inspect_ai/solver/_bridge/__init__.py index aa1437636..e69de29bb 100644 --- a/src/inspect_ai/solver/_bridge/__init__.py +++ b/src/inspect_ai/solver/_bridge/__init__.py @@ -1,3 +0,0 @@ -from .bridge import bridge - -__all__ = ["bridge"] diff --git a/src/inspect_ai/solver/_bridge/bridge.py b/src/inspect_ai/solver/_bridge/bridge.py index 6de693784..88efa17f8 100644 --- a/src/inspect_ai/solver/_bridge/bridge.py +++ b/src/inspect_ai/solver/_bridge/bridge.py @@ -17,7 +17,7 @@ def bridge(agent: Callable[[dict[str, Any]], Awaitable[dict[str, Any]]]) -> Solver: """Bridge an external agent into an Inspect Solver. - See documentation at https://inspect.ai-safety-institute.org.uk/agent-bridge.html + See documentation at Args: agent: Callable which takes a sample `dict` and returns a result `dict`. @@ -63,11 +63,11 @@ async def solve(state: TaskState, generate: Generate) -> TaskState: else state.input ) - # create sample + # create sample (use standard gpt-4 message encoding -- i.e. no 'developer' messages) sample = BridgeSample( sample_id=str(state.sample_id), epoch=state.epoch, - input=await openai_chat_messages(input, state.model.name), + input=await openai_chat_messages(input, model="gpt-4"), metadata=state.metadata, target=list(state.target), ) diff --git a/src/inspect_ai/solver/_chain.py b/src/inspect_ai/solver/_chain.py index 5ee22aaaa..5a5a9eb50 100644 --- a/src/inspect_ai/solver/_chain.py +++ b/src/inspect_ai/solver/_chain.py @@ -15,8 +15,7 @@ def chain(*solvers: Solver | list[Solver]) -> Solver: early. Args: - solvers (*Solver | list[Solver]): One or more solvers - or lists of solvers to chain together. + *solvers: One or more solvers or lists of solvers to chain together. Returns: Solver that executes the passed solvers as a chain. diff --git a/src/inspect_ai/solver/_critique.py b/src/inspect_ai/solver/_critique.py index cde9125f6..c8d9861ac 100644 --- a/src/inspect_ai/solver/_critique.py +++ b/src/inspect_ai/solver/_critique.py @@ -25,15 +25,15 @@ def self_critique( need to use the model being evaluated). Args: - critique_template (str | None): String or path to file + critique_template: String or path to file containing critique template. The template uses two variables: `question` and `completion`. Variables from sample `metadata` are also available in the template. 
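A minimal `basic_agent()` configuration (not part of this patch) exercising the simplified arguments above; the tool set and limits are illustrative.

```python
from inspect_ai.solver import basic_agent, system_message
from inspect_ai.tool import bash, python

agent = basic_agent(
    init=system_message("Solve the task using the tools provided."),
    tools=[bash(timeout=180), python(timeout=180)],
    max_attempts=3,
    message_limit=50,
)
```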
- completion_template (str | None): String or path to file + completion_template: String or path to file containing completion template. The template uses three variables: `question`, `completion`, and `critique` - model (str | Model | None): Alternate model to be used + model: Alternate model to be used for critique (by default the model being evaluated is used). """ diff --git a/src/inspect_ai/solver/_fork.py b/src/inspect_ai/solver/_fork.py index 60ecad2a1..0b37d2fb0 100644 --- a/src/inspect_ai/solver/_fork.py +++ b/src/inspect_ai/solver/_fork.py @@ -32,8 +32,8 @@ async def fork( Store that doesn't affect the Store of other subtasks or the parent). Args: - state (TaskState): Beginning TaskState - solvers (Solver | list[Solver]): Solvers to apply on the TaskState. + state: Beginning TaskState + solvers: Solvers to apply on the TaskState. Each Solver will get a standalone copy of the TaskState. Returns: diff --git a/src/inspect_ai/solver/_human_agent/__init__.py b/src/inspect_ai/solver/_human_agent/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/inspect_ai/solver/_human_agent/agent.py b/src/inspect_ai/solver/_human_agent/agent.py index 58282dc8b..5bb42ad69 100644 --- a/src/inspect_ai/solver/_human_agent/agent.py +++ b/src/inspect_ai/solver/_human_agent/agent.py @@ -30,14 +30,11 @@ def human_agent( using a VS Code Window or Terminal. Args: - answer (bool | str): Is an explicit answer required for this - task or is it scored based on files in the container? Pass a - `str` with a regex to validate that the answer matches - the expected format. - intermediate_scoring (bool): Allow the human agent to - check their score while working. - record_session (bool): Record all user commands and outputs in - the sandbox bash session. + answer: Is an explicit answer required for this task or is it scored + based on files in the container? Pass a `str` with a regex to validate + that the answer matches the expected format. + intermediate_scoring: Allow the human agent to check their score while working. + record_session: Record all user commands and outputs in the sandbox bash session. Returns: Solver: Human agent solver. 
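A sketch (not part of this patch) of `human_agent()` with the options described above, inside a sandboxed task; the dataset, answer regex, and sandbox are assumptions.

```python
from inspect_ai import Task, task
from inspect_ai.dataset import json_dataset
from inspect_ai.scorer import includes
from inspect_ai.solver import human_agent

@task
def manual_baseline():  # hypothetical task, for illustration only
    return Task(
        dataset=json_dataset("challenges.jsonl"),
        solver=human_agent(
            answer=r"^flag\{.+\}$",     # require answers matching this regex
            intermediate_scoring=True,  # let the operator check their score
            record_session=True,        # record the sandbox bash session
        ),
        scorer=includes(),
        sandbox="docker",
    )
```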
diff --git a/src/inspect_ai/solver/_human_agent/commands/clock.py b/src/inspect_ai/solver/_human_agent/commands/clock.py index 309d05148..76a1d8480 100644 --- a/src/inspect_ai/solver/_human_agent/commands/clock.py +++ b/src/inspect_ai/solver/_human_agent/commands/clock.py @@ -27,14 +27,10 @@ def cli(self, args: Namespace) -> None: print(call_human_agent("start")) def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]: - from inspect_ai.log._transcript import transcript - async def start() -> str: if not state.running: state.running = True - transcript().info( - f"Task started (total time: {format_progress_time(state.time)})" - ) + clock_action_event("start", state) return render_status(state) return start @@ -57,14 +53,22 @@ def cli(self, args: Namespace) -> None: print(call_human_agent("stop")) def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]: - from inspect_ai.log._transcript import transcript - async def stop() -> str: if state.running: state.running = False - transcript().info( - f"Task stopped (total time: {format_progress_time(state.time)})" - ) + clock_action_event("stop", state) return render_status(state) return stop + + +def clock_action_event(action: str, state: HumanAgentState) -> None: + from inspect_ai.log._transcript import transcript + + transcript().info( + { + "action": action, + "total_time": format_progress_time(state.time, False), + }, + source="human_agent", + ) diff --git a/src/inspect_ai/solver/_human_agent/commands/note.py b/src/inspect_ai/solver/_human_agent/commands/note.py index 5049af6fd..ffe69beb6 100644 --- a/src/inspect_ai/solver/_human_agent/commands/note.py +++ b/src/inspect_ai/solver/_human_agent/commands/note.py @@ -37,6 +37,6 @@ def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]] from inspect_ai.log._transcript import transcript async def note(content: str) -> None: - transcript().info(content) + transcript().info(content, source="human_agent") return note diff --git a/src/inspect_ai/solver/_human_agent/commands/score.py b/src/inspect_ai/solver/_human_agent/commands/score.py index e61776b45..424144ede 100644 --- a/src/inspect_ai/solver/_human_agent/commands/score.py +++ b/src/inspect_ai/solver/_human_agent/commands/score.py @@ -1,6 +1,5 @@ from argparse import Namespace from copy import deepcopy -from textwrap import dedent from typing import Awaitable, Callable, Literal from pydantic import JsonValue @@ -51,8 +50,6 @@ def cli(self, args: Namespace) -> None: def service(self, state: HumanAgentState) -> Callable[..., Awaitable[JsonValue]]: async def score_task(answer: str | None) -> str: - from inspect_ai.log._transcript import transcript - # make a copy of TaskState, add the answer, then score if answer: task_state = deepcopy(self._state) @@ -64,14 +61,6 @@ async def score_task(answer: str | None) -> str: # record the scoring action in our state state.scorings.append(IntermediateScoring(time=state.time, scores=result)) - # record to transcript - transcript().info( - dedent(f""" - ### Intermediate Score - **Answer:** {result[0].answer}, **Score:** {result[0].as_str()} - """) - ) - # notify user return render_text( f"[bold]Answer:[/bold] {result[0].answer}, [bold]Score:[/bold] {result[0].as_str()}" diff --git a/src/inspect_ai/solver/_multiple_choice.py b/src/inspect_ai/solver/_multiple_choice.py index d4961af30..fa496131d 100644 --- a/src/inspect_ai/solver/_multiple_choice.py +++ b/src/inspect_ai/solver/_multiple_choice.py @@ -219,38 +219,35 @@ def multiple_choice( 
multiple_correct: bool = False, **kwargs: Unpack[DeprecatedArgs], ) -> Solver: - """Multiple choice question solver. - - Formats a multiple choice question prompt, then calls `generate()` - - ### Usage + """Multiple choice question solver. Formats a multiple choice question prompt, then calls `generate()`. Note that due to the way this solver works, it has some constraints: - 1. The `Sample` must have the `choices` attribute set. - 2. The only built-in compatible scorer is the `choice` scorer. - 3. It calls `generate()` internally, so you don't need to call it again - - ### Shuffling - - You can shuffle choices when you load your dataset by using the `shuffle_choices` method or parameter of the datasets API. + 1. The `Sample` must have the `choices` attribute set. + 2. The only built-in compatible scorer is the `choice` scorer. + 3. It calls `generate()` internally, so you don't need to call it again Args: - template (str | None): Template to use for the multiple choice question. + template: Template to use for the multiple choice question. The defaults vary based on the options and are taken from the `MultipleChoiceTemplate` enum. The template will have questions and possible answers substituted into it before being sent to the model. Consequently it requires three specific template variables: - - `{question}`: The question to be asked. - - `{choices}`: The choices available, which will be formatted as a + + - `{question}`: The question to be asked. + - `{choices}`: The choices available, which will be formatted as a list of A) ... B) ... etc. before sending to the model. - - `{letters}`: (optional) A string of letters representing the choices, e.g. + - `{letters}`: (optional) A string of letters representing the choices, e.g. "A,B,C". Used to be explicit to the model about the possible answers. - cot (bool): Default `False`. Whether the solver should perform chain-of-thought + cot: Default `False`. Whether the solver should perform chain-of-thought reasoning before answering. NOTE: this has no effect if you provide a custom template. - multiple_correct (bool): Default `False`. Whether to allow multiple + multiple_correct: Default `False`. Whether to allow multiple answers to the multiple choice question. For example, "What numbers are squares? A) 3, B) 4, C) 9" has multiple correct answers, B and C. Leave as `False` if there's exactly one correct answer from the choices available. NOTE: this has no effect if you provide a custom template. **kwargs (Any): Deprecated arguments for backward compatibility. + + #### Shuffling + + You can shuffle choices when you load your dataset by using the `shuffle_choices` method or parameter of the datasets API. """ shuffle: bool | Random = False if "shuffle" in kwargs: diff --git a/src/inspect_ai/solver/_prompt.py b/src/inspect_ai/solver/_prompt.py index ddf0c7aca..f041dee3d 100644 --- a/src/inspect_ai/solver/_prompt.py +++ b/src/inspect_ai/solver/_prompt.py @@ -20,8 +20,8 @@ def prompt_template(template: str, **params: Any) -> Solver: `params`. Args: - template: (str): Template for prompt. - **params (dict[str,Any]): Parameters to fill into the template. + template: Template for prompt. + **params: Parameters to fill into the template. Returns: A solver that uses the specified prompt template. @@ -51,8 +51,8 @@ def system_message(template: str, **params: Any) -> Solver: are none it will be inserted at the beginning of the conversation). Args: - template (str): Template for system message. 
- **params (dict[str,Any]): Parameters to fill into the template. + template: Template for system message. + **params: Parameters to fill into the template. Returns: A solver that inserts the parameterised system message. @@ -80,8 +80,8 @@ def user_message(template: str, **params: Any) -> Solver: included in the `params`. Args: - template (str): Template for user message. - **params (dict[str,Any]): Parameters to fill into the template. + template: Template for user message. + **params: Parameters to fill into the template. Returns: A solver that inserts the parameterised user message. @@ -109,7 +109,7 @@ def chain_of_thought(template: str = DEFAULT_COT_TEMPLATE) -> Solver: """Solver which modifies the user prompt to encourage chain of thought. Args: - template (str): String or path to file containing CoT template. + template: String or path to file containing CoT template. The template uses a single variable: `prompt`. """ diff --git a/src/inspect_ai/solver/_solver.py b/src/inspect_ai/solver/_solver.py index a0f7da7b2..076ca0c0b 100644 --- a/src/inspect_ai/solver/_solver.py +++ b/src/inspect_ai/solver/_solver.py @@ -29,36 +29,32 @@ @runtime_checkable class Generate(Protocol): - """Generate using the model and add the assistant message to the task state. - - Args: - state (TaskState): Beginning task state. - - tool_calls (Literal["loop", "single", "none"]): Resolve tool calls: - - `"loop"` resolves tools calls and then invokes `generate()`, - proceeding in a loop which terminates when there are no more - tool calls, or `message_limit` or `token_limit` is exceeded. - This is the default behavior. - - `"single"` resolves at most a single set of tool calls and then returns. - - `"none"` does not resolve tool calls at all (in this - case you will need to invoke `call_tools()` directly). - - cache: (bool | CachePolicy): - Caching behaviour for generate responses (defaults to no caching). - - **kwargs: Optional generation config arguments. - - Returns: - Updated TaskState. - """ - async def __call__( self, state: TaskState, tool_calls: Literal["loop", "single", "none"] = "loop", cache: bool | CachePolicy = False, **kwargs: Unpack[GenerateConfigArgs], - ) -> TaskState: ... + ) -> TaskState: + """Generate using the model and add the assistant message to the task state. + + Args: + state: Beginning task state. + tool_calls: + - `"loop"` resolves tools calls and then invokes `generate()`, + proceeding in a loop which terminates when there are no more + tool calls, or `message_limit` or `token_limit` is exceeded. + This is the default behavior. + - `"single"` resolves at most a single set of tool calls and then returns. + - `"none"` does not resolve tool calls at all (in this + case you will need to invoke `call_tools()` directly). + cache: Caching behaviour for generate responses (defaults to no caching). + **kwargs: Optional generation config arguments. + + Returns: + Updated TaskState. + """ + ... @dataclass(frozen=True) @@ -74,28 +70,37 @@ class SolverSpec: @runtime_checkable class Solver(Protocol): - r"""Contribute to solving an evaluation task. + async def __call__( + self, + state: TaskState, + generate: Generate, + ) -> TaskState: + r"""Contribute to solving an evaluation task. - Contribute to the solution of a task by transforming a TaskState - (e.g. prompt enhancement, elicitation, etc.). 
Solvers return a - TaskState (which could simply be a modified version of the one - they were passed) and optionally may call the generate() function - to generate output (and a new TaskState with that output). + Transform a `TaskState`, returning the new state. Solvers may + optionally call the `generate()` function to create a new + state resulting from model generation. Solvers may also do + prompt engineering or other types of elicitation. + Args: + state: State for tasks being evaluated. + generate: Function for generating outputs. - Args: - state (TaskState): States for tasks being evaluated. - generate (Generate): Function for generating outputs. + Returns: + Updated TaskState. - Returns: - Updated TaskState. - """ + Examples: + ```python + @solver + def prompt_cot(template: str) -> Solver: + def solve(state: TaskState, generate: Generate) -> TaskState: + # insert chain of thought prompt + return state - async def __call__( - self, - state: TaskState, - generate: Generate, - ) -> TaskState: ... + return solve + ``` + """ + ... P = ParamSpec("P") @@ -144,7 +149,7 @@ def solver( r"""Decorator for registering solvers. Args: - name: (str | Callable[P, Solver]): + name: Optional name for solver. If the decorator has no name argument then the name of the underlying Callable[P, Solver] object will be used to automatically assign a name. @@ -153,19 +158,15 @@ def solver( Solver with registry attributes. Examples: - @solver - def prompt_cot(state: TaskState, generate: Generate) -> None: - ... - - @solver(name = "prompt_cot") - def cot(state: TaskState, generate: Generate) -> None: - ... - + ```python @solver def prompt_cot(template: str) -> Solver: - def solve(state: TaskState, generate: Generate) -> None: - ... + def solve(state: TaskState, generate: Generate) -> TaskState: + # insert chain of thought prompt + return state + return solve + ``` """ # create_solver_wrapper: diff --git a/src/inspect_ai/solver/_task_state.py b/src/inspect_ai/solver/_task_state.py index 5082af313..b64f4cf52 100644 --- a/src/inspect_ai/solver/_task_state.py +++ b/src/inspect_ai/solver/_task_state.py @@ -31,17 +31,20 @@ class Choice: """ A `Choice` represents a single choice in a multiple choice question. - It is only relevant for the `multiple_choice` solver and corresponding `choice` scorer. + It is only relevant for the `multiple_choice` solver and corresponding + `choice` scorer. """ value: str """The original value of the choice from the `Sample`.""" correct: bool | None - """Did the model think this choice satisfies the question? `None` indicates this has not been set yet""" + """Did the model think this choice satisfies the question? `None` + indicates this has not been set yet""" original_position: int - """Choices may be re-ordered during processing, this represents the original position in the sample's list of choices""" + """Choices may be re-ordered during processing, this represents the + original position in the sample's list of choices""" class Choices(Sequence[Choice]): @@ -127,10 +130,10 @@ class TaskState: """ The `TaskState` represents the internal state of the `Task` being run for a single `Sample`. - It's a mutable object that is updated by each solver during a sample's - evaluation. It allows us to maintain things like the message history between - the running `Task` and the model, the tools available to the model, the - final output of the model and whether or not it's completed yet. + The `TaskState` is passed to and returned from each solver during a sample's + evaluation. 
It allows us to manipulated the message history, the tools + available to the model, the final output of the model, and whether the task + is completed or has hit a limit. """ def __init__( @@ -149,73 +152,39 @@ def __init__( metadata: dict[str, Any] = {}, ) -> None: self._model = model - """Model name used for this task.""" - - self.sample_id = sample_id - """Unique id for sample.""" - - self.epoch = epoch - """Epoch number for sample.""" - + self._sample_id = sample_id + self._epoch = epoch self._input = input - """ - The original input from the `Sample` for this `TaskState`. - - Should be treated as immutable and not changed during the run, so that - it can be referenced or checked wherever needed. Access through `input` - or `input_text` only - """ - - self.target = target - """The scoring target for this `Sample`.""" - - self.metadata = metadata - """Metadata from the `Sample` for this `TaskState`""" - + self._target = target + self._metadata = metadata self._messages: list[ChatMessage] = ChatMessageList(messages, self) - """ - Chat conversation history for sample. - - This will generally get appended to every time a `generate` call is made - to the model. Useful for both debug and for solvers/scorers to assess - model performance or choose the next step. - """ - self._tools: list[Tool] = [] - """Tools available to the model.""" - - self.tool_choice: ToolChoice | None = None - """Tool choice directive.""" - - self.output = output if output else ModelOutput(model=str(model), choices=[]) - """ - The 'final' model output once we've completed all solving. - - For simple evals this may just be the last `message` from the - conversation history, but more complex solvers may generate this in - different ways depending on what solvers are used.. - """ - + self._output = output if output else ModelOutput(model=str(model)) self._message_limit = message_limit self._token_limit = token_limit self._completed = completed - - """Store for shared data""" - self.store = Store() + self._store = Store() if choices: self.choices = Choices(choices) else: self.choices = Choices([]) - self.scores: dict[str, Score] | None = None - """Scores yielded by running task.""" - @property def model(self) -> ModelName: """Name of model being evaluated.""" return self._model + @property + def sample_id(self) -> int | str: + """Unique id for sample.""" + return self._sample_id + + @property + def epoch(self) -> int: + """Epoch number for sample.""" + return self._epoch + @property def input(self) -> str | list[ChatMessage]: """Input from the `Sample`, should be considered immutable.""" @@ -253,9 +222,6 @@ def user_prompt(self) -> ChatMessageUser: engineering solvers). This property enables easy read and write access to the user chat prompt. Raises an exception if there is no user prompt - - Returns: - First user `ChatMessage` in the task state. """ prompt = next((m for m in self.messages if m.role == "user"), None) if prompt: @@ -263,16 +229,63 @@ def user_prompt(self) -> ChatMessageUser: else: raise ValueError("user_prompt requested from TaskState but none available") + @property + def metadata(self) -> dict[str, Any]: + """Metadata from the `Sample` for this `TaskState`""" + return self._metadata + + @metadata.setter + def metadata(self, metadata: dict[str, Any]) -> None: + self._metadata = metadata + @property def messages(self) -> list[ChatMessage]: - """Messages in chat history""" + """ + Chat conversation history for sample. 
+ + This will generally get appended to every time a `generate` call is made + to the model. Useful for both debug and for solvers/scorers to assess + model performance or choose the next step. + """ return self._messages @messages.setter def messages(self, messages: list[ChatMessage]) -> None: - """Set messages in chat history.""" self._messages = ChatMessageList(messages, self) + @property + def output(self) -> ModelOutput: + """ + The 'final' model output once we've completed all solving. + + For simple evals this may just be the last `message` from the + conversation history, but more complex solvers may set this directly. + """ + return self._output + + @output.setter + def output(self, output: ModelOutput) -> None: + self._output = output + + @property + def store(self) -> Store: + """Store for shared data""" + return self._store + + @property + def tools(self) -> list[Tool]: + """Tools available to the model.""" + return self._tools + + @tools.setter + def tools(self, tools: list[Tool | ToolDef]) -> None: + self._tools.clear() + for tool in tools: + self._tools.append(tool if isinstance(tool, Tool) else tool.as_tool()) + + tool_choice: ToolChoice | None = None + """Tool choice directive.""" + @property def max_messages(self) -> int | None: """Deprecated (use message_limit).""" @@ -351,14 +364,12 @@ def completed(self, completed: bool) -> None: self._completed = completed @property - def tools(self) -> list[Tool]: - return self._tools + def target(self) -> Target: + """The scoring target for this `Sample`.""" + return self._target - @tools.setter - def tools(self, tools: list[Tool | ToolDef]) -> None: - self._tools.clear() - for tool in tools: - self._tools.append(tool if isinstance(tool, Tool) else tool.as_tool()) + scores: dict[str, Score] | None = None + """Scores yielded by running task.""" def metadata_as(self, metadata_cls: Type[MT]) -> MT: """Pydantic model interface to metadata. diff --git a/src/inspect_ai/solver/_use_tools.py b/src/inspect_ai/solver/_use_tools.py index 8dfa95e91..d89483317 100644 --- a/src/inspect_ai/solver/_use_tools.py +++ b/src/inspect_ai/solver/_use_tools.py @@ -15,15 +15,15 @@ def use_tools( Inject tools into the task state to be used in generate(). Args: - *tools (Tool | list[Tool]): One or more tools or lists of tools - to make available to the model. If no tools are passed, then - no change to the currently available set of `tools` is made. - tool_choice (ToolChoice | None): Directive indicating which - tools the model should use. If `None` is passed, then no - change to `tool_choice` is made. - append (bool): If `True`, then the passed-in tools are appended - to the existing tools; otherwise any existing tools are - replaced (the default) + *tools: One or more tools or lists of tools + to make available to the model. If no tools are passed, then + no change to the currently available set of `tools` is made. + tool_choice: Directive indicating which + tools the model should use. If `None` is passed, then no + change to `tool_choice` is made. + append: If `True`, then the passed-in tools are appended + to the existing tools; otherwise any existing tools are + replaced (the default) Returns: A solver that injects the tools and tool_choice into the task state. 
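A usage sketch for `use_tools()` as documented above (the specific tools, timeouts, and forced `tool_choice` are illustrative assumptions). Passing `append=True` would add to previously injected tools rather than replacing them:

```python
from inspect_ai.solver import generate, use_tools
from inspect_ai.tool import ToolFunction, bash, python

# make bash and python available and direct the model to call bash first
solvers = [
    use_tools(
        bash(timeout=120),
        python(timeout=120),
        tool_choice=ToolFunction(name="bash"),
    ),
    generate(),
]
```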
diff --git a/src/inspect_ai/tool/__init__.py b/src/inspect_ai/tool/__init__.py index bdd16213b..a11ebfa42 100644 --- a/src/inspect_ai/tool/__init__.py +++ b/src/inspect_ai/tool/__init__.py @@ -19,7 +19,7 @@ from ._tool_choice import ToolChoice, ToolFunction from ._tool_def import ToolDef from ._tool_info import ToolInfo -from ._tool_params import ToolParam, ToolParams +from ._tool_params import JSONType, ToolParam, ToolParams from ._tool_with import tool_with from ._tools._computer import computer from ._tools._execute import bash, python @@ -54,6 +54,7 @@ "ToolInfo", "ToolParam", "ToolParams", + "JSONType", ] _UTIL_MODULE_VERSION = "0.3.19" diff --git a/src/inspect_ai/tool/_tool.py b/src/inspect_ai/tool/_tool.py index be5f4282d..18c1b7414 100644 --- a/src/inspect_ai/tool/_tool.py +++ b/src/inspect_ai/tool/_tool.py @@ -40,10 +40,25 @@ | ContentVideo | list[ContentText | ContentImage | ContentAudio | ContentVideo] ) +"""Valid types for results from tool calls.""" class ToolError(Exception): + """Exception thrown from tool call. + + If you throw a `ToolError` form within a tool call, + the error will be reported to the model for further + processing (rather than ending the sample). If you want + to raise a fatal error from a tool call use an appropriate + standard exception type (e.g. `RuntimeError`, `ValueError`, etc.) + """ + def __init__(self, message: str) -> None: + """Create a ToolError. + + Args: + message: Error message to report to the model. + """ super().__init__(message) self.message = message @@ -68,11 +83,21 @@ async def __call__( r"""Additional tool that an agent can use to solve a task. Args: - *args (Any): Arguments for the tool. - **kwargs (Any): Keyword arguments for the tool. + *args: Arguments for the tool. + **kwargs: Keyword arguments for the tool. Returns: Result of tool call. + + Examples: + ```python + @tool + def add() -> Tool: + async def execute(x: int, y: int) -> int: + return x + y + + return execute + ``` """ ... @@ -130,25 +155,29 @@ def tool( r"""Decorator for registering tools. Args: - func (ToolType | None): Tool function - name (str | None): - Optional name for tool. If the decorator has no name + func: Tool function + name: Optional name for tool. If the decorator has no name argument then the name of the tool creation function will be used as the name of the tool. - viewer (ToolCallViewer | None): Provide a custom view - of tool call and context. - model_input (ToolCallModelInput | None): Provide a custom - function for playing back tool results as model input. - parallel (bool): - Does this tool support parallel execution? - (defaults to True). - prompt (str): - Deprecated (provide all descriptive information about + viewer: Provide a custom view of tool call and context. + model_input: Provide a custom function for playing back tool results as model input. + parallel: Does this tool support parallel execution? (defaults to `True`). + prompt: Deprecated (provide all descriptive information about the tool within the tool function's doc comment) Returns: Tool with registry attributes. 
+ + Examples: + ```python + @tool + def add() -> Tool: + async def execute(x: int, y: int) -> int: + return x + y + + return execute + ``` """ if prompt: from inspect_ai._util.logger import warn_once diff --git a/src/inspect_ai/tool/_tool_call.py b/src/inspect_ai/tool/_tool_call.py index c4c636391..ffd947d93 100644 --- a/src/inspect_ai/tool/_tool_call.py +++ b/src/inspect_ai/tool/_tool_call.py @@ -13,10 +13,10 @@ class ToolCallContent(BaseModel): """Optional (plain text) title for tool call content.""" format: Literal["text", "markdown"] - """Format.""" + """Format (text or markdown).""" content: str - """Content.""" + """Text or markdown content.""" class ToolCallView(BaseModel): @@ -56,6 +56,8 @@ class ToolCall: @dataclass class ToolCallError: + """Error raised by a tool call.""" + type: Literal[ "parsing", "timeout", @@ -67,8 +69,10 @@ class ToolCallError: "approval", "unknown", ] + """Error type.""" message: str + """Error message.""" ToolCallViewer = Callable[[ToolCall], ToolCallView] diff --git a/src/inspect_ai/tool/_tool_choice.py b/src/inspect_ai/tool/_tool_choice.py index 976d39528..f5c19d258 100644 --- a/src/inspect_ai/tool/_tool_choice.py +++ b/src/inspect_ai/tool/_tool_choice.py @@ -4,8 +4,10 @@ @dataclass class ToolFunction: + """Indicate that a specific tool function should be called.""" + name: str - """The name of the function to call.""" + """The name of the tool function to call.""" ToolChoice = Union[Literal["auto", "any", "none"], ToolFunction] diff --git a/src/inspect_ai/tool/_tool_def.py b/src/inspect_ai/tool/_tool_def.py index 7d5d727ab..ec3fe01c8 100644 --- a/src/inspect_ai/tool/_tool_def.py +++ b/src/inspect_ai/tool/_tool_def.py @@ -25,6 +25,8 @@ class ToolDef: + """Tool definition.""" + def __init__( self, tool: Callable[..., Any], @@ -35,19 +37,19 @@ def __init__( viewer: ToolCallViewer | None = None, model_input: ToolCallModelInput | None = None, ) -> None: - """Tool definition. + """Create a tool definition. Args: - tool (Callable[..., Any]): Callable to execute tool. - name (str | None): Name of tool. Discovered automatically if not specified. - description (str | None): Description of tool. Discovered automatically + tool: Callable to execute tool. + name: Name of tool. Discovered automatically if not specified. + description: Description of tool. Discovered automatically by parsing doc comments if not specified. - parameters (dict[str,str] | ToolParams | None): Tool parameter descriptions and types. + parameters: Tool parameter descriptions and types. Discovered automatically by parsing doc comments if not specified. - parallel (bool | None): Does the tool support parallel execution + parallel: Does the tool support parallel execution (defaults to True if not specified) - viewer (ToolCallViewer | None): Optional tool call viewer implementation. - model_input (ToolCallModelInput | None): Optional function that determines how + viewer: Optional tool call viewer implementation. + model_input: Optional function that determines how tool call results are played back as model input. 
Returns: diff --git a/src/inspect_ai/tool/_tool_params.py b/src/inspect_ai/tool/_tool_params.py index e1d80771c..f4c44508a 100644 --- a/src/inspect_ai/tool/_tool_params.py +++ b/src/inspect_ai/tool/_tool_params.py @@ -14,20 +14,44 @@ class ToolParam(BaseModel): """Description of tool parameter in JSON Schema format.""" type: JSONType | None = Field(default=None) + """JSON type of tool parameter.""" + description: str | None = Field(default=None) + """Parameter description.""" + default: Any = Field(default=None) + """Default value for parameter.""" + enum: list[Any] | None = Field(default=None) + """Valid values for enum parameters.""" + items: Optional["ToolParam"] = Field(default=None) + """Valid type for array parameters.""" + properties: dict[str, "ToolParam"] | None = Field(default=None) + """Valid fields for object parametrs.""" + additionalProperties: Optional["ToolParam"] | bool | None = Field(default=None) + """Are additional properties allowed?""" + anyOf: list["ToolParam"] | None = Field(default=None) + """Valid types for union parameters.""" + required: list[str] | None = Field(default=None) + """Required fields for object parameters.""" class ToolParams(BaseModel): """Description of tool parameters object in JSON Schema format.""" type: Literal["object"] = Field(default="object") + """Params type (always 'object')""" + properties: dict[str, ToolParam] = Field(default_factory=dict) + """Tool function parameters.""" + required: list[str] = Field(default_factory=list) + """List of required fields.""" + additionalProperties: bool = Field(default=False) + """Are additional object properties allowed? (always `False`)""" diff --git a/src/inspect_ai/tool/_tool_with.py b/src/inspect_ai/tool/_tool_with.py index e31c79200..155ff5277 100644 --- a/src/inspect_ai/tool/_tool_with.py +++ b/src/inspect_ai/tool/_tool_with.py @@ -25,14 +25,14 @@ def tool_with( """Tool with modifications to name and descriptions. Args: - tool (Tool): Tool instance to copy and add descriptions to. - name (str | None): Tool name (optional). - description (str | None): Tool description (optional). - parameters (dict[str,str] | None): Parameter descriptions (optional) - parallel (bool | None): Does the tool support parallel execution + tool: Tool instance to copy and add descriptions to. + name: Tool name (optional). + description: Tool description (optional). + parameters: Parameter descriptions (optional) + parallel: Does the tool support parallel execution (defaults to True if not specified) - viewer (ToolCallViewer | None): Optional tool call viewer implementation. - model_input (ToolCallModelInput | None): Optional function that determines how + viewer: Optional tool call viewer implementation. + model_input: Optional function that determines how tool call results are played back as model input. Returns: diff --git a/src/inspect_ai/tool/_tools/__init__.py b/src/inspect_ai/tool/_tools/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/inspect_ai/tool/_tools/_computer/_computer.py b/src/inspect_ai/tool/_tools/_computer/_computer.py index c8940d028..0b947d4bd 100644 --- a/src/inspect_ai/tool/_tools/_computer/_computer.py +++ b/src/inspect_ai/tool/_tools/_computer/_computer.py @@ -13,6 +13,17 @@ @tool def computer(max_screenshots: int | None = 1, timeout: int | None = 180) -> Tool: + """Desktop computer tool. + + See documentation at . + + Args: + max_screenshots: The maximum number of screenshots to play + back to the model as input. Defaults to 1 (set to `None` to have no limit). 
+        timeout: Timeout in seconds for computer tool actions.
+          Defaults to 180 (set to `None` for no timeout).
+    """
+
     async def execute(
         action: Action,
         text: str | None = None,
diff --git a/src/inspect_ai/tool/_tools/_execute.py b/src/inspect_ai/tool/_tools/_execute.py
index 6bb0fac95..89f082878 100644
--- a/src/inspect_ai/tool/_tools/_execute.py
+++ b/src/inspect_ai/tool/_tools/_execute.py
@@ -1,4 +1,4 @@
-from inspect_ai.util import sandbox
+from inspect_ai.util import sandbox as sandbox_env
 
 from .._tool import Tool, tool
 from .._tool_call import ToolCall, ToolCallContent, ToolCallView, ToolCallViewer
@@ -20,14 +20,17 @@ def viewer(tool_call: ToolCall) -> ToolCallView:
 
 
 @tool(viewer=code_viewer("bash", "cmd"))
-def bash(timeout: int | None = None, user: str | None = None) -> Tool:
+def bash(
+    timeout: int | None = None, user: str | None = None, sandbox: str | None = None
+) -> Tool:
     """Bash shell command execution tool.
 
     Execute bash shell commands using a sandbox environment (e.g. "docker").
 
     Args:
-      timeout (int | None): Timeout (in seconds) for command.
-      user (str | None): User to execute commands as.
+      timeout: Timeout (in seconds) for command.
+      user: User to execute commands as.
+      sandbox: Optional sandbox environment name.
 
     Returns:
       String with command output (stdout) or command error (stderr).
@@ -44,7 +47,7 @@ async def execute(cmd: str) -> str:
           The output of the command.
         """
         # execute the command
-        result = await sandbox().exec(
+        result = await sandbox_env(sandbox).exec(
             cmd=["bash", "--login", "-c", cmd], timeout=timeout, user=user
         )
         # return output (including stderr if any)
@@ -57,14 +60,17 @@ async def execute(cmd: str) -> str:
 
 
 @tool(viewer=code_viewer("python", "code"))
-def python(timeout: int | None = None, user: str | None = None) -> Tool:
+def python(
+    timeout: int | None = None, user: str | None = None, sandbox: str | None = None
+) -> Tool:
     """Python code execution tool.
 
     Execute Python code using a sandbox environment (e.g. "docker").
 
     Args:
-      timeout (int | None): Timeout (in seconds) for command.
-      user (str | None): User to execute commands as.
+      timeout: Timeout (in seconds) for command.
+      user: User to execute commands as.
+      sandbox: Optional sandbox environment name.
 
     Returns:
       String with command output (stdout) or command error (stderr).
@@ -89,7 +95,7 @@ async def execute(code: str) -> str:
         Returns:
          The output of the Python code.
        """
-        result = await sandbox().exec(
+        result = await sandbox_env(sandbox).exec(
            cmd=["python3"], input=code, timeout=timeout, user=user
        )
        # return output (including stderr if any)
diff --git a/src/inspect_ai/tool/_tools/_web_browser/_web_browser.py b/src/inspect_ai/tool/_tools/_web_browser/_web_browser.py
index f207eed08..5dd239631 100644
--- a/src/inspect_ai/tool/_tools/_web_browser/_web_browser.py
+++ b/src/inspect_ai/tool/_tools/_web_browser/_web_browser.py
@@ -16,10 +16,12 @@
 def web_browser(interactive: bool = True) -> list[Tool]:
     """Tools used for web browser navigation.
 
+    See documentation at .
+
     Args:
-      interactive (bool): Provide interactive tools (enable
-        clicking, typing, and submitting forms). Defaults
-        to True.
+      interactive: Provide interactive tools (enable
+        clicking, typing, and submitting forms). Defaults
+        to True.
 
     Returns:
       List of tools used for web browser navigation.
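A short sketch of the new `sandbox` argument on the execution tools. The service name "victim" is an assumption: it would need to be defined alongside the default service in the task's compose file.

```python
from inspect_ai.tool import bash, python

# bash executes in the task's default sandbox; python is routed to a
# named secondary sandbox service ("victim" is an illustrative name)
tools = [
    bash(timeout=180),
    python(timeout=180, sandbox="victim"),
]
```

These tools can then be injected with `use_tools()` as usual.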
diff --git a/src/inspect_ai/tool/_tools/_web_search.py b/src/inspect_ai/tool/_tools/_web_search.py index 9ba1a3fed..898536752 100644 --- a/src/inspect_ai/tool/_tools/_web_search.py +++ b/src/inspect_ai/tool/_tools/_web_search.py @@ -41,14 +41,16 @@ def web_search( A web search is conducted using the specified provider, the results are parsed for relevance using the specified model, and the top 'num_results' relevant pages are returned. + See further documentation at . + Args: - provider (Literal["google"]): Search provider (defaults to "google", currently + provider: Search provider (defaults to "google", currently the only provider). Possible future providers include "brave" and "bing". - num_results (int): Number of web search result pages to return to the model. - max_provider_calls (int): Maximum number of search calls to make to the search provider. - max_connections (int): Maximum number of concurrent connections to API + num_results: Number of web search result pages to return to the model. + max_provider_calls: Maximum number of search calls to make to the search provider. + max_connections: Maximum number of concurrent connections to API endpoint of search provider. - model (str | Model): Model used to parse web pages for relevance. + model: Model used to parse web pages for relevance. Returns: A tool that can be registered for use by models to search the web. diff --git a/src/inspect_ai/util/_concurrency.py b/src/inspect_ai/util/_concurrency.py index 6d3eb5915..9cc6c7ac7 100644 --- a/src/inspect_ai/util/_concurrency.py +++ b/src/inspect_ai/util/_concurrency.py @@ -23,12 +23,12 @@ def concurrency( for launching subprocesses is handled via the `subprocess` function. Args: - name (str): Name for concurrency context. This serves as the + name: Name for concurrency context. This serves as the display name for the context, and also the unique context key (if the `key` parameter is omitted) - concurrency (int): Maximum number of coroutines that can + concurrency: Maximum number of coroutines that can enter the context. - key (str | None): Unique context key for this context. Optional. + key: Unique context key for this context. Optional. Used if the unique key isn't human readable -- e.g. includes api tokens or account ids so that the more readable `name` can be presented to users e.g in console UI> diff --git a/src/inspect_ai/util/_panel.py b/src/inspect_ai/util/_panel.py index 989c58674..59dc70ca4 100644 --- a/src/inspect_ai/util/_panel.py +++ b/src/inspect_ai/util/_panel.py @@ -5,6 +5,8 @@ class InputPanel(Container): + """Base class for for Inspect input panels.""" + DEFAULT_TITLE = "Panel" DEFAULT_CLASSES = "task-input-panel" diff --git a/src/inspect_ai/util/_resource.py b/src/inspect_ai/util/_resource.py index 2d1b44095..480fc79f8 100644 --- a/src/inspect_ai/util/_resource.py +++ b/src/inspect_ai/util/_resource.py @@ -33,18 +33,18 @@ def resource( `resource("templates/prompt.txt", type="file")` Args: - resource (str): Path to local or remote (e.g. s3://) - resource, or for `type="auto"` (the default), - a string containing the literal resource value. - type (Literal["auto", "file"]): For "auto" (the default), - interpret the resource as a literal string if its not - a valid path. For "file", always interpret it as - a file path. - fs_options (dict[str, Any]): Optional. Additional - arguments to pass through to the `fsspec` filesystem - provider (e.g. `S3FileSystem`). Use `{"anon": True }` - if you are accessing a public S3 bucket with no - credentials. 
+        resource: Path to local or remote (e.g. s3://)
+            resource, or for `type="auto"` (the default),
+            a string containing the literal resource value.
+        type: For "auto" (the default),
+            interpret the resource as a literal string if it's not
+            a valid path. For "file", always interpret it as
+            a file path.
+        fs_options: Optional. Additional
+            arguments to pass through to the `fsspec` filesystem
+            provider (e.g. `S3FileSystem`). Use `{"anon": True }`
+            if you are accessing a public S3 bucket with no
+            credentials.
 
     Returns:
       Text content of resource.
diff --git a/src/inspect_ai/util/_sandbox/docker/compose.py b/src/inspect_ai/util/_sandbox/docker/compose.py
index 612d3ac45..e4ec35d2e 100644
--- a/src/inspect_ai/util/_sandbox/docker/compose.py
+++ b/src/inspect_ai/util/_sandbox/docker/compose.py
@@ -3,12 +3,13 @@
 import shlex
 from logging import getLogger
 from pathlib import Path
-from typing import Any, Literal, TypedDict, cast
+from typing import Any, Literal, cast
 
 import yaml
 from pydantic import BaseModel
 
 from inspect_ai._util.error import PrerequisiteError
+from inspect_ai._util.trace import trace_message
 from inspect_ai.util._display import display_type
 from inspect_ai.util._subprocess import ExecResult, subprocess
 
@@ -16,26 +17,37 @@
     DOCKER_COMPOSE_REQUIRED_VERSION_PULL_POLICY,
     validate_docker_compose,
 )
+from .service import ComposeService, services_healthcheck_time
 from .util import ComposeProject, is_inspect_project
 
 logger = getLogger(__name__)
 
 # How long to wait for compose environment to pass a health check
-COMPOSE_WAIT = "120"
+COMPOSE_WAIT = 120
 
 
-async def compose_up(project: ComposeProject) -> None:
+async def compose_up(
+    project: ComposeProject, services: dict[str, ComposeService]
+) -> None:
+    # compute the maximum amount of time we will wait
+    up_command = ["up", "--detach", "--wait"]
+
+    # are there healthchecks in the service definitions? if so then peg our timeout
+    # at the maximum total wait time. otherwise, pick a reasonable default
+    healthcheck_time = services_healthcheck_time(services)
+    if healthcheck_time > 0:
+        timeout: int = healthcheck_time
+        trace_message(logger, "Docker", f"Docker services healthcheck timeout: {timeout}")
+    else:
+        timeout = COMPOSE_WAIT
+    up_command.extend(["--wait-timeout", str(timeout)])
+
     # Start the environment. Note that we don't check the result because docker will
     # return a non-zero exit code for services that exit (even successfully) when
     # passing the --wait flag (see https://github.com/docker/compose/issues/10596).
     # In practice, we will catch any errors when calling compose_check_running()
     # immediately after we call compose_up().
- await compose_command( - ["up", "--detach", "--wait", "--wait-timeout", COMPOSE_WAIT], - project=project, - # wait up to 5 minutes for container to go up (compose wait + 3 minutes) - timeout=300, - ) + await compose_command(up_command, project=project, timeout=timeout) async def compose_down(project: ComposeProject, quiet: bool = True) -> None: @@ -191,17 +203,6 @@ async def compose_exec( ) -ComposeService = TypedDict( - "ComposeService", - { - "image": str | None, - "build": str | None, - "x-default": bool | None, - "x-local": bool | None, - }, -) - - async def compose_services(project: ComposeProject) -> dict[str, ComposeService]: result = await compose_command(["config"], project=project, timeout=60) if not result.success: diff --git a/src/inspect_ai/util/_sandbox/docker/docker.py b/src/inspect_ai/util/_sandbox/docker/docker.py index 85f434f47..2292e79c9 100644 --- a/src/inspect_ai/util/_sandbox/docker/docker.py +++ b/src/inspect_ai/util/_sandbox/docker/docker.py @@ -9,6 +9,7 @@ from typing_extensions import override +from inspect_ai._util.error import PrerequisiteError from inspect_ai.util._subprocess import ExecResult, subprocess from ..environment import ( @@ -85,6 +86,14 @@ async def task_init( services = await compose_services(project) for name, service in services.items(): + # if the service has an explicit container_name then + # error (as this won't work w/ epochs > 1) + container_name = service.get("container_name", None) + if container_name: + raise PrerequisiteError( + f"ERROR: Docker service '{name}' includes an explicitly configured container_name ('{container_name}'). This is not permitted, as container names should be provisioned by Docker compose and an explicit container_name will not work with epochs > 1." + ) + # build internal images image = service.get("image", None) if image and is_internal_image(image): @@ -139,7 +148,7 @@ async def sample_init( services = await compose_services(project) # start the services - await compose_up(project) + await compose_up(project, services) # check to ensure that the services are running running_services = await compose_check_running( diff --git a/src/inspect_ai/util/_sandbox/docker/service.py b/src/inspect_ai/util/_sandbox/docker/service.py new file mode 100644 index 000000000..99c5a2153 --- /dev/null +++ b/src/inspect_ai/util/_sandbox/docker/service.py @@ -0,0 +1,100 @@ +import re +from dataclasses import dataclass +from typing import TypedDict + + +class ComposeServiceHealthcheck(TypedDict, total=False): + start_period: str + interval: str + retries: int + timeout: str + + +ComposeService = TypedDict( + "ComposeService", + { + "image": str, + "build": str, + "container_name": str, + "x-default": bool, + "x-local": bool, + "healthcheck": ComposeServiceHealthcheck, + }, + total=False, +) + + +def services_healthcheck_time(services: dict[str, ComposeService]) -> int: + max_time = 0 + + for _, service in services.items(): + service_time = service_healthcheck_time(service) + max_time = max(max_time, service_time) + + return max_time + + +def service_healthcheck_time(service: ComposeService) -> int: + """ + Calculate the maximum time a single service's healthcheck could take. 
+ + The total time is: + (retries * (interval + timeout)) + + Default values (from Docker documentation): + - retries: 3 + - interval: 30s + - timeout: 30s + """ + healthcheck = service.get("healthcheck", None) + if healthcheck is None: + return 0 + + # Parse duration strings with defaults + retries = healthcheck.get("retries", 3) + interval = parse_duration(healthcheck.get("interval", "30s")) + timeout = parse_duration(healthcheck.get("timeout", "30s")) + + # Calculate total time in seconds + total_time = retries * (interval.seconds + timeout.seconds) + + return int(total_time) + + +@dataclass +class Duration: + nanoseconds: int + + @property + def seconds(self) -> float: + return self.nanoseconds / 1_000_000_000 + + +def parse_duration(duration_str: str) -> Duration: + """Parse a Docker compose style duration string.""" + if not duration_str: + return Duration(0) + + units = { + "ns": 1, + "us": 1_000, + "ms": 1_000_000, + "s": 1_000_000_000, + "m": 60_000_000_000, + "h": 3_600_000_000_000, + } + + duration_str = "".join(duration_str.split()) + pattern = re.compile(r"(\d+)([a-z]+)") + matches = pattern.findall(duration_str) + + if not matches: + raise ValueError(f"Invalid duration format: {duration_str}") + + total_nanoseconds = 0 + for number, unit in matches: + if unit not in units: + raise ValueError(f"Invalid unit: {unit}") + total_nanoseconds += int(number) * units[unit] + + return Duration(total_nanoseconds) diff --git a/src/inspect_ai/util/_sandbox/environment.py b/src/inspect_ai/util/_sandbox/environment.py index 749f054e3..e0442f327 100644 --- a/src/inspect_ai/util/_sandbox/environment.py +++ b/src/inspect_ai/util/_sandbox/environment.py @@ -65,91 +65,6 @@ class SandboxEnvironment(abc.ABC): filesystem context to copy samples files into and resolve relative paths to. """ - @classmethod - def config_files(cls) -> list[str]: - """Standard config files for this provider (used for automatic discovery)""" - return [] - - @classmethod - def default_concurrency(cls) -> int | None: - """Default max_sandboxes for this provider (`None` means no maximum)""" - return None - - @classmethod - async def task_init( - cls, task_name: str, config: SandboxEnvironmentConfigType | None - ) -> None: - """Called at task startup initialize resources. - - Args: - task_name (str): Name of task using the sandbox environment. - config (SandboxEnvironmentConfigType): Implementation defined configuration (optional). - """ - pass - - @classmethod - async def sample_init( - cls, - task_name: str, - config: SandboxEnvironmentConfigType | None, - metadata: dict[str, str], - ) -> dict[str, "SandboxEnvironment"]: - """Initialize sandbox environments for a sample. - - Args: - task_name (str): Name of task using the sandbox environment. - config (SandboxEnvironmentConfigType): Implementation defined configuration (optional). - metadata (dict[str,str]): Sample `metadata` field - - Returns: - Dictionary of named sandbox environments. The environment which represents - the default environment (resolved by `sandbox("default")` or `sandbox()`) must - be the first key/value pair in the dictionary. - """ - return {} - - @classmethod - @abc.abstractmethod - async def sample_cleanup( - cls, - task_name: str, - config: SandboxEnvironmentConfigType | None, - environments: dict[str, "SandboxEnvironment"], - interrupted: bool, - ) -> None: - """Cleanup sandbox environments. - - Args: - task_name (str): Name of task using the sandbox environment. 
- config (SandboxEnvironmentConfigType): Implementation defined configuration (optional). - environments (dict[str,SandboxEnvironment]): Sandbox environments created for this sample. - interrupted (bool): Was the task interrupted by an error or cancellation - """ - ... - - @classmethod - async def task_cleanup( - cls, task_name: str, config: SandboxEnvironmentConfigType | None, cleanup: bool - ) -> None: - """Called at task exit as a last chance to cleanup resources. - - Args: - task_name (str): Name of task using the sandbox environment. - config (SandboxEnvironmentConfigType): Implementation defined configuration (optional). - cleanup (bool): Whether to actually cleanup environment resources - (False if `--no-sandbox-cleanup` was specified) - """ - pass - - @classmethod - async def cli_cleanup(cls, id: str | None) -> None: - """Handle a cleanup invoked from the CLI (e.g. inspect sandbox cleanup). - - Args: - id (str | None): Optional ID to limit scope of cleanup. - """ - pass - @abc.abstractmethod async def exec( self, @@ -170,13 +85,13 @@ async def exec( `OutputLimitExceededError` will be raised. Args: - cmd (str | list[str]): Command or command and arguments to execute. - input (str | bytes | None): Standard input (optional). - cwd (str | None): Current working dir (optional). If relative, will be relative to the per-sample filesystem context. - env (dict[str,str]): Environment variables for execution. - user (str | None): Optional username or UID to run the command as. - timeout (int | None): Optional execution timeout (seconds). - timeout_retry (bool): Retry the command in the case that it times out. + cmd: Command or command and arguments to execute. + input: Standard input (optional). + cwd: Current working dir (optional). If relative, will be relative to the per-sample filesystem context. + env: Environment variables for execution. + user: Optional username or UID to run the command as. + timeout: Optional execution timeout (seconds). + timeout_retry: Retry the command in the case that it times out. Commands will be retried up to twice, with a timeout of no greater than 60 seconds for the first retry and 30 for the second. @@ -204,9 +119,9 @@ async def write_file(self, file: str, contents: str | bytes) -> None: should be automatically created. Args: - file (str): Path to file (relative file paths will resolve to the + file: Path to file (relative file paths will resolve to the per-sample working directory). - contents (str | bytes): Text or binary file contents. + contents: Text or binary file contents. Raises: PermissionError: If the current user does not have permission to @@ -233,9 +148,9 @@ async def read_file(self, file: str, text: bool = True) -> Union[str | bytes]: to specifying `newline=""` in a call to the Python `open()` function. Args: - file (str): Path to file (relative file paths will resolve to the + file: Path to file (relative file paths will resolve to the per-sample working directory). - text (bool): Read as a utf-8 encoded text file. + text: Read as a utf-8 encoded text file. 
Returns: Contents of file (as str or bytes for binary files) @@ -265,6 +180,91 @@ async def connection(self) -> SandboxConnection: """ raise NotImplementedError("connection not implemented") + @classmethod + def config_files(cls) -> list[str]: + """Standard config files for this provider (used for automatic discovery)""" + return [] + + @classmethod + def default_concurrency(cls) -> int | None: + """Default max_sandboxes for this provider (`None` means no maximum)""" + return None + + @classmethod + async def task_init( + cls, task_name: str, config: SandboxEnvironmentConfigType | None + ) -> None: + """Called at task startup initialize resources. + + Args: + task_name: Name of task using the sandbox environment. + config: Implementation defined configuration (optional). + """ + pass + + @classmethod + async def sample_init( + cls, + task_name: str, + config: SandboxEnvironmentConfigType | None, + metadata: dict[str, str], + ) -> dict[str, "SandboxEnvironment"]: + """Initialize sandbox environments for a sample. + + Args: + task_name: Name of task using the sandbox environment. + config: Implementation defined configuration (optional). + metadata: Sample `metadata` field + + Returns: + Dictionary of named sandbox environments. The environment which represents + the default environment (resolved by `sandbox("default")` or `sandbox()`) must + be the first key/value pair in the dictionary. + """ + return {} + + @classmethod + @abc.abstractmethod + async def sample_cleanup( + cls, + task_name: str, + config: SandboxEnvironmentConfigType | None, + environments: dict[str, "SandboxEnvironment"], + interrupted: bool, + ) -> None: + """Cleanup sandbox environments. + + Args: + task_name: Name of task using the sandbox environment. + config: Implementation defined configuration (optional). + environments: Sandbox environments created for this sample. + interrupted: Was the task interrupted by an error or cancellation + """ + ... + + @classmethod + async def task_cleanup( + cls, task_name: str, config: SandboxEnvironmentConfigType | None, cleanup: bool + ) -> None: + """Called at task exit as a last chance to cleanup resources. + + Args: + task_name: Name of task using the sandbox environment. + config: Implementation defined configuration (optional). + cleanup: Whether to actually cleanup environment resources + (False if `--no-sandbox-cleanup` was specified) + """ + pass + + @classmethod + async def cli_cleanup(cls, id: str | None) -> None: + """Handle a cleanup invoked from the CLI (e.g. inspect sandbox cleanup). + + Args: + id: Optional ID to limit scope of cleanup. + """ + pass + @dataclass class SandboxEnvironments: @@ -284,7 +284,10 @@ class SandboxEnvironmentSpec(NamedTuple): """Specification of a SandboxEnvironment.""" type: str + """Sandbox type (e.g. 'local', 'docker')""" + config: SandboxEnvironmentConfigType | None = None + """Sandbox configuration (filename or config object).""" SandboxEnvironmentConfigType = BaseModel | str diff --git a/src/inspect_ai/util/_subprocess.py b/src/inspect_ai/util/_subprocess.py index 7c0fd312e..a30ff9e34 100644 --- a/src/inspect_ai/util/_subprocess.py +++ b/src/inspect_ai/util/_subprocess.py @@ -20,6 +20,8 @@ @dataclass class ExecResult(Generic[T]): + """Execution result from call to `subprocess()`.""" + success: bool """Did the process exit with success.""" @@ -85,11 +87,11 @@ async def subprocess( cwd (str | Path | None): Switch to directory for execution. env (dict[str, str]): Additional environment variables. 
capture_output (bool): Capture stderr and stdout into ExecResult - (if False, then output is redirected to parent stderr/stdout) + (if False, then output is redirected to parent stderr/stdout) output_limit (int | None): Stop reading output if it exceeds - the specified limit (in bytes). + the specified limit (in bytes). timeout (int | None): Timeout. If the timeout expires then - a `TimeoutError` will be raised. + a `TimeoutError` will be raised. Returns: Subprocess result (text or binary depending on `text` param) diff --git a/src/inspect_ai/util/_subtask.py b/src/inspect_ai/util/_subtask.py index 2e8d60adf..48ecfe7b9 100644 --- a/src/inspect_ai/util/_subtask.py +++ b/src/inspect_ai/util/_subtask.py @@ -27,21 +27,21 @@ @runtime_checkable class Subtask(Protocol): - """Subtask with distinct `Store` and `Transcript`. - - Args: - *args (Any): Arguments for the subtask. - **kwargs (Any): Keyword arguments for the subtask. - - Returns: - Result of subtask. - """ - async def __call__( self, *args: Any, **kwargs: Any, - ) -> Any: ... + ) -> Any: + """Subtask with distinct `Store` and `Transcript`. + + Args: + *args (Any): Arguments for the subtask. + **kwargs (Any): Keyword arguments for the subtask. + + Returns: + Result of subtask. + """ + ... @overload @@ -71,11 +71,10 @@ def subtask( r"""Decorator for subtasks. Args: - func (Subtask): Subtask implementation. - name (str | None): Name for subtask (defaults to function name) - store (store | None): Store to use for subtask - type (str | None): Type to use for subtask - input (dict[str, Any] | None): Input to log for subtask + name: Name for subtask (defaults to function name) + store: Store to use for subtask + type: Type to use for subtask + input: Input to log for subtask Returns: Function which runs the Subtask, providing an isolated diff --git a/tests/model/providers/test_openai.py b/tests/model/providers/test_openai.py index 21ec46f42..f8a4a4790 100644 --- a/tests/model/providers/test_openai.py +++ b/tests/model/providers/test_openai.py @@ -6,6 +6,7 @@ GenerateConfig, get_model, ) +from inspect_ai.model._chat_message import ChatMessageSystem @pytest.mark.asyncio @@ -30,6 +31,26 @@ async def test_openai_api() -> None: assert len(response.completion) >= 1 +@pytest.mark.asyncio +@skip_if_no_openai +async def test_openai_o_series_developer_messages() -> None: + async def check_developer_messages(model_name: str): + model = get_model( + model_name, + config=GenerateConfig(reasoning_effort="medium", parallel_tool_calls=True), + ) + await model.generate( + [ + ChatMessageSystem(content="I am a helpful assistant."), + ChatMessageUser(content="What are you?"), + ] + ) + + await check_developer_messages("openai/o1") + await check_developer_messages("openai/o1-mini") + await check_developer_messages("openai/o3-mini") + + @pytest.mark.asyncio @skip_if_no_openai async def test_openai_o_series_reasoning_effort() -> None: @@ -44,6 +65,7 @@ async def check_reasoning_effort(model_name: str): print(response) await check_reasoning_effort("openai/o1") + await check_reasoning_effort("openai/o1-mini") await check_reasoning_effort("openai/o3-mini") @@ -60,4 +82,5 @@ async def check_max_tokens(model_name: str): assert len(response.completion) >= 1 await check_max_tokens("openai/o1") + await check_max_tokens("openai/o1-mini") await check_max_tokens("openai/o3-mini") diff --git a/tests/util/sandbox/test_docker_healthcheck.py b/tests/util/sandbox/test_docker_healthcheck.py new file mode 100644 index 000000000..e4f969108 --- /dev/null +++ 
b/tests/util/sandbox/test_docker_healthcheck.py @@ -0,0 +1,140 @@ +import pytest + +from inspect_ai.util._sandbox.docker.service import ( + ComposeService, + parse_duration, + service_healthcheck_time, + services_healthcheck_time, +) + + +# Duration Parser Tests +def test_parse_duration_simple(): + assert parse_duration("30s").seconds == 30.0 + assert parse_duration("1m").seconds == 60.0 + assert parse_duration("1h").seconds == 3600.0 + + +def test_parse_duration_combined(): + assert parse_duration("1m30s").seconds == 90.0 + assert parse_duration("1h30m").seconds == 5400.0 + assert parse_duration("2h30m15s").seconds == 9015.0 + + +def test_parse_duration_with_spaces(): + assert parse_duration("1h 30m").seconds == 5400.0 + assert parse_duration("1h 30m 15s").seconds == 5415.0 + + +def test_parse_duration_milliseconds(): + assert parse_duration("100ms").seconds == 0.1 + assert parse_duration("1s500ms").seconds == 1.5 + + +def test_parse_duration_empty(): + assert parse_duration("").seconds == 0.0 + + +def test_parse_duration_invalid(): + with pytest.raises(ValueError): + parse_duration("invalid") + with pytest.raises(ValueError): + parse_duration("30x") # invalid unit + + +# Service Healthcheck Time Tests +def test_service_without_healthcheck() -> None: + service: ComposeService = { + "image": "nginx", + } + assert service_healthcheck_time(service) == 0.0 + + +def test_service_with_default_values() -> None: + service: ComposeService = { + "image": "nginx", + "healthcheck": {}, + } + assert service_healthcheck_time(service) == 180.0 + + +def test_service_with_custom_values() -> None: + service: ComposeService = { + "image": "nginx", + "healthcheck": { + "start_period": "10s", + "interval": "5s", + "timeout": "3s", + "retries": 5, + }, + } + assert service_healthcheck_time(service) == 40.0 + + +def test_service_with_partial_custom_values() -> None: + service: ComposeService = { + "image": "nginx", + "healthcheck": { + "start_period": "10s", + "timeout": "3s", + }, + } + assert service_healthcheck_time(service) == 99.0 + + +# Total Healthcheck Time Tests +def test_total_time_no_services() -> None: + services: dict[str, ComposeService] = {} + assert services_healthcheck_time(services) == 0.0 + + +def test_total_time_no_healthchecks() -> None: + services: dict[str, ComposeService] = { + "web": {"image": "nginx"}, + "db": { + "image": "postgres", + }, + } + assert services_healthcheck_time(services) == 0.0 + + +def test_total_time_multiple_services() -> None: + services: dict[str, ComposeService] = { + "web": { + "image": "nginx", + "healthcheck": { + "start_period": "10s", + "interval": "5s", + "timeout": "3s", + "retries": 5, + }, + }, + "db": { + "image": "postgres", + "healthcheck": { + "start_period": "30s", + "interval": "10s", + "timeout": "5s", + "retries": 3, + }, + }, + } + assert services_healthcheck_time(services) == 45.0 + + +def test_total_time_mixed_services() -> None: + services: dict[str, ComposeService] = { + "web": { + "image": "nginx", + "healthcheck": { + "start_period": "10s", + "interval": "5s", + "timeout": "3s", + "retries": 5, + }, + }, + "db": { + "image": "postgres", + }, + } + assert services_healthcheck_time(services) == 40.0 diff --git a/tools/vscode/CHANGELOG.md b/tools/vscode/CHANGELOG.md index 30c08b3f6..c863884e4 100644 --- a/tools/vscode/CHANGELOG.md +++ b/tools/vscode/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## 0.3.52 + +- Don't add entries to the `.gitignore` file. 
+ ## 0.3.51 - Improve performance of log listing rendering by caching information in the workspace. diff --git a/tools/vscode/package.json b/tools/vscode/package.json index 7ad40588a..7b24649ff 100644 --- a/tools/vscode/package.json +++ b/tools/vscode/package.json @@ -7,7 +7,7 @@ "author": { "name": "UK AI Safety Institute" }, - "version": "0.3.51", + "version": "0.3.52", "license": "MIT", "homepage": "https://inspect.ai-safety-institute.org.uk/", "repository": { diff --git a/tools/vscode/src/@types/log.d.ts b/tools/vscode/src/@types/log.d.ts index e816254fd..d9c9a4df7 100644 --- a/tools/vscode/src/@types/log.d.ts +++ b/tools/vscode/src/@types/log.d.ts @@ -112,6 +112,7 @@ export type Input = | ChatMessageAssistant | ChatMessageTool )[]; +export type Role = "system"; export type Content = | string | (ContentText | ContentImage | ContentAudio | ContentVideo)[]; @@ -127,18 +128,17 @@ export type Type4 = "video"; export type Video = string; export type Format1 = "mp4" | "mpeg" | "mov"; export type Source = ("input" | "generate") | null; -export type Role = "system"; +export type Role1 = "user"; export type Content1 = | string | (ContentText | ContentImage | ContentAudio | ContentVideo)[]; export type Source1 = ("input" | "generate") | null; -export type Role1 = "user"; export type ToolCallId = string[] | null; +export type Role2 = "assistant"; export type Content2 = | string | (ContentText | ContentImage | ContentAudio | ContentVideo)[]; export type Source2 = ("input" | "generate") | null; -export type Role2 = "assistant"; export type ToolCalls = ToolCall[] | null; export type Id1 = string; export type Function = string; @@ -148,11 +148,11 @@ export type Title = string | null; export type Format2 = "text" | "markdown"; export type Content3 = string; export type Reasoning = string | null; +export type Role3 = "tool"; export type Content4 = | string | (ContentText | ContentImage | ContentAudio | ContentVideo)[]; export type Source3 = ("input" | "generate") | null; -export type Role3 = "tool"; export type ToolCallId1 = string | null; export type Function1 = string | null; export type Type6 = @@ -315,6 +315,7 @@ export type Timestamp8 = string; export type Pending8 = boolean | null; export type Event8 = "score"; export type Target2 = string | string[] | null; +export type Intermediate = boolean; export type Timestamp9 = string; export type Pending9 = boolean | null; export type Event9 = "error"; @@ -339,6 +340,7 @@ export type Lineno = number; export type Timestamp11 = string; export type Pending11 = boolean | null; export type Event11 = "info"; +export type Source4 = string | null; export type Timestamp12 = string; export type Pending12 = boolean | null; export type Event12 = "step"; @@ -424,6 +426,9 @@ export type SampleId1 = string | number | null; export type Samples2 = EvalSampleScore[]; export type Location1 = string; +/** + * Evaluation log. + */ export interface EvalLog { version?: Version; status?: Status; @@ -436,6 +441,9 @@ export interface EvalLog { reductions?: Reductions; location?: Location1; } +/** + * Eval target and configuration. + */ export interface EvalSpec { run_id: RunId; created: Created; @@ -460,6 +468,9 @@ export interface EvalSpec { } export interface TaskAttribs {} export interface TaskArgs {} +/** + * Dataset used for evaluation. + */ export interface EvalDataset { name: Name; location: Location; @@ -468,6 +479,9 @@ export interface EvalDataset { shuffled: Shuffled; } export interface ModelArgs {} +/** + * Configuration used for evaluation. 
+ */ export interface EvalConfig { limit: Limit; sample_id: SampleId; @@ -513,6 +527,9 @@ export interface ApproverPolicyConfig { params: Params; } export interface Params {} +/** + * Git revision for evaluation. + */ export interface EvalRevision { type: Type; origin: Origin; @@ -521,19 +538,25 @@ export interface EvalRevision { export interface Packages { [k: string]: string; } +/** + * Plan (solvers) used in evaluation. + */ export interface EvalPlan { name: Name2; steps: Steps; finish: EvalPlanStep | null; config: GenerateConfig; } +/** + * Solver step. + */ export interface EvalPlanStep { solver: Solver1; params: Params1; } export interface Params1 {} /** - * Base class for model generation configs. + * Model generation options. */ export interface GenerateConfig { max_retries: MaxRetries; @@ -560,12 +583,18 @@ export interface GenerateConfig { reasoning_effort: ReasoningEffort; reasoning_history: ReasoningHistory; } +/** + * Scoring results from evaluation. + */ export interface EvalResults { total_samples: TotalSamples; completed_samples: CompletedSamples; scores: Scores; metadata: Metadata3; } +/** + * Score for evaluation task. + */ export interface EvalScore { name: Name3; scorer: Scorer; @@ -578,13 +607,19 @@ export interface Params2 {} export interface Metrics { [k: string]: EvalMetric; } +/** + * Metric for evaluation score. + */ export interface EvalMetric { name: Name4; value: Value; - options: Options; + params: Params3; metadata: Metadata1; } -export interface Options {} +export interface Params3 {} +/** + * Timing and usage statistics. + */ export interface EvalStats { started_at: StartedAt; completed_at: CompletedAt; @@ -593,6 +628,9 @@ export interface EvalStats { export interface ModelUsage { [k: string]: ModelUsage1; } +/** + * Token usage for completion. + */ export interface ModelUsage1 { input_tokens: InputTokens; output_tokens: OutputTokens; @@ -600,11 +638,17 @@ export interface ModelUsage1 { input_tokens_cache_write: InputTokensCacheWrite; input_tokens_cache_read: InputTokensCacheRead; } +/** + * Eval error details. + */ export interface EvalError { message: Message; traceback: Traceback; traceback_ansi: TracebackAnsi; } +/** + * Sample from evaluation task. + */ export interface EvalSample { id: Id; epoch: Epoch; @@ -625,40 +669,61 @@ export interface EvalSample { attachments: Attachments; limit: EvalSampleLimit | null; } +/** + * System chat message. + */ export interface ChatMessageSystem { + role: Role; content: Content; source: Source; - role: Role; } +/** + * Text content. + */ export interface ContentText { type: Type1; text: Text; } +/** + * Image content. + */ export interface ContentImage { type: Type2; image: Image; detail: Detail; } +/** + * Audio content. + */ export interface ContentAudio { type: Type3; audio: Audio; format: Format; } +/** + * Video content. + */ export interface ContentVideo { type: Type4; video: Video; format: Format1; } +/** + * User chat message. + */ export interface ChatMessageUser { + role: Role1; content: Content1; source: Source1; - role: Role1; tool_call_id: ToolCallId; } +/** + * Assistant chat message. + */ export interface ChatMessageAssistant { + role: Role2; content: Content2; source: Source2; - role: Role2; tool_calls: ToolCalls; reasoning: Reasoning; } @@ -679,10 +744,13 @@ export interface ToolCallContent { format: Format2; content: Content3; } +/** + * Tool chat message. 
+ */ export interface ChatMessageTool { + role: Role3; content: Content4; source: Source3; - role: Role3; tool_call_id: ToolCallId1; function: Function1; error: ToolCallError | null; @@ -691,6 +759,9 @@ export interface ToolCallError { type: Type6; message: Message1; } +/** + * Output from model generation. + */ export interface ModelOutput { model: Model1; choices: Choices1; @@ -699,6 +770,9 @@ export interface ModelOutput { metadata: Metadata4; error: Error; } +/** + * Choice generated for completion. + */ export interface ChatCompletionChoice { message: ChatMessageAssistant; stop_reason: StopReason; @@ -729,12 +803,6 @@ export interface TopLogprob { } /** * Score generated by a scorer. - * - * Args: - * value (Value): Score value. - * answer (str | None): Answer extracted from model output (optional). - * explanation (str | None): Explanation of score (optional). - * metadata (dict[str,Any]): Additional metadata related to the score. */ export interface Score { value: Value1; @@ -754,6 +822,9 @@ export interface SampleInitEvent { sample: Sample; state: JsonValue; } +/** + * Sample for an evaluation task. + */ export interface Sample { input: Input1; choices: Choices2; @@ -888,7 +959,7 @@ export interface ToolFunction { name: Name6; } /** - * Base class for model generation configs. + * Model generation options. */ export interface GenerateConfig1 { max_retries: MaxRetries; @@ -984,7 +1055,10 @@ export interface InputEvent { input_ansi: InputAnsi; } /** - * Event with sample score. + * Event with score. + * + * Can be the final score for a `Sample`, or can be an intermediate score + * resulting from a call to `score`. */ export interface ScoreEvent { timestamp: Timestamp8; @@ -992,6 +1066,7 @@ export interface ScoreEvent { event: Event8; score: Score; target: Target2; + intermediate: Intermediate; } /** * Event with sample error. @@ -1011,6 +1086,9 @@ export interface LoggerEvent { event: Event10; message: LoggingMessage; } +/** + * Message written to Python log. + */ export interface LoggingMessage { name: Name7; level: Level; @@ -1027,6 +1105,7 @@ export interface InfoEvent { timestamp: Timestamp11; pending: Pending11; event: Event11; + source: Source4; data: JsonValue; } /** @@ -1063,15 +1142,24 @@ export interface ModelUsage2 { export interface Attachments { [k: string]: string; } +/** + * Limit encountered by sample. + */ export interface EvalSampleLimit { type: Type13; limit: Limit2; } +/** + * Score reductions. + */ export interface EvalSampleReductions { scorer: Scorer1; reducer: Reducer1; samples: Samples2; } +/** + * Score and sample_id scored. 
+ */ export interface EvalSampleScore { value: Value2; answer: Answer1; diff --git a/tools/vscode/src/extension.ts b/tools/vscode/src/extension.ts index ec18cd569..fecf498f7 100644 --- a/tools/vscode/src/extension.ts +++ b/tools/vscode/src/extension.ts @@ -13,7 +13,6 @@ import { activateWorkspaceTaskProvider } from "./providers/workspace/workspace-t import { activateWorkspaceState, } from "./providers/workspace/workspace-state-provider"; -import { initializeWorkspace } from "./providers/workspace/workspace-init"; import { activateWorkspaceEnv } from "./providers/workspace/workspace-env-provider"; import { initPythonInterpreter } from "./core/python"; import { initInspectProps } from "./inspect"; @@ -64,9 +63,6 @@ export async function activate(context: ExtensionContext) { const [envComands, workspaceEnvManager] = workspaceActivationResult; context.subscriptions.push(workspaceEnvManager); - // Initial the workspace - await initializeWorkspace(stateManager); - // Initialize the protocol handler activateProtocolHandler(context); diff --git a/tools/vscode/src/providers/workspace/workspace-init.ts b/tools/vscode/src/providers/workspace/workspace-init.ts deleted file mode 100644 index 8e27a20d9..000000000 --- a/tools/vscode/src/providers/workspace/workspace-init.ts +++ /dev/null @@ -1,32 +0,0 @@ -import { Command } from "../../core/command"; -import { WorkspaceStateManager } from "./workspace-state-provider"; -import { ensureGitignore } from "../../core/git"; -import { - activeWorkspacePath, -} from "../../core/path"; - - -const kGitInitKey = "gitInit"; - -export async function initializeWorkspace( - state: WorkspaceStateManager -): Promise<[Command[]]> { - const hasInitializedGit = state.getState(kGitInitKey); - if (hasInitializedGit !== "true" || 1 === 1) { - const path = activeWorkspacePath(); - - // If we're in a workspace, initialize - ensureGitignore(path, ignorePaths()); - - await state.setState(kGitInitKey, "true"); - - } - return [[]]; -} - -// TODO: Extract this for use adding additional paths (like if the modify env with logdir) - -function ignorePaths() { - const ignores: string[] = [".env", "logs/", "__pycache__/"]; - return ignores; -}
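Note on the new `test_docker_healthcheck.py` suite above: its expected values pin down the behaviour being exercised. Durations accept compose-style strings (`30s`, `1h30m`, `100ms`, with optional spaces), a service's healthcheck budget works out to `retries * (interval + timeout)` with apparent defaults of 30 seconds for `interval` and `timeout` and 3 for `retries` (the 40.0 and 99.0 expectations are also consistent with `start_period` being ignored), and the overall `compose up` budget is the slowest individual service. The following is only a minimal sketch consistent with those expectations; the helper names match the imports in the test file, but the real implementation in `inspect_ai.util._sandbox.docker.service` (including its treatment of `start_period`) may differ.

```python
import re
from dataclasses import dataclass
from typing import Any, TypedDict


class ComposeService(TypedDict, total=False):
    # Hypothetical stand-in for the real ComposeService type: only the keys
    # used by the tests above.
    image: str
    healthcheck: dict[str, Any]


@dataclass
class Duration:
    seconds: float


_UNITS = {"h": 3600.0, "m": 60.0, "s": 1.0, "ms": 0.001}


def parse_duration(duration: str) -> Duration:
    """Parse a compose-style duration such as '1h30m', '100ms', or '1h 30m'."""
    compact = duration.replace(" ", "")
    if not compact:
        return Duration(0.0)
    parts = list(re.finditer(r"(\d+(?:\.\d+)?)(ms|h|m|s)", compact))
    # Reject inputs with no recognised units or leftover characters (e.g. "invalid", "30x").
    if not parts or "".join(p.group(0) for p in parts) != compact:
        raise ValueError(f"invalid duration: '{duration}'")
    return Duration(sum(float(p.group(1)) * _UNITS[p.group(2)] for p in parts))


def service_healthcheck_time(service: ComposeService) -> float:
    """Worst-case seconds for one service to report healthy (0 if no healthcheck)."""
    healthcheck = service.get("healthcheck")
    if healthcheck is None:
        return 0.0
    # Defaults inferred from the expected values in the tests (assumption).
    interval = parse_duration(healthcheck.get("interval", "30s")).seconds
    timeout = parse_duration(healthcheck.get("timeout", "30s")).seconds
    retries = int(healthcheck.get("retries", 3))
    return retries * (interval + timeout)


def services_healthcheck_time(services: dict[str, ComposeService]) -> float:
    """Overall compose-up wait: the slowest individual service."""
    return max((service_healthcheck_time(s) for s in services.values()), default=0.0)
```

This sketch reproduces the tested values (180.0 for an empty healthcheck, 40.0 and 99.0 for the custom and partial cases, and a max across services of 45.0/40.0), which is what the dynamic `compose up` timeout in the changelog entry relies on.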