
Commit

Built site for gh-pages
aisi-inspect committed Sep 27, 2024
1 parent f370db1 commit a2119ad
Showing 12 changed files with 548 additions and 418 deletions.
2 changes: 1 addition & 1 deletion .nojekyll
@@ -1 +1 @@
- ad327f9f
+ 94928440
241 changes: 153 additions & 88 deletions agents-api.html

Large diffs are not rendered by default.

671 changes: 368 additions & 303 deletions agents.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion eval-logs.html
@@ -1102,7 +1102,7 @@ <h3 class="anchored" data-anchor-id="reading-logs">Reading Logs</h3>
</div>
</div>
</footer>
- <script>var lightboxQuarto = GLightbox({"openEffect":"zoom","closeEffect":"zoom","selector":".lightbox","loop":false,"descPosition":"bottom"});
+ <script>var lightboxQuarto = GLightbox({"selector":".lightbox","closeEffect":"zoom","openEffect":"zoom","descPosition":"bottom","loop":false});
(function() {
let previousOnload = window.onload;
window.onload = () => {
10 changes: 5 additions & 5 deletions examples/index.html
@@ -586,7 +586,7 @@ <h1 class="title"><span id="sec-examples" class="quarto-section-identifier"><spa
</div>
<div class="example-info">
<div class="listing-title">
- <a href="https://github.com/UKGovernmentBEIS/inspect_ai/tree/main/evals/mathvista">
+ <a href="https://github.com/UKGovernmentBEIS/inspect_ai/tree/main/src/inspect_evals/mathvista">
MathVista: Evaluating Mathematical Reasoning in Visual Contexts
</a>
</div>
@@ -693,7 +693,7 @@ <h1 class="title"><span id="sec-examples" class="quarto-section-identifier"><spa
</div>
<div class="example-info">
<div class="listing-title">
- <a href="https://github.com/UKGovernmentBEIS/inspect_ai/tree/main/evals/drop">
+ <a href="https://github.com/UKGovernmentBEIS/inspect_ai/tree/main/src/inspect_evals/drop">
DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs
</a>
</div>
@@ -735,7 +735,7 @@ <h1 class="title"><span id="sec-examples" class="quarto-section-identifier"><spa
</div>
<div class="example-info">
<div class="listing-title">
- <a href="https://github.com/UKGovernmentBEIS/inspect_ai/tree/main/evals/race-h">
+ <a href="https://github.com/UKGovernmentBEIS/inspect_ai/tree/main/src/inspect_evals/race_h">
RACE-H: A benchmark for testing reading comprehension and reasoning abilities of neural models
</a>
</div>
@@ -756,7 +756,7 @@ <h1 class="title"><span id="sec-examples" class="quarto-section-identifier"><spa
</div>
<div class="example-info">
<div class="listing-title">
- <a href="https://github.com/UKGovernmentBEIS/inspect_ai/tree/main/evals/mmmu">
+ <a href="https://github.com/UKGovernmentBEIS/inspect_ai/tree/main/src/inspect_evals/mmmu">
MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark
</a>
</div>
@@ -885,7 +885,7 @@ <h1 class="title"><span id="sec-examples" class="quarto-section-identifier"><spa
</div>
<div class="example-info">
<div class="listing-title">
- <a href="https://github.com/UKGovernmentBEIS/inspect_ai/tree/main/evals/gpqa">
+ <a href="https://github.com/UKGovernmentBEIS/inspect_ai/tree/main/src/inspect_evals/gpqa">
GPQA: A Graduate-Level Google-Proof Q&amp;A Benchmark
</a>
</div>
2 changes: 1 addition & 1 deletion index.html
@@ -1114,7 +1114,7 @@ <h2 class="anchored" data-anchor-id="learning-more">Learning More</h2>
</div>
</div>
</footer>
- <script>var lightboxQuarto = GLightbox({"closeEffect":"zoom","selector":".lightbox","descPosition":"bottom","loop":false,"openEffect":"zoom"});
+ <script>var lightboxQuarto = GLightbox({"closeEffect":"zoom","selector":".lightbox","descPosition":"bottom","openEffect":"zoom","loop":false});
(function() {
let previousOnload = window.onload;
window.onload = () => {
2 changes: 1 addition & 1 deletion log-viewer.html
@@ -1074,7 +1074,7 @@ <h3 class="unlisted anchored" data-anchor-id="other-notes">Other Notes</h3>
</div>
</div>
</footer>
- <script>var lightboxQuarto = GLightbox({"loop":false,"selector":".lightbox","descPosition":"bottom","openEffect":"zoom","closeEffect":"zoom"});
+ <script>var lightboxQuarto = GLightbox({"closeEffect":"zoom","loop":false,"openEffect":"zoom","descPosition":"bottom","selector":".lightbox"});
(function() {
let previousOnload = window.onload;
window.onload = () => {
4 changes: 2 additions & 2 deletions search.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion sitemap.xml
@@ -34,7 +34,7 @@
</url>
<url>
<loc>https://inspect.ai-safety-institute.org.uk/agents.html</loc>
- <lastmod>2024-09-27T11:01:12.848Z</lastmod>
+ <lastmod>2024-09-27T16:48:55.489Z</lastmod>
</url>
<url>
<loc>https://inspect.ai-safety-institute.org.uk/scorers.html</loc>
26 changes: 13 additions & 13 deletions tutorial.html
@@ -442,7 +442,7 @@ <h2 class="anchored" data-anchor-id="sec-security-guide">Security Guide</h2>
<section id="setup" class="level3 unlisted">
<h3 class="unlisted anchored" data-anchor-id="setup">Setup</h3>
<p>We’ll start by importing the functions we need from Inspect and defining a system message that orients the model to its role as a computer security expert.</p>
<div id="1fad7b98" class="cell">
<div id="ebc7b4b6" class="cell">
<div class="sourceCode cell-code" id="cb1"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai <span class="im">import</span> Task, <span class="bu">eval</span>, task</span>
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai.dataset <span class="im">import</span> csv_dataset</span>
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai.scorer <span class="im">import</span> model_graded_fact</span>
@@ -459,7 +459,7 @@ <h3 class="unlisted anchored" data-anchor-id="setup">Setup</h3>
<section id="eval" class="level3 unlisted">
<h3 class="unlisted anchored" data-anchor-id="eval">Eval</h3>
<p>Discerning whether the correct security guidance was provided by the model might prove difficult using only text-matching algorithms. Here we use a model to read the response and assess the quality of the answer.</p>
<div id="50812fc3" class="cell">
<div id="aa52099e" class="cell">
<div class="sourceCode cell-code" id="cb2"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="at">@task</span></span>
<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> security_guide():</span>
<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> Task(</span>
@@ -489,7 +489,7 @@ <h2 class="anchored" data-anchor-id="sec-hellaswag">HellaSwag</h2>
<section id="setup-1" class="level3 unlisted">
<h3 class="unlisted anchored" data-anchor-id="setup-1">Setup</h3>
<p>We’ll start by importing the functions we need from Inspect, defining a system message, and writing a function to convert dataset records to samples (we need to do this to convert the index-based label in the dataset to a letter).</p>
<div id="a29f33e1" class="cell">
<div id="1586299e" class="cell">
<div class="sourceCode cell-code" id="cb4"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai <span class="im">import</span> Task, <span class="bu">eval</span>, task</span>
<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai.dataset <span class="im">import</span> Sample, hf_dataset</span>
<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai.scorer <span class="im">import</span> choice</span>
@@ -514,7 +514,7 @@ <h3 class="unlisted anchored" data-anchor-id="setup-1">Setup</h3>
<section id="eval-1" class="level3 unlisted">
<h3 class="unlisted anchored" data-anchor-id="eval-1">Eval</h3>
<p>We’ll load the dataset from <a href="https://huggingface.co/datasets/Rowan/hellaswag">HuggingFace</a> using the <code>hf_dataset()</code> function. We’ll draw data from the validation split, and use the <code>record_to_sample()</code> function to parse the records (we’ll also pass <code>trust=True</code> to indicate that we are okay with Hugging Face executing the dataset loading code provided by hellaswag):</p>
<div id="73494309" class="cell">
<div id="dd75c165" class="cell">
<div class="sourceCode cell-code" id="cb5"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="at">@task</span></span>
<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> hellaswag():</span>
<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a> </span>
@@ -574,7 +574,7 @@ <h3 class="unlisted anchored" data-anchor-id="setup-2">Setup</h3>
<li><code>record_to_sample()</code> to convert raw records to samples. Note that we need a function rather than just mapping field names with a <code>FieldSpec</code> because the <strong>answer</strong> field in the dataset needs to be divided into reasoning and the actual answer (which appears at the very end after <code>####</code>).</li>
<li><code>sample_to_fewshot()</code> to generate fewshot examples from samples.</li>
</ol>
<div id="bbc068f6" class="cell">
<div id="75d0a9db" class="cell">
<div class="sourceCode cell-code" id="cb7"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai <span class="im">import</span> Task, task</span>
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai.dataset <span class="im">import</span> Sample, hf_dataset</span>
<span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai.scorer <span class="im">import</span> match</span>
@@ -621,7 +621,7 @@ <h3 class="unlisted anchored" data-anchor-id="setup-2">Setup</h3>
<section id="eval-2" class="level3 unlisted">
<h3 class="unlisted anchored" data-anchor-id="eval-2">Eval</h3>
<p>We’ll load the dataset from <a href="https://huggingface.co/datasets/gsm8k">HuggingFace</a> using the <code>hf_dataset()</code> function. By default we use 10 fewshot examples, but the <code>fewshot</code> task arg can be used to turn this up, down, or off. The <code>fewshot_seed</code> is provided for stability of fewshot examples across runs.</p>
<div id="1ddcb4a6" class="cell">
<div id="9b5e4fbd" class="cell">
<div class="sourceCode cell-code" id="cb9"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a><span class="at">@task</span></span>
<span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> gsm8k(fewshot<span class="op">=</span><span class="dv">10</span>, fewshot_seed<span class="op">=</span><span class="dv">42</span>):</span>
<span id="cb9-3"><a href="#cb9-3" aria-hidden="true" tabindex="-1"></a> <span class="co"># build solver list dynamically (may or may not be doing fewshot)</span></span>
@@ -688,7 +688,7 @@ <h2 class="anchored" data-anchor-id="sec-mathematics">Mathematics</h2>
<section id="setup-3" class="level3 unlisted">
<h3 class="unlisted anchored" data-anchor-id="setup-3">Setup</h3>
<p>We’ll start by importing the functions we need from Inspect and defining a prompt that asks the model to reason step by step and respond with its answer on a line at the end. It also nudges the model not to enclose its answer in <code>\boxed</code>, a LaTeX command for displaying equations that models often use in math output.</p>
<div id="db992199" class="cell">
<div id="618c0fa2" class="cell">
<div class="sourceCode cell-code" id="cb11"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> re</span>
<span id="cb11-2"><a href="#cb11-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb11-3"><a href="#cb11-3" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai <span class="im">import</span> Task, task</span>
@@ -726,7 +726,7 @@ <h3 class="unlisted anchored" data-anchor-id="setup-3">Setup</h3>
<section id="eval-3" class="level3 unlisted">
<h3 class="unlisted anchored" data-anchor-id="eval-3">Eval</h3>
<p>Here is the basic setup for our eval. We <code>shuffle</code> the dataset so that when we use <code>--limit</code> to develop on smaller slices we get some variety of inputs and results:</p>
<div id="851eda24" class="cell">
<div id="1639e209" class="cell">
<div class="sourceCode cell-code" id="cb12"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a><span class="at">@task</span></span>
<span id="cb12-2"><a href="#cb12-2" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> math(shuffle<span class="op">=</span><span class="va">True</span>):</span>
<span id="cb12-3"><a href="#cb12-3" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> Task(</span>
@@ -749,7 +749,7 @@ <h3 class="unlisted anchored" data-anchor-id="eval-3">Eval</h3>
<span id="cb12-20"><a href="#cb12-20" aria-hidden="true" tabindex="-1"></a> )</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>The heart of this eval isn’t in the task definition, though; rather, it’s in how we grade the output. Math expressions can be logically equivalent but not literally the same. Consequently, we’ll use a model to assess whether the output and the target are logically equivalent. The <code>expression_equivalence()</code> custom scorer implements this:</p>
<div id="cd13c13c" class="cell">
<div id="3d0bcbbb" class="cell">
<div class="sourceCode cell-code" id="cb13"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a><span class="at">@scorer</span>(metrics<span class="op">=</span>[accuracy(), stderr()])</span>
<span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> expression_equivalence():</span>
<span id="cb13-3"><a href="#cb13-3" aria-hidden="true" tabindex="-1"></a> <span class="cf">async</span> <span class="kw">def</span> score(state: TaskState, target: Target):</span>
@@ -830,7 +830,7 @@ <h2 class="anchored" data-anchor-id="sec-tool-use">Tool Use</h2>
<section id="addition" class="level3 unlisted">
<h3 class="unlisted anchored" data-anchor-id="addition">Addition</h3>
<p>We’ll demonstrate with a simple tool that adds two numbers, using the <code>@tool</code> decorator to register it with the system:</p>
<div id="b9eded63" class="cell">
<div id="794de462" class="cell">
<div class="sourceCode cell-code" id="cb17"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb17-1"><a href="#cb17-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai <span class="im">import</span> Task, <span class="bu">eval</span>, task</span>
<span id="cb17-2"><a href="#cb17-2" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai.dataset <span class="im">import</span> Sample</span>
<span id="cb17-3"><a href="#cb17-3" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai.scorer <span class="im">import</span> includes, match</span>
@@ -865,7 +865,7 @@ <h3 class="unlisted anchored" data-anchor-id="addition">Addition</h3>
<span id="cb19-3"><a href="#cb19-3" aria-hidden="true" tabindex="-1"></a> y: Second number to add.</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<p>Type annotations and descriptions are <em>required</em> for tool declarations so that the model can be informed which types to pass back to the tool function and what the purpose of each parameter is.</p>
<p>Now that we’ve defined the tool, we can use it in an evaluation by passing it to the <code>use_tools()</code> function.</p>
<div id="3e649586" class="cell">
<div id="788de5fb" class="cell">
<div class="sourceCode cell-code" id="cb20"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb20-1"><a href="#cb20-1" aria-hidden="true" tabindex="-1"></a><span class="at">@task</span></span>
<span id="cb20-2"><a href="#cb20-2" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> addition_problem():</span>
<span id="cb20-3"><a href="#cb20-3" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> Task(</span>
@@ -894,7 +894,7 @@ <h3 class="unlisted anchored" data-anchor-id="task">Task</h3>
<ol start="2" type="1">
<li><code>ctf_agent()</code>, which defines the agent’s solver. The solver consists principally of using <code>bash()</code> and <code>python()</code> tools in a loop until the flag is discovered. We’ll describe this function in more detail below.</li>
</ol>
<div id="440f1bb2" class="cell">
<div id="c24e3b9b" class="cell">
<div class="sourceCode cell-code" id="cb22"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb22-1"><a href="#cb22-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> textwrap <span class="im">import</span> dedent</span>
<span id="cb22-2"><a href="#cb22-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb22-3"><a href="#cb22-3" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> dataset <span class="im">import</span> read_dataset</span>
@@ -920,7 +920,7 @@ <h3 class="unlisted anchored" data-anchor-id="task">Task</h3>
</div>
<p>Note that we specify <code>sandbox="docker"</code> to ensure that code generated from the model is run in a secure <a href="agents.html#sec-sandbox-environments">sandbox environment</a>.</p>
<p>Here is the definition of the agent:</p>
<div id="b78d5751" class="cell">
<div id="cc13ffa0" class="cell">
<div class="sourceCode cell-code" id="cb23"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb23-1"><a href="#cb23-1" aria-hidden="true" tabindex="-1"></a><span class="at">@solver</span></span>
<span id="cb23-2"><a href="#cb23-2" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> ctf_agent(max_attempts<span class="op">=</span><span class="dv">3</span>):</span>
<span id="cb23-3"><a href="#cb23-3" aria-hidden="true" tabindex="-1"></a> SYSTEM_MESSAGE <span class="op">=</span> dedent(<span class="st">"""</span></span>
2 changes: 1 addition & 1 deletion vscode.html
@@ -913,7 +913,7 @@ <h2 class="anchored" data-anchor-id="troubleshooting">Troubleshooting</h2>
</div>
</div>
</footer>
- <script>var lightboxQuarto = GLightbox({"openEffect":"zoom","closeEffect":"zoom","loop":false,"selector":".lightbox","descPosition":"bottom"});
+ <script>var lightboxQuarto = GLightbox({"openEffect":"zoom","closeEffect":"zoom","descPosition":"bottom","loop":false,"selector":".lightbox"});
(function() {
let previousOnload = window.onload;
window.onload = () => {
2 changes: 1 addition & 1 deletion workflow.html
@@ -1185,7 +1185,7 @@ <h2 class="anchored" data-anchor-id="eval-suites">Eval Suites</h2>
</div>
</div>
</footer>
- <script>var lightboxQuarto = GLightbox({"closeEffect":"zoom","selector":".lightbox","openEffect":"zoom","loop":false,"descPosition":"bottom"});
+ <script>var lightboxQuarto = GLightbox({"closeEffect":"zoom","selector":".lightbox","descPosition":"bottom","loop":false,"openEffect":"zoom"});
(function() {
let previousOnload = window.onload;
window.onload = () => {

