
Commit

Built site for gh-pages
aisi-inspect committed Sep 27, 2024
1 parent f370db1 commit a2119ad
Showing 12 changed files with 548 additions and 418 deletions.
2 changes: 1 addition & 1 deletion .nojekyll
@@ -1 +1 @@
- ad327f9f
+ 94928440
241 changes: 153 additions & 88 deletions agents-api.html

Large diffs are not rendered by default.

671 changes: 368 additions & 303 deletions agents.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion eval-logs.html
@@ -1102,7 +1102,7 @@ <h3 class="anchored" data-anchor-id="reading-logs">Reading Logs</h3>
</div>
</div>
</footer>
- <script>var lightboxQuarto = GLightbox({"openEffect":"zoom","closeEffect":"zoom","selector":".lightbox","loop":false,"descPosition":"bottom"});
+ <script>var lightboxQuarto = GLightbox({"selector":".lightbox","closeEffect":"zoom","openEffect":"zoom","descPosition":"bottom","loop":false});
(function() {
let previousOnload = window.onload;
window.onload = () => {
10 changes: 5 additions & 5 deletions examples/index.html
@@ -586,7 +586,7 @@ <h1 class="title"><span id="sec-examples" class="quarto-section-identifier"><spa
</div>
<div class="example-info">
<div class="listing-title">
- <a href="https://github.com/UKGovernmentBEIS/inspect_ai/tree/main/evals/mathvista">
+ <a href="https://github.com/UKGovernmentBEIS/inspect_ai/tree/main/src/inspect_evals/mathvista">
MathVista: Evaluating Mathematical Reasoning in Visual Contexts
</a>
</div>
@@ -693,7 +693,7 @@ <h1 class="title"><span id="sec-examples" class="quarto-section-identifier"><spa
</div>
<div class="example-info">
<div class="listing-title">
- <a href="https://github.com/UKGovernmentBEIS/inspect_ai/tree/main/evals/drop">
+ <a href="https://github.com/UKGovernmentBEIS/inspect_ai/tree/main/src/inspect_evals/drop">
DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs
</a>
</div>
@@ -735,7 +735,7 @@ <h1 class="title"><span id="sec-examples" class="quarto-section-identifier"><spa
</div>
<div class="example-info">
<div class="listing-title">
- <a href="https://github.com/UKGovernmentBEIS/inspect_ai/tree/main/evals/race-h">
+ <a href="https://github.com/UKGovernmentBEIS/inspect_ai/tree/main/src/inspect_evals/race_h">
RACE-H: A benchmark for testing reading comprehension and reasoning abilities of neural models
</a>
</div>
@@ -756,7 +756,7 @@ <h1 class="title"><span id="sec-examples" class="quarto-section-identifier"><spa
</div>
<div class="example-info">
<div class="listing-title">
- <a href="https://github.com/UKGovernmentBEIS/inspect_ai/tree/main/evals/mmmu">
+ <a href="https://github.com/UKGovernmentBEIS/inspect_ai/tree/main/src/inspect_evals/mmmu">
MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark
</a>
</div>
@@ -885,7 +885,7 @@ <h1 class="title"><span id="sec-examples" class="quarto-section-identifier"><spa
</div>
<div class="example-info">
<div class="listing-title">
- <a href="https://github.com/UKGovernmentBEIS/inspect_ai/tree/main/evals/gpqa">
+ <a href="https://github.com/UKGovernmentBEIS/inspect_ai/tree/main/src/inspect_evals/gpqa">
GPQA: A Graduate-Level Google-Proof Q&amp;A Benchmark
</a>
</div>
2 changes: 1 addition & 1 deletion index.html
@@ -1114,7 +1114,7 @@ <h2 class="anchored" data-anchor-id="learning-more">Learning More</h2>
</div>
</div>
</footer>
- <script>var lightboxQuarto = GLightbox({"closeEffect":"zoom","selector":".lightbox","descPosition":"bottom","loop":false,"openEffect":"zoom"});
+ <script>var lightboxQuarto = GLightbox({"closeEffect":"zoom","selector":".lightbox","descPosition":"bottom","openEffect":"zoom","loop":false});
(function() {
let previousOnload = window.onload;
window.onload = () => {
2 changes: 1 addition & 1 deletion log-viewer.html
@@ -1074,7 +1074,7 @@ <h3 class="unlisted anchored" data-anchor-id="other-notes">Other Notes</h3>
</div>
</div>
</footer>
- <script>var lightboxQuarto = GLightbox({"loop":false,"selector":".lightbox","descPosition":"bottom","openEffect":"zoom","closeEffect":"zoom"});
+ <script>var lightboxQuarto = GLightbox({"closeEffect":"zoom","loop":false,"openEffect":"zoom","descPosition":"bottom","selector":".lightbox"});
(function() {
let previousOnload = window.onload;
window.onload = () => {
4 changes: 2 additions & 2 deletions search.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion sitemap.xml
@@ -34,7 +34,7 @@
</url>
<url>
<loc>https://inspect.ai-safety-institute.org.uk/agents.html</loc>
- <lastmod>2024-09-27T11:01:12.848Z</lastmod>
+ <lastmod>2024-09-27T16:48:55.489Z</lastmod>
</url>
<url>
<loc>https://inspect.ai-safety-institute.org.uk/scorers.html</loc>
26 changes: 13 additions & 13 deletions tutorial.html
@@ -442,7 +442,7 @@ <h2 class="anchored" data-anchor-id="sec-security-guide">Security Guide</h2>
<section id="setup" class="level3 unlisted">
<h3 class="unlisted anchored" data-anchor-id="setup">Setup</h3>
<p>We’ll start by importing the functions we need from Inspect and defining a system message that orients the model to its role as a computer security expert.</p>
<div id="1fad7b98" class="cell">
<div id="ebc7b4b6" class="cell">
<div class="sourceCode cell-code" id="cb1"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai <span class="im">import</span> Task, <span class="bu">eval</span>, task</span>
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai.dataset <span class="im">import</span> csv_dataset</span>
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai.scorer <span class="im">import</span> model_graded_fact</span>
@@ -459,7 +459,7 @@ <h3 class="unlisted anchored" data-anchor-id="setup">Setup</h3>
<section id="eval" class="level3 unlisted">
<h3 class="unlisted anchored" data-anchor-id="eval">Eval</h3>
<p>Discerning whether the correct security guidance was provided by the model might prove difficult using only text-matching algorithms. Here we use a model to read the response and assess the quality of the answer.</p>
<div id="50812fc3" class="cell">
<div id="aa52099e" class="cell">
<div class="sourceCode cell-code" id="cb2"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="at">@task</span></span>
<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> security_guide():</span>
<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> Task(</span>
@@ -489,7 +489,7 @@ <h2 class="anchored" data-anchor-id="sec-hellaswag">HellaSwag</h2>
<section id="setup-1" class="level3 unlisted">
<h3 class="unlisted anchored" data-anchor-id="setup-1">Setup</h3>
<p>We’ll start by importing the functions we need from Inspect, defining a system message, and writing a function to convert dataset records to samples (we need to do this to convert the index-based label in the dataset to a letter).</p>
<div id="a29f33e1" class="cell">
<div id="1586299e" class="cell">
<div class="sourceCode cell-code" id="cb4"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai <span class="im">import</span> Task, <span class="bu">eval</span>, task</span>
<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai.dataset <span class="im">import</span> Sample, hf_dataset</span>
<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai.scorer <span class="im">import</span> choice</span>
@@ -514,7 +514,7 @@ <h3 class="unlisted anchored" data-anchor-id="setup-1">Setup</h3>
<section id="eval-1" class="level3 unlisted">
<h3 class="unlisted anchored" data-anchor-id="eval-1">Eval</h3>
<p>We’ll load the dataset from <a href="https://huggingface.co/datasets/Rowan/hellaswag">HuggingFace</a> using the <code>hf_dataset()</code> function. We’ll draw data from the validation split, and use the <code>record_to_sample()</code> function to parse the records (we’ll also pass <code>trust=True</code> to indicate that we are okay with Hugging Face executing the dataset loading code provided by hellaswag):</p>
<div id="73494309" class="cell">
<div id="dd75c165" class="cell">
<div class="sourceCode cell-code" id="cb5"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="at">@task</span></span>
<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> hellaswag():</span>
<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a> </span>
@@ -574,7 +574,7 @@ <h3 class="unlisted anchored" data-anchor-id="setup-2">Setup</h3>
<li><code>record_to_sample()</code> to convert raw records to samples. Note that we need a function rather than just mapping field names with a <code>FieldSpec</code> because the <strong>answer</strong> field in the dataset needs to be divided into reasoning and the actual answer (which appears at the very end after <code>####</code>).</li>
<li><code>sample_to_fewshot()</code> to generate fewshot examples from samples.</li>
</ol>
<div id="bbc068f6" class="cell">
<div id="75d0a9db" class="cell">
<div class="sourceCode cell-code" id="cb7"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai <span class="im">import</span> Task, task</span>
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai.dataset <span class="im">import</span> Sample, hf_dataset</span>
<span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai.scorer <span class="im">import</span> match</span>
@@ -621,7 +621,7 @@ <h3 class="unlisted anchored" data-anchor-id="setup-2">Setup</h3>
<section id="eval-2" class="level3 unlisted">
<h3 class="unlisted anchored" data-anchor-id="eval-2">Eval</h3>
<p>We’ll load the dataset from <a href="https://huggingface.co/datasets/gsm8k">HuggingFace</a> using the <code>hf_dataset()</code> function. By default we use 10 fewshot examples, but the <code>fewshot</code> task arg can be used to turn this up, down, or off. The <code>fewshot_seed</code> is provided for stability of fewshot examples across runs.</p>
<div id="1ddcb4a6" class="cell">
<div id="9b5e4fbd" class="cell">
<div class="sourceCode cell-code" id="cb9"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a><span class="at">@task</span></span>
<span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> gsm8k(fewshot<span class="op">=</span><span class="dv">10</span>, fewshot_seed<span class="op">=</span><span class="dv">42</span>):</span>
<span id="cb9-3"><a href="#cb9-3" aria-hidden="true" tabindex="-1"></a> <span class="co"># build solver list dynamically (may or may not be doing fewshot)</span></span>
@@ -688,7 +688,7 @@ <h2 class="anchored" data-anchor-id="sec-mathematics">Mathematics</h2>
<section id="setup-3" class="level3 unlisted">
<h3 class="unlisted anchored" data-anchor-id="setup-3">Setup</h3>
<p>We’ll start by importing the functions we need from Inspect and defining a prompt that asks the model to reason step by step and respond with its answer on a line at the end. It also nudges the model not to enclose its answer in <code>\boxed</code>, a LaTeX command for displaying equations that models often use in math output.</p>
<div id="db992199" class="cell">
<div id="618c0fa2" class="cell">
<div class="sourceCode cell-code" id="cb11"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> re</span>
<span id="cb11-2"><a href="#cb11-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb11-3"><a href="#cb11-3" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai <span class="im">import</span> Task, task</span>
@@ -726,7 +726,7 @@ <h3 class="unlisted anchored" data-anchor-id="setup-3">Setup</h3>
<section id="eval-3" class="level3 unlisted">
<h3 class="unlisted anchored" data-anchor-id="eval-3">Eval</h3>
<p>Here is the basic setup for our eval. We <code>shuffle</code> the dataset so that when we use <code>--limit</code> to develop on smaller slices we get some variety of inputs and results:</p>
<div id="851eda24" class="cell">
<div id="1639e209" class="cell">
<div class="sourceCode cell-code" id="cb12"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a><span class="at">@task</span></span>
<span id="cb12-2"><a href="#cb12-2" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> math(shuffle<span class="op">=</span><span class="va">True</span>):</span>
<span id="cb12-3"><a href="#cb12-3" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> Task(</span>
@@ -749,7 +749,7 @@ <h3 class="unlisted anchored" data-anchor-id="eval-3">Eval</h3>
<span id="cb12-20"><a href="#cb12-20" aria-hidden="true" tabindex="-1"></a> )</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>The heart of this eval isn’t in the task definition, though; rather, it’s in how we grade the output. Math expressions can be logically equivalent but not literally the same. Consequently, we’ll use a model to assess whether the output and the target are logically equivalent. The <code>expression_equivalence()</code> custom scorer implements this:</p>
<div id="cd13c13c" class="cell">
<div id="3d0bcbbb" class="cell">
<div class="sourceCode cell-code" id="cb13"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a><span class="at">@scorer</span>(metrics<span class="op">=</span>[accuracy(), stderr()])</span>
<span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> expression_equivalence():</span>
<span id="cb13-3"><a href="#cb13-3" aria-hidden="true" tabindex="-1"></a> <span class="cf">async</span> <span class="kw">def</span> score(state: TaskState, target: Target):</span>
@@ -830,7 +830,7 @@ <h2 class="anchored" data-anchor-id="sec-tool-use">Tool Use</h2>
<section id="addition" class="level3 unlisted">
<h3 class="unlisted anchored" data-anchor-id="addition">Addition</h3>
<p>We’ll demonstrate with a simple tool that adds two numbers, using the <code>@tool</code> decorator to register it with the system:</p>
<div id="b9eded63" class="cell">
<div id="794de462" class="cell">
<div class="sourceCode cell-code" id="cb17"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb17-1"><a href="#cb17-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai <span class="im">import</span> Task, <span class="bu">eval</span>, task</span>
<span id="cb17-2"><a href="#cb17-2" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai.dataset <span class="im">import</span> Sample</span>
<span id="cb17-3"><a href="#cb17-3" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> inspect_ai.scorer <span class="im">import</span> includes, match</span>
@@ -865,7 +865,7 @@ <h3 class="unlisted anchored" data-anchor-id="addition">Addition</h3>
<span id="cb19-3"><a href="#cb19-3" aria-hidden="true" tabindex="-1"></a> y: Second number to add.</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<p>Type annotations and descriptions are <em>required</em> for tool declarations so that the model can be informed which types to pass back to the tool function and what the purpose of each parameter is.</p>
<p>Now that we’ve defined the tool, we can use it in an evaluation by passing it to the <code>use_tools()</code> function.</p>
<div id="3e649586" class="cell">
<div id="788de5fb" class="cell">
<div class="sourceCode cell-code" id="cb20"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb20-1"><a href="#cb20-1" aria-hidden="true" tabindex="-1"></a><span class="at">@task</span></span>
<span id="cb20-2"><a href="#cb20-2" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> addition_problem():</span>
<span id="cb20-3"><a href="#cb20-3" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> Task(</span>
@@ -894,7 +894,7 @@ <h3 class="unlisted anchored" data-anchor-id="task">Task</h3>
<ol start="2" type="1">
<li><code>ctf_agent()</code>, which defines the agent’s solver. The solver consists principally of using <code>bash()</code> and <code>python()</code> tools in a loop until the flag is discovered. We’ll describe this function in more detail below.</li>
</ol>
<div id="440f1bb2" class="cell">
<div id="c24e3b9b" class="cell">
<div class="sourceCode cell-code" id="cb22"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb22-1"><a href="#cb22-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> textwrap <span class="im">import</span> dedent</span>
<span id="cb22-2"><a href="#cb22-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb22-3"><a href="#cb22-3" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> dataset <span class="im">import</span> read_dataset</span>
@@ -920,7 +920,7 @@ <h3 class="unlisted anchored" data-anchor-id="task">Task</h3>
</div>
<p>Note that we specify <code>sandbox="docker"</code> to ensure that code generated from the model is run in a secure <a href="agents.html#sec-sandbox-environments">sandbox environment</a>.</p>
<p>Here is the definition of the agent:</p>
<div id="b78d5751" class="cell">
<div id="cc13ffa0" class="cell">
<div class="sourceCode cell-code" id="cb23"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb23-1"><a href="#cb23-1" aria-hidden="true" tabindex="-1"></a><span class="at">@solver</span></span>
<span id="cb23-2"><a href="#cb23-2" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> ctf_agent(max_attempts<span class="op">=</span><span class="dv">3</span>):</span>
<span id="cb23-3"><a href="#cb23-3" aria-hidden="true" tabindex="-1"></a> SYSTEM_MESSAGE <span class="op">=</span> dedent(<span class="st">"""</span></span>
2 changes: 1 addition & 1 deletion vscode.html
@@ -913,7 +913,7 @@ <h2 class="anchored" data-anchor-id="troubleshooting">Troubleshooting</h2>
</div>
</div>
</footer>
- <script>var lightboxQuarto = GLightbox({"openEffect":"zoom","closeEffect":"zoom","loop":false,"selector":".lightbox","descPosition":"bottom"});
+ <script>var lightboxQuarto = GLightbox({"openEffect":"zoom","closeEffect":"zoom","descPosition":"bottom","loop":false,"selector":".lightbox"});
(function() {
let previousOnload = window.onload;
window.onload = () => {
2 changes: 1 addition & 1 deletion workflow.html
@@ -1185,7 +1185,7 @@ <h2 class="anchored" data-anchor-id="eval-suites">Eval Suites</h2>
</div>
</div>
</footer>
- <script>var lightboxQuarto = GLightbox({"closeEffect":"zoom","selector":".lightbox","openEffect":"zoom","loop":false,"descPosition":"bottom"});
+ <script>var lightboxQuarto = GLightbox({"closeEffect":"zoom","selector":".lightbox","descPosition":"bottom","loop":false,"openEffect":"zoom"});
(function() {
let previousOnload = window.onload;
window.onload = () => {

