From 58347b93914d03dacc5626c8832b66a8a3c6132c Mon Sep 17 00:00:00 2001 From: Giovanni Gatti Pinheiro Date: Thu, 25 Jul 2024 14:35:35 +0200 Subject: [PATCH] fixing image rendering on pages --- docs/_posts/2024-7-25-socratic-llm.md | 6 +- docs/images/training-pipeline.svg | 1159 ++++++++++++++++++------- 2 files changed, 867 insertions(+), 298 deletions(-) diff --git a/docs/_posts/2024-7-25-socratic-llm.md b/docs/_posts/2024-7-25-socratic-llm.md index 4daa329..2d808fe 100644 --- a/docs/_posts/2024-7-25-socratic-llm.md +++ b/docs/_posts/2024-7-25-socratic-llm.md @@ -54,15 +54,15 @@ We fine-tuned three models over three different datasets ([Debugging](https://ar We observe that the model trained on TutorChat is the most performing, yielding good performance on all three datasets. Notably, the TutorChat-trained model surpasses the models trained on MathDial and Debugging when evaluated on their respective test sets, albeit by a small margin. Such an effect is likely due to the preference dataset of TutorChat, which indicates a higher data diversity than the MathDial and Debugging datasets. -![_config.yml]({{ site.baseurl }}/images/table.png) +![_config.yml]({{ site.baseurl }}/images/table.png){:style="width: 430px; display:block; margin-left: auto; margin-right: auto;"} Below, we present the mean summary scores over the 100 samples for the TutorChat fine-tuned model and the base model using only prompt engineering. We add GPT-4o's performance with only prompt engineering to provide a reference of the best possible performance with prompt engineering-only strategies. The fine-tuned model improved significantly over the base model, reaching close performance to a much larger and more powerful GPT-4o in all datasets. -![_config.yml]({{ site.baseurl }}/images/perf-across-datasets.svg) +![_config.yml]({{ site.baseurl }}/images/perf-across-datasets.svg){:style="width: 430px; display:block; margin-left: auto; margin-right: auto;"} The TutorChat-trained model (our best model) showed significant gains in three key areas and now performs almost as well as GPT-4o. This also shows the model's strong generalization ability, as it was trained on TutorChat data but excelled on the different MathDial datasets. -![_config.yml]({{ site.baseurl }}/images/performance-breakdown.svg) +![_config.yml]({{ site.baseurl }}/images/performance-breakdown.svg){:style="width: 430px; display:block; margin-left: auto; margin-right: auto;"} It also showed significant gains in three areas, nearing GPT-4o performance, and demonstrated strong generalization by excelling on a dataset different from its training data. diff --git a/docs/images/training-pipeline.svg b/docs/images/training-pipeline.svg index f8aa6a3..8554ce8 100644 --- a/docs/images/training-pipeline.svg +++ b/docs/images/training-pipeline.svg @@ -7,7 +7,7 @@ viewBox="0 0 210 297" version="1.1" id="svg404" - inkscape:version="1.2.2 (732a01da63, 2022-12-09)" + inkscape:version="1.2.2 (b0a8486541, 2022-12-01)" sodipodi:docname="training-pipeline.svg" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" @@ -25,12 +25,12 @@ inkscape:document-units="mm" showgrid="false" inkscape:zoom="0.97638867" - inkscape:cx="291.37987" - inkscape:cy="133.65579" + inkscape:cx="290.86778" + inkscape:cy="133.1437" inkscape:window-width="1920" - inkscape:window-height="991" - inkscape:window-x="-9" - inkscape:window-y="-9" + inkscape:window-height="941" + inkscape:window-x="0" + inkscape:window-y="27" inkscape:window-maximized="1" inkscape:current-layer="g13044" /> - GPT4o + style="font-size:1173.22px;-inkscape-font-specification:sans-serif;text-align:justify;stroke-width:126.805"> + + + + + + @@ -484,64 +494,94 @@ transform="matrix(0.84420758,0,0,-1.1845428,42.258096,-623.57364)" id="text3462-7" style="font-size:2395.25px;-inkscape-font-specification:sans-serif;text-align:center;text-anchor:middle;stroke-width:319.116;stroke-linejoin:round"> - E + + + - ... + style="font-size:1520.79px;-inkscape-font-specification:sans-serif;text-align:center;text-anchor:middle;stroke-width:113.796"> + + + + + + + + + + + + + + + + + + + + + + + + + - A - Answer A - Answer E - E + + + - ... + style="font-size:1520.79px;-inkscape-font-specification:sans-serif;text-align:center;text-anchor:middle;stroke-width:113.796"> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - A - Evaluated answer A - Evaluated answer E - - + Instructions: -Respond to students according to the Socratic principles - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Instructions: -Assess professor's answer on the following aspects + style="font-weight:bold;font-size:10.6667px;-inkscape-font-specification:'sans-serif Bold';text-align:center;white-space:pre;shape-inside:url(#rect2638-7);display:inline;stroke-width:1.297;stroke-linejoin:round"> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - 0.3 - + + + + + + - 0.7 + style="fill:#ffffff;stroke:#d45500;stroke-width:0.424308" + d="m 139.1002,44.448662 a 2.7756941,2.7756941 0 0 1 -2.77569,2.775694 2.7756941,2.7756941 0 0 1 -2.7757,-2.775694 2.7756941,2.7756941 0 0 1 2.7757,-2.775694 2.7756941,2.7756941 0 0 1 2.77569,2.775694 z" /> + + + + + - Ranking - + + + + + + + + + - B=C>E>A + style="fill:none;stroke:#000000;stroke-width:1.12334" + d="m 186.03481,17.29706 h 6.0948 c 1.2373,0 2.23339,0.996093 2.23339,2.233392 v 41.609891 c 0,1.237299 -0.99609,2.233392 -2.23339,2.233392 h -6.0948 c -1.23729,0 -2.23339,-0.996093 -2.23339,-2.233392 V 19.530452 c 0,-1.237299 0.9961,-2.233392 2.23339,-2.233392 z" /> + + + + + + + + + Data generation + id="tspan1589">Data generation pipeline + id="tspan1593">pipeline Fine-tuning + id="tspan1597">Fine-tuning