From fc849d3829668eaf39020de493d5df4f9da0c700 Mon Sep 17 00:00:00 2001 From: Antonino Lorenzo <94693967+antoninoLorenzo@users.noreply.github.com> Date: Tue, 18 Jun 2024 13:07:25 +0200 Subject: [PATCH] RAG Evaluation Dataset Generation --- test/benchmarks/rag/dataset_generation.ipynb | 77 +++++++++++--------- 1 file changed, 41 insertions(+), 36 deletions(-) diff --git a/test/benchmarks/rag/dataset_generation.ipynb b/test/benchmarks/rag/dataset_generation.ipynb index aab4a07..4298c2d 100644 --- a/test/benchmarks/rag/dataset_generation.ipynb +++ b/test/benchmarks/rag/dataset_generation.ipynb @@ -64,8 +64,8 @@ "metadata": { "collapsed": true, "ExecuteTime": { - "end_time": "2024-06-18T10:30:15.884808Z", - "start_time": "2024-06-18T10:30:14.900302Z" + "end_time": "2024-06-18T10:36:25.913447Z", + "start_time": "2024-06-18T10:36:25.025936Z" } }, "outputs": [], @@ -121,8 +121,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-18T10:30:15.889804Z", - "start_time": "2024-06-18T10:30:15.886302Z" + "end_time": "2024-06-18T10:36:25.918935Z", + "start_time": "2024-06-18T10:36:25.914950Z" } }, "id": "909ffcf7d895f882", @@ -158,8 +158,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-18T10:30:15.895802Z", - "start_time": "2024-06-18T10:30:15.890822Z" + "end_time": "2024-06-18T10:36:25.925443Z", + "start_time": "2024-06-18T10:36:25.919940Z" } }, "id": "b038c4b3396e7573", @@ -192,8 +192,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-18T10:30:22.082984Z", - "start_time": "2024-06-18T10:30:17.499483Z" + "end_time": "2024-06-18T10:36:29.411533Z", + "start_time": "2024-06-18T10:36:25.926435Z" } }, "id": "28d4062e8f104809", @@ -220,8 +220,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-18T10:30:22.105485Z", - "start_time": "2024-06-18T10:30:22.084484Z" + "end_time": "2024-06-18T10:36:29.429527Z", + "start_time": "2024-06-18T10:36:29.414030Z" } }, "id": "46b9a52d84842463", @@ -247,8 +247,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-18T10:30:23.604483Z", - "start_time": "2024-06-18T10:30:22.106484Z" + "end_time": "2024-06-18T10:36:30.512028Z", + "start_time": "2024-06-18T10:36:29.430530Z" } }, "id": "c83fa403738f990a", @@ -272,8 +272,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-18T10:30:23.610483Z", - "start_time": "2024-06-18T10:30:23.606482Z" + "end_time": "2024-06-18T10:36:30.517030Z", + "start_time": "2024-06-18T10:36:30.513045Z" } }, "id": "d7e3ba9fb757051e", @@ -306,8 +306,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-18T10:30:23.856484Z", - "start_time": "2024-06-18T10:30:23.853484Z" + "end_time": "2024-06-18T10:36:30.521528Z", + "start_time": "2024-06-18T10:36:30.518028Z" } }, "id": "d009b37def9b2214", @@ -320,14 +320,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Generating q&a: 100%|██████████| 100/100 [19:33<00:00, 11.73s/it]" + "Generating q&a: 100%|██████████| 100/100 [22:26<00:00, 13.46s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "JSON Decode Errors: 24\n" + "JSON Decode Errors: 20\n" ] }, { @@ -389,12 +389,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-18T10:09:26.081429Z", - "start_time": "2024-06-18T09:49:52.814927Z" + "end_time": "2024-06-18T10:58:56.648274Z", + "start_time": "2024-06-18T10:36:30.522529Z" } }, "id": "5cbb2eca96cca287", - "execution_count": 55 + "execution_count": 9 }, { "cell_type": "code", @@ -421,22 +421,22 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-18T10:21:09.572631Z", - "start_time": "2024-06-18T10:21:09.567631Z" + "end_time": "2024-06-18T10:58:56.664276Z", + "start_time": "2024-06-18T10:58:56.652277Z" } }, "id": "ea61b759d020bbce", - "execution_count": 78 + "execution_count": 10 }, { "cell_type": "code", "outputs": [ { "data": { - "text/plain": " question \\\n0 How can a security professional choose appropr... \n1 Is any data transmitted in clear text, and are... \n2 What is the SQL injection vulnerability in the... \n3 What security measures are implemented to prev... \n4 What are the key security requirements to be c... \n.. ... \n95 What is the vulnerability category that is not... \n96 How can an attacker gain access to a user's au... \n97 What was the position of security logging and ... \n98 What are some prohibited attack scenarios rela... \n99 What are some example exploitable component vu... \n\n ground_truth \n0 Choose a CSPRNG-based initialization vector fo... \n1 The context does not provide any information a... \n2 The SQL injection vulnerability in the given s... \n3 The context does not provide sufficient inform... \n4 Secure design is a crucial phase in applicatio... \n.. ... \n95 The vulnerability category that is not include... \n96 The attacker can gain access to the user's aut... \n97 The position of security logging and monitorin... \n98 Scenario #1: A credential recovery workflow mi... \n99 CVE-2017-5638, Heartbleed vulnerability, Strut... \n\n[100 rows x 2 columns]", - "text/html": "
\n | question | \nground_truth | \n
---|---|---|
0 | \nHow can a security professional choose appropr... | \nChoose a CSPRNG-based initialization vector fo... | \n
1 | \nIs any data transmitted in clear text, and are... | \nThe context does not provide any information a... | \n
2 | \nWhat is the SQL injection vulnerability in the... | \nThe SQL injection vulnerability in the given s... | \n
3 | \nWhat security measures are implemented to prev... | \nThe context does not provide sufficient inform... | \n
4 | \nWhat are the key security requirements to be c... | \nSecure design is a crucial phase in applicatio... | \n
... | \n... | \n... | \n
95 | \nWhat is the vulnerability category that is not... | \nThe vulnerability category that is not include... | \n
96 | \nHow can an attacker gain access to a user's au... | \nThe attacker can gain access to the user's aut... | \n
97 | \nWhat was the position of security logging and ... | \nThe position of security logging and monitorin... | \n
98 | \nWhat are some prohibited attack scenarios rela... | \nScenario #1: A credential recovery workflow mi... | \n
99 | \nWhat are some example exploitable component vu... | \nCVE-2017-5638, Heartbleed vulnerability, Strut... | \n
100 rows × 2 columns
\n\n | question | \nground_truth | \n
---|---|---|
0 | \nWhat are the most common vulnerabilities found... | \nAccording to the OWASP Cheat Sheet, the most c... | \n
1 | \nHow can a patch management process be implemen... | \nA patch management process can be implemented ... | \n
2 | \nWhat are some example exploitable component vu... | \nSome example exploitable component vulnerabili... | \n
3 | \nWhat are some common authentication weaknesses... | \nCWE-297: Improper Validation of Certificate w... | \n
4 | \nHow can digital signatures or similar mechanis... | \nImplement digital signatures or similar mechan... | \n
... | \n... | \n... | \n
95 | \nHow can an attacker take over an application b... | \nThe context does not provide sufficient inform... | \n
96 | \nWhat security vulnerabilities were exploited b... | \nThe context does not provide information about... | \n
97 | \nAre deprecated cryptographic padding methods s... | \nThe context does not provide information about... | \n
98 | \nWhat are some key concepts related to secure d... | \nInsecure design encompasses various weaknesses... | \n
99 | \nWhat is an example of a prohibited activity re... | \nThe inclusion of 'questions and answers' in th... | \n
100 rows × 2 columns
\n