From fc849d3829668eaf39020de493d5df4f9da0c700 Mon Sep 17 00:00:00 2001 From: Antonino Lorenzo <94693967+antoninoLorenzo@users.noreply.github.com> Date: Tue, 18 Jun 2024 13:07:25 +0200 Subject: [PATCH] RAG Evaluation Dataset Generation --- test/benchmarks/rag/dataset_generation.ipynb | 77 +++++++++++--------- 1 file changed, 41 insertions(+), 36 deletions(-) diff --git a/test/benchmarks/rag/dataset_generation.ipynb b/test/benchmarks/rag/dataset_generation.ipynb index aab4a07..4298c2d 100644 --- a/test/benchmarks/rag/dataset_generation.ipynb +++ b/test/benchmarks/rag/dataset_generation.ipynb @@ -64,8 +64,8 @@ "metadata": { "collapsed": true, "ExecuteTime": { - "end_time": "2024-06-18T10:30:15.884808Z", - "start_time": "2024-06-18T10:30:14.900302Z" + "end_time": "2024-06-18T10:36:25.913447Z", + "start_time": "2024-06-18T10:36:25.025936Z" } }, "outputs": [], @@ -121,8 +121,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-18T10:30:15.889804Z", - "start_time": "2024-06-18T10:30:15.886302Z" + "end_time": "2024-06-18T10:36:25.918935Z", + "start_time": "2024-06-18T10:36:25.914950Z" } }, "id": "909ffcf7d895f882", @@ -158,8 +158,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-18T10:30:15.895802Z", - "start_time": "2024-06-18T10:30:15.890822Z" + "end_time": "2024-06-18T10:36:25.925443Z", + "start_time": "2024-06-18T10:36:25.919940Z" } }, "id": "b038c4b3396e7573", @@ -192,8 +192,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-18T10:30:22.082984Z", - "start_time": "2024-06-18T10:30:17.499483Z" + "end_time": "2024-06-18T10:36:29.411533Z", + "start_time": "2024-06-18T10:36:25.926435Z" } }, "id": "28d4062e8f104809", @@ -220,8 +220,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-18T10:30:22.105485Z", - "start_time": "2024-06-18T10:30:22.084484Z" + "end_time": "2024-06-18T10:36:29.429527Z", + "start_time": "2024-06-18T10:36:29.414030Z" } }, "id": "46b9a52d84842463", @@ -247,8 +247,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-18T10:30:23.604483Z", - "start_time": "2024-06-18T10:30:22.106484Z" + "end_time": "2024-06-18T10:36:30.512028Z", + "start_time": "2024-06-18T10:36:29.430530Z" } }, "id": "c83fa403738f990a", @@ -272,8 +272,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-18T10:30:23.610483Z", - "start_time": "2024-06-18T10:30:23.606482Z" + "end_time": "2024-06-18T10:36:30.517030Z", + "start_time": "2024-06-18T10:36:30.513045Z" } }, "id": "d7e3ba9fb757051e", @@ -306,8 +306,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-18T10:30:23.856484Z", - "start_time": "2024-06-18T10:30:23.853484Z" + "end_time": "2024-06-18T10:36:30.521528Z", + "start_time": "2024-06-18T10:36:30.518028Z" } }, "id": "d009b37def9b2214", @@ -320,14 +320,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Generating q&a: 100%|██████████| 100/100 [19:33<00:00, 11.73s/it]" + "Generating q&a: 100%|██████████| 100/100 [22:26<00:00, 13.46s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "JSON Decode Errors: 24\n" + "JSON Decode Errors: 20\n" ] }, { @@ -389,12 +389,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-18T10:09:26.081429Z", - "start_time": "2024-06-18T09:49:52.814927Z" + "end_time": "2024-06-18T10:58:56.648274Z", + "start_time": "2024-06-18T10:36:30.522529Z" } }, "id": "5cbb2eca96cca287", - "execution_count": 55 + "execution_count": 9 }, { "cell_type": "code", @@ -421,22 +421,22 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-18T10:21:09.572631Z", - "start_time": "2024-06-18T10:21:09.567631Z" + "end_time": "2024-06-18T10:58:56.664276Z", + "start_time": "2024-06-18T10:58:56.652277Z" } }, "id": "ea61b759d020bbce", - "execution_count": 78 + "execution_count": 10 }, { "cell_type": "code", "outputs": [ { "data": { - "text/plain": " question \\\n0 How can a security professional choose appropr... \n1 Is any data transmitted in clear text, and are... \n2 What is the SQL injection vulnerability in the... \n3 What security measures are implemented to prev... \n4 What are the key security requirements to be c... \n.. ... \n95 What is the vulnerability category that is not... \n96 How can an attacker gain access to a user's au... \n97 What was the position of security logging and ... \n98 What are some prohibited attack scenarios rela... \n99 What are some example exploitable component vu... \n\n ground_truth \n0 Choose a CSPRNG-based initialization vector fo... \n1 The context does not provide any information a... \n2 The SQL injection vulnerability in the given s... \n3 The context does not provide sufficient inform... \n4 Secure design is a crucial phase in applicatio... \n.. ... \n95 The vulnerability category that is not include... \n96 The attacker can gain access to the user's aut... \n97 The position of security logging and monitorin... \n98 Scenario #1: A credential recovery workflow mi... \n99 CVE-2017-5638, Heartbleed vulnerability, Strut... \n\n[100 rows x 2 columns]", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
questionground_truth
0How can a security professional choose appropr...Choose a CSPRNG-based initialization vector fo...
1Is any data transmitted in clear text, and are...The context does not provide any information a...
2What is the SQL injection vulnerability in the...The SQL injection vulnerability in the given s...
3What security measures are implemented to prev...The context does not provide sufficient inform...
4What are the key security requirements to be c...Secure design is a crucial phase in applicatio...
.........
95What is the vulnerability category that is not...The vulnerability category that is not include...
96How can an attacker gain access to a user's au...The attacker can gain access to the user's aut...
97What was the position of security logging and ...The position of security logging and monitorin...
98What are some prohibited attack scenarios rela...Scenario #1: A credential recovery workflow mi...
99What are some example exploitable component vu...CVE-2017-5638, Heartbleed vulnerability, Strut...
\n

100 rows × 2 columns

\n
" + "text/plain": " question \\\n0 What are the most common vulnerabilities found... \n1 How can a patch management process be implemen... \n2 What are some example exploitable component vu... \n3 What are some common authentication weaknesses... \n4 How can digital signatures or similar mechanis... \n.. ... \n95 How can an attacker take over an application b... \n96 What security vulnerabilities were exploited b... \n97 Are deprecated cryptographic padding methods s... \n98 What are some key concepts related to secure d... \n99 What is an example of a prohibited activity re... \n\n ground_truth \n0 According to the OWASP Cheat Sheet, the most c... \n1 A patch management process can be implemented ... \n2 Some example exploitable component vulnerabili... \n3 CWE-297: Improper Validation of Certificate w... \n4 Implement digital signatures or similar mechan... \n.. ... \n95 The context does not provide sufficient inform... \n96 The context does not provide information about... \n97 The context does not provide information about... \n98 Insecure design encompasses various weaknesses... \n99 The inclusion of 'questions and answers' in th... \n\n[100 rows x 2 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
questionground_truth
0What are the most common vulnerabilities found...According to the OWASP Cheat Sheet, the most c...
1How can a patch management process be implemen...A patch management process can be implemented ...
2What are some example exploitable component vu...Some example exploitable component vulnerabili...
3What are some common authentication weaknesses...CWE-297: Improper Validation of Certificate w...
4How can digital signatures or similar mechanis...Implement digital signatures or similar mechan...
.........
95How can an attacker take over an application b...The context does not provide sufficient inform...
96What security vulnerabilities were exploited b...The context does not provide information about...
97Are deprecated cryptographic padding methods s...The context does not provide information about...
98What are some key concepts related to secure d...Insecure design encompasses various weaknesses...
99What is an example of a prohibited activity re...The inclusion of 'questions and answers' in th...
\n

100 rows × 2 columns

\n
" }, - "execution_count": 81, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -448,37 +448,42 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-18T10:22:09.582552Z", - "start_time": "2024-06-18T10:22:09.573049Z" + "end_time": "2024-06-18T10:58:56.694773Z", + "start_time": "2024-06-18T10:58:56.667780Z" } }, "id": "510b48619d5f48f8", - "execution_count": 81 + "execution_count": 11 }, { "cell_type": "code", "outputs": [], "source": [ - "output.to_json('../../../data/rag_eval/owasp_100.json')" + "output.to_json('../../../data/rag_eval/owasp_100-200.json')" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-18T10:23:31.320884Z", - "start_time": "2024-06-18T10:23:31.316374Z" + "end_time": "2024-06-18T10:58:56.710283Z", + "start_time": "2024-06-18T10:58:56.698774Z" } }, "id": "ce5a1b24736c23bc", - "execution_count": 82 + "execution_count": 12 }, { "cell_type": "code", "outputs": [], "source": [], "metadata": { - "collapsed": false + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-06-18T10:58:56.720777Z", + "start_time": "2024-06-18T10:58:56.715275Z" + } }, - "id": "2e7aed9db6484592" + "id": "2e7aed9db6484592", + "execution_count": 12 } ], "metadata": {