Skip to content

Commit

Permalink
RAG Evaluation Dataset Generation
Browse files Browse the repository at this point in the history
  • Loading branch information
antoninoLorenzo committed Jun 18, 2024
1 parent 065480c commit fc849d3
Showing 1 changed file with 41 additions and 36 deletions.
77 changes: 41 additions & 36 deletions test/benchmarks/rag/dataset_generation.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2024-06-18T10:30:15.884808Z",
"start_time": "2024-06-18T10:30:14.900302Z"
"end_time": "2024-06-18T10:36:25.913447Z",
"start_time": "2024-06-18T10:36:25.025936Z"
}
},
"outputs": [],
Expand Down Expand Up @@ -121,8 +121,8 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-18T10:30:15.889804Z",
"start_time": "2024-06-18T10:30:15.886302Z"
"end_time": "2024-06-18T10:36:25.918935Z",
"start_time": "2024-06-18T10:36:25.914950Z"
}
},
"id": "909ffcf7d895f882",
Expand Down Expand Up @@ -158,8 +158,8 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-18T10:30:15.895802Z",
"start_time": "2024-06-18T10:30:15.890822Z"
"end_time": "2024-06-18T10:36:25.925443Z",
"start_time": "2024-06-18T10:36:25.919940Z"
}
},
"id": "b038c4b3396e7573",
Expand Down Expand Up @@ -192,8 +192,8 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-18T10:30:22.082984Z",
"start_time": "2024-06-18T10:30:17.499483Z"
"end_time": "2024-06-18T10:36:29.411533Z",
"start_time": "2024-06-18T10:36:25.926435Z"
}
},
"id": "28d4062e8f104809",
Expand All @@ -220,8 +220,8 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-18T10:30:22.105485Z",
"start_time": "2024-06-18T10:30:22.084484Z"
"end_time": "2024-06-18T10:36:29.429527Z",
"start_time": "2024-06-18T10:36:29.414030Z"
}
},
"id": "46b9a52d84842463",
Expand All @@ -247,8 +247,8 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-18T10:30:23.604483Z",
"start_time": "2024-06-18T10:30:22.106484Z"
"end_time": "2024-06-18T10:36:30.512028Z",
"start_time": "2024-06-18T10:36:29.430530Z"
}
},
"id": "c83fa403738f990a",
Expand All @@ -272,8 +272,8 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-18T10:30:23.610483Z",
"start_time": "2024-06-18T10:30:23.606482Z"
"end_time": "2024-06-18T10:36:30.517030Z",
"start_time": "2024-06-18T10:36:30.513045Z"
}
},
"id": "d7e3ba9fb757051e",
Expand Down Expand Up @@ -306,8 +306,8 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-18T10:30:23.856484Z",
"start_time": "2024-06-18T10:30:23.853484Z"
"end_time": "2024-06-18T10:36:30.521528Z",
"start_time": "2024-06-18T10:36:30.518028Z"
}
},
"id": "d009b37def9b2214",
Expand All @@ -320,14 +320,14 @@
"name": "stderr",
"output_type": "stream",
"text": [
"Generating q&a: 100%|██████████| 100/100 [19:33<00:00, 11.73s/it]"
"Generating q&a: 100%|██████████| 100/100 [22:26<00:00, 13.46s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"JSON Decode Errors: 24\n"
"JSON Decode Errors: 20\n"
]
},
{
Expand Down Expand Up @@ -389,12 +389,12 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-18T10:09:26.081429Z",
"start_time": "2024-06-18T09:49:52.814927Z"
"end_time": "2024-06-18T10:58:56.648274Z",
"start_time": "2024-06-18T10:36:30.522529Z"
}
},
"id": "5cbb2eca96cca287",
"execution_count": 55
"execution_count": 9
},
{
"cell_type": "code",
Expand All @@ -421,22 +421,22 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-18T10:21:09.572631Z",
"start_time": "2024-06-18T10:21:09.567631Z"
"end_time": "2024-06-18T10:58:56.664276Z",
"start_time": "2024-06-18T10:58:56.652277Z"
}
},
"id": "ea61b759d020bbce",
"execution_count": 78
"execution_count": 10
},
{
"cell_type": "code",
"outputs": [
{
"data": {
"text/plain": " question \\\n0 How can a security professional choose appropr... \n1 Is any data transmitted in clear text, and are... \n2 What is the SQL injection vulnerability in the... \n3 What security measures are implemented to prev... \n4 What are the key security requirements to be c... \n.. ... \n95 What is the vulnerability category that is not... \n96 How can an attacker gain access to a user's au... \n97 What was the position of security logging and ... \n98 What are some prohibited attack scenarios rela... \n99 What are some example exploitable component vu... \n\n ground_truth \n0 Choose a CSPRNG-based initialization vector fo... \n1 The context does not provide any information a... \n2 The SQL injection vulnerability in the given s... \n3 The context does not provide sufficient inform... \n4 Secure design is a crucial phase in applicatio... \n.. ... \n95 The vulnerability category that is not include... \n96 The attacker can gain access to the user's aut... \n97 The position of security logging and monitorin... \n98 Scenario #1: A credential recovery workflow mi... \n99 CVE-2017-5638, Heartbleed vulnerability, Strut... \n\n[100 rows x 2 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>question</th>\n <th>ground_truth</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>How can a security professional choose appropr...</td>\n <td>Choose a CSPRNG-based initialization vector fo...</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Is any data transmitted in clear text, and are...</td>\n <td>The context does not provide any information a...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>What is the SQL injection vulnerability in the...</td>\n <td>The SQL injection vulnerability in the given s...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>What security measures are implemented to prev...</td>\n <td>The context does not provide sufficient inform...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>What are the key security requirements to be c...</td>\n <td>Secure design is a crucial phase in applicatio...</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>95</th>\n <td>What is the vulnerability category that is not...</td>\n <td>The vulnerability category that is not include...</td>\n </tr>\n <tr>\n <th>96</th>\n <td>How can an attacker gain access to a user's au...</td>\n <td>The attacker can gain access to the user's aut...</td>\n </tr>\n <tr>\n <th>97</th>\n <td>What was the position of security logging and ...</td>\n <td>The position of security logging and monitorin...</td>\n </tr>\n <tr>\n <th>98</th>\n <td>What are some prohibited attack scenarios rela...</td>\n <td>Scenario #1: A credential recovery workflow mi...</td>\n </tr>\n <tr>\n <th>99</th>\n <td>What are some example exploitable component vu...</td>\n <td>CVE-2017-5638, Heartbleed vulnerability, Strut...</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 2 columns</p>\n</div>"
"text/plain": " question \\\n0 What are the most common vulnerabilities found... \n1 How can a patch management process be implemen... \n2 What are some example exploitable component vu... \n3 What are some common authentication weaknesses... \n4 How can digital signatures or similar mechanis... \n.. ... \n95 How can an attacker take over an application b... \n96 What security vulnerabilities were exploited b... \n97 Are deprecated cryptographic padding methods s... \n98 What are some key concepts related to secure d... \n99 What is an example of a prohibited activity re... \n\n ground_truth \n0 According to the OWASP Cheat Sheet, the most c... \n1 A patch management process can be implemented ... \n2 Some example exploitable component vulnerabili... \n3 CWE-297: Improper Validation of Certificate w... \n4 Implement digital signatures or similar mechan... \n.. ... \n95 The context does not provide sufficient inform... \n96 The context does not provide information about... \n97 The context does not provide information about... \n98 Insecure design encompasses various weaknesses... \n99 The inclusion of 'questions and answers' in th... \n\n[100 rows x 2 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>question</th>\n <th>ground_truth</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>What are the most common vulnerabilities found...</td>\n <td>According to the OWASP Cheat Sheet, the most c...</td>\n </tr>\n <tr>\n <th>1</th>\n <td>How can a patch management process be implemen...</td>\n <td>A patch management process can be implemented ...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>What are some example exploitable component vu...</td>\n <td>Some example exploitable component vulnerabili...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>What are some common authentication weaknesses...</td>\n <td>CWE-297: Improper Validation of Certificate w...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>How can digital signatures or similar mechanis...</td>\n <td>Implement digital signatures or similar mechan...</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>95</th>\n <td>How can an attacker take over an application b...</td>\n <td>The context does not provide sufficient inform...</td>\n </tr>\n <tr>\n <th>96</th>\n <td>What security vulnerabilities were exploited b...</td>\n <td>The context does not provide information about...</td>\n </tr>\n <tr>\n <th>97</th>\n <td>Are deprecated cryptographic padding methods s...</td>\n <td>The context does not provide information about...</td>\n </tr>\n <tr>\n <th>98</th>\n <td>What are some key concepts related to secure d...</td>\n <td>Insecure design encompasses various weaknesses...</td>\n </tr>\n <tr>\n <th>99</th>\n <td>What is an example of a prohibited activity re...</td>\n <td>The inclusion of 'questions and answers' in th...</td>\n </tr>\n </tbody>\n</table>\n<p>100 rows × 2 columns</p>\n</div>"
},
"execution_count": 81,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -448,37 +448,42 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-18T10:22:09.582552Z",
"start_time": "2024-06-18T10:22:09.573049Z"
"end_time": "2024-06-18T10:58:56.694773Z",
"start_time": "2024-06-18T10:58:56.667780Z"
}
},
"id": "510b48619d5f48f8",
"execution_count": 81
"execution_count": 11
},
{
"cell_type": "code",
"outputs": [],
"source": [
"output.to_json('../../../data/rag_eval/owasp_100.json')"
"output.to_json('../../../data/rag_eval/owasp_100-200.json')"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-18T10:23:31.320884Z",
"start_time": "2024-06-18T10:23:31.316374Z"
"end_time": "2024-06-18T10:58:56.710283Z",
"start_time": "2024-06-18T10:58:56.698774Z"
}
},
"id": "ce5a1b24736c23bc",
"execution_count": 82
"execution_count": 12
},
{
"cell_type": "code",
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-06-18T10:58:56.720777Z",
"start_time": "2024-06-18T10:58:56.715275Z"
}
},
"id": "2e7aed9db6484592"
"id": "2e7aed9db6484592",
"execution_count": 12
}
],
"metadata": {
Expand Down

0 comments on commit fc849d3

Please sign in to comment.