This repository has been archived by the owner on Feb 15, 2025. It is now read-only.
feat: 1049 add standard evaluation benchmarks to lfai evals (#1078)
* refactor eval structure
* add MMLU benchmark
* add HumanEval benchmarks
* add DeepEval compatible LLM class using LFAI
* update evals README
* upgrade DeepEval to v1.3.0
Showing 16 changed files with 334 additions and 115 deletions.
@@ -0,0 +1,7 @@
# __init__.py
# ruff: noqa: F401

from leapfrogai_evals.evals.human_eval import human_eval
from leapfrogai_evals.evals.mmlu import mmlu
from leapfrogai_evals.evals.niah_eval import niah_eval
from leapfrogai_evals.evals.qa_eval import qa_eval
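The re-exports above give a top-level runner a single import point for every evaluation. A minimal sketch of how that might be used; the runner loop below is illustrative, not part of this commit:

# Illustrative only: merge the result dictionaries returned by each evaluation
from leapfrogai_evals.evals import human_eval, mmlu, niah_eval, qa_eval

all_results = {}
for evaluation in (mmlu, human_eval, niah_eval, qa_eval):
    all_results.update(evaluation())
print(all_results)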
@@ -0,0 +1,50 @@
import logging
import numpy as np
import os

from deepeval.benchmarks import HumanEval
from deepeval.benchmarks.tasks import HumanEvalTask
from tqdm import tqdm
from typing import Optional

from leapfrogai_evals.models import LFAI_Model


def human_eval(
    num_samples: Optional[int] = None,
    k: Optional[int] = None,
    num_tasks: Optional[int] = None,
) -> dict:
    """Runs the HumanEval benchmark on a subset of tasks"""
    eval_results = dict()
    task_scores = dict()
    num_tasks = num_tasks or int(
        os.getenv("HUMAN_EVAL_NUM_TASKS", default=len(list(HumanEvalTask)))
    )
    logging.info(f"Running the HumanEval benchmark on {num_tasks} tasks")
    failed_tasks = 0
    for task in tqdm(list(HumanEvalTask)[:num_tasks]):
        task_benchmark = HumanEval(
            n=num_samples or int(os.getenv("HUMAN_EVAL_NUM_SAMPLES_PER_TASK")),
            tasks=[task],
        )
        try:
            task_benchmark.evaluate(
                model=LFAI_Model(), k=k or int(os.getenv("HUMAN_EVAL_K"))
            )
            task_scores[task.name] = task_benchmark.overall_score
        except Exception as exc:
            logging.info(
                f"HumanEval task {task.name} failed with error {exc}", exc_info=exc
            )
            task_scores[task.name] = 0.0
            failed_tasks += 1

    human_eval_avg_score = np.mean(list(task_scores.values()))
    logging.info(f"HumanEval overall score: {human_eval_avg_score}")
    logging.info(f"HumanEval failed task count: {failed_tasks}")
    logging.info(f"HumanEval task scores:\n {task_scores}")

    # add the evaluation score to the final results
    eval_results["HumanEval"] = human_eval_avg_score
    return eval_results
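Both this benchmark and the MMLU benchmark below hand model=LFAI_Model() to DeepEval, matching the commit note about a "DeepEval compatible LLM class using LFAI". That class is not shown in this excerpt; the sketch below is only an assumption of what such a wrapper generally looks like, based on DeepEval's documented custom-model interface (DeepEvalBaseLLM with load_model, generate, a_generate, and get_model_name). The endpoint URL, API key handling, and backend name are all placeholders.

# Illustrative sketch only; not the actual LFAI_Model added in this commit.
from deepeval.models.base_model import DeepEvalBaseLLM
from openai import OpenAI  # assumes an OpenAI-compatible LeapfrogAI API endpoint


class ExampleLFAIModel(DeepEvalBaseLLM):
    def __init__(self, base_url: str = "https://leapfrogai.example/openai/v1", api_key: str = "changeme"):
        # placeholder endpoint and credentials
        self.client = OpenAI(base_url=base_url, api_key=api_key)
        self.model_name = "vllm"  # placeholder backend name

    def load_model(self):
        return self.client

    def generate(self, prompt: str) -> str:
        # single-turn chat completion against the wrapped backend
        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content

    async def a_generate(self, prompt: str) -> str:
        # DeepEval calls this during async evaluation; a simple sync fallback is shown
        return self.generate(prompt)

    def get_model_name(self) -> str:
        return "LeapfrogAI (example wrapper)"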
@@ -0,0 +1,28 @@
import logging
import os

from deepeval.benchmarks import MMLU
from deepeval.benchmarks.tasks import MMLUTask
from typing import Optional

from leapfrogai_evals.models import LFAI_Model


def mmlu(num_tasks: Optional[int] = None, n_shots: Optional[int] = None) -> dict:
    """Runs the Massive Multitask Language Understanding (MMLU) benchmark on a subset of tasks"""
    eval_results = dict()
    num_tasks = num_tasks or int(
        os.getenv("MMLU_NUM_TASKS", default=len(list(MMLUTask)))
    )
    logging.info(f"Running the MMLU benchmark on {num_tasks} tasks")
    tasks = list(MMLUTask)[:num_tasks]
    mmlu_benchmark = MMLU(
        tasks=tasks, n_shots=n_shots or int(os.getenv("MMLU_NUM_SHOTS"))
    )
    mmlu_benchmark.evaluate(model=LFAI_Model())
    logging.info(f"MMLU overall score: {mmlu_benchmark.overall_score}")
    logging.info(f"MMLU task scores:\n {mmlu_benchmark.task_scores}")

    # add the evaluation score to the final results
    eval_results["MMLU"] = mmlu_benchmark.overall_score
    return eval_results
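A hedged sketch of a quick, partial MMLU run. The env var name comes from the function above, while the shot count and the two-task cutoff are illustrative values:

# Illustrative only: run MMLU on the first two tasks with 5-shot prompting
import os

os.environ.setdefault("MMLU_NUM_SHOTS", "5")  # env var read by mmlu() above; value is illustrative

from leapfrogai_evals.evals import mmlu

results = mmlu(num_tasks=2)
print(results)  # {"MMLU": <overall score on the attempted tasks>}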
@@ -0,0 +1,53 @@
import logging
import numpy as np

from deepeval.test_case import LLMTestCase

from leapfrogai_evals.metrics import NIAH_Retrieval, NIAH_Response
from leapfrogai_evals.runners import NIAH_Runner


def niah_eval(*args, **kwargs) -> dict:
    """Run the Needle in a Haystack evaluation"""
    logging.info("Beginning Needle in a Haystack Evaluation...")
    eval_results = dict()
    niah_test_cases = []

    niah_runner = NIAH_Runner(*args, **kwargs)
    niah_runner.run_experiment()

    # build test cases out of the niah_dataset
    for row in niah_runner.niah_data:
        niah_test_cases.append(
            LLMTestCase(
                input=niah_runner.message_prompt,
                actual_output=row["response"],
                context=[row["context"]],
                additional_metadata={
                    "retrieval_score": row["retrieval_score"],
                    "response_score": row["response_score"],
                },
            )
        )

    # run metrics
    # TODO: Give ability to choose which metrics to run
    retrieval_metric = NIAH_Retrieval()
    response_metric = NIAH_Response()
    metrics = [retrieval_metric, response_metric]

    # record scores and return results
    for metric in metrics:
        scores = []
        successes = []
        for test_case in niah_test_cases:
            metric.measure(test_case)
            scores.append(metric.score)
            successes.append(metric.is_successful())
        eval_results[f"Average {metric.__name__}"] = np.mean(scores)
        logging.info(f"{metric.__name__} Results:")
        logging.info(f"average score: {np.mean(scores)}")
        logging.info(f"scores: {scores}")
        logging.info(f"successes: {successes}")

    return eval_results
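A sketch of how this evaluation might be invoked on its own. Any arguments are forwarded untouched to NIAH_Runner, whose signature is not part of this excerpt, so the bare call below assumes the runner has usable defaults:

# Illustrative only; runner configuration is forwarded via *args/**kwargs
from leapfrogai_evals.evals import niah_eval

results = niah_eval()
print(results)  # one "Average <metric.__name__>" entry per NIAH metric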
@@ -0,0 +1,72 @@
import logging
import numpy as np
import os

from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

from leapfrogai_evals.metrics import AnnotationRelevancyMetric, CorrectnessMetric
from leapfrogai_evals.models import *  # noqa (imports all models)
from leapfrogai_evals.runners import QA_Runner


def qa_eval(*args, **kwargs) -> dict:
    """Runs the Question/Answer evaluation"""
    logging.info("Beginning Question/Answer Evaluation...")
    eval_results = dict()
    qa_test_cases = []

    qa_runner = QA_Runner(*args, **kwargs)
    qa_runner.run_experiment()

    # build test cases out of the qa_dataset
    for row in qa_runner.qa_data:
        qa_test_cases.append(
            LLMTestCase(
                input=row["input"],
                actual_output=row["actual_output"],
                context=row["context"],
                expected_output=row["expected_output"],
                additional_metadata={
                    "actual_annotations": row["actual_annotations"],
                    "expected_annotations": row["expected_annotations"],
                },
                # retrieval_context = row['retrieval_context']  # TODO: add this for more metrics
            )
        )

    # Create judge llm
    try:
        judge_model = globals()[os.environ.get("LLM_JUDGE")]()
    except KeyError:
        judge_model = os.environ.get("LLM_JUDGE")

    # run metrics
    # TODO: Give ability to choose which metrics to run
    correctness_metric = CorrectnessMetric(model=judge_model)
    answer_relevancy_metric = AnswerRelevancyMetric(model=judge_model)
    annotation_relevancy_metric = AnnotationRelevancyMetric()
    metrics = [
        correctness_metric,
        answer_relevancy_metric,
        annotation_relevancy_metric,
    ]

    # record scores and return results
    for metric in metrics:
        scores = []
        successes = []
        reasons = []
        for test_case in qa_test_cases:
            metric.measure(test_case)
            scores.append(metric.score)
            successes.append(metric.is_successful())
            reasons.append(metric.reason)
        eval_results[f"Average {metric.__name__}"] = np.mean(scores)
        logging.info(f"{metric.__name__} Results:")
        logging.info(f"average score: {np.mean(scores)}")
        logging.info(f"scores: {scores}")
        logging.info(f"successes: {successes}")
        logging.info(f"reasons: {reasons}")

    return eval_results
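The judge lookup above instantiates whichever class named by LLM_JUDGE was pulled in through the wildcard model import, and otherwise passes the raw string through to the DeepEval metrics. A hedged configuration sketch; the judge value shown is a placeholder, not from this commit:

# Illustrative only: point qa_eval at a judge model via the LLM_JUDGE env var
import os

os.environ["LLM_JUDGE"] = "ExampleJudgeModel"  # placeholder class or model name

from leapfrogai_evals.evals import qa_eval

print(qa_eval())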
Empty file.
@@ -0,0 +1,6 @@
# __init__.py
# ruff: noqa: F401

from leapfrogai_evals.metrics.annotation_relevancy import AnnotationRelevancyMetric
from leapfrogai_evals.metrics.correctness import CorrectnessMetric
from leapfrogai_evals.metrics.niah_metrics import NIAH_Response, NIAH_Retrieval
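The metric classes re-exported here (and used by niah_eval and qa_eval above) are not part of this excerpt. For orientation, the sketch below shows DeepEval's custom-metric pattern: a BaseMetric subclass with measure, a_measure, is_successful, and a __name__ property, which is what lets the eval loops read metric.score and label results with metric.__name__. The class name and scoring rule here are placeholders, not the actual NIAH or annotation metrics:

# Illustrative sketch of a DeepEval custom metric, not the real NIAH_Retrieval class
from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase


class ExampleRetrievalMetric(BaseMetric):
    def __init__(self, threshold: float = 1.0):
        self.threshold = threshold

    def measure(self, test_case: LLMTestCase) -> float:
        # the NIAH runner stores per-row scores in additional_metadata (see niah_eval above)
        self.score = test_case.additional_metadata["retrieval_score"]
        self.success = self.score >= self.threshold
        return self.score

    async def a_measure(self, test_case: LLMTestCase) -> float:
        return self.measure(test_case)

    def is_successful(self) -> bool:
        return self.success

    @property
    def __name__(self):
        return "Example Retrieval"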