This repository has been archived by the owner on Feb 15, 2025. It is now read-only.
feat: 1049 add standard evaluation benchmarks to lfai evals (#1078)
* refactor eval structure
* add MMLU benchmark
* add HumanEval benchmarks
* add DeepEval compatible LLM class using LFAI
* update evals README
* upgrade DeepEval to v1.3.0
Showing 16 changed files with 334 additions and 115 deletions.
@@ -0,0 +1,7 @@
# __init__.py
# ruff: noqa: F401

from leapfrogai_evals.evals.human_eval import human_eval
from leapfrogai_evals.evals.mmlu import mmlu
from leapfrogai_evals.evals.niah_eval import niah_eval
from leapfrogai_evals.evals.qa_eval import qa_eval
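The re-exports above give a top-level runner a single import point for every evaluation. A minimal sketch of how that might be used; the runner loop below is illustrative, not part of this commit:

# Illustrative only: merge the result dictionaries returned by each evaluation
from leapfrogai_evals.evals import human_eval, mmlu, niah_eval, qa_eval

all_results = {}
for evaluation in (mmlu, human_eval, niah_eval, qa_eval):
    all_results.update(evaluation())
print(all_results)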
@@ -0,0 +1,50 @@
import logging
import numpy as np
import os

from deepeval.benchmarks import HumanEval
from deepeval.benchmarks.tasks import HumanEvalTask
from tqdm import tqdm
from typing import Optional

from leapfrogai_evals.models import LFAI_Model


def human_eval(
    num_samples: Optional[int] = None,
    k: Optional[int] = None,
    num_tasks: Optional[int] = None,
) -> dict:
    """Runs the HumanEval benchmark on a subset of tasks"""
    eval_results = dict()
    task_scores = dict()
    num_tasks = num_tasks or int(
        os.getenv("HUMAN_EVAL_NUM_TASKS", default=len(list(HumanEvalTask)))
    )
    logging.info(f"Running the HumanEval benchmark on {num_tasks} tasks")
    failed_tasks = 0
    for task in tqdm(list(HumanEvalTask)[:num_tasks]):
        task_benchmark = HumanEval(
            n=num_samples or int(os.getenv("HUMAN_EVAL_NUM_SAMPLES_PER_TASK")),
            tasks=[task],
        )
        try:
            task_benchmark.evaluate(
                model=LFAI_Model(), k=k or int(os.getenv("HUMAN_EVAL_K"))
            )
            task_scores[task.name] = task_benchmark.overall_score
        except Exception as exc:
            logging.info(
                f"HumanEval task {task.name} failed with error {exc}", exc_info=exc
            )
            task_scores[task.name] = 0.0
            failed_tasks += 1

    human_eval_avg_score = np.mean(list(task_scores.values()))
    logging.info(f"HumanEval overall score: {human_eval_avg_score}")
    logging.info(f"HumanEval failed task count: {failed_tasks}")
    logging.info(f"HumanEval task scores:\n {task_scores}")

    # add the evaluation score to the final results
    eval_results["HumanEval"] = human_eval_avg_score
    return eval_results
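Both this benchmark and the MMLU benchmark below hand model=LFAI_Model() to DeepEval, matching the commit note about a "DeepEval compatible LLM class using LFAI". That class is not shown in this excerpt; the sketch below is only an assumption of what such a wrapper generally looks like, based on DeepEval's documented custom-model interface (DeepEvalBaseLLM with load_model, generate, a_generate, and get_model_name). The endpoint URL, API key handling, and backend name are all placeholders.

# Illustrative sketch only; not the actual LFAI_Model added in this commit.
from deepeval.models.base_model import DeepEvalBaseLLM
from openai import OpenAI  # assumes an OpenAI-compatible LeapfrogAI API endpoint


class ExampleLFAIModel(DeepEvalBaseLLM):
    def __init__(self, base_url: str = "https://leapfrogai.example/openai/v1", api_key: str = "changeme"):
        # placeholder endpoint and credentials
        self.client = OpenAI(base_url=base_url, api_key=api_key)
        self.model_name = "vllm"  # placeholder backend name

    def load_model(self):
        return self.client

    def generate(self, prompt: str) -> str:
        # single-turn chat completion against the wrapped backend
        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content

    async def a_generate(self, prompt: str) -> str:
        # DeepEval calls this during async evaluation; a simple sync fallback is shown
        return self.generate(prompt)

    def get_model_name(self) -> str:
        return "LeapfrogAI (example wrapper)"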
@@ -0,0 +1,28 @@
import logging
import os

from deepeval.benchmarks import MMLU
from deepeval.benchmarks.tasks import MMLUTask
from typing import Optional

from leapfrogai_evals.models import LFAI_Model


def mmlu(num_tasks: Optional[int] = None, n_shots: Optional[int] = None) -> dict:
    """Runs the Massive Multitask Language Understanding (MMLU) benchmark on a subset of tasks"""
    eval_results = dict()
    num_tasks = num_tasks or int(
        os.getenv("MMLU_NUM_TASKS", default=len(list(MMLUTask)))
    )
    logging.info(f"Running the MMLU benchmark on {num_tasks} tasks")
    tasks = list(MMLUTask)[:num_tasks]
    mmlu_benchmark = MMLU(
        tasks=tasks, n_shots=n_shots or int(os.getenv("MMLU_NUM_SHOTS"))
    )
    mmlu_benchmark.evaluate(model=LFAI_Model())
    logging.info(f"MMLU overall score: {mmlu_benchmark.overall_score}")
    logging.info(f"MMLU task scores:\n {mmlu_benchmark.task_scores}")

    # add the evaluation score to the final results
    eval_results["MMLU"] = mmlu_benchmark.overall_score
    return eval_results
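A hedged sketch of a quick, partial MMLU run. The env var name comes from the function above, while the shot count and the two-task cutoff are illustrative values:

# Illustrative only: run MMLU on the first two tasks with 5-shot prompting
import os

os.environ.setdefault("MMLU_NUM_SHOTS", "5")  # env var read by mmlu() above; value is illustrative

from leapfrogai_evals.evals import mmlu

results = mmlu(num_tasks=2)
print(results)  # {"MMLU": <overall score on the attempted tasks>}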
@@ -0,0 +1,53 @@
import logging
import numpy as np

from deepeval.test_case import LLMTestCase

from leapfrogai_evals.metrics import NIAH_Retrieval, NIAH_Response
from leapfrogai_evals.runners import NIAH_Runner


def niah_eval(*args, **kwargs) -> dict:
    """Run the Needle in a Haystack evaluation"""
    logging.info("Beginning Needle in a Haystack Evaluation...")
    eval_results = dict()
    niah_test_cases = []

    niah_runner = NIAH_Runner(*args, **kwargs)
    niah_runner.run_experiment()

    # build test cases out of the niah_dataset
    for row in niah_runner.niah_data:
        niah_test_cases.append(
            LLMTestCase(
                input=niah_runner.message_prompt,
                actual_output=row["response"],
                context=[row["context"]],
                additional_metadata={
                    "retrieval_score": row["retrieval_score"],
                    "response_score": row["response_score"],
                },
            )
        )

    # run metrics
    # TODO: Give ability to choose which metrics to run
    retrieval_metric = NIAH_Retrieval()
    response_metric = NIAH_Response()
    metrics = [retrieval_metric, response_metric]

    # record scores and return results
    for metric in metrics:
        scores = []
        successes = []
        for test_case in niah_test_cases:
            metric.measure(test_case)
            scores.append(metric.score)
            successes.append(metric.is_successful())
        eval_results[f"Average {metric.__name__}"] = np.mean(scores)
        logging.info(f"{metric.__name__} Results:")
        logging.info(f"average score: {np.mean(scores)}")
        logging.info(f"scores: {scores}")
        logging.info(f"successes: {successes}")

    return eval_results
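A sketch of how this evaluation might be invoked on its own. Any arguments are forwarded untouched to NIAH_Runner, whose signature is not part of this excerpt, so the bare call below assumes the runner has usable defaults:

# Illustrative only; runner configuration is forwarded via *args/**kwargs
from leapfrogai_evals.evals import niah_eval

results = niah_eval()
print(results)  # one "Average <metric.__name__>" entry per NIAH metric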
@@ -0,0 +1,72 @@
import logging
import numpy as np
import os

from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

from leapfrogai_evals.metrics import AnnotationRelevancyMetric, CorrectnessMetric
from leapfrogai_evals.models import *  # noqa (imports all models)
from leapfrogai_evals.runners import QA_Runner


def qa_eval(*args, **kwargs) -> dict:
    """Runs the Question/Answer evaluation"""
    logging.info("Beginning Question/Answer Evaluation...")
    eval_results = dict()
    qa_test_cases = []

    qa_runner = QA_Runner(*args, **kwargs)
    qa_runner.run_experiment()

    # build test cases out of the qa_dataset
    for row in qa_runner.qa_data:
        qa_test_cases.append(
            LLMTestCase(
                input=row["input"],
                actual_output=row["actual_output"],
                context=row["context"],
                expected_output=row["expected_output"],
                additional_metadata={
                    "actual_annotations": row["actual_annotations"],
                    "expected_annotations": row["expected_annotations"],
                },
                # retrieval_context = row['retrieval_context']  # TODO: add this for more metrics
            )
        )

    # Create judge llm
    try:
        judge_model = globals()[os.environ.get("LLM_JUDGE")]()
    except KeyError:
        judge_model = os.environ.get("LLM_JUDGE")

    # run metrics
    # TODO: Give ability to choose which metrics to run
    correctness_metric = CorrectnessMetric(model=judge_model)
    answer_relevancy_metric = AnswerRelevancyMetric(model=judge_model)
    annotation_relevancy_metric = AnnotationRelevancyMetric()
    metrics = [
        correctness_metric,
        answer_relevancy_metric,
        annotation_relevancy_metric,
    ]

    # record scores and return results
    for metric in metrics:
        scores = []
        successes = []
        reasons = []
        for test_case in qa_test_cases:
            metric.measure(test_case)
            scores.append(metric.score)
            successes.append(metric.is_successful())
            reasons.append(metric.reason)
        eval_results[f"Average {metric.__name__}"] = np.mean(scores)
        logging.info(f"{metric.__name__} Results:")
        logging.info(f"average score: {np.mean(scores)}")
        logging.info(f"scores: {scores}")
        logging.info(f"successes: {successes}")
        logging.info(f"reasons: {reasons}")

    return eval_results
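The judge lookup above instantiates whichever class named by LLM_JUDGE was pulled in through the wildcard model import, and otherwise passes the raw string through to the DeepEval metrics. A hedged configuration sketch; the judge value shown is a placeholder, not from this commit:

# Illustrative only: point qa_eval at a judge model via the LLM_JUDGE env var
import os

os.environ["LLM_JUDGE"] = "ExampleJudgeModel"  # placeholder class or model name

from leapfrogai_evals.evals import qa_eval

print(qa_eval())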
Empty file.
@@ -0,0 +1,6 @@
# __init__.py
# ruff: noqa: F401

from leapfrogai_evals.metrics.annotation_relevancy import AnnotationRelevancyMetric
from leapfrogai_evals.metrics.correctness import CorrectnessMetric
from leapfrogai_evals.metrics.niah_metrics import NIAH_Response, NIAH_Retrieval
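The metric classes re-exported here (and used by niah_eval and qa_eval above) are not part of this excerpt. For orientation, the sketch below shows DeepEval's custom-metric pattern: a BaseMetric subclass with measure, a_measure, is_successful, and a __name__ property, which is what lets the eval loops read metric.score and label results with metric.__name__. The class name and scoring rule here are placeholders, not the actual NIAH or annotation metrics:

# Illustrative sketch of a DeepEval custom metric, not the real NIAH_Retrieval class
from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase


class ExampleRetrievalMetric(BaseMetric):
    def __init__(self, threshold: float = 1.0):
        self.threshold = threshold

    def measure(self, test_case: LLMTestCase) -> float:
        # the NIAH runner stores per-row scores in additional_metadata (see niah_eval above)
        self.score = test_case.additional_metadata["retrieval_score"]
        self.success = self.score >= self.threshold
        return self.score

    async def a_measure(self, test_case: LLMTestCase) -> float:
        return self.measure(test_case)

    def is_successful(self) -> bool:
        return self.success

    @property
    def __name__(self):
        return "Example Retrieval"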