From 6035536b0731d9323e75fef3d35b715105c7a9bc Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Mon, 29 Jan 2024 15:18:19 +0000 Subject: [PATCH 01/24] initial implementation --- .../integrations/lm_evaluation_harness.py | 237 +++--------------- .../evaluation/integrations/None_rank0.db | Bin 0 -> 12288 bytes .../test_lm_evaluation_harness.py | 33 +-- 3 files changed, 53 insertions(+), 217 deletions(-) create mode 100644 tests/deepsparse/evaluation/integrations/None_rank0.db diff --git a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py index 2f8c7b8cef..6d4cb21650 100644 --- a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py +++ b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py @@ -29,8 +29,12 @@ from deepsparse import Pipeline from deepsparse.evaluation.registry import EvaluationRegistry from deepsparse.evaluation.results import Dataset, Evaluation, Metric, Result -from lm_eval import base, evaluator, tasks, utils - +from deepsparse.transformers.metrics import _cross_entropy +from lm_eval.api.model import LM +from lm_eval.api.instance import Instance +from lm_eval import evaluator, utils, tasks +from lm_eval.__main__ import cli_evaluate +tasks.initialize_tasks("INFO") _LOGGER = logging.getLogger(__name__) @@ -56,57 +60,23 @@ def integration_eval( :return the evaluation results """ - # [START] - # The code that sets up the interface between deepsparse and lm_evaluation_harness if isinstance(model, Pipeline): - # If the model is a Pipeline, we need to wrap - # it in a DeepSparseLM object - model = DeepSparseLM( - pipeline=model, - batch_size=batch_size, - max_gen_toks=kwargs.get("max_gen_toks"), - ) + model = DeepSparseLM(pipeline=model) - datasets = (",").join(datasets) if isinstance(datasets, list) else datasets - # [END] - - # [START] - # The code below is being adapted from: - # https://github.com/EleutherAI/lm-evaluation-harness/blob/master/main.py - if kwargs.get("limit"): - _LOGGER.warning( - "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. " - "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT." 
- ) - if datasets is None: - task_names = tasks.ALL_TASKS - else: - task_names = utils.pattern_match(datasets.split(","), tasks.ALL_TASKS) + datasets = (",").join(datasets) if isinstance(datasets, list) else datasets + task_names = utils.pattern_match(datasets.split(","), tasks.ALL_TASKS) _LOGGER.info(f"Selected Tasks: {task_names}") - description_dict = {} - if kwargs.get("description_dict_path"): - with open(kwargs.get("description_dict_path"), "r") as f: - description_dict = json.load(f) - - evaluator_input = EvaluatorInputSchema( - model=model, - tasks=task_names, - description_dict=description_dict, - batch_size=batch_size, - **kwargs, - ) - - results_raw = evaluator.simple_evaluate(**evaluator_input.dict()) + results_raw = evaluator.simple_evaluate(model=model, tasks=task_names, batch_size=batch_size, **kwargs) - results = Result( - raw=dict(output=results_raw, input=filter_evaluator_input(evaluator_input)), - formatted=format_raw_results(results_raw), - ) + # results = Result( + # raw=dict(output=results_raw, input=None), + # formatted=None, + # ) - return results + return results_raw def filter_evaluator_input( @@ -152,49 +122,11 @@ def format_raw_results(results: Dict[str, Any]) -> List[Evaluation]: return formatted_results -class EvaluatorInputSchema(BaseModel): - model: Any = Field(description="The name of the model.") - tasks: List[str] = Field( - description="The task (or multiple tasks) to evaluate the target on." - ) - description_dict: Optional[Dict[str, Any]] = Field( - None, description="Description dict." - ) - batch_size: int = Field(description="The batch size to use for evaluation.") - model_args: str = Field( - "", description="Additional arguments for the evaluated model." - ) - num_fewshot: int = Field(0, description="The number of few shots to use.") - max_batch_size: Optional[int] = Field( - None, description="Maximal batch size to try with --batch_size auto." - ) - device: Optional[str] = Field(None, description="Device to use for evaluation.") - no_cache: bool = Field(False, description="Include this flag to prevent caching.") - limit: Optional[float] = Field( - None, - description="Limit the number of examples per task. If <1, " - "limit is a percentage of the total number of " - "examples.", - ) - decontamination_ngrams_path: Optional[str] = Field( - None, description="Specify the path for decontamination n-grams." - ) - check_integrity: bool = Field( - False, description="Include this flag to check integrity." - ) - write_out: bool = Field(False, description="Include this flag to write out.") - output_base_path: Optional[str] = Field( - None, description="Specify the output base path." 
- ) - - -class DeepSparseLM(base.BaseLM): +class DeepSparseLM(LM): def __init__( self, pipeline: Pipeline, - tokenizer: Optional[str] = None, - batch_size: int = 1, - max_gen_toks: Optional[int] = None, + ): """ Wrapper around the DeepSparse pipeline to make it compatible with the @@ -203,126 +135,27 @@ def __init__( super().__init__() # Initialize new model and tokenizer instances - self.model = pipeline - self.tokenizer = tokenizer if tokenizer else self.model.tokenizer + self.pipeline = pipeline - self._batch_size = batch_size - self._max_length = pipeline.sequence_length - self._max_gen_toks = max_gen_toks or 256 - self.vocab_size = self.tokenizer.vocab_size + def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]: + greedy = not self.pipeline.config.do_sample + prompts = [request.arguments[0] for request in requests] + out = self.pipeline(prompt = prompts, + output_scores=True, + ) - def _model_call(self, inps) -> torch.Tensor: - """ - Override the _model_call method to use the DeepSparse pipeline for - logits generation. + likelyhoods = [] + for prompt_idx, prompt in enumerate(prompts): + logits = out.generations[prompt_idx].score + tokenized_prompt = self.pipeline.tokenizer(prompt) + nll = _cross_entropy(logits[:sum(tokenized_prompt["attention_mask"]),:], tokenized_prompt["input_ids"]) + likelyhoods.append((nll, greedy)) + return likelyhoods - inps: a torch tensor of shape [batch, sequence] - the size of sequence may vary from call to call - returns: a torch tensor of shape [batch, sequence, vocab] with the - logits returned from the model - """ - # Encode the tokens to strings - prompt = self.model.tokenizer.batch_decode(inps.numpy()) - - # Run the model to map the prompt to logits - out = self.model( - prompt=prompt, - max_new_tokens=0, - include_prompt_logits=True, - output_scores=True, - ) - logits_numpy = numpy.stack([generation.score for generation in out.generations]) - return torch.from_numpy(logits_numpy) - - def greedy_until( - self, requests: List[Tuple[str, Union[List[str], str]]] - ) -> List[str]: - def _collate(x): - tokens = self.tok_encode(x[0]) - return len(tokens), x[0] - - results = [] - reorder = utils.Reorderer(requests, _collate) - - for chunk in utils.chunks( - tqdm(reorder.get_reordered(), disable=False), - self.batch_size, - ): - context = [c[0] for c in chunk] - request_args = chunk[0][1] - stop = request_args.get("until", None) - stop_sequences = stop if isinstance(stop, list) else [stop] - max_generation_length = request_args.get("max_length", None) - - assert ( - isinstance(max_generation_length, int) or max_generation_length is None - ) - assert isinstance(stop_sequences, list) or stop_sequences is None - - # TODO: Find a better way to handle stop sequences for 0-shot. - if stop_sequences is None: - until = [self.eot_token] - else: - until = stop_sequences + [self.eot_token] - - if max_generation_length is None: - max_tokens = self.max_gen_toks - else: - max_tokens = max_generation_length - - responses = self.model( - sequences=context, - max_new_tokens=max_tokens, - stop=until, - do_sample=False, - ) - - responses = responses if type(responses) is list else [responses] - - for response in responses: - response = response.generations[0].text - # Ensure the generated responses do not contain the stop sequences. 
- for term in until: - response = response.split(term)[0] - # partial caching - self.cache_hook.add_partial("greedy_until", (context, until), response) - results.append(response) - - return reorder.get_original(results) - - def _model_generate(self, context, max_length, eos_token_id): - # Isn't used because we override greedy_until - raise NotImplementedError() - - @property - def eot_token(self) -> str: - return self.tokenizer.eos_token - - @property - def eot_token_id(self) -> int: - return self.tokenizer.eos_token_id - - @property - def max_length(self): - return self._max_length - - @property - def max_gen_toks(self): - return self._max_gen_toks - - @property - def batch_size(self): - # should return self._batch_size but the - # TextGeneration model does not support batch_size > 1 - return 1 - - @property - def device(self): - pass - def tok_encode(self, string: str): - return self.tokenizer.encode(string, add_special_tokens=False) + def loglikelihood_rolling(self, requests: list[Instance]) -> list[tuple[float, bool]]: + pass - def tok_decode(self, tokens): - return self.tokenizer.decode(tokens) + def generate_until(self, requests: list[Instance]) -> list[str]: + pass diff --git a/tests/deepsparse/evaluation/integrations/None_rank0.db b/tests/deepsparse/evaluation/integrations/None_rank0.db new file mode 100644 index 0000000000000000000000000000000000000000..6ab676686ae18918fc57fe171b5062bc08bd64e2 GIT binary patch literal 12288 zcmeI#&r8EF6u|MM$`E06w;g(U?4+QA_z&1h4{C?n6?CT(S%hKNp{AlI|33dCk2a$_ z?_RzS^5Z2WkWWr;_K+=|m6!GV$~d`DBc-*v6j4g)w(YfT8$RFPChh3+U7^+4U7M9%bVG2Jr*V-= z5hqzHlWuh)N3ZrVu^-6hAT!C)LmS}p5^{rY&T+N!=Z zjazT>vb(4}? Date: Tue, 30 Jan 2024 13:05:40 +0000 Subject: [PATCH 02/24] initial commit --- src/deepsparse/evaluation/cli.py | 28 +++----- src/deepsparse/evaluation/evaluator.py | 21 +++--- src/deepsparse/evaluation/registry.py | 9 +-- src/deepsparse/evaluation/utils.py | 64 +++++++++---------- .../test_lm_evaluation_harness.py | 6 +- tests/deepsparse/evaluation/test_evaluator.py | 22 ++++--- tests/deepsparse/evaluation/test_utils.py | 47 ++------------ 7 files changed, 74 insertions(+), 123 deletions(-) diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index ed7ea72831..f37ed46d0c 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -20,7 +20,7 @@ Module for evaluating models on the various evaluation integrations OPTIONS: - --target TARGET A path to a remote or local directory containing ONNX/torch model + --target TARGET A path to a remote or local directory containing ONNX model (including all the auxiliary files) or a SparseZoo stub -d DATASET, --dataset DATASET The dataset to evaluate on. The user may pass multiple datasets @@ -30,9 +30,7 @@ integration name that is registered in the evaluation registry -e ENGINE_TYPE, --engine_type ENGINE_TYPE Inference engine to use for the evaluation. The default - is the DeepSparse engine. If the evaluation should be run - without initializing a pipeline (e.g. for the evaluation - of a torch model), the engine type should be set to None + is the DeepSparse engine. -s SAVE_PATH, --save_path SAVE_PATH The path to save the evaluation results. 
By default the results will be saved in the @@ -90,10 +88,10 @@ ) ) @click.option( - "--target", + "--model_path", type=click.Path(dir_okay=True, file_okay=True), required=True, - help="A path to a remote or local directory containing ONNX/torch model " + help="A path to a remote or local directory containing ONNX model " "(including all the auxiliary files) or a SparseZoo stub", ) @click.option( @@ -118,9 +116,7 @@ type=click.Choice([DEEPSPARSE_ENGINE, ORT_ENGINE, TORCHSCRIPT_ENGINE]), default=DEEPSPARSE_ENGINE, help="The engine to use for the evaluation. The default is the " - "DeepSparse engine. If the evaluation should be run without " - "initializing a pipeline (e.g. for the evaluation of a torch " - "model), the engine type should be set to None", + "DeepSparse engine. ", ) @click.option( "-s", @@ -167,7 +163,7 @@ ) @click.argument("integration_args", nargs=-1, type=click.UNPROCESSED) def main( - target, + model_path, dataset, integration, engine_type, @@ -183,14 +179,8 @@ def main( # format kwargs to a dict integration_args = args_to_dict(integration_args) - _LOGGER.info(f"Target to evaluate: {target}") - if engine_type: - _LOGGER.info(f"A pipeline with the engine type: {engine_type} will be created") - else: - _LOGGER.info( - "No engine type specified. The target " - "will be evaluated using the native framework" - ) + _LOGGER.info(f"Creating pipeline to evaluate from: {model_path}") + _LOGGER.info(f"A pipeline with the engine type: {engine_type} will be created") _LOGGER.info( f"Datasets to evaluate on: {datasets}\n" @@ -201,7 +191,7 @@ def main( ) result: Result = evaluate( - target=target, + model_path=model_path, datasets=datasets, integration=integration, engine_type=engine_type, diff --git a/src/deepsparse/evaluation/evaluator.py b/src/deepsparse/evaluation/evaluator.py index 7bd56adf6e..9d1b3228a7 100644 --- a/src/deepsparse/evaluation/evaluator.py +++ b/src/deepsparse/evaluation/evaluator.py @@ -12,11 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import logging +from pathlib import Path from typing import Any, List, Optional, Union from deepsparse.evaluation.registry import EvaluationRegistry from deepsparse.evaluation.results import Result -from deepsparse.evaluation.utils import create_model_from_target +from deepsparse.evaluation.utils import create_pipeline from deepsparse.operators.engine_operator import ( DEEPSPARSE_ENGINE, ORT_ENGINE, @@ -30,11 +31,11 @@ def evaluate( - target: Any, + model_path: Any, datasets: Union[str, List[str]], integration: Optional[str] = None, engine_type: Union[ - DEEPSPARSE_ENGINE, ORT_ENGINE, TORCHSCRIPT_ENGINE, None + DEEPSPARSE_ENGINE, ORT_ENGINE, TORCHSCRIPT_ENGINE ] = DEEPSPARSE_ENGINE, batch_size: int = 1, splits: Union[List[str], str, None] = None, @@ -42,18 +43,18 @@ def evaluate( **kwargs, ) -> Result: - # if target is a string, turn it into an appropriate model/pipeline + # if target is a string, turn it into an appropriate pipeline # otherwise assume it is a model/pipeline - model = ( - create_model_from_target(target, engine_type) - if isinstance(target, str) - else target + pipeline = ( + create_pipeline(model_path, engine_type) + if isinstance(model_path, (Path, str)) + else model_path ) - eval_integration = EvaluationRegistry.resolve(model, datasets, integration) + eval_integration = EvaluationRegistry.resolve(pipeline, datasets, integration) return eval_integration( - model=model, + pipeline=pipeline, datasets=datasets, engine_type=engine_type, batch_size=batch_size, diff --git a/src/deepsparse/evaluation/registry.py b/src/deepsparse/evaluation/registry.py index 5b6e45bc1c..2daabb69cc 100644 --- a/src/deepsparse/evaluation/registry.py +++ b/src/deepsparse/evaluation/registry.py @@ -15,8 +15,9 @@ Implementation of a registry for evaluation functions """ import logging -from typing import Any, Callable, List, Optional, Union +from typing import Callable, List, Optional, Union +from deepsparse import Pipeline from sparsezoo.utils.registry import RegistryMixin @@ -38,7 +39,7 @@ def load_from_registry(cls, name: str) -> Callable[..., "Result"]: # noqa: F821 @classmethod def resolve( cls, - model: Any, + pipeline: Pipeline, datasets: Union[str, List[str]], integration: Optional[str] = None, ) -> Callable[..., "Result"]: # noqa: F821 @@ -59,12 +60,12 @@ def resolve( "No integration specified, inferring the evaluation" "function from the input arguments..." ) - integration = resolve_integration(model, datasets) + integration = resolve_integration(pipeline, datasets) if integration is None: raise ValueError( "Unable to resolve an evaluation function for the given model. 
" - "Specify an integration name or use a model that is supported " + "Specify an integration name or use a pipeline that is supported " ) _LOGGER.info(f"Inferred the evaluation function: {integration}") diff --git a/src/deepsparse/evaluation/utils.py b/src/deepsparse/evaluation/utils.py index 0534a9f9f3..7290f14adb 100644 --- a/src/deepsparse/evaluation/utils.py +++ b/src/deepsparse/evaluation/utils.py @@ -15,14 +15,11 @@ import os from typing import Any, Dict, List, Optional, Tuple, Union -from transformers import AutoModelForCausalLM, PreTrainedModel - from deepsparse import Pipeline -from deepsparse.operators.engine_operator import DEEPSPARSE_ENGINE, ORT_ENGINE __all__ = [ - "create_model_from_target", + "create_pipeline", "get_save_path", "args_to_dict", "resolve_integration", @@ -50,7 +47,7 @@ def potentially_check_dependency_import(integration_name: str) -> bool: def resolve_integration( - model: Union[Pipeline, PreTrainedModel], datasets: Union[str, List[str]] + pipeline: Pipeline, datasets: Union[str, List[str]] ) -> Union[str, None]: """ Given a model and dataset, infer the name of the evaluation integration @@ -64,21 +61,22 @@ def resolve_integration( :param datasets: The datasets to infer the integration for :return: The name of the integration to use or None if unable to infer """ - if if_generative_language_model(model): + if if_generative_language_model(pipeline): return LM_EVALUATION_HARNESS return None -def if_generative_language_model(model: Any) -> bool: +def if_generative_language_model(pipeline: Pipeline) -> bool: """ Checks if the model is a generative language model. """ - if isinstance(model, Pipeline): - return model.__class__.__name__ == "TextGenerationPipeline" - elif isinstance(model, PreTrainedModel): - return "CausalLM" in model.__class__.__name__ - else: - return False + pipeline_name = pipeline.__class__.__name__ + if pipeline_name == "TextGenerationPipeline" or ( + pipeline_name == "TextGenerationPipelineNoKVCache" + ): + return True + + return False def args_to_dict(args: Tuple[Any, ...]) -> Dict[str, Any]: @@ -126,34 +124,30 @@ def get_save_path( return os.path.join(base_path, file_name) -def create_model_from_target( - target: str, +def create_pipeline( + model_path: str, engine_type: Optional[str] = None, **kwargs, -) -> Union[Pipeline, AutoModelForCausalLM]: +) -> Pipeline: """ - Create a model or a pipeline from a target path. + Create a pipeline for evaluation - Note: This function is currently limited to: - - creating pipelines of type 'text-generation' - - creating dense huggingface models of type 'AutoModelForCausalLM' - This function will be expanded in the future to support more - model types and frameworks. + Note: This function is currently primarily + focused on creating pipelines of type 'text-generation' + This function will be expanded in the future to support + more tasks and models - :param target: The target path to initialize the + :param model_path: The target path to initialize the text generation model from. This can be a local or remote path to the model or a sparsezoo stub :param engine_type: The engine type to initialize the model with. 
- :return: The initialized model + :return: The initialized pipeline """ - if engine_type in [DEEPSPARSE_ENGINE, ORT_ENGINE]: - return Pipeline.create( - task="text-generation", - model_path=target, - sequence_length=kwargs.pop("sequence_length", 2048), - engine_type=engine_type, - batch_size=kwargs.pop("batch_size", 1), - **kwargs, - ) - else: - return AutoModelForCausalLM.from_pretrained(target, **kwargs) + return Pipeline.create( + task=kwargs.pop("task", "text-generation"), + model_path=model_path, + sequence_length=kwargs.pop("sequence_length", 2048), + engine_type=engine_type, + batch_size=kwargs.pop("batch_size", 1), + **kwargs, + ) diff --git a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py index 9fa9b494cf..db847af1ad 100644 --- a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py +++ b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py @@ -14,17 +14,17 @@ import pytest from deepsparse.evaluation.integrations import try_import_lm_evaluation_harness -from deepsparse.evaluation.utils import create_model_from_target +from deepsparse.evaluation.utils import create_pipeline @pytest.mark.parametrize( "pipeline, model_torch", [ ( - create_model_from_target( + create_pipeline( "hf:mgoin/TinyStories-1M-deepsparse", engine_type="onnxruntime" ), - create_model_from_target("roneneldan/TinyStories-1M"), + create_pipeline("roneneldan/TinyStories-1M"), ) ], ) diff --git a/tests/deepsparse/evaluation/test_evaluator.py b/tests/deepsparse/evaluation/test_evaluator.py index dedd63fa36..f1bc0c277a 100644 --- a/tests/deepsparse/evaluation/test_evaluator.py +++ b/tests/deepsparse/evaluation/test_evaluator.py @@ -49,7 +49,7 @@ def dummy_integration(*args, **kwargs): @pytest.fixture() -def target(): +def model_path(): return "hf:mgoin/TinyStories-1M-deepsparse" @@ -68,18 +68,18 @@ def unknown_integration_name(): return "unknown_integration" -def test_evaluate_unknown_integration(target, datasets, unknown_integration_name): +def test_evaluate_unknown_integration(model_path, datasets, unknown_integration_name): with pytest.raises(KeyError): evaluate( - target=target, + model_path=model_path, datasets=datasets, integration=unknown_integration_name, ) -def test_evaluate(target, datasets, dummy_integration_name): +def test_evaluate(model_path, datasets, dummy_integration_name): result = evaluate( - target=target, + model_path=model_path, datasets=datasets, integration=dummy_integration_name, ) @@ -91,11 +91,11 @@ def test_evaluate(target, datasets, dummy_integration_name): reason="lm_evaluation_harness not installed", ) def test_evaluation_llm_evaluation_harness_integration_name( - target, + model_path, datasets, ): assert evaluate( - target=target, + model_path=model_path, datasets=datasets, limit=2, no_cache=True, @@ -110,15 +110,17 @@ def test_evaluation_llm_evaluation_harness_integration_name( "with importing functions that are decorated with " "click option where multiple=True", ) -def test_cli(tmp_path, target, datasets, dummy_integration_name, type_serialization): +def test_cli( + tmp_path, model_path, datasets, dummy_integration_name, type_serialization +): from deepsparse.evaluation.cli import main runner = CliRunner() runner.invoke( main, [ - "--target", - target, + "--model_path", + model_path, "--dataset", datasets[0], "--dataset", diff --git a/tests/deepsparse/evaluation/test_utils.py b/tests/deepsparse/evaluation/test_utils.py index f712dce0df..a16cb8ee32 
100644 --- a/tests/deepsparse/evaluation/test_utils.py +++ b/tests/deepsparse/evaluation/test_utils.py @@ -23,23 +23,13 @@ import pytest from deepsparse import Pipeline from deepsparse.evaluation.utils import ( - create_model_from_target, + create_pipeline, get_save_path, if_generative_language_model, resolve_integration, ) -@pytest.fixture -def llm_type_hf_model(): - return AutoModelForCausalLM.from_pretrained("roneneldan/TinyStories-1M") - - -@pytest.fixture -def not_llm_type_hf_model(): - return AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") - - @pytest.fixture def llm_type_pipeline(): return Pipeline.create( @@ -49,25 +39,13 @@ def llm_type_pipeline(): ) -def test_resolve_known_llm_model(llm_type_hf_model): +def test_resolve_known_llm_pipeline(llm_type_pipeline): assert ( - resolve_integration(model=llm_type_hf_model, datasets="") + resolve_integration(pipeline=llm_type_pipeline, datasets="") == "lm-evaluation-harness" ) -def test_resolve_unknown_model(not_llm_type_hf_model): - assert resolve_integration(model=not_llm_type_hf_model, datasets="") is None - - -def test_if_generative_language_model_true(llm_type_hf_model): - assert if_generative_language_model(llm_type_hf_model) - - -def test_if_generative_language_model_false(not_llm_type_hf_model): - assert not if_generative_language_model(not_llm_type_hf_model) - - def test_if_generative_language_pipeline_true(llm_type_pipeline): assert if_generative_language_model(llm_type_pipeline) @@ -89,26 +67,11 @@ def pipeline_target(): return "hf:mgoin/TinyStories-1M-deepsparse" -@pytest.fixture -def torch_target(): - return "roneneldan/TinyStories-1M" - - def test_initialize_model_from_target_pipeline_onnx(pipeline_target): - model = create_model_from_target(pipeline_target, "onnxruntime") + model = create_pipeline(pipeline_target, "onnxruntime") assert model.ops.get("single_engine")._engine_type == "onnxruntime" -def test_initialize_model_from_target_pipeline_deepsparse(pipeline_target): - model = create_model_from_target(pipeline_target, "deepsparse") - assert model.ops.get("single_engine")._engine_type == "deepsparse" - - def test_initialize_model_from_target_pipeline_with_kwargs(pipeline_target): - model = create_model_from_target(pipeline_target, "deepsparse", sequence_length=64) + model = create_pipeline(pipeline_target, "deepsparse", sequence_length=64) assert model.ops.get("process_input").sequence_length == 64 - - -def test_initialize_model_from_target_torch(torch_target): - model = create_model_from_target(torch_target, "torch") - assert isinstance(model, GPTNeoForCausalLM) From 6599f41cb08cdb2903420d8c17fb3486c6c395ac Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Tue, 30 Jan 2024 14:03:14 +0000 Subject: [PATCH 03/24] add some more tests for hardening --- src/deepsparse/evaluation/cli.py | 7 +++-- src/deepsparse/evaluation/evaluator.py | 21 +++++++++---- .../pipelines/text_generation/pipeline.py | 7 +++++ .../text_generation/pipeline_no_kv_cache.py | 8 +++++ .../test_lm_evaluation_harness.py | 4 ++- tests/deepsparse/evaluation/test_evaluator.py | 31 +++++++++++++++++-- 6 files changed, 65 insertions(+), 13 deletions(-) diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index f37ed46d0c..e0e16cb4ab 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -20,7 +20,8 @@ Module for evaluating models on the various evaluation integrations OPTIONS: - --target TARGET A path to a remote or local directory containing ONNX model + --model_path 
MODEL_PATH + A path to an ONNX model, local directory containing ONNX model (including all the auxiliary files) or a SparseZoo stub -d DATASET, --dataset DATASET The dataset to evaluate on. The user may pass multiple datasets @@ -91,7 +92,7 @@ "--model_path", type=click.Path(dir_okay=True, file_okay=True), required=True, - help="A path to a remote or local directory containing ONNX model " + help="A path to an ONNX model, local directory containing ONNX model" "(including all the auxiliary files) or a SparseZoo stub", ) @click.option( @@ -191,7 +192,7 @@ def main( ) result: Result = evaluate( - model_path=model_path, + model=model_path, datasets=datasets, integration=integration, engine_type=engine_type, diff --git a/src/deepsparse/evaluation/evaluator.py b/src/deepsparse/evaluation/evaluator.py index 9d1b3228a7..b513f07563 100644 --- a/src/deepsparse/evaluation/evaluator.py +++ b/src/deepsparse/evaluation/evaluator.py @@ -13,8 +13,9 @@ # limitations under the License. import logging from pathlib import Path -from typing import Any, List, Optional, Union +from typing import List, Optional, Union +from deepsparse import Pipeline from deepsparse.evaluation.registry import EvaluationRegistry from deepsparse.evaluation.results import Result from deepsparse.evaluation.utils import create_pipeline @@ -31,7 +32,7 @@ def evaluate( - model_path: Any, + model: Union[Pipeline, Path, str], datasets: Union[str, List[str]], integration: Optional[str] = None, engine_type: Union[ @@ -43,12 +44,20 @@ def evaluate( **kwargs, ) -> Result: + if isinstance(model, Pipeline): + _LOGGER.info( + "Passed a Pipeline object into evaluate function. This will " + "override the following arguments:" + ) + batch_size = model.batch_size + _LOGGER.info(f"batch_size: {batch_size}") + engine_type = engine_type + _LOGGER.info(f"engine_type: {engine_type}") + # if target is a string, turn it into an appropriate pipeline - # otherwise assume it is a model/pipeline + # otherwise assume it is a pipeline pipeline = ( - create_pipeline(model_path, engine_type) - if isinstance(model_path, (Path, str)) - else model_path + create_pipeline(model, engine_type) if isinstance(model, (Path, str)) else model ) eval_integration = EvaluationRegistry.resolve(pipeline, datasets, integration) diff --git a/src/deepsparse/transformers/pipelines/text_generation/pipeline.py b/src/deepsparse/transformers/pipelines/text_generation/pipeline.py index 2c858c901b..bbc0e8ba15 100644 --- a/src/deepsparse/transformers/pipelines/text_generation/pipeline.py +++ b/src/deepsparse/transformers/pipelines/text_generation/pipeline.py @@ -357,6 +357,13 @@ def sequence_length(self) -> int: """ return self.ops["single_engine"].sequence_length + def batch_size(self) -> int: + return self.ops["single_engine"].batch_size + + @property + def engine_type(self) -> str: + return self.ops["single_engine"]._engine_type + def _get_continuous_batching_scheduler( self, batch_sizes: List[int], engines: List[EngineOperator] ) -> ContinuousBatchingScheduler: diff --git a/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py b/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py index 7f6cb9db5f..c6cbc3dd59 100644 --- a/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py +++ b/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py @@ -127,3 +127,11 @@ def expand_inputs(self, items, batch_size): out, orig_batch_size = split_engine_inputs(items, batch_size) combined_batches = [{"input_ids": 
b[0], "attention_mask": b[1]} for b in out] return combined_batches, orig_batch_size + + @property + def batch_size(self) -> int: + return self.ops["engine_operator"].batch_size + + @property + def engine_type(self) -> str: + return self.ops["engine_operator"]._engine_type diff --git a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py index db847af1ad..3b9016294f 100644 --- a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py +++ b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from transformers import AutoModelForCausalLM + import pytest from deepsparse.evaluation.integrations import try_import_lm_evaluation_harness from deepsparse.evaluation.utils import create_pipeline @@ -24,7 +26,7 @@ create_pipeline( "hf:mgoin/TinyStories-1M-deepsparse", engine_type="onnxruntime" ), - create_pipeline("roneneldan/TinyStories-1M"), + AutoModelForCausalLM.from_pretrained("roneneldan/TinyStories-1M"), ) ], ) diff --git a/tests/deepsparse/evaluation/test_evaluator.py b/tests/deepsparse/evaluation/test_evaluator.py index f1bc0c277a..816ad075e0 100644 --- a/tests/deepsparse/evaluation/test_evaluator.py +++ b/tests/deepsparse/evaluation/test_evaluator.py @@ -29,6 +29,7 @@ Metric, Result, ) +from deepsparse.pipeline import Pipeline @EvaluationRegistry.register() @@ -71,7 +72,7 @@ def unknown_integration_name(): def test_evaluate_unknown_integration(model_path, datasets, unknown_integration_name): with pytest.raises(KeyError): evaluate( - model_path=model_path, + model=model_path, datasets=datasets, integration=unknown_integration_name, ) @@ -79,7 +80,31 @@ def test_evaluate_unknown_integration(model_path, datasets, unknown_integration_ def test_evaluate(model_path, datasets, dummy_integration_name): result = evaluate( - model_path=model_path, + model=model_path, + datasets=datasets, + integration=dummy_integration_name, + ) + assert isinstance(result, Result) + + +def test_evaluate_pipeline_with_kv_cache(model_path, datasets, dummy_integration_name): + result = evaluate( + model=Pipeline.create(model_path=model_path, task="text-generation"), + datasets=datasets, + integration=dummy_integration_name, + ) + assert isinstance(result, Result) + + +def test_evaluate_pipeline_without_kv_cache( + model_path, datasets, dummy_integration_name +): + result = evaluate( + model=Pipeline.create( + model_path=model_path, + task="text-generation", + onnx_model_name="model-orig.onnx", + ), datasets=datasets, integration=dummy_integration_name, ) @@ -95,7 +120,7 @@ def test_evaluation_llm_evaluation_harness_integration_name( datasets, ): assert evaluate( - model_path=model_path, + model=model_path, datasets=datasets, limit=2, no_cache=True, From 4721c1fcd656a4e04b72eb3128fb121ca2297824 Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Tue, 30 Jan 2024 15:04:32 +0100 Subject: [PATCH 04/24] Update src/deepsparse/evaluation/cli.py --- src/deepsparse/evaluation/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index e0e16cb4ab..9c8fe3d06a 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -180,7 +180,7 @@ def main( # format kwargs to a dict integration_args = args_to_dict(integration_args) - 
_LOGGER.info(f"Creating pipeline to evaluate from: {model_path}") + _LOGGER.info(f"Creating pipeline to evaluate from model path: {model_path}") _LOGGER.info(f"A pipeline with the engine type: {engine_type} will be created") _LOGGER.info( From 124779435927ec266a18c3486780a77068c3f71a Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Tue, 30 Jan 2024 15:06:35 +0100 Subject: [PATCH 05/24] Update src/deepsparse/transformers/pipelines/text_generation/pipeline.py --- .../transformers/pipelines/text_generation/pipeline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/deepsparse/transformers/pipelines/text_generation/pipeline.py b/src/deepsparse/transformers/pipelines/text_generation/pipeline.py index bbc0e8ba15..4a38392d76 100644 --- a/src/deepsparse/transformers/pipelines/text_generation/pipeline.py +++ b/src/deepsparse/transformers/pipelines/text_generation/pipeline.py @@ -357,6 +357,7 @@ def sequence_length(self) -> int: """ return self.ops["single_engine"].sequence_length + @property def batch_size(self) -> int: return self.ops["single_engine"].batch_size From 9e88f89e7ea175d05eed4bacbb86ac1abda8f3fd Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Tue, 30 Jan 2024 15:07:31 +0100 Subject: [PATCH 06/24] Apply suggestions from code review --- src/deepsparse/evaluation/cli.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index 9c8fe3d06a..6979521c7a 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -180,8 +180,7 @@ def main( # format kwargs to a dict integration_args = args_to_dict(integration_args) - _LOGGER.info(f"Creating pipeline to evaluate from model path: {model_path}") - _LOGGER.info(f"A pipeline with the engine type: {engine_type} will be created") + _LOGGER.info(f"Creating {engine_type} pipeline to evaluate from model path: {model_path}") _LOGGER.info( f"Datasets to evaluate on: {datasets}\n" From fdb21c6cf093bc527c6c318af1bb0e5b96ee68e8 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Tue, 30 Jan 2024 14:08:09 +0000 Subject: [PATCH 07/24] quality --- src/deepsparse/evaluation/cli.py | 4 +++- .../transformers/pipelines/text_generation/pipeline.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index 6979521c7a..43eaa33790 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -180,7 +180,9 @@ def main( # format kwargs to a dict integration_args = args_to_dict(integration_args) - _LOGGER.info(f"Creating {engine_type} pipeline to evaluate from model path: {model_path}") + _LOGGER.info( + f"Creating {engine_type} pipeline to evaluate from model path: {model_path}" + ) _LOGGER.info( f"Datasets to evaluate on: {datasets}\n" diff --git a/src/deepsparse/transformers/pipelines/text_generation/pipeline.py b/src/deepsparse/transformers/pipelines/text_generation/pipeline.py index 4a38392d76..64c0c64a51 100644 --- a/src/deepsparse/transformers/pipelines/text_generation/pipeline.py +++ b/src/deepsparse/transformers/pipelines/text_generation/pipeline.py @@ -357,7 +357,7 @@ def sequence_length(self) -> int: """ return self.ops["single_engine"].sequence_length - @property + @property def batch_size(self) -> int: return self.ops["single_engine"].batch_size From 3e5b7a83f5ff9599aed10434e723190991a8bfc7 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Thu, 
1 Feb 2024 14:31:54 +0000 Subject: [PATCH 08/24] fix the UI, implement loglikelihood function --- .../integrations/lm_evaluation_harness.py | 175 +++++++++++++----- .../test_lm_evaluation_harness.py | 84 ++++----- 2 files changed, 167 insertions(+), 92 deletions(-) diff --git a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py index 6d4cb21650..9b1d23e855 100644 --- a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py +++ b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py @@ -16,24 +16,23 @@ Integration of the `lm_evaluation_harness`: https://github.com/EleutherAI/lm-evaluation-harness """ - -import json import logging from typing import Any, Dict, List, Optional, Tuple, Union import numpy -from pydantic import BaseModel, Field from tqdm import tqdm +from transformers import AutoTokenizer -import torch from deepsparse import Pipeline from deepsparse.evaluation.registry import EvaluationRegistry from deepsparse.evaluation.results import Dataset, Evaluation, Metric, Result -from deepsparse.transformers.metrics import _cross_entropy -from lm_eval.api.model import LM +from deepsparse.utils.data import numpy_log_softmax +from lm_eval import evaluator, tasks, utils from lm_eval.api.instance import Instance -from lm_eval import evaluator, utils, tasks -from lm_eval.__main__ import cli_evaluate +from lm_eval.api.model import LM +from lm_eval.utils import Reorderer + + tasks.initialize_tasks("INFO") _LOGGER = logging.getLogger(__name__) @@ -61,38 +60,23 @@ def integration_eval( :return the evaluation results """ if isinstance(model, Pipeline): - model = DeepSparseLM(pipeline=model) - + model = DeepSparseLM(pipeline=model, batch_size=batch_size) datasets = (",").join(datasets) if isinstance(datasets, list) else datasets task_names = utils.pattern_match(datasets.split(","), tasks.ALL_TASKS) _LOGGER.info(f"Selected Tasks: {task_names}") - results_raw = evaluator.simple_evaluate(model=model, tasks=task_names, batch_size=batch_size, **kwargs) + results_raw = evaluator.simple_evaluate( + model=model, tasks=task_names, batch_size=batch_size, **kwargs + ) - # results = Result( - # raw=dict(output=results_raw, input=None), - # formatted=None, - # ) + results = Result( + raw=results_raw, + formatted=format_raw_results(results_raw), + ) - return results_raw - - -def filter_evaluator_input( - evaluator_input: "EvaluatorInputSchema", -) -> Dict[str, Any]: # noqa: F821 - """ - Filter the evaluator input to remove the model field. - The model field is a complex object that cannot be serialized. 
- - :param evaluator_input: the evaluator input to filter - :return: the filtered evaluator input - """ - evaluator = evaluator_input.dict() - del evaluator["model"] - - return evaluator + return results def format_raw_results(results: Dict[str, Any]) -> List[Evaluation]: @@ -107,6 +91,8 @@ def format_raw_results(results: Dict[str, Any]) -> List[Evaluation]: for dataset_name, dataset_result in results["results"].items(): metrics = [] for metric_name, metric_value in dataset_result.items(): + if isinstance(metric_value, str): + continue metric = Metric(name=metric_name, value=metric_value) metrics.append(metric) dataset = Dataset( @@ -126,7 +112,8 @@ class DeepSparseLM(LM): def __init__( self, pipeline: Pipeline, - + batch_size: int = 1, + tokenizer: Optional[AutoTokenizer] = None, ): """ Wrapper around the DeepSparse pipeline to make it compatible with the @@ -136,26 +123,120 @@ def __init__( # Initialize new model and tokenizer instances self.pipeline = pipeline + self.batch_size = batch_size + self.tokenizer = tokenizer or pipeline.tokenizer + self._max_length = pipeline.sequence_length + + def tok_encode(self, string: str) -> List[int]: + return self.tokenizer.encode(string) + def tok_decode(self, tokens: List[int]) -> str: + return self.tokenizer.decode(tokens) - def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]: - greedy = not self.pipeline.config.do_sample - prompts = [request.arguments[0] for request in requests] - out = self.pipeline(prompt = prompts, - output_scores=True, - ) + @property + def max_length(self) -> int: + return self._max_length + + def loglikelihood(self, requests) -> List[Tuple[float, bool]]: + """ + Copied directly from + https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py + """ + new_reqs = [] + for context, continuation in [req.args for req in requests]: + if context == "": + raise NotImplemented("Implementing empty context is not supported yet") + context_enc, continuation_enc = self._encode_pair(context, continuation) - likelyhoods = [] - for prompt_idx, prompt in enumerate(prompts): - logits = out.generations[prompt_idx].score - tokenized_prompt = self.pipeline.tokenizer(prompt) - nll = _cross_entropy(logits[:sum(tokenized_prompt["attention_mask"]),:], tokenized_prompt["input_ids"]) - likelyhoods.append((nll, greedy)) - return likelyhoods + new_reqs.append(((context, continuation), context_enc, continuation_enc)) + return self._loglikelihood_tokens(new_reqs) - def loglikelihood_rolling(self, requests: list[Instance]) -> list[tuple[float, bool]]: + def _loglikelihood_tokens( + self, + requests: List[Tuple[Tuple[str, str], List[int], List[int]]], + disable_tqdm: bool = False, + ) -> List[Tuple[float, bool]]: + res = [] + + def _collate(x): + """Defines the key for the sorted method""" + toks = x[1] + x[2] + return -len(toks), tuple(toks) + + re_ord = utils.Reorderer(requests, _collate) + + for chunk in tqdm( + list(utils.chunks(re_ord.get_reordered(), self.batch_size)), + disable=disable_tqdm, + ): + for cache_key, context_enc, continuation_enc in chunk: + # how this all works (illustrated on a causal decoder-only setup): + # CTX CONT + # inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1] + # model \ \ + # logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the + # cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice + + inp = (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1] + + response = self.pipeline( + 
prompt=self.tokenizer.decode(inp), + max_new_tokens=0, + output_scores=True, + include_prompt_logits=True, + ) + + for i, resp in enumerate(response.generations): + # (seq_len, vocab_size) + multi_scores = resp.score + # (seq_len, vocab_size) but with softmax applied + multi_logits = numpy_log_softmax(multi_scores, axis=1) + # toss out the context half of the sequence + # (cont_len, vocab_size) + continuation_multi_logits = multi_logits[-len(continuation_enc) :] + + # pick out the logits for the continuation tokens + # (cont_len,) + continuation_logits = continuation_multi_logits[ + numpy.arange(len(continuation_enc)), continuation_enc + ] + # check if the tokens generated greedly are the same + # as the expected continuation + greedy_tokens = continuation_multi_logits.argmax(axis=1) + max_equal = greedy_tokens.tolist() == continuation_enc + + # Answer: (log prob, is-exact-match) + answer = (float(continuation_logits.sum()), bool(max_equal)) + + res.append(answer) + + if cache_key is not None: + self.cache_hook.add_partial("loglikelihood", cache_key, answer) + + return re_ord.get_original(res) + + def loglikelihood_rolling( + self, requests: list[Instance] + ) -> list[tuple[float, bool]]: pass def generate_until(self, requests: list[Instance]) -> list[str]: pass + + def _encode_pair( + self, context: str, continuation: str + ) -> Tuple[List[int], List[int]]: + """ + Copied directly from + https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py + """ + n_spaces = len(context) - len(context.rstrip()) + if n_spaces > 0: + continuation = context[-n_spaces:] + continuation + context = context[:-n_spaces] + whole_enc = self.tok_encode(context + continuation) + context_enc = self.tok_encode(context) + context_enc_len = len(context_enc) + continuation_enc = whole_enc[context_enc_len:] + return context_enc, continuation_enc diff --git a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py index cba804e9e7..726f9f87eb 100644 --- a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py +++ b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py @@ -17,60 +17,54 @@ from deepsparse.evaluation.utils import create_model_from_target -@pytest.mark.parametrize( - "pipeline, model_torch", - [ - ( - create_model_from_target( - "zoo:mistral-7b-gsm8k_mistral_pretrain-pruned70", engine_type="onnxruntime", sequence_length = 256 - ), - create_model_from_target("roneneldan/TinyStories-1M"), - ) - ], -) @pytest.mark.parametrize( "datasets", [ - #["hellaswag"], + ["hellaswag"], # ["hellaswag", "gsm8k"], # "gsm8k", - #"arc_challenge", + "arc_challenge", ], ) @pytest.mark.parametrize( "batch_size", - [1], + [1], # TODO: Add test for higher batch sizes +) +@pytest.mark.skipif( + not try_import_lm_evaluation_harness(raise_error=False), + reason="lm_evaluation_harness not installed", ) -class TestLMEvaluationHarness: - @pytest.mark.skipif( - not try_import_lm_evaluation_harness(raise_error=False), - reason="lm_evaluation_harness not installed", +def test_integration_eval_onnx_matches_torch(datasets, batch_size): + from deepsparse.evaluation.integrations.lm_evaluation_harness import ( + integration_eval, + ) + + out_torch = integration_eval( + model="hf", + model_args="pretrained=roneneldan/TinyStories-1M,dtype=float16", + datasets=datasets, + batch_size=batch_size, + limit=2, + use_cache=None, # avoid saving files when running tests + ) + + out_onnx = 
integration_eval( + model=create_model_from_target( + "hf:mgoin/TinyStories-1M-ds", engine_type="onnxruntime", sequence_length=128 + ), + datasets=datasets, + batch_size=batch_size, + limit=2, + use_cache=None, # avoid saving files when running tests ) - def test_integration_eval_onnx_matches_torch( - self, pipeline, model_torch, datasets, batch_size - ): - from deepsparse.evaluation.integrations.lm_evaluation_harness import ( - integration_eval, - ) - out_torch = integration_eval( - model="hf", - model_args="pretrained=roneneldan/TinyStories-1M,dtype=float32", - datasets=datasets, - batch_size=batch_size, - limit=1, - use_cache=None, # avoid saving files when running tests - ) - out_onnx = integration_eval( - model=pipeline, - datasets=datasets, - batch_size=batch_size, - limit=1, - use_cache=None, # avoid saving files when running tests - ) - print(out_onnx) - print(out_torch) - # out_onnx = out_onnx.raw["output"] - # out_torch = out_torch.raw["output"] - # - # assert out_onnx["results"] == out_torch["results"] + datasets = datasets if isinstance(datasets, list) else [datasets] + for dataset in datasets: + torch_samples = out_torch.raw["samples"][dataset] + onnx_samples = out_onnx.raw["samples"][dataset] + for torch_sample, onnx_sample in zip(torch_samples, onnx_samples): + for torch_resp, onnx_resp in zip( + torch_sample["resps"], onnx_sample["resps"] + ): + assert pytest.approx(torch_resp[0][0], 0.1) == onnx_resp[0][0] + assert torch_resp[0][1] == onnx_resp[0][1] From f38f0db2e3af8e7e094ea470aba6460dcbb89b08 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Thu, 1 Feb 2024 14:35:22 +0000 Subject: [PATCH 09/24] remove unneccessary file --- .../evaluation/integrations/None_rank0.db | Bin 12288 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/deepsparse/evaluation/integrations/None_rank0.db diff --git a/tests/deepsparse/evaluation/integrations/None_rank0.db b/tests/deepsparse/evaluation/integrations/None_rank0.db deleted file mode 100644 index 6ab676686ae18918fc57fe171b5062bc08bd64e2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12288 zcmeI#&r8EF6u|MM$`E06w;g(U?4+QA_z&1h4{C?n6?CT(S%hKNp{AlI|33dCk2a$_ z?_RzS^5Z2WkWWr;_K+=|m6!GV$~d`DBc-*v6j4g)w(YfT8$RFPChh3+U7^+4U7M9%bVG2Jr*V-= z5hqzHlWuh)N3ZrVu^-6hAT!C)LmS}p5^{rY&T+N!=Z zjazT>vb(4}? 
Date: Fri, 2 Feb 2024 11:56:01 +0000 Subject: [PATCH 10/24] initial commit --- .../integrations/lm_evaluation_harness.py | 95 ++++++++++++++++++- src/deepsparse/evaluation/utils.py | 1 + .../test_lm_evaluation_harness.py | 28 +++--- 3 files changed, 107 insertions(+), 17 deletions(-) diff --git a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py index 9b1d23e855..cedef9c643 100644 --- a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py +++ b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py @@ -22,7 +22,7 @@ import numpy from tqdm import tqdm from transformers import AutoTokenizer - +import copy from deepsparse import Pipeline from deepsparse.evaluation.registry import EvaluationRegistry from deepsparse.evaluation.results import Dataset, Evaluation, Metric, Result @@ -30,7 +30,7 @@ from lm_eval import evaluator, tasks, utils from lm_eval.api.instance import Instance from lm_eval.api.model import LM -from lm_eval.utils import Reorderer +from collections import defaultdict tasks.initialize_tasks("INFO") @@ -113,6 +113,7 @@ def __init__( self, pipeline: Pipeline, batch_size: int = 1, + max_gen_toks: int = 128, tokenizer: Optional[AutoTokenizer] = None, ): """ @@ -126,6 +127,7 @@ def __init__( self.batch_size = batch_size self.tokenizer = tokenizer or pipeline.tokenizer self._max_length = pipeline.sequence_length + self._max_gen_toks = max_gen_toks def tok_encode(self, string: str) -> List[int]: return self.tokenizer.encode(string) @@ -137,6 +139,10 @@ def tok_decode(self, tokens: List[int]) -> str: def max_length(self) -> int: return self._max_length + @property + def max_gen_toks(self) -> int: + return self._max_gen_toks + def loglikelihood(self, requests) -> List[Tuple[float, bool]]: """ Copied directly from @@ -219,10 +225,91 @@ def _collate(x): def loglikelihood_rolling( self, requests: list[Instance] ) -> list[tuple[float, bool]]: - pass + raise NotImplementedError() def generate_until(self, requests: list[Instance]) -> list[str]: - pass + res = defaultdict(list) + re_ords = {} + + def _collate(x): + # the negative sign on len(toks) sorts descending + toks = self.tok_encode(x[0]) + return -len(toks), x[0] + + # we group requests by their generation_kwargs, + # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling + # in the same batch. + grouper = utils.Grouper(requests, lambda x: str(x.args[1])) + for key, reqs in grouper.get_grouped().items(): + # within each set of reqs for given kwargs, we reorder by token length, descending. + re_ords[key] = utils.Reorderer([req.args for req in reqs], _collate) + + pbar = tqdm(total=len(requests)) + # for each different set of kwargs, we execute all requests, by batch. + for key, re_ord in re_ords.items(): + chunks = utils.chunks(re_ord.get_reordered(), n=self.batch_size) + for chunk in chunks: + contexts, all_gen_kwargs = zip(*chunk) + # we assume all gen kwargs in the batch are the same + # this is safe to assume because the `grouper` object ensures it. + gen_kwargs = all_gen_kwargs[0] + # unpack our keyword arguments. 
+ until = None + if isinstance(gen_kwargs, dict): + kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1 + if "until" in kwargs.keys(): + until = kwargs.pop("until") + if isinstance(until, str): + until = [kwargs] + elif not isinstance(until, list): + raise ValueError( + f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}" + ) + else: + raise ValueError( + f"Expected `kwargs` to be of type `dict` but got {kwargs}" + ) + + if not until: + until = [self.tok_decode(self.eot_token_id)] + + if "max_gen_toks" in kwargs.keys(): + max_gen_toks = kwargs.pop("max_gen_toks") + else: + max_gen_toks = self.max_gen_toks + + # we require users to pass do_sample=True explicitly for non-greedy gen + if "do_sample" not in kwargs.keys(): + kwargs["do_sample"] = False + + # first stop sequence is used to halt generation upon encountering + primary_until = [until[0]] + + responses = self.pipeline( + sequences=contexts, + max_new_tokens=max_gen_toks, + stop=until, + **kwargs, + ) + + responses = responses if type(responses) is list else [responses] + for response, context in zip(responses, contexts): + text = response.generations[0].text + # use secondary stop seqs to cut off should-have-been-stopped content post-hoc + for term in until: + if len(term) > 0: + # ignore possible empty separators + text = text.split(term)[0] + + res[key].append(text) + self.cache_hook.add_partial("greedy_until", (context, gen_kwargs), text) + pbar.update(1) + # reorder this group of results back to original unsorted form + res[key] = re_ord.get_original(res[key]) + + pbar.close() + + return grouper.get_original(res) def _encode_pair( self, context: str, continuation: str diff --git a/src/deepsparse/evaluation/utils.py b/src/deepsparse/evaluation/utils.py index 0534a9f9f3..2b3bc5e8c7 100644 --- a/src/deepsparse/evaluation/utils.py +++ b/src/deepsparse/evaluation/utils.py @@ -146,6 +146,7 @@ def create_model_from_target( :param engine_type: The engine type to initialize the model with. 
:return: The initialized model """ + engine_type = engine_type or DEEPSPARSE_ENGINE if engine_type in [DEEPSPARSE_ENGINE, ORT_ENGINE]: return Pipeline.create( task="text-generation", diff --git a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py index 726f9f87eb..e982824966 100644 --- a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py +++ b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py @@ -20,10 +20,10 @@ @pytest.mark.parametrize( "datasets", [ - ["hellaswag"], + #["hellaswag"], # ["hellaswag", "gsm8k"], - # "gsm8k", - "arc_challenge", + "gsm8k", + #"arc_challenge", ], ) @pytest.mark.parametrize( @@ -39,22 +39,24 @@ def test_integration_eval_onnx_matches_torch(datasets, batch_size): integration_eval, ) - out_torch = integration_eval( - model="hf", - model_args="pretrained=roneneldan/TinyStories-1M,dtype=float16", - datasets=datasets, - batch_size=batch_size, - limit=2, - use_cache=None, # avoid saving files when running tests - ) + # out_torch = integration_eval( + # model="hf", + # model_args="pretrained=roneneldan/TinyStories-1M,dtype=float16", + # datasets=datasets, + # batch_size=batch_size, + # limit=1, + # use_cache=None, # avoid saving files when running tests + # ) out_onnx = integration_eval( model=create_model_from_target( - "hf:mgoin/TinyStories-1M-ds", engine_type="onnxruntime", sequence_length=128 + "hf:mgoin/TinyStories-1M-ds", + #engine_type="onnxruntime", + sequence_length=1024 ), datasets=datasets, batch_size=batch_size, - limit=2, + limit=1, use_cache=None, # avoid saving files when running tests ) From 35454a1e81b655c46985bb15e6b1fe5bdf44d9a1 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Fri, 2 Feb 2024 12:49:54 +0000 Subject: [PATCH 11/24] tests passing, refactor time! 
--- .../integrations/lm_evaluation_harness.py | 9 +++-- .../test_lm_evaluation_harness.py | 39 +++++++++++-------- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py index cedef9c643..d4021ba8f3 100644 --- a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py +++ b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py @@ -16,13 +16,15 @@ Integration of the `lm_evaluation_harness`: https://github.com/EleutherAI/lm-evaluation-harness """ +import copy import logging +from collections import defaultdict from typing import Any, Dict, List, Optional, Tuple, Union import numpy from tqdm import tqdm from transformers import AutoTokenizer -import copy + from deepsparse import Pipeline from deepsparse.evaluation.registry import EvaluationRegistry from deepsparse.evaluation.results import Dataset, Evaluation, Metric, Result @@ -30,7 +32,6 @@ from lm_eval import evaluator, tasks, utils from lm_eval.api.instance import Instance from lm_eval.api.model import LM -from collections import defaultdict tasks.initialize_tasks("INFO") @@ -302,7 +303,9 @@ def _collate(x): text = text.split(term)[0] res[key].append(text) - self.cache_hook.add_partial("greedy_until", (context, gen_kwargs), text) + self.cache_hook.add_partial( + "greedy_until", (context, gen_kwargs), text + ) pbar.update(1) # reorder this group of results back to original unsorted form res[key] = re_ord.get_original(res[key]) diff --git a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py index e982824966..86f7eac452 100644 --- a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py +++ b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py @@ -18,12 +18,16 @@ @pytest.mark.parametrize( - "datasets", + "datasets, model_path_ds, model_path_hf", [ - #["hellaswag"], - # ["hellaswag", "gsm8k"], - "gsm8k", - #"arc_challenge", + (["hellaswag"], "hf:mgoin/TinyStories-1M-ds", "roneneldan/TinyStories-1M"), + # (["hellaswag", "gsm8k"],"hf:mgoin/TinyLlama-1.1B-step-50K-105b-ONNX", "TinyLlama/TinyLlama-1.1B-step-50K-105b"), + ( + "gsm8k", + "hf:mgoin/TinyLlama-1.1B-step-50K-105b-ONNX", + "TinyLlama/TinyLlama-1.1B-step-50K-105b", + ), + # ("arc_challenge", "hf:mgoin/TinyStories-1M-ds", "roneneldan/TinyStories-1M"), ], ) @pytest.mark.parametrize( @@ -34,25 +38,26 @@ not try_import_lm_evaluation_harness(raise_error=False), reason="lm_evaluation_harness not installed", ) -def test_integration_eval_onnx_matches_torch(datasets, batch_size): +def test_integration_eval_onnx_matches_torch( + datasets, model_path_ds, model_path_hf, batch_size +): from deepsparse.evaluation.integrations.lm_evaluation_harness import ( integration_eval, ) - # out_torch = integration_eval( - # model="hf", - # model_args="pretrained=roneneldan/TinyStories-1M,dtype=float16", - # datasets=datasets, - # batch_size=batch_size, - # limit=1, - # use_cache=None, # avoid saving files when running tests - # ) + out_torch = integration_eval( + model="hf", + model_args=f"pretrained={model_path_hf},dtype=float16", + datasets=datasets, + batch_size=batch_size, + limit=1, + use_cache=None, # avoid saving files when running tests + ) out_onnx = integration_eval( model=create_model_from_target( - "hf:mgoin/TinyStories-1M-ds", - #engine_type="onnxruntime", - sequence_length=1024 + model_path_ds, + 
engine_type="onnxruntime", ), datasets=datasets, batch_size=batch_size, From d3b84f8b76d94e6beaa459556d608f9422b3337c Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Fri, 2 Feb 2024 13:28:39 +0000 Subject: [PATCH 12/24] cleanup --- .../integrations/lm_evaluation_harness.py | 38 ++++++++++++++----- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py index d4021ba8f3..6bf8062a2c 100644 --- a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py +++ b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py @@ -114,16 +114,22 @@ def __init__( self, pipeline: Pipeline, batch_size: int = 1, - max_gen_toks: int = 128, + max_gen_toks: int = 256, tokenizer: Optional[AutoTokenizer] = None, ): """ Wrapper around the DeepSparse pipeline to make it compatible with the llm-evaluation-harness. + + :param pipeline: the pipeline object to wrap + :param batch_size: the batch size to use for evaluation + :param max_gen_toks: the maximum number of tokens to generate + when using the model for generation (see: greed_until method) + :param tokenizer: the tokenizer to use for encoding and decoding + strings and tokens. By default, the tokenizer from the pipeline """ super().__init__() - # Initialize new model and tokenizer instances self.pipeline = pipeline self.batch_size = batch_size self.tokenizer = tokenizer or pipeline.tokenizer @@ -164,6 +170,13 @@ def _loglikelihood_tokens( requests: List[Tuple[Tuple[str, str], List[int], List[int]]], disable_tqdm: bool = False, ) -> List[Tuple[float, bool]]: + """ + The function to compute the loglikelihood of the continuation + tokens given the context tokens. + + This function is an adapted version of the original function from + https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py + """ res = [] def _collate(x): @@ -226,9 +239,18 @@ def _collate(x): def loglikelihood_rolling( self, requests: list[Instance] ) -> list[tuple[float, bool]]: - raise NotImplementedError() + raise NotImplementedError( + "The method not required by any of our " "current task integrations so far" + ) def generate_until(self, requests: list[Instance]) -> list[str]: + """ + The function to generate a certain number of new tokens + given a context. 
+ + This function is an adapted version of the original function from + https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py + """ res = defaultdict(list) re_ords = {} @@ -283,19 +305,15 @@ def _collate(x): if "do_sample" not in kwargs.keys(): kwargs["do_sample"] = False - # first stop sequence is used to halt generation upon encountering - primary_until = [until[0]] - - responses = self.pipeline( + out = self.pipeline( sequences=contexts, max_new_tokens=max_gen_toks, stop=until, **kwargs, ) - responses = responses if type(responses) is list else [responses] - for response, context in zip(responses, contexts): - text = response.generations[0].text + for gen, context in zip(out.generations, contexts): + text = gen.text # use secondary stop seqs to cut off should-have-been-stopped content post-hoc for term in until: if len(term) > 0: From e7d8c3127dafcbec9b380949a3fe189da77b24ba Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Mon, 5 Feb 2024 11:35:26 +0100 Subject: [PATCH 13/24] Update test_evaluator.py --- tests/deepsparse/evaluation/test_evaluator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/deepsparse/evaluation/test_evaluator.py b/tests/deepsparse/evaluation/test_evaluator.py index dedd63fa36..61a1eb3891 100644 --- a/tests/deepsparse/evaluation/test_evaluator.py +++ b/tests/deepsparse/evaluation/test_evaluator.py @@ -133,6 +133,6 @@ def test_cli(tmp_path, target, datasets, dummy_integration_name, type_serializat standalone_mode=False, ) # makes sure that the result file is created - assert os.path.isfile( - os.path.join(os.path.dirname(str(tmp_path)), f"result.{type_serialization}") + assert os.path.isfile(os.path.join(os.path.dirname(str(tmp_path)), + f"result.{type_serialization}") ) From a148fc5177ec37db3925a02b36d4002c7f2457b9 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Mon, 5 Feb 2024 10:38:51 +0000 Subject: [PATCH 14/24] finished --- .../integrations/lm_evaluation_harness.py | 201 +++++++++--------- .../test_lm_evaluation_harness.py | 130 +++++++---- 2 files changed, 182 insertions(+), 149 deletions(-) diff --git a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py index 6bf8062a2c..7931d12f72 100644 --- a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py +++ b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py @@ -135,6 +135,7 @@ def __init__( self.tokenizer = tokenizer or pipeline.tokenizer self._max_length = pipeline.sequence_length self._max_gen_toks = max_gen_toks + self.batch_sizes = {} def tok_encode(self, string: str) -> List[int]: return self.tokenizer.encode(string) @@ -190,6 +191,10 @@ def _collate(x): list(utils.chunks(re_ord.get_reordered(), self.batch_size)), disable=disable_tqdm, ): + batch_inp = [] + batch_cache_key = [] + batch_continuation_enc = [] + # len(chunk) is the batch_size for cache_key, context_enc, continuation_enc in chunk: # how this all works (illustrated on a causal decoder-only setup): # CTX CONT @@ -200,39 +205,45 @@ def _collate(x): inp = (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1] - response = self.pipeline( - prompt=self.tokenizer.decode(inp), - max_new_tokens=0, - output_scores=True, - include_prompt_logits=True, - ) - - for i, resp in enumerate(response.generations): - # (seq_len, vocab_size) - multi_scores = resp.score - # (seq_len, vocab_size) but with softmax applied - multi_logits = 
numpy_log_softmax(multi_scores, axis=1) - # toss out the context half of the sequence - # (cont_len, vocab_size) - continuation_multi_logits = multi_logits[-len(continuation_enc) :] - - # pick out the logits for the continuation tokens - # (cont_len,) - continuation_logits = continuation_multi_logits[ - numpy.arange(len(continuation_enc)), continuation_enc - ] - # check if the tokens generated greedly are the same - # as the expected continuation - greedy_tokens = continuation_multi_logits.argmax(axis=1) - max_equal = greedy_tokens.tolist() == continuation_enc - - # Answer: (log prob, is-exact-match) - answer = (float(continuation_logits.sum()), bool(max_equal)) - - res.append(answer) - - if cache_key is not None: - self.cache_hook.add_partial("loglikelihood", cache_key, answer) + batch_inp.append(self.tokenizer.decode(inp)) + batch_cache_key.append(cache_key) + batch_continuation_enc.append(continuation_enc) + + response = self.pipeline( + prompt=batch_inp, + max_new_tokens=0, + output_scores=True, + include_prompt_logits=True, + ) + + for resp, continuation_enc, cache_key in zip( + response.generations, batch_continuation_enc, batch_cache_key + ): + # (seq_len, vocab_size) + multi_scores = resp.score + # (seq_len, vocab_size) but with softmax applied + multi_logits = numpy_log_softmax(multi_scores, axis=1) + # toss out the context half of the sequence + # (cont_len, vocab_size) + continuation_multi_logits = multi_logits[-len(continuation_enc) :] + + # pick out the logits for the continuation tokens + # (cont_len,) + continuation_logits = continuation_multi_logits[ + numpy.arange(len(continuation_enc)), continuation_enc + ] + # check if the tokens generated greedly are the same + # as the expected continuation + greedy_tokens = continuation_multi_logits.argmax(axis=1) + max_equal = greedy_tokens.tolist() == continuation_enc + + # Answer: (log prob, is-exact-match) + answer = (float(continuation_logits.sum()), bool(max_equal)) + + res.append(answer) + + if cache_key is not None: + self.cache_hook.add_partial("loglikelihood", cache_key, answer) return re_ord.get_original(res) @@ -251,86 +262,70 @@ def generate_until(self, requests: list[Instance]) -> list[str]: This function is an adapted version of the original function from https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py """ - res = defaultdict(list) - re_ords = {} + if not requests: + return [] + res = [] + requests = [req.args for req in requests] def _collate(x): - # the negative sign on len(toks) sorts descending toks = self.tok_encode(x[0]) - return -len(toks), x[0] + return len(toks), x[0] + + re_ord = utils.Reorderer(requests, _collate) + + def sameuntil_chunks(xs, size): + ret = [] + lastuntil = xs[0][1] + for x in xs: + if len(ret) >= size or x[1] != lastuntil: + yield ret, lastuntil + ret = [] + lastuntil = x[1] + ret.append(x) - # we group requests by their generation_kwargs, - # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling - # in the same batch. - grouper = utils.Grouper(requests, lambda x: str(x.args[1])) - for key, reqs in grouper.get_grouped().items(): - # within each set of reqs for given kwargs, we reorder by token length, descending. - re_ords[key] = utils.Reorderer([req.args for req in reqs], _collate) + if ret: + yield ret, lastuntil pbar = tqdm(total=len(requests)) - # for each different set of kwargs, we execute all requests, by batch. 
- for key, re_ord in re_ords.items(): - chunks = utils.chunks(re_ord.get_reordered(), n=self.batch_size) - for chunk in chunks: - contexts, all_gen_kwargs = zip(*chunk) - # we assume all gen kwargs in the batch are the same - # this is safe to assume because the `grouper` object ensures it. - gen_kwargs = all_gen_kwargs[0] - # unpack our keyword arguments. - until = None - if isinstance(gen_kwargs, dict): - kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1 - if "until" in kwargs.keys(): - until = kwargs.pop("until") - if isinstance(until, str): - until = [kwargs] - elif not isinstance(until, list): - raise ValueError( - f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}" - ) - else: - raise ValueError( - f"Expected `kwargs` to be of type `dict` but got {kwargs}" - ) - - if not until: - until = [self.tok_decode(self.eot_token_id)] - - if "max_gen_toks" in kwargs.keys(): - max_gen_toks = kwargs.pop("max_gen_toks") - else: - max_gen_toks = self.max_gen_toks - - # we require users to pass do_sample=True explicitly for non-greedy gen - if "do_sample" not in kwargs.keys(): - kwargs["do_sample"] = False - - out = self.pipeline( - sequences=contexts, - max_new_tokens=max_gen_toks, - stop=until, - **kwargs, - ) + for chunk, request_args in tqdm( + list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size)) + ): + inps = [] - for gen, context in zip(out.generations, contexts): - text = gen.text - # use secondary stop seqs to cut off should-have-been-stopped content post-hoc - for term in until: - if len(term) > 0: - # ignore possible empty separators - text = text.split(term)[0] - - res[key].append(text) - self.cache_hook.add_partial( - "greedy_until", (context, gen_kwargs), text - ) - pbar.update(1) - # reorder this group of results back to original unsorted form - res[key] = re_ord.get_original(res[key]) + self._max_gen_toks = request_args.pop("max_gen_toks", self.max_gen_toks) + print(self._max_gen_toks) + + for context, _ in chunk: + inps.append(context) + + until = request_args.pop("until", ["<|endoftext|>"]) + request_args.pop("do_sample", None) + request_args["temperature"] = request_args.get("temperature", 0) + + out = self.pipeline( + sequences=inps, + max_new_tokens=self.max_gen_toks - 1, + stop=until, + **request_args, + ) + + for resp, (context, args_) in zip(out.generations, chunk): + text = resp.text + until_ = until + for term in until_: + if len(term) > 0: + text = text.split(term)[0] + + res.append(text) + + self.cache_hook.add_partial( + "generate_until", (context, {"until": until_}), text + ) + pbar.update(1) pbar.close() - return grouper.get_original(res) + return re_ord.get_original(res) def _encode_pair( self, context: str, continuation: str diff --git a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py index 86f7eac452..91520ee300 100644 --- a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py +++ b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py @@ -17,61 +17,99 @@ from deepsparse.evaluation.utils import create_model_from_target -@pytest.mark.parametrize( - "datasets, model_path_ds, model_path_hf", - [ - (["hellaswag"], "hf:mgoin/TinyStories-1M-ds", "roneneldan/TinyStories-1M"), - # (["hellaswag", "gsm8k"],"hf:mgoin/TinyLlama-1.1B-step-50K-105b-ONNX", "TinyLlama/TinyLlama-1.1B-step-50K-105b"), - ( - "gsm8k", - "hf:mgoin/TinyLlama-1.1B-step-50K-105b-ONNX", - "TinyLlama/TinyLlama-1.1B-step-50K-105b", - 
), - # ("arc_challenge", "hf:mgoin/TinyStories-1M-ds", "roneneldan/TinyStories-1M"), - ], -) @pytest.mark.parametrize( "batch_size", - [1], # TODO: Add test for higher batch sizes + [1, 3], ) @pytest.mark.skipif( not try_import_lm_evaluation_harness(raise_error=False), reason="lm_evaluation_harness not installed", ) -def test_integration_eval_onnx_matches_torch( - datasets, model_path_ds, model_path_hf, batch_size -): - from deepsparse.evaluation.integrations.lm_evaluation_harness import ( - integration_eval, - ) +class TestLMEval: + @pytest.fixture() + def integration_eval(self): + from deepsparse.evaluation.integrations.lm_evaluation_harness import ( + integration_eval as eval_fn, + ) + + return eval_fn - out_torch = integration_eval( - model="hf", - model_args=f"pretrained={model_path_hf},dtype=float16", - datasets=datasets, - batch_size=batch_size, - limit=1, - use_cache=None, # avoid saving files when running tests + @pytest.mark.parametrize( + "datasets_likelihood", + [ + "hellaswag", + ["arc_challenge"], + ["hellaswag", "arc_challenge"], + ], ) + def test_likelihood_scenario( + self, batch_size, datasets_likelihood, integration_eval + ): + model_path_ds = "hf:mgoin/TinyStories-1M-ds" + model_path_hf = "roneneldan/TinyStories-1M" + + out_onnx = integration_eval( + model=create_model_from_target( + model_path_ds, + engine_type="onnxruntime", + ), + datasets=datasets_likelihood, + batch_size=batch_size, + limit=2, + use_cache=None, # avoid saving files when running tests + ) - out_onnx = integration_eval( - model=create_model_from_target( - model_path_ds, - engine_type="onnxruntime", - ), - datasets=datasets, - batch_size=batch_size, - limit=1, - use_cache=None, # avoid saving files when running tests + out_torch = integration_eval( + model="hf", + model_args=f"pretrained={model_path_hf}", + datasets=datasets_likelihood, + batch_size=batch_size, + limit=2, + use_cache=None, # avoid saving files when running tests + ) + self._test_same(out_onnx, out_torch, datasets_likelihood) + + @pytest.mark.parametrize( + "datasets_greedy_until", + [ + "gsm8k", + ], ) + def test_greedy_until_scenario( + self, batch_size, datasets_greedy_until, integration_eval + ): + model_path_ds = "hf:mgoin/TinyLlama-1.1B-step-50K-105b-ONNX" + model_path_hf = "TinyLlama/TinyLlama-1.1B-step-50K-105b" + + out_onnx = integration_eval( + model=create_model_from_target(model_path_ds, engine_type="onnxruntime"), + datasets=datasets_greedy_until, + batch_size=batch_size, + limit=2, + gen_kwargs="max_gen_toks=16", + use_cache=None, # avoid saving files when running tests + ) + + out_torch = integration_eval( + model="hf", + model_args=f"pretrained={model_path_hf}", + datasets=datasets_greedy_until, + batch_size=batch_size, + limit=2, + gen_kwargs="max_gen_toks=16", + use_cache=None, # avoid saving files when running tests + ) + self._test_same(out_onnx, out_torch, datasets_greedy_until) + + @staticmethod + def _test_same(out_onnx, out_torch, datasets): + datasets = datasets if isinstance(datasets, list) else [datasets] + for dataset in datasets: + torch_samples = out_torch.raw["samples"][dataset] + onnx_samples = out_onnx.raw["samples"][dataset] + for torch_sample, onnx_sample in zip(torch_samples, onnx_samples): + print(torch_sample) + print(onnx_sample) + print(torch_sample["resps"], onnx_sample["resps"]) + assert torch_sample["resps"] == onnx_sample["resps"] - datasets = datasets if isinstance(datasets, list) else [datasets] - for dataset in datasets: - torch_samples = out_torch.raw["samples"][dataset] - onnx_samples = 
out_onnx.raw["samples"][dataset] - for torch_sample, onnx_sample in zip(torch_samples, onnx_samples): - for torch_resp, onnx_resp in zip( - torch_sample["resps"], onnx_sample["resps"] - ): - assert pytest.approx(torch_resp[0][0], 0.1) == onnx_resp[0][0] - assert torch_resp[0][1] == onnx_resp[0][1] From a9e98478ec394b673749f2c496228a4061b02281 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Mon, 5 Feb 2024 10:54:06 +0000 Subject: [PATCH 15/24] quality --- src/deepsparse/evaluation/cli.py | 2 +- tests/deepsparse/evaluation/test_utils.py | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index 43eaa33790..b68d32d4e5 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -20,7 +20,7 @@ Module for evaluating models on the various evaluation integrations OPTIONS: - --model_path MODEL_PATH + --model_path MODEL_PATH A path to an ONNX model, local directory containing ONNX model (including all the auxiliary files) or a SparseZoo stub -d DATASET, --dataset DATASET diff --git a/tests/deepsparse/evaluation/test_utils.py b/tests/deepsparse/evaluation/test_utils.py index a16cb8ee32..f8f3c731a8 100644 --- a/tests/deepsparse/evaluation/test_utils.py +++ b/tests/deepsparse/evaluation/test_utils.py @@ -14,12 +14,6 @@ import os -from transformers import ( - AutoModelForCausalLM, - AutoModelForSequenceClassification, - GPTNeoForCausalLM, -) - import pytest from deepsparse import Pipeline from deepsparse.evaluation.utils import ( From b5a6d6d90af3aed8f5e091ead177951839a19e6e Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Mon, 5 Feb 2024 13:32:59 +0000 Subject: [PATCH 16/24] manual testing --- src/deepsparse/evaluation/cli.py | 14 ++-- src/deepsparse/evaluation/evaluator.py | 1 - .../evaluation/integrations/__init__.py | 3 +- .../integrations/lm_evaluation_harness.py | 24 +++---- src/deepsparse/evaluation/registry.py | 2 +- src/deepsparse/evaluation/utils.py | 71 +++++++++++++++---- .../test_lm_evaluation_harness.py | 33 +++++---- tests/deepsparse/evaluation/test_evaluator.py | 1 - 8 files changed, 96 insertions(+), 53 deletions(-) diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index b68d32d4e5..d192dd67a1 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -20,7 +20,7 @@ Module for evaluating models on the various evaluation integrations OPTIONS: - --model_path MODEL_PATH + MODEL_PATH A path to an ONNX model, local directory containing ONNX model (including all the auxiliary files) or a SparseZoo stub -d DATASET, --dataset DATASET @@ -72,7 +72,7 @@ from deepsparse.evaluation.evaluator import evaluate from deepsparse.evaluation.results import Result, save_result -from deepsparse.evaluation.utils import args_to_dict, get_save_path +from deepsparse.evaluation.utils import get_save_path, parse_kwarg_tuples from deepsparse.operators.engine_operator import ( DEEPSPARSE_ENGINE, ORT_ENGINE, @@ -88,12 +88,10 @@ ignore_unknown_options=True, ) ) -@click.option( - "--model_path", +@click.argument( + "model_path", type=click.Path(dir_okay=True, file_okay=True), required=True, - help="A path to an ONNX model, local directory containing ONNX model" - "(including all the auxiliary files) or a SparseZoo stub", ) @click.option( "-d", @@ -178,7 +176,7 @@ def main( # join datasets to a list if multiple datasets are passed datasets = list(dataset) if not isinstance(dataset, str) else dataset # format kwargs to a dict - integration_args = 
args_to_dict(integration_args) + integration_args = parse_kwarg_tuples(integration_args) _LOGGER.info( f"Creating {engine_type} pipeline to evaluate from model path: {model_path}" @@ -203,7 +201,7 @@ def main( **integration_args, ) - _LOGGER.info(f"Evaluation done. Results:\n{result}") + _LOGGER.info(f"Evaluation done. Results:\n{result.formatted}") save_path = get_save_path( save_path=save_path, diff --git a/src/deepsparse/evaluation/evaluator.py b/src/deepsparse/evaluation/evaluator.py index b513f07563..3d18f8489f 100644 --- a/src/deepsparse/evaluation/evaluator.py +++ b/src/deepsparse/evaluation/evaluator.py @@ -65,7 +65,6 @@ def evaluate( return eval_integration( pipeline=pipeline, datasets=datasets, - engine_type=engine_type, batch_size=batch_size, splits=splits, metrics=metrics, diff --git a/src/deepsparse/evaluation/integrations/__init__.py b/src/deepsparse/evaluation/integrations/__init__.py index 1cc3bfacf0..c7e8d3c5fa 100644 --- a/src/deepsparse/evaluation/integrations/__init__.py +++ b/src/deepsparse/evaluation/integrations/__init__.py @@ -24,8 +24,7 @@ def try_import_lm_evaluation_harness(raise_error=False): if raise_error: raise ImportError( "Unable to import lm_eval. " - "To install run 'pip install " - "git+https://github.com/EleutherAI/lm-evaluation-harness@b018a7d51'" + "To install run 'pip install lm-eval==0.4.0'" ) return False diff --git a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py index 7931d12f72..1d13bb37ee 100644 --- a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py +++ b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py @@ -16,14 +16,11 @@ Integration of the `lm_evaluation_harness`: https://github.com/EleutherAI/lm-evaluation-harness """ -import copy import logging -from collections import defaultdict from typing import Any, Dict, List, Optional, Tuple, Union import numpy from tqdm import tqdm -from transformers import AutoTokenizer from deepsparse import Pipeline from deepsparse.evaluation.registry import EvaluationRegistry @@ -43,9 +40,11 @@ @EvaluationRegistry.register(name="lm-evaluation-harness") def integration_eval( - model: Any, + pipeline: Pipeline, datasets: Union[List[str], str], - batch_size: int, + batch_size: int = 1, + splits: Union[List[str], str, None] = None, + metrics: Union[List[str], str, None] = None, **kwargs, ) -> Result: """ @@ -53,15 +52,14 @@ def integration_eval( https://github.com/EleutherAI/lm-evaluation-harness/blob/master/main.py that is compatible with deepsparse.evaluator.py - :param model: the model/pipeline to evaluate + :param pipeline: the model/pipeline to evaluate :param datasets: the datasets to evaluate on :param batch_size: the batch size to use for evaluation :param kwargs: additional arguments to alter the behavior of the evaluation :return the evaluation results """ - if isinstance(model, Pipeline): - model = DeepSparseLM(pipeline=model, batch_size=batch_size) + pipeline = DeepSparseLM(pipeline=pipeline, batch_size=batch_size) datasets = (",").join(datasets) if isinstance(datasets, list) else datasets task_names = utils.pattern_match(datasets.split(","), tasks.ALL_TASKS) @@ -69,7 +67,7 @@ def integration_eval( _LOGGER.info(f"Selected Tasks: {task_names}") results_raw = evaluator.simple_evaluate( - model=model, tasks=task_names, batch_size=batch_size, **kwargs + model=pipeline, tasks=task_names, batch_size=batch_size, **kwargs ) results = Result( @@ -115,7 +113,7 @@ def __init__( pipeline: Pipeline, 
batch_size: int = 1, max_gen_toks: int = 256, - tokenizer: Optional[AutoTokenizer] = None, + tokenizer: Optional["AutoTokenizer"] = None, ): """ Wrapper around the DeepSparse pipeline to make it compatible with the @@ -260,7 +258,7 @@ def generate_until(self, requests: list[Instance]) -> list[str]: given a context. This function is an adapted version of the original function from - https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py + https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/openai_completions.py """ if not requests: return [] @@ -293,15 +291,16 @@ def sameuntil_chunks(xs, size): inps = [] self._max_gen_toks = request_args.pop("max_gen_toks", self.max_gen_toks) - print(self._max_gen_toks) for context, _ in chunk: + # add context (prompts) to the list inps.append(context) until = request_args.pop("until", ["<|endoftext|>"]) request_args.pop("do_sample", None) request_args["temperature"] = request_args.get("temperature", 0) + # run inference (generate max_gen_toks tokens) out = self.pipeline( sequences=inps, max_new_tokens=self.max_gen_toks - 1, @@ -312,6 +311,7 @@ def sameuntil_chunks(xs, size): for resp, (context, args_) in zip(out.generations, chunk): text = resp.text until_ = until + # split the text at the first occurrence of any of the until tokens for term in until_: if len(term) > 0: text = text.split(term)[0] diff --git a/src/deepsparse/evaluation/registry.py b/src/deepsparse/evaluation/registry.py index 2daabb69cc..343cd9786c 100644 --- a/src/deepsparse/evaluation/registry.py +++ b/src/deepsparse/evaluation/registry.py @@ -57,7 +57,7 @@ def resolve( if integration is None: _LOGGER.info( - "No integration specified, inferring the evaluation" + "No integration specified, inferring the evaluation " "function from the input arguments..." ) integration = resolve_integration(pipeline, datasets) diff --git a/src/deepsparse/evaluation/utils.py b/src/deepsparse/evaluation/utils.py index 998a346780..c170b29476 100644 --- a/src/deepsparse/evaluation/utils.py +++ b/src/deepsparse/evaluation/utils.py @@ -11,7 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import ast +import logging import os from typing import Any, Dict, List, Optional, Tuple, Union @@ -22,10 +23,10 @@ __all__ = [ "create_pipeline", "get_save_path", - "args_to_dict", + "parse_kwarg_tuples", "resolve_integration", ] - +_LOGGER = logging.getLogger(__name__) LM_EVALUATION_HARNESS = "lm-evaluation-harness" @@ -80,24 +81,66 @@ def if_generative_language_model(pipeline: Pipeline) -> bool: return False -def args_to_dict(args: Tuple[Any, ...]) -> Dict[str, Any]: +def parse_kwarg_tuples(kwargs: tuple) -> Dict: """ - Convert a tuple of args to a dict of args. - - :param args: The args to convert. Should be a tuple of alternating - arg names and arg values e.g.('--arg1', 1, 'arg2', 2, -arg3', 3). + Convert a tuple of kwargs to a dict of kwargs. + This function is used to enable the click parsing of kwargs. + + Example use: + ``` + @click.command( + context_settings=dict( + ignore_unknown_options=True) + ) + @click.argument(...) + @click.option(...) + ... + @click.argument("kwargs", nargs=-1, type=click.UNPROCESSED) + def main(..., kwargs): + ... 
+ kwargs: Dict[str, Any] = parse_kwarg_tuples(kwargs) + ``` + + Example inputs, outputs: + ``` + input = ('--arg1', 1, 'arg2', 2, '-arg3', 3) + output = parse_kwarg_tuples(input) + output = {'arg1': 1, 'arg2': 2, 'arg3': 3} + ``` + + :param kwargs: The kwargs to convert. Should be a tuple of alternating + kwargs names and kwargs values e.g.('--arg1', 1, 'arg2', 2, '-arg3', 3). The names can optionally have a '-' or `--` in front of them. - :return: The converted args as a dict. + :return: The converted kwargs as a dict. """ - if len(args) == 0: + if len(kwargs) == 0: return {} + if len(kwargs) % 2 != 0: + raise ValueError( + "kwargs must be a tuple of alternating names and values " + "i.e. the length of kwargs tuple must be even. Received " + f"kwargs: {kwargs}" + ) # names are uneven indices, values are even indices - args_names = args[0::2] - args_values = args[1::2] + kwargs_names = kwargs[0::2] + kwargs_values = kwargs[1::2] + # by default kwargs values are strings, so convert them + # to the appropriate type if possible + kwargs_values = list(kwargs_values) + for i, value in enumerate(kwargs_values): + try: + kwargs_values[i] = ast.literal_eval(value) + except Exception as e: # noqa E841 + _LOGGER.debug( + f"Failed to infer non-string type " + f"from kwarg value: {value}. It will " + f"be left as a string." + ) + # remove any '-' or '--' from the names - args_names = [name.lstrip("-") for name in args_names] + kwargs_names = [name.lstrip("-") for name in kwargs_names] - return dict(zip(args_names, args_values)) + return dict(zip(kwargs_names, kwargs_values)) def get_save_path( diff --git a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py index 61a24e3d75..8d8b343dd5 100644 --- a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py +++ b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py @@ -43,6 +43,7 @@ def integration_eval(self): ], ) def test_likelihood_scenario(self, batch_size, datasets, integration_eval): + model_path_ds = "hf:mgoin/TinyStories-1M-ds" model_path_hf = "roneneldan/TinyStories-1M" limit = 2 @@ -58,15 +59,18 @@ def test_likelihood_scenario(self, batch_size, datasets, integration_eval): use_cache=None, # avoid saving files when running tests ) - out_torch = integration_eval( + from lm_eval import evaluator, tasks, utils + + datasets_ = (",").join(datasets) if isinstance(datasets, list) else datasets + out_torch = evaluator.simple_evaluate( model="hf", model_args=f"pretrained={model_path_hf}", - datasets=datasets, + tasks=utils.pattern_match(datasets_.split(","), tasks.ALL_TASKS), batch_size=batch_size, limit=limit, use_cache=None, # avoid saving files when running tests ) - self._test_same(out_onnx, out_torch, datasets) + self._test_same(out_onnx.raw, out_torch, datasets) @pytest.mark.parametrize( "datasets", [ "gsm8k", ], ) - def test_greedy_until_scenario( - self, batch_size, datasets, integration_eval, greedy=True - ): + def test_greedy_until_scenario(self, batch_size, datasets, integration_eval): model_path_ds = "hf:mgoin/TinyLlama-1.1B-step-50K-105b-ONNX" model_path_hf = "TinyLlama/TinyLlama-1.1B-step-50K-105b" limit = 2 @@ -85,7 +87,7 @@ def test_greedy_until_scenario( gen_kwargs = "max_gen_toks=16" out_onnx = integration_eval( - model=create_pipeline(model_path_ds, engine_type="onnxruntime"), + create_pipeline(model_path_ds,
engine_type="onnxruntime"), datasets=datasets, batch_size=batch_size, limit=limit, @@ -93,23 +95,26 @@ def test_greedy_until_scenario( use_cache=None, # avoid saving files when running tests ) - out_torch = integration_eval( + from lm_eval import evaluator, tasks, utils + + datasets_ = (",").join(datasets) if isinstance(datasets, list) else datasets + out_torch = evaluator.simple_evaluate( model="hf", model_args=f"pretrained={model_path_hf}", - datasets=datasets, + tasks=utils.pattern_match(datasets_.split(","), tasks.ALL_TASKS), batch_size=batch_size, limit=limit, gen_kwargs=gen_kwargs, use_cache=None, # avoid saving files when running tests ) - self._test_same(out_onnx, out_torch, datasets) + self._test_same(out_onnx.raw, out_torch, datasets) @staticmethod def _test_same(out_onnx, out_torch, datasets, greedy=False): datasets = datasets if isinstance(datasets, list) else [datasets] for dataset in datasets: - torch_samples = out_torch.raw["samples"][dataset] - onnx_samples = out_onnx.raw["samples"][dataset] + torch_samples = out_torch["samples"][dataset] + onnx_samples = out_onnx["samples"][dataset] for torch_sample, onnx_sample in zip(torch_samples, onnx_samples): if greedy: # for datasets that validate greedy generation @@ -119,6 +124,6 @@ def _test_same(out_onnx, out_torch, datasets, greedy=False): # for datasets that validate likelihood # make sure that likelihoods are the same assert ( - pytest.approx(torch_sample["resps"][0], 0.01) - == onnx_sample["resps"][0] + pytest.approx(torch_sample["resps"][0][0], 0.0001) + == onnx_sample["resps"][0][0] ) diff --git a/tests/deepsparse/evaluation/test_evaluator.py b/tests/deepsparse/evaluation/test_evaluator.py index 816ad075e0..9e7d21bdae 100644 --- a/tests/deepsparse/evaluation/test_evaluator.py +++ b/tests/deepsparse/evaluation/test_evaluator.py @@ -144,7 +144,6 @@ def test_cli( runner.invoke( main, [ - "--model_path", model_path, "--dataset", datasets[0], From e10f0c97073742623fbc0da91b430198a06a653a Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Mon, 5 Feb 2024 17:04:28 +0000 Subject: [PATCH 17/24] UI improvements --- setup.py | 2 +- src/deepsparse/evaluation/cli.py | 2 -- src/deepsparse/evaluation/utils.py | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 8fe04d23be..ff8269257f 100644 --- a/setup.py +++ b/setup.py @@ -308,7 +308,7 @@ def _setup_entry_points() -> Dict: f"deepsparse.image_classification.eval={ic_eval}", "deepsparse.license=deepsparse.license:main", "deepsparse.validate_license=deepsparse.license:validate_license_cli", - "deepsparse.eval=deepsparse.evaluation.cli:main", + "deepsparse.evaluate=deepsparse.evaluation.cli:main", ] } diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index e1b9cf5c57..d192dd67a1 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -92,8 +92,6 @@ "model_path", type=click.Path(dir_okay=True, file_okay=True), required=True, - help="A path to a remote or local directory containing ONNX/torch model " - "(including all the auxiliary files) or a SparseZoo stub", ) @click.option( "-d", diff --git a/src/deepsparse/evaluation/utils.py b/src/deepsparse/evaluation/utils.py index c170b29476..b2695abaa1 100644 --- a/src/deepsparse/evaluation/utils.py +++ b/src/deepsparse/evaluation/utils.py @@ -27,7 +27,7 @@ "resolve_integration", ] _LOGGER = logging.getLogger(__name__) -LM_EVALUATION_HARNESS = "lm-evaluation-harness" +LM_EVALUATION_HARNESS = "lm-eval-harness" def 
potentially_check_dependency_import(integration_name: str) -> bool: From 48a5900398d399fef23c999cb2ae3d6b973be264 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Tue, 6 Feb 2024 07:28:39 +0000 Subject: [PATCH 18/24] new UI adaptations --- src/deepsparse/evaluation/cli.py | 2 +- .../integrations/lm_evaluation_harness.py | 17 ++++++++++------- src/deepsparse/evaluation/utils.py | 2 +- tests/deepsparse/evaluation/test_evaluator.py | 5 ++--- tests/deepsparse/evaluation/test_utils.py | 2 +- 5 files changed, 15 insertions(+), 13 deletions(-) diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index d192dd67a1..4d97c904bb 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -61,7 +61,7 @@ deepsparse.eval zoo:mpt-7b-mpt_pretrain-base_quantized \ --dataset hellaswag \ --dataset gsm8k \ - --integration lm-evaluation-harness \ + --integration lm-eval-harness \ --limit 2 # limit the number of samples to evaluate on, specific to the integration """ # noqa: E501 diff --git a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py index 1d13bb37ee..7347d91bfb 100644 --- a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py +++ b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py @@ -13,7 +13,7 @@ # limitations under the License. """ -Integration of the `lm_evaluation_harness`: +Integration of the `lm-evaluation-harness`: https://github.com/EleutherAI/lm-evaluation-harness """ import logging @@ -25,6 +25,7 @@ from deepsparse import Pipeline from deepsparse.evaluation.registry import EvaluationRegistry from deepsparse.evaluation.results import Dataset, Evaluation, Metric, Result +from deepsparse.evaluation.utils import LM_EVALUATION_HARNESS from deepsparse.utils.data import numpy_log_softmax from lm_eval import evaluator, tasks, utils from lm_eval.api.instance import Instance @@ -38,7 +39,7 @@ __all__ = ["integration_eval"] -@EvaluationRegistry.register(name="lm-evaluation-harness") +@EvaluationRegistry.register(name=LM_EVALUATION_HARNESS) def integration_eval( pipeline: Pipeline, datasets: Union[List[str], str], @@ -83,7 +84,7 @@ def format_raw_results(results: Dict[str, Any]) -> List[Evaluation]: Format the raw results from lm_evaluation_harness into a list of Evaluation objects. 
- :param results: the raw results from lm_evaluation_harness + :param results: the raw results from lm-evaluation-harness :return: the formatted results as a list of Evaluation objects """ formatted_results = [] @@ -98,7 +99,7 @@ def format_raw_results(results: Dict[str, Any]) -> List[Evaluation]: type=None, name=dataset_name, config=results["config"], split=None ) evaluation = Evaluation( - task="lm_evaluation_harness", + task=LM_EVALUATION_HARNESS, dataset=dataset, metrics=metrics, samples=None, @@ -113,7 +114,7 @@ def __init__( pipeline: Pipeline, batch_size: int = 1, max_gen_toks: int = 256, - tokenizer: Optional["AutoTokenizer"] = None, + tokenizer: Optional["AutoTokenizer"] = None, # noqa: F821 ): """ Wrapper around the DeepSparse pipeline to make it compatible with the @@ -157,7 +158,9 @@ def loglikelihood(self, requests) -> List[Tuple[float, bool]]: new_reqs = [] for context, continuation in [req.args for req in requests]: if context == "": - raise NotImplemented("Implementing empty context is not supported yet") + raise NotImplementedError( + "Implementing empty context is not supported yet" + ) context_enc, continuation_enc = self._encode_pair(context, continuation) new_reqs.append(((context, continuation), context_enc, continuation_enc)) @@ -199,7 +202,7 @@ def _collate(x): # inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1] # model \ \ # logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the - # cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice + # cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice # noqa: E501 inp = (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1] diff --git a/src/deepsparse/evaluation/utils.py b/src/deepsparse/evaluation/utils.py index b2695abaa1..02f16b089c 100644 --- a/src/deepsparse/evaluation/utils.py +++ b/src/deepsparse/evaluation/utils.py @@ -14,7 +14,7 @@ import ast import logging import os -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Union from deepsparse import Pipeline from deepsparse.operators.engine_operator import DEEPSPARSE_ENGINE diff --git a/tests/deepsparse/evaluation/test_evaluator.py b/tests/deepsparse/evaluation/test_evaluator.py index 9e7d21bdae..225a255d52 100644 --- a/tests/deepsparse/evaluation/test_evaluator.py +++ b/tests/deepsparse/evaluation/test_evaluator.py @@ -122,9 +122,8 @@ def test_evaluation_llm_evaluation_harness_integration_name( assert evaluate( model=model_path, datasets=datasets, - limit=2, - no_cache=True, - integration="lm_evaluation_harness", + limit=1, + integration="lm_eval_harness", ) diff --git a/tests/deepsparse/evaluation/test_utils.py b/tests/deepsparse/evaluation/test_utils.py index f8f3c731a8..0b8fb187ec 100644 --- a/tests/deepsparse/evaluation/test_utils.py +++ b/tests/deepsparse/evaluation/test_utils.py @@ -36,7 +36,7 @@ def llm_type_pipeline(): def test_resolve_known_llm_pipeline(llm_type_pipeline): assert ( resolve_integration(pipeline=llm_type_pipeline, datasets="") - == "lm-evaluation-harness" + == "lm-eval-harness" ) From 44e3e6e9c35b35ece9d5f941f6e04599dd928320 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Tue, 6 Feb 2024 07:46:00 +0000 Subject: [PATCH 19/24] make test more lightweight --- tests/deepsparse/evaluation/test_evaluator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/deepsparse/evaluation/test_evaluator.py b/tests/deepsparse/evaluation/test_evaluator.py index 225a255d52..2b8430a8a6 100644 --- 
a/tests/deepsparse/evaluation/test_evaluator.py +++ b/tests/deepsparse/evaluation/test_evaluator.py @@ -56,7 +56,7 @@ def model_path(): @pytest.fixture() def datasets(): - return ["hellaswag", "gsm8k"] + return ["hellaswag"] @pytest.fixture() From abb6ab8535eb28a55cb77944cea28d21155e9bb3 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Tue, 6 Feb 2024 13:11:32 +0000 Subject: [PATCH 20/24] fix tests 2 --- tests/deepsparse/evaluation/test_evaluator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/deepsparse/evaluation/test_evaluator.py b/tests/deepsparse/evaluation/test_evaluator.py index 2b8430a8a6..928fd275e2 100644 --- a/tests/deepsparse/evaluation/test_evaluator.py +++ b/tests/deepsparse/evaluation/test_evaluator.py @@ -56,7 +56,7 @@ def model_path(): @pytest.fixture() def datasets(): - return ["hellaswag"] + return ["hellaswag", "gsm8k"] @pytest.fixture() @@ -121,7 +121,9 @@ def test_evaluation_llm_evaluation_harness_integration_name( ): assert evaluate( model=model_path, - datasets=datasets, + # testing only on hellaswag dataset + # to avoid long running time + datasets=datasets[0], limit=1, integration="lm_eval_harness", ) From e5aad6515215c88e2a72822707a168342ebafd2e Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Wed, 7 Feb 2024 12:01:03 +0000 Subject: [PATCH 21/24] good point Michael --- src/deepsparse/evaluation/integrations/__init__.py | 2 +- src/deepsparse/evaluation/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/evaluation/integrations/__init__.py b/src/deepsparse/evaluation/integrations/__init__.py index c7e8d3c5fa..15eeee7d8d 100644 --- a/src/deepsparse/evaluation/integrations/__init__.py +++ b/src/deepsparse/evaluation/integrations/__init__.py @@ -15,7 +15,7 @@ # flake8: noqa: F401 -def try_import_lm_evaluation_harness(raise_error=False): +def try_import_lm_evaluation_harness(raise_error=True): try: import lm_eval diff --git a/src/deepsparse/evaluation/utils.py b/src/deepsparse/evaluation/utils.py index 02f16b089c..c089819659 100644 --- a/src/deepsparse/evaluation/utils.py +++ b/src/deepsparse/evaluation/utils.py @@ -43,7 +43,7 @@ def potentially_check_dependency_import(integration_name: str) -> bool: if integration_name.replace("_", "-") == LM_EVALUATION_HARNESS: from deepsparse.evaluation.integrations import try_import_lm_evaluation_harness - try_import_lm_evaluation_harness(raise_error=True) + try_import_lm_evaluation_harness() return True From d65cac62051dc10b313fb4b0179837963b1efd73 Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Thu, 8 Feb 2024 17:33:58 +0100 Subject: [PATCH 22/24] Return to the name `lm-evaluation-harness` but add alias `lm-eval-harness` --- src/deepsparse/evaluation/cli.py | 2 +- src/deepsparse/evaluation/integrations/lm_evaluation_harness.py | 2 +- src/deepsparse/evaluation/utils.py | 2 +- tests/deepsparse/evaluation/test_evaluator.py | 2 +- tests/deepsparse/evaluation/test_utils.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index 4d97c904bb..d192dd67a1 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -61,7 +61,7 @@ deepsparse.eval zoo:mpt-7b-mpt_pretrain-base_quantized \ --dataset hellaswag \ --dataset gsm8k \ - --integration lm-eval-harness \ + --integration lm-evaluation-harness \ --limit 2 # limit the number of samples to evaluate on, specific to the integration """ # noqa: E501 diff --git 
a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py index 7347d91bfb..69934af37a 100644 --- a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py +++ b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py @@ -39,7 +39,7 @@ __all__ = ["integration_eval"] -@EvaluationRegistry.register(name=LM_EVALUATION_HARNESS) +@EvaluationRegistry.register(name=LM_EVALUATION_HARNESS, alias="lm-eval-harness") def integration_eval( pipeline: Pipeline, datasets: Union[List[str], str], diff --git a/src/deepsparse/evaluation/utils.py b/src/deepsparse/evaluation/utils.py index c089819659..ff2619315b 100644 --- a/src/deepsparse/evaluation/utils.py +++ b/src/deepsparse/evaluation/utils.py @@ -27,7 +27,7 @@ "resolve_integration", ] _LOGGER = logging.getLogger(__name__) -LM_EVALUATION_HARNESS = "lm-eval-harness" +LM_EVALUATION_HARNESS = "lm-evaluation-harness" def potentially_check_dependency_import(integration_name: str) -> bool: diff --git a/tests/deepsparse/evaluation/test_evaluator.py b/tests/deepsparse/evaluation/test_evaluator.py index 928fd275e2..241b9a4344 100644 --- a/tests/deepsparse/evaluation/test_evaluator.py +++ b/tests/deepsparse/evaluation/test_evaluator.py @@ -125,7 +125,7 @@ def test_evaluation_llm_evaluation_harness_integration_name( # to avoid long running time datasets=datasets[0], limit=1, - integration="lm_eval_harness", + integration="lm_evaluation_harness", ) diff --git a/tests/deepsparse/evaluation/test_utils.py b/tests/deepsparse/evaluation/test_utils.py index 0b8fb187ec..f8f3c731a8 100644 --- a/tests/deepsparse/evaluation/test_utils.py +++ b/tests/deepsparse/evaluation/test_utils.py @@ -36,7 +36,7 @@ def llm_type_pipeline(): def test_resolve_known_llm_pipeline(llm_type_pipeline): assert ( resolve_integration(pipeline=llm_type_pipeline, datasets="") - == "lm-eval-harness" + == "lm-evaluation-harness" ) From b82b49b1b4b55face353f188886a6f06725b55cd Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Fri, 9 Feb 2024 16:50:49 +0100 Subject: [PATCH 23/24] [DeepSparse Evaluation API] Perplexity (#1555) * initial commit * Update src/deepsparse/evaluation/integrations/__init__.py * design ready, time to define additional features * split prep_for_generation operator * fix logits * update non-kv cache pipeline and tests * add tests to address edge cases * add condition to check of kv_cache full during prompt inference, add test to cover this case, revert debugging changes * fix typing * remove commented code * remove irrelevant condition * perplexity for non-kv cache pipelines works! 
* logic is working * ready for review * [DeepSparse Evaluation API] Perplexity eval support for `openai_humaneval`, `c4`, `wikitext2` (#1586) * fix tests 2 * initial commit * add return to a function * make script more robust --------- Co-authored-by: Dipika Sikka --- setup.py | 1 + src/deepsparse/evaluation/evaluator.py | 3 + .../evaluation/integrations/__init__.py | 1 + .../evaluation/integrations/perplexity.py | 278 ++++++++++++++++++ src/deepsparse/evaluation/results.py | 4 +- src/deepsparse/evaluation/utils.py | 2 + src/deepsparse/transformers/metrics.py | 2 +- .../transformers/utils/eval_helpers.py | 34 ++- .../integrations/test_perplexity.py | 132 +++++++++ 9 files changed, 448 insertions(+), 9 deletions(-) create mode 100644 src/deepsparse/evaluation/integrations/perplexity.py create mode 100644 tests/deepsparse/evaluation/integrations/test_perplexity.py diff --git a/setup.py b/setup.py index ff8269257f..d9c8dffd7d 100644 --- a/setup.py +++ b/setup.py @@ -149,6 +149,7 @@ def _parse_requirements_file(file_path): "datasets<2.16", "accelerate<0.26", "seqeval", + "evaluate", ] _sentence_transformers_integration_deps = ["optimum-deepsparse"] + _torch_deps diff --git a/src/deepsparse/evaluation/evaluator.py b/src/deepsparse/evaluation/evaluator.py index 3d18f8489f..3926b78a2a 100644 --- a/src/deepsparse/evaluation/evaluator.py +++ b/src/deepsparse/evaluation/evaluator.py @@ -16,6 +16,9 @@ from typing import List, Optional, Union from deepsparse import Pipeline +from deepsparse.evaluation.integrations.perplexity import ( # noqa + integration_eval as integration_eval_perplexity, +) from deepsparse.evaluation.registry import EvaluationRegistry from deepsparse.evaluation.results import Result from deepsparse.evaluation.utils import create_pipeline diff --git a/src/deepsparse/evaluation/integrations/__init__.py b/src/deepsparse/evaluation/integrations/__init__.py index 15eeee7d8d..f0871f135a 100644 --- a/src/deepsparse/evaluation/integrations/__init__.py +++ b/src/deepsparse/evaluation/integrations/__init__.py @@ -31,3 +31,4 @@ def try_import_lm_evaluation_harness(raise_error=True): if try_import_lm_evaluation_harness(raise_error=False): from .lm_evaluation_harness import * +from .perplexity import * diff --git a/src/deepsparse/evaluation/integrations/perplexity.py b/src/deepsparse/evaluation/integrations/perplexity.py new file mode 100644 index 0000000000..a9a3f3d8a3 --- /dev/null +++ b/src/deepsparse/evaluation/integrations/perplexity.py @@ -0,0 +1,278 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import os +from collections import defaultdict +from typing import Any, Dict, List, Optional, Union + +import numpy +from tqdm import tqdm + +from datasets import load_dataset +from deepsparse import Pipeline +from deepsparse.evaluation.registry import EvaluationRegistry +from deepsparse.evaluation.results import Dataset, Evaluation, Metric, Result +from deepsparse.evaluation.utils import PERPLEXITY +from deepsparse.transformers.metrics import Perplexity +from deepsparse.transformers.pipelines.text_generation import TextGenerationPipeline +from deepsparse.transformers.pipelines.text_generation.pipeline_no_kv_cache import ( + TextGenerationPipelineNoCache, +) +from deepsparse.transformers.utils.eval_helpers import ( + HumanEvalIteratorWrapper, + process_concatenated_datasets, +) + + +""" +Integration for the evaluation module +that computes the perplexity of a model on a dataset +""" +_LOGGER = logging.getLogger(__name__) + + +@EvaluationRegistry.register(name=PERPLEXITY) +def integration_eval( + pipeline: Pipeline, + datasets: Union[List[str], str] = "openai_humaneval", + batch_size: int = 1, + limit: Optional[int] = None, + accumulate: Optional[bool] = None, + splits: Union[List[str], str, None] = "test", + metrics: Union[List[str], str, None] = None, + **kwargs, +) -> Result: + """ + A function that computes the perplexity of a pipeline given a set + of dataset names. + + :param pipeline: the pipeline to evaluate. The assumed pipeline + is a TextGenerationPipeline, either with or without the KV + cache support + :param datasets: the names of dataset(s) to evaluate on + :param batch_size: the batch size to use for evaluation + :param splits: the split of the dataset to evaluate on. Default is "test" + :param metrics: the metrics to compute. Default is None + :param limit: the number of batches to evaluate on. Default is None + (evaluates on entire dataset) + :param accumulate: whether the perplexity computation should + accumulate negative log-likelihood over samples. Defaults to + the default accumulate variable inferred from the dataset in + `datasets`. If not None, it will override the inferred accumulate + variable. + :return: a Result object containing the raw and formatted results + """ + metrics = metrics or PERPLEXITY + if metrics != PERPLEXITY: + raise ValueError(f"Invalid metric {metrics} for perplexity evaluation") + if splits is None: + splits = "test" + _LOGGER.info("Argument `splits` is None. Defaulting to `test` split.") + datasets = datasets if isinstance(datasets, list) else [datasets] + results_raw = defaultdict(str) + for dataset_name in datasets: + results_raw[dataset_name] = defaultdict() + dataset, _accumulate = load_perplexity_dataset( + dataset_name=dataset_name, splits=splits, pipeline=pipeline, **kwargs + ) + if accumulate is None: + accumulate = _accumulate + else: + _LOGGER.info( + f"Argument `accumulate` set to {accumulate}. " + "Overriding the inferred accumulate variable from the dataset." + ) + + perplexity = run_perplexity( + pipeline=pipeline, + dataset=dataset, + batch_size=batch_size, + accumulate=accumulate, + limit=limit, + ) + + results_raw[dataset_name] = defaultdict() + results_raw[dataset_name]["results"] = perplexity + results_raw[dataset_name]["split"] = splits + + results = Result( + # omit storing raw results. they can potentially + # contain numpy arrays that are not serializable.
+ # all the information is stored in the formatted results + raw=None, + formatted=format_raw_results(results_raw), + ) + + return results + + +def run_perplexity( + pipeline: Union[TextGenerationPipelineNoCache, TextGenerationPipeline], + dataset: "Dataset", + batch_size: int, + accumulate: bool, + limit: Optional[int] = None, +) -> Dict[str, Any]: + """ + Compute the perplexity of a pipeline given a dataset. + + :param pipeline: the pipeline to evaluate. The assumed pipeline + is a TextGenerationPipeline, either with or without the KV + cache support + :param dataset: the dataset to evaluate on + :param batch_size: the batch size to use for evaluation + :param accumulate: whether the perplexity computation should + accumulate negative log-likelihood over samples + :param limit: the number of batches to evaluate on. Default is None + (evaluates on entire dataset) + + :return: a dictionary containing the perplexity results + """ + + perplexity = Perplexity(accumulate=accumulate) + + batch = [] + for idx, sample in _enumerate_progress( + dataset, max_steps=None if limit is None else limit * batch_size + ): + + if limit is not None: + # stop if we have reached the #limit + # number of batches to be processed + if idx >= limit * batch_size: + break + + batch.append(sample) + + if len(batch) == batch_size: + if isinstance(pipeline, TextGenerationPipelineNoCache): + out = pipeline( + prompt=batch, + output_scores=True, + include_prompt_logits=True, + return_input_tokens=True, + ) + else: + out = pipeline( + prompt=batch, + output_scores=True, + max_new_tokens=0, + include_prompt_logits=True, + return_input_tokens=True, + ) + + for s in range(batch_size): + # Need to remove tokens that were masked + input_ids = out.input_tokens["input_ids"][s].flatten() + attention_mask = out.input_tokens["attention_mask"][s].flatten() + logits = out.generations[s].score + if batch_size > 1 and isinstance( + pipeline, TextGenerationPipelineNoCache + ): + logits = logits[-attention_mask.sum() :, :] + + logits = numpy.compress(attention_mask, logits, axis=0)[:-1, :] + input_ids = numpy.compress(attention_mask, input_ids)[1:] + + # Add predictions (logits) and targets (input_ids) to metric + perplexity.add_batch(logits, input_ids) + + batch.clear() + + return perplexity.compute() + + +def format_raw_results(results: Dict[str, Any]) -> List[Evaluation]: + """ + Format the raw perplexity results into a list of + Evaluation objects. + + :param results: the raw results from perplexity computation + :return: the formatted results as a list of Evaluation objects + """ + formatted_results = [] + for dataset_name, dataset_result in results.items(): + metrics = [] + for metric_name, metric_value in dataset_result["results"].items(): + if isinstance(metric_value, numpy.ndarray): + metric_value = metric_value.tolist() + metric = Metric(name=metric_name, value=metric_value) + metrics.append(metric) + dataset = Dataset(type=None, name=dataset_name, split=dataset_result["split"]) + evaluation = Evaluation( + task="perplexity", + dataset=dataset, + metrics=metrics, + samples=None, + ) + formatted_results.append(evaluation) + return formatted_results + + +def load_perplexity_dataset( + dataset_name: str, + splits: Union[List[str], str] = "test", + pipeline: Optional[Pipeline] = None, + **kwargs, +): + """ + Function to load the dataset for perplexity computation. + Eventually we want to load the dataset from the nm_utils + + :param dataset_name: the name of the dataset to load + :param splits: the splits to load from the dataset.
Default is "test" + :param pipeline: the pipeline to use for loading the dataset. The pipeline + is used to infer the model path and sequence length to use for loading + the dataset. This argument can be omitted if the appropriate kwargs + are provided, or if the dataset does not require a process_concatenated_datasets + function to load the dataset. + :param kwargs: additional keyword arguments to pass to the dataset loading function + :return: the dataset and whether to accumulate perplexity over samples + """ + if isinstance(splits, list): + raise NotImplementedError("Evaluation on multiple splits not implemented") + + if dataset_name == "openai_humaneval": + dataset = load_dataset(dataset_name, split=splits) + dataset = HumanEvalIteratorWrapper(dataset) + accumulate = False + elif dataset_name in {"wikitext2", "c4"}: + # fetch max_sequence_length from pipeline if not provided + max_sequence_length = kwargs.pop("max_sequence_length", None) + if max_sequence_length is None and pipeline is not None: + max_sequence_length = pipeline.sequence_length + + # fetch model_path from pipeline if not provided + model_path = kwargs.pop("model_path", None) + if model_path is None and pipeline is not None: + model_path = os.path.dirname(pipeline.model_path) + + dataset = process_concatenated_datasets( + dataset_name, + model_path=model_path, + max_sequence_length=max_sequence_length, + split=splits, + **kwargs, + ) + accumulate = True + else: + raise NotImplementedError(f"Dataset {dataset_name} not implemented") + + return dataset, accumulate + + +def _enumerate_progress(dataset, max_steps): + progress_bar = tqdm(dataset, total=max_steps) if max_steps else tqdm(dataset) + return enumerate(progress_bar) diff --git a/src/deepsparse/evaluation/results.py b/src/deepsparse/evaluation/results.py index 00212d0a1e..78c4bbd501 100644 --- a/src/deepsparse/evaluation/results.py +++ b/src/deepsparse/evaluation/results.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Any, List, Optional +from typing import Any, List, Optional, Union import yaml from pydantic import BaseModel, Field @@ -32,7 +32,7 @@ class Metric(BaseModel): name: str = Field(description="Name of the metric") - value: float = Field(description="Value of the metric") + value: Union[float, List[float]] = Field(description="Value of the metric") class Dataset(BaseModel): diff --git a/src/deepsparse/evaluation/utils.py b/src/deepsparse/evaluation/utils.py index ff2619315b..a5dc460596 100644 --- a/src/deepsparse/evaluation/utils.py +++ b/src/deepsparse/evaluation/utils.py @@ -27,7 +27,9 @@ "resolve_integration", ] _LOGGER = logging.getLogger(__name__) + LM_EVALUATION_HARNESS = "lm-evaluation-harness" +PERPLEXITY = "perplexity" def potentially_check_dependency_import(integration_name: str) -> bool: diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index b90c4dd744..0e7c24c8b6 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -20,7 +20,7 @@ import numpy -from deepsparse.utils import numpy_log_softmax +from deepsparse.utils.data import numpy_log_softmax __all__ = [ diff --git a/src/deepsparse/transformers/utils/eval_helpers.py b/src/deepsparse/transformers/utils/eval_helpers.py index 4c0e68b9de..012520b9b5 100644 --- a/src/deepsparse/transformers/utils/eval_helpers.py +++ b/src/deepsparse/transformers/utils/eval_helpers.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Mapping, Union +from typing import List, Union import numpy from transformers import AutoTokenizer, PreTrainedTokenizerFast @@ -27,7 +27,8 @@ def process_concatenated_datasets( dataset_name: str, model_path: str, max_sequence_length: int, - kwargs: Mapping, + split: str = "test", + **kwargs, ) -> list: """ Concatenate text datasets and split them into chunks text that, after @@ -38,6 +39,8 @@ def process_concatenated_datasets( Options: "wikitext2" or "c4". model_path (str): The path to a pretrained transformer model for tokenization. max_sequence_length (int): The maximum number of tokens in each sequence. + split (str, optional): The split of the dataset to use. + Default is "test". kwargs (mapping): Additional keyword arguments. - eos (str, optional): The end-of-sentence token. Default is "\n\n" for wikitext2 and "" for c4. 
@@ -65,13 +68,13 @@ def process_concatenated_datasets( eos = kwargs.get("eos", "\n\n") bos = kwargs.get("bos", "") - raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split=split) raw_text = raw_dataset["text"] elif dataset_name == "c4": eos = kwargs.get("eos", "<|endoftext|>") bos = kwargs.get("bos", "") raw_samples = kwargs.get("raw_samples", None) - data_file = kwargs.get("data_file", 0) + data_file = kwargs.get("data_file", None) if data_file is not None: raw_dataset = load_dataset( "allenai/c4", @@ -79,13 +82,13 @@ def process_concatenated_datasets( data_files={ "validation": f"en/c4-validation.{data_file:05d}-of-00008.json.gz" }, - split="validation", + split=split, ) else: raw_dataset = load_dataset( "allenai/c4", "allenai--c4", - split="validation", + split=split, ) if raw_samples is not None: raw_dataset = raw_dataset[:raw_samples] @@ -181,3 +184,22 @@ def _split_text_by_tokens( ) return split_text + + +class HumanEvalIteratorWrapper: + """ + Wrapper around the `openai_humaneval` dataset, + that joins the prompt and the canonical solution + into a single string during iteration. + """ + + def __init__(self, dataset): + self.iterator = iter(dataset) + + def __iter__(self): + return self + + def __next__(self): + # Get the next sample from the original iterator + sample = next(self.iterator) + return sample["prompt"] + sample["canonical_solution"] diff --git a/tests/deepsparse/evaluation/integrations/test_perplexity.py b/tests/deepsparse/evaluation/integrations/test_perplexity.py new file mode 100644 index 0000000000..b156e5b9a4 --- /dev/null +++ b/tests/deepsparse/evaluation/integrations/test_perplexity.py @@ -0,0 +1,132 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from copy import copy
+
+import numpy as np
+
+import pytest
+from deepsparse.evaluation.integrations.perplexity import (
+    integration_eval,
+    load_perplexity_dataset,
+)
+from deepsparse.transformers.pipelines.text_generation import TextGenerationPipeline
+from evaluate import load
+
+
+@pytest.fixture()
+def model_path():
+    return "hf:mgoin/TinyStories-1M-deepsparse"
+
+
+@pytest.fixture()
+def model_id():
+    return "roneneldan/TinyStories-1M"
+
+
+@pytest.mark.parametrize(
+    "datasets",
+    [
+        "openai_humaneval",
+        "wikitext2",
+    ],
+)
+@pytest.mark.parametrize("batch_size", [1, 2])
+class TestPerplexity:
+    limit = 2
+
+    def test_perplexity_ground_truth_equal_pipeline(
+        self, model_path, model_id, datasets, batch_size
+    ):
+        # setting max_sequence_length to 16 to speed up the test
+        kwargs_ground_truth = (
+            dict(max_sequence_length=16) if datasets in {"c4", "wikitext2"} else {}
+        )
+        kwargs = copy(kwargs_ground_truth)
+
+        result_gt = self._get_ground_truth(
+            datasets=datasets,
+            batch_size=batch_size,
+            limit=self.limit,
+            model_id=model_id,
+            kwargs=kwargs_ground_truth,
+        )
+
+        result = integration_eval(
+            pipeline=TextGenerationPipeline(
+                model_path=model_path,
+                engine_type="onnxruntime",
+            ),
+            datasets=datasets,
+            batch_size=batch_size,
+            limit=self.limit,
+            # we are setting accumulate=False to compare
+            # with the torch ground truth apples to apples
+            accumulate=False,
+            **kwargs,
+        )
+        perplexities = result.formatted[0].metrics[0].value
+        perplexities_gt = result_gt["perplexities"]
+        assert np.allclose(perplexities, perplexities_gt, rtol=0.1)
+
+    def test_perplexity_kv_cache_pipeline_equal_no_kv_cache_pipeline(
+        self, model_path, model_id, datasets, batch_size
+    ):
+
+        kwargs_ground_truth = (
+            dict(max_sequence_length=16) if datasets in {"c4", "wikitext2"} else {}
+        )
+        kwargs = copy(kwargs_ground_truth)
+
+        result_kv_cache = integration_eval(
+            pipeline=TextGenerationPipeline(
+                model_path=model_path,
+                engine_type="onnxruntime",
+            ),
+            datasets=datasets,
+            model_path=model_id,
+            batch_size=batch_size,
+            limit=self.limit,
+            **kwargs,
+        )
+
+        result_non_kv_cache = integration_eval(
+            pipeline=TextGenerationPipeline(
+                model_path=model_path,
+                engine_type="onnxruntime",
+                onnx_model_name="model-orig.onnx",
+            ),
+            datasets=datasets,
+            batch_size=batch_size,
+            limit=self.limit,
+            **kwargs,
+        )
+
+        perplexities_kv_cache = result_kv_cache.formatted[0].metrics[0].value
+        perplexities_non_kv_cache = result_non_kv_cache.formatted[0].metrics[0].value
+        assert np.allclose(perplexities_kv_cache, perplexities_non_kv_cache, rtol=0.1)
+
+    @staticmethod
+    def _get_ground_truth(datasets, batch_size, limit, model_id, kwargs=None):
+        perplexity = load("perplexity", module_type="metric")
+        kwargs = {**(kwargs or {}), "model_path": model_id}
+        dataset, *_ = load_perplexity_dataset(dataset_name=datasets, **kwargs)
+        predictions = []
+        for i, sample in enumerate(dataset):
+            if i == batch_size * limit:
+                break
+            predictions.append(sample)
+        return perplexity.compute(
+            predictions=predictions, add_start_token=False, model_id=model_id
+        )

From 7a3ad2fcfe7aa2c2c782ac24d9803417aeeb2bc9 Mon Sep 17 00:00:00 2001
From: dbogunowicz
Date: Fri, 9 Feb 2024 16:16:51 +0000
Subject: [PATCH 24/24] move the registration of the perplexity eval function where it belongs

---
 src/deepsparse/evaluation/evaluator.py        |  3 ---
 src/deepsparse/evaluation/utils.py            |  6 +++++-
 tests/deepsparse/evaluation/test_evaluator.py | 11 ++++++++---
 3 files changed, 13 insertions(+), 7 
deletions(-) diff --git a/src/deepsparse/evaluation/evaluator.py b/src/deepsparse/evaluation/evaluator.py index 3926b78a2a..3d18f8489f 100644 --- a/src/deepsparse/evaluation/evaluator.py +++ b/src/deepsparse/evaluation/evaluator.py @@ -16,9 +16,6 @@ from typing import List, Optional, Union from deepsparse import Pipeline -from deepsparse.evaluation.integrations.perplexity import ( # noqa - integration_eval as integration_eval_perplexity, -) from deepsparse.evaluation.registry import EvaluationRegistry from deepsparse.evaluation.results import Result from deepsparse.evaluation.utils import create_pipeline diff --git a/src/deepsparse/evaluation/utils.py b/src/deepsparse/evaluation/utils.py index a5dc460596..6e5ade9344 100644 --- a/src/deepsparse/evaluation/utils.py +++ b/src/deepsparse/evaluation/utils.py @@ -42,10 +42,14 @@ def potentially_check_dependency_import(integration_name: str) -> bool: :return: True if the dependency is installed, False otherwise """ - if integration_name.replace("_", "-") == LM_EVALUATION_HARNESS: + if integration_name == LM_EVALUATION_HARNESS: from deepsparse.evaluation.integrations import try_import_lm_evaluation_harness try_import_lm_evaluation_harness() + if integration_name == PERPLEXITY: + from deepsparse.evaluation.integrations.perplexity import ( # noqa F401 + integration_eval, + ) return True diff --git a/tests/deepsparse/evaluation/test_evaluator.py b/tests/deepsparse/evaluation/test_evaluator.py index 241b9a4344..58eedff836 100644 --- a/tests/deepsparse/evaluation/test_evaluator.py +++ b/tests/deepsparse/evaluation/test_evaluator.py @@ -115,20 +115,25 @@ def test_evaluate_pipeline_without_kv_cache( not try_import_lm_evaluation_harness(raise_error=False), reason="lm_evaluation_harness not installed", ) -def test_evaluation_llm_evaluation_harness_integration_name( +def test_evaluation_llm_evaluation_harness( model_path, - datasets, ): assert evaluate( model=model_path, # testing only on hellaswag dataset # to avoid long running time - datasets=datasets[0], + datasets="hellaswag", limit=1, integration="lm_evaluation_harness", ) +def test_evaluation_perplexity(model_path): + assert evaluate( + model=model_path, datasets="openai_humaneval", limit=1, integration="perplexity" + ) + + @pytest.mark.parametrize("type_serialization", ["json", "yaml"]) @pytest.mark.skipif( tuple(map(int, sys.version.split(".")[:2])) < (3, 10),
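
For context, a minimal usage sketch of the perplexity integration added in this patch series follows. It mirrors the calls exercised in tests/deepsparse/evaluation/integrations/test_perplexity.py; the model stub, engine type, and keyword arguments are copied from those tests and should be read as assumptions about the API at this point in the series, not a definitive interface.

# Illustrative sketch only: mirrors test_perplexity.py from this patch series.
from deepsparse.evaluation.integrations.perplexity import integration_eval
from deepsparse.transformers.pipelines.text_generation import TextGenerationPipeline

# small stub model used throughout the tests
pipeline = TextGenerationPipeline(
    model_path="hf:mgoin/TinyStories-1M-deepsparse",
    engine_type="onnxruntime",
)

result = integration_eval(
    pipeline=pipeline,
    datasets="openai_humaneval",
    batch_size=1,
    limit=2,  # evaluate only two batches, as the tests do
    accumulate=False,  # report per-sample perplexities instead of accumulated NLL
)

# per-sample perplexities are stored in the formatted results
print(result.formatted[0].metrics[0].value)

Setting accumulate=False follows the tests, which compare the resulting per-sample values against the perplexities reported by the HuggingFace evaluate metric for the same samples.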