From e29211c95b75ff19e0bf0c65a26b6044ba51bf80 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Mon, 29 Jan 2024 16:19:06 -0500 Subject: [PATCH 01/16] skip continuous batching test (#1567) --- .../deepsparse/schedulers/test_continuous_batching_scheduler.py | 2 ++ .../schedulers/utils/test_continuous_batching_executor.py | 2 ++ .../transformers/text_generation/integration_tests/test_llms.py | 1 + 3 files changed, 5 insertions(+) diff --git a/tests/deepsparse/schedulers/test_continuous_batching_scheduler.py b/tests/deepsparse/schedulers/test_continuous_batching_scheduler.py index 627202773c..4502a8fcf9 100644 --- a/tests/deepsparse/schedulers/test_continuous_batching_scheduler.py +++ b/tests/deepsparse/schedulers/test_continuous_batching_scheduler.py @@ -16,10 +16,12 @@ import numpy +import pytest from deepsparse.operators import EngineOperator from deepsparse.schedulers import ContinuousBatchingScheduler +@pytest.mark.skip("skip continuous batching tests") def test_continuous_batching_executor_thread(): # simple test that ContinuousBatchingScheduler can be instantiated and return # a result from a request, for testing multi-batch execution, making enough diff --git a/tests/deepsparse/schedulers/utils/test_continuous_batching_executor.py b/tests/deepsparse/schedulers/utils/test_continuous_batching_executor.py index e26532d088..fa41259f21 100644 --- a/tests/deepsparse/schedulers/utils/test_continuous_batching_executor.py +++ b/tests/deepsparse/schedulers/utils/test_continuous_batching_executor.py @@ -16,6 +16,7 @@ import numpy +import pytest from deepsparse.operators import EngineOperator from deepsparse.schedulers.utils import ( ContinuousBatchingExecutorThread, @@ -23,6 +24,7 @@ ) +@pytest.mark.skip("skip continuous batching tests") def test_continuous_batching_executor_thread(): # mobilenet model with batch_size=2 engine_operator = EngineOperator("zoo:mobilenet_v2-1.0-imagenet-base") diff --git a/tests/deepsparse/transformers/text_generation/integration_tests/test_llms.py b/tests/deepsparse/transformers/text_generation/integration_tests/test_llms.py index 633ee19c53..e03d97e7f3 100644 --- a/tests/deepsparse/transformers/text_generation/integration_tests/test_llms.py +++ b/tests/deepsparse/transformers/text_generation/integration_tests/test_llms.py @@ -132,6 +132,7 @@ def setup(self, params_dict, max_new_tokens, internal_kv_cache): self.default_pipeline = None self.max_new_tokens = max_new_tokens + @pytest.mark.skip("skip continuous batching tests") def test_continuous_batching_pipeline(self, setup): pipeline = self.get_pipeline( From 46ce7474ab77b15f75457fd3663361786168cc5f Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 31 Jan 2024 12:49:32 -0500 Subject: [PATCH 02/16] [server] Disable the elastic scheduler when continuous batching is enabled (#1569) * update server to disable the context/elastic scheduler when continuous batching is enabled * clean up when context is created --- src/deepsparse/server/deepsparse_server.py | 27 ++++++++++++++++++---- src/deepsparse/server/openai_server.py | 14 ++++++++++- src/deepsparse/server/server.py | 14 ++++------- 3 files changed, 40 insertions(+), 15 deletions(-) diff --git a/src/deepsparse/server/deepsparse_server.py b/src/deepsparse/server/deepsparse_server.py index 8ffc7508cb..0bf338cfc4 100644 --- a/src/deepsparse/server/deepsparse_server.py +++ b/src/deepsparse/server/deepsparse_server.py @@ -13,6 +13,7 @@ # limitations under the License. 
import logging +from concurrent.futures import ThreadPoolExecutor from functools import partial from deepsparse import Pipeline @@ -73,12 +74,30 @@ def _add_endpoint( endpoint_config: EndpointConfig, ): pipeline_config = endpoint_config.to_pipeline_config() - pipeline_config.kwargs["executor"] = self.executor _LOGGER.info(f"Initializing pipeline for '{endpoint_config.name}'") - pipeline = Pipeline.from_config( - pipeline_config, context=self.context, logger=self.server_logger - ) + if pipeline_config.kwargs.get("continuous_batch_sizes"): + pipeline_config.kwargs["executor"] = ThreadPoolExecutor( + max_workers=self.server_config.num_workers + ) + _LOGGER.info( + "for continuous batching, the single stream scheduler will be enabled." + ) + pipeline_config.num_cores = self.server_config.num_cores + pipeline_config.scheduler = "single" + + pipeline = Pipeline.from_config( + pipeline_config, + num_streams=self.server_config.num_workers, + logger=self.server_logger, + ) + else: + pipeline_config.kwargs["executor"] = ThreadPoolExecutor( + max_workers=self.context.num_streams + ) + pipeline = Pipeline.from_config( + pipeline_config, context=self.context, logger=self.server_logger + ) _LOGGER.info(f"Adding endpoints for '{endpoint_config.name}'") self._add_inference_endpoints( diff --git a/src/deepsparse/server/openai_server.py b/src/deepsparse/server/openai_server.py index 2ef789c68e..c656323594 100644 --- a/src/deepsparse/server/openai_server.py +++ b/src/deepsparse/server/openai_server.py @@ -375,7 +375,19 @@ def _add_model( f"{SupportedTasks.code_generation._fields}" ) - pipeline = Pipeline.from_config(pipeline_config, context=self.context) + if pipeline_config.kwargs.get("continuous_batch_sizes"): + _LOGGER.info( + "for continuous batching, the single stream scheduler will be enabled." 
+ ) + pipeline_config.num_cores = self.server_config.num_cores + pipeline_config.scheduler = "single" + + pipeline = Pipeline.from_config( + pipeline_config, + num_streams=self.server_config.num_workers, + ) + else: + pipeline = Pipeline.from_config(pipeline_config, context=self.context) if not self.model_to_pipeline.get(endpoint_config.model): model_card = ModelCard( diff --git a/src/deepsparse/server/server.py b/src/deepsparse/server/server.py index 3c1cb053f7..8e1915a265 100644 --- a/src/deepsparse/server/server.py +++ b/src/deepsparse/server/server.py @@ -16,7 +16,6 @@ import os from abc import abstractmethod from collections import Counter -from concurrent.futures import ThreadPoolExecutor from copy import deepcopy from typing import AsyncGenerator, List, Optional, Union @@ -76,10 +75,11 @@ def __init__(self, server_config: Union[str, ServerConfig]): self.server_config = server_config _LOGGER.info(f"Using config: {repr(self.server_config)}") - - self.context = None - self.executor = None self.server_logger = server_logger_from_config(self.server_config) + self.context = Context( + num_cores=self.server_config.num_cores, + num_streams=self.server_config.num_workers, + ) def start_server( self, @@ -109,12 +109,6 @@ def start_server( self.config_path, f"http://{host}:{port}/endpoints", 0.5 ) - self.context = Context( - num_cores=self.server_config.num_cores, - num_streams=self.server_config.num_workers, - ) - self.executor = ThreadPoolExecutor(max_workers=self.context.num_streams) - app = self._build_app() uvicorn.run( From 1f92f52134cfd5a938aaac6b255692593a35db5a Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 31 Jan 2024 16:45:20 -0500 Subject: [PATCH 03/16] [TextGeneration] Fix initialization; don't try v1 init for text gen (#1571) * only check capacity condition durin prefill; already have check in generation * dont try v1 if running text gen; just raise error --- src/deepsparse/pipeline.py | 6 +++++- .../text_generation/autoregressive_preprocess_operator.py | 5 ++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py index e2a1beeab1..aaa65409d8 100644 --- a/src/deepsparse/pipeline.py +++ b/src/deepsparse/pipeline.py @@ -27,6 +27,7 @@ SchedulerGroup, ) from deepsparse.subgraph_execute import SubGraphExecutor +from deepsparse.tasks import SupportedTasks from deepsparse.utils import InferenceState, PipelineState from deepsparse.utils.subgraph import SubGraph from deepsparse.utils.time import TIMER_KEY, InferenceStages, TimerManager @@ -139,7 +140,10 @@ def create(cls, task: str, **kwargs) -> "Pipeline": "Pipeline was not created for the given task. 
The " "provided task should be registered using the OperatorRegistry" ) - except Exception: + except Exception as e: + if SupportedTasks.is_text_generation(task): + raise e + _LOGGER.warning(f"Could not create v2 '{task}' pipeline, trying legacy") from deepsparse.legacy import Pipeline diff --git a/src/deepsparse/transformers/pipelines/text_generation/autoregressive_preprocess_operator.py b/src/deepsparse/transformers/pipelines/text_generation/autoregressive_preprocess_operator.py index df4e587df3..01d2a664b5 100644 --- a/src/deepsparse/transformers/pipelines/text_generation/autoregressive_preprocess_operator.py +++ b/src/deepsparse/transformers/pipelines/text_generation/autoregressive_preprocess_operator.py @@ -51,7 +51,10 @@ def can_operate(self, inp: Any) -> bool: if inp.get("in_generation"): return True - if kv_cache.total_num_processed_tokens >= kv_cache.capacity: + if ( + kv_cache.total_num_processed_tokens >= kv_cache.capacity + and inp.get("in_generation") is None + ): raise RuntimeError( "Not enough kv_cache capacity to run generation. Please use a larger " "sequence_length or a shorter prompt" From 347caa4b1088dfc95597e4710f7a56cf7ddf04d5 Mon Sep 17 00:00:00 2001 From: Rahul Tuli Date: Fri, 2 Feb 2024 10:57:27 -0500 Subject: [PATCH 04/16] [BugFix] Add evaluate callable (#1576) * Add evaluate callable * Wrap transformers into try except --- src/deepsparse/__init__.py | 1 + src/deepsparse/evaluation/utils.py | 23 ++++++++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/src/deepsparse/__init__.py b/src/deepsparse/__init__.py index 2e00ae4949..c5c31ad48d 100644 --- a/src/deepsparse/__init__.py +++ b/src/deepsparse/__init__.py @@ -38,5 +38,6 @@ from .version import __version__, is_release from .analytics import deepsparse_analytics as _analytics from .subgraph_execute import * +from .evaluation.evaluator import evaluate _analytics.send_event("python__init") diff --git a/src/deepsparse/evaluation/utils.py b/src/deepsparse/evaluation/utils.py index 0534a9f9f3..7684e54513 100644 --- a/src/deepsparse/evaluation/utils.py +++ b/src/deepsparse/evaluation/utils.py @@ -15,7 +15,14 @@ import os from typing import Any, Dict, List, Optional, Tuple, Union -from transformers import AutoModelForCausalLM, PreTrainedModel + +try: + from transformers import AutoModelForCausalLM, PreTrainedModel + + transformers_error = None +except ImportError as import_error: + transformers_error = import_error + from deepsparse import Pipeline from deepsparse.operators.engine_operator import DEEPSPARSE_ENGINE, ORT_ENGINE @@ -50,7 +57,7 @@ def potentially_check_dependency_import(integration_name: str) -> bool: def resolve_integration( - model: Union[Pipeline, PreTrainedModel], datasets: Union[str, List[str]] + model: Union[Pipeline, "PreTrainedModel"], datasets: Union[str, List[str]] ) -> Union[str, None]: """ Given a model and dataset, infer the name of the evaluation integration @@ -73,6 +80,7 @@ def if_generative_language_model(model: Any) -> bool: """ Checks if the model is a generative language model. """ + _check_transformers_dependency() if isinstance(model, Pipeline): return model.__class__.__name__ == "TextGenerationPipeline" elif isinstance(model, PreTrainedModel): @@ -130,7 +138,7 @@ def create_model_from_target( target: str, engine_type: Optional[str] = None, **kwargs, -) -> Union[Pipeline, AutoModelForCausalLM]: +) -> Union[Pipeline, "AutoModelForCausalLM"]: """ Create a model or a pipeline from a target path. 
@@ -146,6 +154,8 @@ def create_model_from_target( :param engine_type: The engine type to initialize the model with. :return: The initialized model """ + _check_transformers_dependency() + if engine_type in [DEEPSPARSE_ENGINE, ORT_ENGINE]: return Pipeline.create( task="text-generation", @@ -157,3 +167,10 @@ def create_model_from_target( ) else: return AutoModelForCausalLM.from_pretrained(target, **kwargs) + + +def _check_transformers_dependency(): + if transformers_error: + raise ImportError( + "transformers is needed to use this module" + ) from transformers_error From 03c407837d8bfd53e37a7aa08f5babfb2a2eb8e7 Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Mon, 5 Feb 2024 16:26:43 +0100 Subject: [PATCH 05/16] [Fix] ONNX model benchmarking when no sequence_length inferred from the model (#1581) --- src/deepsparse/benchmark/benchmark_model.py | 5 +++++ src/deepsparse/utils/onnx.py | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_model.py b/src/deepsparse/benchmark/benchmark_model.py index 9539032259..4a43120f84 100644 --- a/src/deepsparse/benchmark/benchmark_model.py +++ b/src/deepsparse/benchmark/benchmark_model.py @@ -411,6 +411,11 @@ def benchmark_model( if not disable_kv_cache_overrides: if not sequence_length: sequence_length = infer_sequence_length(model_path) + if not sequence_length: + raise ValueError( + "Unable to infer sequence length from model. " + "Specify it manually through `sequence_length` argument." + ) if input_ids_length > sequence_length: raise ValueError( f"input_ids_length: {input_ids_length} " diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index e4b41f3286..423ec10f67 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -613,7 +613,8 @@ def has_model_kv_cache(model: Union[str, ModelProto]) -> bool: def infer_sequence_length(model: Union[str, ModelProto]) -> int: """ :param model: model - :return: inferred sequence length of the model + :return: inferred sequence length of the model. 
+ If unable to infer, return 0 """ if not isinstance(model, ModelProto): model = onnx.load(model, load_external_data=False) @@ -623,9 +624,10 @@ def infer_sequence_length(model: Union[str, ModelProto]) -> int: for idx, inp in enumerate(model.graph.input): if inp.name == "attention_mask": target_input_idx = idx + break try: # return shape of second dim if possible target_input = model.graph.input[target_input_idx] return target_input.type.tensor_type.shape.dim[1].dim_value except Exception: - return 0 # unable to infer seq len + return 0 From cb52d6e4e51ed1e6bcd3df934fa129b82a4a6246 Mon Sep 17 00:00:00 2001 From: Rahul Tuli Date: Mon, 5 Feb 2024 10:54:11 -0500 Subject: [PATCH 06/16] Add analyze callable (#1574) Co-authored-by: Benjamin Fineran --- src/deepsparse/__init__.py | 1 + src/deepsparse/analyze.py | 56 +++++++++++++++++++++++++------------- 2 files changed, 38 insertions(+), 19 deletions(-) diff --git a/src/deepsparse/__init__.py b/src/deepsparse/__init__.py index c5c31ad48d..436990b065 100644 --- a/src/deepsparse/__init__.py +++ b/src/deepsparse/__init__.py @@ -38,6 +38,7 @@ from .version import __version__, is_release from .analytics import deepsparse_analytics as _analytics from .subgraph_execute import * +from .analyze import analyze from .evaluation.evaluator import evaluate _analytics.send_event("python__init") diff --git a/src/deepsparse/analyze.py b/src/deepsparse/analyze.py index 2637d9b2e3..791bbbffe1 100644 --- a/src/deepsparse/analyze.py +++ b/src/deepsparse/analyze.py @@ -31,7 +31,11 @@ ModelAnalysis, NodeInferenceResult, ) -from sparsezoo.analyze.cli import analyze_options, analyze_performance_options +from sparsezoo.analyze.cli import ( + DEEPSPARSE_ENGINE, + analyze_options, + analyze_performance_options, +) _LOGGER = logging.getLogger(__name__) @@ -74,21 +78,11 @@ def main( ) _LOGGER.info("Starting Analysis ...") - analysis = ModelAnalysis.create(model_path) - _LOGGER.info("Analysis complete, collating results...") - scenario = BenchmarkScenario( - batch_size=batch_size_throughput, - num_cores=None, - engine=benchmark_engine, - ) - performance_summary = run_benchmark_and_analysis( - onnx_model=model_to_path(model_path), - scenario=scenario, - ) + analysis = analyze(model_path, batch_size_throughput, benchmark_engine) + by_types: bool = convert_to_bool(by_types) by_layers: bool = convert_to_bool(by_layers) - analysis.benchmark_results = [performance_summary] summary = analysis.summary( by_types=by_types, by_layers=by_layers, @@ -103,13 +97,9 @@ def main( print("Comparison Analysis:") for model_to_compare in compare: - compare_model_analysis = ModelAnalysis.create(model_to_compare) - _LOGGER.info(f"Running Performance Analysis on {model_to_compare}") - performance_summary = run_benchmark_and_analysis( - onnx_model=model_to_path(model_to_compare), - scenario=scenario, + compare_model_analysis = analyze( + model_to_compare, batch_size_throughput, benchmark_engine ) - compare_model_analysis.benchmark_results = [performance_summary] summary_comparison_model = compare_model_analysis.summary( by_types=by_types, by_layers=by_layers, @@ -124,6 +114,34 @@ def main( analysis.yaml(file_path=save) +def analyze( + model_path, + batch_size_throughput: int = 1, + benchmark_engine: str = DEEPSPARSE_ENGINE, +) -> ModelAnalysis: + """ + :param model_path: Local filepath to an ONNX model, or a SparseZoo stub + :param batch_size_throughput: Batch size for throughput benchmark + :param benchmark_engine: Benchmark engine to use, can be 'deepsparse' or + 'onnxruntime', defaults to 
'deepsparse' + :return: A `ModelAnalysis` object encapsulating the results of the analysis + """ + analysis = ModelAnalysis.create(model_path) + _LOGGER.info("Analysis complete, collating results...") + scenario = BenchmarkScenario( + batch_size=batch_size_throughput, + num_cores=None, + engine=benchmark_engine, + ) + performance_summary = run_benchmark_and_analysis( + onnx_model=model_to_path(model_path), + scenario=scenario, + ) + + analysis.benchmark_results = [performance_summary] + return analysis + + def run_benchmark_and_analysis( onnx_model: str, scenario: BenchmarkScenario, From 59e0602ba1ac24a7a02f2f1856ef44b5528c0d8f Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Mon, 5 Feb 2024 16:56:34 +0100 Subject: [PATCH 07/16] [DeepSparse Evaluation API] UX Improvements (#1568) * initial commit * add some more tests for hardening * Update src/deepsparse/evaluation/cli.py * Update src/deepsparse/transformers/pipelines/text_generation/pipeline.py * Apply suggestions from code review * quality * Update test_evaluator.py * quality --- src/deepsparse/evaluation/cli.py | 30 +++---- src/deepsparse/evaluation/evaluator.py | 34 +++++--- src/deepsparse/evaluation/registry.py | 9 +- src/deepsparse/evaluation/utils.py | 85 +++++++------------ .../pipelines/text_generation/pipeline.py | 8 ++ .../text_generation/pipeline_no_kv_cache.py | 8 ++ .../test_lm_evaluation_harness.py | 8 +- tests/deepsparse/evaluation/test_evaluator.py | 47 +++++++--- tests/deepsparse/evaluation/test_utils.py | 53 ++---------- 9 files changed, 132 insertions(+), 150 deletions(-) diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index ed7ea72831..b68d32d4e5 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -20,7 +20,8 @@ Module for evaluating models on the various evaluation integrations OPTIONS: - --target TARGET A path to a remote or local directory containing ONNX/torch model + --model_path MODEL_PATH + A path to an ONNX model, local directory containing ONNX model (including all the auxiliary files) or a SparseZoo stub -d DATASET, --dataset DATASET The dataset to evaluate on. The user may pass multiple datasets @@ -30,9 +31,7 @@ integration name that is registered in the evaluation registry -e ENGINE_TYPE, --engine_type ENGINE_TYPE Inference engine to use for the evaluation. The default - is the DeepSparse engine. If the evaluation should be run - without initializing a pipeline (e.g. for the evaluation - of a torch model), the engine type should be set to None + is the DeepSparse engine. -s SAVE_PATH, --save_path SAVE_PATH The path to save the evaluation results. By default the results will be saved in the @@ -90,10 +89,10 @@ ) ) @click.option( - "--target", + "--model_path", type=click.Path(dir_okay=True, file_okay=True), required=True, - help="A path to a remote or local directory containing ONNX/torch model " + help="A path to an ONNX model, local directory containing ONNX model" "(including all the auxiliary files) or a SparseZoo stub", ) @click.option( @@ -118,9 +117,7 @@ type=click.Choice([DEEPSPARSE_ENGINE, ORT_ENGINE, TORCHSCRIPT_ENGINE]), default=DEEPSPARSE_ENGINE, help="The engine to use for the evaluation. The default is the " - "DeepSparse engine. If the evaluation should be run without " - "initializing a pipeline (e.g. for the evaluation of a torch " - "model), the engine type should be set to None", + "DeepSparse engine. 
", ) @click.option( "-s", @@ -167,7 +164,7 @@ ) @click.argument("integration_args", nargs=-1, type=click.UNPROCESSED) def main( - target, + model_path, dataset, integration, engine_type, @@ -183,14 +180,9 @@ def main( # format kwargs to a dict integration_args = args_to_dict(integration_args) - _LOGGER.info(f"Target to evaluate: {target}") - if engine_type: - _LOGGER.info(f"A pipeline with the engine type: {engine_type} will be created") - else: - _LOGGER.info( - "No engine type specified. The target " - "will be evaluated using the native framework" - ) + _LOGGER.info( + f"Creating {engine_type} pipeline to evaluate from model path: {model_path}" + ) _LOGGER.info( f"Datasets to evaluate on: {datasets}\n" @@ -201,7 +193,7 @@ def main( ) result: Result = evaluate( - target=target, + model=model_path, datasets=datasets, integration=integration, engine_type=engine_type, diff --git a/src/deepsparse/evaluation/evaluator.py b/src/deepsparse/evaluation/evaluator.py index 7bd56adf6e..b513f07563 100644 --- a/src/deepsparse/evaluation/evaluator.py +++ b/src/deepsparse/evaluation/evaluator.py @@ -12,11 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging -from typing import Any, List, Optional, Union +from pathlib import Path +from typing import List, Optional, Union +from deepsparse import Pipeline from deepsparse.evaluation.registry import EvaluationRegistry from deepsparse.evaluation.results import Result -from deepsparse.evaluation.utils import create_model_from_target +from deepsparse.evaluation.utils import create_pipeline from deepsparse.operators.engine_operator import ( DEEPSPARSE_ENGINE, ORT_ENGINE, @@ -30,11 +32,11 @@ def evaluate( - target: Any, + model: Union[Pipeline, Path, str], datasets: Union[str, List[str]], integration: Optional[str] = None, engine_type: Union[ - DEEPSPARSE_ENGINE, ORT_ENGINE, TORCHSCRIPT_ENGINE, None + DEEPSPARSE_ENGINE, ORT_ENGINE, TORCHSCRIPT_ENGINE ] = DEEPSPARSE_ENGINE, batch_size: int = 1, splits: Union[List[str], str, None] = None, @@ -42,18 +44,26 @@ def evaluate( **kwargs, ) -> Result: - # if target is a string, turn it into an appropriate model/pipeline - # otherwise assume it is a model/pipeline - model = ( - create_model_from_target(target, engine_type) - if isinstance(target, str) - else target + if isinstance(model, Pipeline): + _LOGGER.info( + "Passed a Pipeline object into evaluate function. 
This will " + "override the following arguments:" + ) + batch_size = model.batch_size + _LOGGER.info(f"batch_size: {batch_size}") + engine_type = engine_type + _LOGGER.info(f"engine_type: {engine_type}") + + # if target is a string, turn it into an appropriate pipeline + # otherwise assume it is a pipeline + pipeline = ( + create_pipeline(model, engine_type) if isinstance(model, (Path, str)) else model ) - eval_integration = EvaluationRegistry.resolve(model, datasets, integration) + eval_integration = EvaluationRegistry.resolve(pipeline, datasets, integration) return eval_integration( - model=model, + pipeline=pipeline, datasets=datasets, engine_type=engine_type, batch_size=batch_size, diff --git a/src/deepsparse/evaluation/registry.py b/src/deepsparse/evaluation/registry.py index 5b6e45bc1c..2daabb69cc 100644 --- a/src/deepsparse/evaluation/registry.py +++ b/src/deepsparse/evaluation/registry.py @@ -15,8 +15,9 @@ Implementation of a registry for evaluation functions """ import logging -from typing import Any, Callable, List, Optional, Union +from typing import Callable, List, Optional, Union +from deepsparse import Pipeline from sparsezoo.utils.registry import RegistryMixin @@ -38,7 +39,7 @@ def load_from_registry(cls, name: str) -> Callable[..., "Result"]: # noqa: F821 @classmethod def resolve( cls, - model: Any, + pipeline: Pipeline, datasets: Union[str, List[str]], integration: Optional[str] = None, ) -> Callable[..., "Result"]: # noqa: F821 @@ -59,12 +60,12 @@ def resolve( "No integration specified, inferring the evaluation" "function from the input arguments..." ) - integration = resolve_integration(model, datasets) + integration = resolve_integration(pipeline, datasets) if integration is None: raise ValueError( "Unable to resolve an evaluation function for the given model. " - "Specify an integration name or use a model that is supported " + "Specify an integration name or use a pipeline that is supported " ) _LOGGER.info(f"Inferred the evaluation function: {integration}") diff --git a/src/deepsparse/evaluation/utils.py b/src/deepsparse/evaluation/utils.py index 7684e54513..87475dd5d2 100644 --- a/src/deepsparse/evaluation/utils.py +++ b/src/deepsparse/evaluation/utils.py @@ -15,21 +15,11 @@ import os from typing import Any, Dict, List, Optional, Tuple, Union - -try: - from transformers import AutoModelForCausalLM, PreTrainedModel - - transformers_error = None -except ImportError as import_error: - transformers_error = import_error - - from deepsparse import Pipeline -from deepsparse.operators.engine_operator import DEEPSPARSE_ENGINE, ORT_ENGINE __all__ = [ - "create_model_from_target", + "create_pipeline", "get_save_path", "args_to_dict", "resolve_integration", @@ -57,36 +47,36 @@ def potentially_check_dependency_import(integration_name: str) -> bool: def resolve_integration( - model: Union[Pipeline, "PreTrainedModel"], datasets: Union[str, List[str]] + pipeline: Pipeline, datasets: Union[str, List[str]] ) -> Union[str, None]: """ - Given a model and dataset, infer the name of the evaluation integration + Given a pipeline and dataset, infer the name of the evaluation integration to use. If unable to infer a name, return None. 
Currently: if the model is a generative language model, default to 'lm-evaluation-harness' otherwise return None - :param model: The model to infer the integration for + :param pipeline: The pipeline to infer the integration for :param datasets: The datasets to infer the integration for :return: The name of the integration to use or None if unable to infer """ - if if_generative_language_model(model): + if if_generative_language_model(pipeline): return LM_EVALUATION_HARNESS return None -def if_generative_language_model(model: Any) -> bool: +def if_generative_language_model(pipeline: Pipeline) -> bool: """ Checks if the model is a generative language model. """ - _check_transformers_dependency() - if isinstance(model, Pipeline): - return model.__class__.__name__ == "TextGenerationPipeline" - elif isinstance(model, PreTrainedModel): - return "CausalLM" in model.__class__.__name__ - else: - return False + pipeline_name = pipeline.__class__.__name__ + if pipeline_name == "TextGenerationPipeline" or ( + pipeline_name == "TextGenerationPipelineNoKVCache" + ): + return True + + return False def args_to_dict(args: Tuple[Any, ...]) -> Dict[str, Any]: @@ -134,43 +124,30 @@ def get_save_path( return os.path.join(base_path, file_name) -def create_model_from_target( - target: str, +def create_pipeline( + model_path: str, engine_type: Optional[str] = None, **kwargs, -) -> Union[Pipeline, "AutoModelForCausalLM"]: +) -> Pipeline: """ - Create a model or a pipeline from a target path. + Create a pipeline for evaluation - Note: This function is currently limited to: - - creating pipelines of type 'text-generation' - - creating dense huggingface models of type 'AutoModelForCausalLM' - This function will be expanded in the future to support more - model types and frameworks. + Note: This function is currently primarily + focused on creating pipelines of type 'text-generation' + This function will be expanded in the future to support + more tasks and models - :param target: The target path to initialize the + :param model_path: The target path to initialize the text generation model from. This can be a local or remote path to the model or a sparsezoo stub :param engine_type: The engine type to initialize the model with. 
- :return: The initialized model + :return: The initialized pipeline """ - _check_transformers_dependency() - - if engine_type in [DEEPSPARSE_ENGINE, ORT_ENGINE]: - return Pipeline.create( - task="text-generation", - model_path=target, - sequence_length=kwargs.pop("sequence_length", 2048), - engine_type=engine_type, - batch_size=kwargs.pop("batch_size", 1), - **kwargs, - ) - else: - return AutoModelForCausalLM.from_pretrained(target, **kwargs) - - -def _check_transformers_dependency(): - if transformers_error: - raise ImportError( - "transformers is needed to use this module" - ) from transformers_error + return Pipeline.create( + task=kwargs.pop("task", "text-generation"), + model_path=model_path, + sequence_length=kwargs.pop("sequence_length", 2048), + engine_type=engine_type, + batch_size=kwargs.pop("batch_size", 1), + **kwargs, + ) diff --git a/src/deepsparse/transformers/pipelines/text_generation/pipeline.py b/src/deepsparse/transformers/pipelines/text_generation/pipeline.py index 2c858c901b..64c0c64a51 100644 --- a/src/deepsparse/transformers/pipelines/text_generation/pipeline.py +++ b/src/deepsparse/transformers/pipelines/text_generation/pipeline.py @@ -357,6 +357,14 @@ def sequence_length(self) -> int: """ return self.ops["single_engine"].sequence_length + @property + def batch_size(self) -> int: + return self.ops["single_engine"].batch_size + + @property + def engine_type(self) -> str: + return self.ops["single_engine"]._engine_type + def _get_continuous_batching_scheduler( self, batch_sizes: List[int], engines: List[EngineOperator] ) -> ContinuousBatchingScheduler: diff --git a/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py b/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py index 7f6cb9db5f..c6cbc3dd59 100644 --- a/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py +++ b/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py @@ -127,3 +127,11 @@ def expand_inputs(self, items, batch_size): out, orig_batch_size = split_engine_inputs(items, batch_size) combined_batches = [{"input_ids": b[0], "attention_mask": b[1]} for b in out] return combined_batches, orig_batch_size + + @property + def batch_size(self) -> int: + return self.ops["engine_operator"].batch_size + + @property + def engine_type(self) -> str: + return self.ops["engine_operator"]._engine_type diff --git a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py index 9fa9b494cf..3b9016294f 100644 --- a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py +++ b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py @@ -12,19 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from transformers import AutoModelForCausalLM + import pytest from deepsparse.evaluation.integrations import try_import_lm_evaluation_harness -from deepsparse.evaluation.utils import create_model_from_target +from deepsparse.evaluation.utils import create_pipeline @pytest.mark.parametrize( "pipeline, model_torch", [ ( - create_model_from_target( + create_pipeline( "hf:mgoin/TinyStories-1M-deepsparse", engine_type="onnxruntime" ), - create_model_from_target("roneneldan/TinyStories-1M"), + AutoModelForCausalLM.from_pretrained("roneneldan/TinyStories-1M"), ) ], ) diff --git a/tests/deepsparse/evaluation/test_evaluator.py b/tests/deepsparse/evaluation/test_evaluator.py index dedd63fa36..816ad075e0 100644 --- a/tests/deepsparse/evaluation/test_evaluator.py +++ b/tests/deepsparse/evaluation/test_evaluator.py @@ -29,6 +29,7 @@ Metric, Result, ) +from deepsparse.pipeline import Pipeline @EvaluationRegistry.register() @@ -49,7 +50,7 @@ def dummy_integration(*args, **kwargs): @pytest.fixture() -def target(): +def model_path(): return "hf:mgoin/TinyStories-1M-deepsparse" @@ -68,18 +69,42 @@ def unknown_integration_name(): return "unknown_integration" -def test_evaluate_unknown_integration(target, datasets, unknown_integration_name): +def test_evaluate_unknown_integration(model_path, datasets, unknown_integration_name): with pytest.raises(KeyError): evaluate( - target=target, + model=model_path, datasets=datasets, integration=unknown_integration_name, ) -def test_evaluate(target, datasets, dummy_integration_name): +def test_evaluate(model_path, datasets, dummy_integration_name): result = evaluate( - target=target, + model=model_path, + datasets=datasets, + integration=dummy_integration_name, + ) + assert isinstance(result, Result) + + +def test_evaluate_pipeline_with_kv_cache(model_path, datasets, dummy_integration_name): + result = evaluate( + model=Pipeline.create(model_path=model_path, task="text-generation"), + datasets=datasets, + integration=dummy_integration_name, + ) + assert isinstance(result, Result) + + +def test_evaluate_pipeline_without_kv_cache( + model_path, datasets, dummy_integration_name +): + result = evaluate( + model=Pipeline.create( + model_path=model_path, + task="text-generation", + onnx_model_name="model-orig.onnx", + ), datasets=datasets, integration=dummy_integration_name, ) @@ -91,11 +116,11 @@ def test_evaluate(target, datasets, dummy_integration_name): reason="lm_evaluation_harness not installed", ) def test_evaluation_llm_evaluation_harness_integration_name( - target, + model_path, datasets, ): assert evaluate( - target=target, + model=model_path, datasets=datasets, limit=2, no_cache=True, @@ -110,15 +135,17 @@ def test_evaluation_llm_evaluation_harness_integration_name( "with importing functions that are decorated with " "click option where multiple=True", ) -def test_cli(tmp_path, target, datasets, dummy_integration_name, type_serialization): +def test_cli( + tmp_path, model_path, datasets, dummy_integration_name, type_serialization +): from deepsparse.evaluation.cli import main runner = CliRunner() runner.invoke( main, [ - "--target", - target, + "--model_path", + model_path, "--dataset", datasets[0], "--dataset", diff --git a/tests/deepsparse/evaluation/test_utils.py b/tests/deepsparse/evaluation/test_utils.py index f712dce0df..f8f3c731a8 100644 --- a/tests/deepsparse/evaluation/test_utils.py +++ b/tests/deepsparse/evaluation/test_utils.py @@ -14,32 +14,16 @@ import os -from transformers import ( - AutoModelForCausalLM, - AutoModelForSequenceClassification, - 
GPTNeoForCausalLM, -) - import pytest from deepsparse import Pipeline from deepsparse.evaluation.utils import ( - create_model_from_target, + create_pipeline, get_save_path, if_generative_language_model, resolve_integration, ) -@pytest.fixture -def llm_type_hf_model(): - return AutoModelForCausalLM.from_pretrained("roneneldan/TinyStories-1M") - - -@pytest.fixture -def not_llm_type_hf_model(): - return AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") - - @pytest.fixture def llm_type_pipeline(): return Pipeline.create( @@ -49,25 +33,13 @@ def llm_type_pipeline(): ) -def test_resolve_known_llm_model(llm_type_hf_model): +def test_resolve_known_llm_pipeline(llm_type_pipeline): assert ( - resolve_integration(model=llm_type_hf_model, datasets="") + resolve_integration(pipeline=llm_type_pipeline, datasets="") == "lm-evaluation-harness" ) -def test_resolve_unknown_model(not_llm_type_hf_model): - assert resolve_integration(model=not_llm_type_hf_model, datasets="") is None - - -def test_if_generative_language_model_true(llm_type_hf_model): - assert if_generative_language_model(llm_type_hf_model) - - -def test_if_generative_language_model_false(not_llm_type_hf_model): - assert not if_generative_language_model(not_llm_type_hf_model) - - def test_if_generative_language_pipeline_true(llm_type_pipeline): assert if_generative_language_model(llm_type_pipeline) @@ -89,26 +61,11 @@ def pipeline_target(): return "hf:mgoin/TinyStories-1M-deepsparse" -@pytest.fixture -def torch_target(): - return "roneneldan/TinyStories-1M" - - def test_initialize_model_from_target_pipeline_onnx(pipeline_target): - model = create_model_from_target(pipeline_target, "onnxruntime") + model = create_pipeline(pipeline_target, "onnxruntime") assert model.ops.get("single_engine")._engine_type == "onnxruntime" -def test_initialize_model_from_target_pipeline_deepsparse(pipeline_target): - model = create_model_from_target(pipeline_target, "deepsparse") - assert model.ops.get("single_engine")._engine_type == "deepsparse" - - def test_initialize_model_from_target_pipeline_with_kwargs(pipeline_target): - model = create_model_from_target(pipeline_target, "deepsparse", sequence_length=64) + model = create_pipeline(pipeline_target, "deepsparse", sequence_length=64) assert model.ops.get("process_input").sequence_length == 64 - - -def test_initialize_model_from_target_torch(torch_target): - model = create_model_from_target(torch_target, "torch") - assert isinstance(model, GPTNeoForCausalLM) From d7d1acb6b0d88dfed279f8209b138043eaeaad34 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 6 Feb 2024 10:37:13 -0500 Subject: [PATCH 08/16] add back continuous batching tests (#1585) --- .../deepsparse/schedulers/test_continuous_batching_scheduler.py | 2 -- .../schedulers/utils/test_continuous_batching_executor.py | 2 -- .../transformers/text_generation/integration_tests/test_llms.py | 1 - 3 files changed, 5 deletions(-) diff --git a/tests/deepsparse/schedulers/test_continuous_batching_scheduler.py b/tests/deepsparse/schedulers/test_continuous_batching_scheduler.py index 4502a8fcf9..627202773c 100644 --- a/tests/deepsparse/schedulers/test_continuous_batching_scheduler.py +++ b/tests/deepsparse/schedulers/test_continuous_batching_scheduler.py @@ -16,12 +16,10 @@ import numpy -import pytest from deepsparse.operators import EngineOperator from deepsparse.schedulers import ContinuousBatchingScheduler -@pytest.mark.skip("skip continuous batching tests") def test_continuous_batching_executor_thread(): # simple test that 
ContinuousBatchingScheduler can be instantiated and return # a result from a request, for testing multi-batch execution, making enough diff --git a/tests/deepsparse/schedulers/utils/test_continuous_batching_executor.py b/tests/deepsparse/schedulers/utils/test_continuous_batching_executor.py index fa41259f21..e26532d088 100644 --- a/tests/deepsparse/schedulers/utils/test_continuous_batching_executor.py +++ b/tests/deepsparse/schedulers/utils/test_continuous_batching_executor.py @@ -16,7 +16,6 @@ import numpy -import pytest from deepsparse.operators import EngineOperator from deepsparse.schedulers.utils import ( ContinuousBatchingExecutorThread, @@ -24,7 +23,6 @@ ) -@pytest.mark.skip("skip continuous batching tests") def test_continuous_batching_executor_thread(): # mobilenet model with batch_size=2 engine_operator = EngineOperator("zoo:mobilenet_v2-1.0-imagenet-base") diff --git a/tests/deepsparse/transformers/text_generation/integration_tests/test_llms.py b/tests/deepsparse/transformers/text_generation/integration_tests/test_llms.py index e03d97e7f3..633ee19c53 100644 --- a/tests/deepsparse/transformers/text_generation/integration_tests/test_llms.py +++ b/tests/deepsparse/transformers/text_generation/integration_tests/test_llms.py @@ -132,7 +132,6 @@ def setup(self, params_dict, max_new_tokens, internal_kv_cache): self.default_pipeline = None self.max_new_tokens = max_new_tokens - @pytest.mark.skip("skip continuous batching tests") def test_continuous_batching_pipeline(self, setup): pipeline = self.get_pipeline( From 8a83e24edd41150844525d6175b7ea9e63111ce2 Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Tue, 6 Feb 2024 16:27:23 -0500 Subject: [PATCH 09/16] add top level benchmark imports (#1587) --- src/deepsparse/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/deepsparse/__init__.py b/src/deepsparse/__init__.py index 436990b065..fab294bfaa 100644 --- a/src/deepsparse/__init__.py +++ b/src/deepsparse/__init__.py @@ -40,5 +40,6 @@ from .subgraph_execute import * from .analyze import analyze from .evaluation.evaluator import evaluate +from .benchmark import benchmark_model, benchmark_pipeline _analytics.send_event("python__init") From c54461a9ea0832b8c76366400b21fe73baebd3d4 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 7 Feb 2024 13:08:24 -0500 Subject: [PATCH 10/16] [TextGeneration] Add helper function to parse model path from args (#1583) * add helper function to parse model path from args * update model path * revert cli changes * remove empty args --- src/deepsparse/pipeline.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py index aaa65409d8..23ff3a2810 100644 --- a/src/deepsparse/pipeline.py +++ b/src/deepsparse/pipeline.py @@ -702,7 +702,8 @@ def text_generation_pipeline(*args, **kwargs) -> "Pipeline": :return: text generation pipeline with the given args and kwargs passed to Pipeline.create """ - return Pipeline.create("text_generation", *args, **kwargs) + kwargs = _check_model_path_arg(*args, **kwargs) + return Pipeline.create("text_generation", **kwargs) def code_generation_pipeline(*args, **kwargs) -> "Pipeline": @@ -710,7 +711,8 @@ def code_generation_pipeline(*args, **kwargs) -> "Pipeline": :return: text generation pipeline with the given args and kwargs passed to Pipeline.create """ - return Pipeline.create("code_generation", *args, **kwargs) + kwargs = _check_model_path_arg(*args, **kwargs) + return Pipeline.create("code_generation", **kwargs) def 
chat_pipeline(*args, **kwargs) -> "Pipeline": @@ -718,7 +720,8 @@ def chat_pipeline(*args, **kwargs) -> "Pipeline": :return: text generation pipeline with the given args and kwargs passed to Pipeline.create """ - return Pipeline.create("chat", *args, **kwargs) + kwargs = _check_model_path_arg(*args, **kwargs) + return Pipeline.create("chat", **kwargs) TextGeneration = text_generation_pipeline @@ -802,3 +805,13 @@ def zero_shot_text_classification_pipeline(*args, **kwargs) -> "Pipeline": is returned depends on the value of the passed model_scheme argument. """ return Pipeline.create("zero_shot_text_classification", *args, **kwargs) + + +def _check_model_path_arg(*args, **kwargs): + if args: + if len(args) > 1 or "model_path" in kwargs or "model" in kwargs: + raise ValueError( + "Only the model path can be provided as a non-kwarg argument" + ) + kwargs["model_path"] = args[0] + return kwargs From 2e33e673cf897ee93689a6ade070b40503ee8df3 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 7 Feb 2024 14:24:22 -0500 Subject: [PATCH 11/16] [server] Add `model` argument to server cli (#1584) * update model path to be an argument; remove unused openai command pathway * add model path arg and option --- src/deepsparse/server/cli.py | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/src/deepsparse/server/cli.py b/src/deepsparse/server/cli.py index 5eacc748a0..acd7b6897c 100644 --- a/src/deepsparse/server/cli.py +++ b/src/deepsparse/server/cli.py @@ -79,6 +79,7 @@ ), ) +MODEL_ARG = click.argument("model", type=str, default=None, required=False) MODEL_OPTION = click.option( "--model_path", type=str, @@ -152,6 +153,7 @@ @PORT_OPTION @LOG_LEVEL_OPTION @HOT_RELOAD_OPTION +@MODEL_ARG @MODEL_OPTION @BATCH_OPTION @CORES_OPTION @@ -167,6 +169,7 @@ def main( log_level: str, hot_reload_config: bool, model_path: str, + model: str, batch_size: int, num_cores: int, num_workers: int, @@ -216,6 +219,17 @@ def main( ... 
``` """ + # the server cli can take a model argument or --model_path option + # if the --model_path option is provided, use that + # otherwise if the argument is given and --model_path is not used, use the + # argument instead + if model and model_path == "default": + model_path = model + + if integration == INTEGRATION_OPENAI: + if task is None or task != "text_generation": + task = "text_generation" + if ctx.invoked_subcommand is not None: return @@ -254,24 +268,6 @@ def main( server.start_server(host, port, log_level, hot_reload_config=hot_reload_config) -@main.command( - context_settings=dict( - token_normalize_func=lambda x: x.replace("-", "_"), show_default=True - ), -) -@click.argument("config-file", type=str) -@HOST_OPTION -@PORT_OPTION -@LOG_LEVEL_OPTION -@HOT_RELOAD_OPTION -def openai( - config_file: str, host: str, port: int, log_level: str, hot_reload_config: bool -): - - server = OpenAIServer(server_config=config_file) - server.start_server(host, port, log_level, hot_reload_config=hot_reload_config) - - @main.command( context_settings=dict( token_normalize_func=lambda x: x.replace("-", "_"), show_default=True From de64fe5f0d93b1d036a37c6cee22dcaff13663d9 Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Thu, 8 Feb 2024 15:31:42 -0500 Subject: [PATCH 12/16] make sure benchmark_* imports are the functions, not modules (#1593) --- src/deepsparse/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/deepsparse/__init__.py b/src/deepsparse/__init__.py index fab294bfaa..46c49b236d 100644 --- a/src/deepsparse/__init__.py +++ b/src/deepsparse/__init__.py @@ -40,6 +40,7 @@ from .subgraph_execute import * from .analyze import analyze from .evaluation.evaluator import evaluate -from .benchmark import benchmark_model, benchmark_pipeline +from .benchmark.benchmark_model import benchmark_model +from .benchmark.benchmark_pipeline import benchmark_pipeline _analytics.send_event("python__init") From 0ddeda269410cb71e2225dd96759f914a9a2e1af Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Fri, 9 Feb 2024 10:35:49 -0500 Subject: [PATCH 13/16] [server] Update readmes to no longer use the deprecated pathway + update pathway as per new UX docs (#1592) * update readmes to no longer use the depreciated pathway * update to get new ux workflows --- src/deepsparse/server/README.md | 12 +-- src/deepsparse/server/cli.py | 106 ++------------------------ src/deepsparse/transformers/README.md | 10 +-- src/deepsparse/yolact/README.md | 2 +- src/deepsparse/yolo/README.md | 2 +- 5 files changed, 16 insertions(+), 116 deletions(-) diff --git a/src/deepsparse/server/README.md b/src/deepsparse/server/README.md index d4beb3907f..1326ce677a 100644 --- a/src/deepsparse/server/README.md +++ b/src/deepsparse/server/README.md @@ -18,15 +18,15 @@ Usage: deepsparse.server [OPTIONS] COMMAND [ARGS]... 1. `deepsparse.server --config_file [OPTIONS] ` - 2. `deepsparse.server task [OPTIONS] + 2. `deepsparse.server --task [OPTIONS] Examples for using the server: `deepsparse.server --config_file server-config.yaml` - `deepsparse.server task question_answering --batch-size 2` + `deepsparse.server --task question_answering --batch-size 2` - `deepsparse.server task question_answering --host "0.0.0.0"` + `deepsparse.server --task question_answering --host "0.0.0.0"` Example config.yaml for serving: @@ -63,10 +63,6 @@ Usage: deepsparse.server [OPTIONS] COMMAND [ARGS]... Options: --help Show this message and exit. - -Commands: - config Run the server using configuration from a .yaml file. 
- task Run the server using configuration with CLI options, which can... ``` ---
Note on the latest server release
@@ -104,7 +100,7 @@ Example CLI command for serving a single model for the **question answering** ta ```bash deepsparse.server \ - task question_answering \ + --task question_answering \ --model_path "zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/12layer_pruned80_quant-none-vnni" ``` diff --git a/src/deepsparse/server/cli.py b/src/deepsparse/server/cli.py index acd7b6897c..6d3952c5f5 100644 --- a/src/deepsparse/server/cli.py +++ b/src/deepsparse/server/cli.py @@ -11,16 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -""" -There are two sub-commands for the server: -1. `deepsparse.server config [OPTIONS] ` -2. `deepsparse.server task [OPTIONS] -``` -""" - import os -import warnings from tempfile import TemporaryDirectory from typing import Optional, Union @@ -223,6 +214,7 @@ def main( # if the --model_path option is provided, use that # otherwise if the argument is given and --model_path is not used, use the # argument instead + if model and model_path == "default": model_path = model @@ -236,6 +228,10 @@ def main( if task is None and config_file is None: raise ValueError("Must specify either --task or --config_file. Found neither") + if config_file is not None: + server = _fetch_server(integration=integration, config=config_file) + server.start_server(host, port, log_level, hot_reload_config=hot_reload_config) + if task is not None: cfg = ServerConfig( num_cores=num_cores, @@ -263,98 +259,6 @@ def main( host, port, log_level, hot_reload_config=hot_reload_config ) - if config_file is not None: - server = _fetch_server(integration=integration, config=config_file) - server.start_server(host, port, log_level, hot_reload_config=hot_reload_config) - - -@main.command( - context_settings=dict( - token_normalize_func=lambda x: x.replace("-", "_"), show_default=True - ), -) -@click.argument("config-path", type=str) -@HOST_OPTION -@PORT_OPTION -@LOG_LEVEL_OPTION -@HOT_RELOAD_OPTION -def config( - config_path: str, host: str, port: int, log_level: str, hot_reload_config: bool -): - "[DEPRECATED] Run the server using configuration from a .yaml file." - warnings.simplefilter("always", DeprecationWarning) - warnings.warn( - "Using the `config` sub command is deprecated. " - "Use the `--config_file` argument instead.", - category=DeprecationWarning, - ) - - -@main.command( - context_settings=dict( - token_normalize_func=lambda x: x.replace("-", "_"), show_default=True - ), -) -@click.argument( - "task", - type=click.Choice(SupportedTasks.task_names(), case_sensitive=False), -) -@MODEL_OPTION -@BATCH_OPTION -@CORES_OPTION -@WORKERS_OPTION -@HOST_OPTION -@PORT_OPTION -@LOG_LEVEL_OPTION -@HOT_RELOAD_OPTION -@INTEGRATION_OPTION -def task( - task: str, - model_path: str, - batch_size: int, - num_cores: int, - num_workers: int, - host: str, - port: int, - log_level: str, - hot_reload_config: bool, - integration: str, -): - """ - [DEPRECATED] Run the server using configuration with CLI options, - which can only serve a single model. - """ - - warnings.simplefilter("always", DeprecationWarning) - warnings.warn( - "Using the `task` sub command is deprecated. 
" - "Use the `--task` argument instead.", - category=DeprecationWarning, - ) - - cfg = ServerConfig( - num_cores=num_cores, - num_workers=num_workers, - integration=integration, - endpoints=[ - EndpointConfig( - task=task, - name=f"{task}", - model=model_path, - batch_size=batch_size, - ) - ], - loggers={}, - ) - - with TemporaryDirectory() as tmp_dir: - config_path = os.path.join(tmp_dir, "server-config.yaml") - with open(config_path, "w") as fp: - yaml.dump(cfg.dict(), fp) - - server = _fetch_server(integration=integration, config=config_path) - server.start_server(host, port, log_level, hot_reload_config=hot_reload_config) - def _fetch_server(integration: str, config: Union[ServerConfig, str]): if isinstance(config, str): diff --git a/src/deepsparse/transformers/README.md b/src/deepsparse/transformers/README.md index 33cc4e758c..f1e38aee85 100644 --- a/src/deepsparse/transformers/README.md +++ b/src/deepsparse/transformers/README.md @@ -118,7 +118,7 @@ inference = qa_pipeline(question="What's my name?", context="My name is Snorlax" Spinning up: ```bash deepsparse.server \ - task question-answering \ + --task question-answering \ --model_path "zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/12layer_pruned80_quant-none-vnni" ``` @@ -162,7 +162,7 @@ inference = opt_pipeline("Who is the president of the United States?") Spinning up: ```bash deepsparse.server \ - task text-generation \ + --task text-generation \ --model_path zoo:opt-1.3b-opt_pretrain-pruned50_quantW8A8 ``` @@ -210,7 +210,7 @@ inference = sa_pipeline("I hate it!") Spinning up: ```bash deepsparse.server \ - task sentiment-analysis \ + --task sentiment-analysis \ --model_path "zoo:nlp/sentiment_analysis/bert-base/pytorch/huggingface/sst2/pruned80_quant-none-vnni" ``` @@ -263,7 +263,7 @@ inference = tc_pipeline( Spinning up: ```bash deepsparse.server \ - task text-classification \ + --task text-classification \ --model_path "zoo:nlp/text_classification/distilbert-none/pytorch/huggingface/qqp/pruned80_quant-none-vnni" ``` @@ -316,7 +316,7 @@ inference = tc_pipeline("Drive from California to Texas!") Spinning up: ```bash deepsparse.server \ - task token-classification \ + --task token-classification \ --model_path "zoo:nlp/token_classification/bert-base/pytorch/huggingface/conll2003/pruned90-none" ``` diff --git a/src/deepsparse/yolact/README.md b/src/deepsparse/yolact/README.md index e106f4ae75..f4013d6fa3 100644 --- a/src/deepsparse/yolact/README.md +++ b/src/deepsparse/yolact/README.md @@ -121,7 +121,7 @@ If a `--model_filepath` arg isn't provided, then `zoo:cv/segmentation/yolact-dar Spinning up: ```bash deepsparse.server \ - task yolact \ + --task yolact \ --model_path "zoo:cv/segmentation/yolact-darknet53/pytorch/dbolya/coco/pruned82_quant-none" ``` diff --git a/src/deepsparse/yolo/README.md b/src/deepsparse/yolo/README.md index 0802c2589a..cfbcbfe431 100644 --- a/src/deepsparse/yolo/README.md +++ b/src/deepsparse/yolo/README.md @@ -120,7 +120,7 @@ If a `--model_filepath` arg isn't provided, then `zoo:cv/detection/yolov5-s/pyto Spinning up: ```bash deepsparse.server \ - task yolo \ + --task yolo \ --model_path "zoo:cv/detection/yolov5-s/pytorch/ultralytics/coco/pruned_quant-aggressive_94" ``` From 517fd15a909d138c025cd2e99c99d015f3e25b10 Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Fri, 9 Feb 2024 18:11:41 +0100 Subject: [PATCH 14/16] [Feature Branch][DeepSparse Evaluation API] Update lm-eval, perplexity, additional datasets (#1580) --- setup.py | 3 +- 
src/deepsparse/evaluation/cli.py | 14 +- src/deepsparse/evaluation/evaluator.py | 1 - .../evaluation/integrations/__init__.py | 6 +- .../integrations/lm_evaluation_harness.py | 432 +++++++++--------- .../evaluation/integrations/perplexity.py | 278 +++++++++++ src/deepsparse/evaluation/registry.py | 2 +- src/deepsparse/evaluation/results.py | 4 +- src/deepsparse/evaluation/utils.py | 83 +++- src/deepsparse/transformers/metrics.py | 2 +- .../transformers/utils/eval_helpers.py | 34 +- .../test_lm_evaluation_harness.py | 136 ++++-- .../integrations/test_perplexity.py | 132 ++++++ tests/deepsparse/evaluation/test_evaluator.py | 17 +- 14 files changed, 852 insertions(+), 292 deletions(-) create mode 100644 src/deepsparse/evaluation/integrations/perplexity.py create mode 100644 tests/deepsparse/evaluation/integrations/test_perplexity.py diff --git a/setup.py b/setup.py index 8fe04d23be..d9c8dffd7d 100644 --- a/setup.py +++ b/setup.py @@ -149,6 +149,7 @@ def _parse_requirements_file(file_path): "datasets<2.16", "accelerate<0.26", "seqeval", + "evaluate", ] _sentence_transformers_integration_deps = ["optimum-deepsparse"] + _torch_deps @@ -308,7 +309,7 @@ def _setup_entry_points() -> Dict: f"deepsparse.image_classification.eval={ic_eval}", "deepsparse.license=deepsparse.license:main", "deepsparse.validate_license=deepsparse.license:validate_license_cli", - "deepsparse.eval=deepsparse.evaluation.cli:main", + "deepsparse.evaluate=deepsparse.evaluation.cli:main", ] } diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index b68d32d4e5..d192dd67a1 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -20,7 +20,7 @@ Module for evaluating models on the various evaluation integrations OPTIONS: - --model_path MODEL_PATH + MODEL_PATH A path to an ONNX model, local directory containing ONNX model (including all the auxiliary files) or a SparseZoo stub -d DATASET, --dataset DATASET @@ -72,7 +72,7 @@ from deepsparse.evaluation.evaluator import evaluate from deepsparse.evaluation.results import Result, save_result -from deepsparse.evaluation.utils import args_to_dict, get_save_path +from deepsparse.evaluation.utils import get_save_path, parse_kwarg_tuples from deepsparse.operators.engine_operator import ( DEEPSPARSE_ENGINE, ORT_ENGINE, @@ -88,12 +88,10 @@ ignore_unknown_options=True, ) ) -@click.option( - "--model_path", +@click.argument( + "model_path", type=click.Path(dir_okay=True, file_okay=True), required=True, - help="A path to an ONNX model, local directory containing ONNX model" - "(including all the auxiliary files) or a SparseZoo stub", ) @click.option( "-d", @@ -178,7 +176,7 @@ def main( # join datasets to a list if multiple datasets are passed datasets = list(dataset) if not isinstance(dataset, str) else dataset # format kwargs to a dict - integration_args = args_to_dict(integration_args) + integration_args = parse_kwarg_tuples(integration_args) _LOGGER.info( f"Creating {engine_type} pipeline to evaluate from model path: {model_path}" @@ -203,7 +201,7 @@ def main( **integration_args, ) - _LOGGER.info(f"Evaluation done. Results:\n{result}") + _LOGGER.info(f"Evaluation done. 
Results:\n{result.formatted}") save_path = get_save_path( save_path=save_path, diff --git a/src/deepsparse/evaluation/evaluator.py b/src/deepsparse/evaluation/evaluator.py index b513f07563..3d18f8489f 100644 --- a/src/deepsparse/evaluation/evaluator.py +++ b/src/deepsparse/evaluation/evaluator.py @@ -65,7 +65,6 @@ def evaluate( return eval_integration( pipeline=pipeline, datasets=datasets, - engine_type=engine_type, batch_size=batch_size, splits=splits, metrics=metrics, diff --git a/src/deepsparse/evaluation/integrations/__init__.py b/src/deepsparse/evaluation/integrations/__init__.py index 1cc3bfacf0..f0871f135a 100644 --- a/src/deepsparse/evaluation/integrations/__init__.py +++ b/src/deepsparse/evaluation/integrations/__init__.py @@ -15,7 +15,7 @@ # flake8: noqa: F401 -def try_import_lm_evaluation_harness(raise_error=False): +def try_import_lm_evaluation_harness(raise_error=True): try: import lm_eval @@ -24,11 +24,11 @@ def try_import_lm_evaluation_harness(raise_error=False): if raise_error: raise ImportError( "Unable to import lm_eval. " - "To install run 'pip install " - "git+https://github.com/EleutherAI/lm-evaluation-harness@b018a7d51'" + "To install run 'pip install lm-eval==0.4.0'" ) return False if try_import_lm_evaluation_harness(raise_error=False): from .lm_evaluation_harness import * +from .perplexity import * diff --git a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py index 2f8c7b8cef..69934af37a 100644 --- a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py +++ b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py @@ -13,35 +13,39 @@ # limitations under the License. """ -Integration of the `lm_evaluation_harness`: +Integration of the `lm-evaluation-harness`: https://github.com/EleutherAI/lm-evaluation-harness """ - -import json import logging from typing import Any, Dict, List, Optional, Tuple, Union import numpy -from pydantic import BaseModel, Field from tqdm import tqdm -import torch from deepsparse import Pipeline from deepsparse.evaluation.registry import EvaluationRegistry from deepsparse.evaluation.results import Dataset, Evaluation, Metric, Result -from lm_eval import base, evaluator, tasks, utils +from deepsparse.evaluation.utils import LM_EVALUATION_HARNESS +from deepsparse.utils.data import numpy_log_softmax +from lm_eval import evaluator, tasks, utils +from lm_eval.api.instance import Instance +from lm_eval.api.model import LM + +tasks.initialize_tasks("INFO") _LOGGER = logging.getLogger(__name__) __all__ = ["integration_eval"] -@EvaluationRegistry.register(name="lm-evaluation-harness") +@EvaluationRegistry.register(name=LM_EVALUATION_HARNESS, alias="lm-eval-harness") def integration_eval( - model: Any, + pipeline: Pipeline, datasets: Union[List[str], str], - batch_size: int, + batch_size: int = 1, + splits: Union[List[str], str, None] = None, + metrics: Union[List[str], str, None] = None, **kwargs, ) -> Result: """ @@ -49,101 +53,53 @@ def integration_eval( https://github.com/EleutherAI/lm-evaluation-harness/blob/master/main.py that is compatible with deepsparse.evaluator.py - :param model: the model/pipeline to evaluate + :param pipeline: the model/pipeline to evaluate :param datasets: the datasets to evaluate on :param batch_size: the batch size to use for evaluation :param kwargs: additional arguments to alter the behavior of the evaluation :return the evaluation results """ - # [START] - # The code that sets up the interface between deepsparse and 
lm_evaluation_harness - if isinstance(model, Pipeline): - # If the model is a Pipeline, we need to wrap - # it in a DeepSparseLM object - model = DeepSparseLM( - pipeline=model, - batch_size=batch_size, - max_gen_toks=kwargs.get("max_gen_toks"), - ) + pipeline = DeepSparseLM(pipeline=pipeline, batch_size=batch_size) datasets = (",").join(datasets) if isinstance(datasets, list) else datasets - # [END] - - # [START] - # The code below is being adapted from: - # https://github.com/EleutherAI/lm-evaluation-harness/blob/master/main.py - if kwargs.get("limit"): - _LOGGER.warning( - "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. " - "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT." - ) - - if datasets is None: - task_names = tasks.ALL_TASKS - else: - task_names = utils.pattern_match(datasets.split(","), tasks.ALL_TASKS) + task_names = utils.pattern_match(datasets.split(","), tasks.ALL_TASKS) _LOGGER.info(f"Selected Tasks: {task_names}") - description_dict = {} - if kwargs.get("description_dict_path"): - with open(kwargs.get("description_dict_path"), "r") as f: - description_dict = json.load(f) - - evaluator_input = EvaluatorInputSchema( - model=model, - tasks=task_names, - description_dict=description_dict, - batch_size=batch_size, - **kwargs, + results_raw = evaluator.simple_evaluate( + model=pipeline, tasks=task_names, batch_size=batch_size, **kwargs ) - results_raw = evaluator.simple_evaluate(**evaluator_input.dict()) - results = Result( - raw=dict(output=results_raw, input=filter_evaluator_input(evaluator_input)), + raw=results_raw, formatted=format_raw_results(results_raw), ) return results -def filter_evaluator_input( - evaluator_input: "EvaluatorInputSchema", -) -> Dict[str, Any]: # noqa: F821 - """ - Filter the evaluator input to remove the model field. - The model field is a complex object that cannot be serialized. - - :param evaluator_input: the evaluator input to filter - :return: the filtered evaluator input - """ - evaluator = evaluator_input.dict() - del evaluator["model"] - - return evaluator - - def format_raw_results(results: Dict[str, Any]) -> List[Evaluation]: """ Format the raw results from lm_evaluation_harness into a list of Evaluation objects. - :param results: the raw results from lm_evaluation_harness + :param results: the raw results from lm-evaluation-harness :return: the formatted results as a list of Evaluation objects """ formatted_results = [] for dataset_name, dataset_result in results["results"].items(): metrics = [] for metric_name, metric_value in dataset_result.items(): + if isinstance(metric_value, str): + continue metric = Metric(name=metric_name, value=metric_value) metrics.append(metric) dataset = Dataset( type=None, name=dataset_name, config=results["config"], split=None ) evaluation = Evaluation( - task="lm_evaluation_harness", + task=LM_EVALUATION_HARNESS, dataset=dataset, metrics=metrics, samples=None, @@ -152,177 +108,241 @@ def format_raw_results(results: Dict[str, Any]) -> List[Evaluation]: return formatted_results -class EvaluatorInputSchema(BaseModel): - model: Any = Field(description="The name of the model.") - tasks: List[str] = Field( - description="The task (or multiple tasks) to evaluate the target on." - ) - description_dict: Optional[Dict[str, Any]] = Field( - None, description="Description dict." - ) - batch_size: int = Field(description="The batch size to use for evaluation.") - model_args: str = Field( - "", description="Additional arguments for the evaluated model." 
- ) - num_fewshot: int = Field(0, description="The number of few shots to use.") - max_batch_size: Optional[int] = Field( - None, description="Maximal batch size to try with --batch_size auto." - ) - device: Optional[str] = Field(None, description="Device to use for evaluation.") - no_cache: bool = Field(False, description="Include this flag to prevent caching.") - limit: Optional[float] = Field( - None, - description="Limit the number of examples per task. If <1, " - "limit is a percentage of the total number of " - "examples.", - ) - decontamination_ngrams_path: Optional[str] = Field( - None, description="Specify the path for decontamination n-grams." - ) - check_integrity: bool = Field( - False, description="Include this flag to check integrity." - ) - write_out: bool = Field(False, description="Include this flag to write out.") - output_base_path: Optional[str] = Field( - None, description="Specify the output base path." - ) - - -class DeepSparseLM(base.BaseLM): +class DeepSparseLM(LM): def __init__( self, pipeline: Pipeline, - tokenizer: Optional[str] = None, batch_size: int = 1, - max_gen_toks: Optional[int] = None, + max_gen_toks: int = 256, + tokenizer: Optional["AutoTokenizer"] = None, # noqa: F821 ): """ Wrapper around the DeepSparse pipeline to make it compatible with the llm-evaluation-harness. + + :param pipeline: the pipeline object to wrap + :param batch_size: the batch size to use for evaluation + :param max_gen_toks: the maximum number of tokens to generate + when using the model for generation (see: greed_until method) + :param tokenizer: the tokenizer to use for encoding and decoding + strings and tokens. By default, the tokenizer from the pipeline """ super().__init__() - # Initialize new model and tokenizer instances - self.model = pipeline - self.tokenizer = tokenizer if tokenizer else self.model.tokenizer - - self._batch_size = batch_size + self.pipeline = pipeline + self.batch_size = batch_size + self.tokenizer = tokenizer or pipeline.tokenizer self._max_length = pipeline.sequence_length - self._max_gen_toks = max_gen_toks or 256 + self._max_gen_toks = max_gen_toks + self.batch_sizes = {} - self.vocab_size = self.tokenizer.vocab_size + def tok_encode(self, string: str) -> List[int]: + return self.tokenizer.encode(string) - def _model_call(self, inps) -> torch.Tensor: + def tok_decode(self, tokens: List[int]) -> str: + return self.tokenizer.decode(tokens) + + @property + def max_length(self) -> int: + return self._max_length + + @property + def max_gen_toks(self) -> int: + return self._max_gen_toks + + def loglikelihood(self, requests) -> List[Tuple[float, bool]]: + """ + Copied directly from + https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py + """ + new_reqs = [] + for context, continuation in [req.args for req in requests]: + if context == "": + raise NotImplementedError( + "Implementing empty context is not supported yet" + ) + context_enc, continuation_enc = self._encode_pair(context, continuation) + + new_reqs.append(((context, continuation), context_enc, continuation_enc)) + + return self._loglikelihood_tokens(new_reqs) + + def _loglikelihood_tokens( + self, + requests: List[Tuple[Tuple[str, str], List[int], List[int]]], + disable_tqdm: bool = False, + ) -> List[Tuple[float, bool]]: """ - Override the _model_call method to use the DeepSparse pipeline for - logits generation. + The function to compute the loglikelihood of the continuation + tokens given the context tokens. 
- inps: a torch tensor of shape [batch, sequence] - the size of sequence may vary from call to call - returns: a torch tensor of shape [batch, sequence, vocab] with the - logits returned from the model + This function is an adapted version of the original function from + https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py """ - # Encode the tokens to strings - prompt = self.model.tokenizer.batch_decode(inps.numpy()) - - # Run the model to map the prompt to logits - out = self.model( - prompt=prompt, - max_new_tokens=0, - include_prompt_logits=True, - output_scores=True, - ) - logits_numpy = numpy.stack([generation.score for generation in out.generations]) - return torch.from_numpy(logits_numpy) + res = [] - def greedy_until( - self, requests: List[Tuple[str, Union[List[str], str]]] - ) -> List[str]: def _collate(x): - tokens = self.tok_encode(x[0]) - return len(tokens), x[0] + """Defines the key for the sorted method""" + toks = x[1] + x[2] + return -len(toks), tuple(toks) - results = [] - reorder = utils.Reorderer(requests, _collate) + re_ord = utils.Reorderer(requests, _collate) - for chunk in utils.chunks( - tqdm(reorder.get_reordered(), disable=False), - self.batch_size, + for chunk in tqdm( + list(utils.chunks(re_ord.get_reordered(), self.batch_size)), + disable=disable_tqdm, ): - context = [c[0] for c in chunk] - request_args = chunk[0][1] - stop = request_args.get("until", None) - stop_sequences = stop if isinstance(stop, list) else [stop] - max_generation_length = request_args.get("max_length", None) - - assert ( - isinstance(max_generation_length, int) or max_generation_length is None - ) - assert isinstance(stop_sequences, list) or stop_sequences is None - - # TODO: Find a better way to handle stop sequences for 0-shot. 
- if stop_sequences is None: - until = [self.eot_token] - else: - until = stop_sequences + [self.eot_token] - - if max_generation_length is None: - max_tokens = self.max_gen_toks - else: - max_tokens = max_generation_length - - responses = self.model( - sequences=context, - max_new_tokens=max_tokens, - stop=until, - do_sample=False, + batch_inp = [] + batch_cache_key = [] + batch_continuation_enc = [] + # len(chunk) is the batch_size + for cache_key, context_enc, continuation_enc in chunk: + # how this all works (illustrated on a causal decoder-only setup): + # CTX CONT + # inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1] + # model \ \ + # logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the + # cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice # noqa: E501 + + inp = (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1] + + batch_inp.append(self.tokenizer.decode(inp)) + batch_cache_key.append(cache_key) + batch_continuation_enc.append(continuation_enc) + + response = self.pipeline( + prompt=batch_inp, + max_new_tokens=0, + output_scores=True, + include_prompt_logits=True, ) - responses = responses if type(responses) is list else [responses] + for resp, continuation_enc, cache_key in zip( + response.generations, batch_continuation_enc, batch_cache_key + ): + # (seq_len, vocab_size) + multi_scores = resp.score + # (seq_len, vocab_size) but with softmax applied + multi_logits = numpy_log_softmax(multi_scores, axis=1) + # toss out the context half of the sequence + # (cont_len, vocab_size) + continuation_multi_logits = multi_logits[-len(continuation_enc) :] + + # pick out the logits for the continuation tokens + # (cont_len,) + continuation_logits = continuation_multi_logits[ + numpy.arange(len(continuation_enc)), continuation_enc + ] + # check if the tokens generated greedly are the same + # as the expected continuation + greedy_tokens = continuation_multi_logits.argmax(axis=1) + max_equal = greedy_tokens.tolist() == continuation_enc + + # Answer: (log prob, is-exact-match) + answer = (float(continuation_logits.sum()), bool(max_equal)) + + res.append(answer) + + if cache_key is not None: + self.cache_hook.add_partial("loglikelihood", cache_key, answer) + + return re_ord.get_original(res) + + def loglikelihood_rolling( + self, requests: list[Instance] + ) -> list[tuple[float, bool]]: + raise NotImplementedError( + "The method not required by any of our " "current task integrations so far" + ) + + def generate_until(self, requests: list[Instance]) -> list[str]: + """ + The function to generate a certain number of new tokens + given a context. + + This function is an adapted version of the original function from + https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/openai_completions.py + """ + if not requests: + return [] + res = [] + requests = [req.args for req in requests] - for response in responses: - response = response.generations[0].text - # Ensure the generated responses do not contain the stop sequences. 
- for term in until: - response = response.split(term)[0] - # partial caching - self.cache_hook.add_partial("greedy_until", (context, until), response) - results.append(response) + def _collate(x): + toks = self.tok_encode(x[0]) + return len(toks), x[0] + + re_ord = utils.Reorderer(requests, _collate) + + def sameuntil_chunks(xs, size): + ret = [] + lastuntil = xs[0][1] + for x in xs: + if len(ret) >= size or x[1] != lastuntil: + yield ret, lastuntil + ret = [] + lastuntil = x[1] + ret.append(x) + + if ret: + yield ret, lastuntil + + pbar = tqdm(total=len(requests)) + for chunk, request_args in tqdm( + list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size)) + ): + inps = [] - return reorder.get_original(results) + self._max_gen_toks = request_args.pop("max_gen_toks", self.max_gen_toks) - def _model_generate(self, context, max_length, eos_token_id): - # Isn't used because we override greedy_until - raise NotImplementedError() + for context, _ in chunk: + # add context (prompts) to the list + inps.append(context) - @property - def eot_token(self) -> str: - return self.tokenizer.eos_token + until = request_args.pop("until", ["<|endoftext|>"]) + request_args.pop("do_sample", None) + request_args["temperature"] = request_args.get("temperature", 0) - @property - def eot_token_id(self) -> int: - return self.tokenizer.eos_token_id + # run inference (generate max_gen_toks tokens) + out = self.pipeline( + sequences=inps, + max_new_tokens=self.max_gen_toks - 1, + stop=until, + **request_args, + ) - @property - def max_length(self): - return self._max_length + for resp, (context, args_) in zip(out.generations, chunk): + text = resp.text + until_ = until + # split the text at the first occurrence of any of the until tokens + for term in until_: + if len(term) > 0: + text = text.split(term)[0] - @property - def max_gen_toks(self): - return self._max_gen_toks + res.append(text) - @property - def batch_size(self): - # should return self._batch_size but the - # TextGeneration model does not support batch_size > 1 - return 1 + self.cache_hook.add_partial( + "generate_until", (context, {"until": until_}), text + ) + pbar.update(1) - @property - def device(self): - pass + pbar.close() - def tok_encode(self, string: str): - return self.tokenizer.encode(string, add_special_tokens=False) + return re_ord.get_original(res) - def tok_decode(self, tokens): - return self.tokenizer.decode(tokens) + def _encode_pair( + self, context: str, continuation: str + ) -> Tuple[List[int], List[int]]: + """ + Copied directly from + https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py + """ + n_spaces = len(context) - len(context.rstrip()) + if n_spaces > 0: + continuation = context[-n_spaces:] + continuation + context = context[:-n_spaces] + whole_enc = self.tok_encode(context + continuation) + context_enc = self.tok_encode(context) + context_enc_len = len(context_enc) + continuation_enc = whole_enc[context_enc_len:] + return context_enc, continuation_enc diff --git a/src/deepsparse/evaluation/integrations/perplexity.py b/src/deepsparse/evaluation/integrations/perplexity.py new file mode 100644 index 0000000000..a9a3f3d8a3 --- /dev/null +++ b/src/deepsparse/evaluation/integrations/perplexity.py @@ -0,0 +1,278 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +from collections import defaultdict +from typing import Any, Dict, List, Optional, Union + +import numpy +from tqdm import tqdm + +from datasets import load_dataset +from deepsparse import Pipeline +from deepsparse.evaluation.registry import EvaluationRegistry +from deepsparse.evaluation.results import Dataset, Evaluation, Metric, Result +from deepsparse.evaluation.utils import PERPLEXITY +from deepsparse.transformers.metrics import Perplexity +from deepsparse.transformers.pipelines.text_generation import TextGenerationPipeline +from deepsparse.transformers.pipelines.text_generation.pipeline_no_kv_cache import ( + TextGenerationPipelineNoCache, +) +from deepsparse.transformers.utils.eval_helpers import ( + HumanEvalIteratorWrapper, + process_concatenated_datasets, +) + + +""" +Integration for the evaluation module +that computes the perplexity of a model on a dataset +""" +_LOGGER = logging.getLogger(__name__) + + +@EvaluationRegistry.register(name=PERPLEXITY) +def integration_eval( + pipeline: Pipeline, + datasets: Union[List[str], str] = "openai_humaneval", + batch_size: int = 1, + limit: Optional[int] = None, + accumulate: Optional[bool] = None, + splits: Union[List[str], str, None] = "test", + metrics: Union[List[str], str, None] = None, + **kwargs, +) -> Result: + """ + A function that computes the perplexity of a pipeline given a set + of dataset names. + + :param pipeline: the pipeline to evaluate. The assumed pipeline + is a TextGenerationPipeline, either with or without the KV + cache support + :param datasets: the names of dataset(s) to evaluate on + :param batch_size: the batch size to use for evaluation + :param splits: the split of the dataset to evaluate on. Default is "test" + :param metrics: the metrics to compute. Default is None + :param limit: the number of batches to evaluate on. Default is None + (evaluates on entire dataset) + :param accumulate: whether to perplexity computation should + accumulate negative log-likelihood over samples. Defaults to + the default accumulate variable inferred from the dataset in + `datasets`. If not None, it will override the inferred accumulate + variable. + :return: a Result object containing the raw and formatted results + """ + metrics = metrics or PERPLEXITY + if metrics != PERPLEXITY: + raise ValueError(f"Invalid metric {metrics} for perplexity evaluation") + if splits is None: + splits = "test" + _LOGGER.info("Argument `splits` is None. Defaulting to `test` split.") + datasets = datasets if isinstance(datasets, list) else [datasets] + results_raw = defaultdict(str) + for dataset_name in datasets: + results_raw[dataset_name] = defaultdict() + dataset, _accumulate = load_perplexity_dataset( + dataset_name=dataset_name, splits=splits, pipeline=pipeline, **kwargs + ) + if accumulate is None: + accumulate = _accumulate + else: + _LOGGER.info( + f"Argument `accumulate` set to {accumulate}. " + "Overriding the inferred accumulate variable from the dataset." 
+ ) + + perplexity = run_perplexity( + pipeline=pipeline, + dataset=dataset, + batch_size=batch_size, + accumulate=accumulate, + limit=limit, + ) + + results_raw[dataset_name] = defaultdict() + results_raw[dataset_name]["results"] = perplexity + results_raw[dataset_name]["split"] = splits + + results = Result( + # omit storing raw results. they can potentially + # contain numpy arrays that are not serializable. + # all the information is stored in the formatted results + raw=None, + formatted=format_raw_results(results_raw), + ) + + return results + + +def run_perplexity( + pipeline: Union[TextGenerationPipelineNoCache, TextGenerationPipeline], + dataset: "Dataset", + batch_size: int, + accumulate: bool, + limit: Optional[int] = None, +) -> Dict[str, Any]: + """ + Compute the perplexity of a pipeline given a dataset. + + :param pipeline: the pipeline to evaluate. The assumed pipeline + is a TextGenerationPipeline, either with or without the KV + cache support + :param dataset: the dataset to evaluate on + :param batch_size: the batch size to use for evaluation + :param accumulate: whether to perplexity computation should + accumulate negative log-likelihood over samples + :param limit: the number of batches to evaluate on. Default is None + (evaluates on entire dataset) + + :return: a dictionary containing the perplexity results + """ + + perplexity = Perplexity(accumulate=accumulate) + + batch = [] + for idx, sample in _enumerate_progress( + dataset, max_steps=None if limit is None else limit * batch_size + ): + + if limit is not None: + # stop if we have reached the #limit + # number of batches to be processed + if idx >= limit * batch_size: + break + + batch.append(sample) + + if len(batch) == batch_size: + if isinstance(pipeline, TextGenerationPipelineNoCache): + out = pipeline( + prompt=batch, + output_scores=True, + include_prompt_logits=True, + return_input_tokens=True, + ) + else: + out = pipeline( + prompt=batch, + output_scores=True, + max_new_tokens=0, + include_prompt_logits=True, + return_input_tokens=True, + ) + + for s in range(batch_size): + # Need to remove tokens that were masked + input_ids = out.input_tokens["input_ids"][s].flatten() + attention_mask = out.input_tokens["attention_mask"][s].flatten() + logits = out.generations[s].score + if batch_size > 1 and isinstance( + pipeline, TextGenerationPipelineNoCache + ): + logits = logits[-attention_mask.sum() :, :] + + logits = numpy.compress(attention_mask, logits, axis=0)[:-1, :] + input_ids = numpy.compress(attention_mask, input_ids)[1:] + + # Add predictions (logits) and targets (input_ids) to metric + perplexity.add_batch(logits, input_ids) + + batch.clear() + + return perplexity.compute() + + +def format_raw_results(results: Dict[str, Any]) -> List[Evaluation]: + """ + Format the raw perplexity results into a list of + Evaluation objects. 
+ + :param results: the raw results from perplexity computation + :return: the formatted results as a list of Evaluation objects + """ + formatted_results = [] + for dataset_name, dataset_result in results.items(): + metrics = [] + for metric_name, metric_value in dataset_result["results"].items(): + if isinstance(metric_value, numpy.ndarray): + metric_value = metric_value.tolist() + metric = Metric(name=metric_name, value=metric_value) + metrics.append(metric) + dataset = Dataset(type=None, name=dataset_name, split=dataset_result["split"]) + evaluation = Evaluation( + task="perplexity", + dataset=dataset, + metrics=metrics, + samples=None, + ) + formatted_results.append(evaluation) + return formatted_results + + +def load_perplexity_dataset( + dataset_name: str, + splits: Union[List[str], str] = "test", + pipeline: Optional[Pipeline] = None, + **kwargs, +): + """ + Function to load the dataset for perplexity computation. + Eventually we want to load the dataset from the nm_utils + + :param dataset_name: the name of the dataset to load + :param splits: the splits to load from the dataset. Default is "test" + :param pipeline: the pipeline to use for loading the dataset. The pipeline + is used to infer the model path and sequence length to use for loading + the dataset. This argument can be omitted if the appropriate kwargs + are provided, or if the dataset does not require a process_concatenated_datasets + function to load the dataset. + :param kwargs: additional keyword arguments to pass to the dataset loading function + :return: the dataset and whether to accumulate perplexity over samples + """ + if isinstance(splits, list): + raise NotImplementedError("Evaluation on multiple splits not implemented") + + if dataset_name == "openai_humaneval": + dataset = load_dataset(dataset_name, split=splits) + dataset = HumanEvalIteratorWrapper(dataset) + accumulate = False + elif dataset_name in {"wikitext2", "c4"}: + # fetch max_sequence_length from pipeline if not provided + max_sequence_length = kwargs.pop("max_sequence_length", None) + if max_sequence_length is None and pipeline is not None: + max_sequence_length = pipeline.sequence_length + + # fetch model_path from pipeline if not provided + model_path = kwargs.pop("model_path", None) + if model_path is None and pipeline is not None: + model_path = os.path.dirname(pipeline.model_path) + + dataset = process_concatenated_datasets( + dataset_name, + model_path=model_path, + max_sequence_length=max_sequence_length, + split=splits, + **kwargs, + ) + accumulate = True + else: + raise NotImplementedError(f"Dataset {dataset_name} not implemented") + + return dataset, accumulate + + +def _enumerate_progress(dataset, max_steps): + progress_bar = tqdm(dataset, total=max_steps) if max_steps else tqdm(dataset) + return enumerate(progress_bar) diff --git a/src/deepsparse/evaluation/registry.py b/src/deepsparse/evaluation/registry.py index 2daabb69cc..343cd9786c 100644 --- a/src/deepsparse/evaluation/registry.py +++ b/src/deepsparse/evaluation/registry.py @@ -57,7 +57,7 @@ def resolve( if integration is None: _LOGGER.info( - "No integration specified, inferring the evaluation" + "No integration specified, inferring the evaluation " "function from the input arguments..." 
) integration = resolve_integration(pipeline, datasets) diff --git a/src/deepsparse/evaluation/results.py b/src/deepsparse/evaluation/results.py index 00212d0a1e..78c4bbd501 100644 --- a/src/deepsparse/evaluation/results.py +++ b/src/deepsparse/evaluation/results.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, List, Optional +from typing import Any, List, Optional, Union import yaml from pydantic import BaseModel, Field @@ -32,7 +32,7 @@ class Metric(BaseModel): name: str = Field(description="Name of the metric") - value: float = Field(description="Value of the metric") + value: Union[float, List[float]] = Field(description="Value of the metric") class Dataset(BaseModel): diff --git a/src/deepsparse/evaluation/utils.py b/src/deepsparse/evaluation/utils.py index 87475dd5d2..6e5ade9344 100644 --- a/src/deepsparse/evaluation/utils.py +++ b/src/deepsparse/evaluation/utils.py @@ -11,21 +11,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import ast +import logging import os -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Union from deepsparse import Pipeline +from deepsparse.operators.engine_operator import DEEPSPARSE_ENGINE __all__ = [ "create_pipeline", "get_save_path", - "args_to_dict", + "parse_kwarg_tuples", "resolve_integration", ] +_LOGGER = logging.getLogger(__name__) LM_EVALUATION_HARNESS = "lm-evaluation-harness" +PERPLEXITY = "perplexity" def potentially_check_dependency_import(integration_name: str) -> bool: @@ -38,10 +42,14 @@ def potentially_check_dependency_import(integration_name: str) -> bool: :return: True if the dependency is installed, False otherwise """ - if integration_name.replace("_", "-") == LM_EVALUATION_HARNESS: + if integration_name == LM_EVALUATION_HARNESS: from deepsparse.evaluation.integrations import try_import_lm_evaluation_harness - try_import_lm_evaluation_harness(raise_error=True) + try_import_lm_evaluation_harness() + if integration_name == PERPLEXITY: + from deepsparse.evaluation.integrations.perplexity import ( # noqa F401 + integration_eval, + ) return True @@ -79,24 +87,66 @@ def if_generative_language_model(pipeline: Pipeline) -> bool: return False -def args_to_dict(args: Tuple[Any, ...]) -> Dict[str, Any]: +def parse_kwarg_tuples(kwargs: tuple) -> Dict: """ - Convert a tuple of args to a dict of args. - - :param args: The args to convert. Should be a tuple of alternating - arg names and arg values e.g.('--arg1', 1, 'arg2', 2, -arg3', 3). + Convert a tuple of kwargs to a dict of kwargs. + This function is used to enable the click parsing of kwargs. + + Example use: + ``` + @click.command( + context_settings=dict( + ignore_unknown_options=True) + ) + @click.argument(...) + @click.option(...) + ... + @click.argument("kwargs", nargs=-1, type=click.UNPROCESSED) + def main(..., kwargs): + ... + kwargs: Dict[str, Any] = parse_kwarg_tuples(kwargs: Tuple) + ``` + + Example inputs, outputs: + ``` + input = ('--arg1', 1, 'arg2', 2, '-arg3', 3) + output = parse_kwarg_tuples(input) + output = {'arg1': 1, 'arg2': 2, 'arg3': 3} + ``` + + :param kwargs: The kwargs to convert. Should be a tuple of alternating + kwargs names and kwargs values e.g.('--arg1', 1, 'arg2', 2, -arg3', 3). The names can optionally have a '-' or `--` in front of them. 
- :return: The converted args as a dict. + :return: The converted kwargs as a dict. """ - if len(args) == 0: + if len(kwargs) == 0: return {} + if len(kwargs) % 2 != 0: + raise ValueError( + "kwargs must be a tuple of alternating names and values " + "i.e. the length of kwargs tuple must be even. Received " + f"kwargs: {kwargs}" + ) # names are uneven indices, values are even indices - args_names = args[0::2] - args_values = args[1::2] + kwargs_names = kwargs[0::2] + kwargs_values = kwargs[1::2] + # by default kwargs values are strings, so convert them + # to the appropriate type if possible + kwargs_values = list(kwargs_values) + for i, value in enumerate(kwargs_values): + try: + kwargs_values[i] = ast.literal_eval(value) + except Exception as e: # noqa E841 + _LOGGER.debug( + f"Failed to infer non-string type" + f"from kwarg value: {value}. It will" + f"be left as a string." + ) + # remove any '-' or '--' from the names - args_names = [name.lstrip("-") for name in args_names] + kwargs_names = [name.lstrip("-") for name in kwargs_names] - return dict(zip(args_names, args_values)) + return dict(zip(kwargs_names, kwargs_values)) def get_save_path( @@ -143,6 +193,7 @@ def create_pipeline( :param engine_type: The engine type to initialize the model with. :return: The initialized pipeline """ + engine_type = engine_type or DEEPSPARSE_ENGINE return Pipeline.create( task=kwargs.pop("task", "text-generation"), model_path=model_path, diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index b90c4dd744..0e7c24c8b6 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -20,7 +20,7 @@ import numpy -from deepsparse.utils import numpy_log_softmax +from deepsparse.utils.data import numpy_log_softmax __all__ = [ diff --git a/src/deepsparse/transformers/utils/eval_helpers.py b/src/deepsparse/transformers/utils/eval_helpers.py index 4c0e68b9de..012520b9b5 100644 --- a/src/deepsparse/transformers/utils/eval_helpers.py +++ b/src/deepsparse/transformers/utils/eval_helpers.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Mapping, Union +from typing import List, Union import numpy from transformers import AutoTokenizer, PreTrainedTokenizerFast @@ -27,7 +27,8 @@ def process_concatenated_datasets( dataset_name: str, model_path: str, max_sequence_length: int, - kwargs: Mapping, + split: str = "test", + **kwargs, ) -> list: """ Concatenate text datasets and split them into chunks text that, after @@ -38,6 +39,8 @@ def process_concatenated_datasets( Options: "wikitext2" or "c4". model_path (str): The path to a pretrained transformer model for tokenization. max_sequence_length (int): The maximum number of tokens in each sequence. + split (str, optional): The split of the dataset to use. + Default is "test". kwargs (mapping): Additional keyword arguments. - eos (str, optional): The end-of-sentence token. Default is "\n\n" for wikitext2 and "" for c4. 
@@ -65,13 +68,13 @@ def process_concatenated_datasets( eos = kwargs.get("eos", "\n\n") bos = kwargs.get("bos", "") - raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split=split) raw_text = raw_dataset["text"] elif dataset_name == "c4": eos = kwargs.get("eos", "<|endoftext|>") bos = kwargs.get("bos", "") raw_samples = kwargs.get("raw_samples", None) - data_file = kwargs.get("data_file", 0) + data_file = kwargs.get("data_file", None) if data_file is not None: raw_dataset = load_dataset( "allenai/c4", @@ -79,13 +82,13 @@ def process_concatenated_datasets( data_files={ "validation": f"en/c4-validation.{data_file:05d}-of-00008.json.gz" }, - split="validation", + split=split, ) else: raw_dataset = load_dataset( "allenai/c4", "allenai--c4", - split="validation", + split=split, ) if raw_samples is not None: raw_dataset = raw_dataset[:raw_samples] @@ -181,3 +184,22 @@ def _split_text_by_tokens( ) return split_text + + +class HumanEvalIteratorWrapper: + """ + Wrapper around the `openai_humaneval` dataset, + that joins the prompt and the canonical solution + into a single string during iteration. + """ + + def __init__(self, dataset): + self.iterator = iter(dataset) + + def __iter__(self): + return self + + def __next__(self): + # Get the next sample from the original iterator + sample = next(self.iterator) + return sample["prompt"] + sample["canonical_solution"] diff --git a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py index 3b9016294f..8d8b343dd5 100644 --- a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py +++ b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py @@ -12,64 +12,118 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from transformers import AutoModelForCausalLM - import pytest from deepsparse.evaluation.integrations import try_import_lm_evaluation_harness from deepsparse.evaluation.utils import create_pipeline -@pytest.mark.parametrize( - "pipeline, model_torch", - [ - ( - create_pipeline( - "hf:mgoin/TinyStories-1M-deepsparse", engine_type="onnxruntime" - ), - AutoModelForCausalLM.from_pretrained("roneneldan/TinyStories-1M"), - ) - ], -) -@pytest.mark.parametrize( - "datasets", - [ - ["hellaswag"], - ["hellaswag", "gsm8k"], - "gsm8k", - "arc_challenge", - ], -) @pytest.mark.parametrize( "batch_size", [1, 3], ) -class TestLMEvaluationHarness: - @pytest.mark.skipif( - not try_import_lm_evaluation_harness(raise_error=False), - reason="lm_evaluation_harness not installed", - ) - def test_integration_eval_onnx_matches_torch( - self, pipeline, model_torch, datasets, batch_size - ): +@pytest.mark.skipif( + not try_import_lm_evaluation_harness(raise_error=False), + reason="lm_evaluation_harness not installed", +) +class TestLMEval: + @pytest.fixture() + def integration_eval(self): from deepsparse.evaluation.integrations.lm_evaluation_harness import ( - integration_eval, + integration_eval as eval_fn, ) - out_torch = integration_eval( - model=model_torch, + return eval_fn + + @pytest.mark.parametrize( + "datasets", + [ + "hellaswag", + ["arc_challenge"], + ["hellaswag", "arc_challenge"], + ], + ) + def test_likelihood_scenario(self, batch_size, datasets, integration_eval): + + model_path_ds = "hf:mgoin/TinyStories-1M-ds" + model_path_hf = "roneneldan/TinyStories-1M" + limit = 2 + + out_onnx = integration_eval( + create_pipeline( + model_path_ds, + engine_type="onnxruntime", + ), datasets=datasets, batch_size=batch_size, - limit=5, - no_cache=True, # avoid saving files when running tests + limit=limit, + use_cache=None, # avoid saving files when running tests + ) + + from lm_eval import evaluator, tasks, utils + + datasets_ = (",").join(datasets) if isinstance(datasets, list) else datasets + out_torch = evaluator.simple_evaluate( + model="hf", + model_args=f"pretrained={model_path_hf}", + tasks=utils.pattern_match(datasets_.split(","), tasks.ALL_TASKS), + batch_size=batch_size, + limit=limit, + use_cache=None, # avoid saving files when running tests ) + self._test_same(out_onnx.raw, out_torch, datasets) + + @pytest.mark.parametrize( + "datasets", + [ + "gsm8k", + ], + ) + def test_greedy_until_scenario(self, batch_size, datasets, integration_eval): + model_path_ds = "hf:mgoin/TinyLlama-1.1B-step-50K-105b-ONNX" + model_path_hf = "TinyLlama/TinyLlama-1.1B-step-50K-105b" + limit = 2 + # compute until 16 new tokens + # so that tests are faster + gen_kwargs = "max_gen_toks=16" + out_onnx = integration_eval( - model=pipeline, + create_pipeline(model_path_ds, engine_type="onnxruntime"), datasets=datasets, batch_size=batch_size, - limit=5, - no_cache=True, # avoid saving files when running tests + limit=limit, + gen_kwargs=gen_kwargs, + use_cache=None, # avoid saving files when running tests + ) + + from lm_eval import evaluator, tasks, utils + + datasets_ = (",").join(datasets) if isinstance(datasets, list) else datasets + out_torch = evaluator.simple_evaluate( + model="hf", + model_args=f"pretrained={model_path_hf}", + tasks=utils.pattern_match(datasets_.split(","), tasks.ALL_TASKS), + batch_size=batch_size, + limit=limit, + gen_kwargs=gen_kwargs, + use_cache=None, # avoid saving files when running tests ) - out_onnx = out_onnx.raw["output"] - out_torch = out_torch.raw["output"] + self._test_same(out_onnx.raw, 
out_torch, datasets) - assert out_onnx["results"] == out_torch["results"] + @staticmethod + def _test_same(out_onnx, out_torch, datasets, greedy=False): + datasets = datasets if isinstance(datasets, list) else [datasets] + for dataset in datasets: + torch_samples = out_torch["samples"][dataset] + onnx_samples = out_onnx["samples"][dataset] + for torch_sample, onnx_sample in zip(torch_samples, onnx_samples): + if greedy: + # for datasets that validate greedy generation + # make sure that generated sequences are the same + assert torch_sample["resps"] == onnx_sample["resps"] + else: + # for datasets that validate likelihood + # make sure that likelihoods are the same + assert ( + pytest.approx(torch_sample["resps"][0][0], 0.0001) + == onnx_sample["resps"][0][0] + ) diff --git a/tests/deepsparse/evaluation/integrations/test_perplexity.py b/tests/deepsparse/evaluation/integrations/test_perplexity.py new file mode 100644 index 0000000000..b156e5b9a4 --- /dev/null +++ b/tests/deepsparse/evaluation/integrations/test_perplexity.py @@ -0,0 +1,132 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from copy import copy + +import numpy as np + +import pytest +from deepsparse.evaluation.integrations.perplexity import ( + integration_eval, + load_perplexity_dataset, +) +from deepsparse.transformers.pipelines.text_generation import TextGenerationPipeline +from evaluate import load + + +@pytest.fixture() +def model_path(): + return "hf:mgoin/TinyStories-1M-deepsparse" + + +@pytest.fixture() +def model_id(): + return "roneneldan/TinyStories-1M" + + +@pytest.mark.parametrize( + "datasets", + [ + "openai_humaneval", + "wikitext2", + ], +) +@pytest.mark.parametrize("batch_size", [1, 2]) +class TestPerplexity: + limit = 2 + + def test_perplexity_ground_truth_equal_pipeline( + self, model_path, model_id, datasets, batch_size + ): + # setting max_sequence_length to 16 to speed up the test + kwargs_ground_truth = ( + dict(max_sequence_length=16) if datasets in {"c4", "wikitext2"} else {} + ) + kwargs = copy(kwargs_ground_truth) + + result_gt = self._get_ground_truth( + datasets=datasets, + batch_size=batch_size, + limit=self.limit, + model_id=model_id, + kwargs=kwargs_ground_truth, + ) + + result = integration_eval( + pipeline=TextGenerationPipeline( + model_path="hf:mgoin/TinyStories-1M-deepsparse", + engine_type="onnxruntime", + ), + datasets=datasets, + batch_size=batch_size, + limit=self.limit, + # we are setting accumulate=False to compare + # with the torch ground truth apples to apples + accumulate=False, + **kwargs, + ) + perplexities = result.formatted[0].metrics[0].value + perplexities_gt = result_gt["perplexities"] + assert np.allclose(perplexities, perplexities_gt, rtol=0.1) + + def test_perplexity_kv_cache_pipeline_equal_no_kv_cache_pipeline( + self, model_path, model_id, datasets, batch_size + ): + + kwargs_ground_truth = ( + dict(max_sequence_length=16) if datasets in {"c4", "wikitext2"} else {} + ) + kwargs = 
copy(kwargs_ground_truth) + + result_kv_cache = integration_eval( + pipeline=TextGenerationPipeline( + model_path="hf:mgoin/TinyStories-1M-deepsparse", + engine_type="onnxruntime", + ), + datasets=datasets, + model_path=model_id, + batch_size=batch_size, + limit=self.limit, + **kwargs, + ) + + result_non_kv_cache = integration_eval( + pipeline=TextGenerationPipeline( + model_path="hf:mgoin/TinyStories-1M-deepsparse", + engine_type="onnxruntime", + onnx_model_name="model-orig.onnx", + ), + datasets=datasets, + batch_size=batch_size, + limit=self.limit, + **kwargs, + ) + + perplexities_kv_cache = result_kv_cache.formatted[0].metrics[0].value + perplexities_non_kv_cache = result_non_kv_cache.formatted[0].metrics[0].value + np.allclose(perplexities_kv_cache, perplexities_non_kv_cache, rtol=0.1) + + @staticmethod + def _get_ground_truth(datasets, batch_size, limit, model_id, kwargs={}): + perplexity = load("perplexity", module_type="metric") + kwargs["model_path"] = model_id + dataset, *_ = load_perplexity_dataset(dataset_name=datasets, **kwargs) + predictions = [] + for i, sample in enumerate(dataset): + if i == batch_size * limit: + break + predictions.append(sample) + return perplexity.compute( + predictions=predictions, add_start_token=False, model_id=model_id + ) diff --git a/tests/deepsparse/evaluation/test_evaluator.py b/tests/deepsparse/evaluation/test_evaluator.py index 816ad075e0..58eedff836 100644 --- a/tests/deepsparse/evaluation/test_evaluator.py +++ b/tests/deepsparse/evaluation/test_evaluator.py @@ -115,19 +115,25 @@ def test_evaluate_pipeline_without_kv_cache( not try_import_lm_evaluation_harness(raise_error=False), reason="lm_evaluation_harness not installed", ) -def test_evaluation_llm_evaluation_harness_integration_name( +def test_evaluation_llm_evaluation_harness( model_path, - datasets, ): assert evaluate( model=model_path, - datasets=datasets, - limit=2, - no_cache=True, + # testing only on hellaswag dataset + # to avoid long running time + datasets="hellaswag", + limit=1, integration="lm_evaluation_harness", ) +def test_evaluation_perplexity(model_path): + assert evaluate( + model=model_path, datasets="openai_humaneval", limit=1, integration="perplexity" + ) + + @pytest.mark.parametrize("type_serialization", ["json", "yaml"]) @pytest.mark.skipif( tuple(map(int, sys.version.split(".")[:2])) < (3, 10), @@ -144,7 +150,6 @@ def test_cli( runner.invoke( main, [ - "--model_path", model_path, "--dataset", datasets[0], From 20f90ac5c13ea929318a101af5e05ed5ddb4241c Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Mon, 12 Feb 2024 15:50:14 -0500 Subject: [PATCH 15/16] examples/benchmark fix for resnet50 example (#1597) --- examples/benchmark/resnet50_benchmark.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/benchmark/resnet50_benchmark.py b/examples/benchmark/resnet50_benchmark.py index ed06ddea74..49832e5585 100644 --- a/examples/benchmark/resnet50_benchmark.py +++ b/examples/benchmark/resnet50_benchmark.py @@ -47,7 +47,8 @@ import numpy -from deepsparse import benchmark_model, cpu +from deepsparse import cpu +from deepsparse.engine import benchmark_model CORES_PER_SOCKET, AVX_TYPE, VNNI = cpu.cpu_details() From c4a7b68ffe33d53b7226a7442938b5bba56d9290 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 13 Feb 2024 09:06:53 -0500 Subject: [PATCH 16/16] fix if/else conditions with new ux flow (#1599) --- src/deepsparse/server/cli.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/deepsparse/server/cli.py 
b/src/deepsparse/server/cli.py index 6d3952c5f5..d402f616fd 100644 --- a/src/deepsparse/server/cli.py +++ b/src/deepsparse/server/cli.py @@ -225,14 +225,11 @@ def main( if ctx.invoked_subcommand is not None: return - if task is None and config_file is None: - raise ValueError("Must specify either --task or --config_file. Found neither") - if config_file is not None: server = _fetch_server(integration=integration, config=config_file) server.start_server(host, port, log_level, hot_reload_config=hot_reload_config) - if task is not None: + elif task is not None: cfg = ServerConfig( num_cores=num_cores, num_workers=num_workers, @@ -258,6 +255,8 @@ def main( server.start_server( host, port, log_level, hot_reload_config=hot_reload_config ) + else: + raise ValueError("Must specify either --task or --config_file. Found neither") def _fetch_server(integration: str, config: Union[ServerConfig, str]):
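
For reference, below is a minimal usage sketch of the evaluation entrypoints reworked in patch 14 of this series (the console script is renamed from `deepsparse.eval` to `deepsparse.evaluate`, MODEL_PATH becomes a positional argument, `args_to_dict` is replaced by `parse_kwarg_tuples`, and a `perplexity` integration is registered alongside the updated lm-evaluation-harness one). The sketch mirrors the calls exercised in `tests/deepsparse/evaluation/test_evaluator.py`; the TinyStories model stub and the specific kwarg values are illustrative assumptions, not artifacts pinned by these patches.

```python
# Sketch only: exercises the updated evaluation API the way the new tests do.
# The model stub below is the one used by the perplexity tests in this series;
# any text-generation model accepted by deepsparse.Pipeline should behave the same.
from deepsparse.evaluation.evaluator import evaluate
from deepsparse.evaluation.utils import parse_kwarg_tuples

# CLI kwargs arrive from click as a flat tuple of alternating names and values;
# parse_kwarg_tuples strips leading dashes and literal-evals values when possible.
extra_args = parse_kwarg_tuples(("--limit", "1", "max_gen_toks", "16"))
assert extra_args == {"limit": 1, "max_gen_toks": 16}

# Run the newly registered perplexity integration on a small model,
# mirroring test_evaluation_perplexity.
result = evaluate(
    model="hf:mgoin/TinyStories-1M-deepsparse",  # stub from the new tests, assumed here
    datasets="openai_humaneval",
    integration="perplexity",
    limit=1,  # evaluate a single batch to keep the smoke test fast
)
print(result.formatted)  # list of Evaluation objects (see evaluation/results.py)
```

From the shell, the console script after this series is `deepsparse.evaluate` rather than `deepsparse.eval`, with the model path passed positionally now that the `--model_path` option is removed, e.g. `deepsparse.evaluate <MODEL_PATH> -d openai_humaneval`; the remaining options are documented in the module docstring of `src/deepsparse/evaluation/cli.py`.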