Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix the logic for perplexity evaluation (Not enough kv_cache capacity to run generation. Please use a larger sequence_length or a shorter prompt) #1633

Closed
wants to merge 8 commits
7 changes: 4 additions & 3 deletions src/deepsparse/evaluation/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,10 @@ def evaluate(

# if target is a string, turn it into an appropriate pipeline
# otherwise assume it is a pipeline
if isinstance(model, (Path, str)):
    # create_pipeline consumes the pipeline-construction kwargs
    # (task, sequence_length, batch_size, ...) and returns the remaining
    # ones, so downstream evaluation code does not receive arguments that
    # were meant only for pipeline creation
    pipeline, kwargs = create_pipeline(model, engine_type, **kwargs)
else:
    pipeline = model

eval_integration = EvaluationRegistry.resolve(pipeline, datasets, integration)

Expand Down
12 changes: 11 additions & 1 deletion src/deepsparse/evaluation/integrations/perplexity.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
HumanEvalIteratorWrapper,
process_concatenated_datasets,
)
from deepsparse.transformers.utils.helpers import prepends_bos_token


"""
Expand Down Expand Up @@ -165,6 +166,7 @@ def run_perplexity(
return_input_tokens=True,
)
else:
print(len(pipeline.tokenizer(batch[0]).input_ids))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

debug?

out = pipeline(
prompt=batch,
output_scores=True,
Expand Down Expand Up @@ -252,7 +254,15 @@ def load_perplexity_dataset(
# fetch max_sequence_length from pipeline if not provided
max_sequence_length = kwargs.pop("max_sequence_length", None)
if max_sequence_length is None and pipeline is not None:
    # max_sequence_length for the dataset concatenation needs to be
    # smaller than the kv_cache capacity, i.e.
    # (pipeline.sequence_length - pipeline.prompt_sequence_length)
    max_sequence_length = (
        pipeline.sequence_length - pipeline.prompt_sequence_length - 1
    )
    # account for a potential additional BOS token prepended by the
    # tokenizer (prepends_bos_token returns a truthy value used as 0/1)
    # NOTE(review): removed leftover debug breakpoint() flagged in review
    max_sequence_length -= prepends_bos_token(pipeline.tokenizer)

# fetch model_path from pipeline if not provided
model_path = kwargs.pop("model_path", None)
Expand Down
19 changes: 11 additions & 8 deletions src/deepsparse/evaluation/utils.py
Original file line number Diff line number Diff line change
def create_pipeline(model_path, engine_type=None, **kwargs):
    """
    Create a text-generation pipeline for evaluation.

    :param model_path: The target path to initialize the
        text generation model from. This can be a local
        or remote path to the model or a sparsezoo stub
    :param engine_type: The engine type to initialize the model with;
        defaults to DEEPSPARSE_ENGINE when not provided
    :return: The initialized pipeline and the mutated
        (potentially reduced number of) kwargs
    """
    # NOTE(review): signature reconstructed from the diff context and call
    # sites — confirm parameter names/defaults against the full file
    engine_type = engine_type or DEEPSPARSE_ENGINE
    pipeline = Pipeline.create(
        task=kwargs.pop("task", "text-generation"),
        model_path=model_path,
        sequence_length=kwargs.pop("sequence_length", 2048),
        engine_type=engine_type,
        batch_size=kwargs.pop("batch_size", 1),
    )
    # the pops above consumed the pipeline-construction kwargs; return the
    # remaining ones so the caller can forward them onward
    return pipeline, kwargs
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,10 @@ def sequence_length(self) -> int:
"""
return self.ops["single_engine"].sequence_length

@property
def prompt_sequence_length(self) -> int:
    # NOTE(review): mirrors the multi-token engine's input_ids_length —
    # presumably the number of prompt tokens processed per prefill step;
    # confirm against the engine operator definition
    return self.ops["multi_engine"].input_ids_length

@property
def batch_size(self) -> int:
    """Batch size of the underlying single-token engine."""
    return self.ops["single_engine"].batch_size
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -135,3 +135,7 @@ def batch_size(self) -> int:
@property
def engine_type(self) -> str:
    """Engine type string of the underlying engine operator."""
    return self.ops["engine_operator"]._engine_type

@property
def prompt_sequence_length(self) -> int:
    # NOTE(review): constant 1 — presumably this pipeline processes prompts
    # one token at a time (no multi-token prefill engine); confirm
    return 1
4 changes: 2 additions & 2 deletions tests/deepsparse/evaluation/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,10 @@ def pipeline_target():


def test_initialize_model_from_target_pipeline_onnx(pipeline_target):
    # create_pipeline now returns (pipeline, remaining_kwargs);
    # unpack and discard the leftover kwargs
    model, _ = create_pipeline(pipeline_target, "onnxruntime")
    assert model.ops.get("single_engine")._engine_type == "onnxruntime"


def test_initialize_model_from_target_pipeline_with_kwargs(pipeline_target):
    # create_pipeline now returns (pipeline, remaining_kwargs);
    # unpack and discard the leftover kwargs
    model, _ = create_pipeline(pipeline_target, "deepsparse", sequence_length=64)
    assert model.ops.get("process_input").sequence_length == 64
Loading