neuralmagic · dbogunowicz · Oct 19, 2023 · Oct 17, 2023 · Oct 18, 2023 · Oct 18, 2023
diff --git a/src/deepsparse/transformers/engines/nl_decoder_engine.py b/src/deepsparse/transformers/engines/nl_decoder_engine.py
@@ -154,18 +154,29 @@ def run(
 
         :return: The output of the engine
         """
-        if bool(kv_cache.engine_internal_cache):
-            # conventionally, before dispatching
-            # inputs to the engine, we validate them
-            # if val_inp=True. However, in this case
-            # we want to pass the empty kv cache inputs
-            # (batch_size=0) to the engine. Therefore,
-            # we skip the validation
-            return self.engine._eng_net.execute_list_out(
-                inputs, kv_cache.engine_internal_cache
-            )
-        # run the engine without the LIB.kv_cache object
-        return self.engine.run(inputs, val_inp)
+        if kv_cache is not None:
+            # run the engine assuming kv cache support
+            if bool(kv_cache.engine_internal_cache):
+                # run the engine assuming internal kv cache
+                # management. In this case the LIB.kv_cache
+                # class object is passed to the engine call
+                # as well.
+                # Conventionally, before dispatching
+                # inputs to the engine, we validate them
+                # if val_inp=True. However, in this case
+                # we want to pass the empty kv cache inputs
+                # (batch_size=0) to the engine. Therefore,
+                # we skip the validation.
+                return self.engine._eng_net.execute_list_out(
+                    inputs, kv_cache.engine_internal_cache
+                )
+            else:
+                # run the engine assuming external kv cache
+                # management.
+                return self.engine.run(inputs, val_inp, kv_cache)
+        else:
+            # run the engine without the kv cache support
+            return self.engine.run(inputs, val_inp)
 
     def __call__(
         self,

diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py
@@ -683,7 +683,10 @@ def engine_forward(
                 )
                 for prompt_logit in prompt_logits:
                     token_generator.generate(prompt_logit)
-                return numpy.array([self.tokens]), prompt_logits
+                yield numpy.array([token_generator.tokens]), prompt_logits, [
+                    FinishReason.LENGTH
+                ]
+                return
 
             else:
                 # run the prompt through