From f2a06c5312eb6ec35b118b557df8c3297e09586d Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Thu, 14 Mar 2024 17:51:28 -0400
Subject: [PATCH 1/3] [TextGeneration] Fix llama tokenizer (#1635)

* add llama tokenizer fix

* fix generated string

* only run for streaming

* add TODO

---------

Co-authored-by: Dipika Sikka
---
 .../text_generation/prep_for_generation.py |  1 +
 .../text_generation/process_outputs.py     | 50 +++++++++++++++++--
 2 files changed, 47 insertions(+), 4 deletions(-)

diff --git a/src/deepsparse/transformers/pipelines/text_generation/prep_for_generation.py b/src/deepsparse/transformers/pipelines/text_generation/prep_for_generation.py
index 3318ec88c5..66b0c2a79b 100644
--- a/src/deepsparse/transformers/pipelines/text_generation/prep_for_generation.py
+++ b/src/deepsparse/transformers/pipelines/text_generation/prep_for_generation.py
@@ -101,6 +101,7 @@ def run(
             else [],
             "finished_reason": [],
             "token_generator": token_generator,
+            "past_tokens_queue": copy.copy(tokens),
         }
 
         if kv_cache is None:
diff --git a/src/deepsparse/transformers/pipelines/text_generation/process_outputs.py b/src/deepsparse/transformers/pipelines/text_generation/process_outputs.py
index 6033e10ea4..cae7e24599 100644
--- a/src/deepsparse/transformers/pipelines/text_generation/process_outputs.py
+++ b/src/deepsparse/transformers/pipelines/text_generation/process_outputs.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import datetime
-from typing import Optional
+from typing import List, Optional
 
 import numpy
 
@@ -54,6 +54,33 @@ def _create_generated_text_output(
             finished=False,
         )
 
+    def _generate_streamed_text_from_past_tokens(
+        self, generated_tokens: numpy.ndarray, past_tokens_queue: List[int]
+    ) -> List[str]:
+        """
+        An auxiliary method that helps to properly generate the streamed text.
+        Some models, such as llama2 and mistral, use LlamaTokenizer, which is
+        based on the SentencePiece tokenizer. That tokenizer does not emit the
+        appropriate prefix spaces when decoding token by token. It works
+        correctly if the previously generated tokens are included, which lets
+        the tokenizer infer the appropriate spaces from the last n consecutive
+        tokens.
+
+        :param generated_tokens: the generated tokens from the engine
+        :param past_tokens_queue: the queue of last n tokens (n is the
+            original prompt length in tokens)
+        :return: a single-element list holding the newly generated text
+        """
+        string_from_n_tokens = self.tokenizer.decode(
+            past_tokens_queue, skip_special_tokens=True
+        )
+        past_tokens_queue.append(generated_tokens[0])
+        string_from_n_plus_1_tokens = self.tokenizer.decode(
+            past_tokens_queue, skip_special_tokens=True
+        )
+        past_tokens_queue.pop(0)
+        return [string_from_n_plus_1_tokens[len(string_from_n_tokens) :]]
+
     def run(
         self,
         generated_tokens: numpy.ndarray,
@@ -64,9 +91,24 @@ def run(
     ):
         generation_config = inference_state.current_state.get("generation_config")
         generated_logits = generated_logits if generation_config.output_scores else None
-        sequences = self.tokenizer.batch_decode(
-            generated_tokens, skip_special_tokens=True
-        )
+
+        import transformers
+
+        # Fix for LLAMA-specific models when running streaming
+        # TODO: make streaming a conditional input to this operator; using inference
+        # state is a quick fix.
+        if isinstance(
+            self.tokenizer,
+            (transformers.LlamaTokenizer, transformers.LlamaTokenizerFast),
+        ) and inference_state.current_state.get("streaming"):
+            past_tokens_queue = inference_state.current_state.get("past_tokens_queue")
+            sequences = self._generate_streamed_text_from_past_tokens(
+                generated_tokens, past_tokens_queue
+            )
+        else:
+            sequences = self.tokenizer.batch_decode(
+                generated_tokens, skip_special_tokens=True
+            )
 
         try:
             finished_reason = [f[-1] for f in finished_reason]
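
A minimal standalone sketch of the decoding trick the new helper in PATCH 1/3 relies on. It assumes a SentencePiece-based Llama tokenizer loaded through Hugging Face `transformers`; the checkpoint id and the helper name `stream_decode` are placeholders for illustration, not part of the patch:

```python
# Sketch of the idea behind _generate_streamed_text_from_past_tokens,
# not the pipeline code itself.
from typing import List

from transformers import AutoTokenizer

# placeholder checkpoint id; any SentencePiece/Llama-style tokenizer behaves the same way
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")


def stream_decode(new_token: int, past_tokens_queue: List[int]) -> str:
    # decode the window without and then with the new token and take the string diff;
    # the surrounding tokens let the tokenizer re-insert the prefix space that a
    # bare single-token decode would drop
    before = tokenizer.decode(past_tokens_queue, skip_special_tokens=True)
    past_tokens_queue.append(new_token)
    after = tokenizer.decode(past_tokens_queue, skip_special_tokens=True)
    past_tokens_queue.pop(0)  # keep the window at the original prompt length
    return after[len(before):]
```

The pipeline version returns this delta wrapped in a single-element list so it lines up with the `batch_decode` output used by the non-streaming branch.
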
From 34059c768dc31c88738c6c34564f8d677abe35cf Mon Sep 17 00:00:00 2001
From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com>
Date: Tue, 5 Mar 2024 15:00:17 +0100
Subject: [PATCH 2/3] Retire `flaky` in favour of `pytest-rerunfailures` (#1628)

---
 setup.py                                    | 2 +-
 tests/deepsparse/pipelines/test_pipeline.py | 3 +--
 tests/server/test_legacy_loggers.py         | 6 +++---
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/setup.py b/setup.py
index eb5d835984..a1152d5f16 100644
--- a/setup.py
+++ b/setup.py
@@ -99,7 +99,7 @@ def _parse_requirements_file(file_path):
         "black==22.12.0",
         "flake8>=3.8.3",
         "isort>=5.7.0",
-        "flaky~=3.7.0",
+        "pytest-rerunfailures>=13.0",
         "ndjson>=0.3.1",
         "wheel>=0.36.2",
         "pytest>=6.0.0",
diff --git a/tests/deepsparse/pipelines/test_pipeline.py b/tests/deepsparse/pipelines/test_pipeline.py
index 6ad1c71fe4..3406d13815 100644
--- a/tests/deepsparse/pipelines/test_pipeline.py
+++ b/tests/deepsparse/pipelines/test_pipeline.py
@@ -16,7 +16,6 @@
 from concurrent.futures import ThreadPoolExecutor
 from unittest import mock
 
-import flaky
 import pytest
 
 from deepsparse.legacy.base_pipeline import BasePipeline
@@ -125,7 +124,7 @@ def test_pipeline_executor_num_workers():
     assert executor._max_workers >= 1
 
 
-@flaky.flaky(max_runs=2, min_passes=1)
+@pytest.mark.flaky(reruns=2, min_passes=1)
 @mock_engine(rng_seed=0)
 def test_pipeline_call_is_async(engine_mock):
     # attempts to verify that pipeline calls to engine are async
diff --git a/tests/server/test_legacy_loggers.py b/tests/server/test_legacy_loggers.py
index e52e6fc4d9..ce3a9b9aec 100644
--- a/tests/server/test_legacy_loggers.py
+++ b/tests/server/test_legacy_loggers.py
@@ -16,6 +16,7 @@
 from collections import Counter
 from unittest import mock
 
+import pytest
 from deepsparse.legacy.loggers import PythonLogger
 from deepsparse.legacy.loggers.config import (
     PipelineSystemLoggingConfig,
@@ -30,7 +31,6 @@
 from deepsparse.server.deepsparse_server import DeepsparseServer
 from deepsparse.server.helpers import server_logger_from_config
 from fastapi.testclient import TestClient
-from flaky import flaky
 from tests.deepsparse.legacy.loggers.helpers import fetch_leaf_logger
 from tests.helpers import find_free_port
 from tests.test_data.server_test_data import SAMPLE_LOGS_DICT
@@ -106,7 +106,7 @@ def test_data_logging_from_predefined():
     assert log == expected_log
 
 
-@flaky(max_runs=4, min_passes=3)
+@pytest.mark.flaky(reruns=4, min_passes=3)
 def test_logging_only_system_info():
     server_config = ServerConfig(
         endpoints=[EndpointConfig(task=task, name=name, model=stub)],
@@ -195,7 +195,7 @@ def test_multiple_targets_logging():
     )
 
 
-@flaky(max_runs=3, min_passes=2)
+@pytest.mark.flaky(reruns=3, min_passes=2)
 def test_function_metric_with_target_loggers():
     server_config = ServerConfig(
         endpoints=[
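
A before/after sketch of the decorator swap in PATCH 2/3. The test body is a stand-in rather than code from the repository, and only `reruns` and `reruns_delay` are shown, since those are the `pytest-rerunfailures` marker arguments being relied on here:

```python
# Stand-in test illustrating the marker change; the assertion is a placeholder.
import random

import pytest


# previously: @flaky.flaky(max_runs=2, min_passes=1) from the `flaky` package
@pytest.mark.flaky(reruns=2, reruns_delay=1)
def test_sometimes_flaky():
    # pytest-rerunfailures reruns a failing test up to `reruns` times,
    # sleeping `reruns_delay` seconds between attempts
    assert random.random() < 0.9
```
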
From e1280bc8adad8a8f47dec3ce1abf77ebe5f81818 Mon Sep 17 00:00:00 2001
From: dhuang
Date: Fri, 15 Mar 2024 11:00:13 -0400
Subject: [PATCH 3/3] pick up another fix and bump up version to 1.7.1

---
 src/deepsparse/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/deepsparse/version.py b/src/deepsparse/version.py
index 848f460af3..0554b5183f 100644
--- a/src/deepsparse/version.py
+++ b/src/deepsparse/version.py
@@ -39,7 +39,7 @@
     from deepsparse.generated_version import is_enterprise, is_release, splash, version
 except Exception:
     # otherwise, fall back to version info in this file
-    version = "1.7.0"
+    version = "1.7.1"
     is_release = False
     is_enterprise = False
     splash = (
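
PATCH 3/3 only bumps the fallback version string; for context, a stripped-down sketch of the fallback pattern the hunk sits in (an approximation of `version.py`, not the full file):

```python
# Approximate shape of the version fallback touched by PATCH 3/3; the generated
# module exists only in built artifacts, so a plain source checkout takes the
# except branch and uses the hard-coded defaults.
try:
    from deepsparse.generated_version import is_enterprise, is_release, splash, version
except Exception:
    # otherwise, fall back to version info in this file
    version = "1.7.1"
    is_release = False
    is_enterprise = False
    splash = ""  # the real file builds a multi-line banner here; omitted in this sketch
```
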