From e29211c95b75ff19e0bf0c65a26b6044ba51bf80 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Mon, 29 Jan 2024 16:19:06 -0500 Subject: [PATCH 01/16] skip continuous batching test (#1567) --- .../deepsparse/schedulers/test_continuous_batching_scheduler.py | 2 ++ .../schedulers/utils/test_continuous_batching_executor.py | 2 ++ .../transformers/text_generation/integration_tests/test_llms.py | 1 + 3 files changed, 5 insertions(+) diff --git a/tests/deepsparse/schedulers/test_continuous_batching_scheduler.py b/tests/deepsparse/schedulers/test_continuous_batching_scheduler.py index 627202773c..4502a8fcf9 100644 --- a/tests/deepsparse/schedulers/test_continuous_batching_scheduler.py +++ b/tests/deepsparse/schedulers/test_continuous_batching_scheduler.py @@ -16,10 +16,12 @@ import numpy +import pytest from deepsparse.operators import EngineOperator from deepsparse.schedulers import ContinuousBatchingScheduler +@pytest.mark.skip("skip continuous batching tests") def test_continuous_batching_executor_thread(): # simple test that ContinuousBatchingScheduler can be instantiated and return # a result from a request, for testing multi-batch execution, making enough diff --git a/tests/deepsparse/schedulers/utils/test_continuous_batching_executor.py b/tests/deepsparse/schedulers/utils/test_continuous_batching_executor.py index e26532d088..fa41259f21 100644 --- a/tests/deepsparse/schedulers/utils/test_continuous_batching_executor.py +++ b/tests/deepsparse/schedulers/utils/test_continuous_batching_executor.py @@ -16,6 +16,7 @@ import numpy +import pytest from deepsparse.operators import EngineOperator from deepsparse.schedulers.utils import ( ContinuousBatchingExecutorThread, @@ -23,6 +24,7 @@ ) +@pytest.mark.skip("skip continuous batching tests") def test_continuous_batching_executor_thread(): # mobilenet model with batch_size=2 engine_operator = EngineOperator("zoo:mobilenet_v2-1.0-imagenet-base") diff --git a/tests/deepsparse/transformers/text_generation/integration_tests/test_llms.py b/tests/deepsparse/transformers/text_generation/integration_tests/test_llms.py index 633ee19c53..e03d97e7f3 100644 --- a/tests/deepsparse/transformers/text_generation/integration_tests/test_llms.py +++ b/tests/deepsparse/transformers/text_generation/integration_tests/test_llms.py @@ -132,6 +132,7 @@ def setup(self, params_dict, max_new_tokens, internal_kv_cache): self.default_pipeline = None self.max_new_tokens = max_new_tokens + @pytest.mark.skip("skip continuous batching tests") def test_continuous_batching_pipeline(self, setup): pipeline = self.get_pipeline( From 46ce7474ab77b15f75457fd3663361786168cc5f Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 31 Jan 2024 12:49:32 -0500 Subject: [PATCH 02/16] [server] Disable the elastic scheduler when continuous batching is enabled (#1569) * update server to disable the context/elastic scheduler when continuous batching is enabled * clean up when context is created --- src/deepsparse/server/deepsparse_server.py | 27 ++++++++++++++++++---- src/deepsparse/server/openai_server.py | 14 ++++++++++- src/deepsparse/server/server.py | 14 ++++------- 3 files changed, 40 insertions(+), 15 deletions(-) diff --git a/src/deepsparse/server/deepsparse_server.py b/src/deepsparse/server/deepsparse_server.py index 8ffc7508cb..0bf338cfc4 100644 --- a/src/deepsparse/server/deepsparse_server.py +++ b/src/deepsparse/server/deepsparse_server.py @@ -13,6 +13,7 @@ # limitations under the License. 
import logging +from concurrent.futures import ThreadPoolExecutor from functools import partial from deepsparse import Pipeline @@ -73,12 +74,30 @@ def _add_endpoint( endpoint_config: EndpointConfig, ): pipeline_config = endpoint_config.to_pipeline_config() - pipeline_config.kwargs["executor"] = self.executor _LOGGER.info(f"Initializing pipeline for '{endpoint_config.name}'") - pipeline = Pipeline.from_config( - pipeline_config, context=self.context, logger=self.server_logger - ) + if pipeline_config.kwargs.get("continuous_batch_sizes"): + pipeline_config.kwargs["executor"] = ThreadPoolExecutor( + max_workers=self.server_config.num_workers + ) + _LOGGER.info( + "for continuous batching, the single stream scheduler will be enabled." + ) + pipeline_config.num_cores = self.server_config.num_cores + pipeline_config.scheduler = "single" + + pipeline = Pipeline.from_config( + pipeline_config, + num_streams=self.server_config.num_workers, + logger=self.server_logger, + ) + else: + pipeline_config.kwargs["executor"] = ThreadPoolExecutor( + max_workers=self.context.num_streams + ) + pipeline = Pipeline.from_config( + pipeline_config, context=self.context, logger=self.server_logger + ) _LOGGER.info(f"Adding endpoints for '{endpoint_config.name}'") self._add_inference_endpoints( diff --git a/src/deepsparse/server/openai_server.py b/src/deepsparse/server/openai_server.py index 2ef789c68e..c656323594 100644 --- a/src/deepsparse/server/openai_server.py +++ b/src/deepsparse/server/openai_server.py @@ -375,7 +375,19 @@ def _add_model( f"{SupportedTasks.code_generation._fields}" ) - pipeline = Pipeline.from_config(pipeline_config, context=self.context) + if pipeline_config.kwargs.get("continuous_batch_sizes"): + _LOGGER.info( + "for continuous batching, the single stream scheduler will be enabled." 
+ ) + pipeline_config.num_cores = self.server_config.num_cores + pipeline_config.scheduler = "single" + + pipeline = Pipeline.from_config( + pipeline_config, + num_streams=self.server_config.num_workers, + ) + else: + pipeline = Pipeline.from_config(pipeline_config, context=self.context) if not self.model_to_pipeline.get(endpoint_config.model): model_card = ModelCard( diff --git a/src/deepsparse/server/server.py b/src/deepsparse/server/server.py index 3c1cb053f7..8e1915a265 100644 --- a/src/deepsparse/server/server.py +++ b/src/deepsparse/server/server.py @@ -16,7 +16,6 @@ import os from abc import abstractmethod from collections import Counter -from concurrent.futures import ThreadPoolExecutor from copy import deepcopy from typing import AsyncGenerator, List, Optional, Union @@ -76,10 +75,11 @@ def __init__(self, server_config: Union[str, ServerConfig]): self.server_config = server_config _LOGGER.info(f"Using config: {repr(self.server_config)}") - - self.context = None - self.executor = None self.server_logger = server_logger_from_config(self.server_config) + self.context = Context( + num_cores=self.server_config.num_cores, + num_streams=self.server_config.num_workers, + ) def start_server( self, @@ -109,12 +109,6 @@ def start_server( self.config_path, f"http://{host}:{port}/endpoints", 0.5 ) - self.context = Context( - num_cores=self.server_config.num_cores, - num_streams=self.server_config.num_workers, - ) - self.executor = ThreadPoolExecutor(max_workers=self.context.num_streams) - app = self._build_app() uvicorn.run( From 1f92f52134cfd5a938aaac6b255692593a35db5a Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 31 Jan 2024 16:45:20 -0500 Subject: [PATCH 03/16] [TextGeneration] Fix initialization; don't try v1 init for text gen (#1571) * only check capacity condition durin prefill; already have check in generation * dont try v1 if running text gen; just raise error --- src/deepsparse/pipeline.py | 6 +++++- .../text_generation/autoregressive_preprocess_operator.py | 5 ++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py index e2a1beeab1..aaa65409d8 100644 --- a/src/deepsparse/pipeline.py +++ b/src/deepsparse/pipeline.py @@ -27,6 +27,7 @@ SchedulerGroup, ) from deepsparse.subgraph_execute import SubGraphExecutor +from deepsparse.tasks import SupportedTasks from deepsparse.utils import InferenceState, PipelineState from deepsparse.utils.subgraph import SubGraph from deepsparse.utils.time import TIMER_KEY, InferenceStages, TimerManager @@ -139,7 +140,10 @@ def create(cls, task: str, **kwargs) -> "Pipeline": "Pipeline was not created for the given task. 
The " "provided task should be registered using the OperatorRegistry" ) - except Exception: + except Exception as e: + if SupportedTasks.is_text_generation(task): + raise e + _LOGGER.warning(f"Could not create v2 '{task}' pipeline, trying legacy") from deepsparse.legacy import Pipeline diff --git a/src/deepsparse/transformers/pipelines/text_generation/autoregressive_preprocess_operator.py b/src/deepsparse/transformers/pipelines/text_generation/autoregressive_preprocess_operator.py index df4e587df3..01d2a664b5 100644 --- a/src/deepsparse/transformers/pipelines/text_generation/autoregressive_preprocess_operator.py +++ b/src/deepsparse/transformers/pipelines/text_generation/autoregressive_preprocess_operator.py @@ -51,7 +51,10 @@ def can_operate(self, inp: Any) -> bool: if inp.get("in_generation"): return True - if kv_cache.total_num_processed_tokens >= kv_cache.capacity: + if ( + kv_cache.total_num_processed_tokens >= kv_cache.capacity + and inp.get("in_generation") is None + ): raise RuntimeError( "Not enough kv_cache capacity to run generation. Please use a larger " "sequence_length or a shorter prompt" From 347caa4b1088dfc95597e4710f7a56cf7ddf04d5 Mon Sep 17 00:00:00 2001 From: Rahul Tuli Date: Fri, 2 Feb 2024 10:57:27 -0500 Subject: [PATCH 04/16] [BugFix] Add evaluate callable (#1576) * Add evaluate callable * Wrap transformers into try except --- src/deepsparse/__init__.py | 1 + src/deepsparse/evaluation/utils.py | 23 ++++++++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/src/deepsparse/__init__.py b/src/deepsparse/__init__.py index 2e00ae4949..c5c31ad48d 100644 --- a/src/deepsparse/__init__.py +++ b/src/deepsparse/__init__.py @@ -38,5 +38,6 @@ from .version import __version__, is_release from .analytics import deepsparse_analytics as _analytics from .subgraph_execute import * +from .evaluation.evaluator import evaluate _analytics.send_event("python__init") diff --git a/src/deepsparse/evaluation/utils.py b/src/deepsparse/evaluation/utils.py index 0534a9f9f3..7684e54513 100644 --- a/src/deepsparse/evaluation/utils.py +++ b/src/deepsparse/evaluation/utils.py @@ -15,7 +15,14 @@ import os from typing import Any, Dict, List, Optional, Tuple, Union -from transformers import AutoModelForCausalLM, PreTrainedModel + +try: + from transformers import AutoModelForCausalLM, PreTrainedModel + + transformers_error = None +except ImportError as import_error: + transformers_error = import_error + from deepsparse import Pipeline from deepsparse.operators.engine_operator import DEEPSPARSE_ENGINE, ORT_ENGINE @@ -50,7 +57,7 @@ def potentially_check_dependency_import(integration_name: str) -> bool: def resolve_integration( - model: Union[Pipeline, PreTrainedModel], datasets: Union[str, List[str]] + model: Union[Pipeline, "PreTrainedModel"], datasets: Union[str, List[str]] ) -> Union[str, None]: """ Given a model and dataset, infer the name of the evaluation integration @@ -73,6 +80,7 @@ def if_generative_language_model(model: Any) -> bool: """ Checks if the model is a generative language model. """ + _check_transformers_dependency() if isinstance(model, Pipeline): return model.__class__.__name__ == "TextGenerationPipeline" elif isinstance(model, PreTrainedModel): @@ -130,7 +138,7 @@ def create_model_from_target( target: str, engine_type: Optional[str] = None, **kwargs, -) -> Union[Pipeline, AutoModelForCausalLM]: +) -> Union[Pipeline, "AutoModelForCausalLM"]: """ Create a model or a pipeline from a target path. 
@@ -146,6 +154,8 @@ def create_model_from_target( :param engine_type: The engine type to initialize the model with. :return: The initialized model """ + _check_transformers_dependency() + if engine_type in [DEEPSPARSE_ENGINE, ORT_ENGINE]: return Pipeline.create( task="text-generation", @@ -157,3 +167,10 @@ def create_model_from_target( ) else: return AutoModelForCausalLM.from_pretrained(target, **kwargs) + + +def _check_transformers_dependency(): + if transformers_error: + raise ImportError( + "transformers is needed to use this module" + ) from transformers_error From 03c407837d8bfd53e37a7aa08f5babfb2a2eb8e7 Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Mon, 5 Feb 2024 16:26:43 +0100 Subject: [PATCH 05/16] [Fix] ONNX model benchmarking when no sequence_length inferred from the model (#1581) --- src/deepsparse/benchmark/benchmark_model.py | 5 +++++ src/deepsparse/utils/onnx.py | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/benchmark/benchmark_model.py b/src/deepsparse/benchmark/benchmark_model.py index 9539032259..4a43120f84 100644 --- a/src/deepsparse/benchmark/benchmark_model.py +++ b/src/deepsparse/benchmark/benchmark_model.py @@ -411,6 +411,11 @@ def benchmark_model( if not disable_kv_cache_overrides: if not sequence_length: sequence_length = infer_sequence_length(model_path) + if not sequence_length: + raise ValueError( + "Unable to infer sequence length from model. " + "Specify it manually through `sequence_length` argument." + ) if input_ids_length > sequence_length: raise ValueError( f"input_ids_length: {input_ids_length} " diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index e4b41f3286..423ec10f67 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -613,7 +613,8 @@ def has_model_kv_cache(model: Union[str, ModelProto]) -> bool: def infer_sequence_length(model: Union[str, ModelProto]) -> int: """ :param model: model - :return: inferred sequence length of the model + :return: inferred sequence length of the model. 
+ If unable to infer, return 0 """ if not isinstance(model, ModelProto): model = onnx.load(model, load_external_data=False) @@ -623,9 +624,10 @@ def infer_sequence_length(model: Union[str, ModelProto]) -> int: for idx, inp in enumerate(model.graph.input): if inp.name == "attention_mask": target_input_idx = idx + break try: # return shape of second dim if possible target_input = model.graph.input[target_input_idx] return target_input.type.tensor_type.shape.dim[1].dim_value except Exception: - return 0 # unable to infer seq len + return 0 From cb52d6e4e51ed1e6bcd3df934fa129b82a4a6246 Mon Sep 17 00:00:00 2001 From: Rahul Tuli Date: Mon, 5 Feb 2024 10:54:11 -0500 Subject: [PATCH 06/16] Add analyze callable (#1574) Co-authored-by: Benjamin Fineran --- src/deepsparse/__init__.py | 1 + src/deepsparse/analyze.py | 56 +++++++++++++++++++++++++------------- 2 files changed, 38 insertions(+), 19 deletions(-) diff --git a/src/deepsparse/__init__.py b/src/deepsparse/__init__.py index c5c31ad48d..436990b065 100644 --- a/src/deepsparse/__init__.py +++ b/src/deepsparse/__init__.py @@ -38,6 +38,7 @@ from .version import __version__, is_release from .analytics import deepsparse_analytics as _analytics from .subgraph_execute import * +from .analyze import analyze from .evaluation.evaluator import evaluate _analytics.send_event("python__init") diff --git a/src/deepsparse/analyze.py b/src/deepsparse/analyze.py index 2637d9b2e3..791bbbffe1 100644 --- a/src/deepsparse/analyze.py +++ b/src/deepsparse/analyze.py @@ -31,7 +31,11 @@ ModelAnalysis, NodeInferenceResult, ) -from sparsezoo.analyze.cli import analyze_options, analyze_performance_options +from sparsezoo.analyze.cli import ( + DEEPSPARSE_ENGINE, + analyze_options, + analyze_performance_options, +) _LOGGER = logging.getLogger(__name__) @@ -74,21 +78,11 @@ def main( ) _LOGGER.info("Starting Analysis ...") - analysis = ModelAnalysis.create(model_path) - _LOGGER.info("Analysis complete, collating results...") - scenario = BenchmarkScenario( - batch_size=batch_size_throughput, - num_cores=None, - engine=benchmark_engine, - ) - performance_summary = run_benchmark_and_analysis( - onnx_model=model_to_path(model_path), - scenario=scenario, - ) + analysis = analyze(model_path, batch_size_throughput, benchmark_engine) + by_types: bool = convert_to_bool(by_types) by_layers: bool = convert_to_bool(by_layers) - analysis.benchmark_results = [performance_summary] summary = analysis.summary( by_types=by_types, by_layers=by_layers, @@ -103,13 +97,9 @@ def main( print("Comparison Analysis:") for model_to_compare in compare: - compare_model_analysis = ModelAnalysis.create(model_to_compare) - _LOGGER.info(f"Running Performance Analysis on {model_to_compare}") - performance_summary = run_benchmark_and_analysis( - onnx_model=model_to_path(model_to_compare), - scenario=scenario, + compare_model_analysis = analyze( + model_to_compare, batch_size_throughput, benchmark_engine ) - compare_model_analysis.benchmark_results = [performance_summary] summary_comparison_model = compare_model_analysis.summary( by_types=by_types, by_layers=by_layers, @@ -124,6 +114,34 @@ def main( analysis.yaml(file_path=save) +def analyze( + model_path, + batch_size_throughput: int = 1, + benchmark_engine: str = DEEPSPARSE_ENGINE, +) -> ModelAnalysis: + """ + :param model_path: Local filepath to an ONNX model, or a SparseZoo stub + :param batch_size_throughput: Batch size for throughput benchmark + :param benchmark_engine: Benchmark engine to use, can be 'deepsparse' or + 'onnxruntime', defaults to 
'deepsparse' + :return: A `ModelAnalysis` object encapsulating the results of the analysis + """ + analysis = ModelAnalysis.create(model_path) + _LOGGER.info("Analysis complete, collating results...") + scenario = BenchmarkScenario( + batch_size=batch_size_throughput, + num_cores=None, + engine=benchmark_engine, + ) + performance_summary = run_benchmark_and_analysis( + onnx_model=model_to_path(model_path), + scenario=scenario, + ) + + analysis.benchmark_results = [performance_summary] + return analysis + + def run_benchmark_and_analysis( onnx_model: str, scenario: BenchmarkScenario, From 59e0602ba1ac24a7a02f2f1856ef44b5528c0d8f Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Mon, 5 Feb 2024 16:56:34 +0100 Subject: [PATCH 07/16] [DeepSparse Evaluation API] UX Improvements (#1568) * initial commit * add some more tests for hardening * Update src/deepsparse/evaluation/cli.py * Update src/deepsparse/transformers/pipelines/text_generation/pipeline.py * Apply suggestions from code review * quality * Update test_evaluator.py * quality --- src/deepsparse/evaluation/cli.py | 30 +++---- src/deepsparse/evaluation/evaluator.py | 34 +++++--- src/deepsparse/evaluation/registry.py | 9 +- src/deepsparse/evaluation/utils.py | 85 +++++++------------ .../pipelines/text_generation/pipeline.py | 8 ++ .../text_generation/pipeline_no_kv_cache.py | 8 ++ .../test_lm_evaluation_harness.py | 8 +- tests/deepsparse/evaluation/test_evaluator.py | 47 +++++++--- tests/deepsparse/evaluation/test_utils.py | 53 ++---------- 9 files changed, 132 insertions(+), 150 deletions(-) diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index ed7ea72831..b68d32d4e5 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -20,7 +20,8 @@ Module for evaluating models on the various evaluation integrations OPTIONS: - --target TARGET A path to a remote or local directory containing ONNX/torch model + --model_path MODEL_PATH + A path to an ONNX model, local directory containing ONNX model (including all the auxiliary files) or a SparseZoo stub -d DATASET, --dataset DATASET The dataset to evaluate on. The user may pass multiple datasets @@ -30,9 +31,7 @@ integration name that is registered in the evaluation registry -e ENGINE_TYPE, --engine_type ENGINE_TYPE Inference engine to use for the evaluation. The default - is the DeepSparse engine. If the evaluation should be run - without initializing a pipeline (e.g. for the evaluation - of a torch model), the engine type should be set to None + is the DeepSparse engine. -s SAVE_PATH, --save_path SAVE_PATH The path to save the evaluation results. By default the results will be saved in the @@ -90,10 +89,10 @@ ) ) @click.option( - "--target", + "--model_path", type=click.Path(dir_okay=True, file_okay=True), required=True, - help="A path to a remote or local directory containing ONNX/torch model " + help="A path to an ONNX model, local directory containing ONNX model" "(including all the auxiliary files) or a SparseZoo stub", ) @click.option( @@ -118,9 +117,7 @@ type=click.Choice([DEEPSPARSE_ENGINE, ORT_ENGINE, TORCHSCRIPT_ENGINE]), default=DEEPSPARSE_ENGINE, help="The engine to use for the evaluation. The default is the " - "DeepSparse engine. If the evaluation should be run without " - "initializing a pipeline (e.g. for the evaluation of a torch " - "model), the engine type should be set to None", + "DeepSparse engine. 
", ) @click.option( "-s", @@ -167,7 +164,7 @@ ) @click.argument("integration_args", nargs=-1, type=click.UNPROCESSED) def main( - target, + model_path, dataset, integration, engine_type, @@ -183,14 +180,9 @@ def main( # format kwargs to a dict integration_args = args_to_dict(integration_args) - _LOGGER.info(f"Target to evaluate: {target}") - if engine_type: - _LOGGER.info(f"A pipeline with the engine type: {engine_type} will be created") - else: - _LOGGER.info( - "No engine type specified. The target " - "will be evaluated using the native framework" - ) + _LOGGER.info( + f"Creating {engine_type} pipeline to evaluate from model path: {model_path}" + ) _LOGGER.info( f"Datasets to evaluate on: {datasets}\n" @@ -201,7 +193,7 @@ def main( ) result: Result = evaluate( - target=target, + model=model_path, datasets=datasets, integration=integration, engine_type=engine_type, diff --git a/src/deepsparse/evaluation/evaluator.py b/src/deepsparse/evaluation/evaluator.py index 7bd56adf6e..b513f07563 100644 --- a/src/deepsparse/evaluation/evaluator.py +++ b/src/deepsparse/evaluation/evaluator.py @@ -12,11 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging -from typing import Any, List, Optional, Union +from pathlib import Path +from typing import List, Optional, Union +from deepsparse import Pipeline from deepsparse.evaluation.registry import EvaluationRegistry from deepsparse.evaluation.results import Result -from deepsparse.evaluation.utils import create_model_from_target +from deepsparse.evaluation.utils import create_pipeline from deepsparse.operators.engine_operator import ( DEEPSPARSE_ENGINE, ORT_ENGINE, @@ -30,11 +32,11 @@ def evaluate( - target: Any, + model: Union[Pipeline, Path, str], datasets: Union[str, List[str]], integration: Optional[str] = None, engine_type: Union[ - DEEPSPARSE_ENGINE, ORT_ENGINE, TORCHSCRIPT_ENGINE, None + DEEPSPARSE_ENGINE, ORT_ENGINE, TORCHSCRIPT_ENGINE ] = DEEPSPARSE_ENGINE, batch_size: int = 1, splits: Union[List[str], str, None] = None, @@ -42,18 +44,26 @@ def evaluate( **kwargs, ) -> Result: - # if target is a string, turn it into an appropriate model/pipeline - # otherwise assume it is a model/pipeline - model = ( - create_model_from_target(target, engine_type) - if isinstance(target, str) - else target + if isinstance(model, Pipeline): + _LOGGER.info( + "Passed a Pipeline object into evaluate function. 
This will " + "override the following arguments:" + ) + batch_size = model.batch_size + _LOGGER.info(f"batch_size: {batch_size}") + engine_type = engine_type + _LOGGER.info(f"engine_type: {engine_type}") + + # if target is a string, turn it into an appropriate pipeline + # otherwise assume it is a pipeline + pipeline = ( + create_pipeline(model, engine_type) if isinstance(model, (Path, str)) else model ) - eval_integration = EvaluationRegistry.resolve(model, datasets, integration) + eval_integration = EvaluationRegistry.resolve(pipeline, datasets, integration) return eval_integration( - model=model, + pipeline=pipeline, datasets=datasets, engine_type=engine_type, batch_size=batch_size, diff --git a/src/deepsparse/evaluation/registry.py b/src/deepsparse/evaluation/registry.py index 5b6e45bc1c..2daabb69cc 100644 --- a/src/deepsparse/evaluation/registry.py +++ b/src/deepsparse/evaluation/registry.py @@ -15,8 +15,9 @@ Implementation of a registry for evaluation functions """ import logging -from typing import Any, Callable, List, Optional, Union +from typing import Callable, List, Optional, Union +from deepsparse import Pipeline from sparsezoo.utils.registry import RegistryMixin @@ -38,7 +39,7 @@ def load_from_registry(cls, name: str) -> Callable[..., "Result"]: # noqa: F821 @classmethod def resolve( cls, - model: Any, + pipeline: Pipeline, datasets: Union[str, List[str]], integration: Optional[str] = None, ) -> Callable[..., "Result"]: # noqa: F821 @@ -59,12 +60,12 @@ def resolve( "No integration specified, inferring the evaluation" "function from the input arguments..." ) - integration = resolve_integration(model, datasets) + integration = resolve_integration(pipeline, datasets) if integration is None: raise ValueError( "Unable to resolve an evaluation function for the given model. " - "Specify an integration name or use a model that is supported " + "Specify an integration name or use a pipeline that is supported " ) _LOGGER.info(f"Inferred the evaluation function: {integration}") diff --git a/src/deepsparse/evaluation/utils.py b/src/deepsparse/evaluation/utils.py index 7684e54513..87475dd5d2 100644 --- a/src/deepsparse/evaluation/utils.py +++ b/src/deepsparse/evaluation/utils.py @@ -15,21 +15,11 @@ import os from typing import Any, Dict, List, Optional, Tuple, Union - -try: - from transformers import AutoModelForCausalLM, PreTrainedModel - - transformers_error = None -except ImportError as import_error: - transformers_error = import_error - - from deepsparse import Pipeline -from deepsparse.operators.engine_operator import DEEPSPARSE_ENGINE, ORT_ENGINE __all__ = [ - "create_model_from_target", + "create_pipeline", "get_save_path", "args_to_dict", "resolve_integration", @@ -57,36 +47,36 @@ def potentially_check_dependency_import(integration_name: str) -> bool: def resolve_integration( - model: Union[Pipeline, "PreTrainedModel"], datasets: Union[str, List[str]] + pipeline: Pipeline, datasets: Union[str, List[str]] ) -> Union[str, None]: """ - Given a model and dataset, infer the name of the evaluation integration + Given a pipeline and dataset, infer the name of the evaluation integration to use. If unable to infer a name, return None. 
Currently: if the model is a generative language model, default to 'lm-evaluation-harness' otherwise return None - :param model: The model to infer the integration for + :param pipeline: The pipeline to infer the integration for :param datasets: The datasets to infer the integration for :return: The name of the integration to use or None if unable to infer """ - if if_generative_language_model(model): + if if_generative_language_model(pipeline): return LM_EVALUATION_HARNESS return None -def if_generative_language_model(model: Any) -> bool: +def if_generative_language_model(pipeline: Pipeline) -> bool: """ Checks if the model is a generative language model. """ - _check_transformers_dependency() - if isinstance(model, Pipeline): - return model.__class__.__name__ == "TextGenerationPipeline" - elif isinstance(model, PreTrainedModel): - return "CausalLM" in model.__class__.__name__ - else: - return False + pipeline_name = pipeline.__class__.__name__ + if pipeline_name == "TextGenerationPipeline" or ( + pipeline_name == "TextGenerationPipelineNoKVCache" + ): + return True + + return False def args_to_dict(args: Tuple[Any, ...]) -> Dict[str, Any]: @@ -134,43 +124,30 @@ def get_save_path( return os.path.join(base_path, file_name) -def create_model_from_target( - target: str, +def create_pipeline( + model_path: str, engine_type: Optional[str] = None, **kwargs, -) -> Union[Pipeline, "AutoModelForCausalLM"]: +) -> Pipeline: """ - Create a model or a pipeline from a target path. + Create a pipeline for evaluation - Note: This function is currently limited to: - - creating pipelines of type 'text-generation' - - creating dense huggingface models of type 'AutoModelForCausalLM' - This function will be expanded in the future to support more - model types and frameworks. + Note: This function is currently primarily + focused on creating pipelines of type 'text-generation' + This function will be expanded in the future to support + more tasks and models - :param target: The target path to initialize the + :param model_path: The target path to initialize the text generation model from. This can be a local or remote path to the model or a sparsezoo stub :param engine_type: The engine type to initialize the model with. 
- :return: The initialized model + :return: The initialized pipeline """ - _check_transformers_dependency() - - if engine_type in [DEEPSPARSE_ENGINE, ORT_ENGINE]: - return Pipeline.create( - task="text-generation", - model_path=target, - sequence_length=kwargs.pop("sequence_length", 2048), - engine_type=engine_type, - batch_size=kwargs.pop("batch_size", 1), - **kwargs, - ) - else: - return AutoModelForCausalLM.from_pretrained(target, **kwargs) - - -def _check_transformers_dependency(): - if transformers_error: - raise ImportError( - "transformers is needed to use this module" - ) from transformers_error + return Pipeline.create( + task=kwargs.pop("task", "text-generation"), + model_path=model_path, + sequence_length=kwargs.pop("sequence_length", 2048), + engine_type=engine_type, + batch_size=kwargs.pop("batch_size", 1), + **kwargs, + ) diff --git a/src/deepsparse/transformers/pipelines/text_generation/pipeline.py b/src/deepsparse/transformers/pipelines/text_generation/pipeline.py index 2c858c901b..64c0c64a51 100644 --- a/src/deepsparse/transformers/pipelines/text_generation/pipeline.py +++ b/src/deepsparse/transformers/pipelines/text_generation/pipeline.py @@ -357,6 +357,14 @@ def sequence_length(self) -> int: """ return self.ops["single_engine"].sequence_length + @property + def batch_size(self) -> int: + return self.ops["single_engine"].batch_size + + @property + def engine_type(self) -> str: + return self.ops["single_engine"]._engine_type + def _get_continuous_batching_scheduler( self, batch_sizes: List[int], engines: List[EngineOperator] ) -> ContinuousBatchingScheduler: diff --git a/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py b/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py index 7f6cb9db5f..c6cbc3dd59 100644 --- a/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py +++ b/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py @@ -127,3 +127,11 @@ def expand_inputs(self, items, batch_size): out, orig_batch_size = split_engine_inputs(items, batch_size) combined_batches = [{"input_ids": b[0], "attention_mask": b[1]} for b in out] return combined_batches, orig_batch_size + + @property + def batch_size(self) -> int: + return self.ops["engine_operator"].batch_size + + @property + def engine_type(self) -> str: + return self.ops["engine_operator"]._engine_type diff --git a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py index 9fa9b494cf..3b9016294f 100644 --- a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py +++ b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py @@ -12,19 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from transformers import AutoModelForCausalLM + import pytest from deepsparse.evaluation.integrations import try_import_lm_evaluation_harness -from deepsparse.evaluation.utils import create_model_from_target +from deepsparse.evaluation.utils import create_pipeline @pytest.mark.parametrize( "pipeline, model_torch", [ ( - create_model_from_target( + create_pipeline( "hf:mgoin/TinyStories-1M-deepsparse", engine_type="onnxruntime" ), - create_model_from_target("roneneldan/TinyStories-1M"), + AutoModelForCausalLM.from_pretrained("roneneldan/TinyStories-1M"), ) ], ) diff --git a/tests/deepsparse/evaluation/test_evaluator.py b/tests/deepsparse/evaluation/test_evaluator.py index dedd63fa36..816ad075e0 100644 --- a/tests/deepsparse/evaluation/test_evaluator.py +++ b/tests/deepsparse/evaluation/test_evaluator.py @@ -29,6 +29,7 @@ Metric, Result, ) +from deepsparse.pipeline import Pipeline @EvaluationRegistry.register() @@ -49,7 +50,7 @@ def dummy_integration(*args, **kwargs): @pytest.fixture() -def target(): +def model_path(): return "hf:mgoin/TinyStories-1M-deepsparse" @@ -68,18 +69,42 @@ def unknown_integration_name(): return "unknown_integration" -def test_evaluate_unknown_integration(target, datasets, unknown_integration_name): +def test_evaluate_unknown_integration(model_path, datasets, unknown_integration_name): with pytest.raises(KeyError): evaluate( - target=target, + model=model_path, datasets=datasets, integration=unknown_integration_name, ) -def test_evaluate(target, datasets, dummy_integration_name): +def test_evaluate(model_path, datasets, dummy_integration_name): result = evaluate( - target=target, + model=model_path, + datasets=datasets, + integration=dummy_integration_name, + ) + assert isinstance(result, Result) + + +def test_evaluate_pipeline_with_kv_cache(model_path, datasets, dummy_integration_name): + result = evaluate( + model=Pipeline.create(model_path=model_path, task="text-generation"), + datasets=datasets, + integration=dummy_integration_name, + ) + assert isinstance(result, Result) + + +def test_evaluate_pipeline_without_kv_cache( + model_path, datasets, dummy_integration_name +): + result = evaluate( + model=Pipeline.create( + model_path=model_path, + task="text-generation", + onnx_model_name="model-orig.onnx", + ), datasets=datasets, integration=dummy_integration_name, ) @@ -91,11 +116,11 @@ def test_evaluate(target, datasets, dummy_integration_name): reason="lm_evaluation_harness not installed", ) def test_evaluation_llm_evaluation_harness_integration_name( - target, + model_path, datasets, ): assert evaluate( - target=target, + model=model_path, datasets=datasets, limit=2, no_cache=True, @@ -110,15 +135,17 @@ def test_evaluation_llm_evaluation_harness_integration_name( "with importing functions that are decorated with " "click option where multiple=True", ) -def test_cli(tmp_path, target, datasets, dummy_integration_name, type_serialization): +def test_cli( + tmp_path, model_path, datasets, dummy_integration_name, type_serialization +): from deepsparse.evaluation.cli import main runner = CliRunner() runner.invoke( main, [ - "--target", - target, + "--model_path", + model_path, "--dataset", datasets[0], "--dataset", diff --git a/tests/deepsparse/evaluation/test_utils.py b/tests/deepsparse/evaluation/test_utils.py index f712dce0df..f8f3c731a8 100644 --- a/tests/deepsparse/evaluation/test_utils.py +++ b/tests/deepsparse/evaluation/test_utils.py @@ -14,32 +14,16 @@ import os -from transformers import ( - AutoModelForCausalLM, - AutoModelForSequenceClassification, - 
GPTNeoForCausalLM, -) - import pytest from deepsparse import Pipeline from deepsparse.evaluation.utils import ( - create_model_from_target, + create_pipeline, get_save_path, if_generative_language_model, resolve_integration, ) -@pytest.fixture -def llm_type_hf_model(): - return AutoModelForCausalLM.from_pretrained("roneneldan/TinyStories-1M") - - -@pytest.fixture -def not_llm_type_hf_model(): - return AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") - - @pytest.fixture def llm_type_pipeline(): return Pipeline.create( @@ -49,25 +33,13 @@ def llm_type_pipeline(): ) -def test_resolve_known_llm_model(llm_type_hf_model): +def test_resolve_known_llm_pipeline(llm_type_pipeline): assert ( - resolve_integration(model=llm_type_hf_model, datasets="") + resolve_integration(pipeline=llm_type_pipeline, datasets="") == "lm-evaluation-harness" ) -def test_resolve_unknown_model(not_llm_type_hf_model): - assert resolve_integration(model=not_llm_type_hf_model, datasets="") is None - - -def test_if_generative_language_model_true(llm_type_hf_model): - assert if_generative_language_model(llm_type_hf_model) - - -def test_if_generative_language_model_false(not_llm_type_hf_model): - assert not if_generative_language_model(not_llm_type_hf_model) - - def test_if_generative_language_pipeline_true(llm_type_pipeline): assert if_generative_language_model(llm_type_pipeline) @@ -89,26 +61,11 @@ def pipeline_target(): return "hf:mgoin/TinyStories-1M-deepsparse" -@pytest.fixture -def torch_target(): - return "roneneldan/TinyStories-1M" - - def test_initialize_model_from_target_pipeline_onnx(pipeline_target): - model = create_model_from_target(pipeline_target, "onnxruntime") + model = create_pipeline(pipeline_target, "onnxruntime") assert model.ops.get("single_engine")._engine_type == "onnxruntime" -def test_initialize_model_from_target_pipeline_deepsparse(pipeline_target): - model = create_model_from_target(pipeline_target, "deepsparse") - assert model.ops.get("single_engine")._engine_type == "deepsparse" - - def test_initialize_model_from_target_pipeline_with_kwargs(pipeline_target): - model = create_model_from_target(pipeline_target, "deepsparse", sequence_length=64) + model = create_pipeline(pipeline_target, "deepsparse", sequence_length=64) assert model.ops.get("process_input").sequence_length == 64 - - -def test_initialize_model_from_target_torch(torch_target): - model = create_model_from_target(torch_target, "torch") - assert isinstance(model, GPTNeoForCausalLM) From d7d1acb6b0d88dfed279f8209b138043eaeaad34 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 6 Feb 2024 10:37:13 -0500 Subject: [PATCH 08/16] add back continuous batching tests (#1585) --- .../deepsparse/schedulers/test_continuous_batching_scheduler.py | 2 -- .../schedulers/utils/test_continuous_batching_executor.py | 2 -- .../transformers/text_generation/integration_tests/test_llms.py | 1 - 3 files changed, 5 deletions(-) diff --git a/tests/deepsparse/schedulers/test_continuous_batching_scheduler.py b/tests/deepsparse/schedulers/test_continuous_batching_scheduler.py index 4502a8fcf9..627202773c 100644 --- a/tests/deepsparse/schedulers/test_continuous_batching_scheduler.py +++ b/tests/deepsparse/schedulers/test_continuous_batching_scheduler.py @@ -16,12 +16,10 @@ import numpy -import pytest from deepsparse.operators import EngineOperator from deepsparse.schedulers import ContinuousBatchingScheduler -@pytest.mark.skip("skip continuous batching tests") def test_continuous_batching_executor_thread(): # simple test that 
ContinuousBatchingScheduler can be instantiated and return # a result from a request, for testing multi-batch execution, making enough diff --git a/tests/deepsparse/schedulers/utils/test_continuous_batching_executor.py b/tests/deepsparse/schedulers/utils/test_continuous_batching_executor.py index fa41259f21..e26532d088 100644 --- a/tests/deepsparse/schedulers/utils/test_continuous_batching_executor.py +++ b/tests/deepsparse/schedulers/utils/test_continuous_batching_executor.py @@ -16,7 +16,6 @@ import numpy -import pytest from deepsparse.operators import EngineOperator from deepsparse.schedulers.utils import ( ContinuousBatchingExecutorThread, @@ -24,7 +23,6 @@ ) -@pytest.mark.skip("skip continuous batching tests") def test_continuous_batching_executor_thread(): # mobilenet model with batch_size=2 engine_operator = EngineOperator("zoo:mobilenet_v2-1.0-imagenet-base") diff --git a/tests/deepsparse/transformers/text_generation/integration_tests/test_llms.py b/tests/deepsparse/transformers/text_generation/integration_tests/test_llms.py index e03d97e7f3..633ee19c53 100644 --- a/tests/deepsparse/transformers/text_generation/integration_tests/test_llms.py +++ b/tests/deepsparse/transformers/text_generation/integration_tests/test_llms.py @@ -132,7 +132,6 @@ def setup(self, params_dict, max_new_tokens, internal_kv_cache): self.default_pipeline = None self.max_new_tokens = max_new_tokens - @pytest.mark.skip("skip continuous batching tests") def test_continuous_batching_pipeline(self, setup): pipeline = self.get_pipeline( From 8a83e24edd41150844525d6175b7ea9e63111ce2 Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Tue, 6 Feb 2024 16:27:23 -0500 Subject: [PATCH 09/16] add top level benchmark imports (#1587) --- src/deepsparse/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/deepsparse/__init__.py b/src/deepsparse/__init__.py index 436990b065..fab294bfaa 100644 --- a/src/deepsparse/__init__.py +++ b/src/deepsparse/__init__.py @@ -40,5 +40,6 @@ from .subgraph_execute import * from .analyze import analyze from .evaluation.evaluator import evaluate +from .benchmark import benchmark_model, benchmark_pipeline _analytics.send_event("python__init") From c54461a9ea0832b8c76366400b21fe73baebd3d4 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 7 Feb 2024 13:08:24 -0500 Subject: [PATCH 10/16] [TextGeneration] Add helper function to parse model path from args (#1583) * add helper function to parse model path from args * update model path * revert cli changes * remove empty args --- src/deepsparse/pipeline.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py index aaa65409d8..23ff3a2810 100644 --- a/src/deepsparse/pipeline.py +++ b/src/deepsparse/pipeline.py @@ -702,7 +702,8 @@ def text_generation_pipeline(*args, **kwargs) -> "Pipeline": :return: text generation pipeline with the given args and kwargs passed to Pipeline.create """ - return Pipeline.create("text_generation", *args, **kwargs) + kwargs = _check_model_path_arg(*args, **kwargs) + return Pipeline.create("text_generation", **kwargs) def code_generation_pipeline(*args, **kwargs) -> "Pipeline": @@ -710,7 +711,8 @@ def code_generation_pipeline(*args, **kwargs) -> "Pipeline": :return: text generation pipeline with the given args and kwargs passed to Pipeline.create """ - return Pipeline.create("code_generation", *args, **kwargs) + kwargs = _check_model_path_arg(*args, **kwargs) + return Pipeline.create("code_generation", **kwargs) def 
chat_pipeline(*args, **kwargs) -> "Pipeline": @@ -718,7 +720,8 @@ def chat_pipeline(*args, **kwargs) -> "Pipeline": :return: text generation pipeline with the given args and kwargs passed to Pipeline.create """ - return Pipeline.create("chat", *args, **kwargs) + kwargs = _check_model_path_arg(*args, **kwargs) + return Pipeline.create("chat", **kwargs) TextGeneration = text_generation_pipeline @@ -802,3 +805,13 @@ def zero_shot_text_classification_pipeline(*args, **kwargs) -> "Pipeline": is returned depends on the value of the passed model_scheme argument. """ return Pipeline.create("zero_shot_text_classification", *args, **kwargs) + + +def _check_model_path_arg(*args, **kwargs): + if args: + if len(args) > 1 or "model_path" in kwargs or "model" in kwargs: + raise ValueError( + "Only the model path can be provided as a non-kwarg argument" + ) + kwargs["model_path"] = args[0] + return kwargs From 2e33e673cf897ee93689a6ade070b40503ee8df3 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 7 Feb 2024 14:24:22 -0500 Subject: [PATCH 11/16] [server] Add `model` argument to server cli (#1584) * update model path to be an argument; remove unused openai command pathway * add model path arg and option --- src/deepsparse/server/cli.py | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/src/deepsparse/server/cli.py b/src/deepsparse/server/cli.py index 5eacc748a0..acd7b6897c 100644 --- a/src/deepsparse/server/cli.py +++ b/src/deepsparse/server/cli.py @@ -79,6 +79,7 @@ ), ) +MODEL_ARG = click.argument("model", type=str, default=None, required=False) MODEL_OPTION = click.option( "--model_path", type=str, @@ -152,6 +153,7 @@ @PORT_OPTION @LOG_LEVEL_OPTION @HOT_RELOAD_OPTION +@MODEL_ARG @MODEL_OPTION @BATCH_OPTION @CORES_OPTION @@ -167,6 +169,7 @@ def main( log_level: str, hot_reload_config: bool, model_path: str, + model: str, batch_size: int, num_cores: int, num_workers: int, @@ -216,6 +219,17 @@ def main( ... 
``` """ + # the server cli can take a model argument or --model_path option + # if the --model_path option is provided, use that + # otherwise if the argument is given and --model_path is not used, use the + # argument instead + if model and model_path == "default": + model_path = model + + if integration == INTEGRATION_OPENAI: + if task is None or task != "text_generation": + task = "text_generation" + if ctx.invoked_subcommand is not None: return @@ -254,24 +268,6 @@ def main( server.start_server(host, port, log_level, hot_reload_config=hot_reload_config) -@main.command( - context_settings=dict( - token_normalize_func=lambda x: x.replace("-", "_"), show_default=True - ), -) -@click.argument("config-file", type=str) -@HOST_OPTION -@PORT_OPTION -@LOG_LEVEL_OPTION -@HOT_RELOAD_OPTION -def openai( - config_file: str, host: str, port: int, log_level: str, hot_reload_config: bool -): - - server = OpenAIServer(server_config=config_file) - server.start_server(host, port, log_level, hot_reload_config=hot_reload_config) - - @main.command( context_settings=dict( token_normalize_func=lambda x: x.replace("-", "_"), show_default=True From de64fe5f0d93b1d036a37c6cee22dcaff13663d9 Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Thu, 8 Feb 2024 15:31:42 -0500 Subject: [PATCH 12/16] make sure benchmark_* imports are the functions, not modules (#1593) --- src/deepsparse/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/deepsparse/__init__.py b/src/deepsparse/__init__.py index fab294bfaa..46c49b236d 100644 --- a/src/deepsparse/__init__.py +++ b/src/deepsparse/__init__.py @@ -40,6 +40,7 @@ from .subgraph_execute import * from .analyze import analyze from .evaluation.evaluator import evaluate -from .benchmark import benchmark_model, benchmark_pipeline +from .benchmark.benchmark_model import benchmark_model +from .benchmark.benchmark_pipeline import benchmark_pipeline _analytics.send_event("python__init") From 0ddeda269410cb71e2225dd96759f914a9a2e1af Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Fri, 9 Feb 2024 10:35:49 -0500 Subject: [PATCH 13/16] [server] Update readmes to no longer use the deprecated pathway + update pathway as per new UX docs (#1592) * update readmes to no longer use the depreciated pathway * update to get new ux workflows --- src/deepsparse/server/README.md | 12 +-- src/deepsparse/server/cli.py | 106 ++------------------------ src/deepsparse/transformers/README.md | 10 +-- src/deepsparse/yolact/README.md | 2 +- src/deepsparse/yolo/README.md | 2 +- 5 files changed, 16 insertions(+), 116 deletions(-) diff --git a/src/deepsparse/server/README.md b/src/deepsparse/server/README.md index d4beb3907f..1326ce677a 100644 --- a/src/deepsparse/server/README.md +++ b/src/deepsparse/server/README.md @@ -18,15 +18,15 @@ Usage: deepsparse.server [OPTIONS] COMMAND [ARGS]... 1. `deepsparse.server --config_file [OPTIONS] ` - 2. `deepsparse.server task [OPTIONS] + 2. `deepsparse.server --task [OPTIONS] Examples for using the server: `deepsparse.server --config_file server-config.yaml` - `deepsparse.server task question_answering --batch-size 2` + `deepsparse.server --task question_answering --batch-size 2` - `deepsparse.server task question_answering --host "0.0.0.0"` + `deepsparse.server --task question_answering --host "0.0.0.0"` Example config.yaml for serving: @@ -63,10 +63,6 @@ Usage: deepsparse.server [OPTIONS] COMMAND [ARGS]... Options: --help Show this message and exit. - -Commands: - config Run the server using configuration from a .yaml file. 
- task Run the server using configuration with CLI options, which can... ``` ---
Note on the latest server release
@@ -104,7 +100,7 @@ Example CLI command for serving a single model for the **question answering** ta ```bash deepsparse.server \ - task question_answering \ + --task question_answering \ --model_path "zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/12layer_pruned80_quant-none-vnni" ``` diff --git a/src/deepsparse/server/cli.py b/src/deepsparse/server/cli.py index acd7b6897c..6d3952c5f5 100644 --- a/src/deepsparse/server/cli.py +++ b/src/deepsparse/server/cli.py @@ -11,16 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -""" -There are two sub-commands for the server: -1. `deepsparse.server config [OPTIONS] ` -2. `deepsparse.server task [OPTIONS] -``` -""" - import os -import warnings from tempfile import TemporaryDirectory from typing import Optional, Union @@ -223,6 +214,7 @@ def main( # if the --model_path option is provided, use that # otherwise if the argument is given and --model_path is not used, use the # argument instead + if model and model_path == "default": model_path = model @@ -236,6 +228,10 @@ def main( if task is None and config_file is None: raise ValueError("Must specify either --task or --config_file. Found neither") + if config_file is not None: + server = _fetch_server(integration=integration, config=config_file) + server.start_server(host, port, log_level, hot_reload_config=hot_reload_config) + if task is not None: cfg = ServerConfig( num_cores=num_cores, @@ -263,98 +259,6 @@ def main( host, port, log_level, hot_reload_config=hot_reload_config ) - if config_file is not None: - server = _fetch_server(integration=integration, config=config_file) - server.start_server(host, port, log_level, hot_reload_config=hot_reload_config) - - -@main.command( - context_settings=dict( - token_normalize_func=lambda x: x.replace("-", "_"), show_default=True - ), -) -@click.argument("config-path", type=str) -@HOST_OPTION -@PORT_OPTION -@LOG_LEVEL_OPTION -@HOT_RELOAD_OPTION -def config( - config_path: str, host: str, port: int, log_level: str, hot_reload_config: bool -): - "[DEPRECATED] Run the server using configuration from a .yaml file." - warnings.simplefilter("always", DeprecationWarning) - warnings.warn( - "Using the `config` sub command is deprecated. " - "Use the `--config_file` argument instead.", - category=DeprecationWarning, - ) - - -@main.command( - context_settings=dict( - token_normalize_func=lambda x: x.replace("-", "_"), show_default=True - ), -) -@click.argument( - "task", - type=click.Choice(SupportedTasks.task_names(), case_sensitive=False), -) -@MODEL_OPTION -@BATCH_OPTION -@CORES_OPTION -@WORKERS_OPTION -@HOST_OPTION -@PORT_OPTION -@LOG_LEVEL_OPTION -@HOT_RELOAD_OPTION -@INTEGRATION_OPTION -def task( - task: str, - model_path: str, - batch_size: int, - num_cores: int, - num_workers: int, - host: str, - port: int, - log_level: str, - hot_reload_config: bool, - integration: str, -): - """ - [DEPRECATED] Run the server using configuration with CLI options, - which can only serve a single model. - """ - - warnings.simplefilter("always", DeprecationWarning) - warnings.warn( - "Using the `task` sub command is deprecated. 
" - "Use the `--task` argument instead.", - category=DeprecationWarning, - ) - - cfg = ServerConfig( - num_cores=num_cores, - num_workers=num_workers, - integration=integration, - endpoints=[ - EndpointConfig( - task=task, - name=f"{task}", - model=model_path, - batch_size=batch_size, - ) - ], - loggers={}, - ) - - with TemporaryDirectory() as tmp_dir: - config_path = os.path.join(tmp_dir, "server-config.yaml") - with open(config_path, "w") as fp: - yaml.dump(cfg.dict(), fp) - - server = _fetch_server(integration=integration, config=config_path) - server.start_server(host, port, log_level, hot_reload_config=hot_reload_config) - def _fetch_server(integration: str, config: Union[ServerConfig, str]): if isinstance(config, str): diff --git a/src/deepsparse/transformers/README.md b/src/deepsparse/transformers/README.md index 33cc4e758c..f1e38aee85 100644 --- a/src/deepsparse/transformers/README.md +++ b/src/deepsparse/transformers/README.md @@ -118,7 +118,7 @@ inference = qa_pipeline(question="What's my name?", context="My name is Snorlax" Spinning up: ```bash deepsparse.server \ - task question-answering \ + --task question-answering \ --model_path "zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/12layer_pruned80_quant-none-vnni" ``` @@ -162,7 +162,7 @@ inference = opt_pipeline("Who is the president of the United States?") Spinning up: ```bash deepsparse.server \ - task text-generation \ + --task text-generation \ --model_path zoo:opt-1.3b-opt_pretrain-pruned50_quantW8A8 ``` @@ -210,7 +210,7 @@ inference = sa_pipeline("I hate it!") Spinning up: ```bash deepsparse.server \ - task sentiment-analysis \ + --task sentiment-analysis \ --model_path "zoo:nlp/sentiment_analysis/bert-base/pytorch/huggingface/sst2/pruned80_quant-none-vnni" ``` @@ -263,7 +263,7 @@ inference = tc_pipeline( Spinning up: ```bash deepsparse.server \ - task text-classification \ + --task text-classification \ --model_path "zoo:nlp/text_classification/distilbert-none/pytorch/huggingface/qqp/pruned80_quant-none-vnni" ``` @@ -316,7 +316,7 @@ inference = tc_pipeline("Drive from California to Texas!") Spinning up: ```bash deepsparse.server \ - task token-classification \ + --task token-classification \ --model_path "zoo:nlp/token_classification/bert-base/pytorch/huggingface/conll2003/pruned90-none" ``` diff --git a/src/deepsparse/yolact/README.md b/src/deepsparse/yolact/README.md index e106f4ae75..f4013d6fa3 100644 --- a/src/deepsparse/yolact/README.md +++ b/src/deepsparse/yolact/README.md @@ -121,7 +121,7 @@ If a `--model_filepath` arg isn't provided, then `zoo:cv/segmentation/yolact-dar Spinning up: ```bash deepsparse.server \ - task yolact \ + --task yolact \ --model_path "zoo:cv/segmentation/yolact-darknet53/pytorch/dbolya/coco/pruned82_quant-none" ``` diff --git a/src/deepsparse/yolo/README.md b/src/deepsparse/yolo/README.md index 0802c2589a..cfbcbfe431 100644 --- a/src/deepsparse/yolo/README.md +++ b/src/deepsparse/yolo/README.md @@ -120,7 +120,7 @@ If a `--model_filepath` arg isn't provided, then `zoo:cv/detection/yolov5-s/pyto Spinning up: ```bash deepsparse.server \ - task yolo \ + --task yolo \ --model_path "zoo:cv/detection/yolov5-s/pytorch/ultralytics/coco/pruned_quant-aggressive_94" ``` From 517fd15a909d138c025cd2e99c99d015f3e25b10 Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Fri, 9 Feb 2024 18:11:41 +0100 Subject: [PATCH 14/16] [Feature Branch][DeepSparse Evaluation API] Update lm-eval, perplexity, additional datasets (#1580) --- setup.py | 3 +- 
src/deepsparse/evaluation/cli.py | 14 +- src/deepsparse/evaluation/evaluator.py | 1 - .../evaluation/integrations/__init__.py | 6 +- .../integrations/lm_evaluation_harness.py | 432 +++++++++--------- .../evaluation/integrations/perplexity.py | 278 +++++++++++ src/deepsparse/evaluation/registry.py | 2 +- src/deepsparse/evaluation/results.py | 4 +- src/deepsparse/evaluation/utils.py | 83 +++- src/deepsparse/transformers/metrics.py | 2 +- .../transformers/utils/eval_helpers.py | 34 +- .../test_lm_evaluation_harness.py | 136 ++++-- .../integrations/test_perplexity.py | 132 ++++++ tests/deepsparse/evaluation/test_evaluator.py | 17 +- 14 files changed, 852 insertions(+), 292 deletions(-) create mode 100644 src/deepsparse/evaluation/integrations/perplexity.py create mode 100644 tests/deepsparse/evaluation/integrations/test_perplexity.py diff --git a/setup.py b/setup.py index 8fe04d23be..d9c8dffd7d 100644 --- a/setup.py +++ b/setup.py @@ -149,6 +149,7 @@ def _parse_requirements_file(file_path): "datasets<2.16", "accelerate<0.26", "seqeval", + "evaluate", ] _sentence_transformers_integration_deps = ["optimum-deepsparse"] + _torch_deps @@ -308,7 +309,7 @@ def _setup_entry_points() -> Dict: f"deepsparse.image_classification.eval={ic_eval}", "deepsparse.license=deepsparse.license:main", "deepsparse.validate_license=deepsparse.license:validate_license_cli", - "deepsparse.eval=deepsparse.evaluation.cli:main", + "deepsparse.evaluate=deepsparse.evaluation.cli:main", ] } diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index b68d32d4e5..d192dd67a1 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -20,7 +20,7 @@ Module for evaluating models on the various evaluation integrations OPTIONS: - --model_path MODEL_PATH + MODEL_PATH A path to an ONNX model, local directory containing ONNX model (including all the auxiliary files) or a SparseZoo stub -d DATASET, --dataset DATASET @@ -72,7 +72,7 @@ from deepsparse.evaluation.evaluator import evaluate from deepsparse.evaluation.results import Result, save_result -from deepsparse.evaluation.utils import args_to_dict, get_save_path +from deepsparse.evaluation.utils import get_save_path, parse_kwarg_tuples from deepsparse.operators.engine_operator import ( DEEPSPARSE_ENGINE, ORT_ENGINE, @@ -88,12 +88,10 @@ ignore_unknown_options=True, ) ) -@click.option( - "--model_path", +@click.argument( + "model_path", type=click.Path(dir_okay=True, file_okay=True), required=True, - help="A path to an ONNX model, local directory containing ONNX model" - "(including all the auxiliary files) or a SparseZoo stub", ) @click.option( "-d", @@ -178,7 +176,7 @@ def main( # join datasets to a list if multiple datasets are passed datasets = list(dataset) if not isinstance(dataset, str) else dataset # format kwargs to a dict - integration_args = args_to_dict(integration_args) + integration_args = parse_kwarg_tuples(integration_args) _LOGGER.info( f"Creating {engine_type} pipeline to evaluate from model path: {model_path}" @@ -203,7 +201,7 @@ def main( **integration_args, ) - _LOGGER.info(f"Evaluation done. Results:\n{result}") + _LOGGER.info(f"Evaluation done. 
Results:\n{result.formatted}") save_path = get_save_path( save_path=save_path, diff --git a/src/deepsparse/evaluation/evaluator.py b/src/deepsparse/evaluation/evaluator.py index b513f07563..3d18f8489f 100644 --- a/src/deepsparse/evaluation/evaluator.py +++ b/src/deepsparse/evaluation/evaluator.py @@ -65,7 +65,6 @@ def evaluate( return eval_integration( pipeline=pipeline, datasets=datasets, - engine_type=engine_type, batch_size=batch_size, splits=splits, metrics=metrics, diff --git a/src/deepsparse/evaluation/integrations/__init__.py b/src/deepsparse/evaluation/integrations/__init__.py index 1cc3bfacf0..f0871f135a 100644 --- a/src/deepsparse/evaluation/integrations/__init__.py +++ b/src/deepsparse/evaluation/integrations/__init__.py @@ -15,7 +15,7 @@ # flake8: noqa: F401 -def try_import_lm_evaluation_harness(raise_error=False): +def try_import_lm_evaluation_harness(raise_error=True): try: import lm_eval @@ -24,11 +24,11 @@ def try_import_lm_evaluation_harness(raise_error=False): if raise_error: raise ImportError( "Unable to import lm_eval. " - "To install run 'pip install " - "git+https://github.com/EleutherAI/lm-evaluation-harness@b018a7d51'" + "To install run 'pip install lm-eval==0.4.0'" ) return False if try_import_lm_evaluation_harness(raise_error=False): from .lm_evaluation_harness import * +from .perplexity import * diff --git a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py index 2f8c7b8cef..69934af37a 100644 --- a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py +++ b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py @@ -13,35 +13,39 @@ # limitations under the License. """ -Integration of the `lm_evaluation_harness`: +Integration of the `lm-evaluation-harness`: https://github.com/EleutherAI/lm-evaluation-harness """ - -import json import logging from typing import Any, Dict, List, Optional, Tuple, Union import numpy -from pydantic import BaseModel, Field from tqdm import tqdm -import torch from deepsparse import Pipeline from deepsparse.evaluation.registry import EvaluationRegistry from deepsparse.evaluation.results import Dataset, Evaluation, Metric, Result -from lm_eval import base, evaluator, tasks, utils +from deepsparse.evaluation.utils import LM_EVALUATION_HARNESS +from deepsparse.utils.data import numpy_log_softmax +from lm_eval import evaluator, tasks, utils +from lm_eval.api.instance import Instance +from lm_eval.api.model import LM + +tasks.initialize_tasks("INFO") _LOGGER = logging.getLogger(__name__) __all__ = ["integration_eval"] -@EvaluationRegistry.register(name="lm-evaluation-harness") +@EvaluationRegistry.register(name=LM_EVALUATION_HARNESS, alias="lm-eval-harness") def integration_eval( - model: Any, + pipeline: Pipeline, datasets: Union[List[str], str], - batch_size: int, + batch_size: int = 1, + splits: Union[List[str], str, None] = None, + metrics: Union[List[str], str, None] = None, **kwargs, ) -> Result: """ @@ -49,101 +53,53 @@ def integration_eval( https://github.com/EleutherAI/lm-evaluation-harness/blob/master/main.py that is compatible with deepsparse.evaluator.py - :param model: the model/pipeline to evaluate + :param pipeline: the model/pipeline to evaluate :param datasets: the datasets to evaluate on :param batch_size: the batch size to use for evaluation :param kwargs: additional arguments to alter the behavior of the evaluation :return the evaluation results """ - # [START] - # The code that sets up the interface between deepsparse and 
lm_evaluation_harness - if isinstance(model, Pipeline): - # If the model is a Pipeline, we need to wrap - # it in a DeepSparseLM object - model = DeepSparseLM( - pipeline=model, - batch_size=batch_size, - max_gen_toks=kwargs.get("max_gen_toks"), - ) + pipeline = DeepSparseLM(pipeline=pipeline, batch_size=batch_size) datasets = (",").join(datasets) if isinstance(datasets, list) else datasets - # [END] - - # [START] - # The code below is being adapted from: - # https://github.com/EleutherAI/lm-evaluation-harness/blob/master/main.py - if kwargs.get("limit"): - _LOGGER.warning( - "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. " - "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT." - ) - - if datasets is None: - task_names = tasks.ALL_TASKS - else: - task_names = utils.pattern_match(datasets.split(","), tasks.ALL_TASKS) + task_names = utils.pattern_match(datasets.split(","), tasks.ALL_TASKS) _LOGGER.info(f"Selected Tasks: {task_names}") - description_dict = {} - if kwargs.get("description_dict_path"): - with open(kwargs.get("description_dict_path"), "r") as f: - description_dict = json.load(f) - - evaluator_input = EvaluatorInputSchema( - model=model, - tasks=task_names, - description_dict=description_dict, - batch_size=batch_size, - **kwargs, + results_raw = evaluator.simple_evaluate( + model=pipeline, tasks=task_names, batch_size=batch_size, **kwargs ) - results_raw = evaluator.simple_evaluate(**evaluator_input.dict()) - results = Result( - raw=dict(output=results_raw, input=filter_evaluator_input(evaluator_input)), + raw=results_raw, formatted=format_raw_results(results_raw), ) return results -def filter_evaluator_input( - evaluator_input: "EvaluatorInputSchema", -) -> Dict[str, Any]: # noqa: F821 - """ - Filter the evaluator input to remove the model field. - The model field is a complex object that cannot be serialized. - - :param evaluator_input: the evaluator input to filter - :return: the filtered evaluator input - """ - evaluator = evaluator_input.dict() - del evaluator["model"] - - return evaluator - - def format_raw_results(results: Dict[str, Any]) -> List[Evaluation]: """ Format the raw results from lm_evaluation_harness into a list of Evaluation objects. - :param results: the raw results from lm_evaluation_harness + :param results: the raw results from lm-evaluation-harness :return: the formatted results as a list of Evaluation objects """ formatted_results = [] for dataset_name, dataset_result in results["results"].items(): metrics = [] for metric_name, metric_value in dataset_result.items(): + if isinstance(metric_value, str): + continue metric = Metric(name=metric_name, value=metric_value) metrics.append(metric) dataset = Dataset( type=None, name=dataset_name, config=results["config"], split=None ) evaluation = Evaluation( - task="lm_evaluation_harness", + task=LM_EVALUATION_HARNESS, dataset=dataset, metrics=metrics, samples=None, @@ -152,177 +108,241 @@ def format_raw_results(results: Dict[str, Any]) -> List[Evaluation]: return formatted_results -class EvaluatorInputSchema(BaseModel): - model: Any = Field(description="The name of the model.") - tasks: List[str] = Field( - description="The task (or multiple tasks) to evaluate the target on." - ) - description_dict: Optional[Dict[str, Any]] = Field( - None, description="Description dict." - ) - batch_size: int = Field(description="The batch size to use for evaluation.") - model_args: str = Field( - "", description="Additional arguments for the evaluated model." 
- ) - num_fewshot: int = Field(0, description="The number of few shots to use.") - max_batch_size: Optional[int] = Field( - None, description="Maximal batch size to try with --batch_size auto." - ) - device: Optional[str] = Field(None, description="Device to use for evaluation.") - no_cache: bool = Field(False, description="Include this flag to prevent caching.") - limit: Optional[float] = Field( - None, - description="Limit the number of examples per task. If <1, " - "limit is a percentage of the total number of " - "examples.", - ) - decontamination_ngrams_path: Optional[str] = Field( - None, description="Specify the path for decontamination n-grams." - ) - check_integrity: bool = Field( - False, description="Include this flag to check integrity." - ) - write_out: bool = Field(False, description="Include this flag to write out.") - output_base_path: Optional[str] = Field( - None, description="Specify the output base path." - ) - - -class DeepSparseLM(base.BaseLM): +class DeepSparseLM(LM): def __init__( self, pipeline: Pipeline, - tokenizer: Optional[str] = None, batch_size: int = 1, - max_gen_toks: Optional[int] = None, + max_gen_toks: int = 256, + tokenizer: Optional["AutoTokenizer"] = None, # noqa: F821 ): """ Wrapper around the DeepSparse pipeline to make it compatible with the llm-evaluation-harness. + + :param pipeline: the pipeline object to wrap + :param batch_size: the batch size to use for evaluation + :param max_gen_toks: the maximum number of tokens to generate + when using the model for generation (see: greed_until method) + :param tokenizer: the tokenizer to use for encoding and decoding + strings and tokens. By default, the tokenizer from the pipeline """ super().__init__() - # Initialize new model and tokenizer instances - self.model = pipeline - self.tokenizer = tokenizer if tokenizer else self.model.tokenizer - - self._batch_size = batch_size + self.pipeline = pipeline + self.batch_size = batch_size + self.tokenizer = tokenizer or pipeline.tokenizer self._max_length = pipeline.sequence_length - self._max_gen_toks = max_gen_toks or 256 + self._max_gen_toks = max_gen_toks + self.batch_sizes = {} - self.vocab_size = self.tokenizer.vocab_size + def tok_encode(self, string: str) -> List[int]: + return self.tokenizer.encode(string) - def _model_call(self, inps) -> torch.Tensor: + def tok_decode(self, tokens: List[int]) -> str: + return self.tokenizer.decode(tokens) + + @property + def max_length(self) -> int: + return self._max_length + + @property + def max_gen_toks(self) -> int: + return self._max_gen_toks + + def loglikelihood(self, requests) -> List[Tuple[float, bool]]: + """ + Copied directly from + https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py + """ + new_reqs = [] + for context, continuation in [req.args for req in requests]: + if context == "": + raise NotImplementedError( + "Implementing empty context is not supported yet" + ) + context_enc, continuation_enc = self._encode_pair(context, continuation) + + new_reqs.append(((context, continuation), context_enc, continuation_enc)) + + return self._loglikelihood_tokens(new_reqs) + + def _loglikelihood_tokens( + self, + requests: List[Tuple[Tuple[str, str], List[int], List[int]]], + disable_tqdm: bool = False, + ) -> List[Tuple[float, bool]]: """ - Override the _model_call method to use the DeepSparse pipeline for - logits generation. + The function to compute the loglikelihood of the continuation + tokens given the context tokens. 
- inps: a torch tensor of shape [batch, sequence] - the size of sequence may vary from call to call - returns: a torch tensor of shape [batch, sequence, vocab] with the - logits returned from the model + This function is an adapted version of the original function from + https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py """ - # Encode the tokens to strings - prompt = self.model.tokenizer.batch_decode(inps.numpy()) - - # Run the model to map the prompt to logits - out = self.model( - prompt=prompt, - max_new_tokens=0, - include_prompt_logits=True, - output_scores=True, - ) - logits_numpy = numpy.stack([generation.score for generation in out.generations]) - return torch.from_numpy(logits_numpy) + res = [] - def greedy_until( - self, requests: List[Tuple[str, Union[List[str], str]]] - ) -> List[str]: def _collate(x): - tokens = self.tok_encode(x[0]) - return len(tokens), x[0] + """Defines the key for the sorted method""" + toks = x[1] + x[2] + return -len(toks), tuple(toks) - results = [] - reorder = utils.Reorderer(requests, _collate) + re_ord = utils.Reorderer(requests, _collate) - for chunk in utils.chunks( - tqdm(reorder.get_reordered(), disable=False), - self.batch_size, + for chunk in tqdm( + list(utils.chunks(re_ord.get_reordered(), self.batch_size)), + disable=disable_tqdm, ): - context = [c[0] for c in chunk] - request_args = chunk[0][1] - stop = request_args.get("until", None) - stop_sequences = stop if isinstance(stop, list) else [stop] - max_generation_length = request_args.get("max_length", None) - - assert ( - isinstance(max_generation_length, int) or max_generation_length is None - ) - assert isinstance(stop_sequences, list) or stop_sequences is None - - # TODO: Find a better way to handle stop sequences for 0-shot. 
- if stop_sequences is None: - until = [self.eot_token] - else: - until = stop_sequences + [self.eot_token] - - if max_generation_length is None: - max_tokens = self.max_gen_toks - else: - max_tokens = max_generation_length - - responses = self.model( - sequences=context, - max_new_tokens=max_tokens, - stop=until, - do_sample=False, + batch_inp = [] + batch_cache_key = [] + batch_continuation_enc = [] + # len(chunk) is the batch_size + for cache_key, context_enc, continuation_enc in chunk: + # how this all works (illustrated on a causal decoder-only setup): + # CTX CONT + # inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1] + # model \ \ + # logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the + # cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice # noqa: E501 + + inp = (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1] + + batch_inp.append(self.tokenizer.decode(inp)) + batch_cache_key.append(cache_key) + batch_continuation_enc.append(continuation_enc) + + response = self.pipeline( + prompt=batch_inp, + max_new_tokens=0, + output_scores=True, + include_prompt_logits=True, ) - responses = responses if type(responses) is list else [responses] + for resp, continuation_enc, cache_key in zip( + response.generations, batch_continuation_enc, batch_cache_key + ): + # (seq_len, vocab_size) + multi_scores = resp.score + # (seq_len, vocab_size) but with softmax applied + multi_logits = numpy_log_softmax(multi_scores, axis=1) + # toss out the context half of the sequence + # (cont_len, vocab_size) + continuation_multi_logits = multi_logits[-len(continuation_enc) :] + + # pick out the logits for the continuation tokens + # (cont_len,) + continuation_logits = continuation_multi_logits[ + numpy.arange(len(continuation_enc)), continuation_enc + ] + # check if the tokens generated greedly are the same + # as the expected continuation + greedy_tokens = continuation_multi_logits.argmax(axis=1) + max_equal = greedy_tokens.tolist() == continuation_enc + + # Answer: (log prob, is-exact-match) + answer = (float(continuation_logits.sum()), bool(max_equal)) + + res.append(answer) + + if cache_key is not None: + self.cache_hook.add_partial("loglikelihood", cache_key, answer) + + return re_ord.get_original(res) + + def loglikelihood_rolling( + self, requests: list[Instance] + ) -> list[tuple[float, bool]]: + raise NotImplementedError( + "The method not required by any of our " "current task integrations so far" + ) + + def generate_until(self, requests: list[Instance]) -> list[str]: + """ + The function to generate a certain number of new tokens + given a context. + + This function is an adapted version of the original function from + https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/openai_completions.py + """ + if not requests: + return [] + res = [] + requests = [req.args for req in requests] - for response in responses: - response = response.generations[0].text - # Ensure the generated responses do not contain the stop sequences. 
- for term in until: - response = response.split(term)[0] - # partial caching - self.cache_hook.add_partial("greedy_until", (context, until), response) - results.append(response) + def _collate(x): + toks = self.tok_encode(x[0]) + return len(toks), x[0] + + re_ord = utils.Reorderer(requests, _collate) + + def sameuntil_chunks(xs, size): + ret = [] + lastuntil = xs[0][1] + for x in xs: + if len(ret) >= size or x[1] != lastuntil: + yield ret, lastuntil + ret = [] + lastuntil = x[1] + ret.append(x) + + if ret: + yield ret, lastuntil + + pbar = tqdm(total=len(requests)) + for chunk, request_args in tqdm( + list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size)) + ): + inps = [] - return reorder.get_original(results) + self._max_gen_toks = request_args.pop("max_gen_toks", self.max_gen_toks) - def _model_generate(self, context, max_length, eos_token_id): - # Isn't used because we override greedy_until - raise NotImplementedError() + for context, _ in chunk: + # add context (prompts) to the list + inps.append(context) - @property - def eot_token(self) -> str: - return self.tokenizer.eos_token + until = request_args.pop("until", ["<|endoftext|>"]) + request_args.pop("do_sample", None) + request_args["temperature"] = request_args.get("temperature", 0) - @property - def eot_token_id(self) -> int: - return self.tokenizer.eos_token_id + # run inference (generate max_gen_toks tokens) + out = self.pipeline( + sequences=inps, + max_new_tokens=self.max_gen_toks - 1, + stop=until, + **request_args, + ) - @property - def max_length(self): - return self._max_length + for resp, (context, args_) in zip(out.generations, chunk): + text = resp.text + until_ = until + # split the text at the first occurrence of any of the until tokens + for term in until_: + if len(term) > 0: + text = text.split(term)[0] - @property - def max_gen_toks(self): - return self._max_gen_toks + res.append(text) - @property - def batch_size(self): - # should return self._batch_size but the - # TextGeneration model does not support batch_size > 1 - return 1 + self.cache_hook.add_partial( + "generate_until", (context, {"until": until_}), text + ) + pbar.update(1) - @property - def device(self): - pass + pbar.close() - def tok_encode(self, string: str): - return self.tokenizer.encode(string, add_special_tokens=False) + return re_ord.get_original(res) - def tok_decode(self, tokens): - return self.tokenizer.decode(tokens) + def _encode_pair( + self, context: str, continuation: str + ) -> Tuple[List[int], List[int]]: + """ + Copied directly from + https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py + """ + n_spaces = len(context) - len(context.rstrip()) + if n_spaces > 0: + continuation = context[-n_spaces:] + continuation + context = context[:-n_spaces] + whole_enc = self.tok_encode(context + continuation) + context_enc = self.tok_encode(context) + context_enc_len = len(context_enc) + continuation_enc = whole_enc[context_enc_len:] + return context_enc, continuation_enc diff --git a/src/deepsparse/evaluation/integrations/perplexity.py b/src/deepsparse/evaluation/integrations/perplexity.py new file mode 100644 index 0000000000..a9a3f3d8a3 --- /dev/null +++ b/src/deepsparse/evaluation/integrations/perplexity.py @@ -0,0 +1,278 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +from collections import defaultdict +from typing import Any, Dict, List, Optional, Union + +import numpy +from tqdm import tqdm + +from datasets import load_dataset +from deepsparse import Pipeline +from deepsparse.evaluation.registry import EvaluationRegistry +from deepsparse.evaluation.results import Dataset, Evaluation, Metric, Result +from deepsparse.evaluation.utils import PERPLEXITY +from deepsparse.transformers.metrics import Perplexity +from deepsparse.transformers.pipelines.text_generation import TextGenerationPipeline +from deepsparse.transformers.pipelines.text_generation.pipeline_no_kv_cache import ( + TextGenerationPipelineNoCache, +) +from deepsparse.transformers.utils.eval_helpers import ( + HumanEvalIteratorWrapper, + process_concatenated_datasets, +) + + +""" +Integration for the evaluation module +that computes the perplexity of a model on a dataset +""" +_LOGGER = logging.getLogger(__name__) + + +@EvaluationRegistry.register(name=PERPLEXITY) +def integration_eval( + pipeline: Pipeline, + datasets: Union[List[str], str] = "openai_humaneval", + batch_size: int = 1, + limit: Optional[int] = None, + accumulate: Optional[bool] = None, + splits: Union[List[str], str, None] = "test", + metrics: Union[List[str], str, None] = None, + **kwargs, +) -> Result: + """ + A function that computes the perplexity of a pipeline given a set + of dataset names. + + :param pipeline: the pipeline to evaluate. The assumed pipeline + is a TextGenerationPipeline, either with or without the KV + cache support + :param datasets: the names of dataset(s) to evaluate on + :param batch_size: the batch size to use for evaluation + :param splits: the split of the dataset to evaluate on. Default is "test" + :param metrics: the metrics to compute. Default is None + :param limit: the number of batches to evaluate on. Default is None + (evaluates on entire dataset) + :param accumulate: whether to perplexity computation should + accumulate negative log-likelihood over samples. Defaults to + the default accumulate variable inferred from the dataset in + `datasets`. If not None, it will override the inferred accumulate + variable. + :return: a Result object containing the raw and formatted results + """ + metrics = metrics or PERPLEXITY + if metrics != PERPLEXITY: + raise ValueError(f"Invalid metric {metrics} for perplexity evaluation") + if splits is None: + splits = "test" + _LOGGER.info("Argument `splits` is None. Defaulting to `test` split.") + datasets = datasets if isinstance(datasets, list) else [datasets] + results_raw = defaultdict(str) + for dataset_name in datasets: + results_raw[dataset_name] = defaultdict() + dataset, _accumulate = load_perplexity_dataset( + dataset_name=dataset_name, splits=splits, pipeline=pipeline, **kwargs + ) + if accumulate is None: + accumulate = _accumulate + else: + _LOGGER.info( + f"Argument `accumulate` set to {accumulate}. " + "Overriding the inferred accumulate variable from the dataset." 
+ ) + + perplexity = run_perplexity( + pipeline=pipeline, + dataset=dataset, + batch_size=batch_size, + accumulate=accumulate, + limit=limit, + ) + + results_raw[dataset_name] = defaultdict() + results_raw[dataset_name]["results"] = perplexity + results_raw[dataset_name]["split"] = splits + + results = Result( + # omit storing raw results. they can potentially + # contain numpy arrays that are not serializable. + # all the information is stored in the formatted results + raw=None, + formatted=format_raw_results(results_raw), + ) + + return results + + +def run_perplexity( + pipeline: Union[TextGenerationPipelineNoCache, TextGenerationPipeline], + dataset: "Dataset", + batch_size: int, + accumulate: bool, + limit: Optional[int] = None, +) -> Dict[str, Any]: + """ + Compute the perplexity of a pipeline given a dataset. + + :param pipeline: the pipeline to evaluate. The assumed pipeline + is a TextGenerationPipeline, either with or without the KV + cache support + :param dataset: the dataset to evaluate on + :param batch_size: the batch size to use for evaluation + :param accumulate: whether to perplexity computation should + accumulate negative log-likelihood over samples + :param limit: the number of batches to evaluate on. Default is None + (evaluates on entire dataset) + + :return: a dictionary containing the perplexity results + """ + + perplexity = Perplexity(accumulate=accumulate) + + batch = [] + for idx, sample in _enumerate_progress( + dataset, max_steps=None if limit is None else limit * batch_size + ): + + if limit is not None: + # stop if we have reached the #limit + # number of batches to be processed + if idx >= limit * batch_size: + break + + batch.append(sample) + + if len(batch) == batch_size: + if isinstance(pipeline, TextGenerationPipelineNoCache): + out = pipeline( + prompt=batch, + output_scores=True, + include_prompt_logits=True, + return_input_tokens=True, + ) + else: + out = pipeline( + prompt=batch, + output_scores=True, + max_new_tokens=0, + include_prompt_logits=True, + return_input_tokens=True, + ) + + for s in range(batch_size): + # Need to remove tokens that were masked + input_ids = out.input_tokens["input_ids"][s].flatten() + attention_mask = out.input_tokens["attention_mask"][s].flatten() + logits = out.generations[s].score + if batch_size > 1 and isinstance( + pipeline, TextGenerationPipelineNoCache + ): + logits = logits[-attention_mask.sum() :, :] + + logits = numpy.compress(attention_mask, logits, axis=0)[:-1, :] + input_ids = numpy.compress(attention_mask, input_ids)[1:] + + # Add predictions (logits) and targets (input_ids) to metric + perplexity.add_batch(logits, input_ids) + + batch.clear() + + return perplexity.compute() + + +def format_raw_results(results: Dict[str, Any]) -> List[Evaluation]: + """ + Format the raw perplexity results into a list of + Evaluation objects. 
+ + :param results: the raw results from perplexity computation + :return: the formatted results as a list of Evaluation objects + """ + formatted_results = [] + for dataset_name, dataset_result in results.items(): + metrics = [] + for metric_name, metric_value in dataset_result["results"].items(): + if isinstance(metric_value, numpy.ndarray): + metric_value = metric_value.tolist() + metric = Metric(name=metric_name, value=metric_value) + metrics.append(metric) + dataset = Dataset(type=None, name=dataset_name, split=dataset_result["split"]) + evaluation = Evaluation( + task="perplexity", + dataset=dataset, + metrics=metrics, + samples=None, + ) + formatted_results.append(evaluation) + return formatted_results + + +def load_perplexity_dataset( + dataset_name: str, + splits: Union[List[str], str] = "test", + pipeline: Optional[Pipeline] = None, + **kwargs, +): + """ + Function to load the dataset for perplexity computation. + Eventually we want to load the dataset from the nm_utils + + :param dataset_name: the name of the dataset to load + :param splits: the splits to load from the dataset. Default is "test" + :param pipeline: the pipeline to use for loading the dataset. The pipeline + is used to infer the model path and sequence length to use for loading + the dataset. This argument can be omitted if the appropriate kwargs + are provided, or if the dataset does not require a process_concatenated_datasets + function to load the dataset. + :param kwargs: additional keyword arguments to pass to the dataset loading function + :return: the dataset and whether to accumulate perplexity over samples + """ + if isinstance(splits, list): + raise NotImplementedError("Evaluation on multiple splits not implemented") + + if dataset_name == "openai_humaneval": + dataset = load_dataset(dataset_name, split=splits) + dataset = HumanEvalIteratorWrapper(dataset) + accumulate = False + elif dataset_name in {"wikitext2", "c4"}: + # fetch max_sequence_length from pipeline if not provided + max_sequence_length = kwargs.pop("max_sequence_length", None) + if max_sequence_length is None and pipeline is not None: + max_sequence_length = pipeline.sequence_length + + # fetch model_path from pipeline if not provided + model_path = kwargs.pop("model_path", None) + if model_path is None and pipeline is not None: + model_path = os.path.dirname(pipeline.model_path) + + dataset = process_concatenated_datasets( + dataset_name, + model_path=model_path, + max_sequence_length=max_sequence_length, + split=splits, + **kwargs, + ) + accumulate = True + else: + raise NotImplementedError(f"Dataset {dataset_name} not implemented") + + return dataset, accumulate + + +def _enumerate_progress(dataset, max_steps): + progress_bar = tqdm(dataset, total=max_steps) if max_steps else tqdm(dataset) + return enumerate(progress_bar) diff --git a/src/deepsparse/evaluation/registry.py b/src/deepsparse/evaluation/registry.py index 2daabb69cc..343cd9786c 100644 --- a/src/deepsparse/evaluation/registry.py +++ b/src/deepsparse/evaluation/registry.py @@ -57,7 +57,7 @@ def resolve( if integration is None: _LOGGER.info( - "No integration specified, inferring the evaluation" + "No integration specified, inferring the evaluation " "function from the input arguments..." 
) integration = resolve_integration(pipeline, datasets) diff --git a/src/deepsparse/evaluation/results.py b/src/deepsparse/evaluation/results.py index 00212d0a1e..78c4bbd501 100644 --- a/src/deepsparse/evaluation/results.py +++ b/src/deepsparse/evaluation/results.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, List, Optional +from typing import Any, List, Optional, Union import yaml from pydantic import BaseModel, Field @@ -32,7 +32,7 @@ class Metric(BaseModel): name: str = Field(description="Name of the metric") - value: float = Field(description="Value of the metric") + value: Union[float, List[float]] = Field(description="Value of the metric") class Dataset(BaseModel): diff --git a/src/deepsparse/evaluation/utils.py b/src/deepsparse/evaluation/utils.py index 87475dd5d2..6e5ade9344 100644 --- a/src/deepsparse/evaluation/utils.py +++ b/src/deepsparse/evaluation/utils.py @@ -11,21 +11,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import ast +import logging import os -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Union from deepsparse import Pipeline +from deepsparse.operators.engine_operator import DEEPSPARSE_ENGINE __all__ = [ "create_pipeline", "get_save_path", - "args_to_dict", + "parse_kwarg_tuples", "resolve_integration", ] +_LOGGER = logging.getLogger(__name__) LM_EVALUATION_HARNESS = "lm-evaluation-harness" +PERPLEXITY = "perplexity" def potentially_check_dependency_import(integration_name: str) -> bool: @@ -38,10 +42,14 @@ def potentially_check_dependency_import(integration_name: str) -> bool: :return: True if the dependency is installed, False otherwise """ - if integration_name.replace("_", "-") == LM_EVALUATION_HARNESS: + if integration_name == LM_EVALUATION_HARNESS: from deepsparse.evaluation.integrations import try_import_lm_evaluation_harness - try_import_lm_evaluation_harness(raise_error=True) + try_import_lm_evaluation_harness() + if integration_name == PERPLEXITY: + from deepsparse.evaluation.integrations.perplexity import ( # noqa F401 + integration_eval, + ) return True @@ -79,24 +87,66 @@ def if_generative_language_model(pipeline: Pipeline) -> bool: return False -def args_to_dict(args: Tuple[Any, ...]) -> Dict[str, Any]: +def parse_kwarg_tuples(kwargs: tuple) -> Dict: """ - Convert a tuple of args to a dict of args. - - :param args: The args to convert. Should be a tuple of alternating - arg names and arg values e.g.('--arg1', 1, 'arg2', 2, -arg3', 3). + Convert a tuple of kwargs to a dict of kwargs. + This function is used to enable the click parsing of kwargs. + + Example use: + ``` + @click.command( + context_settings=dict( + ignore_unknown_options=True) + ) + @click.argument(...) + @click.option(...) + ... + @click.argument("kwargs", nargs=-1, type=click.UNPROCESSED) + def main(..., kwargs): + ... + kwargs: Dict[str, Any] = parse_kwarg_tuples(kwargs: Tuple) + ``` + + Example inputs, outputs: + ``` + input = ('--arg1', 1, 'arg2', 2, '-arg3', 3) + output = parse_kwarg_tuples(input) + output = {'arg1': 1, 'arg2': 2, 'arg3': 3} + ``` + + :param kwargs: The kwargs to convert. Should be a tuple of alternating + kwargs names and kwargs values e.g.('--arg1', 1, 'arg2', 2, -arg3', 3). The names can optionally have a '-' or `--` in front of them. 
- :return: The converted args as a dict. + :return: The converted kwargs as a dict. """ - if len(args) == 0: + if len(kwargs) == 0: return {} + if len(kwargs) % 2 != 0: + raise ValueError( + "kwargs must be a tuple of alternating names and values " + "i.e. the length of kwargs tuple must be even. Received " + f"kwargs: {kwargs}" + ) # names are uneven indices, values are even indices - args_names = args[0::2] - args_values = args[1::2] + kwargs_names = kwargs[0::2] + kwargs_values = kwargs[1::2] + # by default kwargs values are strings, so convert them + # to the appropriate type if possible + kwargs_values = list(kwargs_values) + for i, value in enumerate(kwargs_values): + try: + kwargs_values[i] = ast.literal_eval(value) + except Exception as e: # noqa E841 + _LOGGER.debug( + f"Failed to infer non-string type" + f"from kwarg value: {value}. It will" + f"be left as a string." + ) + # remove any '-' or '--' from the names - args_names = [name.lstrip("-") for name in args_names] + kwargs_names = [name.lstrip("-") for name in kwargs_names] - return dict(zip(args_names, args_values)) + return dict(zip(kwargs_names, kwargs_values)) def get_save_path( @@ -143,6 +193,7 @@ def create_pipeline( :param engine_type: The engine type to initialize the model with. :return: The initialized pipeline """ + engine_type = engine_type or DEEPSPARSE_ENGINE return Pipeline.create( task=kwargs.pop("task", "text-generation"), model_path=model_path, diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index b90c4dd744..0e7c24c8b6 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -20,7 +20,7 @@ import numpy -from deepsparse.utils import numpy_log_softmax +from deepsparse.utils.data import numpy_log_softmax __all__ = [ diff --git a/src/deepsparse/transformers/utils/eval_helpers.py b/src/deepsparse/transformers/utils/eval_helpers.py index 4c0e68b9de..012520b9b5 100644 --- a/src/deepsparse/transformers/utils/eval_helpers.py +++ b/src/deepsparse/transformers/utils/eval_helpers.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Mapping, Union +from typing import List, Union import numpy from transformers import AutoTokenizer, PreTrainedTokenizerFast @@ -27,7 +27,8 @@ def process_concatenated_datasets( dataset_name: str, model_path: str, max_sequence_length: int, - kwargs: Mapping, + split: str = "test", + **kwargs, ) -> list: """ Concatenate text datasets and split them into chunks text that, after @@ -38,6 +39,8 @@ def process_concatenated_datasets( Options: "wikitext2" or "c4". model_path (str): The path to a pretrained transformer model for tokenization. max_sequence_length (int): The maximum number of tokens in each sequence. + split (str, optional): The split of the dataset to use. + Default is "test". kwargs (mapping): Additional keyword arguments. - eos (str, optional): The end-of-sentence token. Default is "\n\n" for wikitext2 and "" for c4. 
@@ -65,13 +68,13 @@ def process_concatenated_datasets( eos = kwargs.get("eos", "\n\n") bos = kwargs.get("bos", "") - raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split=split) raw_text = raw_dataset["text"] elif dataset_name == "c4": eos = kwargs.get("eos", "<|endoftext|>") bos = kwargs.get("bos", "") raw_samples = kwargs.get("raw_samples", None) - data_file = kwargs.get("data_file", 0) + data_file = kwargs.get("data_file", None) if data_file is not None: raw_dataset = load_dataset( "allenai/c4", @@ -79,13 +82,13 @@ def process_concatenated_datasets( data_files={ "validation": f"en/c4-validation.{data_file:05d}-of-00008.json.gz" }, - split="validation", + split=split, ) else: raw_dataset = load_dataset( "allenai/c4", "allenai--c4", - split="validation", + split=split, ) if raw_samples is not None: raw_dataset = raw_dataset[:raw_samples] @@ -181,3 +184,22 @@ def _split_text_by_tokens( ) return split_text + + +class HumanEvalIteratorWrapper: + """ + Wrapper around the `openai_humaneval` dataset, + that joins the prompt and the canonical solution + into a single string during iteration. + """ + + def __init__(self, dataset): + self.iterator = iter(dataset) + + def __iter__(self): + return self + + def __next__(self): + # Get the next sample from the original iterator + sample = next(self.iterator) + return sample["prompt"] + sample["canonical_solution"] diff --git a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py index 3b9016294f..8d8b343dd5 100644 --- a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py +++ b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py @@ -12,64 +12,118 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from transformers import AutoModelForCausalLM - import pytest from deepsparse.evaluation.integrations import try_import_lm_evaluation_harness from deepsparse.evaluation.utils import create_pipeline -@pytest.mark.parametrize( - "pipeline, model_torch", - [ - ( - create_pipeline( - "hf:mgoin/TinyStories-1M-deepsparse", engine_type="onnxruntime" - ), - AutoModelForCausalLM.from_pretrained("roneneldan/TinyStories-1M"), - ) - ], -) -@pytest.mark.parametrize( - "datasets", - [ - ["hellaswag"], - ["hellaswag", "gsm8k"], - "gsm8k", - "arc_challenge", - ], -) @pytest.mark.parametrize( "batch_size", [1, 3], ) -class TestLMEvaluationHarness: - @pytest.mark.skipif( - not try_import_lm_evaluation_harness(raise_error=False), - reason="lm_evaluation_harness not installed", - ) - def test_integration_eval_onnx_matches_torch( - self, pipeline, model_torch, datasets, batch_size - ): +@pytest.mark.skipif( + not try_import_lm_evaluation_harness(raise_error=False), + reason="lm_evaluation_harness not installed", +) +class TestLMEval: + @pytest.fixture() + def integration_eval(self): from deepsparse.evaluation.integrations.lm_evaluation_harness import ( - integration_eval, + integration_eval as eval_fn, ) - out_torch = integration_eval( - model=model_torch, + return eval_fn + + @pytest.mark.parametrize( + "datasets", + [ + "hellaswag", + ["arc_challenge"], + ["hellaswag", "arc_challenge"], + ], + ) + def test_likelihood_scenario(self, batch_size, datasets, integration_eval): + + model_path_ds = "hf:mgoin/TinyStories-1M-ds" + model_path_hf = "roneneldan/TinyStories-1M" + limit = 2 + + out_onnx = integration_eval( + create_pipeline( + model_path_ds, + engine_type="onnxruntime", + ), datasets=datasets, batch_size=batch_size, - limit=5, - no_cache=True, # avoid saving files when running tests + limit=limit, + use_cache=None, # avoid saving files when running tests + ) + + from lm_eval import evaluator, tasks, utils + + datasets_ = (",").join(datasets) if isinstance(datasets, list) else datasets + out_torch = evaluator.simple_evaluate( + model="hf", + model_args=f"pretrained={model_path_hf}", + tasks=utils.pattern_match(datasets_.split(","), tasks.ALL_TASKS), + batch_size=batch_size, + limit=limit, + use_cache=None, # avoid saving files when running tests ) + self._test_same(out_onnx.raw, out_torch, datasets) + + @pytest.mark.parametrize( + "datasets", + [ + "gsm8k", + ], + ) + def test_greedy_until_scenario(self, batch_size, datasets, integration_eval): + model_path_ds = "hf:mgoin/TinyLlama-1.1B-step-50K-105b-ONNX" + model_path_hf = "TinyLlama/TinyLlama-1.1B-step-50K-105b" + limit = 2 + # compute until 16 new tokens + # so that tests are faster + gen_kwargs = "max_gen_toks=16" + out_onnx = integration_eval( - model=pipeline, + create_pipeline(model_path_ds, engine_type="onnxruntime"), datasets=datasets, batch_size=batch_size, - limit=5, - no_cache=True, # avoid saving files when running tests + limit=limit, + gen_kwargs=gen_kwargs, + use_cache=None, # avoid saving files when running tests + ) + + from lm_eval import evaluator, tasks, utils + + datasets_ = (",").join(datasets) if isinstance(datasets, list) else datasets + out_torch = evaluator.simple_evaluate( + model="hf", + model_args=f"pretrained={model_path_hf}", + tasks=utils.pattern_match(datasets_.split(","), tasks.ALL_TASKS), + batch_size=batch_size, + limit=limit, + gen_kwargs=gen_kwargs, + use_cache=None, # avoid saving files when running tests ) - out_onnx = out_onnx.raw["output"] - out_torch = out_torch.raw["output"] + self._test_same(out_onnx.raw, 
out_torch, datasets) - assert out_onnx["results"] == out_torch["results"] + @staticmethod + def _test_same(out_onnx, out_torch, datasets, greedy=False): + datasets = datasets if isinstance(datasets, list) else [datasets] + for dataset in datasets: + torch_samples = out_torch["samples"][dataset] + onnx_samples = out_onnx["samples"][dataset] + for torch_sample, onnx_sample in zip(torch_samples, onnx_samples): + if greedy: + # for datasets that validate greedy generation + # make sure that generated sequences are the same + assert torch_sample["resps"] == onnx_sample["resps"] + else: + # for datasets that validate likelihood + # make sure that likelihoods are the same + assert ( + pytest.approx(torch_sample["resps"][0][0], 0.0001) + == onnx_sample["resps"][0][0] + ) diff --git a/tests/deepsparse/evaluation/integrations/test_perplexity.py b/tests/deepsparse/evaluation/integrations/test_perplexity.py new file mode 100644 index 0000000000..b156e5b9a4 --- /dev/null +++ b/tests/deepsparse/evaluation/integrations/test_perplexity.py @@ -0,0 +1,132 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from copy import copy + +import numpy as np + +import pytest +from deepsparse.evaluation.integrations.perplexity import ( + integration_eval, + load_perplexity_dataset, +) +from deepsparse.transformers.pipelines.text_generation import TextGenerationPipeline +from evaluate import load + + +@pytest.fixture() +def model_path(): + return "hf:mgoin/TinyStories-1M-deepsparse" + + +@pytest.fixture() +def model_id(): + return "roneneldan/TinyStories-1M" + + +@pytest.mark.parametrize( + "datasets", + [ + "openai_humaneval", + "wikitext2", + ], +) +@pytest.mark.parametrize("batch_size", [1, 2]) +class TestPerplexity: + limit = 2 + + def test_perplexity_ground_truth_equal_pipeline( + self, model_path, model_id, datasets, batch_size + ): + # setting max_sequence_length to 16 to speed up the test + kwargs_ground_truth = ( + dict(max_sequence_length=16) if datasets in {"c4", "wikitext2"} else {} + ) + kwargs = copy(kwargs_ground_truth) + + result_gt = self._get_ground_truth( + datasets=datasets, + batch_size=batch_size, + limit=self.limit, + model_id=model_id, + kwargs=kwargs_ground_truth, + ) + + result = integration_eval( + pipeline=TextGenerationPipeline( + model_path="hf:mgoin/TinyStories-1M-deepsparse", + engine_type="onnxruntime", + ), + datasets=datasets, + batch_size=batch_size, + limit=self.limit, + # we are setting accumulate=False to compare + # with the torch ground truth apples to apples + accumulate=False, + **kwargs, + ) + perplexities = result.formatted[0].metrics[0].value + perplexities_gt = result_gt["perplexities"] + assert np.allclose(perplexities, perplexities_gt, rtol=0.1) + + def test_perplexity_kv_cache_pipeline_equal_no_kv_cache_pipeline( + self, model_path, model_id, datasets, batch_size + ): + + kwargs_ground_truth = ( + dict(max_sequence_length=16) if datasets in {"c4", "wikitext2"} else {} + ) + kwargs = 
copy(kwargs_ground_truth) + + result_kv_cache = integration_eval( + pipeline=TextGenerationPipeline( + model_path="hf:mgoin/TinyStories-1M-deepsparse", + engine_type="onnxruntime", + ), + datasets=datasets, + model_path=model_id, + batch_size=batch_size, + limit=self.limit, + **kwargs, + ) + + result_non_kv_cache = integration_eval( + pipeline=TextGenerationPipeline( + model_path="hf:mgoin/TinyStories-1M-deepsparse", + engine_type="onnxruntime", + onnx_model_name="model-orig.onnx", + ), + datasets=datasets, + batch_size=batch_size, + limit=self.limit, + **kwargs, + ) + + perplexities_kv_cache = result_kv_cache.formatted[0].metrics[0].value + perplexities_non_kv_cache = result_non_kv_cache.formatted[0].metrics[0].value + np.allclose(perplexities_kv_cache, perplexities_non_kv_cache, rtol=0.1) + + @staticmethod + def _get_ground_truth(datasets, batch_size, limit, model_id, kwargs={}): + perplexity = load("perplexity", module_type="metric") + kwargs["model_path"] = model_id + dataset, *_ = load_perplexity_dataset(dataset_name=datasets, **kwargs) + predictions = [] + for i, sample in enumerate(dataset): + if i == batch_size * limit: + break + predictions.append(sample) + return perplexity.compute( + predictions=predictions, add_start_token=False, model_id=model_id + ) diff --git a/tests/deepsparse/evaluation/test_evaluator.py b/tests/deepsparse/evaluation/test_evaluator.py index 816ad075e0..58eedff836 100644 --- a/tests/deepsparse/evaluation/test_evaluator.py +++ b/tests/deepsparse/evaluation/test_evaluator.py @@ -115,19 +115,25 @@ def test_evaluate_pipeline_without_kv_cache( not try_import_lm_evaluation_harness(raise_error=False), reason="lm_evaluation_harness not installed", ) -def test_evaluation_llm_evaluation_harness_integration_name( +def test_evaluation_llm_evaluation_harness( model_path, - datasets, ): assert evaluate( model=model_path, - datasets=datasets, - limit=2, - no_cache=True, + # testing only on hellaswag dataset + # to avoid long running time + datasets="hellaswag", + limit=1, integration="lm_evaluation_harness", ) +def test_evaluation_perplexity(model_path): + assert evaluate( + model=model_path, datasets="openai_humaneval", limit=1, integration="perplexity" + ) + + @pytest.mark.parametrize("type_serialization", ["json", "yaml"]) @pytest.mark.skipif( tuple(map(int, sys.version.split(".")[:2])) < (3, 10), @@ -144,7 +150,6 @@ def test_cli( runner.invoke( main, [ - "--model_path", model_path, "--dataset", datasets[0], From 20f90ac5c13ea929318a101af5e05ed5ddb4241c Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Mon, 12 Feb 2024 15:50:14 -0500 Subject: [PATCH 15/16] examples/benchmark fix for resnet50 example (#1597) --- examples/benchmark/resnet50_benchmark.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/benchmark/resnet50_benchmark.py b/examples/benchmark/resnet50_benchmark.py index ed06ddea74..49832e5585 100644 --- a/examples/benchmark/resnet50_benchmark.py +++ b/examples/benchmark/resnet50_benchmark.py @@ -47,7 +47,8 @@ import numpy -from deepsparse import benchmark_model, cpu +from deepsparse import cpu +from deepsparse.engine import benchmark_model CORES_PER_SOCKET, AVX_TYPE, VNNI = cpu.cpu_details() From c4a7b68ffe33d53b7226a7442938b5bba56d9290 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 13 Feb 2024 09:06:53 -0500 Subject: [PATCH 16/16] fix if/else conditions with new ux flow (#1599) --- src/deepsparse/server/cli.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/deepsparse/server/cli.py 
b/src/deepsparse/server/cli.py index 6d3952c5f5..d402f616fd 100644 --- a/src/deepsparse/server/cli.py +++ b/src/deepsparse/server/cli.py @@ -225,14 +225,11 @@ def main( if ctx.invoked_subcommand is not None: return - if task is None and config_file is None: - raise ValueError("Must specify either --task or --config_file. Found neither") - if config_file is not None: server = _fetch_server(integration=integration, config=config_file) server.start_server(host, port, log_level, hot_reload_config=hot_reload_config) - if task is not None: + elif task is not None: cfg = ServerConfig( num_cores=num_cores, num_workers=num_workers, @@ -258,6 +255,8 @@ def main( server.start_server( host, port, log_level, hot_reload_config=hot_reload_config ) + else: + raise ValueError("Must specify either --task or --config_file. Found neither") def _fetch_server(integration: str, config: Union[ServerConfig, str]):
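
For reference, below is a minimal usage sketch of the evaluation entrypoints reworked in patch 14 of this series (the console script is renamed from `deepsparse.eval` to `deepsparse.evaluate`, MODEL_PATH becomes a positional argument, `args_to_dict` is replaced by `parse_kwarg_tuples`, and a `perplexity` integration is registered alongside the updated lm-evaluation-harness one). The sketch mirrors the calls exercised in `tests/deepsparse/evaluation/test_evaluator.py`; the TinyStories model stub and the specific kwarg values are illustrative assumptions, not artifacts pinned by these patches.

```python
# Sketch only: exercises the updated evaluation API the way the new tests do.
# The model stub below is the one used by the perplexity tests in this series;
# any text-generation model accepted by deepsparse.Pipeline should behave the same.
from deepsparse.evaluation.evaluator import evaluate
from deepsparse.evaluation.utils import parse_kwarg_tuples

# CLI kwargs arrive from click as a flat tuple of alternating names and values;
# parse_kwarg_tuples strips leading dashes and literal-evals values when possible.
extra_args = parse_kwarg_tuples(("--limit", "1", "max_gen_toks", "16"))
assert extra_args == {"limit": 1, "max_gen_toks": 16}

# Run the newly registered perplexity integration on a small model,
# mirroring test_evaluation_perplexity.
result = evaluate(
    model="hf:mgoin/TinyStories-1M-deepsparse",  # stub from the new tests, assumed here
    datasets="openai_humaneval",
    integration="perplexity",
    limit=1,  # evaluate a single batch to keep the smoke test fast
)
print(result.formatted)  # list of Evaluation objects (see evaluation/results.py)
```

From the shell, the console script after this series is `deepsparse.evaluate` rather than `deepsparse.eval`, with the model path passed positionally now that the `--model_path` option is removed, e.g. `deepsparse.evaluate <MODEL_PATH> -d openai_humaneval`; the remaining options are documented in the module docstring of `src/deepsparse/evaluation/cli.py`.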