From 6035536b0731d9323e75fef3d35b715105c7a9bc Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Mon, 29 Jan 2024 15:18:19 +0000 Subject: [PATCH 01/24] initial implementation --- .../integrations/lm_evaluation_harness.py | 237 +++--------------- .../evaluation/integrations/None_rank0.db | Bin 0 -> 12288 bytes .../test_lm_evaluation_harness.py | 33 +-- 3 files changed, 53 insertions(+), 217 deletions(-) create mode 100644 tests/deepsparse/evaluation/integrations/None_rank0.db diff --git a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py index 2f8c7b8cef..6d4cb21650 100644 --- a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py +++ b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py @@ -29,8 +29,12 @@ from deepsparse import Pipeline from deepsparse.evaluation.registry import EvaluationRegistry from deepsparse.evaluation.results import Dataset, Evaluation, Metric, Result -from lm_eval import base, evaluator, tasks, utils - +from deepsparse.transformers.metrics import _cross_entropy +from lm_eval.api.model import LM +from lm_eval.api.instance import Instance +from lm_eval import evaluator, utils, tasks +from lm_eval.__main__ import cli_evaluate +tasks.initialize_tasks("INFO") _LOGGER = logging.getLogger(__name__) @@ -56,57 +60,23 @@ def integration_eval( :return the evaluation results """ - # [START] - # The code that sets up the interface between deepsparse and lm_evaluation_harness if isinstance(model, Pipeline): - # If the model is a Pipeline, we need to wrap - # it in a DeepSparseLM object - model = DeepSparseLM( - pipeline=model, - batch_size=batch_size, - max_gen_toks=kwargs.get("max_gen_toks"), - ) + model = DeepSparseLM(pipeline=model) - datasets = (",").join(datasets) if isinstance(datasets, list) else datasets - # [END] - - # [START] - # The code below is being adapted from: - # https://github.com/EleutherAI/lm-evaluation-harness/blob/master/main.py - if kwargs.get("limit"): - _LOGGER.warning( - "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. " - "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT." 
- ) - if datasets is None: - task_names = tasks.ALL_TASKS - else: - task_names = utils.pattern_match(datasets.split(","), tasks.ALL_TASKS) + datasets = (",").join(datasets) if isinstance(datasets, list) else datasets + task_names = utils.pattern_match(datasets.split(","), tasks.ALL_TASKS) _LOGGER.info(f"Selected Tasks: {task_names}") - description_dict = {} - if kwargs.get("description_dict_path"): - with open(kwargs.get("description_dict_path"), "r") as f: - description_dict = json.load(f) - - evaluator_input = EvaluatorInputSchema( - model=model, - tasks=task_names, - description_dict=description_dict, - batch_size=batch_size, - **kwargs, - ) - - results_raw = evaluator.simple_evaluate(**evaluator_input.dict()) + results_raw = evaluator.simple_evaluate(model=model, tasks=task_names, batch_size=batch_size, **kwargs) - results = Result( - raw=dict(output=results_raw, input=filter_evaluator_input(evaluator_input)), - formatted=format_raw_results(results_raw), - ) + # results = Result( + # raw=dict(output=results_raw, input=None), + # formatted=None, + # ) - return results + return results_raw def filter_evaluator_input( @@ -152,49 +122,11 @@ def format_raw_results(results: Dict[str, Any]) -> List[Evaluation]: return formatted_results -class EvaluatorInputSchema(BaseModel): - model: Any = Field(description="The name of the model.") - tasks: List[str] = Field( - description="The task (or multiple tasks) to evaluate the target on." - ) - description_dict: Optional[Dict[str, Any]] = Field( - None, description="Description dict." - ) - batch_size: int = Field(description="The batch size to use for evaluation.") - model_args: str = Field( - "", description="Additional arguments for the evaluated model." - ) - num_fewshot: int = Field(0, description="The number of few shots to use.") - max_batch_size: Optional[int] = Field( - None, description="Maximal batch size to try with --batch_size auto." - ) - device: Optional[str] = Field(None, description="Device to use for evaluation.") - no_cache: bool = Field(False, description="Include this flag to prevent caching.") - limit: Optional[float] = Field( - None, - description="Limit the number of examples per task. If <1, " - "limit is a percentage of the total number of " - "examples.", - ) - decontamination_ngrams_path: Optional[str] = Field( - None, description="Specify the path for decontamination n-grams." - ) - check_integrity: bool = Field( - False, description="Include this flag to check integrity." - ) - write_out: bool = Field(False, description="Include this flag to write out.") - output_base_path: Optional[str] = Field( - None, description="Specify the output base path." 
- ) - - -class DeepSparseLM(base.BaseLM): +class DeepSparseLM(LM): def __init__( self, pipeline: Pipeline, - tokenizer: Optional[str] = None, - batch_size: int = 1, - max_gen_toks: Optional[int] = None, + ): """ Wrapper around the DeepSparse pipeline to make it compatible with the @@ -203,126 +135,27 @@ def __init__( super().__init__() # Initialize new model and tokenizer instances - self.model = pipeline - self.tokenizer = tokenizer if tokenizer else self.model.tokenizer + self.pipeline = pipeline - self._batch_size = batch_size - self._max_length = pipeline.sequence_length - self._max_gen_toks = max_gen_toks or 256 - self.vocab_size = self.tokenizer.vocab_size + def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]: + greedy = not self.pipeline.config.do_sample + prompts = [request.arguments[0] for request in requests] + out = self.pipeline(prompt = prompts, + output_scores=True, + ) - def _model_call(self, inps) -> torch.Tensor: - """ - Override the _model_call method to use the DeepSparse pipeline for - logits generation. + likelyhoods = [] + for prompt_idx, prompt in enumerate(prompts): + logits = out.generations[prompt_idx].score + tokenized_prompt = self.pipeline.tokenizer(prompt) + nll = _cross_entropy(logits[:sum(tokenized_prompt["attention_mask"]),:], tokenized_prompt["input_ids"]) + likelyhoods.append((nll, greedy)) + return likelyhoods - inps: a torch tensor of shape [batch, sequence] - the size of sequence may vary from call to call - returns: a torch tensor of shape [batch, sequence, vocab] with the - logits returned from the model - """ - # Encode the tokens to strings - prompt = self.model.tokenizer.batch_decode(inps.numpy()) - - # Run the model to map the prompt to logits - out = self.model( - prompt=prompt, - max_new_tokens=0, - include_prompt_logits=True, - output_scores=True, - ) - logits_numpy = numpy.stack([generation.score for generation in out.generations]) - return torch.from_numpy(logits_numpy) - - def greedy_until( - self, requests: List[Tuple[str, Union[List[str], str]]] - ) -> List[str]: - def _collate(x): - tokens = self.tok_encode(x[0]) - return len(tokens), x[0] - - results = [] - reorder = utils.Reorderer(requests, _collate) - - for chunk in utils.chunks( - tqdm(reorder.get_reordered(), disable=False), - self.batch_size, - ): - context = [c[0] for c in chunk] - request_args = chunk[0][1] - stop = request_args.get("until", None) - stop_sequences = stop if isinstance(stop, list) else [stop] - max_generation_length = request_args.get("max_length", None) - - assert ( - isinstance(max_generation_length, int) or max_generation_length is None - ) - assert isinstance(stop_sequences, list) or stop_sequences is None - - # TODO: Find a better way to handle stop sequences for 0-shot. - if stop_sequences is None: - until = [self.eot_token] - else: - until = stop_sequences + [self.eot_token] - - if max_generation_length is None: - max_tokens = self.max_gen_toks - else: - max_tokens = max_generation_length - - responses = self.model( - sequences=context, - max_new_tokens=max_tokens, - stop=until, - do_sample=False, - ) - - responses = responses if type(responses) is list else [responses] - - for response in responses: - response = response.generations[0].text - # Ensure the generated responses do not contain the stop sequences. 
- for term in until: - response = response.split(term)[0] - # partial caching - self.cache_hook.add_partial("greedy_until", (context, until), response) - results.append(response) - - return reorder.get_original(results) - - def _model_generate(self, context, max_length, eos_token_id): - # Isn't used because we override greedy_until - raise NotImplementedError() - - @property - def eot_token(self) -> str: - return self.tokenizer.eos_token - - @property - def eot_token_id(self) -> int: - return self.tokenizer.eos_token_id - - @property - def max_length(self): - return self._max_length - - @property - def max_gen_toks(self): - return self._max_gen_toks - - @property - def batch_size(self): - # should return self._batch_size but the - # TextGeneration model does not support batch_size > 1 - return 1 - - @property - def device(self): - pass - def tok_encode(self, string: str): - return self.tokenizer.encode(string, add_special_tokens=False) + def loglikelihood_rolling(self, requests: list[Instance]) -> list[tuple[float, bool]]: + pass - def tok_decode(self, tokens): - return self.tokenizer.decode(tokens) + def generate_until(self, requests: list[Instance]) -> list[str]: + pass diff --git a/tests/deepsparse/evaluation/integrations/None_rank0.db b/tests/deepsparse/evaluation/integrations/None_rank0.db new file mode 100644 index 0000000000000000000000000000000000000000..6ab676686ae18918fc57fe171b5062bc08bd64e2 GIT binary patch literal 12288 zcmeI#&r8EF6u|MM$`E06w;g(U?4+QA_z&1h4{C?n6?CT(S%hKNp{AlI|33dCk2a$_ z?_RzS^5Z2WkWWr;_K+=|m6!GV$~d`DBc-*v6j4g)w(YfT8$RFPChh3+U7^+4U7M9%bVG2Jr*V-= z5hqzHlWuh)N3ZrVu^-6hAT!C)LmS}p5^{rY&T+N!=Z zjazT>vb(4}? Date: Tue, 30 Jan 2024 13:05:40 +0000 Subject: [PATCH 02/24] initial commit --- src/deepsparse/evaluation/cli.py | 28 +++----- src/deepsparse/evaluation/evaluator.py | 21 +++--- src/deepsparse/evaluation/registry.py | 9 +-- src/deepsparse/evaluation/utils.py | 64 +++++++++---------- .../test_lm_evaluation_harness.py | 6 +- tests/deepsparse/evaluation/test_evaluator.py | 22 ++++--- tests/deepsparse/evaluation/test_utils.py | 47 ++------------ 7 files changed, 74 insertions(+), 123 deletions(-) diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index ed7ea72831..f37ed46d0c 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -20,7 +20,7 @@ Module for evaluating models on the various evaluation integrations OPTIONS: - --target TARGET A path to a remote or local directory containing ONNX/torch model + --target TARGET A path to a remote or local directory containing ONNX model (including all the auxiliary files) or a SparseZoo stub -d DATASET, --dataset DATASET The dataset to evaluate on. The user may pass multiple datasets @@ -30,9 +30,7 @@ integration name that is registered in the evaluation registry -e ENGINE_TYPE, --engine_type ENGINE_TYPE Inference engine to use for the evaluation. The default - is the DeepSparse engine. If the evaluation should be run - without initializing a pipeline (e.g. for the evaluation - of a torch model), the engine type should be set to None + is the DeepSparse engine. -s SAVE_PATH, --save_path SAVE_PATH The path to save the evaluation results. 
By default the results will be saved in the @@ -90,10 +88,10 @@ ) ) @click.option( - "--target", + "--model_path", type=click.Path(dir_okay=True, file_okay=True), required=True, - help="A path to a remote or local directory containing ONNX/torch model " + help="A path to a remote or local directory containing ONNX model " "(including all the auxiliary files) or a SparseZoo stub", ) @click.option( @@ -118,9 +116,7 @@ type=click.Choice([DEEPSPARSE_ENGINE, ORT_ENGINE, TORCHSCRIPT_ENGINE]), default=DEEPSPARSE_ENGINE, help="The engine to use for the evaluation. The default is the " - "DeepSparse engine. If the evaluation should be run without " - "initializing a pipeline (e.g. for the evaluation of a torch " - "model), the engine type should be set to None", + "DeepSparse engine. ", ) @click.option( "-s", @@ -167,7 +163,7 @@ ) @click.argument("integration_args", nargs=-1, type=click.UNPROCESSED) def main( - target, + model_path, dataset, integration, engine_type, @@ -183,14 +179,8 @@ def main( # format kwargs to a dict integration_args = args_to_dict(integration_args) - _LOGGER.info(f"Target to evaluate: {target}") - if engine_type: - _LOGGER.info(f"A pipeline with the engine type: {engine_type} will be created") - else: - _LOGGER.info( - "No engine type specified. The target " - "will be evaluated using the native framework" - ) + _LOGGER.info(f"Creating pipeline to evaluate from: {model_path}") + _LOGGER.info(f"A pipeline with the engine type: {engine_type} will be created") _LOGGER.info( f"Datasets to evaluate on: {datasets}\n" @@ -201,7 +191,7 @@ def main( ) result: Result = evaluate( - target=target, + model_path=model_path, datasets=datasets, integration=integration, engine_type=engine_type, diff --git a/src/deepsparse/evaluation/evaluator.py b/src/deepsparse/evaluation/evaluator.py index 7bd56adf6e..9d1b3228a7 100644 --- a/src/deepsparse/evaluation/evaluator.py +++ b/src/deepsparse/evaluation/evaluator.py @@ -12,11 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import logging +from pathlib import Path from typing import Any, List, Optional, Union from deepsparse.evaluation.registry import EvaluationRegistry from deepsparse.evaluation.results import Result -from deepsparse.evaluation.utils import create_model_from_target +from deepsparse.evaluation.utils import create_pipeline from deepsparse.operators.engine_operator import ( DEEPSPARSE_ENGINE, ORT_ENGINE, @@ -30,11 +31,11 @@ def evaluate( - target: Any, + model_path: Any, datasets: Union[str, List[str]], integration: Optional[str] = None, engine_type: Union[ - DEEPSPARSE_ENGINE, ORT_ENGINE, TORCHSCRIPT_ENGINE, None + DEEPSPARSE_ENGINE, ORT_ENGINE, TORCHSCRIPT_ENGINE ] = DEEPSPARSE_ENGINE, batch_size: int = 1, splits: Union[List[str], str, None] = None, @@ -42,18 +43,18 @@ def evaluate( **kwargs, ) -> Result: - # if target is a string, turn it into an appropriate model/pipeline + # if target is a string, turn it into an appropriate pipeline # otherwise assume it is a model/pipeline - model = ( - create_model_from_target(target, engine_type) - if isinstance(target, str) - else target + pipeline = ( + create_pipeline(model_path, engine_type) + if isinstance(model_path, (Path, str)) + else model_path ) - eval_integration = EvaluationRegistry.resolve(model, datasets, integration) + eval_integration = EvaluationRegistry.resolve(pipeline, datasets, integration) return eval_integration( - model=model, + pipeline=pipeline, datasets=datasets, engine_type=engine_type, batch_size=batch_size, diff --git a/src/deepsparse/evaluation/registry.py b/src/deepsparse/evaluation/registry.py index 5b6e45bc1c..2daabb69cc 100644 --- a/src/deepsparse/evaluation/registry.py +++ b/src/deepsparse/evaluation/registry.py @@ -15,8 +15,9 @@ Implementation of a registry for evaluation functions """ import logging -from typing import Any, Callable, List, Optional, Union +from typing import Callable, List, Optional, Union +from deepsparse import Pipeline from sparsezoo.utils.registry import RegistryMixin @@ -38,7 +39,7 @@ def load_from_registry(cls, name: str) -> Callable[..., "Result"]: # noqa: F821 @classmethod def resolve( cls, - model: Any, + pipeline: Pipeline, datasets: Union[str, List[str]], integration: Optional[str] = None, ) -> Callable[..., "Result"]: # noqa: F821 @@ -59,12 +60,12 @@ def resolve( "No integration specified, inferring the evaluation" "function from the input arguments..." ) - integration = resolve_integration(model, datasets) + integration = resolve_integration(pipeline, datasets) if integration is None: raise ValueError( "Unable to resolve an evaluation function for the given model. 
" - "Specify an integration name or use a model that is supported " + "Specify an integration name or use a pipeline that is supported " ) _LOGGER.info(f"Inferred the evaluation function: {integration}") diff --git a/src/deepsparse/evaluation/utils.py b/src/deepsparse/evaluation/utils.py index 0534a9f9f3..7290f14adb 100644 --- a/src/deepsparse/evaluation/utils.py +++ b/src/deepsparse/evaluation/utils.py @@ -15,14 +15,11 @@ import os from typing import Any, Dict, List, Optional, Tuple, Union -from transformers import AutoModelForCausalLM, PreTrainedModel - from deepsparse import Pipeline -from deepsparse.operators.engine_operator import DEEPSPARSE_ENGINE, ORT_ENGINE __all__ = [ - "create_model_from_target", + "create_pipeline", "get_save_path", "args_to_dict", "resolve_integration", @@ -50,7 +47,7 @@ def potentially_check_dependency_import(integration_name: str) -> bool: def resolve_integration( - model: Union[Pipeline, PreTrainedModel], datasets: Union[str, List[str]] + pipeline: Pipeline, datasets: Union[str, List[str]] ) -> Union[str, None]: """ Given a model and dataset, infer the name of the evaluation integration @@ -64,21 +61,22 @@ def resolve_integration( :param datasets: The datasets to infer the integration for :return: The name of the integration to use or None if unable to infer """ - if if_generative_language_model(model): + if if_generative_language_model(pipeline): return LM_EVALUATION_HARNESS return None -def if_generative_language_model(model: Any) -> bool: +def if_generative_language_model(pipeline: Pipeline) -> bool: """ Checks if the model is a generative language model. """ - if isinstance(model, Pipeline): - return model.__class__.__name__ == "TextGenerationPipeline" - elif isinstance(model, PreTrainedModel): - return "CausalLM" in model.__class__.__name__ - else: - return False + pipeline_name = pipeline.__class__.__name__ + if pipeline_name == "TextGenerationPipeline" or ( + pipeline_name == "TextGenerationPipelineNoKVCache" + ): + return True + + return False def args_to_dict(args: Tuple[Any, ...]) -> Dict[str, Any]: @@ -126,34 +124,30 @@ def get_save_path( return os.path.join(base_path, file_name) -def create_model_from_target( - target: str, +def create_pipeline( + model_path: str, engine_type: Optional[str] = None, **kwargs, -) -> Union[Pipeline, AutoModelForCausalLM]: +) -> Pipeline: """ - Create a model or a pipeline from a target path. + Create a pipeline for evaluation - Note: This function is currently limited to: - - creating pipelines of type 'text-generation' - - creating dense huggingface models of type 'AutoModelForCausalLM' - This function will be expanded in the future to support more - model types and frameworks. + Note: This function is currently primarily + focused on creating pipelines of type 'text-generation' + This function will be expanded in the future to support + more tasks and models - :param target: The target path to initialize the + :param model_path: The target path to initialize the text generation model from. This can be a local or remote path to the model or a sparsezoo stub :param engine_type: The engine type to initialize the model with. 
- :return: The initialized model + :return: The initialized pipeline """ - if engine_type in [DEEPSPARSE_ENGINE, ORT_ENGINE]: - return Pipeline.create( - task="text-generation", - model_path=target, - sequence_length=kwargs.pop("sequence_length", 2048), - engine_type=engine_type, - batch_size=kwargs.pop("batch_size", 1), - **kwargs, - ) - else: - return AutoModelForCausalLM.from_pretrained(target, **kwargs) + return Pipeline.create( + task=kwargs.pop("task", "text-generation"), + model_path=model_path, + sequence_length=kwargs.pop("sequence_length", 2048), + engine_type=engine_type, + batch_size=kwargs.pop("batch_size", 1), + **kwargs, + ) diff --git a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py index 9fa9b494cf..db847af1ad 100644 --- a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py +++ b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py @@ -14,17 +14,17 @@ import pytest from deepsparse.evaluation.integrations import try_import_lm_evaluation_harness -from deepsparse.evaluation.utils import create_model_from_target +from deepsparse.evaluation.utils import create_pipeline @pytest.mark.parametrize( "pipeline, model_torch", [ ( - create_model_from_target( + create_pipeline( "hf:mgoin/TinyStories-1M-deepsparse", engine_type="onnxruntime" ), - create_model_from_target("roneneldan/TinyStories-1M"), + create_pipeline("roneneldan/TinyStories-1M"), ) ], ) diff --git a/tests/deepsparse/evaluation/test_evaluator.py b/tests/deepsparse/evaluation/test_evaluator.py index dedd63fa36..f1bc0c277a 100644 --- a/tests/deepsparse/evaluation/test_evaluator.py +++ b/tests/deepsparse/evaluation/test_evaluator.py @@ -49,7 +49,7 @@ def dummy_integration(*args, **kwargs): @pytest.fixture() -def target(): +def model_path(): return "hf:mgoin/TinyStories-1M-deepsparse" @@ -68,18 +68,18 @@ def unknown_integration_name(): return "unknown_integration" -def test_evaluate_unknown_integration(target, datasets, unknown_integration_name): +def test_evaluate_unknown_integration(model_path, datasets, unknown_integration_name): with pytest.raises(KeyError): evaluate( - target=target, + model_path=model_path, datasets=datasets, integration=unknown_integration_name, ) -def test_evaluate(target, datasets, dummy_integration_name): +def test_evaluate(model_path, datasets, dummy_integration_name): result = evaluate( - target=target, + model_path=model_path, datasets=datasets, integration=dummy_integration_name, ) @@ -91,11 +91,11 @@ def test_evaluate(target, datasets, dummy_integration_name): reason="lm_evaluation_harness not installed", ) def test_evaluation_llm_evaluation_harness_integration_name( - target, + model_path, datasets, ): assert evaluate( - target=target, + model_path=model_path, datasets=datasets, limit=2, no_cache=True, @@ -110,15 +110,17 @@ def test_evaluation_llm_evaluation_harness_integration_name( "with importing functions that are decorated with " "click option where multiple=True", ) -def test_cli(tmp_path, target, datasets, dummy_integration_name, type_serialization): +def test_cli( + tmp_path, model_path, datasets, dummy_integration_name, type_serialization +): from deepsparse.evaluation.cli import main runner = CliRunner() runner.invoke( main, [ - "--target", - target, + "--model_path", + model_path, "--dataset", datasets[0], "--dataset", diff --git a/tests/deepsparse/evaluation/test_utils.py b/tests/deepsparse/evaluation/test_utils.py index f712dce0df..a16cb8ee32 
100644 --- a/tests/deepsparse/evaluation/test_utils.py +++ b/tests/deepsparse/evaluation/test_utils.py @@ -23,23 +23,13 @@ import pytest from deepsparse import Pipeline from deepsparse.evaluation.utils import ( - create_model_from_target, + create_pipeline, get_save_path, if_generative_language_model, resolve_integration, ) -@pytest.fixture -def llm_type_hf_model(): - return AutoModelForCausalLM.from_pretrained("roneneldan/TinyStories-1M") - - -@pytest.fixture -def not_llm_type_hf_model(): - return AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased") - - @pytest.fixture def llm_type_pipeline(): return Pipeline.create( @@ -49,25 +39,13 @@ def llm_type_pipeline(): ) -def test_resolve_known_llm_model(llm_type_hf_model): +def test_resolve_known_llm_pipeline(llm_type_pipeline): assert ( - resolve_integration(model=llm_type_hf_model, datasets="") + resolve_integration(pipeline=llm_type_pipeline, datasets="") == "lm-evaluation-harness" ) -def test_resolve_unknown_model(not_llm_type_hf_model): - assert resolve_integration(model=not_llm_type_hf_model, datasets="") is None - - -def test_if_generative_language_model_true(llm_type_hf_model): - assert if_generative_language_model(llm_type_hf_model) - - -def test_if_generative_language_model_false(not_llm_type_hf_model): - assert not if_generative_language_model(not_llm_type_hf_model) - - def test_if_generative_language_pipeline_true(llm_type_pipeline): assert if_generative_language_model(llm_type_pipeline) @@ -89,26 +67,11 @@ def pipeline_target(): return "hf:mgoin/TinyStories-1M-deepsparse" -@pytest.fixture -def torch_target(): - return "roneneldan/TinyStories-1M" - - def test_initialize_model_from_target_pipeline_onnx(pipeline_target): - model = create_model_from_target(pipeline_target, "onnxruntime") + model = create_pipeline(pipeline_target, "onnxruntime") assert model.ops.get("single_engine")._engine_type == "onnxruntime" -def test_initialize_model_from_target_pipeline_deepsparse(pipeline_target): - model = create_model_from_target(pipeline_target, "deepsparse") - assert model.ops.get("single_engine")._engine_type == "deepsparse" - - def test_initialize_model_from_target_pipeline_with_kwargs(pipeline_target): - model = create_model_from_target(pipeline_target, "deepsparse", sequence_length=64) + model = create_pipeline(pipeline_target, "deepsparse", sequence_length=64) assert model.ops.get("process_input").sequence_length == 64 - - -def test_initialize_model_from_target_torch(torch_target): - model = create_model_from_target(torch_target, "torch") - assert isinstance(model, GPTNeoForCausalLM) From 6599f41cb08cdb2903420d8c17fb3486c6c395ac Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Tue, 30 Jan 2024 14:03:14 +0000 Subject: [PATCH 03/24] add some more tests for hardening --- src/deepsparse/evaluation/cli.py | 7 +++-- src/deepsparse/evaluation/evaluator.py | 21 +++++++++---- .../pipelines/text_generation/pipeline.py | 7 +++++ .../text_generation/pipeline_no_kv_cache.py | 8 +++++ .../test_lm_evaluation_harness.py | 4 ++- tests/deepsparse/evaluation/test_evaluator.py | 31 +++++++++++++++++-- 6 files changed, 65 insertions(+), 13 deletions(-) diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index f37ed46d0c..e0e16cb4ab 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -20,7 +20,8 @@ Module for evaluating models on the various evaluation integrations OPTIONS: - --target TARGET A path to a remote or local directory containing ONNX model + --model_path 
MODEL_PATH + A path to an ONNX model, local directory containing ONNX model (including all the auxiliary files) or a SparseZoo stub -d DATASET, --dataset DATASET The dataset to evaluate on. The user may pass multiple datasets @@ -91,7 +92,7 @@ "--model_path", type=click.Path(dir_okay=True, file_okay=True), required=True, - help="A path to a remote or local directory containing ONNX model " + help="A path to an ONNX model, local directory containing ONNX model" "(including all the auxiliary files) or a SparseZoo stub", ) @click.option( @@ -191,7 +192,7 @@ def main( ) result: Result = evaluate( - model_path=model_path, + model=model_path, datasets=datasets, integration=integration, engine_type=engine_type, diff --git a/src/deepsparse/evaluation/evaluator.py b/src/deepsparse/evaluation/evaluator.py index 9d1b3228a7..b513f07563 100644 --- a/src/deepsparse/evaluation/evaluator.py +++ b/src/deepsparse/evaluation/evaluator.py @@ -13,8 +13,9 @@ # limitations under the License. import logging from pathlib import Path -from typing import Any, List, Optional, Union +from typing import List, Optional, Union +from deepsparse import Pipeline from deepsparse.evaluation.registry import EvaluationRegistry from deepsparse.evaluation.results import Result from deepsparse.evaluation.utils import create_pipeline @@ -31,7 +32,7 @@ def evaluate( - model_path: Any, + model: Union[Pipeline, Path, str], datasets: Union[str, List[str]], integration: Optional[str] = None, engine_type: Union[ @@ -43,12 +44,20 @@ def evaluate( **kwargs, ) -> Result: + if isinstance(model, Pipeline): + _LOGGER.info( + "Passed a Pipeline object into evaluate function. This will " + "override the following arguments:" + ) + batch_size = model.batch_size + _LOGGER.info(f"batch_size: {batch_size}") + engine_type = engine_type + _LOGGER.info(f"engine_type: {engine_type}") + # if target is a string, turn it into an appropriate pipeline - # otherwise assume it is a model/pipeline + # otherwise assume it is a pipeline pipeline = ( - create_pipeline(model_path, engine_type) - if isinstance(model_path, (Path, str)) - else model_path + create_pipeline(model, engine_type) if isinstance(model, (Path, str)) else model ) eval_integration = EvaluationRegistry.resolve(pipeline, datasets, integration) diff --git a/src/deepsparse/transformers/pipelines/text_generation/pipeline.py b/src/deepsparse/transformers/pipelines/text_generation/pipeline.py index 2c858c901b..bbc0e8ba15 100644 --- a/src/deepsparse/transformers/pipelines/text_generation/pipeline.py +++ b/src/deepsparse/transformers/pipelines/text_generation/pipeline.py @@ -357,6 +357,13 @@ def sequence_length(self) -> int: """ return self.ops["single_engine"].sequence_length + def batch_size(self) -> int: + return self.ops["single_engine"].batch_size + + @property + def engine_type(self) -> str: + return self.ops["single_engine"]._engine_type + def _get_continuous_batching_scheduler( self, batch_sizes: List[int], engines: List[EngineOperator] ) -> ContinuousBatchingScheduler: diff --git a/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py b/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py index 7f6cb9db5f..c6cbc3dd59 100644 --- a/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py +++ b/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py @@ -127,3 +127,11 @@ def expand_inputs(self, items, batch_size): out, orig_batch_size = split_engine_inputs(items, batch_size) combined_batches = [{"input_ids": 
b[0], "attention_mask": b[1]} for b in out] return combined_batches, orig_batch_size + + @property + def batch_size(self) -> int: + return self.ops["engine_operator"].batch_size + + @property + def engine_type(self) -> str: + return self.ops["engine_operator"]._engine_type diff --git a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py index db847af1ad..3b9016294f 100644 --- a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py +++ b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from transformers import AutoModelForCausalLM + import pytest from deepsparse.evaluation.integrations import try_import_lm_evaluation_harness from deepsparse.evaluation.utils import create_pipeline @@ -24,7 +26,7 @@ create_pipeline( "hf:mgoin/TinyStories-1M-deepsparse", engine_type="onnxruntime" ), - create_pipeline("roneneldan/TinyStories-1M"), + AutoModelForCausalLM.from_pretrained("roneneldan/TinyStories-1M"), ) ], ) diff --git a/tests/deepsparse/evaluation/test_evaluator.py b/tests/deepsparse/evaluation/test_evaluator.py index f1bc0c277a..816ad075e0 100644 --- a/tests/deepsparse/evaluation/test_evaluator.py +++ b/tests/deepsparse/evaluation/test_evaluator.py @@ -29,6 +29,7 @@ Metric, Result, ) +from deepsparse.pipeline import Pipeline @EvaluationRegistry.register() @@ -71,7 +72,7 @@ def unknown_integration_name(): def test_evaluate_unknown_integration(model_path, datasets, unknown_integration_name): with pytest.raises(KeyError): evaluate( - model_path=model_path, + model=model_path, datasets=datasets, integration=unknown_integration_name, ) @@ -79,7 +80,31 @@ def test_evaluate_unknown_integration(model_path, datasets, unknown_integration_ def test_evaluate(model_path, datasets, dummy_integration_name): result = evaluate( - model_path=model_path, + model=model_path, + datasets=datasets, + integration=dummy_integration_name, + ) + assert isinstance(result, Result) + + +def test_evaluate_pipeline_with_kv_cache(model_path, datasets, dummy_integration_name): + result = evaluate( + model=Pipeline.create(model_path=model_path, task="text-generation"), + datasets=datasets, + integration=dummy_integration_name, + ) + assert isinstance(result, Result) + + +def test_evaluate_pipeline_without_kv_cache( + model_path, datasets, dummy_integration_name +): + result = evaluate( + model=Pipeline.create( + model_path=model_path, + task="text-generation", + onnx_model_name="model-orig.onnx", + ), datasets=datasets, integration=dummy_integration_name, ) @@ -95,7 +120,7 @@ def test_evaluation_llm_evaluation_harness_integration_name( datasets, ): assert evaluate( - model_path=model_path, + model=model_path, datasets=datasets, limit=2, no_cache=True, From 4721c1fcd656a4e04b72eb3128fb121ca2297824 Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Tue, 30 Jan 2024 15:04:32 +0100 Subject: [PATCH 04/24] Update src/deepsparse/evaluation/cli.py --- src/deepsparse/evaluation/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index e0e16cb4ab..9c8fe3d06a 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -180,7 +180,7 @@ def main( # format kwargs to a dict integration_args = args_to_dict(integration_args) - 
_LOGGER.info(f"Creating pipeline to evaluate from: {model_path}") + _LOGGER.info(f"Creating pipeline to evaluate from model path: {model_path}") _LOGGER.info(f"A pipeline with the engine type: {engine_type} will be created") _LOGGER.info( From 124779435927ec266a18c3486780a77068c3f71a Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Tue, 30 Jan 2024 15:06:35 +0100 Subject: [PATCH 05/24] Update src/deepsparse/transformers/pipelines/text_generation/pipeline.py --- .../transformers/pipelines/text_generation/pipeline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/deepsparse/transformers/pipelines/text_generation/pipeline.py b/src/deepsparse/transformers/pipelines/text_generation/pipeline.py index bbc0e8ba15..4a38392d76 100644 --- a/src/deepsparse/transformers/pipelines/text_generation/pipeline.py +++ b/src/deepsparse/transformers/pipelines/text_generation/pipeline.py @@ -357,6 +357,7 @@ def sequence_length(self) -> int: """ return self.ops["single_engine"].sequence_length + @property def batch_size(self) -> int: return self.ops["single_engine"].batch_size From 9e88f89e7ea175d05eed4bacbb86ac1abda8f3fd Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Tue, 30 Jan 2024 15:07:31 +0100 Subject: [PATCH 06/24] Apply suggestions from code review --- src/deepsparse/evaluation/cli.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index 9c8fe3d06a..6979521c7a 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -180,8 +180,7 @@ def main( # format kwargs to a dict integration_args = args_to_dict(integration_args) - _LOGGER.info(f"Creating pipeline to evaluate from model path: {model_path}") - _LOGGER.info(f"A pipeline with the engine type: {engine_type} will be created") + _LOGGER.info(f"Creating {engine_type} pipeline to evaluate from model path: {model_path}") _LOGGER.info( f"Datasets to evaluate on: {datasets}\n" From fdb21c6cf093bc527c6c318af1bb0e5b96ee68e8 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Tue, 30 Jan 2024 14:08:09 +0000 Subject: [PATCH 07/24] quality --- src/deepsparse/evaluation/cli.py | 4 +++- .../transformers/pipelines/text_generation/pipeline.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index 6979521c7a..43eaa33790 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -180,7 +180,9 @@ def main( # format kwargs to a dict integration_args = args_to_dict(integration_args) - _LOGGER.info(f"Creating {engine_type} pipeline to evaluate from model path: {model_path}") + _LOGGER.info( + f"Creating {engine_type} pipeline to evaluate from model path: {model_path}" + ) _LOGGER.info( f"Datasets to evaluate on: {datasets}\n" diff --git a/src/deepsparse/transformers/pipelines/text_generation/pipeline.py b/src/deepsparse/transformers/pipelines/text_generation/pipeline.py index 4a38392d76..64c0c64a51 100644 --- a/src/deepsparse/transformers/pipelines/text_generation/pipeline.py +++ b/src/deepsparse/transformers/pipelines/text_generation/pipeline.py @@ -357,7 +357,7 @@ def sequence_length(self) -> int: """ return self.ops["single_engine"].sequence_length - @property + @property def batch_size(self) -> int: return self.ops["single_engine"].batch_size From 3e5b7a83f5ff9599aed10434e723190991a8bfc7 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Thu, 
1 Feb 2024 14:31:54 +0000 Subject: [PATCH 08/24] fix the UI, implement loglikelihood function --- .../integrations/lm_evaluation_harness.py | 175 +++++++++++++----- .../test_lm_evaluation_harness.py | 84 ++++----- 2 files changed, 167 insertions(+), 92 deletions(-) diff --git a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py index 6d4cb21650..9b1d23e855 100644 --- a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py +++ b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py @@ -16,24 +16,23 @@ Integration of the `lm_evaluation_harness`: https://github.com/EleutherAI/lm-evaluation-harness """ - -import json import logging from typing import Any, Dict, List, Optional, Tuple, Union import numpy -from pydantic import BaseModel, Field from tqdm import tqdm +from transformers import AutoTokenizer -import torch from deepsparse import Pipeline from deepsparse.evaluation.registry import EvaluationRegistry from deepsparse.evaluation.results import Dataset, Evaluation, Metric, Result -from deepsparse.transformers.metrics import _cross_entropy -from lm_eval.api.model import LM +from deepsparse.utils.data import numpy_log_softmax +from lm_eval import evaluator, tasks, utils from lm_eval.api.instance import Instance -from lm_eval import evaluator, utils, tasks -from lm_eval.__main__ import cli_evaluate +from lm_eval.api.model import LM +from lm_eval.utils import Reorderer + + tasks.initialize_tasks("INFO") _LOGGER = logging.getLogger(__name__) @@ -61,38 +60,23 @@ def integration_eval( :return the evaluation results """ if isinstance(model, Pipeline): - model = DeepSparseLM(pipeline=model) - + model = DeepSparseLM(pipeline=model, batch_size=batch_size) datasets = (",").join(datasets) if isinstance(datasets, list) else datasets task_names = utils.pattern_match(datasets.split(","), tasks.ALL_TASKS) _LOGGER.info(f"Selected Tasks: {task_names}") - results_raw = evaluator.simple_evaluate(model=model, tasks=task_names, batch_size=batch_size, **kwargs) + results_raw = evaluator.simple_evaluate( + model=model, tasks=task_names, batch_size=batch_size, **kwargs + ) - # results = Result( - # raw=dict(output=results_raw, input=None), - # formatted=None, - # ) + results = Result( + raw=results_raw, + formatted=format_raw_results(results_raw), + ) - return results_raw - - -def filter_evaluator_input( - evaluator_input: "EvaluatorInputSchema", -) -> Dict[str, Any]: # noqa: F821 - """ - Filter the evaluator input to remove the model field. - The model field is a complex object that cannot be serialized. 
- - :param evaluator_input: the evaluator input to filter - :return: the filtered evaluator input - """ - evaluator = evaluator_input.dict() - del evaluator["model"] - - return evaluator + return results def format_raw_results(results: Dict[str, Any]) -> List[Evaluation]: @@ -107,6 +91,8 @@ def format_raw_results(results: Dict[str, Any]) -> List[Evaluation]: for dataset_name, dataset_result in results["results"].items(): metrics = [] for metric_name, metric_value in dataset_result.items(): + if isinstance(metric_value, str): + continue metric = Metric(name=metric_name, value=metric_value) metrics.append(metric) dataset = Dataset( @@ -126,7 +112,8 @@ class DeepSparseLM(LM): def __init__( self, pipeline: Pipeline, - + batch_size: int = 1, + tokenizer: Optional[AutoTokenizer] = None, ): """ Wrapper around the DeepSparse pipeline to make it compatible with the @@ -136,26 +123,120 @@ def __init__( # Initialize new model and tokenizer instances self.pipeline = pipeline + self.batch_size = batch_size + self.tokenizer = tokenizer or pipeline.tokenizer + self._max_length = pipeline.sequence_length + + def tok_encode(self, string: str) -> List[int]: + return self.tokenizer.encode(string) + def tok_decode(self, tokens: List[int]) -> str: + return self.tokenizer.decode(tokens) - def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]: - greedy = not self.pipeline.config.do_sample - prompts = [request.arguments[0] for request in requests] - out = self.pipeline(prompt = prompts, - output_scores=True, - ) + @property + def max_length(self) -> int: + return self._max_length + + def loglikelihood(self, requests) -> List[Tuple[float, bool]]: + """ + Copied directly from + https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py + """ + new_reqs = [] + for context, continuation in [req.args for req in requests]: + if context == "": + raise NotImplemented("Implementing empty context is not supported yet") + context_enc, continuation_enc = self._encode_pair(context, continuation) - likelyhoods = [] - for prompt_idx, prompt in enumerate(prompts): - logits = out.generations[prompt_idx].score - tokenized_prompt = self.pipeline.tokenizer(prompt) - nll = _cross_entropy(logits[:sum(tokenized_prompt["attention_mask"]),:], tokenized_prompt["input_ids"]) - likelyhoods.append((nll, greedy)) - return likelyhoods + new_reqs.append(((context, continuation), context_enc, continuation_enc)) + return self._loglikelihood_tokens(new_reqs) - def loglikelihood_rolling(self, requests: list[Instance]) -> list[tuple[float, bool]]: + def _loglikelihood_tokens( + self, + requests: List[Tuple[Tuple[str, str], List[int], List[int]]], + disable_tqdm: bool = False, + ) -> List[Tuple[float, bool]]: + res = [] + + def _collate(x): + """Defines the key for the sorted method""" + toks = x[1] + x[2] + return -len(toks), tuple(toks) + + re_ord = utils.Reorderer(requests, _collate) + + for chunk in tqdm( + list(utils.chunks(re_ord.get_reordered(), self.batch_size)), + disable=disable_tqdm, + ): + for cache_key, context_enc, continuation_enc in chunk: + # how this all works (illustrated on a causal decoder-only setup): + # CTX CONT + # inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1] + # model \ \ + # logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the + # cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice + + inp = (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1] + + response = self.pipeline( + 
prompt=self.tokenizer.decode(inp), + max_new_tokens=0, + output_scores=True, + include_prompt_logits=True, + ) + + for i, resp in enumerate(response.generations): + # (seq_len, vocab_size) + multi_scores = resp.score + # (seq_len, vocab_size) but with softmax applied + multi_logits = numpy_log_softmax(multi_scores, axis=1) + # toss out the context half of the sequence + # (cont_len, vocab_size) + continuation_multi_logits = multi_logits[-len(continuation_enc) :] + + # pick out the logits for the continuation tokens + # (cont_len,) + continuation_logits = continuation_multi_logits[ + numpy.arange(len(continuation_enc)), continuation_enc + ] + # check if the tokens generated greedly are the same + # as the expected continuation + greedy_tokens = continuation_multi_logits.argmax(axis=1) + max_equal = greedy_tokens.tolist() == continuation_enc + + # Answer: (log prob, is-exact-match) + answer = (float(continuation_logits.sum()), bool(max_equal)) + + res.append(answer) + + if cache_key is not None: + self.cache_hook.add_partial("loglikelihood", cache_key, answer) + + return re_ord.get_original(res) + + def loglikelihood_rolling( + self, requests: list[Instance] + ) -> list[tuple[float, bool]]: pass def generate_until(self, requests: list[Instance]) -> list[str]: pass + + def _encode_pair( + self, context: str, continuation: str + ) -> Tuple[List[int], List[int]]: + """ + Copied directly from + https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py + """ + n_spaces = len(context) - len(context.rstrip()) + if n_spaces > 0: + continuation = context[-n_spaces:] + continuation + context = context[:-n_spaces] + whole_enc = self.tok_encode(context + continuation) + context_enc = self.tok_encode(context) + context_enc_len = len(context_enc) + continuation_enc = whole_enc[context_enc_len:] + return context_enc, continuation_enc diff --git a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py index cba804e9e7..726f9f87eb 100644 --- a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py +++ b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py @@ -17,60 +17,54 @@ from deepsparse.evaluation.utils import create_model_from_target -@pytest.mark.parametrize( - "pipeline, model_torch", - [ - ( - create_model_from_target( - "zoo:mistral-7b-gsm8k_mistral_pretrain-pruned70", engine_type="onnxruntime", sequence_length = 256 - ), - create_model_from_target("roneneldan/TinyStories-1M"), - ) - ], -) @pytest.mark.parametrize( "datasets", [ - #["hellaswag"], + ["hellaswag"], # ["hellaswag", "gsm8k"], # "gsm8k", - #"arc_challenge", + "arc_challenge", ], ) @pytest.mark.parametrize( "batch_size", - [1], + [1], # TODO: Add test for higher batch sizes +) +@pytest.mark.skipif( + not try_import_lm_evaluation_harness(raise_error=False), + reason="lm_evaluation_harness not installed", ) -class TestLMEvaluationHarness: - @pytest.mark.skipif( - not try_import_lm_evaluation_harness(raise_error=False), - reason="lm_evaluation_harness not installed", +def test_integration_eval_onnx_matches_torch(datasets, batch_size): + from deepsparse.evaluation.integrations.lm_evaluation_harness import ( + integration_eval, + ) + + out_torch = integration_eval( + model="hf", + model_args="pretrained=roneneldan/TinyStories-1M,dtype=float16", + datasets=datasets, + batch_size=batch_size, + limit=2, + use_cache=None, # avoid saving files when running tests + ) + + out_onnx = 
integration_eval( + model=create_model_from_target( + "hf:mgoin/TinyStories-1M-ds", engine_type="onnxruntime", sequence_length=128 + ), + datasets=datasets, + batch_size=batch_size, + limit=2, + use_cache=None, # avoid saving files when running tests ) - def test_integration_eval_onnx_matches_torch( - self, pipeline, model_torch, datasets, batch_size - ): - from deepsparse.evaluation.integrations.lm_evaluation_harness import ( - integration_eval, - ) - out_torch = integration_eval( - model="hf", - model_args="pretrained=roneneldan/TinyStories-1M,dtype=float32", - datasets=datasets, - batch_size=batch_size, - limit=1, - use_cache=None, # avoid saving files when running tests - ) - out_onnx = integration_eval( - model=pipeline, - datasets=datasets, - batch_size=batch_size, - limit=1, - use_cache=None, # avoid saving files when running tests - ) - print(out_onnx) - print(out_torch) - # out_onnx = out_onnx.raw["output"] - # out_torch = out_torch.raw["output"] - # - # assert out_onnx["results"] == out_torch["results"] + datasets = datasets if isinstance(datasets, list) else [datasets] + for dataset in datasets: + torch_samples = out_torch.raw["samples"][dataset] + onnx_samples = out_onnx.raw["samples"][dataset] + for torch_sample, onnx_sample in zip(torch_samples, onnx_samples): + for torch_resp, onnx_resp in zip( + torch_sample["resps"], onnx_sample["resps"] + ): + assert pytest.approx(torch_resp[0][0], 0.1) == onnx_resp[0][0] + assert torch_resp[0][1] == onnx_resp[0][1] From f38f0db2e3af8e7e094ea470aba6460dcbb89b08 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Thu, 1 Feb 2024 14:35:22 +0000 Subject: [PATCH 09/24] remove unneccessary file --- .../evaluation/integrations/None_rank0.db | Bin 12288 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/deepsparse/evaluation/integrations/None_rank0.db diff --git a/tests/deepsparse/evaluation/integrations/None_rank0.db b/tests/deepsparse/evaluation/integrations/None_rank0.db deleted file mode 100644 index 6ab676686ae18918fc57fe171b5062bc08bd64e2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12288 zcmeI#&r8EF6u|MM$`E06w;g(U?4+QA_z&1h4{C?n6?CT(S%hKNp{AlI|33dCk2a$_ z?_RzS^5Z2WkWWr;_K+=|m6!GV$~d`DBc-*v6j4g)w(YfT8$RFPChh3+U7^+4U7M9%bVG2Jr*V-= z5hqzHlWuh)N3ZrVu^-6hAT!C)LmS}p5^{rY&T+N!=Z zjazT>vb(4}? 
Date: Fri, 2 Feb 2024 11:56:01 +0000 Subject: [PATCH 10/24] initial commit --- .../integrations/lm_evaluation_harness.py | 95 ++++++++++++++++++- src/deepsparse/evaluation/utils.py | 1 + .../test_lm_evaluation_harness.py | 28 +++--- 3 files changed, 107 insertions(+), 17 deletions(-) diff --git a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py index 9b1d23e855..cedef9c643 100644 --- a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py +++ b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py @@ -22,7 +22,7 @@ import numpy from tqdm import tqdm from transformers import AutoTokenizer - +import copy from deepsparse import Pipeline from deepsparse.evaluation.registry import EvaluationRegistry from deepsparse.evaluation.results import Dataset, Evaluation, Metric, Result @@ -30,7 +30,7 @@ from lm_eval import evaluator, tasks, utils from lm_eval.api.instance import Instance from lm_eval.api.model import LM -from lm_eval.utils import Reorderer +from collections import defaultdict tasks.initialize_tasks("INFO") @@ -113,6 +113,7 @@ def __init__( self, pipeline: Pipeline, batch_size: int = 1, + max_gen_toks: int = 128, tokenizer: Optional[AutoTokenizer] = None, ): """ @@ -126,6 +127,7 @@ def __init__( self.batch_size = batch_size self.tokenizer = tokenizer or pipeline.tokenizer self._max_length = pipeline.sequence_length + self._max_gen_toks = max_gen_toks def tok_encode(self, string: str) -> List[int]: return self.tokenizer.encode(string) @@ -137,6 +139,10 @@ def tok_decode(self, tokens: List[int]) -> str: def max_length(self) -> int: return self._max_length + @property + def max_gen_toks(self) -> int: + return self._max_gen_toks + def loglikelihood(self, requests) -> List[Tuple[float, bool]]: """ Copied directly from @@ -219,10 +225,91 @@ def _collate(x): def loglikelihood_rolling( self, requests: list[Instance] ) -> list[tuple[float, bool]]: - pass + raise NotImplementedError() def generate_until(self, requests: list[Instance]) -> list[str]: - pass + res = defaultdict(list) + re_ords = {} + + def _collate(x): + # the negative sign on len(toks) sorts descending + toks = self.tok_encode(x[0]) + return -len(toks), x[0] + + # we group requests by their generation_kwargs, + # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling + # in the same batch. + grouper = utils.Grouper(requests, lambda x: str(x.args[1])) + for key, reqs in grouper.get_grouped().items(): + # within each set of reqs for given kwargs, we reorder by token length, descending. + re_ords[key] = utils.Reorderer([req.args for req in reqs], _collate) + + pbar = tqdm(total=len(requests)) + # for each different set of kwargs, we execute all requests, by batch. + for key, re_ord in re_ords.items(): + chunks = utils.chunks(re_ord.get_reordered(), n=self.batch_size) + for chunk in chunks: + contexts, all_gen_kwargs = zip(*chunk) + # we assume all gen kwargs in the batch are the same + # this is safe to assume because the `grouper` object ensures it. + gen_kwargs = all_gen_kwargs[0] + # unpack our keyword arguments. 
+ until = None + if isinstance(gen_kwargs, dict): + kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1 + if "until" in kwargs.keys(): + until = kwargs.pop("until") + if isinstance(until, str): + until = [kwargs] + elif not isinstance(until, list): + raise ValueError( + f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}" + ) + else: + raise ValueError( + f"Expected `kwargs` to be of type `dict` but got {kwargs}" + ) + + if not until: + until = [self.tok_decode(self.eot_token_id)] + + if "max_gen_toks" in kwargs.keys(): + max_gen_toks = kwargs.pop("max_gen_toks") + else: + max_gen_toks = self.max_gen_toks + + # we require users to pass do_sample=True explicitly for non-greedy gen + if "do_sample" not in kwargs.keys(): + kwargs["do_sample"] = False + + # first stop sequence is used to halt generation upon encountering + primary_until = [until[0]] + + responses = self.pipeline( + sequences=contexts, + max_new_tokens=max_gen_toks, + stop=until, + **kwargs, + ) + + responses = responses if type(responses) is list else [responses] + for response, context in zip(responses, contexts): + text = response.generations[0].text + # use secondary stop seqs to cut off should-have-been-stopped content post-hoc + for term in until: + if len(term) > 0: + # ignore possible empty separators + text = text.split(term)[0] + + res[key].append(text) + self.cache_hook.add_partial("greedy_until", (context, gen_kwargs), text) + pbar.update(1) + # reorder this group of results back to original unsorted form + res[key] = re_ord.get_original(res[key]) + + pbar.close() + + return grouper.get_original(res) def _encode_pair( self, context: str, continuation: str diff --git a/src/deepsparse/evaluation/utils.py b/src/deepsparse/evaluation/utils.py index 0534a9f9f3..2b3bc5e8c7 100644 --- a/src/deepsparse/evaluation/utils.py +++ b/src/deepsparse/evaluation/utils.py @@ -146,6 +146,7 @@ def create_model_from_target( :param engine_type: The engine type to initialize the model with. 
:return: The initialized model """ + engine_type = engine_type or DEEPSPARSE_ENGINE if engine_type in [DEEPSPARSE_ENGINE, ORT_ENGINE]: return Pipeline.create( task="text-generation", diff --git a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py index 726f9f87eb..e982824966 100644 --- a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py +++ b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py @@ -20,10 +20,10 @@ @pytest.mark.parametrize( "datasets", [ - ["hellaswag"], + #["hellaswag"], # ["hellaswag", "gsm8k"], - # "gsm8k", - "arc_challenge", + "gsm8k", + #"arc_challenge", ], ) @pytest.mark.parametrize( @@ -39,22 +39,24 @@ def test_integration_eval_onnx_matches_torch(datasets, batch_size): integration_eval, ) - out_torch = integration_eval( - model="hf", - model_args="pretrained=roneneldan/TinyStories-1M,dtype=float16", - datasets=datasets, - batch_size=batch_size, - limit=2, - use_cache=None, # avoid saving files when running tests - ) + # out_torch = integration_eval( + # model="hf", + # model_args="pretrained=roneneldan/TinyStories-1M,dtype=float16", + # datasets=datasets, + # batch_size=batch_size, + # limit=1, + # use_cache=None, # avoid saving files when running tests + # ) out_onnx = integration_eval( model=create_model_from_target( - "hf:mgoin/TinyStories-1M-ds", engine_type="onnxruntime", sequence_length=128 + "hf:mgoin/TinyStories-1M-ds", + #engine_type="onnxruntime", + sequence_length=1024 ), datasets=datasets, batch_size=batch_size, - limit=2, + limit=1, use_cache=None, # avoid saving files when running tests ) From 35454a1e81b655c46985bb15e6b1fe5bdf44d9a1 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Fri, 2 Feb 2024 12:49:54 +0000 Subject: [PATCH 11/24] tests passing, refactor time! 
--- .../integrations/lm_evaluation_harness.py | 9 +++-- .../test_lm_evaluation_harness.py | 39 +++++++++++-------- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py index cedef9c643..d4021ba8f3 100644 --- a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py +++ b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py @@ -16,13 +16,15 @@ Integration of the `lm_evaluation_harness`: https://github.com/EleutherAI/lm-evaluation-harness """ +import copy import logging +from collections import defaultdict from typing import Any, Dict, List, Optional, Tuple, Union import numpy from tqdm import tqdm from transformers import AutoTokenizer -import copy + from deepsparse import Pipeline from deepsparse.evaluation.registry import EvaluationRegistry from deepsparse.evaluation.results import Dataset, Evaluation, Metric, Result @@ -30,7 +32,6 @@ from lm_eval import evaluator, tasks, utils from lm_eval.api.instance import Instance from lm_eval.api.model import LM -from collections import defaultdict tasks.initialize_tasks("INFO") @@ -302,7 +303,9 @@ def _collate(x): text = text.split(term)[0] res[key].append(text) - self.cache_hook.add_partial("greedy_until", (context, gen_kwargs), text) + self.cache_hook.add_partial( + "greedy_until", (context, gen_kwargs), text + ) pbar.update(1) # reorder this group of results back to original unsorted form res[key] = re_ord.get_original(res[key]) diff --git a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py index e982824966..86f7eac452 100644 --- a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py +++ b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py @@ -18,12 +18,16 @@ @pytest.mark.parametrize( - "datasets", + "datasets, model_path_ds, model_path_hf", [ - #["hellaswag"], - # ["hellaswag", "gsm8k"], - "gsm8k", - #"arc_challenge", + (["hellaswag"], "hf:mgoin/TinyStories-1M-ds", "roneneldan/TinyStories-1M"), + # (["hellaswag", "gsm8k"],"hf:mgoin/TinyLlama-1.1B-step-50K-105b-ONNX", "TinyLlama/TinyLlama-1.1B-step-50K-105b"), + ( + "gsm8k", + "hf:mgoin/TinyLlama-1.1B-step-50K-105b-ONNX", + "TinyLlama/TinyLlama-1.1B-step-50K-105b", + ), + # ("arc_challenge", "hf:mgoin/TinyStories-1M-ds", "roneneldan/TinyStories-1M"), ], ) @pytest.mark.parametrize( @@ -34,25 +38,26 @@ not try_import_lm_evaluation_harness(raise_error=False), reason="lm_evaluation_harness not installed", ) -def test_integration_eval_onnx_matches_torch(datasets, batch_size): +def test_integration_eval_onnx_matches_torch( + datasets, model_path_ds, model_path_hf, batch_size +): from deepsparse.evaluation.integrations.lm_evaluation_harness import ( integration_eval, ) - # out_torch = integration_eval( - # model="hf", - # model_args="pretrained=roneneldan/TinyStories-1M,dtype=float16", - # datasets=datasets, - # batch_size=batch_size, - # limit=1, - # use_cache=None, # avoid saving files when running tests - # ) + out_torch = integration_eval( + model="hf", + model_args=f"pretrained={model_path_hf},dtype=float16", + datasets=datasets, + batch_size=batch_size, + limit=1, + use_cache=None, # avoid saving files when running tests + ) out_onnx = integration_eval( model=create_model_from_target( - "hf:mgoin/TinyStories-1M-ds", - #engine_type="onnxruntime", - sequence_length=1024 + model_path_ds, + 
engine_type="onnxruntime", ), datasets=datasets, batch_size=batch_size, From d3b84f8b76d94e6beaa459556d608f9422b3337c Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Fri, 2 Feb 2024 13:28:39 +0000 Subject: [PATCH 12/24] cleanup --- .../integrations/lm_evaluation_harness.py | 38 ++++++++++++++----- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py index d4021ba8f3..6bf8062a2c 100644 --- a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py +++ b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py @@ -114,16 +114,22 @@ def __init__( self, pipeline: Pipeline, batch_size: int = 1, - max_gen_toks: int = 128, + max_gen_toks: int = 256, tokenizer: Optional[AutoTokenizer] = None, ): """ Wrapper around the DeepSparse pipeline to make it compatible with the llm-evaluation-harness. + + :param pipeline: the pipeline object to wrap + :param batch_size: the batch size to use for evaluation + :param max_gen_toks: the maximum number of tokens to generate + when using the model for generation (see: greed_until method) + :param tokenizer: the tokenizer to use for encoding and decoding + strings and tokens. By default, the tokenizer from the pipeline """ super().__init__() - # Initialize new model and tokenizer instances self.pipeline = pipeline self.batch_size = batch_size self.tokenizer = tokenizer or pipeline.tokenizer @@ -164,6 +170,13 @@ def _loglikelihood_tokens( requests: List[Tuple[Tuple[str, str], List[int], List[int]]], disable_tqdm: bool = False, ) -> List[Tuple[float, bool]]: + """ + The function to compute the loglikelihood of the continuation + tokens given the context tokens. + + This function is an adapted version of the original function from + https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py + """ res = [] def _collate(x): @@ -226,9 +239,18 @@ def _collate(x): def loglikelihood_rolling( self, requests: list[Instance] ) -> list[tuple[float, bool]]: - raise NotImplementedError() + raise NotImplementedError( + "The method not required by any of our " "current task integrations so far" + ) def generate_until(self, requests: list[Instance]) -> list[str]: + """ + The function to generate a certain number of new tokens + given a context. 
+ + This function is an adapted version of the original function from + https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py + """ res = defaultdict(list) re_ords = {} @@ -283,19 +305,15 @@ def _collate(x): if "do_sample" not in kwargs.keys(): kwargs["do_sample"] = False - # first stop sequence is used to halt generation upon encountering - primary_until = [until[0]] - - responses = self.pipeline( + out = self.pipeline( sequences=contexts, max_new_tokens=max_gen_toks, stop=until, **kwargs, ) - responses = responses if type(responses) is list else [responses] - for response, context in zip(responses, contexts): - text = response.generations[0].text + for gen, context in zip(out.generations, contexts): + text = gen.text # use secondary stop seqs to cut off should-have-been-stopped content post-hoc for term in until: if len(term) > 0: From e7d8c3127dafcbec9b380949a3fe189da77b24ba Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Mon, 5 Feb 2024 11:35:26 +0100 Subject: [PATCH 13/24] Update test_evaluator.py --- tests/deepsparse/evaluation/test_evaluator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/deepsparse/evaluation/test_evaluator.py b/tests/deepsparse/evaluation/test_evaluator.py index dedd63fa36..61a1eb3891 100644 --- a/tests/deepsparse/evaluation/test_evaluator.py +++ b/tests/deepsparse/evaluation/test_evaluator.py @@ -133,6 +133,6 @@ def test_cli(tmp_path, target, datasets, dummy_integration_name, type_serializat standalone_mode=False, ) # makes sure that the result file is created - assert os.path.isfile( - os.path.join(os.path.dirname(str(tmp_path)), f"result.{type_serialization}") + assert os.path.isfile(os.path.join(os.path.dirname(str(tmp_path)), + f"result.{type_serialization}") ) From a148fc5177ec37db3925a02b36d4002c7f2457b9 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Mon, 5 Feb 2024 10:38:51 +0000 Subject: [PATCH 14/24] finished --- .../integrations/lm_evaluation_harness.py | 201 +++++++++--------- .../test_lm_evaluation_harness.py | 130 +++++++---- 2 files changed, 182 insertions(+), 149 deletions(-) diff --git a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py index 6bf8062a2c..7931d12f72 100644 --- a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py +++ b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py @@ -135,6 +135,7 @@ def __init__( self.tokenizer = tokenizer or pipeline.tokenizer self._max_length = pipeline.sequence_length self._max_gen_toks = max_gen_toks + self.batch_sizes = {} def tok_encode(self, string: str) -> List[int]: return self.tokenizer.encode(string) @@ -190,6 +191,10 @@ def _collate(x): list(utils.chunks(re_ord.get_reordered(), self.batch_size)), disable=disable_tqdm, ): + batch_inp = [] + batch_cache_key = [] + batch_continuation_enc = [] + # len(chunk) is the batch_size for cache_key, context_enc, continuation_enc in chunk: # how this all works (illustrated on a causal decoder-only setup): # CTX CONT @@ -200,39 +205,45 @@ def _collate(x): inp = (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1] - response = self.pipeline( - prompt=self.tokenizer.decode(inp), - max_new_tokens=0, - output_scores=True, - include_prompt_logits=True, - ) - - for i, resp in enumerate(response.generations): - # (seq_len, vocab_size) - multi_scores = resp.score - # (seq_len, vocab_size) but with softmax applied - multi_logits = 
numpy_log_softmax(multi_scores, axis=1) - # toss out the context half of the sequence - # (cont_len, vocab_size) - continuation_multi_logits = multi_logits[-len(continuation_enc) :] - - # pick out the logits for the continuation tokens - # (cont_len,) - continuation_logits = continuation_multi_logits[ - numpy.arange(len(continuation_enc)), continuation_enc - ] - # check if the tokens generated greedly are the same - # as the expected continuation - greedy_tokens = continuation_multi_logits.argmax(axis=1) - max_equal = greedy_tokens.tolist() == continuation_enc - - # Answer: (log prob, is-exact-match) - answer = (float(continuation_logits.sum()), bool(max_equal)) - - res.append(answer) - - if cache_key is not None: - self.cache_hook.add_partial("loglikelihood", cache_key, answer) + batch_inp.append(self.tokenizer.decode(inp)) + batch_cache_key.append(cache_key) + batch_continuation_enc.append(continuation_enc) + + response = self.pipeline( + prompt=batch_inp, + max_new_tokens=0, + output_scores=True, + include_prompt_logits=True, + ) + + for resp, continuation_enc, cache_key in zip( + response.generations, batch_continuation_enc, batch_cache_key + ): + # (seq_len, vocab_size) + multi_scores = resp.score + # (seq_len, vocab_size) but with softmax applied + multi_logits = numpy_log_softmax(multi_scores, axis=1) + # toss out the context half of the sequence + # (cont_len, vocab_size) + continuation_multi_logits = multi_logits[-len(continuation_enc) :] + + # pick out the logits for the continuation tokens + # (cont_len,) + continuation_logits = continuation_multi_logits[ + numpy.arange(len(continuation_enc)), continuation_enc + ] + # check if the tokens generated greedly are the same + # as the expected continuation + greedy_tokens = continuation_multi_logits.argmax(axis=1) + max_equal = greedy_tokens.tolist() == continuation_enc + + # Answer: (log prob, is-exact-match) + answer = (float(continuation_logits.sum()), bool(max_equal)) + + res.append(answer) + + if cache_key is not None: + self.cache_hook.add_partial("loglikelihood", cache_key, answer) return re_ord.get_original(res) @@ -251,86 +262,70 @@ def generate_until(self, requests: list[Instance]) -> list[str]: This function is an adapted version of the original function from https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py """ - res = defaultdict(list) - re_ords = {} + if not requests: + return [] + res = [] + requests = [req.args for req in requests] def _collate(x): - # the negative sign on len(toks) sorts descending toks = self.tok_encode(x[0]) - return -len(toks), x[0] + return len(toks), x[0] + + re_ord = utils.Reorderer(requests, _collate) + + def sameuntil_chunks(xs, size): + ret = [] + lastuntil = xs[0][1] + for x in xs: + if len(ret) >= size or x[1] != lastuntil: + yield ret, lastuntil + ret = [] + lastuntil = x[1] + ret.append(x) - # we group requests by their generation_kwargs, - # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling - # in the same batch. - grouper = utils.Grouper(requests, lambda x: str(x.args[1])) - for key, reqs in grouper.get_grouped().items(): - # within each set of reqs for given kwargs, we reorder by token length, descending. - re_ords[key] = utils.Reorderer([req.args for req in reqs], _collate) + if ret: + yield ret, lastuntil pbar = tqdm(total=len(requests)) - # for each different set of kwargs, we execute all requests, by batch. 
- for key, re_ord in re_ords.items(): - chunks = utils.chunks(re_ord.get_reordered(), n=self.batch_size) - for chunk in chunks: - contexts, all_gen_kwargs = zip(*chunk) - # we assume all gen kwargs in the batch are the same - # this is safe to assume because the `grouper` object ensures it. - gen_kwargs = all_gen_kwargs[0] - # unpack our keyword arguments. - until = None - if isinstance(gen_kwargs, dict): - kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1 - if "until" in kwargs.keys(): - until = kwargs.pop("until") - if isinstance(until, str): - until = [kwargs] - elif not isinstance(until, list): - raise ValueError( - f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}" - ) - else: - raise ValueError( - f"Expected `kwargs` to be of type `dict` but got {kwargs}" - ) - - if not until: - until = [self.tok_decode(self.eot_token_id)] - - if "max_gen_toks" in kwargs.keys(): - max_gen_toks = kwargs.pop("max_gen_toks") - else: - max_gen_toks = self.max_gen_toks - - # we require users to pass do_sample=True explicitly for non-greedy gen - if "do_sample" not in kwargs.keys(): - kwargs["do_sample"] = False - - out = self.pipeline( - sequences=contexts, - max_new_tokens=max_gen_toks, - stop=until, - **kwargs, - ) + for chunk, request_args in tqdm( + list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size)) + ): + inps = [] - for gen, context in zip(out.generations, contexts): - text = gen.text - # use secondary stop seqs to cut off should-have-been-stopped content post-hoc - for term in until: - if len(term) > 0: - # ignore possible empty separators - text = text.split(term)[0] - - res[key].append(text) - self.cache_hook.add_partial( - "greedy_until", (context, gen_kwargs), text - ) - pbar.update(1) - # reorder this group of results back to original unsorted form - res[key] = re_ord.get_original(res[key]) + self._max_gen_toks = request_args.pop("max_gen_toks", self.max_gen_toks) + print(self._max_gen_toks) + + for context, _ in chunk: + inps.append(context) + + until = request_args.pop("until", ["<|endoftext|>"]) + request_args.pop("do_sample", None) + request_args["temperature"] = request_args.get("temperature", 0) + + out = self.pipeline( + sequences=inps, + max_new_tokens=self.max_gen_toks - 1, + stop=until, + **request_args, + ) + + for resp, (context, args_) in zip(out.generations, chunk): + text = resp.text + until_ = until + for term in until_: + if len(term) > 0: + text = text.split(term)[0] + + res.append(text) + + self.cache_hook.add_partial( + "generate_until", (context, {"until": until_}), text + ) + pbar.update(1) pbar.close() - return grouper.get_original(res) + return re_ord.get_original(res) def _encode_pair( self, context: str, continuation: str diff --git a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py index 86f7eac452..91520ee300 100644 --- a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py +++ b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py @@ -17,61 +17,99 @@ from deepsparse.evaluation.utils import create_model_from_target -@pytest.mark.parametrize( - "datasets, model_path_ds, model_path_hf", - [ - (["hellaswag"], "hf:mgoin/TinyStories-1M-ds", "roneneldan/TinyStories-1M"), - # (["hellaswag", "gsm8k"],"hf:mgoin/TinyLlama-1.1B-step-50K-105b-ONNX", "TinyLlama/TinyLlama-1.1B-step-50K-105b"), - ( - "gsm8k", - "hf:mgoin/TinyLlama-1.1B-step-50K-105b-ONNX", - "TinyLlama/TinyLlama-1.1B-step-50K-105b", - 
), - # ("arc_challenge", "hf:mgoin/TinyStories-1M-ds", "roneneldan/TinyStories-1M"), - ], -) @pytest.mark.parametrize( "batch_size", - [1], # TODO: Add test for higher batch sizes + [1, 3], ) @pytest.mark.skipif( not try_import_lm_evaluation_harness(raise_error=False), reason="lm_evaluation_harness not installed", ) -def test_integration_eval_onnx_matches_torch( - datasets, model_path_ds, model_path_hf, batch_size -): - from deepsparse.evaluation.integrations.lm_evaluation_harness import ( - integration_eval, - ) +class TestLMEval: + @pytest.fixture() + def integration_eval(self): + from deepsparse.evaluation.integrations.lm_evaluation_harness import ( + integration_eval as eval_fn, + ) + + return eval_fn - out_torch = integration_eval( - model="hf", - model_args=f"pretrained={model_path_hf},dtype=float16", - datasets=datasets, - batch_size=batch_size, - limit=1, - use_cache=None, # avoid saving files when running tests + @pytest.mark.parametrize( + "datasets_likelihood", + [ + "hellaswag", + ["arc_challenge"], + ["hellaswag", "arc_challenge"], + ], ) + def test_likelihood_scenario( + self, batch_size, datasets_likelihood, integration_eval + ): + model_path_ds = "hf:mgoin/TinyStories-1M-ds" + model_path_hf = "roneneldan/TinyStories-1M" + + out_onnx = integration_eval( + model=create_model_from_target( + model_path_ds, + engine_type="onnxruntime", + ), + datasets=datasets_likelihood, + batch_size=batch_size, + limit=2, + use_cache=None, # avoid saving files when running tests + ) - out_onnx = integration_eval( - model=create_model_from_target( - model_path_ds, - engine_type="onnxruntime", - ), - datasets=datasets, - batch_size=batch_size, - limit=1, - use_cache=None, # avoid saving files when running tests + out_torch = integration_eval( + model="hf", + model_args=f"pretrained={model_path_hf}", + datasets=datasets_likelihood, + batch_size=batch_size, + limit=2, + use_cache=None, # avoid saving files when running tests + ) + self._test_same(out_onnx, out_torch, datasets_likelihood) + + @pytest.mark.parametrize( + "datasets_greedy_until", + [ + "gsm8k", + ], ) + def test_greedy_until_scenario( + self, batch_size, datasets_greedy_until, integration_eval + ): + model_path_ds = "hf:mgoin/TinyLlama-1.1B-step-50K-105b-ONNX" + model_path_hf = "TinyLlama/TinyLlama-1.1B-step-50K-105b" + + out_onnx = integration_eval( + model=create_model_from_target(model_path_ds, engine_type="onnxruntime"), + datasets=datasets_greedy_until, + batch_size=batch_size, + limit=2, + gen_kwargs="max_gen_toks=16", + use_cache=None, # avoid saving files when running tests + ) + + out_torch = integration_eval( + model="hf", + model_args=f"pretrained={model_path_hf}", + datasets=datasets_greedy_until, + batch_size=batch_size, + limit=2, + gen_kwargs="max_gen_toks=16", + use_cache=None, # avoid saving files when running tests + ) + self._test_same(out_onnx, out_torch, datasets_greedy_until) + + @staticmethod + def _test_same(out_onnx, out_torch, datasets): + datasets = datasets if isinstance(datasets, list) else [datasets] + for dataset in datasets: + torch_samples = out_torch.raw["samples"][dataset] + onnx_samples = out_onnx.raw["samples"][dataset] + for torch_sample, onnx_sample in zip(torch_samples, onnx_samples): + print(torch_sample) + print(onnx_sample) + print(torch_sample["resps"], onnx_sample["resps"]) + assert torch_sample["resps"] == onnx_sample["resps"] - datasets = datasets if isinstance(datasets, list) else [datasets] - for dataset in datasets: - torch_samples = out_torch.raw["samples"][dataset] - onnx_samples = 
out_onnx.raw["samples"][dataset] - for torch_sample, onnx_sample in zip(torch_samples, onnx_samples): - for torch_resp, onnx_resp in zip( - torch_sample["resps"], onnx_sample["resps"] - ): - assert pytest.approx(torch_resp[0][0], 0.1) == onnx_resp[0][0] - assert torch_resp[0][1] == onnx_resp[0][1] From a9e98478ec394b673749f2c496228a4061b02281 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Mon, 5 Feb 2024 10:54:06 +0000 Subject: [PATCH 15/24] quality --- src/deepsparse/evaluation/cli.py | 2 +- tests/deepsparse/evaluation/test_utils.py | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index 43eaa33790..b68d32d4e5 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -20,7 +20,7 @@ Module for evaluating models on the various evaluation integrations OPTIONS: - --model_path MODEL_PATH + --model_path MODEL_PATH A path to an ONNX model, local directory containing ONNX model (including all the auxiliary files) or a SparseZoo stub -d DATASET, --dataset DATASET diff --git a/tests/deepsparse/evaluation/test_utils.py b/tests/deepsparse/evaluation/test_utils.py index a16cb8ee32..f8f3c731a8 100644 --- a/tests/deepsparse/evaluation/test_utils.py +++ b/tests/deepsparse/evaluation/test_utils.py @@ -14,12 +14,6 @@ import os -from transformers import ( - AutoModelForCausalLM, - AutoModelForSequenceClassification, - GPTNeoForCausalLM, -) - import pytest from deepsparse import Pipeline from deepsparse.evaluation.utils import ( From b5a6d6d90af3aed8f5e091ead177951839a19e6e Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Mon, 5 Feb 2024 13:32:59 +0000 Subject: [PATCH 16/24] manual testing --- src/deepsparse/evaluation/cli.py | 14 ++-- src/deepsparse/evaluation/evaluator.py | 1 - .../evaluation/integrations/__init__.py | 3 +- .../integrations/lm_evaluation_harness.py | 24 +++---- src/deepsparse/evaluation/registry.py | 2 +- src/deepsparse/evaluation/utils.py | 71 +++++++++++++++---- .../test_lm_evaluation_harness.py | 33 +++++---- tests/deepsparse/evaluation/test_evaluator.py | 1 - 8 files changed, 96 insertions(+), 53 deletions(-) diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index b68d32d4e5..d192dd67a1 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -20,7 +20,7 @@ Module for evaluating models on the various evaluation integrations OPTIONS: - --model_path MODEL_PATH + MODEL_PATH A path to an ONNX model, local directory containing ONNX model (including all the auxiliary files) or a SparseZoo stub -d DATASET, --dataset DATASET @@ -72,7 +72,7 @@ from deepsparse.evaluation.evaluator import evaluate from deepsparse.evaluation.results import Result, save_result -from deepsparse.evaluation.utils import args_to_dict, get_save_path +from deepsparse.evaluation.utils import get_save_path, parse_kwarg_tuples from deepsparse.operators.engine_operator import ( DEEPSPARSE_ENGINE, ORT_ENGINE, @@ -88,12 +88,10 @@ ignore_unknown_options=True, ) ) -@click.option( - "--model_path", +@click.argument( + "model_path", type=click.Path(dir_okay=True, file_okay=True), required=True, - help="A path to an ONNX model, local directory containing ONNX model" - "(including all the auxiliary files) or a SparseZoo stub", ) @click.option( "-d", @@ -178,7 +176,7 @@ def main( # join datasets to a list if multiple datasets are passed datasets = list(dataset) if not isinstance(dataset, str) else dataset # format kwargs to a dict - integration_args = 
args_to_dict(integration_args) + integration_args = parse_kwarg_tuples(integration_args) _LOGGER.info( f"Creating {engine_type} pipeline to evaluate from model path: {model_path}" @@ -203,7 +201,7 @@ def main( **integration_args, ) - _LOGGER.info(f"Evaluation done. Results:\n{result}") + _LOGGER.info(f"Evaluation done. Results:\n{result.formatted}") save_path = get_save_path( save_path=save_path, diff --git a/src/deepsparse/evaluation/evaluator.py b/src/deepsparse/evaluation/evaluator.py index b513f07563..3d18f8489f 100644 --- a/src/deepsparse/evaluation/evaluator.py +++ b/src/deepsparse/evaluation/evaluator.py @@ -65,7 +65,6 @@ def evaluate( return eval_integration( pipeline=pipeline, datasets=datasets, - engine_type=engine_type, batch_size=batch_size, splits=splits, metrics=metrics, diff --git a/src/deepsparse/evaluation/integrations/__init__.py b/src/deepsparse/evaluation/integrations/__init__.py index 1cc3bfacf0..c7e8d3c5fa 100644 --- a/src/deepsparse/evaluation/integrations/__init__.py +++ b/src/deepsparse/evaluation/integrations/__init__.py @@ -24,8 +24,7 @@ def try_import_lm_evaluation_harness(raise_error=False): if raise_error: raise ImportError( "Unable to import lm_eval. " - "To install run 'pip install " - "git+https://github.com/EleutherAI/lm-evaluation-harness@b018a7d51'" + "To install run 'pip install lm-eval==0.4.0'" ) return False diff --git a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py index 7931d12f72..1d13bb37ee 100644 --- a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py +++ b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py @@ -16,14 +16,11 @@ Integration of the `lm_evaluation_harness`: https://github.com/EleutherAI/lm-evaluation-harness """ -import copy import logging -from collections import defaultdict from typing import Any, Dict, List, Optional, Tuple, Union import numpy from tqdm import tqdm -from transformers import AutoTokenizer from deepsparse import Pipeline from deepsparse.evaluation.registry import EvaluationRegistry @@ -43,9 +40,11 @@ @EvaluationRegistry.register(name="lm-evaluation-harness") def integration_eval( - model: Any, + pipeline: Pipeline, datasets: Union[List[str], str], - batch_size: int, + batch_size: int = 1, + splits: Union[List[str], str, None] = None, + metrics: Union[List[str], str, None] = None, **kwargs, ) -> Result: """ @@ -53,15 +52,14 @@ def integration_eval( https://github.com/EleutherAI/lm-evaluation-harness/blob/master/main.py that is compatible with deepsparse.evaluator.py - :param model: the model/pipeline to evaluate + :param pipeline: the model/pipeline to evaluate :param datasets: the datasets to evaluate on :param batch_size: the batch size to use for evaluation :param kwargs: additional arguments to alter the behavior of the evaluation :return the evaluation results """ - if isinstance(model, Pipeline): - model = DeepSparseLM(pipeline=model, batch_size=batch_size) + pipeline = DeepSparseLM(pipeline=pipeline, batch_size=batch_size) datasets = (",").join(datasets) if isinstance(datasets, list) else datasets task_names = utils.pattern_match(datasets.split(","), tasks.ALL_TASKS) @@ -69,7 +67,7 @@ def integration_eval( _LOGGER.info(f"Selected Tasks: {task_names}") results_raw = evaluator.simple_evaluate( - model=model, tasks=task_names, batch_size=batch_size, **kwargs + model=pipeline, tasks=task_names, batch_size=batch_size, **kwargs ) results = Result( @@ -115,7 +113,7 @@ def __init__( pipeline: Pipeline, 
batch_size: int = 1, max_gen_toks: int = 256, - tokenizer: Optional[AutoTokenizer] = None, + tokenizer: Optional["AutoTokenizer"] = None, ): """ Wrapper around the DeepSparse pipeline to make it compatible with the @@ -260,7 +258,7 @@ def generate_until(self, requests: list[Instance]) -> list[str]: given a context. This function is an adapted version of the original function from - https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/huggingface.py + https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/openai_completions.py """ if not requests: return [] @@ -293,15 +291,16 @@ def sameuntil_chunks(xs, size): inps = [] self._max_gen_toks = request_args.pop("max_gen_toks", self.max_gen_toks) - print(self._max_gen_toks) for context, _ in chunk: + # add context (prompts) to the list inps.append(context) until = request_args.pop("until", ["<|endoftext|>"]) request_args.pop("do_sample", None) request_args["temperature"] = request_args.get("temperature", 0) + # run inference (generate max_gen_toks tokens) out = self.pipeline( sequences=inps, max_new_tokens=self.max_gen_toks - 1, @@ -312,6 +311,7 @@ def sameuntil_chunks(xs, size): for resp, (context, args_) in zip(out.generations, chunk): text = resp.text until_ = until + # split the text at the first occurrence of any of the until tokens for term in until_: if len(term) > 0: text = text.split(term)[0] diff --git a/src/deepsparse/evaluation/registry.py b/src/deepsparse/evaluation/registry.py index 2daabb69cc..343cd9786c 100644 --- a/src/deepsparse/evaluation/registry.py +++ b/src/deepsparse/evaluation/registry.py @@ -57,7 +57,7 @@ def resolve( if integration is None: _LOGGER.info( - "No integration specified, inferring the evaluation" + "No integration specified, inferring the evaluation " "function from the input arguments..." ) integration = resolve_integration(pipeline, datasets) diff --git a/src/deepsparse/evaluation/utils.py b/src/deepsparse/evaluation/utils.py index 998a346780..c170b29476 100644 --- a/src/deepsparse/evaluation/utils.py +++ b/src/deepsparse/evaluation/utils.py @@ -11,7 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import ast +import logging import os from typing import Any, Dict, List, Optional, Tuple, Union @@ -22,10 +23,10 @@ __all__ = [ "create_pipeline", "get_save_path", - "args_to_dict", + "parse_kwarg_tuples", "resolve_integration", ] - +_LOGGER = logging.getLogger(__name__) LM_EVALUATION_HARNESS = "lm-evaluation-harness" @@ -80,24 +81,66 @@ def if_generative_language_model(pipeline: Pipeline) -> bool: return False -def args_to_dict(args: Tuple[Any, ...]) -> Dict[str, Any]: +def parse_kwarg_tuples(kwargs: tuple) -> Dict: """ - Convert a tuple of args to a dict of args. - - :param args: The args to convert. Should be a tuple of alternating - arg names and arg values e.g.('--arg1', 1, 'arg2', 2, -arg3', 3). + Convert a tuple of kwargs to a dict of kwargs. + This function is used to enable the click parsing of kwargs. + + Example use: + ``` + @click.command( + context_settings=dict( + ignore_unknown_options=True) + ) + @click.argument(...) + @click.option(...) + ... + @click.argument("kwargs", nargs=-1, type=click.UNPROCESSED) + def main(..., kwargs): + ... 
+ kwargs: Dict[str, Any] = parse_kwarg_tuples(kwargs) + ``` + + Example inputs, outputs: + ``` + input = ('--arg1', 1, 'arg2', 2, '-arg3', 3) + output = parse_kwarg_tuples(input) + output = {'arg1': 1, 'arg2': 2, 'arg3': 3} + ``` + + :param kwargs: The kwargs to convert. Should be a tuple of alternating + kwargs names and kwargs values e.g.('--arg1', 1, 'arg2', 2, '-arg3', 3). The names can optionally have a '-' or `--` in front of them. - :return: The converted args as a dict. + :return: The converted kwargs as a dict. """ - if len(args) == 0: + if len(kwargs) == 0: return {} + if len(kwargs) % 2 != 0: + raise ValueError( + "kwargs must be a tuple of alternating names and values " + "i.e. the length of kwargs tuple must be even. Received " + f"kwargs: {kwargs}" + ) # names are uneven indices, values are even indices - args_names = args[0::2] - args_values = args[1::2] + kwargs_names = kwargs[0::2] + kwargs_values = kwargs[1::2] + # by default kwargs values are strings, so convert them + # to the appropriate type if possible + kwargs_values = list(kwargs_values) + for i, value in enumerate(kwargs_values): + try: + kwargs_values[i] = ast.literal_eval(value) + except Exception as e: # noqa E841 + _LOGGER.debug( + f"Failed to infer non-string type " + f"from kwarg value: {value}. It will " + f"be left as a string." + ) + # remove any '-' or '--' from the names - args_names = [name.lstrip("-") for name in args_names] + kwargs_names = [name.lstrip("-") for name in kwargs_names] - return dict(zip(args_names, args_values)) + return dict(zip(kwargs_names, kwargs_values)) def get_save_path( diff --git a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py index 61a24e3d75..8d8b343dd5 100644 --- a/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py +++ b/tests/deepsparse/evaluation/integrations/test_lm_evaluation_harness.py @@ -43,6 +43,7 @@ def integration_eval(self): ], ) def test_likelihood_scenario(self, batch_size, datasets, integration_eval): + model_path_ds = "hf:mgoin/TinyStories-1M-ds" model_path_hf = "roneneldan/TinyStories-1M" limit = 2 @@ -58,15 +59,18 @@ def test_likelihood_scenario(self, batch_size, datasets, integration_eval): use_cache=None, # avoid saving files when running tests ) - out_torch = integration_eval( + from lm_eval import evaluator, tasks, utils + + datasets_ = (",").join(datasets) if isinstance(datasets, list) else datasets + out_torch = evaluator.simple_evaluate( model="hf", model_args=f"pretrained={model_path_hf}", - datasets=datasets, + tasks=utils.pattern_match(datasets_.split(","), tasks.ALL_TASKS), batch_size=batch_size, limit=limit, use_cache=None, # avoid saving files when running tests ) - self._test_same(out_onnx, out_torch, datasets) + self._test_same(out_onnx.raw, out_torch, datasets) @pytest.mark.parametrize( "datasets", [ "gsm8k", ], ) - def test_greedy_until_scenario( - self, batch_size, datasets, integration_eval, greedy=True - ): + def test_greedy_until_scenario(self, batch_size, datasets, integration_eval): model_path_ds = "hf:mgoin/TinyLlama-1.1B-step-50K-105b-ONNX" model_path_hf = "TinyLlama/TinyLlama-1.1B-step-50K-105b" limit = 2 @@ -85,7 +87,7 @@ def test_greedy_until_scenario( gen_kwargs = "max_gen_toks=16" out_onnx = integration_eval( - model=create_pipeline(model_path_ds, engine_type="onnxruntime"), + create_pipeline(model_path_ds,
engine_type="onnxruntime"), datasets=datasets, batch_size=batch_size, limit=limit, @@ -93,23 +95,26 @@ def test_greedy_until_scenario( use_cache=None, # avoid saving files when running tests ) - out_torch = integration_eval( + from lm_eval import evaluator, tasks, utils + + datasets_ = (",").join(datasets) if isinstance(datasets, list) else datasets + out_torch = evaluator.simple_evaluate( model="hf", model_args=f"pretrained={model_path_hf}", - datasets=datasets, + tasks=utils.pattern_match(datasets_.split(","), tasks.ALL_TASKS), batch_size=batch_size, limit=limit, gen_kwargs=gen_kwargs, use_cache=None, # avoid saving files when running tests ) - self._test_same(out_onnx, out_torch, datasets) + self._test_same(out_onnx.raw, out_torch, datasets) @staticmethod def _test_same(out_onnx, out_torch, datasets, greedy=False): datasets = datasets if isinstance(datasets, list) else [datasets] for dataset in datasets: - torch_samples = out_torch.raw["samples"][dataset] - onnx_samples = out_onnx.raw["samples"][dataset] + torch_samples = out_torch["samples"][dataset] + onnx_samples = out_onnx["samples"][dataset] for torch_sample, onnx_sample in zip(torch_samples, onnx_samples): if greedy: # for datasets that validate greedy generation @@ -119,6 +124,6 @@ def _test_same(out_onnx, out_torch, datasets, greedy=False): # for datasets that validate likelihood # make sure that likelihoods are the same assert ( - pytest.approx(torch_sample["resps"][0], 0.01) - == onnx_sample["resps"][0] + pytest.approx(torch_sample["resps"][0][0], 0.0001) + == onnx_sample["resps"][0][0] ) diff --git a/tests/deepsparse/evaluation/test_evaluator.py b/tests/deepsparse/evaluation/test_evaluator.py index 816ad075e0..9e7d21bdae 100644 --- a/tests/deepsparse/evaluation/test_evaluator.py +++ b/tests/deepsparse/evaluation/test_evaluator.py @@ -144,7 +144,6 @@ def test_cli( runner.invoke( main, [ - "--model_path", model_path, "--dataset", datasets[0], From e10f0c97073742623fbc0da91b430198a06a653a Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Mon, 5 Feb 2024 17:04:28 +0000 Subject: [PATCH 17/24] UI improvements --- setup.py | 2 +- src/deepsparse/evaluation/cli.py | 2 -- src/deepsparse/evaluation/utils.py | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 8fe04d23be..ff8269257f 100644 --- a/setup.py +++ b/setup.py @@ -308,7 +308,7 @@ def _setup_entry_points() -> Dict: f"deepsparse.image_classification.eval={ic_eval}", "deepsparse.license=deepsparse.license:main", "deepsparse.validate_license=deepsparse.license:validate_license_cli", - "deepsparse.eval=deepsparse.evaluation.cli:main", + "deepsparse.evaluate=deepsparse.evaluation.cli:main", ] } diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index e1b9cf5c57..d192dd67a1 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -92,8 +92,6 @@ "model_path", type=click.Path(dir_okay=True, file_okay=True), required=True, - help="A path to a remote or local directory containing ONNX/torch model " - "(including all the auxiliary files) or a SparseZoo stub", ) @click.option( "-d", diff --git a/src/deepsparse/evaluation/utils.py b/src/deepsparse/evaluation/utils.py index c170b29476..b2695abaa1 100644 --- a/src/deepsparse/evaluation/utils.py +++ b/src/deepsparse/evaluation/utils.py @@ -27,7 +27,7 @@ "resolve_integration", ] _LOGGER = logging.getLogger(__name__) -LM_EVALUATION_HARNESS = "lm-evaluation-harness" +LM_EVALUATION_HARNESS = "lm-eval-harness" def 
potentially_check_dependency_import(integration_name: str) -> bool: From 48a5900398d399fef23c999cb2ae3d6b973be264 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Tue, 6 Feb 2024 07:28:39 +0000 Subject: [PATCH 18/24] new UI adaptations --- src/deepsparse/evaluation/cli.py | 2 +- .../integrations/lm_evaluation_harness.py | 17 ++++++++++------- src/deepsparse/evaluation/utils.py | 2 +- tests/deepsparse/evaluation/test_evaluator.py | 5 ++--- tests/deepsparse/evaluation/test_utils.py | 2 +- 5 files changed, 15 insertions(+), 13 deletions(-) diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index d192dd67a1..4d97c904bb 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -61,7 +61,7 @@ deepsparse.eval zoo:mpt-7b-mpt_pretrain-base_quantized \ --dataset hellaswag \ --dataset gsm8k \ - --integration lm-evaluation-harness \ + --integration lm-eval-harness \ --limit 2 # limit the number of samples to evaluate on, specific to the integration """ # noqa: E501 diff --git a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py index 1d13bb37ee..7347d91bfb 100644 --- a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py +++ b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py @@ -13,7 +13,7 @@ # limitations under the License. """ -Integration of the `lm_evaluation_harness`: +Integration of the `lm-evaluation-harness`: https://github.com/EleutherAI/lm-evaluation-harness """ import logging @@ -25,6 +25,7 @@ from deepsparse import Pipeline from deepsparse.evaluation.registry import EvaluationRegistry from deepsparse.evaluation.results import Dataset, Evaluation, Metric, Result +from deepsparse.evaluation.utils import LM_EVALUATION_HARNESS from deepsparse.utils.data import numpy_log_softmax from lm_eval import evaluator, tasks, utils from lm_eval.api.instance import Instance @@ -38,7 +39,7 @@ __all__ = ["integration_eval"] -@EvaluationRegistry.register(name="lm-evaluation-harness") +@EvaluationRegistry.register(name=LM_EVALUATION_HARNESS) def integration_eval( pipeline: Pipeline, datasets: Union[List[str], str], @@ -83,7 +84,7 @@ def format_raw_results(results: Dict[str, Any]) -> List[Evaluation]: Format the raw results from lm_evaluation_harness into a list of Evaluation objects. 
- :param results: the raw results from lm_evaluation_harness + :param results: the raw results from lm-evaluation-harness :return: the formatted results as a list of Evaluation objects """ formatted_results = [] @@ -98,7 +99,7 @@ def format_raw_results(results: Dict[str, Any]) -> List[Evaluation]: type=None, name=dataset_name, config=results["config"], split=None ) evaluation = Evaluation( - task="lm_evaluation_harness", + task=LM_EVALUATION_HARNESS, dataset=dataset, metrics=metrics, samples=None, @@ -113,7 +114,7 @@ def __init__( pipeline: Pipeline, batch_size: int = 1, max_gen_toks: int = 256, - tokenizer: Optional["AutoTokenizer"] = None, + tokenizer: Optional["AutoTokenizer"] = None, # noqa: F821 ): """ Wrapper around the DeepSparse pipeline to make it compatible with the @@ -157,7 +158,9 @@ def loglikelihood(self, requests) -> List[Tuple[float, bool]]: new_reqs = [] for context, continuation in [req.args for req in requests]: if context == "": - raise NotImplemented("Implementing empty context is not supported yet") + raise NotImplementedError( + "Implementing empty context is not supported yet" + ) context_enc, continuation_enc = self._encode_pair(context, continuation) new_reqs.append(((context, continuation), context_enc, continuation_enc)) @@ -199,7 +202,7 @@ def _collate(x): # inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1] # model \ \ # logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the - # cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice + # cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice # noqa: E501 inp = (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1] diff --git a/src/deepsparse/evaluation/utils.py b/src/deepsparse/evaluation/utils.py index b2695abaa1..02f16b089c 100644 --- a/src/deepsparse/evaluation/utils.py +++ b/src/deepsparse/evaluation/utils.py @@ -14,7 +14,7 @@ import ast import logging import os -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Union from deepsparse import Pipeline from deepsparse.operators.engine_operator import DEEPSPARSE_ENGINE diff --git a/tests/deepsparse/evaluation/test_evaluator.py b/tests/deepsparse/evaluation/test_evaluator.py index 9e7d21bdae..225a255d52 100644 --- a/tests/deepsparse/evaluation/test_evaluator.py +++ b/tests/deepsparse/evaluation/test_evaluator.py @@ -122,9 +122,8 @@ def test_evaluation_llm_evaluation_harness_integration_name( assert evaluate( model=model_path, datasets=datasets, - limit=2, - no_cache=True, - integration="lm_evaluation_harness", + limit=1, + integration="lm_eval_harness", ) diff --git a/tests/deepsparse/evaluation/test_utils.py b/tests/deepsparse/evaluation/test_utils.py index f8f3c731a8..0b8fb187ec 100644 --- a/tests/deepsparse/evaluation/test_utils.py +++ b/tests/deepsparse/evaluation/test_utils.py @@ -36,7 +36,7 @@ def llm_type_pipeline(): def test_resolve_known_llm_pipeline(llm_type_pipeline): assert ( resolve_integration(pipeline=llm_type_pipeline, datasets="") - == "lm-evaluation-harness" + == "lm-eval-harness" ) From 44e3e6e9c35b35ece9d5f941f6e04599dd928320 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Tue, 6 Feb 2024 07:46:00 +0000 Subject: [PATCH 19/24] make test more lightweight --- tests/deepsparse/evaluation/test_evaluator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/deepsparse/evaluation/test_evaluator.py b/tests/deepsparse/evaluation/test_evaluator.py index 225a255d52..2b8430a8a6 100644 --- 
a/tests/deepsparse/evaluation/test_evaluator.py +++ b/tests/deepsparse/evaluation/test_evaluator.py @@ -56,7 +56,7 @@ def model_path(): @pytest.fixture() def datasets(): - return ["hellaswag", "gsm8k"] + return ["hellaswag"] @pytest.fixture() From abb6ab8535eb28a55cb77944cea28d21155e9bb3 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Tue, 6 Feb 2024 13:11:32 +0000 Subject: [PATCH 20/24] fix tests 2 --- tests/deepsparse/evaluation/test_evaluator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/deepsparse/evaluation/test_evaluator.py b/tests/deepsparse/evaluation/test_evaluator.py index 2b8430a8a6..928fd275e2 100644 --- a/tests/deepsparse/evaluation/test_evaluator.py +++ b/tests/deepsparse/evaluation/test_evaluator.py @@ -56,7 +56,7 @@ def model_path(): @pytest.fixture() def datasets(): - return ["hellaswag"] + return ["hellaswag", "gsm8k"] @pytest.fixture() @@ -121,7 +121,9 @@ def test_evaluation_llm_evaluation_harness_integration_name( ): assert evaluate( model=model_path, - datasets=datasets, + # testing only on hellaswag dataset + # to avoid long running time + datasets=datasets[0], limit=1, integration="lm_eval_harness", ) From e5aad6515215c88e2a72822707a168342ebafd2e Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Wed, 7 Feb 2024 12:01:03 +0000 Subject: [PATCH 21/24] good point Michael --- src/deepsparse/evaluation/integrations/__init__.py | 2 +- src/deepsparse/evaluation/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/evaluation/integrations/__init__.py b/src/deepsparse/evaluation/integrations/__init__.py index c7e8d3c5fa..15eeee7d8d 100644 --- a/src/deepsparse/evaluation/integrations/__init__.py +++ b/src/deepsparse/evaluation/integrations/__init__.py @@ -15,7 +15,7 @@ # flake8: noqa: F401 -def try_import_lm_evaluation_harness(raise_error=False): +def try_import_lm_evaluation_harness(raise_error=True): try: import lm_eval diff --git a/src/deepsparse/evaluation/utils.py b/src/deepsparse/evaluation/utils.py index 02f16b089c..c089819659 100644 --- a/src/deepsparse/evaluation/utils.py +++ b/src/deepsparse/evaluation/utils.py @@ -43,7 +43,7 @@ def potentially_check_dependency_import(integration_name: str) -> bool: if integration_name.replace("_", "-") == LM_EVALUATION_HARNESS: from deepsparse.evaluation.integrations import try_import_lm_evaluation_harness - try_import_lm_evaluation_harness(raise_error=True) + try_import_lm_evaluation_harness() return True From d65cac62051dc10b313fb4b0179837963b1efd73 Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Thu, 8 Feb 2024 17:33:58 +0100 Subject: [PATCH 22/24] Return to the name `lm-evaluation-harness` but add alias `lm-eval-harness` --- src/deepsparse/evaluation/cli.py | 2 +- src/deepsparse/evaluation/integrations/lm_evaluation_harness.py | 2 +- src/deepsparse/evaluation/utils.py | 2 +- tests/deepsparse/evaluation/test_evaluator.py | 2 +- tests/deepsparse/evaluation/test_utils.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/deepsparse/evaluation/cli.py b/src/deepsparse/evaluation/cli.py index 4d97c904bb..d192dd67a1 100644 --- a/src/deepsparse/evaluation/cli.py +++ b/src/deepsparse/evaluation/cli.py @@ -61,7 +61,7 @@ deepsparse.eval zoo:mpt-7b-mpt_pretrain-base_quantized \ --dataset hellaswag \ --dataset gsm8k \ - --integration lm-eval-harness \ + --integration lm-evaluation-harness \ --limit 2 # limit the number of samples to evaluate on, specific to the integration """ # noqa: E501 diff --git 
a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py index 7347d91bfb..69934af37a 100644 --- a/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py +++ b/src/deepsparse/evaluation/integrations/lm_evaluation_harness.py @@ -39,7 +39,7 @@ __all__ = ["integration_eval"] -@EvaluationRegistry.register(name=LM_EVALUATION_HARNESS) +@EvaluationRegistry.register(name=LM_EVALUATION_HARNESS, alias="lm-eval-harness") def integration_eval( pipeline: Pipeline, datasets: Union[List[str], str], diff --git a/src/deepsparse/evaluation/utils.py b/src/deepsparse/evaluation/utils.py index c089819659..ff2619315b 100644 --- a/src/deepsparse/evaluation/utils.py +++ b/src/deepsparse/evaluation/utils.py @@ -27,7 +27,7 @@ "resolve_integration", ] _LOGGER = logging.getLogger(__name__) -LM_EVALUATION_HARNESS = "lm-eval-harness" +LM_EVALUATION_HARNESS = "lm-evaluation-harness" def potentially_check_dependency_import(integration_name: str) -> bool: diff --git a/tests/deepsparse/evaluation/test_evaluator.py b/tests/deepsparse/evaluation/test_evaluator.py index 928fd275e2..241b9a4344 100644 --- a/tests/deepsparse/evaluation/test_evaluator.py +++ b/tests/deepsparse/evaluation/test_evaluator.py @@ -125,7 +125,7 @@ def test_evaluation_llm_evaluation_harness_integration_name( # to avoid long running time datasets=datasets[0], limit=1, - integration="lm_eval_harness", + integration="lm_evaluation_harness", ) diff --git a/tests/deepsparse/evaluation/test_utils.py b/tests/deepsparse/evaluation/test_utils.py index 0b8fb187ec..f8f3c731a8 100644 --- a/tests/deepsparse/evaluation/test_utils.py +++ b/tests/deepsparse/evaluation/test_utils.py @@ -36,7 +36,7 @@ def llm_type_pipeline(): def test_resolve_known_llm_pipeline(llm_type_pipeline): assert ( resolve_integration(pipeline=llm_type_pipeline, datasets="") - == "lm-eval-harness" + == "lm-evaluation-harness" ) From b82b49b1b4b55face353f188886a6f06725b55cd Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Fri, 9 Feb 2024 16:50:49 +0100 Subject: [PATCH 23/24] [DeepSparse Evaluation API] Perplexity (#1555) * initial commit * Update src/deepsparse/evaluation/integrations/__init__.py * design ready, time to define additional features * split prep_for_generation operator * fix logits * update non-kv cache pipeline and tests * add tests to address edge cases * add condition to check of kv_cache full during prompt inference, add test to cover this case, revert debugging changes * fix typing * remove commented code * remove irrelevant condition * perplexity for non-kv cache pipelines works! 
* logic is working * ready for review * [DeepSparse Evaluation API] Perplexity eval support for `openai_humaneval`, `c4`, `wikitext2` (#1586) * fix tests 2 * initial commit * add return to a function * make script more robust --------- Co-authored-by: Dipika Sikka --- setup.py | 1 + src/deepsparse/evaluation/evaluator.py | 3 + .../evaluation/integrations/__init__.py | 1 + .../evaluation/integrations/perplexity.py | 278 ++++++++++++++++++ src/deepsparse/evaluation/results.py | 4 +- src/deepsparse/evaluation/utils.py | 2 + src/deepsparse/transformers/metrics.py | 2 +- .../transformers/utils/eval_helpers.py | 34 ++- .../integrations/test_perplexity.py | 132 +++++++++ 9 files changed, 448 insertions(+), 9 deletions(-) create mode 100644 src/deepsparse/evaluation/integrations/perplexity.py create mode 100644 tests/deepsparse/evaluation/integrations/test_perplexity.py diff --git a/setup.py b/setup.py index ff8269257f..d9c8dffd7d 100644 --- a/setup.py +++ b/setup.py @@ -149,6 +149,7 @@ def _parse_requirements_file(file_path): "datasets<2.16", "accelerate<0.26", "seqeval", + "evaluate", ] _sentence_transformers_integration_deps = ["optimum-deepsparse"] + _torch_deps diff --git a/src/deepsparse/evaluation/evaluator.py b/src/deepsparse/evaluation/evaluator.py index 3d18f8489f..3926b78a2a 100644 --- a/src/deepsparse/evaluation/evaluator.py +++ b/src/deepsparse/evaluation/evaluator.py @@ -16,6 +16,9 @@ from typing import List, Optional, Union from deepsparse import Pipeline +from deepsparse.evaluation.integrations.perplexity import ( # noqa + integration_eval as integration_eval_perplexity, +) from deepsparse.evaluation.registry import EvaluationRegistry from deepsparse.evaluation.results import Result from deepsparse.evaluation.utils import create_pipeline diff --git a/src/deepsparse/evaluation/integrations/__init__.py b/src/deepsparse/evaluation/integrations/__init__.py index 15eeee7d8d..f0871f135a 100644 --- a/src/deepsparse/evaluation/integrations/__init__.py +++ b/src/deepsparse/evaluation/integrations/__init__.py @@ -31,3 +31,4 @@ def try_import_lm_evaluation_harness(raise_error=True): if try_import_lm_evaluation_harness(raise_error=False): from .lm_evaluation_harness import * +from .perplexity import * diff --git a/src/deepsparse/evaluation/integrations/perplexity.py b/src/deepsparse/evaluation/integrations/perplexity.py new file mode 100644 index 0000000000..a9a3f3d8a3 --- /dev/null +++ b/src/deepsparse/evaluation/integrations/perplexity.py @@ -0,0 +1,278 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import os +from collections import defaultdict +from typing import Any, Dict, List, Optional, Union + +import numpy +from tqdm import tqdm + +from datasets import load_dataset +from deepsparse import Pipeline +from deepsparse.evaluation.registry import EvaluationRegistry +from deepsparse.evaluation.results import Dataset, Evaluation, Metric, Result +from deepsparse.evaluation.utils import PERPLEXITY +from deepsparse.transformers.metrics import Perplexity +from deepsparse.transformers.pipelines.text_generation import TextGenerationPipeline +from deepsparse.transformers.pipelines.text_generation.pipeline_no_kv_cache import ( + TextGenerationPipelineNoCache, +) +from deepsparse.transformers.utils.eval_helpers import ( + HumanEvalIteratorWrapper, + process_concatenated_datasets, +) + + +""" +Integration for the evaluation module +that computes the perplexity of a model on a dataset +""" +_LOGGER = logging.getLogger(__name__) + + +@EvaluationRegistry.register(name=PERPLEXITY) +def integration_eval( + pipeline: Pipeline, + datasets: Union[List[str], str] = "openai_humaneval", + batch_size: int = 1, + limit: Optional[int] = None, + accumulate: Optional[bool] = None, + splits: Union[List[str], str, None] = "test", + metrics: Union[List[str], str, None] = None, + **kwargs, +) -> Result: + """ + A function that computes the perplexity of a pipeline given a set + of dataset names. + + :param pipeline: the pipeline to evaluate. The assumed pipeline + is a TextGenerationPipeline, either with or without the KV + cache support + :param datasets: the names of dataset(s) to evaluate on + :param batch_size: the batch size to use for evaluation + :param splits: the split of the dataset to evaluate on. Default is "test" + :param metrics: the metrics to compute. Default is None + :param limit: the number of batches to evaluate on. Default is None + (evaluates on entire dataset) + :param accumulate: whether the perplexity computation should + accumulate negative log-likelihood over samples. Defaults to + the default accumulate variable inferred from the dataset in + `datasets`. If not None, it will override the inferred accumulate + variable. + :return: a Result object containing the raw and formatted results + """ + metrics = metrics or PERPLEXITY + if metrics != PERPLEXITY: + raise ValueError(f"Invalid metric {metrics} for perplexity evaluation") + if splits is None: + splits = "test" + _LOGGER.info("Argument `splits` is None. Defaulting to `test` split.") + datasets = datasets if isinstance(datasets, list) else [datasets] + results_raw = defaultdict(str) + for dataset_name in datasets: + results_raw[dataset_name] = defaultdict() + dataset, _accumulate = load_perplexity_dataset( + dataset_name=dataset_name, splits=splits, pipeline=pipeline, **kwargs + ) + if accumulate is None: + accumulate = _accumulate + else: + _LOGGER.info( + f"Argument `accumulate` set to {accumulate}. " + "Overriding the inferred accumulate variable from the dataset." + ) + + perplexity = run_perplexity( + pipeline=pipeline, + dataset=dataset, + batch_size=batch_size, + accumulate=accumulate, + limit=limit, + ) + + results_raw[dataset_name] = defaultdict() + results_raw[dataset_name]["results"] = perplexity + results_raw[dataset_name]["split"] = splits + + results = Result( + # omit storing raw results. they can potentially + # contain numpy arrays that are not serializable.
+ # all the information is stored in the formatted results + raw=None, + formatted=format_raw_results(results_raw), + ) + + return results + + +def run_perplexity( + pipeline: Union[TextGenerationPipelineNoCache, TextGenerationPipeline], + dataset: "Dataset", + batch_size: int, + accumulate: bool, + limit: Optional[int] = None, +) -> Dict[str, Any]: + """ + Compute the perplexity of a pipeline given a dataset. + + :param pipeline: the pipeline to evaluate. The assumed pipeline + is a TextGenerationPipeline, either with or without the KV + cache support + :param dataset: the dataset to evaluate on + :param batch_size: the batch size to use for evaluation + :param accumulate: whether the perplexity computation should + accumulate negative log-likelihood over samples + :param limit: the number of batches to evaluate on. Default is None + (evaluates on entire dataset) + + :return: a dictionary containing the perplexity results + """ + + perplexity = Perplexity(accumulate=accumulate) + + batch = [] + for idx, sample in _enumerate_progress( + dataset, max_steps=None if limit is None else limit * batch_size + ): + + if limit is not None: + # stop if we have reached the #limit + # number of batches to be processed + if idx >= limit * batch_size: + break + + batch.append(sample) + + if len(batch) == batch_size: + if isinstance(pipeline, TextGenerationPipelineNoCache): + out = pipeline( + prompt=batch, + output_scores=True, + include_prompt_logits=True, + return_input_tokens=True, + ) + else: + out = pipeline( + prompt=batch, + output_scores=True, + max_new_tokens=0, + include_prompt_logits=True, + return_input_tokens=True, + ) + + for s in range(batch_size): + # Need to remove tokens that were masked + input_ids = out.input_tokens["input_ids"][s].flatten() + attention_mask = out.input_tokens["attention_mask"][s].flatten() + logits = out.generations[s].score + if batch_size > 1 and isinstance( + pipeline, TextGenerationPipelineNoCache + ): + logits = logits[-attention_mask.sum() :, :] + + logits = numpy.compress(attention_mask, logits, axis=0)[:-1, :] + input_ids = numpy.compress(attention_mask, input_ids)[1:] + + # Add predictions (logits) and targets (input_ids) to metric + perplexity.add_batch(logits, input_ids) + + batch.clear() + + return perplexity.compute() + + +def format_raw_results(results: Dict[str, Any]) -> List[Evaluation]: + """ + Format the raw perplexity results into a list of + Evaluation objects. + + :param results: the raw results from perplexity computation + :return: the formatted results as a list of Evaluation objects + """ + formatted_results = [] + for dataset_name, dataset_result in results.items(): + metrics = [] + for metric_name, metric_value in dataset_result["results"].items(): + if isinstance(metric_value, numpy.ndarray): + metric_value = metric_value.tolist() + metric = Metric(name=metric_name, value=metric_value) + metrics.append(metric) + dataset = Dataset(type=None, name=dataset_name, split=dataset_result["split"]) + evaluation = Evaluation( + task="perplexity", + dataset=dataset, + metrics=metrics, + samples=None, + ) + formatted_results.append(evaluation) + return formatted_results + + +def load_perplexity_dataset( + dataset_name: str, + splits: Union[List[str], str] = "test", + pipeline: Optional[Pipeline] = None, + **kwargs, +): + """ + Function to load the dataset for perplexity computation. + Eventually we want to load the dataset from the nm_utils + + :param dataset_name: the name of the dataset to load + :param splits: the splits to load from the dataset.
Default is "test" + :param pipeline: the pipeline to use for loading the dataset. The pipeline + is used to infer the model path and sequence length to use for loading + the dataset. This argument can be omitted if the appropriate kwargs + are provided, or if the dataset does not require a process_concatenated_datasets + function to load the dataset. + :param kwargs: additional keyword arguments to pass to the dataset loading function + :return: the dataset and whether to accumulate perplexity over samples + """ + if isinstance(splits, list): + raise NotImplementedError("Evaluation on multiple splits not implemented") + + if dataset_name == "openai_humaneval": + dataset = load_dataset(dataset_name, split=splits) + dataset = HumanEvalIteratorWrapper(dataset) + accumulate = False + elif dataset_name in {"wikitext2", "c4"}: + # fetch max_sequence_length from pipeline if not provided + max_sequence_length = kwargs.pop("max_sequence_length", None) + if max_sequence_length is None and pipeline is not None: + max_sequence_length = pipeline.sequence_length + + # fetch model_path from pipeline if not provided + model_path = kwargs.pop("model_path", None) + if model_path is None and pipeline is not None: + model_path = os.path.dirname(pipeline.model_path) + + dataset = process_concatenated_datasets( + dataset_name, + model_path=model_path, + max_sequence_length=max_sequence_length, + split=splits, + **kwargs, + ) + accumulate = True + else: + raise NotImplementedError(f"Dataset {dataset_name} not implemented") + + return dataset, accumulate + + +def _enumerate_progress(dataset, max_steps): + progress_bar = tqdm(dataset, total=max_steps) if max_steps else tqdm(dataset) + return enumerate(progress_bar) diff --git a/src/deepsparse/evaluation/results.py b/src/deepsparse/evaluation/results.py index 00212d0a1e..78c4bbd501 100644 --- a/src/deepsparse/evaluation/results.py +++ b/src/deepsparse/evaluation/results.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Any, List, Optional +from typing import Any, List, Optional, Union import yaml from pydantic import BaseModel, Field @@ -32,7 +32,7 @@ class Metric(BaseModel): name: str = Field(description="Name of the metric") - value: float = Field(description="Value of the metric") + value: Union[float, List[float]] = Field(description="Value of the metric") class Dataset(BaseModel): diff --git a/src/deepsparse/evaluation/utils.py b/src/deepsparse/evaluation/utils.py index ff2619315b..a5dc460596 100644 --- a/src/deepsparse/evaluation/utils.py +++ b/src/deepsparse/evaluation/utils.py @@ -27,7 +27,9 @@ "resolve_integration", ] _LOGGER = logging.getLogger(__name__) + LM_EVALUATION_HARNESS = "lm-evaluation-harness" +PERPLEXITY = "perplexity" def potentially_check_dependency_import(integration_name: str) -> bool: diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index b90c4dd744..0e7c24c8b6 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -20,7 +20,7 @@ import numpy -from deepsparse.utils import numpy_log_softmax +from deepsparse.utils.data import numpy_log_softmax __all__ = [ diff --git a/src/deepsparse/transformers/utils/eval_helpers.py b/src/deepsparse/transformers/utils/eval_helpers.py index 4c0e68b9de..012520b9b5 100644 --- a/src/deepsparse/transformers/utils/eval_helpers.py +++ b/src/deepsparse/transformers/utils/eval_helpers.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Mapping, Union +from typing import List, Union import numpy from transformers import AutoTokenizer, PreTrainedTokenizerFast @@ -27,7 +27,8 @@ def process_concatenated_datasets( dataset_name: str, model_path: str, max_sequence_length: int, - kwargs: Mapping, + split: str = "test", + **kwargs, ) -> list: """ Concatenate text datasets and split them into chunks text that, after @@ -38,6 +39,8 @@ def process_concatenated_datasets( Options: "wikitext2" or "c4". model_path (str): The path to a pretrained transformer model for tokenization. max_sequence_length (int): The maximum number of tokens in each sequence. + split (str, optional): The split of the dataset to use. + Default is "test". kwargs (mapping): Additional keyword arguments. - eos (str, optional): The end-of-sentence token. Default is "\n\n" for wikitext2 and "" for c4. 
@@ -65,13 +68,13 @@ def process_concatenated_datasets( eos = kwargs.get("eos", "\n\n") bos = kwargs.get("bos", "") - raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split=split) raw_text = raw_dataset["text"] elif dataset_name == "c4": eos = kwargs.get("eos", "<|endoftext|>") bos = kwargs.get("bos", "") raw_samples = kwargs.get("raw_samples", None) - data_file = kwargs.get("data_file", 0) + data_file = kwargs.get("data_file", None) if data_file is not None: raw_dataset = load_dataset( "allenai/c4", @@ -79,13 +82,13 @@ def process_concatenated_datasets( data_files={ "validation": f"en/c4-validation.{data_file:05d}-of-00008.json.gz" }, - split="validation", + split=split, ) else: raw_dataset = load_dataset( "allenai/c4", "allenai--c4", - split="validation", + split=split, ) if raw_samples is not None: raw_dataset = raw_dataset[:raw_samples] @@ -181,3 +184,22 @@ def _split_text_by_tokens( ) return split_text + + +class HumanEvalIteratorWrapper: + """ + Wrapper around the `openai_humaneval` dataset, + that joins the prompt and the canonical solution + into a single string during iteration. + """ + + def __init__(self, dataset): + self.iterator = iter(dataset) + + def __iter__(self): + return self + + def __next__(self): + # Get the next sample from the original iterator + sample = next(self.iterator) + return sample["prompt"] + sample["canonical_solution"] diff --git a/tests/deepsparse/evaluation/integrations/test_perplexity.py b/tests/deepsparse/evaluation/integrations/test_perplexity.py new file mode 100644 index 0000000000..b156e5b9a4 --- /dev/null +++ b/tests/deepsparse/evaluation/integrations/test_perplexity.py @@ -0,0 +1,132 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from copy import copy
+
+import numpy as np
+
+import pytest
+from deepsparse.evaluation.integrations.perplexity import (
+    integration_eval,
+    load_perplexity_dataset,
+)
+from deepsparse.transformers.pipelines.text_generation import TextGenerationPipeline
+from evaluate import load
+
+
+@pytest.fixture()
+def model_path():
+    return "hf:mgoin/TinyStories-1M-deepsparse"
+
+
+@pytest.fixture()
+def model_id():
+    return "roneneldan/TinyStories-1M"
+
+
+@pytest.mark.parametrize(
+    "datasets",
+    [
+        "openai_humaneval",
+        "wikitext2",
+    ],
+)
+@pytest.mark.parametrize("batch_size", [1, 2])
+class TestPerplexity:
+    limit = 2
+
+    def test_perplexity_ground_truth_equal_pipeline(
+        self, model_path, model_id, datasets, batch_size
+    ):
+        # setting max_sequence_length to 16 to speed up the test
+        kwargs_ground_truth = (
+            dict(max_sequence_length=16) if datasets in {"c4", "wikitext2"} else {}
+        )
+        kwargs = copy(kwargs_ground_truth)
+
+        result_gt = self._get_ground_truth(
+            datasets=datasets,
+            batch_size=batch_size,
+            limit=self.limit,
+            model_id=model_id,
+            kwargs=kwargs_ground_truth,
+        )
+
+        result = integration_eval(
+            pipeline=TextGenerationPipeline(
+                model_path=model_path,
+                engine_type="onnxruntime",
+            ),
+            datasets=datasets,
+            batch_size=batch_size,
+            limit=self.limit,
+            # we are setting accumulate=False to compare
+            # with the torch ground truth apples to apples
+            accumulate=False,
+            **kwargs,
+        )
+        perplexities = result.formatted[0].metrics[0].value
+        perplexities_gt = result_gt["perplexities"]
+        assert np.allclose(perplexities, perplexities_gt, rtol=0.1)
+
+    def test_perplexity_kv_cache_pipeline_equal_no_kv_cache_pipeline(
+        self, model_path, model_id, datasets, batch_size
+    ):
+
+        kwargs_ground_truth = (
+            dict(max_sequence_length=16) if datasets in {"c4", "wikitext2"} else {}
+        )
+        kwargs = copy(kwargs_ground_truth)
+
+        result_kv_cache = integration_eval(
+            pipeline=TextGenerationPipeline(
+                model_path=model_path,
+                engine_type="onnxruntime",
+            ),
+            datasets=datasets,
+            model_path=model_id,
+            batch_size=batch_size,
+            limit=self.limit,
+            **kwargs,
+        )
+
+        result_non_kv_cache = integration_eval(
+            pipeline=TextGenerationPipeline(
+                model_path=model_path,
+                engine_type="onnxruntime",
+                onnx_model_name="model-orig.onnx",
+            ),
+            datasets=datasets,
+            batch_size=batch_size,
+            limit=self.limit,
+            **kwargs,
+        )
+
+        perplexities_kv_cache = result_kv_cache.formatted[0].metrics[0].value
+        perplexities_non_kv_cache = result_non_kv_cache.formatted[0].metrics[0].value
+        assert np.allclose(perplexities_kv_cache, perplexities_non_kv_cache, rtol=0.1)
+
+    @staticmethod
+    def _get_ground_truth(datasets, batch_size, limit, model_id, kwargs=None):
+        perplexity = load("perplexity", module_type="metric")
+        kwargs = {**(kwargs or {}), "model_path": model_id}
+        dataset, *_ = load_perplexity_dataset(dataset_name=datasets, **kwargs)
+        predictions = []
+        for i, sample in enumerate(dataset):
+            if i == batch_size * limit:
+                break
+            predictions.append(sample)
+        return perplexity.compute(
+            predictions=predictions, add_start_token=False, model_id=model_id
+        )

From 7a3ad2fcfe7aa2c2c782ac24d9803417aeeb2bc9 Mon Sep 17 00:00:00 2001
From: dbogunowicz
Date: Fri, 9 Feb 2024 16:16:51 +0000
Subject: [PATCH 24/24] move the registration of the perplexity eval function where it belongs

---
 src/deepsparse/evaluation/evaluator.py        |  3 ---
 src/deepsparse/evaluation/utils.py            |  6 +++++-
 tests/deepsparse/evaluation/test_evaluator.py | 11 ++++++++---
 3 files changed, 13 insertions(+), 7 
deletions(-) diff --git a/src/deepsparse/evaluation/evaluator.py b/src/deepsparse/evaluation/evaluator.py index 3926b78a2a..3d18f8489f 100644 --- a/src/deepsparse/evaluation/evaluator.py +++ b/src/deepsparse/evaluation/evaluator.py @@ -16,9 +16,6 @@ from typing import List, Optional, Union from deepsparse import Pipeline -from deepsparse.evaluation.integrations.perplexity import ( # noqa - integration_eval as integration_eval_perplexity, -) from deepsparse.evaluation.registry import EvaluationRegistry from deepsparse.evaluation.results import Result from deepsparse.evaluation.utils import create_pipeline diff --git a/src/deepsparse/evaluation/utils.py b/src/deepsparse/evaluation/utils.py index a5dc460596..6e5ade9344 100644 --- a/src/deepsparse/evaluation/utils.py +++ b/src/deepsparse/evaluation/utils.py @@ -42,10 +42,14 @@ def potentially_check_dependency_import(integration_name: str) -> bool: :return: True if the dependency is installed, False otherwise """ - if integration_name.replace("_", "-") == LM_EVALUATION_HARNESS: + if integration_name == LM_EVALUATION_HARNESS: from deepsparse.evaluation.integrations import try_import_lm_evaluation_harness try_import_lm_evaluation_harness() + if integration_name == PERPLEXITY: + from deepsparse.evaluation.integrations.perplexity import ( # noqa F401 + integration_eval, + ) return True diff --git a/tests/deepsparse/evaluation/test_evaluator.py b/tests/deepsparse/evaluation/test_evaluator.py index 241b9a4344..58eedff836 100644 --- a/tests/deepsparse/evaluation/test_evaluator.py +++ b/tests/deepsparse/evaluation/test_evaluator.py @@ -115,20 +115,25 @@ def test_evaluate_pipeline_without_kv_cache( not try_import_lm_evaluation_harness(raise_error=False), reason="lm_evaluation_harness not installed", ) -def test_evaluation_llm_evaluation_harness_integration_name( +def test_evaluation_llm_evaluation_harness( model_path, - datasets, ): assert evaluate( model=model_path, # testing only on hellaswag dataset # to avoid long running time - datasets=datasets[0], + datasets="hellaswag", limit=1, integration="lm_evaluation_harness", ) +def test_evaluation_perplexity(model_path): + assert evaluate( + model=model_path, datasets="openai_humaneval", limit=1, integration="perplexity" + ) + + @pytest.mark.parametrize("type_serialization", ["json", "yaml"]) @pytest.mark.skipif( tuple(map(int, sys.version.split(".")[:2])) < (3, 10),
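
For context, a minimal usage sketch of the perplexity integration added in this patch series follows. It mirrors the calls exercised in tests/deepsparse/evaluation/integrations/test_perplexity.py; the model stub, engine type, and keyword arguments are copied from those tests and should be read as assumptions about the API at this point in the series, not a definitive interface.

# Illustrative sketch only: mirrors test_perplexity.py from this patch series.
from deepsparse.evaluation.integrations.perplexity import integration_eval
from deepsparse.transformers.pipelines.text_generation import TextGenerationPipeline

# small stub model used throughout the tests
pipeline = TextGenerationPipeline(
    model_path="hf:mgoin/TinyStories-1M-deepsparse",
    engine_type="onnxruntime",
)

result = integration_eval(
    pipeline=pipeline,
    datasets="openai_humaneval",
    batch_size=1,
    limit=2,  # evaluate only two batches, as the tests do
    accumulate=False,  # report per-sample perplexities instead of accumulated NLL
)

# per-sample perplexities are stored in the formatted results
print(result.formatted[0].metrics[0].value)

Setting accumulate=False follows the tests, which compare the resulting per-sample values against the perplexities reported by the HuggingFace evaluate metric for the same samples.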