From 850ce8b86462dbf90ecba1aa810eb3b1b150d4b7 Mon Sep 17 00:00:00 2001 From: jeanyu-habana Date: Mon, 30 Sep 2024 17:15:45 -0500 Subject: [PATCH 1/2] Add integration with Intel Gaudi in llama-index-llms-gaudi --- .../llms/llama-index-llms-gaudi/.gitignore | 153 ++++ .../llms/llama-index-llms-gaudi/BUILD | 3 + .../llms/llama-index-llms-gaudi/Makefile | 17 + .../llms/llama-index-llms-gaudi/README.md | 18 + .../llama-index-llms-gaudi/examples/BUILD | 1 + .../llama-index-llms-gaudi/examples/README.md | 30 + .../llama-index-llms-gaudi/examples/basic.py | 78 ++ .../llama_index/llms/gaudi/BUILD | 1 + .../llama_index/llms/gaudi/__init__.py | 4 + .../llama_index/llms/gaudi/base.py | 385 +++++++++ .../llama_index/llms/gaudi/utils.py | 788 ++++++++++++++++++ .../llama-index-llms-gaudi/pyproject.toml | 72 ++ 12 files changed, 1550 insertions(+) create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/.gitignore create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/BUILD create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/Makefile create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/README.md create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/examples/BUILD create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/examples/README.md create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/examples/basic.py create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/BUILD create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/__init__.py create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/base.py create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/utils.py create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/pyproject.toml diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/.gitignore b/llama-index-integrations/llms/llama-index-llms-gaudi/.gitignore new file mode 100644 index 0000000000000..990c18de22908 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/.gitignore @@ -0,0 +1,153 @@ +llama_index/_static +.DS_Store +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +bin/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +etc/ +include/ +lib/ +lib64/ +parts/ +sdist/ +share/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +.ruff_cache + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pyvenv.cfg + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Jetbrains +.idea +modules/ +*.swp + +# VsCode +.vscode + +# pipenv +Pipfile +Pipfile.lock + +# pyright +pyrightconfig.json diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/BUILD b/llama-index-integrations/llms/llama-index-llms-gaudi/BUILD new file mode 100644 index 0000000000000..0896ca890d8bf --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/BUILD @@ -0,0 +1,3 @@ +poetry_requirements( + name="poetry", +) diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/Makefile b/llama-index-integrations/llms/llama-index-llms-gaudi/Makefile new file mode 100644 index 0000000000000..b9eab05aa3706 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/Makefile @@ -0,0 +1,17 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' + +format: ## Run code autoformatters (black). + pre-commit install + git ls-files | xargs pre-commit run black --files + +lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy + pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files + +test: ## Run tests via pytest. + pytest tests + +watch-docs: ## Build and watch documentation. 
+	sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/
diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/README.md b/llama-index-integrations/llms/llama-index-llms-gaudi/README.md
new file mode 100644
index 0000000000000..30780ffeb6b58
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-gaudi/README.md
@@ -0,0 +1,18 @@
+# LlamaIndex Llms Integration with Intel Gaudi
+
+## Installation
+
+```bash
+pip install --upgrade-strategy eager optimum[habana]
+```
+
+## Usage
+
+```python
+from llama_index.llms.gaudi import GaudiLLM
+```
+
+## Examples
+
+- [Notebook Example](https://docs.llamaindex.ai/en/stable/examples/llm/gaudi/)
+- [More Examples](https://github.com/run-llama/llama_index/tree/main/llama-index-integrations/llms/llama-index-llms-gaudi/examples)
diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/examples/BUILD b/llama-index-integrations/llms/llama-index-llms-gaudi/examples/BUILD
new file mode 100644
index 0000000000000..db46e8d6c978c
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-gaudi/examples/BUILD
@@ -0,0 +1 @@
+python_sources()
diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/examples/README.md b/llama-index-integrations/llms/llama-index-llms-gaudi/examples/README.md
new file mode 100644
index 0000000000000..75226f0272cef
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-gaudi/examples/README.md
@@ -0,0 +1,30 @@
+# GaudiLLM Examples
+
+This folder contains examples showcasing how to use LlamaIndex with the Intel Gaudi LLM integration, `llama_index.llms.gaudi.GaudiLLM`.
+
+## Installation
+
+### On Intel Gaudi
+
+Install `llama-index-llms-gaudi`, then install Optimum Habana to provide the Intel Gaudi dependencies:
+
+```bash
+pip install --upgrade-strategy eager optimum[habana]
+```
+
+## List of Examples
+
+### Basic Example
+
+The example [basic.py](./basic.py) shows how to run `GaudiLLM` on Intel Gaudi and perform tasks such as text completion. Run the example as follows:
+
+```bash
+python basic.py
+```
+
+> Please note that this example uses the [HuggingFaceH4/zephyr-7b-alpha](https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha) model for demonstration. It requires the `transformers` and `tokenizers` packages.
+> +> ```bash +> pip install -U transformers tokenizers +> ``` + diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/examples/basic.py b/llama-index-integrations/llms/llama-index-llms-gaudi/examples/basic.py new file mode 100644 index 0000000000000..6f938533c9643 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/examples/basic.py @@ -0,0 +1,78 @@ +# Transform a string into input zephyr-specific input +def completion_to_prompt(completion): + return f"<|system|>\n\n<|user|>\n{completion}\n<|assistant|>\n" + + +# Transform a list of chat messages into zephyr-specific input +def messages_to_prompt(messages): + prompt = "" + for message in messages: + if message.role == "system": + prompt += f"<|system|>\n{message.content}\n" + elif message.role == "user": + prompt += f"<|user|>\n{message.content}\n" + elif message.role == "assistant": + prompt += f"<|assistant|>\n{message.content}\n" + + # ensure we start with a system prompt, insert blank if needed + if not prompt.startswith("<|system|>\n"): + prompt = "<|system|>\n\n" + prompt + + # add final assistant prompt + prompt = prompt + "<|assistant|>\n" + + return prompt + + +import logging +import argparse +from llama_index.llms.gaudi import GaudiLLM +from llama_index.core.prompts import PromptTemplate +from llama_index.llms.gaudi.utils import ( + setup_parser, +) +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, +) +logger = logging.getLogger(__name__) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="GaudiLLM Basic Usage Example") + args = setup_parser(parser) + args.model_name_or_path = "HuggingFaceH4/zephyr-7b-alpha" + + llm = GaudiLLM( + args=args, + logger=logger, + model_name="HuggingFaceH4/zephyr-7b-alpha", + tokenizer_name="HuggingFaceH4/zephyr-7b-alpha", + query_wrapper_prompt=PromptTemplate("<|system|>\n\n<|user|>\n{query_str}\n<|assistant|>\n"), + context_window=3900, + max_new_tokens=256, + generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95}, + messages_to_prompt=messages_to_prompt, + device_map="auto", + ) + + query = "Is the ocean blue?" 
+ print("\n----------------- Complete ------------------") + completion_response = llm.complete(query) + print(completion_response.text) + print("\n----------------- Stream Complete ------------------") + response_iter = llm.stream_complete(query) + for response in response_iter: + print(response.delta, end="", flush=True) + print("\n----------------- Chat ------------------") + from llama_index.core.llms import ChatMessage + + message = ChatMessage(role="user", content=query) + resp = llm.chat([message]) + print(resp) + print("\n----------------- Stream Chat ------------------") + message = ChatMessage(role="user", content=query) + resp = llm.stream_chat([message], max_tokens=256) + for r in resp: + print(r.delta, end="") diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/BUILD b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/BUILD new file mode 100644 index 0000000000000..db46e8d6c978c --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/__init__.py b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/__init__.py new file mode 100644 index 0000000000000..5ef1883df2fb4 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/__init__.py @@ -0,0 +1,4 @@ +from llama_index.llms.gaudi.base import GaudiLLM + + +__all__ = ["GaudiLLM"] diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/base.py b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/base.py new file mode 100644 index 0000000000000..dfb6e9231d162 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/base.py @@ -0,0 +1,385 @@ +# This file is adapted from +# https://github.com/run-llama/llama_index/blob/main/llama-index-integrations/ +# llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import logging
+from typing import Any, Callable, List, Optional, Sequence, Union
+
+import torch
+from llama_index.core.base.llms.types import (
+    ChatMessage,
+    ChatResponse,
+    ChatResponseGen,
+    CompletionResponse,
+    CompletionResponseGen,
+    LLMMetadata,
+)
+from llama_index.core.bridge.pydantic import Field, PrivateAttr
+from llama_index.core.callbacks import CallbackManager
+from llama_index.core.constants import (
+    DEFAULT_CONTEXT_WINDOW,
+    DEFAULT_NUM_OUTPUTS,
+)
+from llama_index.core.llms.callbacks import (
+    llm_chat_callback,
+    llm_completion_callback,
+)
+from llama_index.core.llms.custom import CustomLLM
+
+from llama_index.core.base.llms.generic_utils import (
+    completion_response_to_chat_response,
+    stream_completion_response_to_chat_response,
+    messages_to_prompt as generic_messages_to_prompt,
+)
+from llama_index.core.prompts.base import PromptTemplate
+from llama_index.core.types import BaseOutputParser, PydanticProgramMode, Thread
+from transformers import (
+    StoppingCriteria,
+    StoppingCriteriaList,
+)
+from transformers import AutoTokenizer, LlamaTokenizer
+# Gaudi-specific model/tokenizer initialization helper
+from llama_index.llms.gaudi.utils import initialize_model
+from llama_index.llms.huggingface import HuggingFaceLLM
+
+# Default model; override via `model_name`/`tokenizer_name` when constructing GaudiLLM.
+DEFAULT_HUGGINGFACE_MODEL = "meta-llama/Llama-2-7b-chat-hf"
+
+logger = logging.getLogger(__name__)
+
+
+class GaudiLLM(CustomLLM):
+    r"""GaudiLLM.
+
+    Example:
+        .. code-block:: python
+
+            from llama_index.llms.gaudi import GaudiLLM
+            llm = GaudiLLM(args=args, logger=logger, model_name="HuggingFaceH4/zephyr-7b-alpha")
+    """
+
+    model_name: str = Field(
+        default=DEFAULT_HUGGINGFACE_MODEL,
+        description=(
+            "The model name to use from HuggingFace. "
+            "Unused if `model` is passed in directly."
+        ),
+    )
+    context_window: int = Field(
+        default=DEFAULT_CONTEXT_WINDOW,
+        description="The maximum number of tokens available for input.",
+        gt=0,
+    )
+    max_new_tokens: int = Field(
+        default=DEFAULT_NUM_OUTPUTS,
+        description="The maximum number of tokens to generate.",
+        gt=0,
+    )
+    query_wrapper_prompt: PromptTemplate = Field(
+        default=PromptTemplate("{query_str}"),
+        description=(
+            "The query wrapper prompt, containing the query placeholder. "
+            "The model card on HuggingFace should specify if this is needed. "
+            "Should contain a `{query_str}` placeholder."
+        ),
+    )
+    tokenizer_name: str = Field(
+        default=DEFAULT_HUGGINGFACE_MODEL,
+        description=(
+            "The name of the tokenizer to use from HuggingFace. "
+            "Unused if `tokenizer` is passed in directly."
+        ),
+    )
+    device_map: str = Field(
+        default="cpu", description="The device_map to use. Defaults to 'cpu'."
+    )
+    stopping_ids: List[int] = Field(
+        default_factory=list,
+        description=(
+            "The stopping ids to use. "
+            "Generation stops when these token IDs are predicted."
+        ),
+    )
+    tokenizer_outputs_to_remove: list = Field(
+        default_factory=list,
+        description=(
+            "The outputs to remove from the tokenizer. "
+            "Sometimes huggingface tokenizers return extra inputs that cause errors."
+        ),
+    )
+    tokenizer_kwargs: dict = Field(
+        default_factory=dict, description="The kwargs to pass to the tokenizer."
+ ) + model_kwargs: dict = Field( + default_factory=dict, + description="The kwargs to pass to the model during initialization.", + ) + generate_kwargs: dict = Field( + default_factory=dict, + description="The kwargs to pass to the model during generation.", + ) + is_chat_model: bool = Field( + default=False, + description=" Be sure to verify that you either pass an appropriate tokenizer " + "that can convert prompts to properly formatted chat messages or a " + "`messages_to_prompt` that does so.", + ) + + _model: Any = PrivateAttr() + _tokenizer: Any = PrivateAttr() + _stopping_criteria: Any = PrivateAttr() + + def __init__( + self, args, logger, + context_window: int = DEFAULT_CONTEXT_WINDOW, + max_new_tokens: int = DEFAULT_NUM_OUTPUTS, + query_wrapper_prompt: Union[str, PromptTemplate] = "{query_str}", + tokenizer_name: str = DEFAULT_HUGGINGFACE_MODEL, + model_name: str = DEFAULT_HUGGINGFACE_MODEL, + model: Optional[Any] = None, + tokenizer: Optional[Any] = None, + device_map: Optional[str] = "auto", + stopping_ids: Optional[List[int]] = None, + tokenizer_kwargs: Optional[dict] = None, + tokenizer_outputs_to_remove: Optional[list] = None, + model_kwargs: Optional[dict] = None, + generate_kwargs: Optional[dict] = None, + is_chat_model: Optional[bool] = False, + callback_manager: Optional[CallbackManager] = None, + system_prompt: str = "", + messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None, + completion_to_prompt: Optional[Callable[[str], str]] = None, + pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT, + output_parser: Optional[BaseOutputParser] = None, + ) -> None: + """ + Construct GaudiLLM. + + Args: + context_window: The maximum number of tokens available for input. + max_new_tokens: The maximum number of tokens to generate. + tokenizer_name: The name of the tokenizer to use from HuggingFace. + Unused if `tokenizer` is passed in directly. + model_name: The model name to use from HuggingFace. + Unused if `model` is passed in directly. + model: The HuggingFace model. + tokenizer: The tokenizer. + device_map: The device_map to use. Defaults to 'auto'. + stopping_ids: The stopping ids to use. + Generation stops when these token IDs are predicted. + tokenizer_kwargs: The kwargs to pass to the tokenizer. + tokenizer_outputs_to_remove: The outputs to remove from the tokenizer. + Sometimes huggingface tokenizers return extra inputs that cause errors. + model_kwargs: The kwargs to pass to the model during initialization. + generate_kwargs: The kwargs to pass to the model during generation. + is_chat_model: Whether the model is `chat` + callback_manager: Callback manager. + messages_to_prompt: Function to convert messages to prompt. + completion_to_prompt: Function to convert messages to prompt. + pydantic_program_mode: DEFAULT. + output_parser: BaseOutputParser. + + Returns: + None. + """ + model_kwargs = model_kwargs or {} + + model, _, tokenizer, _= initialize_model(args, logger) + + # check context_window + config_dict = model.config.to_dict() + model_context_window = int( + config_dict.get("max_position_embeddings", context_window) + ) + if model_context_window and model_context_window < context_window: + logger.warning( + f"Supplied context_window {context_window} is greater " + f"than the model's max input size {model_context_window}. " + "Disable this warning by setting a lower context_window." 
+ ) + context_window = model_context_window + + + # setup stopping criteria + stopping_ids_list = stopping_ids or [] + + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + class StopOnTokens(StoppingCriteria): + def __call__( + self, + input_ids: torch.LongTensor, + scores: torch.FloatTensor, + **kwargs: Any, + ) -> bool: + for stop_id in stopping_ids_list: + if input_ids[0][-1] == stop_id: + return True + return False + + stopping_criteria = StoppingCriteriaList([StopOnTokens()]) + if isinstance(query_wrapper_prompt, str): + query_wrapper_prompt = PromptTemplate(query_wrapper_prompt) + + messages_to_prompt = messages_to_prompt or self._tokenizer_messages_to_prompt + + super().__init__( + context_window=context_window, + max_new_tokens=max_new_tokens, + query_wrapper_prompt=query_wrapper_prompt, + tokenizer_name=tokenizer_name, + model_name=model_name, + device_map=device_map, + stopping_ids=stopping_ids or [], + tokenizer_kwargs=tokenizer_kwargs or {}, + tokenizer_outputs_to_remove=tokenizer_outputs_to_remove or [], + model_kwargs=model_kwargs or {}, + generate_kwargs=generate_kwargs or {}, + is_chat_model=is_chat_model, + callback_manager=callback_manager, + system_prompt=system_prompt, + messages_to_prompt=messages_to_prompt, + completion_to_prompt=completion_to_prompt, + pydantic_program_mode=pydantic_program_mode, + output_parser=output_parser, + ) + + self._model = model + self._tokenizer = tokenizer + self._stopping_criteria = stopping_criteria + + @classmethod + def class_name(cls) -> str: + return "GaudiLLM" + + @property + def metadata(self) -> LLMMetadata: + """LLM metadata.""" + return LLMMetadata( + context_window=self.context_window, + num_output=self.max_new_tokens, + model_name=self.model_name, + is_chat_model=self.is_chat_model, + ) + + def _tokenizer_messages_to_prompt(self, messages: Sequence[ChatMessage]) -> str: + if hasattr(self._tokenizer, "apply_chat_template"): + messages_dict = [ + {"role": message.role.value, "content": message.content} + for message in messages + ] + tokens = self._tokenizer.apply_chat_template(messages_dict) + return self._tokenizer.decode(tokens) + + return generic_messages_to_prompt(messages) + + @llm_completion_callback() + def complete( + self, prompt: str, formatted: bool = False, **kwargs: Any + ) -> CompletionResponse: + """ + Complete by LLM. + + Args: + prompt: Prompt for completion. + formatted: Whether the prompt is formatted by wrapper. + kwargs: Other kwargs for complete. + + Returns: + CompletionReponse after generation. + """ + if not formatted: + prompt = self.completion_to_prompt(prompt) + input_ids = self._tokenizer(prompt, return_tensors="pt") + input_ids = input_ids.to(self._model.device) + # remove keys from the tokenizer if needed, to avoid HF errors + for key in self.tokenizer_outputs_to_remove: + if key in input_ids: + input_ids.pop(key, None) + tokens = self._model.generate( + **input_ids, + max_new_tokens=self.max_new_tokens, + stopping_criteria=self._stopping_criteria, + pad_token_id=self._tokenizer.pad_token_id, + **self.generate_kwargs, + ) + completion_tokens = tokens[0][input_ids["input_ids"].size(1) :] + completion = self._tokenizer.decode(completion_tokens, skip_special_tokens=True) + + return CompletionResponse(text=completion, raw={"model_output": tokens}) + + @llm_completion_callback() + def stream_complete( + self, prompt: str, formatted: bool = False, **kwargs: Any + ) -> CompletionResponseGen: + """ + Complete by LLM in stream. + + Args: + prompt: Prompt for completion. 
+ formatted: Whether the prompt is formatted by wrapper. + kwargs: Other kwargs for complete. + + Returns: + CompletionReponse after generation. + """ + from transformers import TextIteratorStreamer + + if not formatted: + prompt = self.completion_to_prompt(prompt) + + input_ids = self._tokenizer.encode(prompt, return_tensors="pt") + input_ids = input_ids.to(self._model.device) + + for key in self.tokenizer_outputs_to_remove: + if key in input_ids: + input_ids.pop(key, None) + + streamer = TextIteratorStreamer( + self._tokenizer, skip_prompt=True, skip_special_tokens=True + ) + generation_kwargs = dict( + input_ids=input_ids, + streamer=streamer, + max_new_tokens=self.max_new_tokens, + stopping_criteria=self._stopping_criteria, + pad_token_id=self._tokenizer.pad_token_id, + **self.generate_kwargs, + ) + thread = Thread(target=self._model.generate, kwargs=generation_kwargs) + thread.start() + + # create generator based off of streamer + def gen() -> CompletionResponseGen: + text = "" + for x in streamer: + text += x + yield CompletionResponse(text=text, delta=x) + + return gen() + + @llm_chat_callback() + def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse: + prompt = self.messages_to_prompt(messages) + completion_response = self.complete(prompt, formatted=True, **kwargs) + return completion_response_to_chat_response(completion_response) + + @llm_chat_callback() + def stream_chat( + self, messages: Sequence[ChatMessage], **kwargs: Any + ) -> ChatResponseGen: + prompt = self.messages_to_prompt(messages) + completion_response = self.stream_complete(prompt, formatted=True, **kwargs) + return stream_completion_response_to_chat_response(completion_response) diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/utils.py b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/utils.py new file mode 100644 index 0000000000000..5ce06710976f2 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/utils.py @@ -0,0 +1,788 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +############################################################################### +# Copyright (C) 2020-2021 Habana Labs, Ltd. 
an Intel Company +############################################################################### + +import copy +import glob +import os +import argparse +import shutil +import tempfile +import time +from pathlib import Path + +import torch +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer +from transformers.utils import check_min_version + +from optimum.habana.checkpoint_utils import ( + get_ds_injection_policy, + get_repo_root, + model_is_optimized, + model_on_meta, + write_checkpoints_json, +) +from optimum.habana.utils import ( + check_habana_frameworks_version, + check_optimum_habana_min_version, + get_habana_frameworks_version, + set_seed, +) + +def setup_parser(parser): + # Arguments management + parser.add_argument("--device", "-d", type=str, choices=["hpu"], help="Device to run", default="hpu") + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + #required=True, + help="Path to pre-trained model (on the HF Hub or locally).", + ) + parser.add_argument( + "--bf16", + default=True, + action="store_true", + help="Whether to perform generation in bf16 precision.", + ) + parser.add_argument("--max_new_tokens", type=int, default=100, help="Number of tokens to generate.") + parser.add_argument( + "--max_input_tokens", + type=int, + default=0, + help="If > 0 then pad and truncate the input sequences to this specified length of tokens. \ + if == 0, then truncate to 16 (original default) \ + if < 0, then do not truncate, use full input prompt", + ) + parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.") + parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.") + parser.add_argument("--n_iterations", type=int, default=5, help="Number of inference iterations for benchmarking.") + parser.add_argument("--local_rank", type=int, default=0, metavar="N", help="Local process rank.") + parser.add_argument( + "--use_kv_cache", + default=True, + action="store_true", + help="Whether to use the key/value cache for decoding. It should speed up generation.", + ) + parser.add_argument( + "--use_hpu_graphs", + default=True, + action="store_true", + help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.", + ) + parser.add_argument( + "--dataset_name", + default=None, + type=str, + help="Optional argument if you want to assess your model on a given dataset of the HF Hub.", + ) + parser.add_argument( + "--column_name", + default=None, + type=str, + help="If `--dataset_name` was given, this will be the name of the column to use as prompts for generation.", + ) + parser.add_argument( + "--do_sample", + action="store_true", + help="Whether to use sampling for generation.", + ) + parser.add_argument( + "--num_beams", + default=1, + type=int, + help="Number of beams used for beam search generation. 1 means greedy search will be performed.", + ) + parser.add_argument( + "--trim_logits", + action="store_true", + help="Calculate logits only for the last token to save memory in the first step.", + ) + parser.add_argument( + "--seed", + default=27, + type=int, + help="Seed to use for random generation. 
Useful to reproduce your runs with `--do_sample`.", + ) + parser.add_argument( + "--profiling_warmup_steps", + default=0, + type=int, + help="Number of steps to ignore for profiling.", + ) + parser.add_argument( + "--profiling_steps", + default=0, + type=int, + help="Number of steps to capture for profiling.", + ) + parser.add_argument( + "--profiling_record_shapes", + default=False, + type=bool, + help="Record shapes when enabling profiling.", + ) + parser.add_argument( + "--prompt", + default=None, + type=str, + nargs="*", + help='Optional argument to give a prompt of your choice as input. Can be a single string (eg: --prompt "Hello world"), or a list of space-separated strings (eg: --prompt "Hello world" "How are you?")', + ) + parser.add_argument( + "--bad_words", + default=None, + type=str, + nargs="+", + help="Optional argument list of words that are not allowed to be generated.", + ) + parser.add_argument( + "--force_words", + default=None, + type=str, + nargs="+", + help="Optional argument list of words that must be generated.", + ) + parser.add_argument( + "--assistant_model", + default=None, + type=str, + help="Optional argument to give a path to a draft/assistant model for assisted decoding.", + ) + parser.add_argument( + "--peft_model", + default=None, + type=str, + help="Optional argument to give a path to a PEFT model.", + ) + parser.add_argument("--num_return_sequences", type=int, default=1) + parser.add_argument( + "--token", + default=None, + type=str, + help="The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`).", + ) + parser.add_argument( + "--model_revision", + default="main", + type=str, + help="The specific model version to use (can be a branch name, tag name or commit id).", + ) + parser.add_argument( + "--attn_softmax_bf16", + action="store_true", + help="Whether to run attention softmax layer in lower precision provided that the model supports it and " + "is also running in lower precision.", + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + help="Output directory to store results in.", + ) + parser.add_argument( + "--bucket_size", + default=-1, + type=int, + help="Bucket size to maintain static shapes. If this number is negative (default is -1) \ + then we use `shape = prompt_length + max_new_tokens`. If a positive number is passed \ + we increase the bucket in steps of `bucket_size` instead of allocating to max (`prompt_length + max_new_tokens`).", + ) + parser.add_argument( + "--bucket_internal", + action="store_true", + help="Split kv sequence into buckets in decode phase. It improves throughput when max_new_tokens is large.", + ) + parser.add_argument( + "--dataset_max_samples", + default=-1, + type=int, + help="If a negative number is passed (default = -1) perform inference on the whole dataset, else use only `dataset_max_samples` samples.", + ) + parser.add_argument( + "--limit_hpu_graphs", + action="store_true", + help="Skip HPU Graph usage for first token to save memory", + ) + parser.add_argument( + "--reuse_cache", + action="store_true", + help="Whether to reuse key/value cache for decoding. It should save memory.", + ) + parser.add_argument("--verbose_workers", action="store_true", help="Enable output from non-master workers") + parser.add_argument( + "--simulate_dyn_prompt", + default=None, + type=int, + nargs="*", + help="If empty, static prompt is used. 
If a comma separated list of integers is passed, we warmup and use those shapes for prompt length.", + ) + parser.add_argument( + "--reduce_recompile", + action="store_true", + help="Preprocess on cpu, and some other optimizations. Useful to prevent recompilations when using dynamic prompts (simulate_dyn_prompt)", + ) + + parser.add_argument( + "--use_flash_attention", + action="store_true", + help="Whether to enable Habana Flash Attention, provided that the model supports it.", + ) + parser.add_argument( + "--flash_attention_recompute", + action="store_true", + help="Whether to enable Habana Flash Attention in recompute mode on first token generation. This gives an opportunity of splitting graph internally which helps reduce memory consumption.", + ) + parser.add_argument( + "--flash_attention_causal_mask", + action="store_true", + help="Whether to enable Habana Flash Attention in causal mode on first token generation.", + ) + parser.add_argument( + "--flash_attention_fast_softmax", + action="store_true", + help="Whether to enable Habana Flash Attention in fast softmax mode.", + ) + parser.add_argument( + "--book_source", + action="store_true", + help="Whether to use project Guttenberg books data as input. Usefull for testing large sequence lenghts.", + ) + parser.add_argument( + "--torch_compile", + action="store_true", + help="Whether to use torch compiled model or not.", + ) + parser.add_argument( + "--ignore_eos", + default=True, + action=argparse.BooleanOptionalAction, + help="Whether to ignore eos, set False to disable it", + ) + parser.add_argument("--temperature", default=1.0, type=float, help="Temperature value for text generation") + parser.add_argument("--top_p", default=1.0, type=float, help="Top_p value for generating text via sampling") + parser.add_argument( + "--const_serialization_path", + "--csp", + type=str, + help="Path to serialize const params. Const params will be held on disk memory instead of being allocated on host memory.", + ) + parser.add_argument( + "--disk_offload", + action="store_true", + help="Whether to enable device map auto. In case no space left on cpu, weights will be offloaded to disk.", + ) + parser.add_argument( + "--trust_remote_code", + action="store_true", + help="Whether or not to allow for custom models defined on the Hub in their own modeling files.", + ) + args = parser.parse_args() + + if args.torch_compile: + args.use_hpu_graphs = False + + if not args.use_hpu_graphs: + args.limit_hpu_graphs = False + + args.quant_config = os.getenv("QUANT_CONFIG", "") + if args.quant_config == "" and args.disk_offload: + logger.warning( + "`--disk_offload` was tested only with fp8, it may not work with full precision. If error raises try to remove the --disk_offload flag." 
+ ) + return args + +def adjust_batch(batch, size): + curr_size = batch["input_ids"].shape[1] + if curr_size >= size: + adjusted_batch = { + "input_ids": batch["input_ids"][:, :size], + "attention_mask": batch["attention_mask"][:, :size], + } + else: + adjusted_batch = {} + for k in batch.keys(): + last_colm = batch[k][:, -1] + expanded = last_colm.tile((size - curr_size, 1)).T + adjusted_batch[k] = torch.concat([batch[k], expanded], 1) + assert adjusted_batch["input_ids"].shape[1] == size + assert adjusted_batch["attention_mask"].shape[1] == size + return adjusted_batch + + +def override_print(enable): + import builtins as __builtin__ + + builtin_print = __builtin__.print + + def print(*args, **kwargs): + force = kwargs.pop("force", False) + if force or enable: + builtin_print(*args, **kwargs) + + __builtin__.print = print + + +def override_logger(logger, enable): + logger_info = logger.info + + def info(*args, **kwargs): + force = kwargs.pop("force", False) + if force or enable: + logger_info(*args, **kwargs) + + logger.info = info + + +def count_hpu_graphs(): + return len(glob.glob(".graph_dumps/*PreGraph*")) + + +def override_prints(enable, logger): + override_print(enable) + override_logger(logger, enable) + + +def setup_distributed(args): + args.local_rank = int(os.getenv("LOCAL_RANK", "0")) + args.world_size = int(os.getenv("WORLD_SIZE", "0")) + args.global_rank = int(os.getenv("RANK", "0")) + + +def setup_inference(args, model): + import habana_frameworks.torch.core as htcore + + habana_version = get_habana_frameworks_version() + + print("Initializing inference mode") + # Keeping the if-else here for back compat. TODO remove later + if habana_version.major >= 1 and habana_version.minor >= 16: + htcore.hpu_initialize(model, mark_only_scales_as_const=True) + else: + const_marking = os.getenv("ENABLE_CONST_MARKING", "True") + if const_marking == "True": + htcore.hpu_initialize(model) + return model + + +def setup_const_serialization(const_serialization_path): + import uuid + + const_serialization_path = os.path.join(const_serialization_path + uuid.uuid4().hex) + os.makedirs(const_serialization_path) + from habana_frameworks.torch.hpu import enable_const_section_serialization + + print("Serializing const params to {}".format(const_serialization_path)) + enable_const_section_serialization(const_serialization_path, True) + + +def setup_env(args): + # Will error if the minimal version of Transformers is not installed. Remove at your own risks. + check_min_version("4.34.0") + check_optimum_habana_min_version("1.9.0.dev0") + # TODO: SW-167588 - WA for memory issue in hqt prep_model + os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE") + + if args.global_rank == 0 and not args.torch_compile: + os.environ.setdefault("GRAPH_VISUALIZATION", "true") + shutil.rmtree(".graph_dumps", ignore_errors=True) + + if args.world_size > 0: + os.environ.setdefault("PT_HPU_LAZY_ACC_PAR_MODE", "0") + os.environ.setdefault("PT_HPU_ENABLE_LAZY_COLLECTIVES", "true") + + if args.use_hpu_graphs and args.limit_hpu_graphs and not args.reuse_cache and args.bucket_internal: + # Based upon above conditions and below env variable, + # we can call HPU graphs clear_inputs(). 
+ os.environ.setdefault("PT_HPUGRAPH_DISABLE_TENSOR_CACHE", "1") + + # Tweak generation so that it runs faster on Gaudi + from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi + + adapt_transformers_to_gaudi() + + +def setup_device(args): + if args.device == "hpu": + import habana_frameworks.torch.core as htcore + + if args.quant_config: + htcore.hpu_set_env() + return torch.device(args.device) + + +# patching LinearAllreduce to use ScopedLinearAllReduce +def patch_scoped_linear_all_reduce(model): + from deepspeed.module_inject.layers import LinearAllreduce + + from optimum.habana.transformers.models.modeling_all_models import ScopedLinearAllReduce + + for name, module in model.named_children(): + if type(module) is LinearAllreduce: + SL = ScopedLinearAllReduce(mod=module) + setattr(model, name, SL) + patch_scoped_linear_all_reduce(module) + + +def get_torch_compiled_model(model): + model.model = torch.compile(model.model, backend="hpu_backend", options={"keep_input_mutations": True}) + return model + + +def setup_model(args, model_dtype, model_kwargs, logger): + logger.info("Single-device run.") + if args.assistant_model is None: + assistant_model = None + else: + logger.info(f"Using asssitant model {args.assistant_model}.") + if args.disk_offload: + from accelerate import infer_auto_device_map, init_empty_weights + + config = AutoConfig.from_pretrained(args.model_name_or_path) + with init_empty_weights(): + model = AutoModelForCausalLM.from_config(config) + max_memory = {"cpu": "10GiB"} + device_map = infer_auto_device_map(model, max_memory=max_memory, dtype=model_dtype) + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, + device_map=device_map, + offload_folder="/tmp/offload_folder/", + offload_state_dict=True, + torch_dtype=model_dtype, + **model_kwargs, + ) + else: + if args.assistant_model is not None: + assistant_model = AutoModelForCausalLM.from_pretrained( + args.assistant_model, torch_dtype=model_dtype, **model_kwargs + ) + if args.peft_model is not None: + model = peft_model(args, model_dtype, logger, **model_kwargs) + else: + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs + ) + if args.quant_config: + import habana_quantization_toolkit + + habana_quantization_toolkit.prep_model(model) + if args.assistant_model is not None: + habana_quantization_toolkit.quantize_model(assistant_model) + + model = model.eval().to(args.device) + if args.assistant_model is not None: + assistant_model = assistant_model.eval().to(args.device) + + if args.use_hpu_graphs: + from habana_frameworks.torch.hpu import wrap_in_hpu_graph + + from optimum.habana.transformers.trainer import _is_peft_model + + if check_habana_frameworks_version("1.13.0") and model.config.model_type == "falcon": + model = wrap_in_hpu_graph(model, hash_with_views=False) + else: + model = wrap_in_hpu_graph(model) + if args.assistant_model is not None: + assistant_model = wrap_in_hpu_graph(assistant_model) + if _is_peft_model(model): + model.base_model = wrap_in_hpu_graph(model.base_model) + + if args.torch_compile and model.config.model_type == "llama": + model = get_torch_compiled_model(model) + # if args.assistant_model is not None: + # assistant_model = get_torch_compiled_model(assistant_model) + return model, assistant_model + + +def setup_distributed_model(args, model_dtype, model_kwargs, logger): + import deepspeed + + logger.info("DeepSpeed is enabled.") + deepspeed.init_distributed(dist_backend="hccl") + 
config = AutoConfig.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs) + load_to_meta = model_on_meta(config) + + if args.assistant_model is None: + assistant_model = None + else: + logger.info(f"Using asssitant model {args.assistant_model}.") + + if load_to_meta: + # Construct model with fake meta tensors, later will be replaced on devices during ds-inference ckpt load + with deepspeed.OnDevice(dtype=model_dtype, device="meta"): + model = AutoModelForCausalLM.from_config(config, torch_dtype=model_dtype) + + # Model loaded to meta is managed differently + checkpoints_json = tempfile.NamedTemporaryFile(suffix=".json", mode="+w") + + # For PEFT models, write the merged model on disk to be able to load it on the meta device + if args.peft_model is not None: + merged_model_dir = "/tmp/text_generation_merged_peft_model" + if args.local_rank == 0: + if Path(merged_model_dir).is_dir(): + shutil.rmtree(merged_model_dir) + peft_model(args, model_dtype, logger, **model_kwargs).save_pretrained(merged_model_dir) + torch.distributed.barrier() + + write_checkpoints_json( + merged_model_dir if args.peft_model is not None else args.model_name_or_path, + args.local_rank, + checkpoints_json, + token=args.token, + ) + else: + # TODO: revisit placement on CPU when auto-injection is possible + with deepspeed.OnDevice(dtype=model_dtype, device="cpu"): + if args.peft_model is not None: + model = peft_model(args, model_dtype, logger, **model_kwargs) + else: + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs + ) + model.eval() + + if args.assistant_model is not None: + assistant_model = AutoModelForCausalLM.from_pretrained( + args.assistant_model, torch_dtype=model_dtype, **model_kwargs + ).eval() + + # Initialize the model + ds_inference_kwargs = {"dtype": model_dtype} + ds_inference_kwargs["tensor_parallel"] = {"tp_size": args.world_size} + ds_inference_kwargs["enable_cuda_graph"] = args.use_hpu_graphs + ds_inference_kwargs["injection_policy"] = get_ds_injection_policy(config) + if load_to_meta: + ds_inference_kwargs["checkpoint"] = checkpoints_json.name + + model = deepspeed.init_inference(model, **ds_inference_kwargs) + model = model.module + if model.config.model_type in ["llama", "falcon", "qwen2"]: + patch_scoped_linear_all_reduce(model) + + if args.quant_config: + import habana_quantization_toolkit + + habana_quantization_toolkit.prep_model(model) + if args.assistant_model is not None: + habana_quantization_toolkit.prep_model(assistant_model) + + if args.torch_compile and model.config.model_type == "llama": + model = get_torch_compiled_model(model) + # if args.assistant_model is not None: + # assistant_model = get_torch_compiled_model(assistant_model) + return model, assistant_model + + +def peft_model(args, model_dtype, logger, **model_kwargs): + import importlib.util + + if importlib.util.find_spec("peft") is None: + raise ImportError("The `peft` package is not installed, please run: `pip install peft`.") + from peft import AutoPeftModelForCausalLM + from peft.config import PeftConfigMixin + + base_model_name = PeftConfigMixin.from_pretrained( + args.peft_model, + token=model_kwargs["token"] if "token" in model_kwargs else None, + ).base_model_name_or_path + + base_model_is_local = Path(base_model_name).is_dir() + if not base_model_is_local: + # Check if the base model path to a remote repository on the HF Hub exists + from huggingface_hub import list_repo_files + + try: + list_repo_files(base_model_name) + 
base_model_is_remote = True + except Exception: + base_model_is_remote = False + + if base_model_is_local or base_model_is_remote: + model = AutoPeftModelForCausalLM.from_pretrained(args.peft_model, torch_dtype=model_dtype, **model_kwargs) + else: + # Since the base model doesn't exist locally nor remotely, use `args.model_name_or_path` as the base model + logger.warning( + f"The base model `{base_model_name}` of the LoRA configuration associated" + f" to `{args.peft_model}` does not exist locally or remotely. Using " + f"`--model_name_or_path {args.model_name_or_path}` as a fall back for the base model." + ) + from peft import PeftModel + + model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs) + model = PeftModel.from_pretrained(model, args.peft_model, torch_dtype=model_dtype, **model_kwargs) + if hasattr(model, "merge_and_unload"): + model = model.merge_and_unload() + if model_dtype == torch.bfloat16: + model = model.to(torch.bfloat16) + return model + else: + from optimum.habana.peft.peft_model import gaudi_generate, gaudi_prepare_inputs_for_generation + + model.__class__.generate = gaudi_generate + model.__class__.prepare_inputs_for_generation = gaudi_prepare_inputs_for_generation + return model + + +def setup_tokenizer(args, model, assistant_model): + tokenizer_kwargs = { + "revision": args.model_revision, + "token": args.token, + "trust_remote_code": args.trust_remote_code, + } + if args.bad_words is not None or args.force_words is not None: + tokenizer_kwargs["add_prefix_space"] = True + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, **tokenizer_kwargs) + if not model.config.is_encoder_decoder: + tokenizer.padding_side = "left" + + if model.config.model_type == "llama": + # unwind broken decapoda-research config + model.generation_config.pad_token_id = 0 + model.generation_config.bos_token_id = 1 + model.generation_config.eos_token_id = 2 + if assistant_model is not None: + assistant_model.generation_config.pad_token_id = 0 + assistant_model.generation_config.bos_token_id = 1 + assistant_model.generation_config.eos_token_id = 2 + tokenizer.bos_token_id = model.generation_config.bos_token_id + tokenizer.eos_token_id = model.generation_config.eos_token_id + tokenizer.pad_token_id = model.generation_config.pad_token_id + tokenizer.pad_token = tokenizer.decode(tokenizer.pad_token_id) + tokenizer.eos_token = tokenizer.decode(tokenizer.eos_token_id) + tokenizer.bos_token = tokenizer.decode(tokenizer.bos_token_id) + if model.config.model_type == "persimmon": + model.generation_config.pad_token_id = model.generation_config.eos_token_id + if assistant_model is not None: + assistant_model.generation_config.pad_token_id = assistant_model.generation_config.eos_token_id + tokenizer.bos_token_id = model.generation_config.bos_token_id + tokenizer.eos_token_id = model.generation_config.eos_token_id + tokenizer.pad_token_id = model.generation_config.pad_token_id + tokenizer.pad_token = tokenizer.decode(tokenizer.pad_token_id) + tokenizer.eos_token = tokenizer.decode(tokenizer.eos_token_id) + tokenizer.bos_token = tokenizer.decode(tokenizer.bos_token_id) + + # Some models like GPT2 do not have a PAD token so we have to set it if necessary + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + model.generation_config.pad_token_id = model.generation_config.eos_token_id + if assistant_model is not None: + assistant_model.generation_config.pad_token_id = assistant_model.generation_config.eos_token_id + + 
return tokenizer, model, assistant_model + + +def setup_generation_config(args, model, assistant_model, tokenizer): + bad_words_ids = None + force_words_ids = None + if args.bad_words is not None: + bad_words_ids = [tokenizer.encode(bad_word, add_special_tokens=False) for bad_word in args.bad_words] + if args.force_words is not None: + force_words_ids = [tokenizer.encode(force_word, add_special_tokens=False) for force_word in args.force_words] + + is_optimized = model_is_optimized(model.config) + + # Generation configuration + generation_config = copy.deepcopy(model.generation_config) + generation_config.max_new_tokens = args.max_new_tokens + generation_config.use_cache = args.use_kv_cache + generation_config.static_shapes = is_optimized and assistant_model is None + generation_config.bucket_size = args.bucket_size if is_optimized else -1 + generation_config.bucket_internal = args.bucket_internal + generation_config.do_sample = args.do_sample + generation_config.num_beams = args.num_beams + generation_config.bad_words_ids = bad_words_ids + generation_config.force_words_ids = force_words_ids + generation_config.num_return_sequences = args.num_return_sequences + generation_config.trim_logits = args.trim_logits + generation_config.attn_softmax_bf16 = args.attn_softmax_bf16 + generation_config.limit_hpu_graphs = args.limit_hpu_graphs + generation_config.reuse_cache = args.reuse_cache + generation_config.reduce_recompile = args.reduce_recompile + if generation_config.reduce_recompile: + assert generation_config.bucket_size > 0 + generation_config.use_flash_attention = args.use_flash_attention + generation_config.flash_attention_recompute = args.flash_attention_recompute + generation_config.flash_attention_causal_mask = args.flash_attention_causal_mask + generation_config.flash_attention_fast_softmax = args.flash_attention_fast_softmax + generation_config.trust_remote_code = args.trust_remote_code + + return generation_config + + +def exclude_hpu_graph_configs(args): + # Excluded configs for batch size 1 for hpu graph + if args.batch_size == 1 and args.limit_hpu_graphs: + if "falcon-180B" in args.model_name_or_path or "falcon-180b" in args.model_name_or_path: + return False + if args.world_size == 2 or args.world_size == 4 or args.world_size == 8: + if args.quant_config: + if args.max_input_tokens >= 8192 and args.max_new_tokens >= 128: + return False + else: + if args.max_input_tokens >= 4096 and args.max_new_tokens >= 128: + return False + return True + else: + return False + + +def initialize_model(args, logger): + init_start = time.perf_counter() + setup_distributed(args) + if exclude_hpu_graph_configs(args): + args.limit_hpu_graphs = False + override_prints(args.global_rank == 0 or args.verbose_workers, logger) + setup_env(args) + setup_device(args) + set_seed(args.seed) + get_repo_root(args.model_name_or_path, local_rank=args.local_rank, token=args.token) + if args.assistant_model is not None: + get_repo_root(args.assistant_model, local_rank=args.local_rank, token=args.token) + use_deepspeed = False + if use_deepspeed or args.bf16: + model_dtype = torch.bfloat16 + else: + model_dtype = torch.float + args.attn_softmax_bf16 = False + + model_kwargs = { + "revision": args.model_revision, + "token": args.token, + "trust_remote_code": args.trust_remote_code, + } + if args.trust_remote_code: + logger.warning("`trust_remote_code` is set, there is no guarantee this model works properly and it may fail") + + model, assistant_model = ( + setup_model(args, model_dtype, model_kwargs, logger) + if not 
use_deepspeed + else setup_distributed_model(args, model_dtype, model_kwargs, logger) + ) + tokenizer, model, assistant_model = setup_tokenizer(args, model, assistant_model) + generation_config = setup_generation_config(args, model, assistant_model, tokenizer) + + if args.const_serialization_path: + setup_const_serialization(args.const_serialization_path) + if args.quant_config: + model = setup_inference(args, model) + init_end = time.perf_counter() + logger.info(f"Args: {args}") + logger.info(f"device: {args.device}, n_hpu: {args.world_size}, bf16: {model_dtype == torch.bfloat16}") + logger.info(f"Model initialization took {(init_end - init_start):.3f}s") + return model, assistant_model, tokenizer, generation_config diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/pyproject.toml b/llama-index-integrations/llms/llama-index-llms-gaudi/pyproject.toml new file mode 100644 index 0000000000000..ba65f29ae2ac8 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/pyproject.toml @@ -0,0 +1,72 @@ +[build-system] +build-backend = "poetry.core.masonry.api" +requires = ["poetry-core"] + +[tool.codespell] +check-filenames = true +check-hidden = true +# Feel free to un-skip examples, and experimental, you will just need to +# work through many typos (--write-changes and --interactive will help) +skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" + +[tool.llamahub] +contains_example = false +import_path = "llama_index.llms.gaudi" + +[tool.llamahub.class_authors] +GaudiLLM = "llama-index" + +[tool.mypy] +disallow_untyped_defs = true +# Remove venv skip when integrated with pre-commit +exclude = ["_static", "build", "examples", "notebooks", "venv"] +ignore_missing_imports = true +python_version = "3.10" + +[tool.poetry] +authors = ["Your Name "] +description = "llama-index llms gaudi integration" +license = "MIT" +name = "llama-index-llms-gaudi" +packages = [{include = "llama_index/"}] +readme = "README.md" +version = "0.1.0" + +[tool.poetry.dependencies] +python = ">=3.8.1,<4.0" +huggingface-hub = "^0.23.0" +torch = "^2.1.2" +text-generation = "^0.7.0" +llama-index-core = "^0.11.0" + +[tool.poetry.dependencies.transformers] +extras = ["torch"] +version = "^4.37.0" + +[tool.poetry.group.dev.dependencies] +ipython = "8.10.0" +jupyter = "^1.0.0" +mypy = "0.991" +pre-commit = "3.2.0" +pylint = "2.15.10" +pytest = "7.2.1" +pytest-mock = "3.11.1" +ruff = "0.0.292" +tree-sitter-languages = "^1.8.0" +types-Deprecated = ">=0.1.0" +types-PyYAML = "^6.0.12.12" +types-protobuf = "^4.24.0.4" +types-redis = "4.5.5.0" +types-requests = "2.28.11.8" +types-setuptools = "67.1.0.0" + +[tool.poetry.group.dev.dependencies.black] +extras = ["jupyter"] +version = "<=23.9.1,>=23.7.0" + +[tool.poetry.group.dev.dependencies.codespell] +extras = ["toml"] +version = ">=v2.2.6" + +[[tool.poetry.packages]] +include = "llama_index/" From 3d51c673b22b8cdc73b03e6a8da5a1f889c47ccb Mon Sep 17 00:00:00 2001 From: jeanyu-habana Date: Sat, 5 Oct 2024 12:05:33 -0500 Subject: [PATCH 2/2] resolve review comments and lint compaints --- .../llms/llama-index-llms-gaudi/README.md | 39 +- .../llama-index-llms-gaudi/examples/README.md | 1 - .../llama-index-llms-gaudi/examples/basic.py | 327 ++++++++- .../llama_index/llms/gaudi/base.py | 670 ++++++++++-------- .../llama_index/llms/gaudi/utils.py | 369 +++------- .../llama-index-llms-gaudi/pyproject.toml | 7 +- 6 files changed, 799 insertions(+), 614 deletions(-) diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/README.md 
b/llama-index-integrations/llms/llama-index-llms-gaudi/README.md index 30780ffeb6b58..07ff53ba5d7e6 100644 --- a/llama-index-integrations/llms/llama-index-llms-gaudi/README.md +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/README.md @@ -4,15 +4,52 @@ ```bash pip install --upgrade-strategy eager optimum[habana] +pip install llama-index-llms-gaudi +pip install llama-index-llms-huggingface ``` ## Usage ```python +import argparse +import os, logging from llama_index.llms.gaudi import GaudiLLM + + +def setup_parser(parser): + parser.add_argument(...) + args = parser.parse_args() + return args + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="GaudiLLM Basic Usage Example" + ) + args = setup_parser(parser) + args.model_name_or_path = "HuggingFaceH4/zephyr-7b-alpha" + + llm = GaudiLLM( + args=args, + logger=logger, + model_name="HuggingFaceH4/zephyr-7b-alpha", + tokenizer_name="HuggingFaceH4/zephyr-7b-alpha", + query_wrapper_prompt=PromptTemplate( + "<|system|>\n\n<|user|>\n{query_str}\n<|assistant|>\n" + ), + context_window=3900, + max_new_tokens=256, + generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95}, + messages_to_prompt=messages_to_prompt, + device_map="auto", + ) + + query = "Is the ocean blue?" + print("\n----------------- Complete ------------------") + completion_response = llm.complete(query) + print(completion_response.text) ``` ## Examples -- [Notebook Example](https://docs.llamaindex.ai/en/stable/examples/llm/gaudi/) - [More Examples](https://github.com/run-llama/llama_index/tree/main/llama-index-integrations/llms/llama-index-llms-gaudi/examples) diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/examples/README.md b/llama-index-integrations/llms/llama-index-llms-gaudi/examples/README.md index 75226f0272cef..a9bdec0912010 100644 --- a/llama-index-integrations/llms/llama-index-llms-gaudi/examples/README.md +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/examples/README.md @@ -27,4 +27,3 @@ python basic.py > ```bash > pip install -U transformers tokenizers > ``` - diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/examples/basic.py b/llama-index-integrations/llms/llama-index-llms-gaudi/examples/basic.py index 6f938533c9643..c2ec27582cf2f 100644 --- a/llama-index-integrations/llms/llama-index-llms-gaudi/examples/basic.py +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/examples/basic.py @@ -1,3 +1,311 @@ +import os, logging +import argparse +from llama_index.llms.gaudi import GaudiLLM +from llama_index.core.prompts import PromptTemplate + +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, +) +logger = logging.getLogger(__name__) + + +def setup_parser(parser): + # Arguments management + parser.add_argument( + "--device", "-d", type=str, choices=["hpu"], help="Device to run", default="hpu" + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + # required=True, + help="Path to pre-trained model (on the HF Hub or locally).", + ) + parser.add_argument( + "--bf16", + default=True, + action="store_true", + help="Whether to perform generation in bf16 precision.", + ) + parser.add_argument( + "--max_new_tokens", type=int, default=100, help="Number of tokens to generate." + ) + parser.add_argument( + "--max_input_tokens", + type=int, + default=0, + help="If > 0 then pad and truncate the input sequences to this specified length of tokens. 
\ + if == 0, then truncate to 16 (original default) \ + if < 0, then do not truncate, use full input prompt", + ) + parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.") + parser.add_argument( + "--warmup", + type=int, + default=3, + help="Number of warmup iterations for benchmarking.", + ) + parser.add_argument( + "--n_iterations", + type=int, + default=5, + help="Number of inference iterations for benchmarking.", + ) + parser.add_argument( + "--local_rank", type=int, default=0, metavar="N", help="Local process rank." + ) + parser.add_argument( + "--use_kv_cache", + default=True, + action="store_true", + help="Whether to use the key/value cache for decoding. It should speed up generation.", + ) + parser.add_argument( + "--use_hpu_graphs", + default=True, + action="store_true", + help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.", + ) + parser.add_argument( + "--dataset_name", + default=None, + type=str, + help="Optional argument if you want to assess your model on a given dataset of the HF Hub.", + ) + parser.add_argument( + "--column_name", + default=None, + type=str, + help="If `--dataset_name` was given, this will be the name of the column to use as prompts for generation.", + ) + parser.add_argument( + "--do_sample", + action="store_true", + help="Whether to use sampling for generation.", + ) + parser.add_argument( + "--num_beams", + default=1, + type=int, + help="Number of beams used for beam search generation. 1 means greedy search will be performed.", + ) + parser.add_argument( + "--trim_logits", + action="store_true", + help="Calculate logits only for the last token to save memory in the first step.", + ) + parser.add_argument( + "--seed", + default=27, + type=int, + help="Seed to use for random generation. Useful to reproduce your runs with `--do_sample`.", + ) + parser.add_argument( + "--profiling_warmup_steps", + default=0, + type=int, + help="Number of steps to ignore for profiling.", + ) + parser.add_argument( + "--profiling_steps", + default=0, + type=int, + help="Number of steps to capture for profiling.", + ) + parser.add_argument( + "--profiling_record_shapes", + default=False, + type=bool, + help="Record shapes when enabling profiling.", + ) + parser.add_argument( + "--prompt", + default=None, + type=str, + nargs="*", + help='Optional argument to give a prompt of your choice as input. Can be a single string (eg: --prompt "Hello world"), or a list of space-separated strings (eg: --prompt "Hello world" "How are you?")', + ) + parser.add_argument( + "--bad_words", + default=None, + type=str, + nargs="+", + help="Optional argument list of words that are not allowed to be generated.", + ) + parser.add_argument( + "--force_words", + default=None, + type=str, + nargs="+", + help="Optional argument list of words that must be generated.", + ) + parser.add_argument( + "--assistant_model", + default=None, + type=str, + help="Optional argument to give a path to a draft/assistant model for assisted decoding.", + ) + parser.add_argument( + "--peft_model", + default=None, + type=str, + help="Optional argument to give a path to a PEFT model.", + ) + parser.add_argument("--num_return_sequences", type=int, default=1) + parser.add_argument( + "--token", + default=None, + type=str, + help="The token to use as HTTP bearer authorization for remote files. 
If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`).", + ) + parser.add_argument( + "--model_revision", + default="main", + type=str, + help="The specific model version to use (can be a branch name, tag name or commit id).", + ) + parser.add_argument( + "--attn_softmax_bf16", + action="store_true", + help="Whether to run attention softmax layer in lower precision provided that the model supports it and " + "is also running in lower precision.", + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + help="Output directory to store results in.", + ) + parser.add_argument( + "--bucket_size", + default=-1, + type=int, + help="Bucket size to maintain static shapes. If this number is negative (default is -1) \ + then we use `shape = prompt_length + max_new_tokens`. If a positive number is passed \ + we increase the bucket in steps of `bucket_size` instead of allocating to max (`prompt_length + max_new_tokens`).", + ) + parser.add_argument( + "--bucket_internal", + action="store_true", + help="Split kv sequence into buckets in decode phase. It improves throughput when max_new_tokens is large.", + ) + parser.add_argument( + "--dataset_max_samples", + default=-1, + type=int, + help="If a negative number is passed (default = -1) perform inference on the whole dataset, else use only `dataset_max_samples` samples.", + ) + parser.add_argument( + "--limit_hpu_graphs", + action="store_true", + help="Skip HPU Graph usage for first token to save memory", + ) + parser.add_argument( + "--reuse_cache", + action="store_true", + help="Whether to reuse key/value cache for decoding. It should save memory.", + ) + parser.add_argument( + "--verbose_workers", + action="store_true", + help="Enable output from non-master workers", + ) + parser.add_argument( + "--simulate_dyn_prompt", + default=None, + type=int, + nargs="*", + help="If empty, static prompt is used. If a comma separated list of integers is passed, we warmup and use those shapes for prompt length.", + ) + parser.add_argument( + "--reduce_recompile", + action="store_true", + help="Preprocess on cpu, and some other optimizations. Useful to prevent recompilations when using dynamic prompts (simulate_dyn_prompt)", + ) + + parser.add_argument( + "--use_flash_attention", + action="store_true", + help="Whether to enable Habana Flash Attention, provided that the model supports it.", + ) + parser.add_argument( + "--flash_attention_recompute", + action="store_true", + help="Whether to enable Habana Flash Attention in recompute mode on first token generation. This gives an opportunity of splitting graph internally which helps reduce memory consumption.", + ) + parser.add_argument( + "--flash_attention_causal_mask", + action="store_true", + help="Whether to enable Habana Flash Attention in causal mode on first token generation.", + ) + parser.add_argument( + "--flash_attention_fast_softmax", + action="store_true", + help="Whether to enable Habana Flash Attention in fast softmax mode.", + ) + parser.add_argument( + "--book_source", + action="store_true", + help="Whether to use project Guttenberg books data as input. 
Useful for testing large sequence lengths.", + ) + parser.add_argument( + "--torch_compile", + action="store_true", + help="Whether to use torch compiled model or not.", + ) + parser.add_argument( + "--ignore_eos", + default=True, + action=argparse.BooleanOptionalAction, + help="Whether to ignore eos, set False to disable it", + ) + parser.add_argument( + "--temperature", + default=1.0, + type=float, + help="Temperature value for text generation", + ) + parser.add_argument( + "--top_p", + default=1.0, + type=float, + help="Top_p value for generating text via sampling", + ) + parser.add_argument( + "--const_serialization_path", + "--csp", + type=str, + help="Path to serialize const params. Const params will be held on disk memory instead of being allocated on host memory.", + ) + parser.add_argument( + "--disk_offload", + action="store_true", + help="Whether to enable device map auto. In case no space left on cpu, weights will be offloaded to disk.", + ) + parser.add_argument( + "--trust_remote_code", + action="store_true", + help="Whether or not to allow for custom models defined on the Hub in their own modeling files.", + ) + args = parser.parse_args() + + if args.torch_compile: + args.use_hpu_graphs = False + + if not args.use_hpu_graphs: + args.limit_hpu_graphs = False + + args.quant_config = os.getenv("QUANT_CONFIG", "") + if args.quant_config == "" and args.disk_offload: + logger.warning( + "`--disk_offload` was tested only with fp8, it may not work with full precision. If error raises try to remove the --disk_offload flag." + ) + return args + + # Transform a string into input zephyr-specific input def completion_to_prompt(completion): return f"<|system|>\n\n<|user|>\n{completion}\n<|assistant|>\n" @@ -24,21 +332,6 @@ def messages_to_prompt(messages): return prompt -import logging -import argparse -from llama_index.llms.gaudi import GaudiLLM -from llama_index.core.prompts import PromptTemplate -from llama_index.llms.gaudi.utils import ( - setup_parser, -) -logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, -) -logger = logging.getLogger(__name__) - - if __name__ == "__main__": parser = argparse.ArgumentParser(description="GaudiLLM Basic Usage Example") args = setup_parser(parser) @@ -49,7 +342,9 @@ def messages_to_prompt(messages): logger=logger, model_name="HuggingFaceH4/zephyr-7b-alpha", tokenizer_name="HuggingFaceH4/zephyr-7b-alpha", - query_wrapper_prompt=PromptTemplate("<|system|>\n\n<|user|>\n{query_str}\n<|assistant|>\n"), + query_wrapper_prompt=PromptTemplate( + "<|system|>\n\n<|user|>\n{query_str}\n<|assistant|>\n" + ), context_window=3900, max_new_tokens=256, generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95}, diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/base.py b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/base.py index dfb6e9231d162..25732482fddb5 100644 --- a/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/base.py +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/base.py @@ -1,72 +1,381 @@ -# This file is adapted from -# https://github.com/run-llama/llama_index/blob/main/llama-index-integrations/ -# llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import logging from typing import Any, Callable, List, Optional, Sequence, Union -import torch -from llama_index.core.base.llms.types import ( - ChatMessage, - ChatResponse, - ChatResponseGen, - CompletionResponse, - CompletionResponseGen, - LLMMetadata, -) -from llama_index.core.bridge.pydantic import Field, PrivateAttr +from llama_index.core.base.llms.types import ChatMessage +from llama_index.core.bridge.pydantic import Field +from llama_index.llms.huggingface.base import HuggingFaceLLM from llama_index.core.callbacks import CallbackManager from llama_index.core.constants import ( DEFAULT_CONTEXT_WINDOW, DEFAULT_NUM_OUTPUTS, ) -from llama_index.core.llms.callbacks import ( - llm_chat_callback, - llm_completion_callback, -) -from llama_index.core.llms.custom import CustomLLM - -from llama_index.core.base.llms.generic_utils import ( - completion_response_to_chat_response, - stream_completion_response_to_chat_response, - messages_to_prompt as generic_messages_to_prompt, -) +from llama_index.core.types import BaseOutputParser, PydanticProgramMode from llama_index.core.prompts.base import PromptTemplate -from llama_index.core.types import BaseOutputParser, PydanticProgramMode, Thread -from transformers import ( - StoppingCriteria, - StoppingCriteriaList, -) -from transformers import AutoTokenizer, LlamaTokenizer -#gaudi + from llama_index.llms.gaudi.utils import initialize_model -from llama_index.llms.huggingface import HuggingFaceLLM -#DEFAULT_HUGGINGFACE_MODEL = "meta-llama/Llama-2-7b-chat-hf" -DEFAULT_HUGGINGFACE_MODEL = "/home/ubuntu/jean/models/mistral" +DEFAULT_HUGGINGFACE_MODEL = "Intel/neural-chat-7b-v3-1" +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, +) logger = logging.getLogger(__name__) -class GaudiLLM(CustomLLM): - r"""Gaudi-LLM. +class GaudiLLM(HuggingFaceLLM): + r"""GaudiLLM LLM. - Example: - .. code-block:: python + Examples: + `pip install llama-index-llms-gaudi` + + ```python + from llama_index.llms.gaudi import GaudiLLM + import argparse + import os, logging + + def setup_parser(parser): + # Arguments management + parser.add_argument( + "--device", "-d", type=str, choices=["hpu"], help="Device to run", default="hpu" + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + # required=True, + help="Path to pre-trained model (on the HF Hub or locally).", + ) + parser.add_argument( + "--bf16", + default=True, + action="store_true", + help="Whether to perform generation in bf16 precision.", + ) + parser.add_argument( + "--max_new_tokens", type=int, default=100, help="Number of tokens to generate." + ) + parser.add_argument( + "--max_input_tokens", + type=int, + default=0, + help="If > 0 then pad and truncate the input sequences to this specified length of tokens. 
\ + if == 0, then truncate to 16 (original default) \ + if < 0, then do not truncate, use full input prompt", + ) + parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.") + parser.add_argument( + "--warmup", + type=int, + default=3, + help="Number of warmup iterations for benchmarking.", + ) + parser.add_argument( + "--n_iterations", + type=int, + default=5, + help="Number of inference iterations for benchmarking.", + ) + parser.add_argument( + "--local_rank", type=int, default=0, metavar="N", help="Local process rank." + ) + parser.add_argument( + "--use_kv_cache", + default=True, + action="store_true", + help="Whether to use the key/value cache for decoding. It should speed up generation.", + ) + parser.add_argument( + "--use_hpu_graphs", + default=True, + action="store_true", + help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.", + ) + parser.add_argument( + "--dataset_name", + default=None, + type=str, + help="Optional argument if you want to assess your model on a given dataset of the HF Hub.", + ) + parser.add_argument( + "--column_name", + default=None, + type=str, + help="If `--dataset_name` was given, this will be the name of the column to use as prompts for generation.", + ) + parser.add_argument( + "--do_sample", + action="store_true", + help="Whether to use sampling for generation.", + ) + parser.add_argument( + "--num_beams", + default=1, + type=int, + help="Number of beams used for beam search generation. 1 means greedy search will be performed.", + ) + parser.add_argument( + "--trim_logits", + action="store_true", + help="Calculate logits only for the last token to save memory in the first step.", + ) + parser.add_argument( + "--seed", + default=27, + type=int, + help="Seed to use for random generation. Useful to reproduce your runs with `--do_sample`.", + ) + parser.add_argument( + "--profiling_warmup_steps", + default=0, + type=int, + help="Number of steps to ignore for profiling.", + ) + parser.add_argument( + "--profiling_steps", + default=0, + type=int, + help="Number of steps to capture for profiling.", + ) + parser.add_argument( + "--profiling_record_shapes", + default=False, + type=bool, + help="Record shapes when enabling profiling.", + ) + parser.add_argument( + "--prompt", + default=None, + type=str, + nargs="*", + help='Optional argument to give a prompt of your choice as input. Can be a single string (eg: --prompt "Hello world"), or a list of space-separated strings (eg: --prompt "Hello world" "How are you?")', + ) + parser.add_argument( + "--bad_words", + default=None, + type=str, + nargs="+", + help="Optional argument list of words that are not allowed to be generated.", + ) + parser.add_argument( + "--force_words", + default=None, + type=str, + nargs="+", + help="Optional argument list of words that must be generated.", + ) + parser.add_argument( + "--assistant_model", + default=None, + type=str, + help="Optional argument to give a path to a draft/assistant model for assisted decoding.", + ) + parser.add_argument( + "--peft_model", + default=None, + type=str, + help="Optional argument to give a path to a PEFT model.", + ) + parser.add_argument("--num_return_sequences", type=int, default=1) + parser.add_argument( + "--token", + default=None, + type=str, + help="The token to use as HTTP bearer authorization for remote files. 
If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`).", + ) + parser.add_argument( + "--model_revision", + default="main", + type=str, + help="The specific model version to use (can be a branch name, tag name or commit id).", + ) + parser.add_argument( + "--attn_softmax_bf16", + action="store_true", + help="Whether to run attention softmax layer in lower precision provided that the model supports it and " + "is also running in lower precision.", + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + help="Output directory to store results in.", + ) + parser.add_argument( + "--bucket_size", + default=-1, + type=int, + help="Bucket size to maintain static shapes. If this number is negative (default is -1) \ + then we use `shape = prompt_length + max_new_tokens`. If a positive number is passed \ + we increase the bucket in steps of `bucket_size` instead of allocating to max (`prompt_length + max_new_tokens`).", + ) + parser.add_argument( + "--bucket_internal", + action="store_true", + help="Split kv sequence into buckets in decode phase. It improves throughput when max_new_tokens is large.", + ) + parser.add_argument( + "--dataset_max_samples", + default=-1, + type=int, + help="If a negative number is passed (default = -1) perform inference on the whole dataset, else use only `dataset_max_samples` samples.", + ) + parser.add_argument( + "--limit_hpu_graphs", + action="store_true", + help="Skip HPU Graph usage for first token to save memory", + ) + parser.add_argument( + "--reuse_cache", + action="store_true", + help="Whether to reuse key/value cache for decoding. It should save memory.", + ) + parser.add_argument( + "--verbose_workers", + action="store_true", + help="Enable output from non-master workers", + ) + parser.add_argument( + "--simulate_dyn_prompt", + default=None, + type=int, + nargs="*", + help="If empty, static prompt is used. If a comma separated list of integers is passed, we warmup and use those shapes for prompt length.", + ) + parser.add_argument( + "--reduce_recompile", + action="store_true", + help="Preprocess on cpu, and some other optimizations. Useful to prevent recompilations when using dynamic prompts (simulate_dyn_prompt)", + ) + parser.add_argument( + "--use_flash_attention", + action="store_true", + help="Whether to enable Habana Flash Attention, provided that the model supports it.", + ) + parser.add_argument( + "--flash_attention_recompute", + action="store_true", + help="Whether to enable Habana Flash Attention in recompute mode on first token generation. This gives an opportunity of splitting graph internally which helps reduce memory consumption.", + ) + parser.add_argument( + "--flash_attention_causal_mask", + action="store_true", + help="Whether to enable Habana Flash Attention in causal mode on first token generation.", + ) + parser.add_argument( + "--flash_attention_fast_softmax", + action="store_true", + help="Whether to enable Habana Flash Attention in fast softmax mode.", + ) + parser.add_argument( + "--book_source", + action="store_true", + help="Whether to use project Guttenberg books data as input. 
Useful for testing large sequence lengths.", + ) + parser.add_argument( + "--torch_compile", + action="store_true", + help="Whether to use torch compiled model or not.", + ) + parser.add_argument( + "--ignore_eos", + default=True, + action=argparse.BooleanOptionalAction, + help="Whether to ignore eos, set False to disable it", + ) + parser.add_argument( + "--temperature", + default=1.0, + type=float, + help="Temperature value for text generation", + ) + parser.add_argument( + "--top_p", + default=1.0, + type=float, + help="Top_p value for generating text via sampling", + ) + parser.add_argument( + "--const_serialization_path", + "--csp", + type=str, + help="Path to serialize const params. Const params will be held on disk memory instead of being allocated on host memory.", + ) + parser.add_argument( + "--disk_offload", + action="store_true", + help="Whether to enable device map auto. In case no space left on cpu, weights will be offloaded to disk.", + ) + parser.add_argument( + "--trust_remote_code", + action="store_true", + help="Whether or not to allow for custom models defined on the Hub in their own modeling files.", + ) + args = parser.parse_args() + + if args.torch_compile: + args.use_hpu_graphs = False + + if not args.use_hpu_graphs: + args.limit_hpu_graphs = False + + args.quant_config = os.getenv("QUANT_CONFIG", "") + if args.quant_config == "" and args.disk_offload: + logger.warning( + "`--disk_offload` was tested only with fp8, it may not work with full precision. If error raises try to remove the --disk_offload flag." + ) + return args + + def messages_to_prompt(messages): + prompt = "" + for message in messages: + if message.role == 'system': + prompt += f"<|system|>\n{message.content}\n" + elif message.role == 'user': + prompt += f"<|user|>\n{message.content}\n" + elif message.role == 'assistant': + prompt += f"<|assistant|>\n{message.content}\n" + + # ensure we start with a system prompt, insert blank if needed + if not prompt.startswith("<|system|>\n"): + prompt = "<|system|>\n\n" + prompt + + # add final assistant prompt + prompt = prompt + "<|assistant|>\n" + + return prompt + + def completion_to_prompt(completion): + return f"<|system|>\n\n<|user|>\n{completion}\n<|assistant|>\n" + + import torch + from llama_index.core.prompts import PromptTemplate + from llama_index.llms.optimum-intel import GaudiLLM + + parser = argparse.ArgumentParser(description="GaudiLLM Basic Usage Example") + args = setup_parser(parser) + args.model_name_or_path = "HuggingFaceH4/zephyr-7b-alpha" + + llm = GaudiLLM( + args=args, + logger=logger, + model_name="HuggingFaceH4/zephyr-7b-alpha", + tokenizer_name="HuggingFaceH4/zephyr-7b-alpha", + query_wrapper_prompt=PromptTemplate( + "<|system|>\n\n<|user|>\n{query_str}\n<|assistant|>\n" + ), + context_window=3900, + max_new_tokens=256, + generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95}, + messages_to_prompt=messages_to_prompt, + device_map="auto", + ) - from llama_index.llms.ipex_llm import GaudiLLM - llm = GaudiLLM(model_path="/path/to/llama/model") + response = llm.complete("What is the meaning of life?") + print(str(response)) + ``` """ model_name: str = Field( @@ -76,24 +385,6 @@ class GaudiLLM(CustomLLM): "Unused if `model` is passed in directly." 
), ) - context_window: int = Field( - default=DEFAULT_CONTEXT_WINDOW, - description="The maximum number of tokens available for input.", - gt=0, - ) - max_new_tokens: int = Field( - default=DEFAULT_NUM_OUTPUTS, - description="The maximum number of tokens to generate.", - gt=0, - ) - query_wrapper_prompt: PromptTemplate = Field( - default=PromptTemplate("{query_str}"), - description=( - "The query wrapper prompt, containing the query placeholder. " - "The model card on HuggingFace should specify if this is needed. " - "Should contain a `{query_str}` placeholder." - ), - ) tokenizer_name: str = Field( default=DEFAULT_HUGGINGFACE_MODEL, description=( @@ -101,47 +392,11 @@ class GaudiLLM(CustomLLM): "Unused if `tokenizer` is passed in directly." ), ) - device_map: str = Field( - default="cpu", description="The device_map to use. Defaults to 'cpu'." - ) - stopping_ids: List[int] = Field( - default_factory=list, - description=( - "The stopping ids to use. " - "Generation stops when these token IDs are predicted." - ), - ) - tokenizer_outputs_to_remove: list = Field( - default_factory=list, - description=( - "The outputs to remove from the tokenizer. " - "Sometimes huggingface tokenizers return extra inputs that cause errors." - ), - ) - tokenizer_kwargs: dict = Field( - default_factory=dict, description="The kwargs to pass to the tokenizer." - ) - model_kwargs: dict = Field( - default_factory=dict, - description="The kwargs to pass to the model during initialization.", - ) - generate_kwargs: dict = Field( - default_factory=dict, - description="The kwargs to pass to the model during generation.", - ) - is_chat_model: bool = Field( - default=False, - description=" Be sure to verify that you either pass an appropriate tokenizer " - "that can convert prompts to properly formatted chat messages or a " - "`messages_to_prompt` that does so.", - ) - - _model: Any = PrivateAttr() - _tokenizer: Any = PrivateAttr() - _stopping_criteria: Any = PrivateAttr() def __init__( - self, args, logger, + self, + args, + logger, context_window: int = DEFAULT_CONTEXT_WINDOW, max_new_tokens: int = DEFAULT_NUM_OUTPUTS, query_wrapper_prompt: Union[str, PromptTemplate] = "{query_str}", @@ -163,77 +418,10 @@ def __init__( pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT, output_parser: Optional[BaseOutputParser] = None, ) -> None: - """ - Construct GaudiLLM. - - Args: - context_window: The maximum number of tokens available for input. - max_new_tokens: The maximum number of tokens to generate. - tokenizer_name: The name of the tokenizer to use from HuggingFace. - Unused if `tokenizer` is passed in directly. - model_name: The model name to use from HuggingFace. - Unused if `model` is passed in directly. - model: The HuggingFace model. - tokenizer: The tokenizer. - device_map: The device_map to use. Defaults to 'auto'. - stopping_ids: The stopping ids to use. - Generation stops when these token IDs are predicted. - tokenizer_kwargs: The kwargs to pass to the tokenizer. - tokenizer_outputs_to_remove: The outputs to remove from the tokenizer. - Sometimes huggingface tokenizers return extra inputs that cause errors. - model_kwargs: The kwargs to pass to the model during initialization. - generate_kwargs: The kwargs to pass to the model during generation. - is_chat_model: Whether the model is `chat` - callback_manager: Callback manager. - messages_to_prompt: Function to convert messages to prompt. - completion_to_prompt: Function to convert messages to prompt. - pydantic_program_mode: DEFAULT. 
- output_parser: BaseOutputParser. - - Returns: - None. - """ + """Initialize params.""" model_kwargs = model_kwargs or {} - model, _, tokenizer, _= initialize_model(args, logger) - - # check context_window - config_dict = model.config.to_dict() - model_context_window = int( - config_dict.get("max_position_embeddings", context_window) - ) - if model_context_window and model_context_window < context_window: - logger.warning( - f"Supplied context_window {context_window} is greater " - f"than the model's max input size {model_context_window}. " - "Disable this warning by setting a lower context_window." - ) - context_window = model_context_window - - - # setup stopping criteria - stopping_ids_list = stopping_ids or [] - - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - - class StopOnTokens(StoppingCriteria): - def __call__( - self, - input_ids: torch.LongTensor, - scores: torch.FloatTensor, - **kwargs: Any, - ) -> bool: - for stop_id in stopping_ids_list: - if input_ids[0][-1] == stop_id: - return True - return False - - stopping_criteria = StoppingCriteriaList([StopOnTokens()]) - if isinstance(query_wrapper_prompt, str): - query_wrapper_prompt = PromptTemplate(query_wrapper_prompt) - - messages_to_prompt = messages_to_prompt or self._tokenizer_messages_to_prompt + model, _, tokenizer, _ = initialize_model(args, logger) super().__init__( context_window=context_window, @@ -241,6 +429,8 @@ def __call__( query_wrapper_prompt=query_wrapper_prompt, tokenizer_name=tokenizer_name, model_name=model_name, + model=model, + tokenizer=tokenizer, device_map=device_map, stopping_ids=stopping_ids or [], tokenizer_kwargs=tokenizer_kwargs or {}, @@ -256,130 +446,6 @@ def __call__( output_parser=output_parser, ) - self._model = model - self._tokenizer = tokenizer - self._stopping_criteria = stopping_criteria - @classmethod def class_name(cls) -> str: return "GaudiLLM" - - @property - def metadata(self) -> LLMMetadata: - """LLM metadata.""" - return LLMMetadata( - context_window=self.context_window, - num_output=self.max_new_tokens, - model_name=self.model_name, - is_chat_model=self.is_chat_model, - ) - - def _tokenizer_messages_to_prompt(self, messages: Sequence[ChatMessage]) -> str: - if hasattr(self._tokenizer, "apply_chat_template"): - messages_dict = [ - {"role": message.role.value, "content": message.content} - for message in messages - ] - tokens = self._tokenizer.apply_chat_template(messages_dict) - return self._tokenizer.decode(tokens) - - return generic_messages_to_prompt(messages) - - @llm_completion_callback() - def complete( - self, prompt: str, formatted: bool = False, **kwargs: Any - ) -> CompletionResponse: - """ - Complete by LLM. - - Args: - prompt: Prompt for completion. - formatted: Whether the prompt is formatted by wrapper. - kwargs: Other kwargs for complete. - - Returns: - CompletionReponse after generation. 
- """ - if not formatted: - prompt = self.completion_to_prompt(prompt) - input_ids = self._tokenizer(prompt, return_tensors="pt") - input_ids = input_ids.to(self._model.device) - # remove keys from the tokenizer if needed, to avoid HF errors - for key in self.tokenizer_outputs_to_remove: - if key in input_ids: - input_ids.pop(key, None) - tokens = self._model.generate( - **input_ids, - max_new_tokens=self.max_new_tokens, - stopping_criteria=self._stopping_criteria, - pad_token_id=self._tokenizer.pad_token_id, - **self.generate_kwargs, - ) - completion_tokens = tokens[0][input_ids["input_ids"].size(1) :] - completion = self._tokenizer.decode(completion_tokens, skip_special_tokens=True) - - return CompletionResponse(text=completion, raw={"model_output": tokens}) - - @llm_completion_callback() - def stream_complete( - self, prompt: str, formatted: bool = False, **kwargs: Any - ) -> CompletionResponseGen: - """ - Complete by LLM in stream. - - Args: - prompt: Prompt for completion. - formatted: Whether the prompt is formatted by wrapper. - kwargs: Other kwargs for complete. - - Returns: - CompletionReponse after generation. - """ - from transformers import TextIteratorStreamer - - if not formatted: - prompt = self.completion_to_prompt(prompt) - - input_ids = self._tokenizer.encode(prompt, return_tensors="pt") - input_ids = input_ids.to(self._model.device) - - for key in self.tokenizer_outputs_to_remove: - if key in input_ids: - input_ids.pop(key, None) - - streamer = TextIteratorStreamer( - self._tokenizer, skip_prompt=True, skip_special_tokens=True - ) - generation_kwargs = dict( - input_ids=input_ids, - streamer=streamer, - max_new_tokens=self.max_new_tokens, - stopping_criteria=self._stopping_criteria, - pad_token_id=self._tokenizer.pad_token_id, - **self.generate_kwargs, - ) - thread = Thread(target=self._model.generate, kwargs=generation_kwargs) - thread.start() - - # create generator based off of streamer - def gen() -> CompletionResponseGen: - text = "" - for x in streamer: - text += x - yield CompletionResponse(text=text, delta=x) - - return gen() - - @llm_chat_callback() - def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse: - prompt = self.messages_to_prompt(messages) - completion_response = self.complete(prompt, formatted=True, **kwargs) - return completion_response_to_chat_response(completion_response) - - @llm_chat_callback() - def stream_chat( - self, messages: Sequence[ChatMessage], **kwargs: Any - ) -> ChatResponseGen: - prompt = self.messages_to_prompt(messages) - completion_response = self.stream_complete(prompt, formatted=True, **kwargs) - return stream_completion_response_to_chat_response(completion_response) diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/utils.py b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/utils.py index 5ce06710976f2..060d7a649b5cb 100644 --- a/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/utils.py +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/utils.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2022 The HuggingFace Team. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -20,7 +19,6 @@ import copy import glob import os -import argparse import shutil import tempfile import time @@ -44,269 +42,6 @@ set_seed, ) -def setup_parser(parser): - # Arguments management - parser.add_argument("--device", "-d", type=str, choices=["hpu"], help="Device to run", default="hpu") - parser.add_argument( - "--model_name_or_path", - default=None, - type=str, - #required=True, - help="Path to pre-trained model (on the HF Hub or locally).", - ) - parser.add_argument( - "--bf16", - default=True, - action="store_true", - help="Whether to perform generation in bf16 precision.", - ) - parser.add_argument("--max_new_tokens", type=int, default=100, help="Number of tokens to generate.") - parser.add_argument( - "--max_input_tokens", - type=int, - default=0, - help="If > 0 then pad and truncate the input sequences to this specified length of tokens. \ - if == 0, then truncate to 16 (original default) \ - if < 0, then do not truncate, use full input prompt", - ) - parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.") - parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.") - parser.add_argument("--n_iterations", type=int, default=5, help="Number of inference iterations for benchmarking.") - parser.add_argument("--local_rank", type=int, default=0, metavar="N", help="Local process rank.") - parser.add_argument( - "--use_kv_cache", - default=True, - action="store_true", - help="Whether to use the key/value cache for decoding. It should speed up generation.", - ) - parser.add_argument( - "--use_hpu_graphs", - default=True, - action="store_true", - help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.", - ) - parser.add_argument( - "--dataset_name", - default=None, - type=str, - help="Optional argument if you want to assess your model on a given dataset of the HF Hub.", - ) - parser.add_argument( - "--column_name", - default=None, - type=str, - help="If `--dataset_name` was given, this will be the name of the column to use as prompts for generation.", - ) - parser.add_argument( - "--do_sample", - action="store_true", - help="Whether to use sampling for generation.", - ) - parser.add_argument( - "--num_beams", - default=1, - type=int, - help="Number of beams used for beam search generation. 1 means greedy search will be performed.", - ) - parser.add_argument( - "--trim_logits", - action="store_true", - help="Calculate logits only for the last token to save memory in the first step.", - ) - parser.add_argument( - "--seed", - default=27, - type=int, - help="Seed to use for random generation. Useful to reproduce your runs with `--do_sample`.", - ) - parser.add_argument( - "--profiling_warmup_steps", - default=0, - type=int, - help="Number of steps to ignore for profiling.", - ) - parser.add_argument( - "--profiling_steps", - default=0, - type=int, - help="Number of steps to capture for profiling.", - ) - parser.add_argument( - "--profiling_record_shapes", - default=False, - type=bool, - help="Record shapes when enabling profiling.", - ) - parser.add_argument( - "--prompt", - default=None, - type=str, - nargs="*", - help='Optional argument to give a prompt of your choice as input. 
Can be a single string (eg: --prompt "Hello world"), or a list of space-separated strings (eg: --prompt "Hello world" "How are you?")', - ) - parser.add_argument( - "--bad_words", - default=None, - type=str, - nargs="+", - help="Optional argument list of words that are not allowed to be generated.", - ) - parser.add_argument( - "--force_words", - default=None, - type=str, - nargs="+", - help="Optional argument list of words that must be generated.", - ) - parser.add_argument( - "--assistant_model", - default=None, - type=str, - help="Optional argument to give a path to a draft/assistant model for assisted decoding.", - ) - parser.add_argument( - "--peft_model", - default=None, - type=str, - help="Optional argument to give a path to a PEFT model.", - ) - parser.add_argument("--num_return_sequences", type=int, default=1) - parser.add_argument( - "--token", - default=None, - type=str, - help="The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " - "generated when running `huggingface-cli login` (stored in `~/.huggingface`).", - ) - parser.add_argument( - "--model_revision", - default="main", - type=str, - help="The specific model version to use (can be a branch name, tag name or commit id).", - ) - parser.add_argument( - "--attn_softmax_bf16", - action="store_true", - help="Whether to run attention softmax layer in lower precision provided that the model supports it and " - "is also running in lower precision.", - ) - parser.add_argument( - "--output_dir", - default=None, - type=str, - help="Output directory to store results in.", - ) - parser.add_argument( - "--bucket_size", - default=-1, - type=int, - help="Bucket size to maintain static shapes. If this number is negative (default is -1) \ - then we use `shape = prompt_length + max_new_tokens`. If a positive number is passed \ - we increase the bucket in steps of `bucket_size` instead of allocating to max (`prompt_length + max_new_tokens`).", - ) - parser.add_argument( - "--bucket_internal", - action="store_true", - help="Split kv sequence into buckets in decode phase. It improves throughput when max_new_tokens is large.", - ) - parser.add_argument( - "--dataset_max_samples", - default=-1, - type=int, - help="If a negative number is passed (default = -1) perform inference on the whole dataset, else use only `dataset_max_samples` samples.", - ) - parser.add_argument( - "--limit_hpu_graphs", - action="store_true", - help="Skip HPU Graph usage for first token to save memory", - ) - parser.add_argument( - "--reuse_cache", - action="store_true", - help="Whether to reuse key/value cache for decoding. It should save memory.", - ) - parser.add_argument("--verbose_workers", action="store_true", help="Enable output from non-master workers") - parser.add_argument( - "--simulate_dyn_prompt", - default=None, - type=int, - nargs="*", - help="If empty, static prompt is used. If a comma separated list of integers is passed, we warmup and use those shapes for prompt length.", - ) - parser.add_argument( - "--reduce_recompile", - action="store_true", - help="Preprocess on cpu, and some other optimizations. 
Useful to prevent recompilations when using dynamic prompts (simulate_dyn_prompt)", - ) - - parser.add_argument( - "--use_flash_attention", - action="store_true", - help="Whether to enable Habana Flash Attention, provided that the model supports it.", - ) - parser.add_argument( - "--flash_attention_recompute", - action="store_true", - help="Whether to enable Habana Flash Attention in recompute mode on first token generation. This gives an opportunity of splitting graph internally which helps reduce memory consumption.", - ) - parser.add_argument( - "--flash_attention_causal_mask", - action="store_true", - help="Whether to enable Habana Flash Attention in causal mode on first token generation.", - ) - parser.add_argument( - "--flash_attention_fast_softmax", - action="store_true", - help="Whether to enable Habana Flash Attention in fast softmax mode.", - ) - parser.add_argument( - "--book_source", - action="store_true", - help="Whether to use project Guttenberg books data as input. Usefull for testing large sequence lenghts.", - ) - parser.add_argument( - "--torch_compile", - action="store_true", - help="Whether to use torch compiled model or not.", - ) - parser.add_argument( - "--ignore_eos", - default=True, - action=argparse.BooleanOptionalAction, - help="Whether to ignore eos, set False to disable it", - ) - parser.add_argument("--temperature", default=1.0, type=float, help="Temperature value for text generation") - parser.add_argument("--top_p", default=1.0, type=float, help="Top_p value for generating text via sampling") - parser.add_argument( - "--const_serialization_path", - "--csp", - type=str, - help="Path to serialize const params. Const params will be held on disk memory instead of being allocated on host memory.", - ) - parser.add_argument( - "--disk_offload", - action="store_true", - help="Whether to enable device map auto. In case no space left on cpu, weights will be offloaded to disk.", - ) - parser.add_argument( - "--trust_remote_code", - action="store_true", - help="Whether or not to allow for custom models defined on the Hub in their own modeling files.", - ) - args = parser.parse_args() - - if args.torch_compile: - args.use_hpu_graphs = False - - if not args.use_hpu_graphs: - args.limit_hpu_graphs = False - - args.quant_config = os.getenv("QUANT_CONFIG", "") - if args.quant_config == "" and args.disk_offload: - logger.warning( - "`--disk_offload` was tested only with fp8, it may not work with full precision. If error raises try to remove the --disk_offload flag." 
- ) - return args def adjust_batch(batch, size): curr_size = batch["input_ids"].shape[1] @@ -317,7 +52,7 @@ def adjust_batch(batch, size): } else: adjusted_batch = {} - for k in batch.keys(): + for k in batch: last_colm = batch[k][:, -1] expanded = last_colm.tile((size - curr_size, 1)).T adjusted_batch[k] = torch.concat([batch[k], expanded], 1) @@ -388,7 +123,7 @@ def setup_const_serialization(const_serialization_path): os.makedirs(const_serialization_path) from habana_frameworks.torch.hpu import enable_const_section_serialization - print("Serializing const params to {}".format(const_serialization_path)) + print(f"Serializing const params to {const_serialization_path}") enable_const_section_serialization(const_serialization_path, True) @@ -407,7 +142,12 @@ def setup_env(args): os.environ.setdefault("PT_HPU_LAZY_ACC_PAR_MODE", "0") os.environ.setdefault("PT_HPU_ENABLE_LAZY_COLLECTIVES", "true") - if args.use_hpu_graphs and args.limit_hpu_graphs and not args.reuse_cache and args.bucket_internal: + if ( + args.use_hpu_graphs + and args.limit_hpu_graphs + and not args.reuse_cache + and args.bucket_internal + ): # Based upon above conditions and below env variable, # we can call HPU graphs clear_inputs(). os.environ.setdefault("PT_HPUGRAPH_DISABLE_TENSOR_CACHE", "1") @@ -431,7 +171,9 @@ def setup_device(args): def patch_scoped_linear_all_reduce(model): from deepspeed.module_inject.layers import LinearAllreduce - from optimum.habana.transformers.models.modeling_all_models import ScopedLinearAllReduce + from optimum.habana.transformers.models.modeling_all_models import ( + ScopedLinearAllReduce, + ) for name, module in model.named_children(): if type(module) is LinearAllreduce: @@ -441,7 +183,9 @@ def patch_scoped_linear_all_reduce(model): def get_torch_compiled_model(model): - model.model = torch.compile(model.model, backend="hpu_backend", options={"keep_input_mutations": True}) + model.model = torch.compile( + model.model, backend="hpu_backend", options={"keep_input_mutations": True} + ) return model @@ -458,7 +202,9 @@ def setup_model(args, model_dtype, model_kwargs, logger): with init_empty_weights(): model = AutoModelForCausalLM.from_config(config) max_memory = {"cpu": "10GiB"} - device_map = infer_auto_device_map(model, max_memory=max_memory, dtype=model_dtype) + device_map = infer_auto_device_map( + model, max_memory=max_memory, dtype=model_dtype + ) model = AutoModelForCausalLM.from_pretrained( args.model_name_or_path, device_map=device_map, @@ -494,7 +240,10 @@ def setup_model(args, model_dtype, model_kwargs, logger): from optimum.habana.transformers.trainer import _is_peft_model - if check_habana_frameworks_version("1.13.0") and model.config.model_type == "falcon": + if ( + check_habana_frameworks_version("1.13.0") + and model.config.model_type == "falcon" + ): model = wrap_in_hpu_graph(model, hash_with_views=False) else: model = wrap_in_hpu_graph(model) @@ -515,7 +264,9 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger): logger.info("DeepSpeed is enabled.") deepspeed.init_distributed(dist_backend="hccl") - config = AutoConfig.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs) + config = AutoConfig.from_pretrained( + args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs + ) load_to_meta = model_on_meta(config) if args.assistant_model is None: @@ -537,11 +288,15 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger): if args.local_rank == 0: if Path(merged_model_dir).is_dir(): shutil.rmtree(merged_model_dir) 
- peft_model(args, model_dtype, logger, **model_kwargs).save_pretrained(merged_model_dir) + peft_model(args, model_dtype, logger, **model_kwargs).save_pretrained( + merged_model_dir + ) torch.distributed.barrier() write_checkpoints_json( - merged_model_dir if args.peft_model is not None else args.model_name_or_path, + merged_model_dir + if args.peft_model is not None + else args.model_name_or_path, args.local_rank, checkpoints_json, token=args.token, @@ -593,7 +348,9 @@ def peft_model(args, model_dtype, logger, **model_kwargs): import importlib.util if importlib.util.find_spec("peft") is None: - raise ImportError("The `peft` package is not installed, please run: `pip install peft`.") + raise ImportError( + "The `peft` package is not installed, please run: `pip install peft`." + ) from peft import AutoPeftModelForCausalLM from peft.config import PeftConfigMixin @@ -614,7 +371,9 @@ def peft_model(args, model_dtype, logger, **model_kwargs): base_model_is_remote = False if base_model_is_local or base_model_is_remote: - model = AutoPeftModelForCausalLM.from_pretrained(args.peft_model, torch_dtype=model_dtype, **model_kwargs) + model = AutoPeftModelForCausalLM.from_pretrained( + args.peft_model, torch_dtype=model_dtype, **model_kwargs + ) else: # Since the base model doesn't exist locally nor remotely, use `args.model_name_or_path` as the base model logger.warning( @@ -624,18 +383,27 @@ def peft_model(args, model_dtype, logger, **model_kwargs): ) from peft import PeftModel - model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs) - model = PeftModel.from_pretrained(model, args.peft_model, torch_dtype=model_dtype, **model_kwargs) + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs + ) + model = PeftModel.from_pretrained( + model, args.peft_model, torch_dtype=model_dtype, **model_kwargs + ) if hasattr(model, "merge_and_unload"): model = model.merge_and_unload() if model_dtype == torch.bfloat16: model = model.to(torch.bfloat16) return model else: - from optimum.habana.peft.peft_model import gaudi_generate, gaudi_prepare_inputs_for_generation + from optimum.habana.peft.peft_model import ( + gaudi_generate, + gaudi_prepare_inputs_for_generation, + ) model.__class__.generate = gaudi_generate - model.__class__.prepare_inputs_for_generation = gaudi_prepare_inputs_for_generation + model.__class__.prepare_inputs_for_generation = ( + gaudi_prepare_inputs_for_generation + ) return model @@ -647,7 +415,9 @@ def setup_tokenizer(args, model, assistant_model): } if args.bad_words is not None or args.force_words is not None: tokenizer_kwargs["add_prefix_space"] = True - tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, **tokenizer_kwargs) + tokenizer = AutoTokenizer.from_pretrained( + args.model_name_or_path, **tokenizer_kwargs + ) if not model.config.is_encoder_decoder: tokenizer.padding_side = "left" @@ -669,7 +439,9 @@ def setup_tokenizer(args, model, assistant_model): if model.config.model_type == "persimmon": model.generation_config.pad_token_id = model.generation_config.eos_token_id if assistant_model is not None: - assistant_model.generation_config.pad_token_id = assistant_model.generation_config.eos_token_id + assistant_model.generation_config.pad_token_id = ( + assistant_model.generation_config.eos_token_id + ) tokenizer.bos_token_id = model.generation_config.bos_token_id tokenizer.eos_token_id = model.generation_config.eos_token_id tokenizer.pad_token_id = 
model.generation_config.pad_token_id @@ -682,7 +454,9 @@ def setup_tokenizer(args, model, assistant_model): tokenizer.pad_token = tokenizer.eos_token model.generation_config.pad_token_id = model.generation_config.eos_token_id if assistant_model is not None: - assistant_model.generation_config.pad_token_id = assistant_model.generation_config.eos_token_id + assistant_model.generation_config.pad_token_id = ( + assistant_model.generation_config.eos_token_id + ) return tokenizer, model, assistant_model @@ -691,9 +465,15 @@ def setup_generation_config(args, model, assistant_model, tokenizer): bad_words_ids = None force_words_ids = None if args.bad_words is not None: - bad_words_ids = [tokenizer.encode(bad_word, add_special_tokens=False) for bad_word in args.bad_words] + bad_words_ids = [ + tokenizer.encode(bad_word, add_special_tokens=False) + for bad_word in args.bad_words + ] if args.force_words is not None: - force_words_ids = [tokenizer.encode(force_word, add_special_tokens=False) for force_word in args.force_words] + force_words_ids = [ + tokenizer.encode(force_word, add_special_tokens=False) + for force_word in args.force_words + ] is_optimized = model_is_optimized(model.config) @@ -728,7 +508,10 @@ def setup_generation_config(args, model, assistant_model, tokenizer): def exclude_hpu_graph_configs(args): # Excluded configs for batch size 1 for hpu graph if args.batch_size == 1 and args.limit_hpu_graphs: - if "falcon-180B" in args.model_name_or_path or "falcon-180b" in args.model_name_or_path: + if ( + "falcon-180B" in args.model_name_or_path + or "falcon-180b" in args.model_name_or_path + ): return False if args.world_size == 2 or args.world_size == 4 or args.world_size == 8: if args.quant_config: @@ -753,7 +536,9 @@ def initialize_model(args, logger): set_seed(args.seed) get_repo_root(args.model_name_or_path, local_rank=args.local_rank, token=args.token) if args.assistant_model is not None: - get_repo_root(args.assistant_model, local_rank=args.local_rank, token=args.token) + get_repo_root( + args.assistant_model, local_rank=args.local_rank, token=args.token + ) use_deepspeed = False if use_deepspeed or args.bf16: model_dtype = torch.bfloat16 @@ -767,7 +552,9 @@ def initialize_model(args, logger): "trust_remote_code": args.trust_remote_code, } if args.trust_remote_code: - logger.warning("`trust_remote_code` is set, there is no guarantee this model works properly and it may fail") + logger.warning( + "`trust_remote_code` is set, there is no guarantee this model works properly and it may fail" + ) model, assistant_model = ( setup_model(args, model_dtype, model_kwargs, logger) @@ -783,6 +570,8 @@ def initialize_model(args, logger): model = setup_inference(args, model) init_end = time.perf_counter() logger.info(f"Args: {args}") - logger.info(f"device: {args.device}, n_hpu: {args.world_size}, bf16: {model_dtype == torch.bfloat16}") + logger.info( + f"device: {args.device}, n_hpu: {args.world_size}, bf16: {model_dtype == torch.bfloat16}" + ) logger.info(f"Model initialization took {(init_end - init_start):.3f}s") return model, assistant_model, tokenizer, generation_config diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/pyproject.toml b/llama-index-integrations/llms/llama-index-llms-gaudi/pyproject.toml index ba65f29ae2ac8..c540f328ad284 100644 --- a/llama-index-integrations/llms/llama-index-llms-gaudi/pyproject.toml +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/pyproject.toml @@ -33,11 +33,13 @@ readme = "README.md" version = "0.1.0" [tool.poetry.dependencies] 
-python = ">=3.8.1,<4.0" +python = ">=3.9,<4.0" huggingface-hub = "^0.23.0" torch = "^2.1.2" text-generation = "^0.7.0" llama-index-core = "^0.11.0" +llama-index-llms-huggingface = "^0.3.0" +optimum = {extras = ["habana"], version = ">=1.21.2"} [tool.poetry.dependencies.transformers] extras = ["torch"] @@ -67,6 +69,3 @@ version = "<=23.9.1,>=23.7.0" [tool.poetry.group.dev.dependencies.codespell] extras = ["toml"] version = ">=v2.2.6" - -[[tool.poetry.packages]] -include = "llama_index/"
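For reference, the sketch below shows one way to drive the integration added by this patch in a chat-style flow. It is a minimal, untested sketch under a few assumptions: it reuses `setup_parser` and `messages_to_prompt` from `examples/basic.py`, it relies on the `chat()` method that `GaudiLLM` inherits from `HuggingFaceLLM`, the file name `chat_sketch.py` and the `from basic import ...` import are illustrative only, and it requires an Intel Gaudi (HPU) environment with `optimum[habana]` installed.

```python
# chat_sketch.py -- hypothetical companion to examples/basic.py (not part of this patch).
# Assumes it is run from the examples/ directory so that basic.py is importable,
# and that an Intel Gaudi (HPU) runtime with optimum[habana] is available.
import argparse
import logging

from llama_index.core.llms import ChatMessage
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.gaudi import GaudiLLM

# Helpers defined in examples/basic.py by this patch.
from basic import messages_to_prompt, setup_parser

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

if __name__ == "__main__":
    # Parse the Gaudi runtime flags (device, bf16, HPU graphs, ...) exactly as
    # basic.py does, then pin the checkpoint programmatically instead of via CLI.
    args = setup_parser(argparse.ArgumentParser(description="GaudiLLM chat sketch"))
    args.model_name_or_path = "HuggingFaceH4/zephyr-7b-alpha"

    llm = GaudiLLM(
        args=args,
        logger=logger,
        model_name=args.model_name_or_path,
        tokenizer_name=args.model_name_or_path,
        query_wrapper_prompt=PromptTemplate(
            "<|system|>\n\n<|user|>\n{query_str}\n<|assistant|>\n"
        ),
        context_window=3900,
        max_new_tokens=256,
        generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
        messages_to_prompt=messages_to_prompt,
        device_map="auto",
    )

    # chat() comes from HuggingFaceLLM: for a non-chat model it renders the
    # messages with messages_to_prompt and then calls complete().
    messages = [
        ChatMessage(role="system", content="You are a concise assistant."),
        ChatMessage(role="user", content="Is the ocean blue?"),
    ]
    print(llm.chat(messages).message.content)
```

Because `GaudiLLM` now subclasses `HuggingFaceLLM`, the inherited streaming entry points (`stream_complete`, `stream_chat`) should also be usable on the same object without any Gaudi-specific changes.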