From 850ce8b86462dbf90ecba1aa810eb3b1b150d4b7 Mon Sep 17 00:00:00 2001 From: jeanyu-habana Date: Mon, 30 Sep 2024 17:15:45 -0500 Subject: [PATCH 1/2] Add integration with Intel Gaudi in llama-index-llms-gaudi --- .../llms/llama-index-llms-gaudi/.gitignore | 153 ++++ .../llms/llama-index-llms-gaudi/BUILD | 3 + .../llms/llama-index-llms-gaudi/Makefile | 17 + .../llms/llama-index-llms-gaudi/README.md | 18 + .../llama-index-llms-gaudi/examples/BUILD | 1 + .../llama-index-llms-gaudi/examples/README.md | 30 + .../llama-index-llms-gaudi/examples/basic.py | 78 ++ .../llama_index/llms/gaudi/BUILD | 1 + .../llama_index/llms/gaudi/__init__.py | 4 + .../llama_index/llms/gaudi/base.py | 385 +++++++++ .../llama_index/llms/gaudi/utils.py | 788 ++++++++++++++++++ .../llama-index-llms-gaudi/pyproject.toml | 72 ++ 12 files changed, 1550 insertions(+) create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/.gitignore create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/BUILD create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/Makefile create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/README.md create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/examples/BUILD create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/examples/README.md create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/examples/basic.py create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/BUILD create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/__init__.py create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/base.py create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/utils.py create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/pyproject.toml diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/.gitignore b/llama-index-integrations/llms/llama-index-llms-gaudi/.gitignore new file mode 100644 index 0000000000000..990c18de22908 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/.gitignore @@ -0,0 +1,153 @@ +llama_index/_static +.DS_Store +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +bin/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +etc/ +include/ +lib/ +lib64/ +parts/ +sdist/ +share/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +.ruff_cache + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pyvenv.cfg + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Jetbrains +.idea +modules/ +*.swp + +# VsCode +.vscode + +# pipenv +Pipfile +Pipfile.lock + +# pyright +pyrightconfig.json diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/BUILD b/llama-index-integrations/llms/llama-index-llms-gaudi/BUILD new file mode 100644 index 0000000000000..0896ca890d8bf --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/BUILD @@ -0,0 +1,3 @@ +poetry_requirements( + name="poetry", +) diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/Makefile b/llama-index-integrations/llms/llama-index-llms-gaudi/Makefile new file mode 100644 index 0000000000000..b9eab05aa3706 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/Makefile @@ -0,0 +1,17 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' + +format: ## Run code autoformatters (black). + pre-commit install + git ls-files | xargs pre-commit run black --files + +lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy + pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files + +test: ## Run tests via pytest. + pytest tests + +watch-docs: ## Build and watch documentation. 
+	sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/
diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/README.md b/llama-index-integrations/llms/llama-index-llms-gaudi/README.md
new file mode 100644
index 0000000000000..30780ffeb6b58
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-gaudi/README.md
@@ -0,0 +1,18 @@
+# LlamaIndex Llms Integration with Intel Gaudi
+
+## Installation
+
+```bash
+pip install --upgrade-strategy eager optimum[habana]
+```
+
+## Usage
+
+```python
+from llama_index.llms.gaudi import GaudiLLM
+```
+
+## Examples
+
+- [Notebook Example](https://docs.llamaindex.ai/en/stable/examples/llm/gaudi/)
+- [More Examples](https://github.com/run-llama/llama_index/tree/main/llama-index-integrations/llms/llama-index-llms-gaudi/examples)
diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/examples/BUILD b/llama-index-integrations/llms/llama-index-llms-gaudi/examples/BUILD
new file mode 100644
index 0000000000000..db46e8d6c978c
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-gaudi/examples/BUILD
@@ -0,0 +1 @@
+python_sources()
diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/examples/README.md b/llama-index-integrations/llms/llama-index-llms-gaudi/examples/README.md
new file mode 100644
index 0000000000000..75226f0272cef
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-gaudi/examples/README.md
@@ -0,0 +1,30 @@
+# GaudiLLM Examples
+
+This folder contains examples showcasing how to use LlamaIndex with the Intel Gaudi LLM integration, `llama_index.llms.gaudi.GaudiLLM`.
+
+## Installation
+
+### On Intel Gaudi
+
+Install `llama-index-llms-gaudi`, then install Optimum Habana to provide the Intel Gaudi dependencies:
+
+```bash
+pip install --upgrade-strategy eager optimum[habana]
+```
+
+## List of Examples
+
+### Basic Example
+
+The example [basic.py](./basic.py) shows how to run `GaudiLLM` on Intel Gaudi and perform tasks such as text completion. Run the example as follows:
+
+```bash
+python basic.py
+```
+
+> Please note that this example uses the [HuggingFaceH4/zephyr-7b-alpha](https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha) model for demonstration. It requires the `transformers` and `tokenizers` packages.
+> +> ```bash +> pip install -U transformers tokenizers +> ``` + diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/examples/basic.py b/llama-index-integrations/llms/llama-index-llms-gaudi/examples/basic.py new file mode 100644 index 0000000000000..6f938533c9643 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/examples/basic.py @@ -0,0 +1,78 @@ +# Transform a string into input zephyr-specific input +def completion_to_prompt(completion): + return f"<|system|>\n\n<|user|>\n{completion}\n<|assistant|>\n" + + +# Transform a list of chat messages into zephyr-specific input +def messages_to_prompt(messages): + prompt = "" + for message in messages: + if message.role == "system": + prompt += f"<|system|>\n{message.content}\n" + elif message.role == "user": + prompt += f"<|user|>\n{message.content}\n" + elif message.role == "assistant": + prompt += f"<|assistant|>\n{message.content}\n" + + # ensure we start with a system prompt, insert blank if needed + if not prompt.startswith("<|system|>\n"): + prompt = "<|system|>\n\n" + prompt + + # add final assistant prompt + prompt = prompt + "<|assistant|>\n" + + return prompt + + +import logging +import argparse +from llama_index.llms.gaudi import GaudiLLM +from llama_index.core.prompts import PromptTemplate +from llama_index.llms.gaudi.utils import ( + setup_parser, +) +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, +) +logger = logging.getLogger(__name__) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="GaudiLLM Basic Usage Example") + args = setup_parser(parser) + args.model_name_or_path = "HuggingFaceH4/zephyr-7b-alpha" + + llm = GaudiLLM( + args=args, + logger=logger, + model_name="HuggingFaceH4/zephyr-7b-alpha", + tokenizer_name="HuggingFaceH4/zephyr-7b-alpha", + query_wrapper_prompt=PromptTemplate("<|system|>\n\n<|user|>\n{query_str}\n<|assistant|>\n"), + context_window=3900, + max_new_tokens=256, + generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95}, + messages_to_prompt=messages_to_prompt, + device_map="auto", + ) + + query = "Is the ocean blue?" 
+ print("\n----------------- Complete ------------------") + completion_response = llm.complete(query) + print(completion_response.text) + print("\n----------------- Stream Complete ------------------") + response_iter = llm.stream_complete(query) + for response in response_iter: + print(response.delta, end="", flush=True) + print("\n----------------- Chat ------------------") + from llama_index.core.llms import ChatMessage + + message = ChatMessage(role="user", content=query) + resp = llm.chat([message]) + print(resp) + print("\n----------------- Stream Chat ------------------") + message = ChatMessage(role="user", content=query) + resp = llm.stream_chat([message], max_tokens=256) + for r in resp: + print(r.delta, end="") diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/BUILD b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/BUILD new file mode 100644 index 0000000000000..db46e8d6c978c --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/__init__.py b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/__init__.py new file mode 100644 index 0000000000000..5ef1883df2fb4 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/__init__.py @@ -0,0 +1,4 @@ +from llama_index.llms.gaudi.base import GaudiLLM + + +__all__ = ["GaudiLLM"] diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/base.py b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/base.py new file mode 100644 index 0000000000000..dfb6e9231d162 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/base.py @@ -0,0 +1,385 @@ +# This file is adapted from +# https://github.com/run-llama/llama_index/blob/main/llama-index-integrations/ +# llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import logging
+from typing import Any, Callable, List, Optional, Sequence, Union
+
+import torch
+from llama_index.core.base.llms.types import (
+    ChatMessage,
+    ChatResponse,
+    ChatResponseGen,
+    CompletionResponse,
+    CompletionResponseGen,
+    LLMMetadata,
+)
+from llama_index.core.bridge.pydantic import Field, PrivateAttr
+from llama_index.core.callbacks import CallbackManager
+from llama_index.core.constants import (
+    DEFAULT_CONTEXT_WINDOW,
+    DEFAULT_NUM_OUTPUTS,
+)
+from llama_index.core.llms.callbacks import (
+    llm_chat_callback,
+    llm_completion_callback,
+)
+from llama_index.core.llms.custom import CustomLLM
+
+from llama_index.core.base.llms.generic_utils import (
+    completion_response_to_chat_response,
+    stream_completion_response_to_chat_response,
+    messages_to_prompt as generic_messages_to_prompt,
+)
+from llama_index.core.prompts.base import PromptTemplate
+from llama_index.core.types import BaseOutputParser, PydanticProgramMode, Thread
+from transformers import (
+    StoppingCriteria,
+    StoppingCriteriaList,
+)
+from transformers import AutoTokenizer, LlamaTokenizer
+# Gaudi-specific model/tokenizer initialization helper
+from llama_index.llms.gaudi.utils import initialize_model
+from llama_index.llms.huggingface import HuggingFaceLLM
+
+# Default model; override via `model_name`/`tokenizer_name` when constructing GaudiLLM.
+DEFAULT_HUGGINGFACE_MODEL = "meta-llama/Llama-2-7b-chat-hf"
+
+logger = logging.getLogger(__name__)
+
+
+class GaudiLLM(CustomLLM):
+    r"""GaudiLLM.
+
+    Example:
+        .. code-block:: python
+
+            from llama_index.llms.gaudi import GaudiLLM
+            llm = GaudiLLM(args=args, logger=logger, model_name="HuggingFaceH4/zephyr-7b-alpha")
+    """
+
+    model_name: str = Field(
+        default=DEFAULT_HUGGINGFACE_MODEL,
+        description=(
+            "The model name to use from HuggingFace. "
+            "Unused if `model` is passed in directly."
+        ),
+    )
+    context_window: int = Field(
+        default=DEFAULT_CONTEXT_WINDOW,
+        description="The maximum number of tokens available for input.",
+        gt=0,
+    )
+    max_new_tokens: int = Field(
+        default=DEFAULT_NUM_OUTPUTS,
+        description="The maximum number of tokens to generate.",
+        gt=0,
+    )
+    query_wrapper_prompt: PromptTemplate = Field(
+        default=PromptTemplate("{query_str}"),
+        description=(
+            "The query wrapper prompt, containing the query placeholder. "
+            "The model card on HuggingFace should specify if this is needed. "
+            "Should contain a `{query_str}` placeholder."
+        ),
+    )
+    tokenizer_name: str = Field(
+        default=DEFAULT_HUGGINGFACE_MODEL,
+        description=(
+            "The name of the tokenizer to use from HuggingFace. "
+            "Unused if `tokenizer` is passed in directly."
+        ),
+    )
+    device_map: str = Field(
+        default="cpu", description="The device_map to use. Defaults to 'cpu'."
+    )
+    stopping_ids: List[int] = Field(
+        default_factory=list,
+        description=(
+            "The stopping ids to use. "
+            "Generation stops when these token IDs are predicted."
+        ),
+    )
+    tokenizer_outputs_to_remove: list = Field(
+        default_factory=list,
+        description=(
+            "The outputs to remove from the tokenizer. "
+            "Sometimes huggingface tokenizers return extra inputs that cause errors."
+        ),
+    )
+    tokenizer_kwargs: dict = Field(
+        default_factory=dict, description="The kwargs to pass to the tokenizer."
+ ) + model_kwargs: dict = Field( + default_factory=dict, + description="The kwargs to pass to the model during initialization.", + ) + generate_kwargs: dict = Field( + default_factory=dict, + description="The kwargs to pass to the model during generation.", + ) + is_chat_model: bool = Field( + default=False, + description=" Be sure to verify that you either pass an appropriate tokenizer " + "that can convert prompts to properly formatted chat messages or a " + "`messages_to_prompt` that does so.", + ) + + _model: Any = PrivateAttr() + _tokenizer: Any = PrivateAttr() + _stopping_criteria: Any = PrivateAttr() + + def __init__( + self, args, logger, + context_window: int = DEFAULT_CONTEXT_WINDOW, + max_new_tokens: int = DEFAULT_NUM_OUTPUTS, + query_wrapper_prompt: Union[str, PromptTemplate] = "{query_str}", + tokenizer_name: str = DEFAULT_HUGGINGFACE_MODEL, + model_name: str = DEFAULT_HUGGINGFACE_MODEL, + model: Optional[Any] = None, + tokenizer: Optional[Any] = None, + device_map: Optional[str] = "auto", + stopping_ids: Optional[List[int]] = None, + tokenizer_kwargs: Optional[dict] = None, + tokenizer_outputs_to_remove: Optional[list] = None, + model_kwargs: Optional[dict] = None, + generate_kwargs: Optional[dict] = None, + is_chat_model: Optional[bool] = False, + callback_manager: Optional[CallbackManager] = None, + system_prompt: str = "", + messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None, + completion_to_prompt: Optional[Callable[[str], str]] = None, + pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT, + output_parser: Optional[BaseOutputParser] = None, + ) -> None: + """ + Construct GaudiLLM. + + Args: + context_window: The maximum number of tokens available for input. + max_new_tokens: The maximum number of tokens to generate. + tokenizer_name: The name of the tokenizer to use from HuggingFace. + Unused if `tokenizer` is passed in directly. + model_name: The model name to use from HuggingFace. + Unused if `model` is passed in directly. + model: The HuggingFace model. + tokenizer: The tokenizer. + device_map: The device_map to use. Defaults to 'auto'. + stopping_ids: The stopping ids to use. + Generation stops when these token IDs are predicted. + tokenizer_kwargs: The kwargs to pass to the tokenizer. + tokenizer_outputs_to_remove: The outputs to remove from the tokenizer. + Sometimes huggingface tokenizers return extra inputs that cause errors. + model_kwargs: The kwargs to pass to the model during initialization. + generate_kwargs: The kwargs to pass to the model during generation. + is_chat_model: Whether the model is `chat` + callback_manager: Callback manager. + messages_to_prompt: Function to convert messages to prompt. + completion_to_prompt: Function to convert messages to prompt. + pydantic_program_mode: DEFAULT. + output_parser: BaseOutputParser. + + Returns: + None. + """ + model_kwargs = model_kwargs or {} + + model, _, tokenizer, _= initialize_model(args, logger) + + # check context_window + config_dict = model.config.to_dict() + model_context_window = int( + config_dict.get("max_position_embeddings", context_window) + ) + if model_context_window and model_context_window < context_window: + logger.warning( + f"Supplied context_window {context_window} is greater " + f"than the model's max input size {model_context_window}. " + "Disable this warning by setting a lower context_window." 
+ ) + context_window = model_context_window + + + # setup stopping criteria + stopping_ids_list = stopping_ids or [] + + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + class StopOnTokens(StoppingCriteria): + def __call__( + self, + input_ids: torch.LongTensor, + scores: torch.FloatTensor, + **kwargs: Any, + ) -> bool: + for stop_id in stopping_ids_list: + if input_ids[0][-1] == stop_id: + return True + return False + + stopping_criteria = StoppingCriteriaList([StopOnTokens()]) + if isinstance(query_wrapper_prompt, str): + query_wrapper_prompt = PromptTemplate(query_wrapper_prompt) + + messages_to_prompt = messages_to_prompt or self._tokenizer_messages_to_prompt + + super().__init__( + context_window=context_window, + max_new_tokens=max_new_tokens, + query_wrapper_prompt=query_wrapper_prompt, + tokenizer_name=tokenizer_name, + model_name=model_name, + device_map=device_map, + stopping_ids=stopping_ids or [], + tokenizer_kwargs=tokenizer_kwargs or {}, + tokenizer_outputs_to_remove=tokenizer_outputs_to_remove or [], + model_kwargs=model_kwargs or {}, + generate_kwargs=generate_kwargs or {}, + is_chat_model=is_chat_model, + callback_manager=callback_manager, + system_prompt=system_prompt, + messages_to_prompt=messages_to_prompt, + completion_to_prompt=completion_to_prompt, + pydantic_program_mode=pydantic_program_mode, + output_parser=output_parser, + ) + + self._model = model + self._tokenizer = tokenizer + self._stopping_criteria = stopping_criteria + + @classmethod + def class_name(cls) -> str: + return "GaudiLLM" + + @property + def metadata(self) -> LLMMetadata: + """LLM metadata.""" + return LLMMetadata( + context_window=self.context_window, + num_output=self.max_new_tokens, + model_name=self.model_name, + is_chat_model=self.is_chat_model, + ) + + def _tokenizer_messages_to_prompt(self, messages: Sequence[ChatMessage]) -> str: + if hasattr(self._tokenizer, "apply_chat_template"): + messages_dict = [ + {"role": message.role.value, "content": message.content} + for message in messages + ] + tokens = self._tokenizer.apply_chat_template(messages_dict) + return self._tokenizer.decode(tokens) + + return generic_messages_to_prompt(messages) + + @llm_completion_callback() + def complete( + self, prompt: str, formatted: bool = False, **kwargs: Any + ) -> CompletionResponse: + """ + Complete by LLM. + + Args: + prompt: Prompt for completion. + formatted: Whether the prompt is formatted by wrapper. + kwargs: Other kwargs for complete. + + Returns: + CompletionReponse after generation. + """ + if not formatted: + prompt = self.completion_to_prompt(prompt) + input_ids = self._tokenizer(prompt, return_tensors="pt") + input_ids = input_ids.to(self._model.device) + # remove keys from the tokenizer if needed, to avoid HF errors + for key in self.tokenizer_outputs_to_remove: + if key in input_ids: + input_ids.pop(key, None) + tokens = self._model.generate( + **input_ids, + max_new_tokens=self.max_new_tokens, + stopping_criteria=self._stopping_criteria, + pad_token_id=self._tokenizer.pad_token_id, + **self.generate_kwargs, + ) + completion_tokens = tokens[0][input_ids["input_ids"].size(1) :] + completion = self._tokenizer.decode(completion_tokens, skip_special_tokens=True) + + return CompletionResponse(text=completion, raw={"model_output": tokens}) + + @llm_completion_callback() + def stream_complete( + self, prompt: str, formatted: bool = False, **kwargs: Any + ) -> CompletionResponseGen: + """ + Complete by LLM in stream. + + Args: + prompt: Prompt for completion. 
+ formatted: Whether the prompt is formatted by wrapper. + kwargs: Other kwargs for complete. + + Returns: + CompletionReponse after generation. + """ + from transformers import TextIteratorStreamer + + if not formatted: + prompt = self.completion_to_prompt(prompt) + + input_ids = self._tokenizer.encode(prompt, return_tensors="pt") + input_ids = input_ids.to(self._model.device) + + for key in self.tokenizer_outputs_to_remove: + if key in input_ids: + input_ids.pop(key, None) + + streamer = TextIteratorStreamer( + self._tokenizer, skip_prompt=True, skip_special_tokens=True + ) + generation_kwargs = dict( + input_ids=input_ids, + streamer=streamer, + max_new_tokens=self.max_new_tokens, + stopping_criteria=self._stopping_criteria, + pad_token_id=self._tokenizer.pad_token_id, + **self.generate_kwargs, + ) + thread = Thread(target=self._model.generate, kwargs=generation_kwargs) + thread.start() + + # create generator based off of streamer + def gen() -> CompletionResponseGen: + text = "" + for x in streamer: + text += x + yield CompletionResponse(text=text, delta=x) + + return gen() + + @llm_chat_callback() + def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse: + prompt = self.messages_to_prompt(messages) + completion_response = self.complete(prompt, formatted=True, **kwargs) + return completion_response_to_chat_response(completion_response) + + @llm_chat_callback() + def stream_chat( + self, messages: Sequence[ChatMessage], **kwargs: Any + ) -> ChatResponseGen: + prompt = self.messages_to_prompt(messages) + completion_response = self.stream_complete(prompt, formatted=True, **kwargs) + return stream_completion_response_to_chat_response(completion_response) diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/utils.py b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/utils.py new file mode 100644 index 0000000000000..5ce06710976f2 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/utils.py @@ -0,0 +1,788 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +############################################################################### +# Copyright (C) 2020-2021 Habana Labs, Ltd. 
an Intel Company +############################################################################### + +import copy +import glob +import os +import argparse +import shutil +import tempfile +import time +from pathlib import Path + +import torch +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer +from transformers.utils import check_min_version + +from optimum.habana.checkpoint_utils import ( + get_ds_injection_policy, + get_repo_root, + model_is_optimized, + model_on_meta, + write_checkpoints_json, +) +from optimum.habana.utils import ( + check_habana_frameworks_version, + check_optimum_habana_min_version, + get_habana_frameworks_version, + set_seed, +) + +def setup_parser(parser): + # Arguments management + parser.add_argument("--device", "-d", type=str, choices=["hpu"], help="Device to run", default="hpu") + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + #required=True, + help="Path to pre-trained model (on the HF Hub or locally).", + ) + parser.add_argument( + "--bf16", + default=True, + action="store_true", + help="Whether to perform generation in bf16 precision.", + ) + parser.add_argument("--max_new_tokens", type=int, default=100, help="Number of tokens to generate.") + parser.add_argument( + "--max_input_tokens", + type=int, + default=0, + help="If > 0 then pad and truncate the input sequences to this specified length of tokens. \ + if == 0, then truncate to 16 (original default) \ + if < 0, then do not truncate, use full input prompt", + ) + parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.") + parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.") + parser.add_argument("--n_iterations", type=int, default=5, help="Number of inference iterations for benchmarking.") + parser.add_argument("--local_rank", type=int, default=0, metavar="N", help="Local process rank.") + parser.add_argument( + "--use_kv_cache", + default=True, + action="store_true", + help="Whether to use the key/value cache for decoding. It should speed up generation.", + ) + parser.add_argument( + "--use_hpu_graphs", + default=True, + action="store_true", + help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.", + ) + parser.add_argument( + "--dataset_name", + default=None, + type=str, + help="Optional argument if you want to assess your model on a given dataset of the HF Hub.", + ) + parser.add_argument( + "--column_name", + default=None, + type=str, + help="If `--dataset_name` was given, this will be the name of the column to use as prompts for generation.", + ) + parser.add_argument( + "--do_sample", + action="store_true", + help="Whether to use sampling for generation.", + ) + parser.add_argument( + "--num_beams", + default=1, + type=int, + help="Number of beams used for beam search generation. 1 means greedy search will be performed.", + ) + parser.add_argument( + "--trim_logits", + action="store_true", + help="Calculate logits only for the last token to save memory in the first step.", + ) + parser.add_argument( + "--seed", + default=27, + type=int, + help="Seed to use for random generation. 
Useful to reproduce your runs with `--do_sample`.", + ) + parser.add_argument( + "--profiling_warmup_steps", + default=0, + type=int, + help="Number of steps to ignore for profiling.", + ) + parser.add_argument( + "--profiling_steps", + default=0, + type=int, + help="Number of steps to capture for profiling.", + ) + parser.add_argument( + "--profiling_record_shapes", + default=False, + type=bool, + help="Record shapes when enabling profiling.", + ) + parser.add_argument( + "--prompt", + default=None, + type=str, + nargs="*", + help='Optional argument to give a prompt of your choice as input. Can be a single string (eg: --prompt "Hello world"), or a list of space-separated strings (eg: --prompt "Hello world" "How are you?")', + ) + parser.add_argument( + "--bad_words", + default=None, + type=str, + nargs="+", + help="Optional argument list of words that are not allowed to be generated.", + ) + parser.add_argument( + "--force_words", + default=None, + type=str, + nargs="+", + help="Optional argument list of words that must be generated.", + ) + parser.add_argument( + "--assistant_model", + default=None, + type=str, + help="Optional argument to give a path to a draft/assistant model for assisted decoding.", + ) + parser.add_argument( + "--peft_model", + default=None, + type=str, + help="Optional argument to give a path to a PEFT model.", + ) + parser.add_argument("--num_return_sequences", type=int, default=1) + parser.add_argument( + "--token", + default=None, + type=str, + help="The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`).", + ) + parser.add_argument( + "--model_revision", + default="main", + type=str, + help="The specific model version to use (can be a branch name, tag name or commit id).", + ) + parser.add_argument( + "--attn_softmax_bf16", + action="store_true", + help="Whether to run attention softmax layer in lower precision provided that the model supports it and " + "is also running in lower precision.", + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + help="Output directory to store results in.", + ) + parser.add_argument( + "--bucket_size", + default=-1, + type=int, + help="Bucket size to maintain static shapes. If this number is negative (default is -1) \ + then we use `shape = prompt_length + max_new_tokens`. If a positive number is passed \ + we increase the bucket in steps of `bucket_size` instead of allocating to max (`prompt_length + max_new_tokens`).", + ) + parser.add_argument( + "--bucket_internal", + action="store_true", + help="Split kv sequence into buckets in decode phase. It improves throughput when max_new_tokens is large.", + ) + parser.add_argument( + "--dataset_max_samples", + default=-1, + type=int, + help="If a negative number is passed (default = -1) perform inference on the whole dataset, else use only `dataset_max_samples` samples.", + ) + parser.add_argument( + "--limit_hpu_graphs", + action="store_true", + help="Skip HPU Graph usage for first token to save memory", + ) + parser.add_argument( + "--reuse_cache", + action="store_true", + help="Whether to reuse key/value cache for decoding. It should save memory.", + ) + parser.add_argument("--verbose_workers", action="store_true", help="Enable output from non-master workers") + parser.add_argument( + "--simulate_dyn_prompt", + default=None, + type=int, + nargs="*", + help="If empty, static prompt is used. 
If a comma separated list of integers is passed, we warmup and use those shapes for prompt length.", + ) + parser.add_argument( + "--reduce_recompile", + action="store_true", + help="Preprocess on cpu, and some other optimizations. Useful to prevent recompilations when using dynamic prompts (simulate_dyn_prompt)", + ) + + parser.add_argument( + "--use_flash_attention", + action="store_true", + help="Whether to enable Habana Flash Attention, provided that the model supports it.", + ) + parser.add_argument( + "--flash_attention_recompute", + action="store_true", + help="Whether to enable Habana Flash Attention in recompute mode on first token generation. This gives an opportunity of splitting graph internally which helps reduce memory consumption.", + ) + parser.add_argument( + "--flash_attention_causal_mask", + action="store_true", + help="Whether to enable Habana Flash Attention in causal mode on first token generation.", + ) + parser.add_argument( + "--flash_attention_fast_softmax", + action="store_true", + help="Whether to enable Habana Flash Attention in fast softmax mode.", + ) + parser.add_argument( + "--book_source", + action="store_true", + help="Whether to use project Guttenberg books data as input. Usefull for testing large sequence lenghts.", + ) + parser.add_argument( + "--torch_compile", + action="store_true", + help="Whether to use torch compiled model or not.", + ) + parser.add_argument( + "--ignore_eos", + default=True, + action=argparse.BooleanOptionalAction, + help="Whether to ignore eos, set False to disable it", + ) + parser.add_argument("--temperature", default=1.0, type=float, help="Temperature value for text generation") + parser.add_argument("--top_p", default=1.0, type=float, help="Top_p value for generating text via sampling") + parser.add_argument( + "--const_serialization_path", + "--csp", + type=str, + help="Path to serialize const params. Const params will be held on disk memory instead of being allocated on host memory.", + ) + parser.add_argument( + "--disk_offload", + action="store_true", + help="Whether to enable device map auto. In case no space left on cpu, weights will be offloaded to disk.", + ) + parser.add_argument( + "--trust_remote_code", + action="store_true", + help="Whether or not to allow for custom models defined on the Hub in their own modeling files.", + ) + args = parser.parse_args() + + if args.torch_compile: + args.use_hpu_graphs = False + + if not args.use_hpu_graphs: + args.limit_hpu_graphs = False + + args.quant_config = os.getenv("QUANT_CONFIG", "") + if args.quant_config == "" and args.disk_offload: + logger.warning( + "`--disk_offload` was tested only with fp8, it may not work with full precision. If error raises try to remove the --disk_offload flag." 
+ ) + return args + +def adjust_batch(batch, size): + curr_size = batch["input_ids"].shape[1] + if curr_size >= size: + adjusted_batch = { + "input_ids": batch["input_ids"][:, :size], + "attention_mask": batch["attention_mask"][:, :size], + } + else: + adjusted_batch = {} + for k in batch.keys(): + last_colm = batch[k][:, -1] + expanded = last_colm.tile((size - curr_size, 1)).T + adjusted_batch[k] = torch.concat([batch[k], expanded], 1) + assert adjusted_batch["input_ids"].shape[1] == size + assert adjusted_batch["attention_mask"].shape[1] == size + return adjusted_batch + + +def override_print(enable): + import builtins as __builtin__ + + builtin_print = __builtin__.print + + def print(*args, **kwargs): + force = kwargs.pop("force", False) + if force or enable: + builtin_print(*args, **kwargs) + + __builtin__.print = print + + +def override_logger(logger, enable): + logger_info = logger.info + + def info(*args, **kwargs): + force = kwargs.pop("force", False) + if force or enable: + logger_info(*args, **kwargs) + + logger.info = info + + +def count_hpu_graphs(): + return len(glob.glob(".graph_dumps/*PreGraph*")) + + +def override_prints(enable, logger): + override_print(enable) + override_logger(logger, enable) + + +def setup_distributed(args): + args.local_rank = int(os.getenv("LOCAL_RANK", "0")) + args.world_size = int(os.getenv("WORLD_SIZE", "0")) + args.global_rank = int(os.getenv("RANK", "0")) + + +def setup_inference(args, model): + import habana_frameworks.torch.core as htcore + + habana_version = get_habana_frameworks_version() + + print("Initializing inference mode") + # Keeping the if-else here for back compat. TODO remove later + if habana_version.major >= 1 and habana_version.minor >= 16: + htcore.hpu_initialize(model, mark_only_scales_as_const=True) + else: + const_marking = os.getenv("ENABLE_CONST_MARKING", "True") + if const_marking == "True": + htcore.hpu_initialize(model) + return model + + +def setup_const_serialization(const_serialization_path): + import uuid + + const_serialization_path = os.path.join(const_serialization_path + uuid.uuid4().hex) + os.makedirs(const_serialization_path) + from habana_frameworks.torch.hpu import enable_const_section_serialization + + print("Serializing const params to {}".format(const_serialization_path)) + enable_const_section_serialization(const_serialization_path, True) + + +def setup_env(args): + # Will error if the minimal version of Transformers is not installed. Remove at your own risks. + check_min_version("4.34.0") + check_optimum_habana_min_version("1.9.0.dev0") + # TODO: SW-167588 - WA for memory issue in hqt prep_model + os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE") + + if args.global_rank == 0 and not args.torch_compile: + os.environ.setdefault("GRAPH_VISUALIZATION", "true") + shutil.rmtree(".graph_dumps", ignore_errors=True) + + if args.world_size > 0: + os.environ.setdefault("PT_HPU_LAZY_ACC_PAR_MODE", "0") + os.environ.setdefault("PT_HPU_ENABLE_LAZY_COLLECTIVES", "true") + + if args.use_hpu_graphs and args.limit_hpu_graphs and not args.reuse_cache and args.bucket_internal: + # Based upon above conditions and below env variable, + # we can call HPU graphs clear_inputs(). 
+ os.environ.setdefault("PT_HPUGRAPH_DISABLE_TENSOR_CACHE", "1") + + # Tweak generation so that it runs faster on Gaudi + from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi + + adapt_transformers_to_gaudi() + + +def setup_device(args): + if args.device == "hpu": + import habana_frameworks.torch.core as htcore + + if args.quant_config: + htcore.hpu_set_env() + return torch.device(args.device) + + +# patching LinearAllreduce to use ScopedLinearAllReduce +def patch_scoped_linear_all_reduce(model): + from deepspeed.module_inject.layers import LinearAllreduce + + from optimum.habana.transformers.models.modeling_all_models import ScopedLinearAllReduce + + for name, module in model.named_children(): + if type(module) is LinearAllreduce: + SL = ScopedLinearAllReduce(mod=module) + setattr(model, name, SL) + patch_scoped_linear_all_reduce(module) + + +def get_torch_compiled_model(model): + model.model = torch.compile(model.model, backend="hpu_backend", options={"keep_input_mutations": True}) + return model + + +def setup_model(args, model_dtype, model_kwargs, logger): + logger.info("Single-device run.") + if args.assistant_model is None: + assistant_model = None + else: + logger.info(f"Using asssitant model {args.assistant_model}.") + if args.disk_offload: + from accelerate import infer_auto_device_map, init_empty_weights + + config = AutoConfig.from_pretrained(args.model_name_or_path) + with init_empty_weights(): + model = AutoModelForCausalLM.from_config(config) + max_memory = {"cpu": "10GiB"} + device_map = infer_auto_device_map(model, max_memory=max_memory, dtype=model_dtype) + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, + device_map=device_map, + offload_folder="/tmp/offload_folder/", + offload_state_dict=True, + torch_dtype=model_dtype, + **model_kwargs, + ) + else: + if args.assistant_model is not None: + assistant_model = AutoModelForCausalLM.from_pretrained( + args.assistant_model, torch_dtype=model_dtype, **model_kwargs + ) + if args.peft_model is not None: + model = peft_model(args, model_dtype, logger, **model_kwargs) + else: + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs + ) + if args.quant_config: + import habana_quantization_toolkit + + habana_quantization_toolkit.prep_model(model) + if args.assistant_model is not None: + habana_quantization_toolkit.quantize_model(assistant_model) + + model = model.eval().to(args.device) + if args.assistant_model is not None: + assistant_model = assistant_model.eval().to(args.device) + + if args.use_hpu_graphs: + from habana_frameworks.torch.hpu import wrap_in_hpu_graph + + from optimum.habana.transformers.trainer import _is_peft_model + + if check_habana_frameworks_version("1.13.0") and model.config.model_type == "falcon": + model = wrap_in_hpu_graph(model, hash_with_views=False) + else: + model = wrap_in_hpu_graph(model) + if args.assistant_model is not None: + assistant_model = wrap_in_hpu_graph(assistant_model) + if _is_peft_model(model): + model.base_model = wrap_in_hpu_graph(model.base_model) + + if args.torch_compile and model.config.model_type == "llama": + model = get_torch_compiled_model(model) + # if args.assistant_model is not None: + # assistant_model = get_torch_compiled_model(assistant_model) + return model, assistant_model + + +def setup_distributed_model(args, model_dtype, model_kwargs, logger): + import deepspeed + + logger.info("DeepSpeed is enabled.") + deepspeed.init_distributed(dist_backend="hccl") + 
config = AutoConfig.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs) + load_to_meta = model_on_meta(config) + + if args.assistant_model is None: + assistant_model = None + else: + logger.info(f"Using asssitant model {args.assistant_model}.") + + if load_to_meta: + # Construct model with fake meta tensors, later will be replaced on devices during ds-inference ckpt load + with deepspeed.OnDevice(dtype=model_dtype, device="meta"): + model = AutoModelForCausalLM.from_config(config, torch_dtype=model_dtype) + + # Model loaded to meta is managed differently + checkpoints_json = tempfile.NamedTemporaryFile(suffix=".json", mode="+w") + + # For PEFT models, write the merged model on disk to be able to load it on the meta device + if args.peft_model is not None: + merged_model_dir = "/tmp/text_generation_merged_peft_model" + if args.local_rank == 0: + if Path(merged_model_dir).is_dir(): + shutil.rmtree(merged_model_dir) + peft_model(args, model_dtype, logger, **model_kwargs).save_pretrained(merged_model_dir) + torch.distributed.barrier() + + write_checkpoints_json( + merged_model_dir if args.peft_model is not None else args.model_name_or_path, + args.local_rank, + checkpoints_json, + token=args.token, + ) + else: + # TODO: revisit placement on CPU when auto-injection is possible + with deepspeed.OnDevice(dtype=model_dtype, device="cpu"): + if args.peft_model is not None: + model = peft_model(args, model_dtype, logger, **model_kwargs) + else: + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs + ) + model.eval() + + if args.assistant_model is not None: + assistant_model = AutoModelForCausalLM.from_pretrained( + args.assistant_model, torch_dtype=model_dtype, **model_kwargs + ).eval() + + # Initialize the model + ds_inference_kwargs = {"dtype": model_dtype} + ds_inference_kwargs["tensor_parallel"] = {"tp_size": args.world_size} + ds_inference_kwargs["enable_cuda_graph"] = args.use_hpu_graphs + ds_inference_kwargs["injection_policy"] = get_ds_injection_policy(config) + if load_to_meta: + ds_inference_kwargs["checkpoint"] = checkpoints_json.name + + model = deepspeed.init_inference(model, **ds_inference_kwargs) + model = model.module + if model.config.model_type in ["llama", "falcon", "qwen2"]: + patch_scoped_linear_all_reduce(model) + + if args.quant_config: + import habana_quantization_toolkit + + habana_quantization_toolkit.prep_model(model) + if args.assistant_model is not None: + habana_quantization_toolkit.prep_model(assistant_model) + + if args.torch_compile and model.config.model_type == "llama": + model = get_torch_compiled_model(model) + # if args.assistant_model is not None: + # assistant_model = get_torch_compiled_model(assistant_model) + return model, assistant_model + + +def peft_model(args, model_dtype, logger, **model_kwargs): + import importlib.util + + if importlib.util.find_spec("peft") is None: + raise ImportError("The `peft` package is not installed, please run: `pip install peft`.") + from peft import AutoPeftModelForCausalLM + from peft.config import PeftConfigMixin + + base_model_name = PeftConfigMixin.from_pretrained( + args.peft_model, + token=model_kwargs["token"] if "token" in model_kwargs else None, + ).base_model_name_or_path + + base_model_is_local = Path(base_model_name).is_dir() + if not base_model_is_local: + # Check if the base model path to a remote repository on the HF Hub exists + from huggingface_hub import list_repo_files + + try: + list_repo_files(base_model_name) + 
base_model_is_remote = True + except Exception: + base_model_is_remote = False + + if base_model_is_local or base_model_is_remote: + model = AutoPeftModelForCausalLM.from_pretrained(args.peft_model, torch_dtype=model_dtype, **model_kwargs) + else: + # Since the base model doesn't exist locally nor remotely, use `args.model_name_or_path` as the base model + logger.warning( + f"The base model `{base_model_name}` of the LoRA configuration associated" + f" to `{args.peft_model}` does not exist locally or remotely. Using " + f"`--model_name_or_path {args.model_name_or_path}` as a fall back for the base model." + ) + from peft import PeftModel + + model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs) + model = PeftModel.from_pretrained(model, args.peft_model, torch_dtype=model_dtype, **model_kwargs) + if hasattr(model, "merge_and_unload"): + model = model.merge_and_unload() + if model_dtype == torch.bfloat16: + model = model.to(torch.bfloat16) + return model + else: + from optimum.habana.peft.peft_model import gaudi_generate, gaudi_prepare_inputs_for_generation + + model.__class__.generate = gaudi_generate + model.__class__.prepare_inputs_for_generation = gaudi_prepare_inputs_for_generation + return model + + +def setup_tokenizer(args, model, assistant_model): + tokenizer_kwargs = { + "revision": args.model_revision, + "token": args.token, + "trust_remote_code": args.trust_remote_code, + } + if args.bad_words is not None or args.force_words is not None: + tokenizer_kwargs["add_prefix_space"] = True + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, **tokenizer_kwargs) + if not model.config.is_encoder_decoder: + tokenizer.padding_side = "left" + + if model.config.model_type == "llama": + # unwind broken decapoda-research config + model.generation_config.pad_token_id = 0 + model.generation_config.bos_token_id = 1 + model.generation_config.eos_token_id = 2 + if assistant_model is not None: + assistant_model.generation_config.pad_token_id = 0 + assistant_model.generation_config.bos_token_id = 1 + assistant_model.generation_config.eos_token_id = 2 + tokenizer.bos_token_id = model.generation_config.bos_token_id + tokenizer.eos_token_id = model.generation_config.eos_token_id + tokenizer.pad_token_id = model.generation_config.pad_token_id + tokenizer.pad_token = tokenizer.decode(tokenizer.pad_token_id) + tokenizer.eos_token = tokenizer.decode(tokenizer.eos_token_id) + tokenizer.bos_token = tokenizer.decode(tokenizer.bos_token_id) + if model.config.model_type == "persimmon": + model.generation_config.pad_token_id = model.generation_config.eos_token_id + if assistant_model is not None: + assistant_model.generation_config.pad_token_id = assistant_model.generation_config.eos_token_id + tokenizer.bos_token_id = model.generation_config.bos_token_id + tokenizer.eos_token_id = model.generation_config.eos_token_id + tokenizer.pad_token_id = model.generation_config.pad_token_id + tokenizer.pad_token = tokenizer.decode(tokenizer.pad_token_id) + tokenizer.eos_token = tokenizer.decode(tokenizer.eos_token_id) + tokenizer.bos_token = tokenizer.decode(tokenizer.bos_token_id) + + # Some models like GPT2 do not have a PAD token so we have to set it if necessary + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + model.generation_config.pad_token_id = model.generation_config.eos_token_id + if assistant_model is not None: + assistant_model.generation_config.pad_token_id = assistant_model.generation_config.eos_token_id + + 
return tokenizer, model, assistant_model + + +def setup_generation_config(args, model, assistant_model, tokenizer): + bad_words_ids = None + force_words_ids = None + if args.bad_words is not None: + bad_words_ids = [tokenizer.encode(bad_word, add_special_tokens=False) for bad_word in args.bad_words] + if args.force_words is not None: + force_words_ids = [tokenizer.encode(force_word, add_special_tokens=False) for force_word in args.force_words] + + is_optimized = model_is_optimized(model.config) + + # Generation configuration + generation_config = copy.deepcopy(model.generation_config) + generation_config.max_new_tokens = args.max_new_tokens + generation_config.use_cache = args.use_kv_cache + generation_config.static_shapes = is_optimized and assistant_model is None + generation_config.bucket_size = args.bucket_size if is_optimized else -1 + generation_config.bucket_internal = args.bucket_internal + generation_config.do_sample = args.do_sample + generation_config.num_beams = args.num_beams + generation_config.bad_words_ids = bad_words_ids + generation_config.force_words_ids = force_words_ids + generation_config.num_return_sequences = args.num_return_sequences + generation_config.trim_logits = args.trim_logits + generation_config.attn_softmax_bf16 = args.attn_softmax_bf16 + generation_config.limit_hpu_graphs = args.limit_hpu_graphs + generation_config.reuse_cache = args.reuse_cache + generation_config.reduce_recompile = args.reduce_recompile + if generation_config.reduce_recompile: + assert generation_config.bucket_size > 0 + generation_config.use_flash_attention = args.use_flash_attention + generation_config.flash_attention_recompute = args.flash_attention_recompute + generation_config.flash_attention_causal_mask = args.flash_attention_causal_mask + generation_config.flash_attention_fast_softmax = args.flash_attention_fast_softmax + generation_config.trust_remote_code = args.trust_remote_code + + return generation_config + + +def exclude_hpu_graph_configs(args): + # Excluded configs for batch size 1 for hpu graph + if args.batch_size == 1 and args.limit_hpu_graphs: + if "falcon-180B" in args.model_name_or_path or "falcon-180b" in args.model_name_or_path: + return False + if args.world_size == 2 or args.world_size == 4 or args.world_size == 8: + if args.quant_config: + if args.max_input_tokens >= 8192 and args.max_new_tokens >= 128: + return False + else: + if args.max_input_tokens >= 4096 and args.max_new_tokens >= 128: + return False + return True + else: + return False + + +def initialize_model(args, logger): + init_start = time.perf_counter() + setup_distributed(args) + if exclude_hpu_graph_configs(args): + args.limit_hpu_graphs = False + override_prints(args.global_rank == 0 or args.verbose_workers, logger) + setup_env(args) + setup_device(args) + set_seed(args.seed) + get_repo_root(args.model_name_or_path, local_rank=args.local_rank, token=args.token) + if args.assistant_model is not None: + get_repo_root(args.assistant_model, local_rank=args.local_rank, token=args.token) + use_deepspeed = False + if use_deepspeed or args.bf16: + model_dtype = torch.bfloat16 + else: + model_dtype = torch.float + args.attn_softmax_bf16 = False + + model_kwargs = { + "revision": args.model_revision, + "token": args.token, + "trust_remote_code": args.trust_remote_code, + } + if args.trust_remote_code: + logger.warning("`trust_remote_code` is set, there is no guarantee this model works properly and it may fail") + + model, assistant_model = ( + setup_model(args, model_dtype, model_kwargs, logger) + if not 
use_deepspeed + else setup_distributed_model(args, model_dtype, model_kwargs, logger) + ) + tokenizer, model, assistant_model = setup_tokenizer(args, model, assistant_model) + generation_config = setup_generation_config(args, model, assistant_model, tokenizer) + + if args.const_serialization_path: + setup_const_serialization(args.const_serialization_path) + if args.quant_config: + model = setup_inference(args, model) + init_end = time.perf_counter() + logger.info(f"Args: {args}") + logger.info(f"device: {args.device}, n_hpu: {args.world_size}, bf16: {model_dtype == torch.bfloat16}") + logger.info(f"Model initialization took {(init_end - init_start):.3f}s") + return model, assistant_model, tokenizer, generation_config diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/pyproject.toml b/llama-index-integrations/llms/llama-index-llms-gaudi/pyproject.toml new file mode 100644 index 0000000000000..ba65f29ae2ac8 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/pyproject.toml @@ -0,0 +1,72 @@ +[build-system] +build-backend = "poetry.core.masonry.api" +requires = ["poetry-core"] + +[tool.codespell] +check-filenames = true +check-hidden = true +# Feel free to un-skip examples, and experimental, you will just need to +# work through many typos (--write-changes and --interactive will help) +skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" + +[tool.llamahub] +contains_example = false +import_path = "llama_index.llms.gaudi" + +[tool.llamahub.class_authors] +GaudiLLM = "llama-index" + +[tool.mypy] +disallow_untyped_defs = true +# Remove venv skip when integrated with pre-commit +exclude = ["_static", "build", "examples", "notebooks", "venv"] +ignore_missing_imports = true +python_version = "3.10" + +[tool.poetry] +authors = ["Your Name "] +description = "llama-index llms gaudi integration" +license = "MIT" +name = "llama-index-llms-gaudi" +packages = [{include = "llama_index/"}] +readme = "README.md" +version = "0.1.0" + +[tool.poetry.dependencies] +python = ">=3.8.1,<4.0" +huggingface-hub = "^0.23.0" +torch = "^2.1.2" +text-generation = "^0.7.0" +llama-index-core = "^0.11.0" + +[tool.poetry.dependencies.transformers] +extras = ["torch"] +version = "^4.37.0" + +[tool.poetry.group.dev.dependencies] +ipython = "8.10.0" +jupyter = "^1.0.0" +mypy = "0.991" +pre-commit = "3.2.0" +pylint = "2.15.10" +pytest = "7.2.1" +pytest-mock = "3.11.1" +ruff = "0.0.292" +tree-sitter-languages = "^1.8.0" +types-Deprecated = ">=0.1.0" +types-PyYAML = "^6.0.12.12" +types-protobuf = "^4.24.0.4" +types-redis = "4.5.5.0" +types-requests = "2.28.11.8" +types-setuptools = "67.1.0.0" + +[tool.poetry.group.dev.dependencies.black] +extras = ["jupyter"] +version = "<=23.9.1,>=23.7.0" + +[tool.poetry.group.dev.dependencies.codespell] +extras = ["toml"] +version = ">=v2.2.6" + +[[tool.poetry.packages]] +include = "llama_index/" From 3d51c673b22b8cdc73b03e6a8da5a1f889c47ccb Mon Sep 17 00:00:00 2001 From: jeanyu-habana Date: Sat, 5 Oct 2024 12:05:33 -0500 Subject: [PATCH 2/2] resolve review comments and lint compaints --- .../llms/llama-index-llms-gaudi/README.md | 39 +- .../llama-index-llms-gaudi/examples/README.md | 1 - .../llama-index-llms-gaudi/examples/basic.py | 327 ++++++++- .../llama_index/llms/gaudi/base.py | 670 ++++++++++-------- .../llama_index/llms/gaudi/utils.py | 369 +++------- .../llama-index-llms-gaudi/pyproject.toml | 7 +- 6 files changed, 799 insertions(+), 614 deletions(-) diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/README.md 
b/llama-index-integrations/llms/llama-index-llms-gaudi/README.md index 30780ffeb6b58..07ff53ba5d7e6 100644 --- a/llama-index-integrations/llms/llama-index-llms-gaudi/README.md +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/README.md @@ -4,15 +4,52 @@ ```bash pip install --upgrade-strategy eager optimum[habana] +pip install llama-index-llms-gaudi +pip install llama-index-llms-huggingface ``` ## Usage ```python +import argparse +import os, logging from llama_index.llms.gaudi import GaudiLLM + + +def setup_parser(parser): + parser.add_argument(...) + args = parser.parse_args() + return args + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="GaudiLLM Basic Usage Example" + ) + args = setup_parser(parser) + args.model_name_or_path = "HuggingFaceH4/zephyr-7b-alpha" + + llm = GaudiLLM( + args=args, + logger=logger, + model_name="HuggingFaceH4/zephyr-7b-alpha", + tokenizer_name="HuggingFaceH4/zephyr-7b-alpha", + query_wrapper_prompt=PromptTemplate( + "<|system|>\n\n<|user|>\n{query_str}\n<|assistant|>\n" + ), + context_window=3900, + max_new_tokens=256, + generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95}, + messages_to_prompt=messages_to_prompt, + device_map="auto", + ) + + query = "Is the ocean blue?" + print("\n----------------- Complete ------------------") + completion_response = llm.complete(query) + print(completion_response.text) ``` ## Examples -- [Notebook Example](https://docs.llamaindex.ai/en/stable/examples/llm/gaudi/) - [More Examples](https://github.com/run-llama/llama_index/tree/main/llama-index-integrations/llms/llama-index-llms-gaudi/examples) diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/examples/README.md b/llama-index-integrations/llms/llama-index-llms-gaudi/examples/README.md index 75226f0272cef..a9bdec0912010 100644 --- a/llama-index-integrations/llms/llama-index-llms-gaudi/examples/README.md +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/examples/README.md @@ -27,4 +27,3 @@ python basic.py > ```bash > pip install -U transformers tokenizers > ``` - diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/examples/basic.py b/llama-index-integrations/llms/llama-index-llms-gaudi/examples/basic.py index 6f938533c9643..c2ec27582cf2f 100644 --- a/llama-index-integrations/llms/llama-index-llms-gaudi/examples/basic.py +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/examples/basic.py @@ -1,3 +1,311 @@ +import os, logging +import argparse +from llama_index.llms.gaudi import GaudiLLM +from llama_index.core.prompts import PromptTemplate + +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, +) +logger = logging.getLogger(__name__) + + +def setup_parser(parser): + # Arguments management + parser.add_argument( + "--device", "-d", type=str, choices=["hpu"], help="Device to run", default="hpu" + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + # required=True, + help="Path to pre-trained model (on the HF Hub or locally).", + ) + parser.add_argument( + "--bf16", + default=True, + action="store_true", + help="Whether to perform generation in bf16 precision.", + ) + parser.add_argument( + "--max_new_tokens", type=int, default=100, help="Number of tokens to generate." + ) + parser.add_argument( + "--max_input_tokens", + type=int, + default=0, + help="If > 0 then pad and truncate the input sequences to this specified length of tokens. 
\ + if == 0, then truncate to 16 (original default) \ + if < 0, then do not truncate, use full input prompt", + ) + parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.") + parser.add_argument( + "--warmup", + type=int, + default=3, + help="Number of warmup iterations for benchmarking.", + ) + parser.add_argument( + "--n_iterations", + type=int, + default=5, + help="Number of inference iterations for benchmarking.", + ) + parser.add_argument( + "--local_rank", type=int, default=0, metavar="N", help="Local process rank." + ) + parser.add_argument( + "--use_kv_cache", + default=True, + action="store_true", + help="Whether to use the key/value cache for decoding. It should speed up generation.", + ) + parser.add_argument( + "--use_hpu_graphs", + default=True, + action="store_true", + help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.", + ) + parser.add_argument( + "--dataset_name", + default=None, + type=str, + help="Optional argument if you want to assess your model on a given dataset of the HF Hub.", + ) + parser.add_argument( + "--column_name", + default=None, + type=str, + help="If `--dataset_name` was given, this will be the name of the column to use as prompts for generation.", + ) + parser.add_argument( + "--do_sample", + action="store_true", + help="Whether to use sampling for generation.", + ) + parser.add_argument( + "--num_beams", + default=1, + type=int, + help="Number of beams used for beam search generation. 1 means greedy search will be performed.", + ) + parser.add_argument( + "--trim_logits", + action="store_true", + help="Calculate logits only for the last token to save memory in the first step.", + ) + parser.add_argument( + "--seed", + default=27, + type=int, + help="Seed to use for random generation. Useful to reproduce your runs with `--do_sample`.", + ) + parser.add_argument( + "--profiling_warmup_steps", + default=0, + type=int, + help="Number of steps to ignore for profiling.", + ) + parser.add_argument( + "--profiling_steps", + default=0, + type=int, + help="Number of steps to capture for profiling.", + ) + parser.add_argument( + "--profiling_record_shapes", + default=False, + type=bool, + help="Record shapes when enabling profiling.", + ) + parser.add_argument( + "--prompt", + default=None, + type=str, + nargs="*", + help='Optional argument to give a prompt of your choice as input. Can be a single string (eg: --prompt "Hello world"), or a list of space-separated strings (eg: --prompt "Hello world" "How are you?")', + ) + parser.add_argument( + "--bad_words", + default=None, + type=str, + nargs="+", + help="Optional argument list of words that are not allowed to be generated.", + ) + parser.add_argument( + "--force_words", + default=None, + type=str, + nargs="+", + help="Optional argument list of words that must be generated.", + ) + parser.add_argument( + "--assistant_model", + default=None, + type=str, + help="Optional argument to give a path to a draft/assistant model for assisted decoding.", + ) + parser.add_argument( + "--peft_model", + default=None, + type=str, + help="Optional argument to give a path to a PEFT model.", + ) + parser.add_argument("--num_return_sequences", type=int, default=1) + parser.add_argument( + "--token", + default=None, + type=str, + help="The token to use as HTTP bearer authorization for remote files. 
If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`).", + ) + parser.add_argument( + "--model_revision", + default="main", + type=str, + help="The specific model version to use (can be a branch name, tag name or commit id).", + ) + parser.add_argument( + "--attn_softmax_bf16", + action="store_true", + help="Whether to run attention softmax layer in lower precision provided that the model supports it and " + "is also running in lower precision.", + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + help="Output directory to store results in.", + ) + parser.add_argument( + "--bucket_size", + default=-1, + type=int, + help="Bucket size to maintain static shapes. If this number is negative (default is -1) \ + then we use `shape = prompt_length + max_new_tokens`. If a positive number is passed \ + we increase the bucket in steps of `bucket_size` instead of allocating to max (`prompt_length + max_new_tokens`).", + ) + parser.add_argument( + "--bucket_internal", + action="store_true", + help="Split kv sequence into buckets in decode phase. It improves throughput when max_new_tokens is large.", + ) + parser.add_argument( + "--dataset_max_samples", + default=-1, + type=int, + help="If a negative number is passed (default = -1) perform inference on the whole dataset, else use only `dataset_max_samples` samples.", + ) + parser.add_argument( + "--limit_hpu_graphs", + action="store_true", + help="Skip HPU Graph usage for first token to save memory", + ) + parser.add_argument( + "--reuse_cache", + action="store_true", + help="Whether to reuse key/value cache for decoding. It should save memory.", + ) + parser.add_argument( + "--verbose_workers", + action="store_true", + help="Enable output from non-master workers", + ) + parser.add_argument( + "--simulate_dyn_prompt", + default=None, + type=int, + nargs="*", + help="If empty, static prompt is used. If a comma separated list of integers is passed, we warmup and use those shapes for prompt length.", + ) + parser.add_argument( + "--reduce_recompile", + action="store_true", + help="Preprocess on cpu, and some other optimizations. Useful to prevent recompilations when using dynamic prompts (simulate_dyn_prompt)", + ) + + parser.add_argument( + "--use_flash_attention", + action="store_true", + help="Whether to enable Habana Flash Attention, provided that the model supports it.", + ) + parser.add_argument( + "--flash_attention_recompute", + action="store_true", + help="Whether to enable Habana Flash Attention in recompute mode on first token generation. This gives an opportunity of splitting graph internally which helps reduce memory consumption.", + ) + parser.add_argument( + "--flash_attention_causal_mask", + action="store_true", + help="Whether to enable Habana Flash Attention in causal mode on first token generation.", + ) + parser.add_argument( + "--flash_attention_fast_softmax", + action="store_true", + help="Whether to enable Habana Flash Attention in fast softmax mode.", + ) + parser.add_argument( + "--book_source", + action="store_true", + help="Whether to use project Guttenberg books data as input. 
Useful for testing large sequence lengths.", + ) + parser.add_argument( + "--torch_compile", + action="store_true", + help="Whether to use torch compiled model or not.", + ) + parser.add_argument( + "--ignore_eos", + default=True, + action=argparse.BooleanOptionalAction, + help="Whether to ignore eos, set False to disable it", + ) + parser.add_argument( + "--temperature", + default=1.0, + type=float, + help="Temperature value for text generation", + ) + parser.add_argument( + "--top_p", + default=1.0, + type=float, + help="Top_p value for generating text via sampling", + ) + parser.add_argument( + "--const_serialization_path", + "--csp", + type=str, + help="Path to serialize const params. Const params will be held on disk memory instead of being allocated on host memory.", + ) + parser.add_argument( + "--disk_offload", + action="store_true", + help="Whether to enable device map auto. In case no space left on cpu, weights will be offloaded to disk.", + ) + parser.add_argument( + "--trust_remote_code", + action="store_true", + help="Whether or not to allow for custom models defined on the Hub in their own modeling files.", + ) + args = parser.parse_args() + + if args.torch_compile: + args.use_hpu_graphs = False + + if not args.use_hpu_graphs: + args.limit_hpu_graphs = False + + args.quant_config = os.getenv("QUANT_CONFIG", "") + if args.quant_config == "" and args.disk_offload: + logger.warning( + "`--disk_offload` was tested only with fp8, it may not work with full precision. If error raises try to remove the --disk_offload flag." + ) + return args + + # Transform a string into input zephyr-specific input def completion_to_prompt(completion): return f"<|system|>\n\n<|user|>\n{completion}\n<|assistant|>\n" @@ -24,21 +332,6 @@ def messages_to_prompt(messages): return prompt -import logging -import argparse -from llama_index.llms.gaudi import GaudiLLM -from llama_index.core.prompts import PromptTemplate -from llama_index.llms.gaudi.utils import ( - setup_parser, -) -logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, -) -logger = logging.getLogger(__name__) - - if __name__ == "__main__": parser = argparse.ArgumentParser(description="GaudiLLM Basic Usage Example") args = setup_parser(parser) @@ -49,7 +342,9 @@ def messages_to_prompt(messages): logger=logger, model_name="HuggingFaceH4/zephyr-7b-alpha", tokenizer_name="HuggingFaceH4/zephyr-7b-alpha", - query_wrapper_prompt=PromptTemplate("<|system|>\n\n<|user|>\n{query_str}\n<|assistant|>\n"), + query_wrapper_prompt=PromptTemplate( + "<|system|>\n\n<|user|>\n{query_str}\n<|assistant|>\n" + ), context_window=3900, max_new_tokens=256, generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95}, diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/base.py b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/base.py index dfb6e9231d162..25732482fddb5 100644 --- a/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/base.py +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/base.py @@ -1,72 +1,381 @@ -# This file is adapted from -# https://github.com/run-llama/llama_index/blob/main/llama-index-integrations/ -# llms/llama-index-llms-huggingface/llama_index/llms/huggingface/base.py -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import logging from typing import Any, Callable, List, Optional, Sequence, Union -import torch -from llama_index.core.base.llms.types import ( - ChatMessage, - ChatResponse, - ChatResponseGen, - CompletionResponse, - CompletionResponseGen, - LLMMetadata, -) -from llama_index.core.bridge.pydantic import Field, PrivateAttr +from llama_index.core.base.llms.types import ChatMessage +from llama_index.core.bridge.pydantic import Field +from llama_index.llms.huggingface.base import HuggingFaceLLM from llama_index.core.callbacks import CallbackManager from llama_index.core.constants import ( DEFAULT_CONTEXT_WINDOW, DEFAULT_NUM_OUTPUTS, ) -from llama_index.core.llms.callbacks import ( - llm_chat_callback, - llm_completion_callback, -) -from llama_index.core.llms.custom import CustomLLM - -from llama_index.core.base.llms.generic_utils import ( - completion_response_to_chat_response, - stream_completion_response_to_chat_response, - messages_to_prompt as generic_messages_to_prompt, -) +from llama_index.core.types import BaseOutputParser, PydanticProgramMode from llama_index.core.prompts.base import PromptTemplate -from llama_index.core.types import BaseOutputParser, PydanticProgramMode, Thread -from transformers import ( - StoppingCriteria, - StoppingCriteriaList, -) -from transformers import AutoTokenizer, LlamaTokenizer -#gaudi + from llama_index.llms.gaudi.utils import initialize_model -from llama_index.llms.huggingface import HuggingFaceLLM -#DEFAULT_HUGGINGFACE_MODEL = "meta-llama/Llama-2-7b-chat-hf" -DEFAULT_HUGGINGFACE_MODEL = "/home/ubuntu/jean/models/mistral" +DEFAULT_HUGGINGFACE_MODEL = "Intel/neural-chat-7b-v3-1" +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, +) logger = logging.getLogger(__name__) -class GaudiLLM(CustomLLM): - r"""Gaudi-LLM. +class GaudiLLM(HuggingFaceLLM): + r"""GaudiLLM LLM. - Example: - .. code-block:: python + Examples: + `pip install llama-index-llms-gaudi` + + ```python + from llama_index.llms.gaudi import GaudiLLM + import argparse + import os, logging + + def setup_parser(parser): + # Arguments management + parser.add_argument( + "--device", "-d", type=str, choices=["hpu"], help="Device to run", default="hpu" + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + # required=True, + help="Path to pre-trained model (on the HF Hub or locally).", + ) + parser.add_argument( + "--bf16", + default=True, + action="store_true", + help="Whether to perform generation in bf16 precision.", + ) + parser.add_argument( + "--max_new_tokens", type=int, default=100, help="Number of tokens to generate." + ) + parser.add_argument( + "--max_input_tokens", + type=int, + default=0, + help="If > 0 then pad and truncate the input sequences to this specified length of tokens. 
\ + if == 0, then truncate to 16 (original default) \ + if < 0, then do not truncate, use full input prompt", + ) + parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.") + parser.add_argument( + "--warmup", + type=int, + default=3, + help="Number of warmup iterations for benchmarking.", + ) + parser.add_argument( + "--n_iterations", + type=int, + default=5, + help="Number of inference iterations for benchmarking.", + ) + parser.add_argument( + "--local_rank", type=int, default=0, metavar="N", help="Local process rank." + ) + parser.add_argument( + "--use_kv_cache", + default=True, + action="store_true", + help="Whether to use the key/value cache for decoding. It should speed up generation.", + ) + parser.add_argument( + "--use_hpu_graphs", + default=True, + action="store_true", + help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.", + ) + parser.add_argument( + "--dataset_name", + default=None, + type=str, + help="Optional argument if you want to assess your model on a given dataset of the HF Hub.", + ) + parser.add_argument( + "--column_name", + default=None, + type=str, + help="If `--dataset_name` was given, this will be the name of the column to use as prompts for generation.", + ) + parser.add_argument( + "--do_sample", + action="store_true", + help="Whether to use sampling for generation.", + ) + parser.add_argument( + "--num_beams", + default=1, + type=int, + help="Number of beams used for beam search generation. 1 means greedy search will be performed.", + ) + parser.add_argument( + "--trim_logits", + action="store_true", + help="Calculate logits only for the last token to save memory in the first step.", + ) + parser.add_argument( + "--seed", + default=27, + type=int, + help="Seed to use for random generation. Useful to reproduce your runs with `--do_sample`.", + ) + parser.add_argument( + "--profiling_warmup_steps", + default=0, + type=int, + help="Number of steps to ignore for profiling.", + ) + parser.add_argument( + "--profiling_steps", + default=0, + type=int, + help="Number of steps to capture for profiling.", + ) + parser.add_argument( + "--profiling_record_shapes", + default=False, + type=bool, + help="Record shapes when enabling profiling.", + ) + parser.add_argument( + "--prompt", + default=None, + type=str, + nargs="*", + help='Optional argument to give a prompt of your choice as input. Can be a single string (eg: --prompt "Hello world"), or a list of space-separated strings (eg: --prompt "Hello world" "How are you?")', + ) + parser.add_argument( + "--bad_words", + default=None, + type=str, + nargs="+", + help="Optional argument list of words that are not allowed to be generated.", + ) + parser.add_argument( + "--force_words", + default=None, + type=str, + nargs="+", + help="Optional argument list of words that must be generated.", + ) + parser.add_argument( + "--assistant_model", + default=None, + type=str, + help="Optional argument to give a path to a draft/assistant model for assisted decoding.", + ) + parser.add_argument( + "--peft_model", + default=None, + type=str, + help="Optional argument to give a path to a PEFT model.", + ) + parser.add_argument("--num_return_sequences", type=int, default=1) + parser.add_argument( + "--token", + default=None, + type=str, + help="The token to use as HTTP bearer authorization for remote files. 
If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`).", + ) + parser.add_argument( + "--model_revision", + default="main", + type=str, + help="The specific model version to use (can be a branch name, tag name or commit id).", + ) + parser.add_argument( + "--attn_softmax_bf16", + action="store_true", + help="Whether to run attention softmax layer in lower precision provided that the model supports it and " + "is also running in lower precision.", + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + help="Output directory to store results in.", + ) + parser.add_argument( + "--bucket_size", + default=-1, + type=int, + help="Bucket size to maintain static shapes. If this number is negative (default is -1) \ + then we use `shape = prompt_length + max_new_tokens`. If a positive number is passed \ + we increase the bucket in steps of `bucket_size` instead of allocating to max (`prompt_length + max_new_tokens`).", + ) + parser.add_argument( + "--bucket_internal", + action="store_true", + help="Split kv sequence into buckets in decode phase. It improves throughput when max_new_tokens is large.", + ) + parser.add_argument( + "--dataset_max_samples", + default=-1, + type=int, + help="If a negative number is passed (default = -1) perform inference on the whole dataset, else use only `dataset_max_samples` samples.", + ) + parser.add_argument( + "--limit_hpu_graphs", + action="store_true", + help="Skip HPU Graph usage for first token to save memory", + ) + parser.add_argument( + "--reuse_cache", + action="store_true", + help="Whether to reuse key/value cache for decoding. It should save memory.", + ) + parser.add_argument( + "--verbose_workers", + action="store_true", + help="Enable output from non-master workers", + ) + parser.add_argument( + "--simulate_dyn_prompt", + default=None, + type=int, + nargs="*", + help="If empty, static prompt is used. If a comma separated list of integers is passed, we warmup and use those shapes for prompt length.", + ) + parser.add_argument( + "--reduce_recompile", + action="store_true", + help="Preprocess on cpu, and some other optimizations. Useful to prevent recompilations when using dynamic prompts (simulate_dyn_prompt)", + ) + parser.add_argument( + "--use_flash_attention", + action="store_true", + help="Whether to enable Habana Flash Attention, provided that the model supports it.", + ) + parser.add_argument( + "--flash_attention_recompute", + action="store_true", + help="Whether to enable Habana Flash Attention in recompute mode on first token generation. This gives an opportunity of splitting graph internally which helps reduce memory consumption.", + ) + parser.add_argument( + "--flash_attention_causal_mask", + action="store_true", + help="Whether to enable Habana Flash Attention in causal mode on first token generation.", + ) + parser.add_argument( + "--flash_attention_fast_softmax", + action="store_true", + help="Whether to enable Habana Flash Attention in fast softmax mode.", + ) + parser.add_argument( + "--book_source", + action="store_true", + help="Whether to use project Guttenberg books data as input. 
Useful for testing large sequence lengths.", + ) + parser.add_argument( + "--torch_compile", + action="store_true", + help="Whether to use torch compiled model or not.", + ) + parser.add_argument( + "--ignore_eos", + default=True, + action=argparse.BooleanOptionalAction, + help="Whether to ignore eos, set False to disable it", + ) + parser.add_argument( + "--temperature", + default=1.0, + type=float, + help="Temperature value for text generation", + ) + parser.add_argument( + "--top_p", + default=1.0, + type=float, + help="Top_p value for generating text via sampling", + ) + parser.add_argument( + "--const_serialization_path", + "--csp", + type=str, + help="Path to serialize const params. Const params will be held on disk memory instead of being allocated on host memory.", + ) + parser.add_argument( + "--disk_offload", + action="store_true", + help="Whether to enable device map auto. In case no space left on cpu, weights will be offloaded to disk.", + ) + parser.add_argument( + "--trust_remote_code", + action="store_true", + help="Whether or not to allow for custom models defined on the Hub in their own modeling files.", + ) + args = parser.parse_args() + + if args.torch_compile: + args.use_hpu_graphs = False + + if not args.use_hpu_graphs: + args.limit_hpu_graphs = False + + args.quant_config = os.getenv("QUANT_CONFIG", "") + if args.quant_config == "" and args.disk_offload: + logger.warning( + "`--disk_offload` was tested only with fp8, it may not work with full precision. If error raises try to remove the --disk_offload flag." + ) + return args + + def messages_to_prompt(messages): + prompt = "" + for message in messages: + if message.role == 'system': + prompt += f"<|system|>\n{message.content}\n" + elif message.role == 'user': + prompt += f"<|user|>\n{message.content}\n" + elif message.role == 'assistant': + prompt += f"<|assistant|>\n{message.content}\n" + + # ensure we start with a system prompt, insert blank if needed + if not prompt.startswith("<|system|>\n"): + prompt = "<|system|>\n\n" + prompt + + # add final assistant prompt + prompt = prompt + "<|assistant|>\n" + + return prompt + + def completion_to_prompt(completion): + return f"<|system|>\n\n<|user|>\n{completion}\n<|assistant|>\n" + + import torch + from llama_index.core.prompts import PromptTemplate + from llama_index.llms.optimum-intel import GaudiLLM + + parser = argparse.ArgumentParser(description="GaudiLLM Basic Usage Example") + args = setup_parser(parser) + args.model_name_or_path = "HuggingFaceH4/zephyr-7b-alpha" + + llm = GaudiLLM( + args=args, + logger=logger, + model_name="HuggingFaceH4/zephyr-7b-alpha", + tokenizer_name="HuggingFaceH4/zephyr-7b-alpha", + query_wrapper_prompt=PromptTemplate( + "<|system|>\n\n<|user|>\n{query_str}\n<|assistant|>\n" + ), + context_window=3900, + max_new_tokens=256, + generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95}, + messages_to_prompt=messages_to_prompt, + device_map="auto", + ) - from llama_index.llms.ipex_llm import GaudiLLM - llm = GaudiLLM(model_path="/path/to/llama/model") + response = llm.complete("What is the meaning of life?") + print(str(response)) + ``` """ model_name: str = Field( @@ -76,24 +385,6 @@ class GaudiLLM(CustomLLM): "Unused if `model` is passed in directly." 
), ) - context_window: int = Field( - default=DEFAULT_CONTEXT_WINDOW, - description="The maximum number of tokens available for input.", - gt=0, - ) - max_new_tokens: int = Field( - default=DEFAULT_NUM_OUTPUTS, - description="The maximum number of tokens to generate.", - gt=0, - ) - query_wrapper_prompt: PromptTemplate = Field( - default=PromptTemplate("{query_str}"), - description=( - "The query wrapper prompt, containing the query placeholder. " - "The model card on HuggingFace should specify if this is needed. " - "Should contain a `{query_str}` placeholder." - ), - ) tokenizer_name: str = Field( default=DEFAULT_HUGGINGFACE_MODEL, description=( @@ -101,47 +392,11 @@ class GaudiLLM(CustomLLM): "Unused if `tokenizer` is passed in directly." ), ) - device_map: str = Field( - default="cpu", description="The device_map to use. Defaults to 'cpu'." - ) - stopping_ids: List[int] = Field( - default_factory=list, - description=( - "The stopping ids to use. " - "Generation stops when these token IDs are predicted." - ), - ) - tokenizer_outputs_to_remove: list = Field( - default_factory=list, - description=( - "The outputs to remove from the tokenizer. " - "Sometimes huggingface tokenizers return extra inputs that cause errors." - ), - ) - tokenizer_kwargs: dict = Field( - default_factory=dict, description="The kwargs to pass to the tokenizer." - ) - model_kwargs: dict = Field( - default_factory=dict, - description="The kwargs to pass to the model during initialization.", - ) - generate_kwargs: dict = Field( - default_factory=dict, - description="The kwargs to pass to the model during generation.", - ) - is_chat_model: bool = Field( - default=False, - description=" Be sure to verify that you either pass an appropriate tokenizer " - "that can convert prompts to properly formatted chat messages or a " - "`messages_to_prompt` that does so.", - ) - - _model: Any = PrivateAttr() - _tokenizer: Any = PrivateAttr() - _stopping_criteria: Any = PrivateAttr() def __init__( - self, args, logger, + self, + args, + logger, context_window: int = DEFAULT_CONTEXT_WINDOW, max_new_tokens: int = DEFAULT_NUM_OUTPUTS, query_wrapper_prompt: Union[str, PromptTemplate] = "{query_str}", @@ -163,77 +418,10 @@ def __init__( pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT, output_parser: Optional[BaseOutputParser] = None, ) -> None: - """ - Construct GaudiLLM. - - Args: - context_window: The maximum number of tokens available for input. - max_new_tokens: The maximum number of tokens to generate. - tokenizer_name: The name of the tokenizer to use from HuggingFace. - Unused if `tokenizer` is passed in directly. - model_name: The model name to use from HuggingFace. - Unused if `model` is passed in directly. - model: The HuggingFace model. - tokenizer: The tokenizer. - device_map: The device_map to use. Defaults to 'auto'. - stopping_ids: The stopping ids to use. - Generation stops when these token IDs are predicted. - tokenizer_kwargs: The kwargs to pass to the tokenizer. - tokenizer_outputs_to_remove: The outputs to remove from the tokenizer. - Sometimes huggingface tokenizers return extra inputs that cause errors. - model_kwargs: The kwargs to pass to the model during initialization. - generate_kwargs: The kwargs to pass to the model during generation. - is_chat_model: Whether the model is `chat` - callback_manager: Callback manager. - messages_to_prompt: Function to convert messages to prompt. - completion_to_prompt: Function to convert messages to prompt. - pydantic_program_mode: DEFAULT. 
- output_parser: BaseOutputParser. - - Returns: - None. - """ + """Initialize params.""" model_kwargs = model_kwargs or {} - model, _, tokenizer, _= initialize_model(args, logger) - - # check context_window - config_dict = model.config.to_dict() - model_context_window = int( - config_dict.get("max_position_embeddings", context_window) - ) - if model_context_window and model_context_window < context_window: - logger.warning( - f"Supplied context_window {context_window} is greater " - f"than the model's max input size {model_context_window}. " - "Disable this warning by setting a lower context_window." - ) - context_window = model_context_window - - - # setup stopping criteria - stopping_ids_list = stopping_ids or [] - - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - - class StopOnTokens(StoppingCriteria): - def __call__( - self, - input_ids: torch.LongTensor, - scores: torch.FloatTensor, - **kwargs: Any, - ) -> bool: - for stop_id in stopping_ids_list: - if input_ids[0][-1] == stop_id: - return True - return False - - stopping_criteria = StoppingCriteriaList([StopOnTokens()]) - if isinstance(query_wrapper_prompt, str): - query_wrapper_prompt = PromptTemplate(query_wrapper_prompt) - - messages_to_prompt = messages_to_prompt or self._tokenizer_messages_to_prompt + model, _, tokenizer, _ = initialize_model(args, logger) super().__init__( context_window=context_window, @@ -241,6 +429,8 @@ def __call__( query_wrapper_prompt=query_wrapper_prompt, tokenizer_name=tokenizer_name, model_name=model_name, + model=model, + tokenizer=tokenizer, device_map=device_map, stopping_ids=stopping_ids or [], tokenizer_kwargs=tokenizer_kwargs or {}, @@ -256,130 +446,6 @@ def __call__( output_parser=output_parser, ) - self._model = model - self._tokenizer = tokenizer - self._stopping_criteria = stopping_criteria - @classmethod def class_name(cls) -> str: return "GaudiLLM" - - @property - def metadata(self) -> LLMMetadata: - """LLM metadata.""" - return LLMMetadata( - context_window=self.context_window, - num_output=self.max_new_tokens, - model_name=self.model_name, - is_chat_model=self.is_chat_model, - ) - - def _tokenizer_messages_to_prompt(self, messages: Sequence[ChatMessage]) -> str: - if hasattr(self._tokenizer, "apply_chat_template"): - messages_dict = [ - {"role": message.role.value, "content": message.content} - for message in messages - ] - tokens = self._tokenizer.apply_chat_template(messages_dict) - return self._tokenizer.decode(tokens) - - return generic_messages_to_prompt(messages) - - @llm_completion_callback() - def complete( - self, prompt: str, formatted: bool = False, **kwargs: Any - ) -> CompletionResponse: - """ - Complete by LLM. - - Args: - prompt: Prompt for completion. - formatted: Whether the prompt is formatted by wrapper. - kwargs: Other kwargs for complete. - - Returns: - CompletionReponse after generation. 
- """ - if not formatted: - prompt = self.completion_to_prompt(prompt) - input_ids = self._tokenizer(prompt, return_tensors="pt") - input_ids = input_ids.to(self._model.device) - # remove keys from the tokenizer if needed, to avoid HF errors - for key in self.tokenizer_outputs_to_remove: - if key in input_ids: - input_ids.pop(key, None) - tokens = self._model.generate( - **input_ids, - max_new_tokens=self.max_new_tokens, - stopping_criteria=self._stopping_criteria, - pad_token_id=self._tokenizer.pad_token_id, - **self.generate_kwargs, - ) - completion_tokens = tokens[0][input_ids["input_ids"].size(1) :] - completion = self._tokenizer.decode(completion_tokens, skip_special_tokens=True) - - return CompletionResponse(text=completion, raw={"model_output": tokens}) - - @llm_completion_callback() - def stream_complete( - self, prompt: str, formatted: bool = False, **kwargs: Any - ) -> CompletionResponseGen: - """ - Complete by LLM in stream. - - Args: - prompt: Prompt for completion. - formatted: Whether the prompt is formatted by wrapper. - kwargs: Other kwargs for complete. - - Returns: - CompletionReponse after generation. - """ - from transformers import TextIteratorStreamer - - if not formatted: - prompt = self.completion_to_prompt(prompt) - - input_ids = self._tokenizer.encode(prompt, return_tensors="pt") - input_ids = input_ids.to(self._model.device) - - for key in self.tokenizer_outputs_to_remove: - if key in input_ids: - input_ids.pop(key, None) - - streamer = TextIteratorStreamer( - self._tokenizer, skip_prompt=True, skip_special_tokens=True - ) - generation_kwargs = dict( - input_ids=input_ids, - streamer=streamer, - max_new_tokens=self.max_new_tokens, - stopping_criteria=self._stopping_criteria, - pad_token_id=self._tokenizer.pad_token_id, - **self.generate_kwargs, - ) - thread = Thread(target=self._model.generate, kwargs=generation_kwargs) - thread.start() - - # create generator based off of streamer - def gen() -> CompletionResponseGen: - text = "" - for x in streamer: - text += x - yield CompletionResponse(text=text, delta=x) - - return gen() - - @llm_chat_callback() - def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse: - prompt = self.messages_to_prompt(messages) - completion_response = self.complete(prompt, formatted=True, **kwargs) - return completion_response_to_chat_response(completion_response) - - @llm_chat_callback() - def stream_chat( - self, messages: Sequence[ChatMessage], **kwargs: Any - ) -> ChatResponseGen: - prompt = self.messages_to_prompt(messages) - completion_response = self.stream_complete(prompt, formatted=True, **kwargs) - return stream_completion_response_to_chat_response(completion_response) diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/utils.py b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/utils.py index 5ce06710976f2..060d7a649b5cb 100644 --- a/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/utils.py +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/utils.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2022 The HuggingFace Team. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -20,7 +19,6 @@ import copy import glob import os -import argparse import shutil import tempfile import time @@ -44,269 +42,6 @@ set_seed, ) -def setup_parser(parser): - # Arguments management - parser.add_argument("--device", "-d", type=str, choices=["hpu"], help="Device to run", default="hpu") - parser.add_argument( - "--model_name_or_path", - default=None, - type=str, - #required=True, - help="Path to pre-trained model (on the HF Hub or locally).", - ) - parser.add_argument( - "--bf16", - default=True, - action="store_true", - help="Whether to perform generation in bf16 precision.", - ) - parser.add_argument("--max_new_tokens", type=int, default=100, help="Number of tokens to generate.") - parser.add_argument( - "--max_input_tokens", - type=int, - default=0, - help="If > 0 then pad and truncate the input sequences to this specified length of tokens. \ - if == 0, then truncate to 16 (original default) \ - if < 0, then do not truncate, use full input prompt", - ) - parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.") - parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.") - parser.add_argument("--n_iterations", type=int, default=5, help="Number of inference iterations for benchmarking.") - parser.add_argument("--local_rank", type=int, default=0, metavar="N", help="Local process rank.") - parser.add_argument( - "--use_kv_cache", - default=True, - action="store_true", - help="Whether to use the key/value cache for decoding. It should speed up generation.", - ) - parser.add_argument( - "--use_hpu_graphs", - default=True, - action="store_true", - help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.", - ) - parser.add_argument( - "--dataset_name", - default=None, - type=str, - help="Optional argument if you want to assess your model on a given dataset of the HF Hub.", - ) - parser.add_argument( - "--column_name", - default=None, - type=str, - help="If `--dataset_name` was given, this will be the name of the column to use as prompts for generation.", - ) - parser.add_argument( - "--do_sample", - action="store_true", - help="Whether to use sampling for generation.", - ) - parser.add_argument( - "--num_beams", - default=1, - type=int, - help="Number of beams used for beam search generation. 1 means greedy search will be performed.", - ) - parser.add_argument( - "--trim_logits", - action="store_true", - help="Calculate logits only for the last token to save memory in the first step.", - ) - parser.add_argument( - "--seed", - default=27, - type=int, - help="Seed to use for random generation. Useful to reproduce your runs with `--do_sample`.", - ) - parser.add_argument( - "--profiling_warmup_steps", - default=0, - type=int, - help="Number of steps to ignore for profiling.", - ) - parser.add_argument( - "--profiling_steps", - default=0, - type=int, - help="Number of steps to capture for profiling.", - ) - parser.add_argument( - "--profiling_record_shapes", - default=False, - type=bool, - help="Record shapes when enabling profiling.", - ) - parser.add_argument( - "--prompt", - default=None, - type=str, - nargs="*", - help='Optional argument to give a prompt of your choice as input. 
Can be a single string (eg: --prompt "Hello world"), or a list of space-separated strings (eg: --prompt "Hello world" "How are you?")', - ) - parser.add_argument( - "--bad_words", - default=None, - type=str, - nargs="+", - help="Optional argument list of words that are not allowed to be generated.", - ) - parser.add_argument( - "--force_words", - default=None, - type=str, - nargs="+", - help="Optional argument list of words that must be generated.", - ) - parser.add_argument( - "--assistant_model", - default=None, - type=str, - help="Optional argument to give a path to a draft/assistant model for assisted decoding.", - ) - parser.add_argument( - "--peft_model", - default=None, - type=str, - help="Optional argument to give a path to a PEFT model.", - ) - parser.add_argument("--num_return_sequences", type=int, default=1) - parser.add_argument( - "--token", - default=None, - type=str, - help="The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " - "generated when running `huggingface-cli login` (stored in `~/.huggingface`).", - ) - parser.add_argument( - "--model_revision", - default="main", - type=str, - help="The specific model version to use (can be a branch name, tag name or commit id).", - ) - parser.add_argument( - "--attn_softmax_bf16", - action="store_true", - help="Whether to run attention softmax layer in lower precision provided that the model supports it and " - "is also running in lower precision.", - ) - parser.add_argument( - "--output_dir", - default=None, - type=str, - help="Output directory to store results in.", - ) - parser.add_argument( - "--bucket_size", - default=-1, - type=int, - help="Bucket size to maintain static shapes. If this number is negative (default is -1) \ - then we use `shape = prompt_length + max_new_tokens`. If a positive number is passed \ - we increase the bucket in steps of `bucket_size` instead of allocating to max (`prompt_length + max_new_tokens`).", - ) - parser.add_argument( - "--bucket_internal", - action="store_true", - help="Split kv sequence into buckets in decode phase. It improves throughput when max_new_tokens is large.", - ) - parser.add_argument( - "--dataset_max_samples", - default=-1, - type=int, - help="If a negative number is passed (default = -1) perform inference on the whole dataset, else use only `dataset_max_samples` samples.", - ) - parser.add_argument( - "--limit_hpu_graphs", - action="store_true", - help="Skip HPU Graph usage for first token to save memory", - ) - parser.add_argument( - "--reuse_cache", - action="store_true", - help="Whether to reuse key/value cache for decoding. It should save memory.", - ) - parser.add_argument("--verbose_workers", action="store_true", help="Enable output from non-master workers") - parser.add_argument( - "--simulate_dyn_prompt", - default=None, - type=int, - nargs="*", - help="If empty, static prompt is used. If a comma separated list of integers is passed, we warmup and use those shapes for prompt length.", - ) - parser.add_argument( - "--reduce_recompile", - action="store_true", - help="Preprocess on cpu, and some other optimizations. 
Useful to prevent recompilations when using dynamic prompts (simulate_dyn_prompt)", - ) - - parser.add_argument( - "--use_flash_attention", - action="store_true", - help="Whether to enable Habana Flash Attention, provided that the model supports it.", - ) - parser.add_argument( - "--flash_attention_recompute", - action="store_true", - help="Whether to enable Habana Flash Attention in recompute mode on first token generation. This gives an opportunity of splitting graph internally which helps reduce memory consumption.", - ) - parser.add_argument( - "--flash_attention_causal_mask", - action="store_true", - help="Whether to enable Habana Flash Attention in causal mode on first token generation.", - ) - parser.add_argument( - "--flash_attention_fast_softmax", - action="store_true", - help="Whether to enable Habana Flash Attention in fast softmax mode.", - ) - parser.add_argument( - "--book_source", - action="store_true", - help="Whether to use project Guttenberg books data as input. Usefull for testing large sequence lenghts.", - ) - parser.add_argument( - "--torch_compile", - action="store_true", - help="Whether to use torch compiled model or not.", - ) - parser.add_argument( - "--ignore_eos", - default=True, - action=argparse.BooleanOptionalAction, - help="Whether to ignore eos, set False to disable it", - ) - parser.add_argument("--temperature", default=1.0, type=float, help="Temperature value for text generation") - parser.add_argument("--top_p", default=1.0, type=float, help="Top_p value for generating text via sampling") - parser.add_argument( - "--const_serialization_path", - "--csp", - type=str, - help="Path to serialize const params. Const params will be held on disk memory instead of being allocated on host memory.", - ) - parser.add_argument( - "--disk_offload", - action="store_true", - help="Whether to enable device map auto. In case no space left on cpu, weights will be offloaded to disk.", - ) - parser.add_argument( - "--trust_remote_code", - action="store_true", - help="Whether or not to allow for custom models defined on the Hub in their own modeling files.", - ) - args = parser.parse_args() - - if args.torch_compile: - args.use_hpu_graphs = False - - if not args.use_hpu_graphs: - args.limit_hpu_graphs = False - - args.quant_config = os.getenv("QUANT_CONFIG", "") - if args.quant_config == "" and args.disk_offload: - logger.warning( - "`--disk_offload` was tested only with fp8, it may not work with full precision. If error raises try to remove the --disk_offload flag." 
- ) - return args def adjust_batch(batch, size): curr_size = batch["input_ids"].shape[1] @@ -317,7 +52,7 @@ def adjust_batch(batch, size): } else: adjusted_batch = {} - for k in batch.keys(): + for k in batch: last_colm = batch[k][:, -1] expanded = last_colm.tile((size - curr_size, 1)).T adjusted_batch[k] = torch.concat([batch[k], expanded], 1) @@ -388,7 +123,7 @@ def setup_const_serialization(const_serialization_path): os.makedirs(const_serialization_path) from habana_frameworks.torch.hpu import enable_const_section_serialization - print("Serializing const params to {}".format(const_serialization_path)) + print(f"Serializing const params to {const_serialization_path}") enable_const_section_serialization(const_serialization_path, True) @@ -407,7 +142,12 @@ def setup_env(args): os.environ.setdefault("PT_HPU_LAZY_ACC_PAR_MODE", "0") os.environ.setdefault("PT_HPU_ENABLE_LAZY_COLLECTIVES", "true") - if args.use_hpu_graphs and args.limit_hpu_graphs and not args.reuse_cache and args.bucket_internal: + if ( + args.use_hpu_graphs + and args.limit_hpu_graphs + and not args.reuse_cache + and args.bucket_internal + ): # Based upon above conditions and below env variable, # we can call HPU graphs clear_inputs(). os.environ.setdefault("PT_HPUGRAPH_DISABLE_TENSOR_CACHE", "1") @@ -431,7 +171,9 @@ def setup_device(args): def patch_scoped_linear_all_reduce(model): from deepspeed.module_inject.layers import LinearAllreduce - from optimum.habana.transformers.models.modeling_all_models import ScopedLinearAllReduce + from optimum.habana.transformers.models.modeling_all_models import ( + ScopedLinearAllReduce, + ) for name, module in model.named_children(): if type(module) is LinearAllreduce: @@ -441,7 +183,9 @@ def patch_scoped_linear_all_reduce(model): def get_torch_compiled_model(model): - model.model = torch.compile(model.model, backend="hpu_backend", options={"keep_input_mutations": True}) + model.model = torch.compile( + model.model, backend="hpu_backend", options={"keep_input_mutations": True} + ) return model @@ -458,7 +202,9 @@ def setup_model(args, model_dtype, model_kwargs, logger): with init_empty_weights(): model = AutoModelForCausalLM.from_config(config) max_memory = {"cpu": "10GiB"} - device_map = infer_auto_device_map(model, max_memory=max_memory, dtype=model_dtype) + device_map = infer_auto_device_map( + model, max_memory=max_memory, dtype=model_dtype + ) model = AutoModelForCausalLM.from_pretrained( args.model_name_or_path, device_map=device_map, @@ -494,7 +240,10 @@ def setup_model(args, model_dtype, model_kwargs, logger): from optimum.habana.transformers.trainer import _is_peft_model - if check_habana_frameworks_version("1.13.0") and model.config.model_type == "falcon": + if ( + check_habana_frameworks_version("1.13.0") + and model.config.model_type == "falcon" + ): model = wrap_in_hpu_graph(model, hash_with_views=False) else: model = wrap_in_hpu_graph(model) @@ -515,7 +264,9 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger): logger.info("DeepSpeed is enabled.") deepspeed.init_distributed(dist_backend="hccl") - config = AutoConfig.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs) + config = AutoConfig.from_pretrained( + args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs + ) load_to_meta = model_on_meta(config) if args.assistant_model is None: @@ -537,11 +288,15 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger): if args.local_rank == 0: if Path(merged_model_dir).is_dir(): shutil.rmtree(merged_model_dir) 
- peft_model(args, model_dtype, logger, **model_kwargs).save_pretrained(merged_model_dir) + peft_model(args, model_dtype, logger, **model_kwargs).save_pretrained( + merged_model_dir + ) torch.distributed.barrier() write_checkpoints_json( - merged_model_dir if args.peft_model is not None else args.model_name_or_path, + merged_model_dir + if args.peft_model is not None + else args.model_name_or_path, args.local_rank, checkpoints_json, token=args.token, @@ -593,7 +348,9 @@ def peft_model(args, model_dtype, logger, **model_kwargs): import importlib.util if importlib.util.find_spec("peft") is None: - raise ImportError("The `peft` package is not installed, please run: `pip install peft`.") + raise ImportError( + "The `peft` package is not installed, please run: `pip install peft`." + ) from peft import AutoPeftModelForCausalLM from peft.config import PeftConfigMixin @@ -614,7 +371,9 @@ def peft_model(args, model_dtype, logger, **model_kwargs): base_model_is_remote = False if base_model_is_local or base_model_is_remote: - model = AutoPeftModelForCausalLM.from_pretrained(args.peft_model, torch_dtype=model_dtype, **model_kwargs) + model = AutoPeftModelForCausalLM.from_pretrained( + args.peft_model, torch_dtype=model_dtype, **model_kwargs + ) else: # Since the base model doesn't exist locally nor remotely, use `args.model_name_or_path` as the base model logger.warning( @@ -624,18 +383,27 @@ def peft_model(args, model_dtype, logger, **model_kwargs): ) from peft import PeftModel - model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs) - model = PeftModel.from_pretrained(model, args.peft_model, torch_dtype=model_dtype, **model_kwargs) + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs + ) + model = PeftModel.from_pretrained( + model, args.peft_model, torch_dtype=model_dtype, **model_kwargs + ) if hasattr(model, "merge_and_unload"): model = model.merge_and_unload() if model_dtype == torch.bfloat16: model = model.to(torch.bfloat16) return model else: - from optimum.habana.peft.peft_model import gaudi_generate, gaudi_prepare_inputs_for_generation + from optimum.habana.peft.peft_model import ( + gaudi_generate, + gaudi_prepare_inputs_for_generation, + ) model.__class__.generate = gaudi_generate - model.__class__.prepare_inputs_for_generation = gaudi_prepare_inputs_for_generation + model.__class__.prepare_inputs_for_generation = ( + gaudi_prepare_inputs_for_generation + ) return model @@ -647,7 +415,9 @@ def setup_tokenizer(args, model, assistant_model): } if args.bad_words is not None or args.force_words is not None: tokenizer_kwargs["add_prefix_space"] = True - tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, **tokenizer_kwargs) + tokenizer = AutoTokenizer.from_pretrained( + args.model_name_or_path, **tokenizer_kwargs + ) if not model.config.is_encoder_decoder: tokenizer.padding_side = "left" @@ -669,7 +439,9 @@ def setup_tokenizer(args, model, assistant_model): if model.config.model_type == "persimmon": model.generation_config.pad_token_id = model.generation_config.eos_token_id if assistant_model is not None: - assistant_model.generation_config.pad_token_id = assistant_model.generation_config.eos_token_id + assistant_model.generation_config.pad_token_id = ( + assistant_model.generation_config.eos_token_id + ) tokenizer.bos_token_id = model.generation_config.bos_token_id tokenizer.eos_token_id = model.generation_config.eos_token_id tokenizer.pad_token_id = 
model.generation_config.pad_token_id @@ -682,7 +454,9 @@ def setup_tokenizer(args, model, assistant_model): tokenizer.pad_token = tokenizer.eos_token model.generation_config.pad_token_id = model.generation_config.eos_token_id if assistant_model is not None: - assistant_model.generation_config.pad_token_id = assistant_model.generation_config.eos_token_id + assistant_model.generation_config.pad_token_id = ( + assistant_model.generation_config.eos_token_id + ) return tokenizer, model, assistant_model @@ -691,9 +465,15 @@ def setup_generation_config(args, model, assistant_model, tokenizer): bad_words_ids = None force_words_ids = None if args.bad_words is not None: - bad_words_ids = [tokenizer.encode(bad_word, add_special_tokens=False) for bad_word in args.bad_words] + bad_words_ids = [ + tokenizer.encode(bad_word, add_special_tokens=False) + for bad_word in args.bad_words + ] if args.force_words is not None: - force_words_ids = [tokenizer.encode(force_word, add_special_tokens=False) for force_word in args.force_words] + force_words_ids = [ + tokenizer.encode(force_word, add_special_tokens=False) + for force_word in args.force_words + ] is_optimized = model_is_optimized(model.config) @@ -728,7 +508,10 @@ def setup_generation_config(args, model, assistant_model, tokenizer): def exclude_hpu_graph_configs(args): # Excluded configs for batch size 1 for hpu graph if args.batch_size == 1 and args.limit_hpu_graphs: - if "falcon-180B" in args.model_name_or_path or "falcon-180b" in args.model_name_or_path: + if ( + "falcon-180B" in args.model_name_or_path + or "falcon-180b" in args.model_name_or_path + ): return False if args.world_size == 2 or args.world_size == 4 or args.world_size == 8: if args.quant_config: @@ -753,7 +536,9 @@ def initialize_model(args, logger): set_seed(args.seed) get_repo_root(args.model_name_or_path, local_rank=args.local_rank, token=args.token) if args.assistant_model is not None: - get_repo_root(args.assistant_model, local_rank=args.local_rank, token=args.token) + get_repo_root( + args.assistant_model, local_rank=args.local_rank, token=args.token + ) use_deepspeed = False if use_deepspeed or args.bf16: model_dtype = torch.bfloat16 @@ -767,7 +552,9 @@ def initialize_model(args, logger): "trust_remote_code": args.trust_remote_code, } if args.trust_remote_code: - logger.warning("`trust_remote_code` is set, there is no guarantee this model works properly and it may fail") + logger.warning( + "`trust_remote_code` is set, there is no guarantee this model works properly and it may fail" + ) model, assistant_model = ( setup_model(args, model_dtype, model_kwargs, logger) @@ -783,6 +570,8 @@ def initialize_model(args, logger): model = setup_inference(args, model) init_end = time.perf_counter() logger.info(f"Args: {args}") - logger.info(f"device: {args.device}, n_hpu: {args.world_size}, bf16: {model_dtype == torch.bfloat16}") + logger.info( + f"device: {args.device}, n_hpu: {args.world_size}, bf16: {model_dtype == torch.bfloat16}" + ) logger.info(f"Model initialization took {(init_end - init_start):.3f}s") return model, assistant_model, tokenizer, generation_config diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/pyproject.toml b/llama-index-integrations/llms/llama-index-llms-gaudi/pyproject.toml index ba65f29ae2ac8..c540f328ad284 100644 --- a/llama-index-integrations/llms/llama-index-llms-gaudi/pyproject.toml +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/pyproject.toml @@ -33,11 +33,13 @@ readme = "README.md" version = "0.1.0" [tool.poetry.dependencies] 
-python = ">=3.8.1,<4.0" +python = ">=3.9,<4.0" huggingface-hub = "^0.23.0" torch = "^2.1.2" text-generation = "^0.7.0" llama-index-core = "^0.11.0" +llama-index-llms-huggingface = "^0.3.0" +optimum = {extras = ["habana"], version = ">=1.21.2"} [tool.poetry.dependencies.transformers] extras = ["torch"] @@ -67,6 +69,3 @@ version = "<=23.9.1,>=23.7.0" [tool.poetry.group.dev.dependencies.codespell] extras = ["toml"] version = ">=v2.2.6" - -[[tool.poetry.packages]] -include = "llama_index/"
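For reference, the sketch below shows one way to drive the integration added by this patch in a chat-style flow. It is a minimal, untested sketch under a few assumptions: it reuses `setup_parser` and `messages_to_prompt` from `examples/basic.py`, it relies on the `chat()` method that `GaudiLLM` inherits from `HuggingFaceLLM`, the file name `chat_sketch.py` and the `from basic import ...` import are illustrative only, and it requires an Intel Gaudi (HPU) environment with `optimum[habana]` installed.

```python
# chat_sketch.py -- hypothetical companion to examples/basic.py (not part of this patch).
# Assumes it is run from the examples/ directory so that basic.py is importable,
# and that an Intel Gaudi (HPU) runtime with optimum[habana] is available.
import argparse
import logging

from llama_index.core.llms import ChatMessage
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.gaudi import GaudiLLM

# Helpers defined in examples/basic.py by this patch.
from basic import messages_to_prompt, setup_parser

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

if __name__ == "__main__":
    # Parse the Gaudi runtime flags (device, bf16, HPU graphs, ...) exactly as
    # basic.py does, then pin the checkpoint programmatically instead of via CLI.
    args = setup_parser(argparse.ArgumentParser(description="GaudiLLM chat sketch"))
    args.model_name_or_path = "HuggingFaceH4/zephyr-7b-alpha"

    llm = GaudiLLM(
        args=args,
        logger=logger,
        model_name=args.model_name_or_path,
        tokenizer_name=args.model_name_or_path,
        query_wrapper_prompt=PromptTemplate(
            "<|system|>\n\n<|user|>\n{query_str}\n<|assistant|>\n"
        ),
        context_window=3900,
        max_new_tokens=256,
        generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
        messages_to_prompt=messages_to_prompt,
        device_map="auto",
    )

    # chat() comes from HuggingFaceLLM: for a non-chat model it renders the
    # messages with messages_to_prompt and then calls complete().
    messages = [
        ChatMessage(role="system", content="You are a concise assistant."),
        ChatMessage(role="user", content="Is the ocean blue?"),
    ]
    print(llm.chat(messages).message.content)
```

Because `GaudiLLM` now subclasses `HuggingFaceLLM`, the inherited streaming entry points (`stream_complete`, `stream_chat`) should also be usable on the same object without any Gaudi-specific changes.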