diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8100a6ae8..efcfc53fa 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -14,9 +14,11 @@ repos:
         alias: r
         verbose: true
         args: [--exit-non-zero-on-fix, --show-fixes]
+        types_or: [python, pyi, jupyter]
       - id: ruff-format
         alias: rf
         verbose: true
+        types_or: [python, pyi, jupyter]
   - repo: local
     hooks:
       - id: mypy
diff --git a/examples/openllm-llama2-demo/openllm_llama2_demo.ipynb b/examples/openllm-llama2-demo/openllm_llama2_demo.ipynb
index 6df20b5e8..d95694985 100644
--- a/examples/openllm-llama2-demo/openllm_llama2_demo.ipynb
+++ b/examples/openllm-llama2-demo/openllm_llama2_demo.ipynb
@@ -96,6 +96,7 @@
     "import psutil\n",
     "import torch\n",
     "\n",
+    "\n",
     "ram = psutil.virtual_memory()\n",
     "ram_total = ram.total / (1024**3)\n",
     "print('MemTotal: %.2f GB' % ram_total)\n",
@@ -344,31 +345,8 @@
    "source": [
     "import openllm\n",
     "\n",
-    "llm = openllm.LLM('meta-llama/Llama-2-7b-chat-hf', backend='vllm') # NOTE: You don't have to pass 'backend' if vllm is available in the environment"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "6slh7cFpwohd"
-   },
-   "source": [
-    "You can manually trigger the model download if it is not downloaded automatically."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "ukY2Cugr8tBD",
-    "outputId": "bba81848-865b-4286-b052-542e556d3c4b"
-   },
-   "outputs": [],
-   "source": [
-    "llm.save_pretrained()"
+    "\n",
+    "llm = openllm.LLM('meta-llama/Llama-2-7b-chat-hf', backend='vllm')"
    ]
   },
   {
@@ -393,8 +371,10 @@
    "outputs": [],
    "source": [
     "import asyncio\n",
+    "\n",
     "import nest_asyncio\n",
     "\n",
+    "\n",
     "nest_asyncio.apply()\n",
     "\n",
     "\n",
@@ -457,8 +437,10 @@
    "outputs": [],
    "source": [
     "import gc\n",
+    "\n",
     "import torch\n",
     "\n",
+    "\n",
     "torch.cuda.empty_cache()\n",
     "gc.collect()"
    ]
@@ -498,7 +480,7 @@
    },
    "outputs": [],
    "source": [
-    "!nohup openllm start llama --model-id NousResearch/llama-2-7b-chat-hf --port 8001 --backend vllm > openllm.log 2>&1 &"
+    "!nohup openllm start meta-llama/Llama-2-7b-chat-hf --port 8001 --backend vllm > openllm.log 2>&1 &"
    ]
   },
   {
@@ -596,6 +578,7 @@
    "source": [
     "import sys\n",
     "\n",
+    "\n",
     "if 'google.colab' in sys.modules:\n",
     "  # using colab proxy URL\n",
     "  from google.colab.output import eval_js\n",
@@ -622,13 +605,14 @@
    "source": [
     "import openllm\n",
     "\n",
+    "\n",
     "# sync API\n",
-    "client = openllm.client.HTTPClient('http://127.0.0.1:8001', timeout=120)\n",
+    "client = openllm.HTTPClient('http://127.0.0.1:8001', timeout=120)\n",
     "res = client.generate('What is the weight of the earth?', max_new_tokens=8192)\n",
     "\n",
     "# Async API\n",
-    "# async_client = openllm.client.AsyncHTTPClient(\"http://127.0.0.1:8001\", timeout=120)\n",
-    "# res = await async_client.query(\"what is the weight of the earth?\", max_new_tokens=8192)\n",
+    "# async_client = openllm.AsyncHTTPClient(\"http://127.0.0.1:8001\", timeout=120)\n",
+    "# res = await async_client.generate(\"what is the weight of the earth?\", max_new_tokens=8192)\n",
     "print(res.outputs[0].text)"
    ]
   },
@@ -665,7 +649,8 @@
     "  'http://127.0.0.1:8001/v1/generate_stream' \\\n",
     "  -H 'accept: text/event-stream' \\\n",
     "  -H 'Content-Type: application/json' \\\n",
-    "  -d '{\"prompt\":\"write a tagline for an ice cream shop\\n\", \"llm_config\": {\"max_new_tokens\": 256}}'"
+    "  -d '{\"prompt\":\"write a tagline for an ice cream shop\n",
+    "\", \"llm_config\": {\"max_new_tokens\": 256}}'"
    ]
   },
   {
@@ -687,13 +672,14 @@
    "source": [
     "import openai\n",
     "\n",
+    "\n",
     "openai.api_base = 'http://localhost:8001/v1'\n",
     "openai.api_key = 'na'\n",
     "\n",
-    "response = openai.Completion.create(model='llama2', prompt='Say this is a test')\n",
+    "response = openai.Completion.create(model='meta-llama--Llama-2-7b-chat-hf', prompt='Say this is a test')\n",
     "print(response)\n",
     "\n",
-    "chatCompletion = openai.ChatCompletion.create(model='llama2', messages=[{'role': 'system', 'content': 'You are a helpful assistant.'}, {'role': 'user', 'content': 'Hello!'}])\n",
+    "chatCompletion = openai.ChatCompletion.create(model='meta-llama--Llama-2-7b-chat-hf', messages=[{'role': 'system', 'content': 'You are a helpful assistant.'}, {'role': 'user', 'content': 'Hello!'}])\n",
     "print(chatCompletion)"
    ]
   },
@@ -718,6 +704,7 @@
    "source": [
     "from langchain.llms import OpenLLM\n",
     "\n",
+    "\n",
     "llm = OpenLLM(server_url='http://localhost:8001')"
    ]
   },
@@ -729,8 +716,9 @@
    },
    "outputs": [],
    "source": [
-    "from langchain.prompts import PromptTemplate\n",
     "from langchain.chains import LLMChain\n",
+    "from langchain.prompts import PromptTemplate\n",
+    "\n",
     "\n",
     "template = 'What is a good name for a company that makes {product}?'\n",
     "\n",
@@ -815,7 +803,7 @@
     "Use OpenLLM to build the model into a standardized distribution unit in BentoML, also known as a Bento. Command:\n",
     "\n",
     "```\n",
-    "openllm build llama --model-id {model-id} --backend [pt|vllm]\n",
+    "openllm build {model-id} --backend [pt|vllm]\n",
     "```"
    ]
   },
@@ -831,7 +819,7 @@
    },
    "outputs": [],
    "source": [
-    "!openllm build llama --model-id NousResearch/llama-2-7b-chat-hf --backend vllm"
+    "!openllm build NousResearch/llama-2-7b-chat-hf --backend vllm"
    ]
   },
   {
@@ -924,9 +912,11 @@
    "outputs": [],
    "source": [
     "###@title Alternatively, use the BentoML client to create a Deployment.\n",
-    "import bentoml\n",
     "import json\n",
     "\n",
+    "import bentoml\n",
+    "\n",
+    "\n",
     "return_code = !bentoml cloud list-context\n",
     "if 'colab-user' not in ''.join(return_code):\n",
     "  print('please login first!')\n",
@@ -943,7 +933,7 @@
     "\n",
     "  res = client.deployment.create(\n",
     "    deployment_name='test-llama2',\n",
-    "    bento='nousresearch--llama-2-7b-chat-hf-service:37892f30c23786c0d5367d80481fa0d9fba93cf8',\n",
+    "    bento='meta-llama--llama-2-7b-chat-hf-service:37892f30c23786c0d5367d80481fa0d9fba93cf8',\n",
     "    context='colab-user',\n",
     "    cluster_name='default',\n",
     "    # mode=\"deployment\",\n",