diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8100a6ae8..efcfc53fa 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -14,9 +14,11 @@ repos:
         alias: r
         verbose: true
         args: [--exit-non-zero-on-fix, --show-fixes]
+        types_or: [python, pyi, jupyter]
       - id: ruff-format
         alias: rf
         verbose: true
+        types_or: [python, pyi, jupyter]
   - repo: local
     hooks:
       - id: mypy
diff --git a/examples/openllm-llama2-demo/openllm_llama2_demo.ipynb b/examples/openllm-llama2-demo/openllm_llama2_demo.ipynb
index 6df20b5e8..d95694985 100644
--- a/examples/openllm-llama2-demo/openllm_llama2_demo.ipynb
+++ b/examples/openllm-llama2-demo/openllm_llama2_demo.ipynb
@@ -96,6 +96,7 @@
     "import psutil\n",
     "import torch\n",
     "\n",
+    "\n",
     "ram = psutil.virtual_memory()\n",
     "ram_total = ram.total / (1024**3)\n",
     "print('MemTotal: %.2f GB' % ram_total)\n",
@@ -344,31 +345,8 @@
    "source": [
     "import openllm\n",
     "\n",
-    "llm = openllm.LLM('meta-llama/Llama-2-7b-chat-hf', backend='vllm') # NOTE: You don't have to pass 'backend' if vllm is available in the environment"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "6slh7cFpwohd"
-   },
-   "source": [
-    "You can manually trigger the model download if it is not downloaded automatically."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "ukY2Cugr8tBD",
-    "outputId": "bba81848-865b-4286-b052-542e556d3c4b"
-   },
-   "outputs": [],
-   "source": [
-    "llm.save_pretrained()"
+    "\n",
+    "llm = openllm.LLM('meta-llama/Llama-2-7b-chat-hf', backend='vllm')"
    ]
   },
   {
@@ -393,8 +371,10 @@
    "outputs": [],
    "source": [
     "import asyncio\n",
+    "\n",
     "import nest_asyncio\n",
     "\n",
+    "\n",
     "nest_asyncio.apply()\n",
     "\n",
     "\n",
@@ -457,8 +437,10 @@
    "outputs": [],
    "source": [
     "import gc\n",
+    "\n",
     "import torch\n",
     "\n",
+    "\n",
     "torch.cuda.empty_cache()\n",
     "gc.collect()"
    ]
@@ -498,7 +480,7 @@
    },
    "outputs": [],
    "source": [
-    "!nohup openllm start llama --model-id NousResearch/llama-2-7b-chat-hf --port 8001 --backend vllm > openllm.log 2>&1 &"
+    "!nohup openllm start meta-llama/Llama-2-7b-chat-hf --port 8001 --backend vllm > openllm.log 2>&1 &"
    ]
   },
   {
@@ -596,6 +578,7 @@
    "source": [
     "import sys\n",
     "\n",
+    "\n",
     "if 'google.colab' in sys.modules:\n",
     "  # using colab proxy URL\n",
     "  from google.colab.output import eval_js\n",
@@ -622,13 +605,14 @@
    "source": [
     "import openllm\n",
     "\n",
+    "\n",
     "# sync API\n",
-    "client = openllm.client.HTTPClient('http://127.0.0.1:8001', timeout=120)\n",
+    "client = openllm.HTTPClient('http://127.0.0.1:8001', timeout=120)\n",
     "res = client.generate('What is the weight of the earth?', max_new_tokens=8192)\n",
     "\n",
     "# Async API\n",
-    "# async_client = openllm.client.AsyncHTTPClient(\"http://127.0.0.1:8001\", timeout=120)\n",
-    "# res = await async_client.query(\"what is the weight of the earth?\", max_new_tokens=8192)\n",
+    "# async_client = openllm.AsyncHTTPClient(\"http://127.0.0.1:8001\", timeout=120)\n",
+    "# res = await async_client.generate(\"what is the weight of the earth?\", max_new_tokens=8192)\n",
     "print(res.outputs[0].text)"
    ]
   },
@@ -665,7 +649,8 @@
     "  'http://127.0.0.1:8001/v1/generate_stream' \\\n",
     "  -H 'accept: text/event-stream' \\\n",
     "  -H 'Content-Type: application/json' \\\n",
-    "  -d '{\"prompt\":\"write a tagline for an ice cream shop\\n\", \"llm_config\": {\"max_new_tokens\": 256}}'"
+    "  -d '{\"prompt\":\"write a tagline for an ice cream shop\n",
+    "\", \"llm_config\": {\"max_new_tokens\": 256}}'"
    ]
   },
   {
@@ -687,13 +672,14 @@
    "source": [
     "import openai\n",
     "\n",
+    "\n",
     "openai.api_base = 'http://localhost:8001/v1'\n",
     "openai.api_key = 'na'\n",
     "\n",
-    "response = openai.Completion.create(model='llama2', prompt='Say this is a test')\n",
+    "response = openai.Completion.create(model='meta-llama--Llama-2-7b-chat-hf', prompt='Say this is a test')\n",
     "print(response)\n",
     "\n",
-    "chatCompletion = openai.ChatCompletion.create(model='llama2', messages=[{'role': 'system', 'content': 'You are a helpful assistant.'}, {'role': 'user', 'content': 'Hello!'}])\n",
+    "chatCompletion = openai.ChatCompletion.create(model='meta-llama--Llama-2-7b-chat-hf', messages=[{'role': 'system', 'content': 'You are a helpful assistant.'}, {'role': 'user', 'content': 'Hello!'}])\n",
     "print(chatCompletion)"
    ]
   },
@@ -718,6 +704,7 @@
    "source": [
     "from langchain.llms import OpenLLM\n",
     "\n",
+    "\n",
     "llm = OpenLLM(server_url='http://localhost:8001')"
    ]
   },
@@ -729,8 +716,9 @@
    },
    "outputs": [],
    "source": [
-    "from langchain.prompts import PromptTemplate\n",
     "from langchain.chains import LLMChain\n",
+    "from langchain.prompts import PromptTemplate\n",
+    "\n",
     "\n",
     "template = 'What is a good name for a company that makes {product}?'\n",
     "\n",
@@ -815,7 +803,7 @@
     "Use OpenLLM to build the model into a standardized distribution unit in BentoML, also known as a Bento. Command:\n",
     "\n",
     "```\n",
-    "openllm build llama --model-id {model-id} --backend [pt|vllm]\n",
+    "openllm build {model-id} --backend [pt|vllm]\n",
     "```"
    ]
   },
@@ -831,7 +819,7 @@
    },
    "outputs": [],
    "source": [
-    "!openllm build llama --model-id NousResearch/llama-2-7b-chat-hf --backend vllm"
+    "!openllm build NousResearch/llama-2-7b-chat-hf --backend vllm"
    ]
   },
   {
@@ -924,9 +912,11 @@
    "outputs": [],
    "source": [
     "###@title Alternatively, use the BentoML client to create a Deployment.\n",
-    "import bentoml\n",
     "import json\n",
     "\n",
+    "import bentoml\n",
+    "\n",
+    "\n",
     "return_code = !bentoml cloud list-context\n",
     "if 'colab-user' not in ''.join(return_code):\n",
     "  print('please login first!')\n",
@@ -943,7 +933,7 @@
     "\n",
     "  res = client.deployment.create(\n",
     "    deployment_name='test-llama2',\n",
-    "    bento='nousresearch--llama-2-7b-chat-hf-service:37892f30c23786c0d5367d80481fa0d9fba93cf8',\n",
+    "    bento='meta-llama--llama-2-7b-chat-hf-service:37892f30c23786c0d5367d80481fa0d9fba93cf8',\n",
     "    context='colab-user',\n",
     "    cluster_name='default',\n",
     "    # mode=\"deployment\",\n",