From d0b41e54703cebf327c23e1577451ab90b169498 Mon Sep 17 00:00:00 2001 From: sykp241095 Date: Sat, 15 Jun 2024 00:16:08 +0800 Subject: [PATCH 1/2] examples: add demo to showcase how to use jina ai embedding api --- .gitignore | 2 + examples/jina-ai-embeddings-demo/README.md | 61 +++++++++ .../jina-ai-embeddings-demo.py | 126 ++++++++++++++++++ .../jina-ai-embeddings-demo/requirements.txt | 6 + 4 files changed, 195 insertions(+) create mode 100644 examples/jina-ai-embeddings-demo/README.md create mode 100644 examples/jina-ai-embeddings-demo/jina-ai-embeddings-demo.py create mode 100644 examples/jina-ai-embeddings-demo/requirements.txt diff --git a/.gitignore b/.gitignore index 81ca073..e160b92 100644 --- a/.gitignore +++ b/.gitignore @@ -139,3 +139,5 @@ cython_debug/ .idea/ django_tests_dir + +*.swp diff --git a/examples/jina-ai-embeddings-demo/README.md b/examples/jina-ai-embeddings-demo/README.md new file mode 100644 index 0000000..77125b5 --- /dev/null +++ b/examples/jina-ai-embeddings-demo/README.md @@ -0,0 +1,61 @@ +# Jina AI Embeddings Demo +This is a simple demo that shows how to use Jina AI to generate embeddings for text data, store the embeddings in TiDB Vector Storage, and search for similar embeddings. 
+ +## Prerequisites + +- A running TiDB Serverless cluster with vector search enabled +- Python 3.8 or later +- Jina AI API key + +## Run the example + +### Clone this repo + +```bash +git clone https://github.com/pingcap/tidb-vector-python.git +``` + +### Create a virtual environment + +```bash +cd tidb-vector-python/examples/jina-ai-embeddings-demo +python3 -m venv .venv +source .venv/bin/activate +``` + +### Install dependencies + +```bash +pip install -r requirements.txt +``` + +### Set the environment variables + +Get the Jina AI API key from the [Jina AI Embedding API](https://jina.ai/embeddings/) page. + +Get the `TIDB_HOST`, `TIDB_USERNAME`, `TIDB_PASSWORD`, `TIDB_PORT` and `TIDB_DATABASE` from the TiDB Cloud console, as described in the [Prerequisites](../README.md#prerequisites) section. + +```bash +export JINAAI_API_KEY="****" +export TIDB_HOST="gateway01.*******.shared.aws.tidbcloud.com" +export TIDB_USERNAME="****.root" +export TIDB_PASSWORD="****" +export TIDB_PORT="4000" +export TIDB_DATABASE="test" +``` +Alternatively, create a `.env` file with the above environment variables. + + +### Run this example + +```text +$ python jina-ai-embeddings-demo.py +- Inserting Data to TiDB... + - Inserting: Jina AI offers best-in-class embeddings, reranker and prompt optimizer, enabling advanced multimodal AI. + - Inserting: TiDB is an open-source MySQL-compatible database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads. 
+- List All Documents and Their Distances to the Query: + - Jina AI offers best-in-class embeddings, reranker and prompt optimizer, enabling advanced multimodal AI.: 0.3585317326132522 + - TiDB is an open-source MySQL-compatible database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads.: 0.10858658947444844 +- The Most Relevant Document and Its Distance to the Query: + - TiDB is an open-source MySQL-compatible database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads.: 0.10858658947444844 +``` \ No newline at end of file diff --git a/examples/jina-ai-embeddings-demo/jina-ai-embeddings-demo.py b/examples/jina-ai-embeddings-demo/jina-ai-embeddings-demo.py new file mode 100644 index 0000000..45adf56 --- /dev/null +++ b/examples/jina-ai-embeddings-demo/jina-ai-embeddings-demo.py @@ -0,0 +1,126 @@ +import os +import requests +import dotenv +dotenv.load_dotenv() + +JINAAI_API_KEY = os.getenv('JINAAI_API_KEY') +assert JINAAI_API_KEY is not None +TIDB_USERNAME = os.getenv('TIDB_USERNAME') +TIDB_PASSWORD = os.getenv('TIDB_PASSWORD') +TIDB_HOST = os.getenv('TIDB_HOST') +TIDB_PORT = os.getenv('TIDB_PORT') +TIDB_DATABASE = os.getenv('TIDB_DATABASE') +assert TIDB_USERNAME is not None +assert TIDB_PASSWORD is not None +assert TIDB_HOST is not None +assert TIDB_PORT is not None +assert TIDB_DATABASE is not None + +TEXTS = [ + 'Jina AI offers best-in-class embeddings, reranker and prompt optimizer, enabling advanced multimodal AI.', + 'TiDB is an open-source MySQL-compatible database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads.', +] + +# 1. 
Get Embeddings from Jina AI +def generate_embeddings(text: str): + JINAAI_API_URL = 'https://api.jina.ai/v1/embeddings' + JINAAI_HEADERS = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {JINAAI_API_KEY}' + } + JINAAI_REQUEST_DATA = { + 'input': [text], + 'model': 'jina-embeddings-v2-base-en' # with dimensions 768 + } + response = requests.post(JINAAI_API_URL, headers=JINAAI_HEADERS, json=JINAAI_REQUEST_DATA) + return response.json()['data'][0]['embedding'] + +data = [] +for text in TEXTS: + embedding = generate_embeddings(text) + data.append({ + 'text': text, + 'embedding': embedding + }) + + +# 2. Connect TiDB Serverless and Create Table +from sqlalchemy import Column, Integer, String, create_engine, URL +from sqlalchemy.orm import Session, declarative_base +from tidb_vector.sqlalchemy import VectorType + +assert os.getenv("TIDB_USERNAME") is not None +assert os.getenv("TIDB_PASSWORD") is not None +assert os.getenv("TIDB_HOST") is not None +assert os.getenv("TIDB_PORT") is not None +assert os.getenv("TIDB_DATABASE") is not None + +url = URL( + drivername="mysql+pymysql", + username=TIDB_USERNAME, + password=TIDB_PASSWORD, + host=TIDB_HOST, + port=int(TIDB_PORT), + database=TIDB_DATABASE, + query={"ssl_verify_cert": True, "ssl_verify_identity": True}, +) +engine = create_engine(url, pool_recycle=300) +Base = declarative_base() + +class Document(Base): + __tablename__ = "jinaai_tidb_demo_documents" + + id = Column(Integer, primary_key=True) + content = Column(String(255), nullable=False) + content_vec = Column( + # DIMENSIONS is determined by the embedding model, + # for Jina AI's jina-embeddings-v2-base-en model it's 768 + VectorType(dim=768), + comment="hnsw(distance=l2)" + ) +# Create the table +Base.metadata.create_all(engine) + + +# 3. 
Insert Data from Jina AI to TiDB +with Session(engine) as session: + print('- Inserting Data to TiDB...') + for item in data: + print(f' - Inserting: {item["text"]}') + session.add(Document( + content=item['text'], + content_vec=item['embedding'] + )) + session.commit() + + +# 4. Query Data from TiDB +query = 'What is TiDB?' +query_embedding = generate_embeddings(query) +with Session(engine) as session: + print('- List All Documents and Their Distances to the Query:') + for doc, distance in session.query( + Document, + Document.content_vec.cosine_distance(query_embedding).label('distance') + ).all(): + print(f' - {doc.content}: {distance}') + + print('- The Most Relevant Document and Its Distance to the Query:') + doc, distance = session.query( + Document, + Document.content_vec.cosine_distance(query_embedding).label('distance') + ).order_by( + 'distance' + ).limit(1).first() + print(f' - {doc.content}: {distance}') + +# Output: +# +# - Inserting Data to TiDB... +# - Inserting: Jina AI offers best-in-class embeddings, reranker and prompt optimizer, enabling advanced multimodal AI. +# - Inserting: TiDB is an open-source MySQL-compatible database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads. 
+# - List All Documents and Their Distances to the Query: +# - Jina AI offers best-in-class embeddings, reranker and prompt optimizer, enabling advanced multimodal AI.: 0.3585317326132522 +# - TiDB is an open-source MySQL-compatible database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads.: 0.10858658947444844 +# - The Most Relevant Document and Its Distance to the Query: +# - TiDB is an open-source MySQL-compatible database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads.: 0.10858658947444844 \ No newline at end of file diff --git a/examples/jina-ai-embeddings-demo/requirements.txt b/examples/jina-ai-embeddings-demo/requirements.txt new file mode 100644 index 0000000..db235b0 --- /dev/null +++ b/examples/jina-ai-embeddings-demo/requirements.txt @@ -0,0 +1,6 @@ +requests +PyMySQL +openai==1.27.0 +SQLAlchemy +tidb-vector>=0.0.9 +python-dotenv \ No newline at end of file From 9b7848a1f8b6b96e7fd89eada3f74496c2e55930 Mon Sep 17 00:00:00 2001 From: sykp241095 Date: Sat, 15 Jun 2024 00:19:04 +0800 Subject: [PATCH 2/2] examples: add demo to showcase how to use jina ai embedding api --- .../jina-ai-embeddings-demo.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/examples/jina-ai-embeddings-demo/jina-ai-embeddings-demo.py b/examples/jina-ai-embeddings-demo/jina-ai-embeddings-demo.py index 45adf56..38659f5 100644 --- a/examples/jina-ai-embeddings-demo/jina-ai-embeddings-demo.py +++ b/examples/jina-ai-embeddings-demo/jina-ai-embeddings-demo.py @@ -1,6 +1,11 @@ import os import requests import dotenv + +from sqlalchemy import Column, Integer, String, create_engine, URL +from sqlalchemy.orm import Session, declarative_base +from tidb_vector.sqlalchemy import VectorType + dotenv.load_dotenv() JINAAI_API_KEY = os.getenv('JINAAI_API_KEY') @@ -45,16 +50,6 @@ def generate_embeddings(text: str): # 2. 
Connect TiDB Serverless and Create Table -from sqlalchemy import Column, Integer, String, create_engine, URL -from sqlalchemy.orm import Session, declarative_base -from tidb_vector.sqlalchemy import VectorType - -assert os.getenv("TIDB_USERNAME") is not None -assert os.getenv("TIDB_PASSWORD") is not None -assert os.getenv("TIDB_HOST") is not None -assert os.getenv("TIDB_PORT") is not None -assert os.getenv("TIDB_DATABASE") is not None - url = URL( drivername="mysql+pymysql", username=TIDB_USERNAME,