-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
examples: add demo to showcase how to use jina ai embedding api (#35)
- Loading branch information
1 parent
477fde3
commit 46732a0
Showing
4 changed files
with
190 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -139,3 +139,5 @@ cython_debug/ | |
|
||
.idea/ | ||
django_tests_dir | ||
|
||
*.swp |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
# Jina AI Embeddings Demo | ||
This is a simple demo to show how to use Jina AI to generate embeddings for text data. Then store the embeddings in TiDB Vector Storage and search for similar embeddings. | ||
|
||
## Prerequisites | ||
|
||
- A running TiDB Serverless cluster with vector search enabled | ||
- Python 3.8 or later | ||
- Jina AI API key | ||
|
||
## Run the example | ||
|
||
### Clone this repo | ||
|
||
```bash | ||
git clone https://github.com/pingcap/tidb-vector-python.git | ||
``` | ||
|
||
### Create a virtual environment | ||
|
||
```bash | ||
cd tidb-vector-python/examples/jina-ai-embeddings-demo | ||
python3 -m venv .venv | ||
source .venv/bin/activate | ||
``` | ||
|
||
### Install dependencies | ||
|
||
```bash | ||
pip install -r requirements.txt | ||
``` | ||
|
||
### Set the environment variables | ||
|
||
Get the Jina AI API key from the [Jina AI Embedding API](https://jina.ai/embeddings/) page | ||
|
||
Get the `TIDB_HOST`, `TIDB_USERNAME`, `TIDB_PASSWORD`, `TIDB_HOST`, `TIDB_PORT` and `TIDB_DATABASE` from the TiDB Cloud console, as described in the [Prerequisites](../README.md#prerequisites) section. | ||
|
||
```bash | ||
export JINA_API_KEY="****" | ||
export TIDB_HOST="gateway01.*******.shared.aws.tidbcloud.com" | ||
export TIDB_USERNAME="****.root" | ||
export TIDB_PASSWORD="****" | ||
export TIDB_PORT="4000" | ||
export TIDB_DATABASE="test" | ||
``` | ||
or create a `.env` file with the above environment variables. | ||
|
||
|
||
### Run this example | ||
|
||
```text | ||
$ python jina-ai-embeddings-demo.py | ||
- Inserting Data to TiDB... | ||
- Inserting: Jina AI offers best-in-class embeddings, reranker and prompt optimizer, enabling advanced multimodal AI. | ||
- Inserting: TiDB is an open-source MySQL-compatible database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads. | ||
- List All Documents and Their Distances to the Query: | ||
- Jina AI offers best-in-class embeddings, reranker and prompt optimizer, enabling advanced multimodal AI.: 0.3585317326132522 | ||
- TiDB is an open-source MySQL-compatible database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads.: 0.10858658947444844 | ||
- The Most Relevant Document and Its Distance to the Query: | ||
- TiDB is an open-source MySQL-compatible database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads.: 0.10858658947444844 | ||
``` |
121 changes: 121 additions & 0 deletions
121
examples/jina-ai-embeddings-demo/jina-ai-embeddings-demo.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
import os | ||
import requests | ||
import dotenv | ||
|
||
from sqlalchemy import Column, Integer, String, create_engine, URL | ||
from sqlalchemy.orm import Session, declarative_base | ||
from tidb_vector.sqlalchemy import VectorType | ||
|
||
dotenv.load_dotenv() | ||
|
||
JINAAI_API_KEY = os.getenv('JINAAI_API_KEY') | ||
assert JINAAI_API_KEY is not None | ||
TIDB_USERNAME = os.getenv('TIDB_USERNAME') | ||
TIDB_PASSWORD = os.getenv('TIDB_PASSWORD') | ||
TIDB_HOST = os.getenv('TIDB_HOST') | ||
TIDB_PORT = os.getenv('TIDB_PORT') | ||
TIDB_DATABASE = os.getenv('TIDB_DATABASE') | ||
assert TIDB_USERNAME is not None | ||
assert TIDB_PASSWORD is not None | ||
assert TIDB_HOST is not None | ||
assert TIDB_PORT is not None | ||
assert TIDB_DATABASE is not None | ||
|
||
TEXTS = [ | ||
'Jina AI offers best-in-class embeddings, reranker and prompt optimizer, enabling advanced multimodal AI.', | ||
'TiDB is an open-source MySQL-compatible database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads.', | ||
] | ||
|
||
# 1. Get Embeddings from Jina AI | ||
def generate_embeddings(text: str): | ||
JINAAI_API_URL = 'https://api.jina.ai/v1/embeddings' | ||
JINAAI_HEADERS = { | ||
'Content-Type': 'application/json', | ||
'Authorization': f'Bearer {JINAAI_API_KEY}' | ||
} | ||
JINAAI_REQUEST_DATA = { | ||
'input': [text], | ||
'model': 'jina-embeddings-v2-base-en' # with dimisions 768 | ||
} | ||
response = requests.post(JINAAI_API_URL, headers=JINAAI_HEADERS, json=JINAAI_REQUEST_DATA) | ||
return response.json()['data'][0]['embedding'] | ||
|
||
data = [] | ||
for text in TEXTS: | ||
embedding = generate_embeddings(text) | ||
data.append({ | ||
'text': text, | ||
'embedding': embedding | ||
}) | ||
|
||
|
||
# 2. Connect TiDB Serverless and Create Table | ||
url = URL( | ||
drivername="mysql+pymysql", | ||
username=TIDB_USERNAME, | ||
password=TIDB_PASSWORD, | ||
host=TIDB_HOST, | ||
port=int(TIDB_PORT), | ||
database=TIDB_DATABASE, | ||
query={"ssl_verify_cert": True, "ssl_verify_identity": True}, | ||
) | ||
engine = create_engine(url, pool_recycle=300) | ||
Base = declarative_base() | ||
|
||
class Document(Base): | ||
__tablename__ = "jinaai_tidb_demo_documents" | ||
|
||
id = Column(Integer, primary_key=True) | ||
content = Column(String(255), nullable=False) | ||
content_vec = Column( | ||
# DIMENSIONS is determined by the embedding model, | ||
# for Jina AI's jina-embeddings-v2-base-en model it's 768 | ||
VectorType(dim=768), | ||
comment="hnsw(distance=l2)" | ||
) | ||
# Create the table | ||
Base.metadata.create_all(engine) | ||
|
||
|
||
# 3. Insert Data from Jina AI to TiDB | ||
with Session(engine) as session: | ||
print('- Inserting Data to TiDB...') | ||
for item in data: | ||
print(f' - Inserting: {item["text"]}') | ||
session.add(Document( | ||
content=item['text'], | ||
content_vec=item['embedding'] | ||
)) | ||
session.commit() | ||
|
||
|
||
# 4. Query Data from TiDB | ||
query = 'What is TiDB?' | ||
query_embedding = generate_embeddings(query) | ||
with Session(engine) as session: | ||
print('- List All Documents and Their Distances to the Query:') | ||
for doc, distance in session.query( | ||
Document, | ||
Document.content_vec.cosine_distance(query_embedding).label('distance') | ||
).all(): | ||
print(f' - {doc.content}: {distance}') | ||
|
||
print('- The Most Relevant Document and Its Distance to the Query:') | ||
doc, distance = session.query( | ||
Document, | ||
Document.content_vec.cosine_distance(query_embedding).label('distance') | ||
).order_by( | ||
'distance' | ||
).limit(1).first() | ||
print(f' - {doc.content}: {distance}') | ||
|
||
# Output: | ||
# | ||
# - Inserting Data to TiDB... | ||
# - Inserting: Jina AI offers best-in-class embeddings, reranker and prompt optimizer, enabling advanced multimodal AI. | ||
# - Inserting: TiDB is an open-source MySQL-compatible database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads. | ||
# - List All Documents and Their Distances to the Query: | ||
# - Jina AI offers best-in-class embeddings, reranker and prompt optimizer, enabling advanced multimodal AI.: 0.3585317326132522 | ||
# - TiDB is an open-source MySQL-compatible database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads.: 0.10858658947444844 | ||
# - The Most Relevant Document and Its Distance to the Query: | ||
# - TiDB is an open-source MySQL-compatible database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads.: 0.10858658947444844 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
requests | ||
PyMySQL | ||
openai==1.27.0 | ||
SQLAlchemy | ||
tidb-vector>=0.0.9 | ||
python-dotenv |