Skip to content

Commit

Permalink
feat: more cases
Browse files Browse the repository at this point in the history
  • Loading branch information
florian-hoenicke committed Oct 29, 2024
1 parent 8f989e1 commit 03a9fc9
Show file tree
Hide file tree
Showing 14 changed files with 295 additions and 6 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
node_modules
experiment/testResults/venv
17 changes: 15 additions & 2 deletions experiment/generateMetaPromptHTML.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ const getHtmlFiles = (dir) => {

// Process each HTML file
async function processHtmlSnippets() {
let metaPrompt = '';
const htmlFiles = getHtmlFiles(htmlSnippetDir);

for (const fileName of htmlFiles) {
Expand Down Expand Up @@ -51,10 +52,22 @@ response = requests. ... # < comment here >

// Call the LLM API to get the response
const response = await promptLLMOpenAI(prompt, 'gpt-4o');

metaPrompt += `\n# ${productName}\n${response}`
// Append the result to the output file
fs.appendFileSync(outputFile, `\n# ${productName}\n${response}`);
}
// Put the static product overview in front of the per-product sections that
// were accumulated in `metaPrompt` while iterating over the HTML files.
metaPrompt = `\
Use Jina for the implementation if possible.
Overview of all Jina AI products:
Classification: Given texts or images, classify them into categories
Embedding: Given texts or images, generate embeddings
Reader r: Get an LLm-friendly version of a website
Reader s: Get search results that are LLM-friendly
Reader g: Given a statement, find out if it is true or false
Re-Ranker: Given a query and a list of search results, re-rank them
Segmenter: Given a text, split it into segments
${metaPrompt}`;
// Write the fully assembled prompt in a single call. The previous statement
// interpolated `response`, which is declared inside the loop and therefore
// out of scope here (ReferenceError), and it never wrote `metaPrompt` at all.
// Overwriting (instead of appending) keeps the overview at the top of the file.
fs.writeFileSync(outputFile, metaPrompt);

console.log(`All HTML files processed and saved to ${outputFile}`);
}
Expand Down
10 changes: 10 additions & 0 deletions experiment/index-gen.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
Use Jina for the implementation if possible.

Overview of all Jina AI products:
Classification: Given texts or images, classify them into categories
Embedding: Given texts or images, generate embeddings
Reader r: Get an LLM-friendly version of a website
Reader s: Get search results that are LLM-friendly
Reader g: Given a statement, find out if it is true or false
Re-Ranker: Given a query and a list of search results, re-rank them
Segmenter: Given a text, split it into segments

# classifier-text-image
## request
Expand Down
20 changes: 20 additions & 0 deletions experiment/run-tests.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
const {promptLLMOpenAI} = require("./gpt");
const fs = require('fs');

const testCases = require('./test-cases.json');
const metaPrompt = fs.readFileSync('index-gen.txt', 'utf-8');

/**
 * Generates one Python script per test case.
 *
 * For every entry of `testCases` a prompt is built from the case's own prompt,
 * the fixed output-format instructions and the shared `metaPrompt`; the model
 * answer is written to `testResults/<index>-<name>.py`.
 *
 * All cases are processed concurrently. The returned promise settles only once
 * every file has been written, and it rejects if any request or write fails
 * (an `async` callback passed to `forEach` would be fire-and-forget instead).
 *
 * @returns {Promise<void>} resolves after all result files are written
 */
async function main() {
  await Promise.all(
    testCases.map(async (testCase, index) => {
      // The template body stays at column 0 so no indentation leaks into the prompt.
      const prompt = `\
${testCase.prompt}
Generate the python code without any other wrapping elements or text.
Also no code fencing like \`\`\`python is allowed
I have my jina token as env var: JINA_API_KEY
${metaPrompt}`;
      const response = await promptLLMOpenAI(prompt, 'gpt-4o');
      fs.writeFileSync(`testResults/${index}-${testCase.name}.py`, response);
    }),
  );
}

// Run the generator; report a rejection and mark the process as failed
// instead of leaving the returned promise unhandled.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
36 changes: 32 additions & 4 deletions experiment/test-cases.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,34 @@
[
"Generate a js script that creates embeddings out of the numbers 1 to 100 (in text form).",
"Generate a js script that re-ranks the words Jina, Weaviate, OpenAI, Hugging Face, Qdrant for the query 'Future of AI'.",
"write the full code that embed every sentence from https://news.ycombinator.com/newest and store the embedding into a numpy array.",
"Write the js code to check the validity of the following statement on bbc.com 'The UK government has announced a new law that will require social media companies to verify the age of their users.'"
{
"name": "hackernews",
"prompt": "grab every sentence from hackernews frontpage and visualize them in a 2d umap using matplotlib"
},
{
"name": "image-rerank",
"prompt": "I want to classify a series of images based on their domain, can I do that with Jina?"
},
{
"name": "batch-embedding",
"prompt": "creates embeddings out of the numbers 1 to 100 (in text form)."
},
{
"name": "embedding for classification",
"prompt": "generate an embedding that is good for a classification task for the word 'Jina'"
},
{
"name": "embedding late chunking",
"prompt": "generate an embedding with late chunking for the word 'Jina'"
},
{
"name": "embedding binary return type",
"prompt": "generate an embedding with binary return type for the word 'Jina'"
},
{
"name": "re-rank",
"prompt": "re-ranks the words Jina, Weaviate, OpenAI, Hugging Face, Qdrant for the query 'Future of AI'."
},
{
"name": "reader-grounding",
"prompt": "check the validity of the following statement on bbc.com 'The UK government has announced a new law that will require social media companies to verify the age of their users.'"
}
]
45 changes: 45 additions & 0 deletions experiment/testResults/0-hackernews.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import os
import requests
import matplotlib.pyplot as plt
import umap
import numpy as np

# Script: embed the lines of the HackerNews front page with Jina and plot them
# in 2D using UMAP + matplotlib.

# Jina API key from environment variables
JINA_API_KEY = os.getenv('JINA_API_KEY')

# Endpoint and headers for the embeddings API (JSON request body)
embedding_endpoint = "https://api.jina.ai/v1/embeddings"
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {JINA_API_KEY}"
}

# Grab the HackerNews front page through the Jina Reader (r.jina.ai).
# The Reader is asked for JSON explicitly via the Accept header, because the
# response is parsed with .json() below; in JSON mode the page text is expected
# under data.content.
reader_endpoint = "https://r.jina.ai/https://news.ycombinator.com"
reader_headers = {
    "Accept": "application/json",
    "Authorization": f"Bearer {JINA_API_KEY}"
}
reader_response = requests.get(reader_endpoint, headers=reader_headers)
reader_response.raise_for_status()  # fail loudly instead of parsing an error page
sentences = reader_response.json()['data']['content'].split('\n')

# Filter out only sentences (simple approach, refine as needed):
# keep lines with more than three whitespace-separated words
sentences = [s for s in sentences if len(s.split()) > 3]

# Prepare data for embedding request
data = {
    "model": "jina-embeddings-v3",
    "input": sentences
}

# Request embeddings and stack them into a (num_sentences, dim) array
response = requests.post(embedding_endpoint, json=data, headers=headers)
response.raise_for_status()
embeddings = np.array([item['embedding'] for item in response.json()['data']])

# Reduce embeddings to 2D using UMAP
reducer = umap.UMAP()
embedding_2d = reducer.fit_transform(embeddings)

# Visualize with matplotlib
plt.figure(figsize=(12,8))
plt.scatter(embedding_2d[:, 0], embedding_2d[:, 1])
plt.title('HackerNews Sentences Visualized in 2D with UMAP')
plt.xlabel('UMAP Dimension 1')
plt.ylabel('UMAP Dimension 2')
plt.show()
29 changes: 29 additions & 0 deletions experiment/testResults/1-image-rerank.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import requests
import os

# Script: classify a mix of texts and image URLs into a fixed set of labels
# via the Jina classification endpoint and print the raw JSON answer.

CLASSIFY_URL = "https://api.jina.ai/v1/classify"

# Candidate categories the inputs are matched against
candidate_labels = [
    "Technology and Gadgets",
    "Food and Dining",
    "Nature and Outdoors",
    "Urban and Architecture",
]

# Inputs to classify; each item is either a text or an image URL
samples = [
    {"text": "A sleek smartphone with a high-resolution display and multiple camera lenses"},
    {"text": "Fresh sushi rolls served on a wooden board with wasabi and ginger"},
    {"image": "https://picsum.photos/id/11/367/267"},
    {"image": "https://picsum.photos/id/22/367/267"},
    {"text": "Vibrant autumn leaves in a dense forest with sunlight filtering through"},
    {"image": "https://picsum.photos/id/8/367/267"},
]

api_key = os.getenv('JINA_API_KEY')
request_headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}",
}
payload = {
    "model": "jina-clip-v1",
    "input": samples,
    "labels": candidate_labels,
}

reply = requests.post(CLASSIFY_URL, headers=request_headers, json=payload)
print(reply.json())
28 changes: 28 additions & 0 deletions experiment/testResults/2-batch-embedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import os
import requests

# Script: embed the numbers 1..100 (as text) in one batched request to the
# Jina embeddings API and print the raw JSON answer.

EMBEDDINGS_URL = "https://api.jina.ai/v1/embeddings"

# The texts "1", "2", ..., "100"
numbers_as_text = list(map(str, range(1, 101)))

# The token is read from the environment
api_token = os.getenv("JINA_API_KEY")
request_headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_token}",
}

# Request body: model, task and dimensions are passed along with the inputs
payload = {
    "model": "jina-embeddings-v3",
    "task": "text-matching",
    "dimensions": 1024,
    "input": numbers_as_text,
}

# Send the batch and show what came back
reply = requests.post(EMBEDDINGS_URL, json=payload, headers=request_headers)
print(reply.json())
17 changes: 17 additions & 0 deletions experiment/testResults/3-embedding for classification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Script: request an embedding of the word "Jina" that is tuned for use in a
# classification task, and print the raw JSON answer.
import os  # needed for os.getenv below; was missing (NameError at runtime)

import requests

endpoint = "https://api.jina.ai/v1/embeddings"
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {os.getenv('JINA_API_KEY')}"
}
data = {
    "model": "jina-embeddings-v3",
    # The embedding is meant for a classifier, so request the classification
    # task rather than "text-matching".
    "task": "classification",
    "dimensions": 1024,
    "late_chunking": False,
    "embedding_type": "float",
    "input": ["Jina"]
}
response = requests.post(endpoint, json=data, headers=headers)
print(response.json())
19 changes: 19 additions & 0 deletions experiment/testResults/4-embedding late chunking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Script: request an embedding of the word "Jina" with late chunking enabled
# and print the raw JSON answer.
import os  # needed for os.getenv below; was missing (NameError at runtime)

import requests

endpoint = "https://api.jina.ai/v1/embeddings"
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {os.getenv('JINA_API_KEY')}"
}
data = {
    "model": "jina-embeddings-v3",
    "task": "text-matching",
    "dimensions": 1024,
    "late_chunking": True,  # the feature this script demonstrates
    "embedding_type": "float",
    "input": [
        "Jina"
    ]
}
response = requests.post(endpoint, json=data, headers=headers)
print(response.json())
17 changes: 17 additions & 0 deletions experiment/testResults/5-embedding binary return type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Script: request an embedding of the word "Jina" with the binary return type
# and print the raw JSON answer.
import os  # needed to read JINA_API_KEY below; was missing (NameError at runtime)

import requests

endpoint = "https://api.jina.ai/v1/embeddings"
headers = {
    "Content-Type": "application/json",
    # f-string instead of "Bearer " + value: concatenation raises TypeError
    # when the variable is unset (None)
    "Authorization": f"Bearer {os.getenv('JINA_API_KEY')}"
}
data = {
    "model": "jina-embeddings-v3",
    "task": "text-matching",
    "dimensions": 1024,
    "late_chunking": False,
    "embedding_type": "binary",  # the feature this script demonstrates
    "input": ["Jina"]
}
response = requests.post(endpoint, json=data, headers=headers)
print(response.json())
25 changes: 25 additions & 0 deletions experiment/testResults/6-re-rank.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import requests
import os

# Script: re-rank five names against the query "Future of AI" with the Jina
# reranker and print each returned document text with its relevance score.

RERANK_URL = "https://api.jina.ai/v1/rerank"

# Documents to be ordered by relevance to the query
candidates = ["Jina", "Weaviate", "OpenAI", "Hugging Face", "Qdrant"]

api_key = os.getenv('JINA_API_KEY')
auth_headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}",
}
payload = {
    "model": "jina-reranker-v2-base-multilingual",
    "query": "Future of AI",
    "top_n": 5,
    "documents": candidates,
}

reply = requests.post(RERANK_URL, headers=auth_headers, json=payload)

# A missing "results" key yields no output rather than an error
ranking = reply.json().get('results', [])
for entry in ranking:
    print(entry['document']['text'], entry['relevance_score'])
32 changes: 32 additions & 0 deletions experiment/testResults/7-reader-grounding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import os
import requests

# Script: search for a statement via s.jina.ai and list the hits whose URL
# contains "bbc.com".
# NOTE(review): this only lists matching search hits; it does not judge whether
# the statement is true. Confirm whether the search endpoint is the intended one.

# API key is taken from the environment
jina_api_key = os.getenv('JINA_API_KEY')

STATEMENT = 'The UK government has announced a new law that will require social media companies to verify the age of their users.'
search_url = 'https://s.jina.ai'

# Issue the GET request with the statement as query
response = requests.get(
    search_url,
    headers={'Authorization': f'Bearer {jina_api_key}'},
    params={'query': STATEMENT, 'lang': 'en-US'},
)

if response.status_code != 200:
    print("Failed to fetch data from API.")
else:
    # Keep only entries whose URL mentions bbc.com
    articles = response.json().get('data', {})
    bbc_hits = [hit for hit in articles if 'bbc.com' in hit.get('url', '')]
    if not bbc_hits:
        print("No relevant articles found on BBC.com regarding the statement.")
    else:
        print("Found relevant articles on BBC.com:")
        for hit in bbc_hits:
            print(hit.get('title'), '-', hit.get('url'))
4 changes: 4 additions & 0 deletions experiment/testResults/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
requests
scikit-learn
umap-learn
matplotlib

0 comments on commit 03a9fc9

Please sign in to comment.