feat: more cases

jina-ai · Oct 29, 2024 · 03a9fc9 · 03a9fc9
1 parent 8f989e1
commit 03a9fc9
Show file tree

Hide file tree

Showing 14 changed files with 295 additions and 6 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+node_modules
+experiment/testResults/venv
diff --git a/experiment/generateMetaPromptHTML.js b/experiment/generateMetaPromptHTML.js
@@ -18,6 +18,7 @@ const getHtmlFiles = (dir) => {
 
 // Process each HTML file
 async function processHtmlSnippets() {
+    let metaPrompt = '';
     const htmlFiles = getHtmlFiles(htmlSnippetDir);
 
     for (const fileName of htmlFiles) {
@@ -51,10 +52,22 @@ response = requests. ... # < comment here >
 
         // Call the LLM API to get the response
         const response = await promptLLMOpenAI(prompt, 'gpt-4o');
-
+        metaPrompt += `\n# ${productName}\n${response}`
         // Append the result to the output file
-        fs.appendFileSync(outputFile, `\n# ${productName}\n${response}`);
     }
+    metaPrompt = `\
+Use Jina for the implementation if possible.
+
+Overview of all Jina AI products:
+Classification: Given texts or images, classify them into categories
+Embedding: Given texts or images, generate embeddings
+Reader r: Get an LLm-friendly version of a website
+Reader s: Get search results that are LLM-friendly
+Reader g: Given a statement, find out if it is true or false
+Re-Ranker: Given a query and a list of search results, re-rank them
+Segmenter: Given a text, split it into segments
+` + metaPrompt; // put the product overview in front of the per-product sections collected in the loop
+    fs.appendFileSync(outputFile, metaPrompt); // write the assembled prompt once; `response` is declared inside the loop and is not in scope here
 
     console.log(`All HTML files processed and saved to ${outputFile}`);
 }

diff --git a/experiment/index-gen.txt b/experiment/index-gen.txt
@@ -1,3 +1,13 @@
+Use Jina for the implementation if possible.
+
+Overview of all Jina AI products:
+Classification: Given texts or images, classify them into categories
+Embedding: Given texts or images, generate embeddings
+Reader r: Get an LLm-friendly version of a website
+Reader s: Get search results that are LLM-friendly
+Reader g: Given a statement, find out if it is true or false
+Re-Ranker: Given a query and a list of search results, re-rank them
+Segmenter: Given a text, split it into segments
 
 # classifier-text-image
 ## request

diff --git a/experiment/run-tests.js b/experiment/run-tests.js
@@ -0,0 +1,20 @@
+const {promptLLMOpenAI} = require("./gpt");
+const fs = require('fs');
+
+const testCases = require('./test-cases.json');
+const metaPrompt = fs.readFileSync('index-gen.txt', 'utf-8');
+
+// Generates one Python script per test case; Promise.all makes main() wait for every LLM request and file write.
+async function main() {
+    await Promise.all(testCases.map(async (testCase, index) => {
+        const prompt = `\
+${testCase.prompt}
+Generate the python code without any other wrapping elements or text.
+Also no code fencing like \`\`\`python is allowed
+I have my jina token as env var: JINA_API_KEY
+${metaPrompt}`;
+        const response = await promptLLMOpenAI(prompt, 'gpt-4o');
+        fs.writeFileSync(`testResults/${index}-${testCase.name}.py`, response);
+    }));
+}
+main().catch((err) => { console.error(err); process.exitCode = 1; }); // report failures instead of an unhandled rejection
diff --git a/experiment/test-cases.json b/experiment/test-cases.json
@@ -1,6 +1,34 @@
 [
-  "Generate a js script that creates embeddings out of the numbers 1 to 100 (in text form).",
-  "Generate a js script that re-ranks the words Jina, Weaviate, OpenAI, Hugging Face, Qdrant for the query 'Future of AI'.",
-  "write the full code that embed every sentence from https://news.ycombinator.com/newest and store the embedding into a numpy array.",
-  "Write the js code to check the validity of the following statement on bbc.com 'The UK government has announced a new law that will require social media companies to verify the age of their users.'"
+  {
+    "name": "hackernews",
+    "prompt": "grab every sentence from hackernews frontpage and visualize them in a 2d umap using matplotlib"
+  },
+  {
+    "name": "image-rerank",
+    "prompt": "I want to classify a series of images based on their domain, can I do that with Jina?"
+  },
+  {
+    "name": "batch-embedding",
+    "prompt": "creates embeddings out of the numbers 1 to 100 (in text form)."
+  },
+  {
+    "name": "embedding for classification",
+    "prompt": "generate an embedding that is good for a classification task for the word 'Jina'"
+  },
+  {
+    "name": "embedding late chunking",
+    "prompt": "generate an embedding with late chunking for the word 'Jina'"
+  },
+  {
+    "name": "embedding binary return type",
+    "prompt": "generate an embedding with binary return type for the word 'Jina'"
+  },
+  {
+    "name": "re-rank",
+    "prompt": "re-ranks the words Jina, Weaviate, OpenAI, Hugging Face, Qdrant for the query 'Future of AI'."
+  },
+  {
+    "name": "reader-grounding",
+    "prompt": "check the validity of the following statement on bbc.com 'The UK government has announced a new law that will require social media companies to verify the age of their users.'"
+  }
 ]
diff --git a/experiment/testResults/0-hackernews.py b/experiment/testResults/0-hackernews.py
@@ -0,0 +1,49 @@
+import os
+import requests
+import matplotlib.pyplot as plt
+import umap
+import numpy as np
+
+# Jina API key from environment variables
+JINA_API_KEY = os.getenv('JINA_API_KEY')
+
+# Endpoint for embeddings
+embedding_endpoint = "https://api.jina.ai/v1/embeddings"
+headers = {
+    "Content-Type": "application/json",
+    "Authorization": f"Bearer {JINA_API_KEY}"
+}
+
+# Grab the HackerNews frontpage through the Jina Reader (r.jina.ai).
+# The Reader answers with plain text unless JSON is requested via the Accept header,
+# so the body is read as text (one candidate sentence per line) instead of being parsed as JSON.
+reader_endpoint = "https://r.jina.ai/https://news.ycombinator.com"
+reader_response = requests.get(reader_endpoint, headers=headers)
+reader_response.raise_for_status()
+sentences = reader_response.text.split('\n')
+
+# Filter out only sentences (simple approach, refine as needed)
+sentences = [s for s in sentences if len(s.split()) > 3]
+
+# Prepare data for embedding request
+data = {
+    "model": "jina-embeddings-v3",
+    "input": sentences
+}
+
+# Request embeddings; fail on an HTTP error here rather than with a KeyError on 'data' below
+response = requests.post(embedding_endpoint, json=data, headers=headers)
+response.raise_for_status()
+embeddings = np.array([item['embedding'] for item in response.json()['data']])
+
+# Reduce embeddings to 2D using UMAP
+reducer = umap.UMAP()
+embedding_2d = reducer.fit_transform(embeddings)
+
+# Visualize with matplotlib
+plt.figure(figsize=(12,8))
+plt.scatter(embedding_2d[:, 0], embedding_2d[:, 1])
+plt.title('HackerNews Sentences Visualized in 2D with UMAP')
+plt.xlabel('UMAP Dimension 1')
+plt.ylabel('UMAP Dimension 2')
+plt.show()
diff --git a/experiment/testResults/1-image-rerank.py b/experiment/testResults/1-image-rerank.py
@@ -0,0 +1,29 @@
+import requests
+import os
+
+# Classify a mixed batch of texts and image URLs against four candidate labels and print the JSON reply.
+api_token = os.getenv('JINA_API_KEY')
+candidate_labels = [
+    "Technology and Gadgets",
+    "Food and Dining",
+    "Nature and Outdoors",
+    "Urban and Architecture"
+]
+samples = [
+    {"text": "A sleek smartphone with a high-resolution display and multiple camera lenses"},
+    {"text": "Fresh sushi rolls served on a wooden board with wasabi and ginger"},
+    {"image": "https://picsum.photos/id/11/367/267"},
+    {"image": "https://picsum.photos/id/22/367/267"},
+    {"text": "Vibrant autumn leaves in a dense forest with sunlight filtering through"},
+    {"image": "https://picsum.photos/id/8/367/267"}
+]
+
+reply = requests.post(
+    "https://api.jina.ai/v1/classify",
+    headers={
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {api_token}"
+    },
+    json={"model": "jina-clip-v1", "input": samples, "labels": candidate_labels}
+)
+print(reply.json())
diff --git a/experiment/testResults/2-batch-embedding.py b/experiment/testResults/2-batch-embedding.py
@@ -0,0 +1,28 @@
+import os
+import requests
+
+# The bearer token is taken from the JINA_API_KEY environment variable
+api_key = os.getenv("JINA_API_KEY")
+
+# URL of the embeddings endpoint
+url = "https://api.jina.ai/v1/embeddings"
+
+# JSON content type plus bearer authorization
+request_headers = {
+    "Content-Type": "application/json",
+    "Authorization": f"Bearer {api_key}"
+}
+
+# One batched request: the numbers 1 through 100 rendered as strings
+payload = {
+    "model": "jina-embeddings-v3",
+    "task": "text-matching",
+    "dimensions": 1024,
+    "input": list(map(str, range(1, 101)))
+}
+
+# POST the payload to the embeddings endpoint
+result = requests.post(url, json=payload, headers=request_headers)
+
+# Print the decoded JSON body
+print(result.json())
diff --git a/experiment/testResults/3-embedding for classification.py b/experiment/testResults/3-embedding for classification.py
@@ -0,0 +1,19 @@
+import os
+import requests
+
+# Request a jina-embeddings-v3 embedding of the word "Jina" for a classification task and print the JSON reply.
+endpoint = "https://api.jina.ai/v1/embeddings"
+headers = {
+    "Content-Type": "application/json",
+    "Authorization": f"Bearer {os.getenv('JINA_API_KEY')}"
+}
+data = {
+    "model": "jina-embeddings-v3",
+    "task": "classification",  # task matching the requested downstream use (was "text-matching")
+    "dimensions": 1024,
+    "late_chunking": False,
+    "embedding_type": "float",
+    "input": ["Jina"]
+}
+response = requests.post(endpoint, json=data, headers=headers)
+print(response.json())
diff --git a/experiment/testResults/4-embedding late chunking.py b/experiment/testResults/4-embedding late chunking.py
@@ -0,0 +1,21 @@
+import os
+import requests
+
+# Request a jina-embeddings-v3 embedding of the word "Jina" with late chunking enabled and print the JSON reply.
+endpoint = "https://api.jina.ai/v1/embeddings"
+headers = {
+    "Content-Type": "application/json",
+    "Authorization": f"Bearer {os.getenv('JINA_API_KEY')}"
+}
+data = {
+    "model": "jina-embeddings-v3",
+    "task": "text-matching",
+    "dimensions": 1024,
+    "late_chunking": True,
+    "embedding_type": "float",
+    "input": [
+        "Jina"
+    ]
+}
+response = requests.post(endpoint, json=data, headers=headers)
+print(response.json())
diff --git a/experiment/testResults/5-embedding binary return type.py b/experiment/testResults/5-embedding binary return type.py
@@ -0,0 +1,19 @@
+import os
+import requests
+
+# Request a jina-embeddings-v3 embedding of the word "Jina" with embedding_type "binary" and print the JSON reply.
+endpoint = "https://api.jina.ai/v1/embeddings"
+headers = {
+    "Content-Type": "application/json",
+    "Authorization": "Bearer " + os.environ.get('JINA_API_KEY')
+}
+data = {
+    "model": "jina-embeddings-v3",
+    "task": "text-matching",
+    "dimensions": 1024,
+    "late_chunking": False,
+    "embedding_type": "binary",
+    "input": ["Jina"]
+}
+response = requests.post(endpoint, json=data, headers=headers)
+print(response.json())
diff --git a/experiment/testResults/6-re-rank.py b/experiment/testResults/6-re-rank.py
@@ -0,0 +1,25 @@
+import requests
+import os
+
+# Re-rank five names against the query "Future of AI" and print each document text with its score.
+candidates = ["Jina", "Weaviate", "OpenAI", "Hugging Face", "Qdrant"]
+
+auth_headers = {
+    "Content-Type": "application/json",
+    "Authorization": f"Bearer {os.getenv('JINA_API_KEY')}"
+}
+
+payload = {
+    "model": "jina-reranker-v2-base-multilingual",
+    "query": "Future of AI",
+    "top_n": 5,
+    "documents": candidates
+}
+
+# POST the payload to the rerank endpoint
+reply = requests.post("https://api.jina.ai/v1/rerank", headers=auth_headers, json=payload)
+
+# A reply without a 'results' key prints nothing
+ranked = reply.json().get('results', [])
+for entry in ranked:
+    print(entry['document']['text'], entry['relevance_score'])
diff --git a/experiment/testResults/7-reader-grounding.py b/experiment/testResults/7-reader-grounding.py
@@ -0,0 +1,37 @@
+import os
+import requests
+from urllib.parse import quote
+
+# Environment variable for Jina API Key
+jina_api_key = os.getenv('JINA_API_KEY')
+
+# S.reader API endpoint for searching; the URL-encoded query is appended as the path
+endpoint = 'https://s.jina.ai'
+statement = 'The UK government has announced a new law that will require social media companies to verify the age of their users.'
+headers = {
+    'Authorization': f'Bearer {jina_api_key}',
+    # Request JSON explicitly: response.json() below cannot parse the default text output
+    'Accept': 'application/json'
+}
+params = {
+    # NOTE(review): 'lang' is kept from the original request; confirm the API honours it
+    'lang': 'en-US'
+}
+
+# Sending the GET request
+response = requests.get(f"{endpoint}/{quote(statement, safe='')}", headers=headers, params=params)
+
+# Parsing the response
+if response.status_code == 200:
+    data = response.json()
+    # 'data' is iterated as a list of result dicts; fall back to an empty list when it is missing or null
+    articles = data.get('data') or []
+    relevant_articles = [article for article in articles if 'bbc.com' in article.get('url', '')]
+    if relevant_articles:
+        print("Found relevant articles on BBC.com:")
+        for article in relevant_articles:
+            print(article.get('title'), '-', article.get('url'))
+    else:
+        print("No relevant articles found on BBC.com regarding the statement.")
+else:
+    print("Failed to fetch data from API.")
diff --git a/experiment/testResults/requirements.txt b/experiment/testResults/requirements.txt
@@ -0,0 +1,4 @@
+requests
+scikit-learn
+umap-learn
+matplotlib