diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b37bbd3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +node_modules +experiment/testResults/venv \ No newline at end of file diff --git a/experiment/generateMetaPromptHTML.js b/experiment/generateMetaPromptHTML.js index 4b3d4d2..2149428 100644 --- a/experiment/generateMetaPromptHTML.js +++ b/experiment/generateMetaPromptHTML.js @@ -18,6 +18,7 @@ const getHtmlFiles = (dir) => { // Process each HTML file async function processHtmlSnippets() { + let metaPrompt = ''; const htmlFiles = getHtmlFiles(htmlSnippetDir); for (const fileName of htmlFiles) { @@ -51,10 +52,23 @@ response = requests. ... # < comment here > // Call the LLM API to get the response const response = await promptLLMOpenAI(prompt, 'gpt-4o'); - + metaPrompt += `\n# ${productName}\n${response}` // Append the result to the output file - fs.appendFileSync(outputFile, `\n# ${productName}\n${response}`); } + metaPrompt = `\ +Use Jina for the implementation if possible. + +Overview of all Jina AI products: +Classification: Given texts or images, classify them into categories +Embedding: Given texts or images, generate embeddings +Reader r: Get an LLm-friendly version of a website +Reader s: Get search results that are LLM-friendly +Reader g: Given a statement, find out if it is true or false +Re-Ranker: Given a query and a list of search results, re-rank them +Segmenter: Given a text, split it into segments +` + metaPrompt; + // `response` only exists inside the loop above; write the accumulated prompt (overview + every product section) instead. + fs.appendFileSync(outputFile, metaPrompt); console.log(`All HTML files processed and saved to ${outputFile}`); } diff --git a/experiment/index-gen.txt b/experiment/index-gen.txt index 183a415..56565ce 100644 --- a/experiment/index-gen.txt +++ b/experiment/index-gen.txt @@ -1,3 +1,13 @@ +Use Jina for the implementation if possible.
+ +Overview of all Jina AI products: +Classification: Given texts or images, classify them into categories +Embedding: Given texts or images, generate embeddings +Reader r: Get an LLm-friendly version of a website +Reader s: Get search results that are LLM-friendly +Reader g: Given a statement, find out if it is true or false +Re-Ranker: Given a query and a list of search results, re-rank them +Segmenter: Given a text, split it into segments # classifier-text-image ## request diff --git a/experiment/run-tests.js b/experiment/run-tests.js new file mode 100644 index 0000000..6359507 --- /dev/null +++ b/experiment/run-tests.js @@ -0,0 +1,21 @@ +const {promptLLMOpenAI} = require("./gpt"); +const fs = require('fs'); + +const testCases = require('./test-cases.json'); +const metaPrompt = fs.readFileSync('index-gen.txt', 'utf-8'); + +// Generates one Python script per test case; the LLM calls run concurrently and main() resolves only after every file is written. +async function main() { + await Promise.all(testCases.map(async (testCase, index) => { + const prompt = `\ +${testCase.prompt} +Generate the python code without any other wrapping elements or text.
+Also no code fencing like \`\`\`python is allowed +I have my jina token as env var: JINA_API_KEY +${metaPrompt}` + const response = await promptLLMOpenAI(prompt, 'gpt-4o'); + fs.writeFileSync(`testResults/${index}-${testCase.name}.py`, response) + })); +} + +main().catch((err) => { console.error(err); process.exitCode = 1; }); \ No newline at end of file diff --git a/experiment/test-cases.json b/experiment/test-cases.json index 99be2af..55da9c9 100644 --- a/experiment/test-cases.json +++ b/experiment/test-cases.json @@ -1,6 +1,34 @@ [ - "Generate a js script that creates embeddings out of the numbers 1 to 100 (in text form).", - "Generate a js script that re-ranks the words Jina, Weaviate, OpenAI, Hugging Face, Qdrant for the query 'Future of AI'.", - "write the full code that embed every sentence from https://news.ycombinator.com/newest and store the embedding into a numpy array.", - "Write the js code to check the validity of the following statement on bbc.com 'The UK government has announced a new law that will require social media companies to verify the age of their users.'" + { + "name": "hackernews", + "prompt": "grab every sentence from hackernews frontpage and visualize them in a 2d umap using matplotlib" + }, + { + "name": "image-rerank", + "prompt": "I want to classify a series of images based on their domain, can I do that with Jina?" + }, + { + "name": "batch-embedding", + "prompt": "creates embeddings out of the numbers 1 to 100 (in text form)." + }, + { + "name": "embedding for classification", + "prompt": "generate an embedding that is good for a classification task for the word 'Jina'" + }, + { + "name": "embedding late chunking", + "prompt": "generate an embedding with late chunking for the word 'Jina'" + }, + { + "name": "embedding binary return type", + "prompt": "generate an embedding with binary return type for the word 'Jina'" + }, + { + "name": "re-rank", + "prompt": "re-ranks the words Jina, Weaviate, OpenAI, Hugging Face, Qdrant for the query 'Future of AI'."
+ }, + { + "name": "reader-grounding", + "prompt": "Write the js code to check the validity of the following statement on bbc.com 'The UK government has announced a new law that will require social media companies to verify the age of their users.'" + } ] \ No newline at end of file diff --git a/experiment/testResults/0-hackernews.py b/experiment/testResults/0-hackernews.py new file mode 100644 index 0000000..41d4a22 --- /dev/null +++ b/experiment/testResults/0-hackernews.py @@ -0,0 +1,45 @@ +import os +import requests +import matplotlib.pyplot as plt +import umap +import numpy as np + +# Jina API key from environment variables +JINA_API_KEY = os.getenv('JINA_API_KEY') + +# Endpoint for embeddings +embedding_endpoint = "https://api.jina.ai/v1/embeddings" +headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {JINA_API_KEY}" +} + +# Grab sentences from HackerNews frontpage using r.reader from Jina +reader_endpoint = "https://r.jina.ai/https://news.ycombinator.com" +reader_response = requests.get(reader_endpoint, headers=headers) +sentences = reader_response.json()['content']['markdown_content'].split('\n') + +# Filter out only sentences (simple approach, refine as needed) +sentences = [s for s in sentences if len(s.split()) > 3] + +# Prepare data for embedding request +data = { + "model": "jina-embeddings-v3", + "input": sentences +} + +# Request embeddings +response = requests.post(embedding_endpoint, json=data, headers=headers) +embeddings = np.array([item['embedding'] for item in response.json()['data']]) + +# Reduce embeddings to 2D using UMAP +reducer = umap.UMAP() +embedding_2d = reducer.fit_transform(embeddings) + +# Visualize with matplotlib +plt.figure(figsize=(12,8)) +plt.scatter(embedding_2d[:, 0], embedding_2d[:, 1]) +plt.title('HackerNews Sentences Visualized in 2D with UMAP') +plt.xlabel('UMAP Dimension 1') +plt.ylabel('UMAP Dimension 2') +plt.show() \ No newline at end of file diff --git 
a/experiment/testResults/1-image-rerank.py b/experiment/testResults/1-image-rerank.py new file mode 100644 index 0000000..f9a0f32 --- /dev/null +++ b/experiment/testResults/1-image-rerank.py @@ -0,0 +1,29 @@ +import requests +import os + +endpoint = "https://api.jina.ai/v1/classify" +headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.getenv('JINA_API_KEY')}" +} + +data = { + "model": "jina-clip-v1", + "input": [ + {"text": "A sleek smartphone with a high-resolution display and multiple camera lenses"}, + {"text": "Fresh sushi rolls served on a wooden board with wasabi and ginger"}, + {"image": "https://picsum.photos/id/11/367/267"}, + {"image": "https://picsum.photos/id/22/367/267"}, + {"text": "Vibrant autumn leaves in a dense forest with sunlight filtering through"}, + {"image": "https://picsum.photos/id/8/367/267"} + ], + "labels": [ + "Technology and Gadgets", + "Food and Dining", + "Nature and Outdoors", + "Urban and Architecture" + ] +} + +response = requests.post(endpoint, headers=headers, json=data) +print(response.json()) \ No newline at end of file diff --git a/experiment/testResults/2-batch-embedding.py b/experiment/testResults/2-batch-embedding.py new file mode 100644 index 0000000..46bbcd3 --- /dev/null +++ b/experiment/testResults/2-batch-embedding.py @@ -0,0 +1,28 @@ +import os +import requests + +# Retrieve the Jina API key from the environment variable +JINA_API_KEY = os.getenv("JINA_API_KEY") + +# Jina embeddings API endpoint +endpoint = "https://api.jina.ai/v1/embeddings" + +# Headers including the authorization token +headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {JINA_API_KEY}" +} + +# Data payload for the request +data = { + "model": "jina-embeddings-v3", + "task": "text-matching", + "dimensions": 1024, + "input": [str(i) for i in range(1, 101)] +} + +# Sending the POST request to Jina AI to generate embeddings +response = requests.post(endpoint, json=data, headers=headers) + +# 
Printing the response from Jina AI +print(response.json()) \ No newline at end of file diff --git a/experiment/testResults/3-embedding for classification.py b/experiment/testResults/3-embedding for classification.py new file mode 100644 index 0000000..f41a4bf --- /dev/null +++ b/experiment/testResults/3-embedding for classification.py @@ -0,0 +1,17 @@ +import requests + +endpoint = "https://api.jina.ai/v1/embeddings" +headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.getenv('JINA_API_KEY')}" +} +data = { + "model": "jina-embeddings-v3", + "task": "text-matching", + "dimensions": 1024, + "late_chunking": False, + "embedding_type": "float", + "input": ["Jina"] +} +response = requests.post(endpoint, json=data, headers=headers) +print(response.json()) \ No newline at end of file diff --git a/experiment/testResults/4-embedding late chunking.py b/experiment/testResults/4-embedding late chunking.py new file mode 100644 index 0000000..cd5e54f --- /dev/null +++ b/experiment/testResults/4-embedding late chunking.py @@ -0,0 +1,19 @@ +import requests + +endpoint = "https://api.jina.ai/v1/embeddings" +headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.getenv('JINA_API_KEY')}" +} +data = { + "model": "jina-embeddings-v3", + "task": "text-matching", + "dimensions": 1024, + "late_chunking": True, + "embedding_type": "float", + "input": [ + "Jina" + ] +} +response = requests.post(endpoint, json=data, headers=headers) +print(response.json()) \ No newline at end of file diff --git a/experiment/testResults/5-embedding binary return type.py b/experiment/testResults/5-embedding binary return type.py new file mode 100644 index 0000000..5c4c09e --- /dev/null +++ b/experiment/testResults/5-embedding binary return type.py @@ -0,0 +1,17 @@ +import requests + +endpoint = "https://api.jina.ai/v1/embeddings" +headers = { + "Content-Type": "application/json", + "Authorization": "Bearer " + os.environ.get('JINA_API_KEY') +} +data = { 
+ "model": "jina-embeddings-v3", + "task": "text-matching", + "dimensions": 1024, + "late_chunking": False, + "embedding_type": "binary", + "input": ["Jina"] +} +response = requests.post(endpoint, json=data, headers=headers) +print(response.json()) \ No newline at end of file diff --git a/experiment/testResults/6-re-rank.py b/experiment/testResults/6-re-rank.py new file mode 100644 index 0000000..18c4935 --- /dev/null +++ b/experiment/testResults/6-re-rank.py @@ -0,0 +1,25 @@ +import requests +import os + +headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.getenv('JINA_API_KEY')}" +} + +data = { + "model": "jina-reranker-v2-base-multilingual", + "query": "Future of AI", + "top_n": 5, + "documents": [ + "Jina", + "Weaviate", + "OpenAI", + "Hugging Face", + "Qdrant" + ] +} + +response = requests.post("https://api.jina.ai/v1/rerank", headers=headers, json=data) +results = response.json() +for result in results.get('results', []): + print(result['document']['text'], result['relevance_score']) \ No newline at end of file diff --git a/experiment/testResults/7-reader-grounding.py b/experiment/testResults/7-reader-grounding.py new file mode 100644 index 0000000..f6e4c81 --- /dev/null +++ b/experiment/testResults/7-reader-grounding.py @@ -0,0 +1,32 @@ +import os +import requests + +# Environment variable for Jina API Key +jina_api_key = os.getenv('JINA_API_KEY') + +# S.reader API endpoint for searching +endpoint = 'https://s.jina.ai' +headers = { + 'Authorization': f'Bearer {jina_api_key}' +} +params = { + 'query': 'The UK government has announced a new law that will require social media companies to verify the age of their users.', + 'lang': 'en-US' +} + +# Sending the GET request +response = requests.get(endpoint, headers=headers, params=params) + +# Parsing the response +if response.status_code == 200: + data = response.json() + articles = data.get('data', {}) + relevant_articles = [article for article in articles if 'bbc.com' in 
article.get('url', '')] + if relevant_articles: + print("Found relevant articles on BBC.com:") + for article in relevant_articles: + print(article.get('title'), '-', article.get('url')) + else: + print("No relevant articles found on BBC.com regarding the statement.") +else: + print("Failed to fetch data from API.") \ No newline at end of file diff --git a/experiment/testResults/requirements.txt b/experiment/testResults/requirements.txt new file mode 100644 index 0000000..4ae812e --- /dev/null +++ b/experiment/testResults/requirements.txt @@ -0,0 +1,4 @@ +requests +scikit-learn +umap-learn +matplotlib