-
Notifications
You must be signed in to change notification settings - Fork 49
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
✨ Add URL content vectorization feature to migration web app
- Loading branch information
Showing
9 changed files
with
1,188 additions
and
154 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,3 @@ | ||
OPENAI_API_KEY="YOUR_API_KEY"
SUPABASE_URL="YOUR_SUPABASE_URL"
SUPABASE_SERVICE_ROLE_KEY="YOUR_SERVICE_ROLE_KEY"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
import { vectorizeUrl } from '@/lib/vectorization' | ||
import type { NextRequest } from 'next/server' | ||
|
||
// Route segment config: run this handler on the Edge runtime.
export const runtime = 'edge'
|
||
/** Returns true when `value` parses as an absolute URL with an http(s) scheme. */
function isHttpUrl(value: string): boolean {
  try {
    const { protocol } = new URL(value)
    return protocol === 'http:' || protocol === 'https:'
  } catch {
    return false
  }
}

/**
 * POST handler for the URL vectorization API route.
 *
 * Expects a JSON body of the form `{ "url": "<absolute http(s) URL>" }`, hands
 * the URL to `vectorizeUrl`, and replies with a JSON payload.
 *
 * Responses:
 * - 200: `{ success, message, id, chunkCount }`
 * - 400: the body is not valid JSON, or `url` is missing, not a string, or not
 *   an absolute http(s) URL
 * - 500: `vectorizeUrl` threw
 */
export async function POST(req: NextRequest) {
  const jsonResponse = (body: Record<string, unknown>, status: number) =>
    new Response(JSON.stringify(body), {
      status,
      headers: { 'Content-Type': 'application/json' },
    })

  const badRequest = () =>
    jsonResponse(
      { error: 'URL is not provided or is in an invalid format' },
      400,
    )

  // A malformed body is the caller's mistake, so it is reported as 400 rather
  // than falling through to the generic 500 handler below.
  let payload: unknown
  try {
    payload = await req.json()
  } catch {
    return badRequest()
  }

  // The parsed body may be any JSON value (null, number, array, ...), so only
  // read `url` once we know it is a non-null object.
  const url =
    typeof payload === 'object' && payload !== null
      ? (payload as { url?: unknown }).url
      : undefined

  // NOTE(review): only the scheme is checked here. The URL is user-supplied and
  // is fetched server-side, so consider also rejecting private/internal hosts.
  if (typeof url !== 'string' || !isHttpUrl(url)) {
    return badRequest()
  }

  try {
    const result = await vectorizeUrl(url)

    return jsonResponse(
      {
        success: true,
        message: 'Content vectorized and stored successfully',
        id: result.documentId,
        chunkCount: result.chunkCount,
      },
      200,
    )
  } catch (error) {
    // Log the details server-side; the client only gets a generic message.
    console.error('Error in vectorize API:', error)
    return jsonResponse(
      { error: 'An error occurred while processing the URL' },
      500,
    )
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
74 changes: 74 additions & 0 deletions
74
frontend/apps/migration-web/components/UrlVectorizer.module.css
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
/* Card wrapping the whole vectorizer form. */
.container {
  margin-top: 2rem;
  padding: 1.5rem;
  border: 1px solid #e0e0e0;
  border-radius: 8px;
  background-color: #f9f9f9;
}

.title {
  font-size: 1.5rem;
  margin-bottom: 1rem;
  color: #333;
}

/* Stacks the input row, status messages and description vertically. */
.form {
  display: flex;
  flex-direction: column;
  gap: 1rem;
}

/* Places the URL input and the submit button on one row. */
.inputGroup {
  display: flex;
  gap: 0.5rem;
}

/* The input takes all remaining width next to the button. */
.input {
  flex: 1;
  padding: 0.75rem;
  border: 1px solid #ccc;
  border-radius: 4px;
  font-size: 1rem;
}

.button {
  padding: 0.75rem 1.5rem;
  background-color: #0070f3;
  color: white;
  border: none;
  border-radius: 4px;
  font-size: 1rem;
  cursor: pointer;
  transition: background-color 0.2s;
}

.button:hover {
  background-color: #0060df;
}

/* Declared after :hover so the disabled look wins at equal specificity. */
.button:disabled {
  background-color: #ccc;
  cursor: not-allowed;
}

/* Status banners: red for failures, green for successes. */
.error {
  color: #e53e3e;
  padding: 0.5rem;
  border-radius: 4px;
  background-color: #fff5f5;
  border: 1px solid #fed7d7;
}

.success {
  color: #38a169;
  padding: 0.5rem;
  border-radius: 4px;
  background-color: #f0fff4;
  border: 1px solid #c6f6d5;
}

.description {
  font-size: 0.875rem;
  color: #666;
  line-height: 1.5;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
'use client' | ||
|
||
import { type FormEvent, useState } from 'react' | ||
import styles from './UrlVectorizer.module.css' | ||
|
||
/** Props accepted by the `UrlVectorizer` component. */
export type UrlVectorizerProps = {
  /**
   * Path of the API route to POST to, without a leading slash; the component
   * requests `/${endpoint}`.
   */
  endpoint: string
}
|
||
/**
 * Form that submits a URL to a vectorization endpoint and shows the outcome.
 *
 * The URL is validated client-side (non-empty, http:// or https:// prefix),
 * POSTed as `{ url }` JSON to `/${endpoint}`, and the response is rendered as
 * either a success or an error banner.
 */
export const UrlVectorizer = ({ endpoint }: UrlVectorizerProps) => {
  const [url, setUrl] = useState<string>('')
  const [isLoading, setIsLoading] = useState<boolean>(false)
  const [error, setError] = useState<string | null>(null)
  const [success, setSuccess] = useState<string | null>(null)

  // Shows a validation error and clears any success banner left over from a
  // previous submission so the two are never displayed together.
  const rejectInput = (message: string) => {
    setSuccess(null)
    setError(message)
  }

  const handleSubmit = async (e: FormEvent) => {
    e.preventDefault()

    // Validate and send the trimmed value so stray whitespace neither fails
    // the scheme check nor ends up in the request.
    const trimmedUrl = url.trim()

    if (!trimmedUrl) {
      rejectInput('Please enter a URL')
      return
    }

    if (
      !trimmedUrl.startsWith('http://') &&
      !trimmedUrl.startsWith('https://')
    ) {
      rejectInput(
        'Please enter a valid URL (must start with http:// or https://)',
      )
      return
    }

    setIsLoading(true)
    setError(null)
    setSuccess(null)

    try {
      const response = await fetch(`/${endpoint}`, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({ url: trimmedUrl }),
      })

      // The body is not guaranteed to be JSON (e.g. an HTML error page), so a
      // parse failure must not hide the HTTP status behind a syntax error.
      let data: { id?: string; error?: string } | null = null
      try {
        data = await response.json()
      } catch {
        data = null
      }

      if (!response.ok) {
        // statusText can be empty, so fall back to the numeric status.
        throw new Error(
          data?.error || `Error: ${response.statusText || response.status}`,
        )
      }

      if (!data) {
        throw new Error('An error occurred while processing the URL')
      }

      setSuccess(
        `URL content successfully vectorized and stored. ID: ${data.id}`,
      )
      setUrl('')
    } catch (err) {
      console.error('Error during vectorization:', err)
      setError(
        err instanceof Error
          ? err.message
          : 'An error occurred while processing the URL',
      )
    } finally {
      setIsLoading(false)
    }
  }

  return (
    <div className={styles.container}>
      <h2 className={styles.title}>URL Content Vectorization</h2>
      <form onSubmit={handleSubmit} className={styles.form}>
        <div className={styles.inputGroup}>
          <input
            type="text"
            value={url}
            onChange={(e) => setUrl(e.target.value)}
            placeholder="https://example.com"
            aria-label="URL to vectorize"
            className={styles.input}
            disabled={isLoading}
          />
          <button type="submit" className={styles.button} disabled={isLoading}>
            {isLoading ? 'Processing...' : 'Vectorize'}
          </button>
        </div>

        {error && (
          <div className={styles.error} role="alert">
            {error}
          </div>
        )}
        {success && (
          <div className={styles.success} role="status">
            {success}
          </div>
        )}

        <p className={styles.description}>
          Enter a URL to fetch its content, vectorize it, and store it in the
          database.
        </p>
      </form>
    </div>
  )
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
import { createClient } from '@supabase/supabase-js' | ||
|
||
// Read both variables with static property access and fail fast with a message
// that names the missing one, instead of passing '' through to createClient.
const supabaseUrl = process.env.SUPABASE_URL
const supabaseServiceRoleKey = process.env.SUPABASE_SERVICE_ROLE_KEY

if (!supabaseUrl) {
  throw new Error('SUPABASE_URL environment variable is not set')
}

if (!supabaseServiceRoleKey) {
  throw new Error('SUPABASE_SERVICE_ROLE_KEY environment variable is not set')
}

/**
 * Shared Supabase client authenticated with the service-role key.
 *
 * Server-side only: the service-role key must never reach the browser.
 * Session persistence and token refresh are disabled because this client acts
 * with a static key rather than on behalf of a signed-in user.
 */
export const supabaseClient = createClient(
  supabaseUrl,
  supabaseServiceRoleKey,
  {
    auth: {
      persistSession: false,
      autoRefreshToken: false,
    },
  },
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
import { CheerioWebBaseLoader } from '@langchain/community/document_loaders/web/cheerio' | ||
import { HtmlToTextTransformer } from '@langchain/community/document_transformers/html_to_text' | ||
import { SupabaseVectorStore } from '@langchain/community/vectorstores/supabase' | ||
import { OpenAIEmbeddings } from '@langchain/openai' | ||
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter' | ||
import { supabaseClient } from './supabase' | ||
|
||
// Embedding model used for every chunk that gets stored.
const EMBEDDING_MODEL = 'text-embedding-3-small'

// Table name and query name the Supabase vector store is configured with.
const DOCUMENTS_TABLE = 'documents'
const MATCH_QUERY = 'match_documents'

const embeddings = new OpenAIEmbeddings({
  modelName: EMBEDDING_MODEL,
})

// Module-level store shared by all calls in this module.
const vectorStore = new SupabaseVectorStore(embeddings, {
  client: supabaseClient,
  tableName: DOCUMENTS_TABLE,
  queryName: MATCH_QUERY,
})
|
||
/** Outcome of vectorizing the content of a single URL. */
export type VectorizationResult = {
  /** First id returned by the vector store; absent when nothing was stored. */
  documentId?: string
  /** Number of chunks the content was split into. */
  chunkCount: number
}
|
||
/**
 * Loads the page at `url`, converts it to plain text, splits it into chunks
 * and adds those chunks to the module-level vector store.
 *
 * @param url - Address of the page to load.
 * @returns The first id returned by the store (if any) and the number of
 *   chunks produced.
 * @throws Propagates any error raised while loading, transforming, splitting
 *   or storing the content.
 */
export async function vectorizeUrl(url: string): Promise<VectorizationResult> {
  const loader = new CheerioWebBaseLoader(url)
  const docs = await loader.load()

  // HTML-aware split piped into the HTML-to-text transformer.
  const transformer = new HtmlToTextTransformer()
  const sequence =
    RecursiveCharacterTextSplitter.fromLanguage('html').pipe(transformer)
  const textDocuments = await sequence.invoke(docs)

  // Replace newlines with spaces and start from empty metadata; this mapping
  // is synchronous, so no Promise.all wrapper is needed.
  const extractedDocs = textDocuments.map((doc) => ({
    pageContent: doc.pageContent.split('\n').join(' '),
    metadata: {},
  }))

  const splitter = new RecursiveCharacterTextSplitter({
    chunkSize: 300,
    chunkOverlap: 0,
  })

  const chunks = await splitter.splitDocuments(extractedDocs)

  // Nothing to embed or store, so skip the store call entirely.
  if (chunks.length === 0) {
    return { chunkCount: 0 }
  }

  // Tag every chunk with where it came from.
  for (const chunk of chunks) {
    chunk.metadata = {
      ...chunk.metadata,
      source: url,
      type: 'webpage',
    }
  }

  const ids = await vectorStore.addDocuments(chunks)

  return {
    documentId: ids[0],
    chunkCount: chunks.length,
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.