Skip to content

Commit

Permalink
✨ Add URL content vectorization feature to migration web app
Browse files Browse the repository at this point in the history
  • Loading branch information
sasamuku committed Mar 4, 2025
1 parent 42ec0da commit ee90557
Show file tree
Hide file tree
Showing 9 changed files with 1,188 additions and 154 deletions.
2 changes: 2 additions & 0 deletions frontend/apps/migration-web/.env.template
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
OPENAI_API_KEY="YOUR_API_KEY"
SUPABASE_URL="YOUR_SUPABASE_URL"
SUPABASE_SERVICE_ROLE_KEY="YOUR_SERVICE_ROLE_KEY"
37 changes: 37 additions & 0 deletions frontend/apps/migration-web/app/api/vectorize/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import { vectorizeUrl } from '@/lib/vectorization'
import type { NextRequest } from 'next/server'

// Route segment config read by the framework — presumably selects the Next.js
// Edge runtime for this handler. NOTE(review): confirm that the vectorization
// dependencies imported above are able to run in that runtime.
export const runtime = 'edge'

/**
 * Handles POST requests that ask for a URL's content to be vectorized.
 *
 * Expects a JSON body of the form `{ "url": string }` where `url` is an
 * absolute http(s) URL. Every response carries a JSON body:
 * - 200 `{ success, message, id, chunkCount }` when `vectorizeUrl` succeeds
 * - 400 `{ error }` when the body is not valid JSON, or `url` is missing,
 *   not a string, or not an absolute http(s) URL
 * - 500 `{ error }` when `vectorizeUrl` throws
 *
 * @param req - Incoming request; only its JSON body is read.
 * @returns The JSON response described above.
 */
export async function POST(req: NextRequest) {
  const jsonResponse = (body: Record<string, unknown>, status: number) =>
    new Response(JSON.stringify(body), {
      status,
      headers: { 'Content-Type': 'application/json' },
    })

  // A body that cannot be parsed is a client error, not a server failure.
  let payload: unknown
  try {
    payload = await req.json()
  } catch {
    return jsonResponse({ error: 'Request body must be valid JSON' }, 400)
  }

  // Narrow manually: the body may be any JSON value (null, array, number, ...).
  const url =
    typeof payload === 'object' && payload !== null
      ? (payload as { url?: unknown }).url
      : undefined

  // Only absolute http(s) URLs are accepted, because the URL is fetched
  // server-side by vectorizeUrl.
  let parsedUrl: URL | undefined
  if (typeof url === 'string' && url.length > 0) {
    try {
      parsedUrl = new URL(url)
    } catch {
      parsedUrl = undefined
    }
  }
  const isHttpUrl =
    parsedUrl !== undefined &&
    (parsedUrl.protocol === 'http:' || parsedUrl.protocol === 'https:')

  if (typeof url !== 'string' || !isHttpUrl) {
    return jsonResponse(
      { error: 'URL is not provided or is in an invalid format' },
      400,
    )
  }

  try {
    const result = await vectorizeUrl(url)

    return jsonResponse(
      {
        success: true,
        message: 'Content vectorized and stored successfully',
        id: result.documentId,
        chunkCount: result.chunkCount,
      },
      200,
    )
  } catch (error) {
    console.error('Error in vectorize API:', error)
    return jsonResponse(
      { error: 'An error occurred while processing the URL' },
      500,
    )
  }
}
2 changes: 2 additions & 0 deletions frontend/apps/migration-web/app/review/page.tsx
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import { ReviewWindow } from '@/components/ReviewWindow'
import { UrlVectorizer } from '@/components/UrlVectorizer'
import styles from './page.module.css'

export default function ReviewPage() {
return (
<div className={styles.container}>
<h1 className={styles.title}>Database Schema Review</h1>
<UrlVectorizer endpoint="api/vectorize" />
<ReviewWindow
endpoint="api/review"
placeholder="CREATE TABLE users (id INT PRIMARY KEY, name VARCHAR(255), ...);"
Expand Down
74 changes: 74 additions & 0 deletions frontend/apps/migration-web/components/UrlVectorizer.module.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/* Outer card that wraps the whole URL vectorization widget. */
.container {
margin-top: 2rem;
padding: 1.5rem;
border: 1px solid #e0e0e0;
border-radius: 8px;
background-color: #f9f9f9;
}

/* Widget heading. */
.title {
font-size: 1.5rem;
margin-bottom: 1rem;
color: #333;
}

/* Form body: input row, status messages and description stacked vertically. */
.form {
display: flex;
flex-direction: column;
gap: 1rem;
}

/* Single row holding the URL input and the submit button. */
.inputGroup {
display: flex;
gap: 0.5rem;
}

/* URL text field; flex: 1 lets it take the row width left over by the button. */
.input {
flex: 1;
padding: 0.75rem;
border: 1px solid #ccc;
border-radius: 4px;
font-size: 1rem;
}

/* Submit button. */
.button {
padding: 0.75rem 1.5rem;
background-color: #0070f3;
color: white;
border: none;
border-radius: 4px;
font-size: 1rem;
cursor: pointer;
transition: background-color 0.2s;
}

/* Darker background on hover. */
.button:hover {
background-color: #0060df;
}

/* Greyed-out, non-interactive look while the button is disabled. */
.button:disabled {
background-color: #ccc;
cursor: not-allowed;
}

/* Red-toned banner for error messages. */
.error {
color: #e53e3e;
padding: 0.5rem;
border-radius: 4px;
background-color: #fff5f5;
border: 1px solid #fed7d7;
}

/* Green-toned banner for success messages. */
.success {
color: #38a169;
padding: 0.5rem;
border-radius: 4px;
background-color: #f0fff4;
border: 1px solid #c6f6d5;
}

/* Small helper text. */
.description {
font-size: 0.875rem;
color: #666;
line-height: 1.5;
}
92 changes: 92 additions & 0 deletions frontend/apps/migration-web/components/UrlVectorizer.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
'use client'

import { type FormEvent, useState } from 'react'
import styles from './UrlVectorizer.module.css'

/** Props accepted by the `UrlVectorizer` component. */
export type UrlVectorizerProps = {
/**
 * Path of the API route to POST to, without a leading slash; the component
 * sends its request to `/${endpoint}`.
 */
endpoint: string
}

/**
 * Form that submits a URL to the vectorization API and reports the outcome.
 *
 * The entered value is trimmed and must start with `http://` or `https://`
 * before it is POSTed as `{ url }` JSON to `/${endpoint}`. While the request
 * is in flight the input and button are disabled. Afterwards either a
 * success or an error message is shown, never both.
 *
 * @param endpoint - API route path without a leading slash.
 */
export const UrlVectorizer = ({ endpoint }: UrlVectorizerProps) => {
  const [url, setUrl] = useState<string>('')
  const [isLoading, setIsLoading] = useState<boolean>(false)
  const [error, setError] = useState<string | null>(null)
  const [success, setSuccess] = useState<string | null>(null)

  const handleSubmit = async (e: FormEvent) => {
    e.preventDefault()

    // Reset both banners up front so a stale success message is never left
    // on screen next to a fresh validation error.
    setError(null)
    setSuccess(null)

    // Validate and send the trimmed value so surrounding whitespace (e.g.
    // from a paste) does not cause a spurious "invalid URL" error.
    const trimmedUrl = url.trim()

    if (!trimmedUrl) {
      setError('Please enter a URL')
      return
    }

    if (
      !trimmedUrl.startsWith('http://') &&
      !trimmedUrl.startsWith('https://')
    ) {
      setError('Please enter a valid URL (must start with http:// or https://)')
      return
    }

    setIsLoading(true)

    try {
      const response = await fetch(`/${endpoint}`, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({ url: trimmedUrl }),
      })

      // The body may not be JSON (for example an HTML error page); fall back
      // to null so the HTTP status is reported instead of a parse error.
      const data: unknown = await response.json().catch(() => null)
      const fields =
        typeof data === 'object' && data !== null
          ? (data as { error?: unknown; id?: unknown })
          : undefined

      if (!response.ok) {
        const serverMessage = fields?.error
        throw new Error(
          typeof serverMessage === 'string' && serverMessage
            ? serverMessage
            : `Error: ${response.statusText}`,
        )
      }

      // Only mention the ID when the server actually returned one.
      const idSuffix = fields?.id == null ? '' : ` ID: ${String(fields.id)}`
      setSuccess(`URL content successfully vectorized and stored.${idSuffix}`)
      setUrl('')
    } catch (err) {
      console.error('Error during vectorization:', err)
      setError(
        err instanceof Error
          ? err.message
          : 'An error occurred while processing the URL',
      )
    } finally {
      setIsLoading(false)
    }
  }

  return (
    <div className={styles.container}>
      <h2 className={styles.title}>URL Content Vectorization</h2>
      <form onSubmit={handleSubmit} className={styles.form}>
        <div className={styles.inputGroup}>
          <input
            type="text"
            value={url}
            onChange={(e) => setUrl(e.target.value)}
            placeholder="https://example.com"
            className={styles.input}
            disabled={isLoading}
          />
          <button type="submit" className={styles.button} disabled={isLoading}>
            {isLoading ? 'Processing...' : 'Vectorize'}
          </button>
        </div>

        {error && <div className={styles.error}>{error}</div>}
        {success && <div className={styles.success}>{success}</div>}

        <p className={styles.description}>
          Enter a URL to fetch its content, vectorize it, and store it in the
          database.
        </p>
      </form>
    </div>
  )
}
6 changes: 6 additions & 0 deletions frontend/apps/migration-web/lib/supabase.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import { createClient } from '@supabase/supabase-js'

// Connection settings are read from the environment once, at module load;
// an unset variable falls back to an empty string.
const supabaseUrl = process.env.SUPABASE_URL ?? ''
const supabaseServiceRoleKey = process.env.SUPABASE_SERVICE_ROLE_KEY ?? ''

/**
 * Shared Supabase client built from `SUPABASE_URL` and
 * `SUPABASE_SERVICE_ROLE_KEY`.
 *
 * NOTE(review): the key is named "service role", which presumably carries
 * elevated privileges — confirm this module is only imported by server code.
 */
export const supabaseClient = createClient(supabaseUrl, supabaseServiceRoleKey)
65 changes: 65 additions & 0 deletions frontend/apps/migration-web/lib/vectorization.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import { CheerioWebBaseLoader } from '@langchain/community/document_loaders/web/cheerio'
import { HtmlToTextTransformer } from '@langchain/community/document_transformers/html_to_text'
import { SupabaseVectorStore } from '@langchain/community/vectorstores/supabase'
import { OpenAIEmbeddings } from '@langchain/openai'
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'
import { supabaseClient } from './supabase'

// Embedding client shared by every call in this module; uses the
// 'text-embedding-3-small' model.
const embeddings = new OpenAIEmbeddings({
modelName: 'text-embedding-3-small',
})

// Vector store backed by the shared Supabase client. Configured with the
// 'documents' table and the 'match_documents' query name.
// NOTE(review): both presumably have to exist in the Supabase project — confirm.
const vectorStore = new SupabaseVectorStore(embeddings, {
client: supabaseClient,
tableName: 'documents',
queryName: 'match_documents',
})

/** Result returned by `vectorizeUrl`. */
export type VectorizationResult = {
/**
 * First ID returned by the vector store when the chunks were added;
 * undefined when the store returned no IDs.
 */
documentId?: string
/** Number of text chunks produced from the page. */
chunkCount: number
}

/**
 * Loads a web page, converts it to plain text, splits the text into chunks
 * and adds the chunks to the module-level vector store.
 *
 * Steps:
 * 1. Load the page with `CheerioWebBaseLoader`.
 * 2. Run the HTML-aware splitter piped into `HtmlToTextTransformer`.
 * 3. Replace newlines with spaces and re-split into chunks of at most 300
 *    characters with no overlap.
 * 4. Tag every chunk with `source` (the URL) and `type: 'webpage'` metadata.
 * 5. Add the chunks to the vector store.
 *
 * @param url - Address of the page to vectorize.
 * @returns The first ID returned by the store (if any) and the chunk count.
 *   When the page yields no chunks, nothing is stored and `chunkCount` is 0.
 * @throws Whatever the loader, the splitters or the vector store throw.
 */
export async function vectorizeUrl(url: string): Promise<VectorizationResult> {
  const loader = new CheerioWebBaseLoader(url)
  const docs = await loader.load()

  const transformer = new HtmlToTextTransformer()
  const sequence =
    RecursiveCharacterTextSplitter.fromLanguage('html').pipe(transformer)
  const textDocuments = await sequence.invoke(docs)

  // Flatten each document onto a single line. This is plain synchronous
  // string work, so no Promise wrapper is needed. Metadata is reset here and
  // filled in per chunk below.
  const extractedDocs = textDocuments.map((doc) => ({
    pageContent: doc.pageContent.split('\n').join(' '),
    metadata: {},
  }))

  const splitter = new RecursiveCharacterTextSplitter({
    chunkSize: 300,
    chunkOverlap: 0,
  })

  const chunks = await splitter.splitDocuments(extractedDocs)

  // Nothing to embed or store: skip the embedding and database round trips.
  if (chunks.length === 0) {
    return { documentId: undefined, chunkCount: 0 }
  }

  for (const chunk of chunks) {
    chunk.metadata = {
      ...chunk.metadata,
      source: url,
      type: 'webpage',
    }
  }

  const ids = await vectorStore.addDocuments(chunks)

  return {
    documentId: ids[0],
    chunkCount: chunks.length,
  }
}
4 changes: 3 additions & 1 deletion frontend/apps/migration-web/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@
"private": true,
"version": "0.1.0",
"dependencies": {
"@langchain/community": "0.3.33",
"@langchain/core": "0.3.42",
"@langchain/openai": "0.4.4",
"@supabase/supabase-js": "2.49.1",
"html-to-text": "9.0.5",
"langchain": "0.3.19",
"next": "15.1.2",
"react": "18.3.1",
Expand All @@ -15,7 +18,6 @@
"@types/node": "22.9.0",
"@types/react": "18",
"@types/react-dom": "18",
"supabase": "2.15.8",
"typed-css-modules": "0.9.1",
"typescript": "5"
},
Expand Down
Loading

0 comments on commit ee90557

Please sign in to comment.