Add URL filtering to Cheerio scraper. Also fix multiple issues with link limit enforcement. #1417

Open · wants to merge 10 commits into base: main
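In short, the PR lets the Cheerio web crawl keep only links that start with one of the configured base URL prefixes and drop links that start with any excluded prefix. Below is a minimal sketch of that rule, not the PR code itself; the helper name `shouldCrawl` is invented for illustration, and both prefix lists are assumed to be already lowercased, since the loader lowercases its inputs.

```ts
// Sketch of the include/exclude prefix rule this PR applies during web crawling (illustrative only)
function shouldCrawl(url: string, includePrefixes: string[], excludePrefixes: string[]): boolean {
    const u = url.toLowerCase()
    // If any include prefixes are configured, the URL must start with at least one of them
    if (includePrefixes.length > 0 && !includePrefixes.some((p) => u.startsWith(p))) return false
    // If the URL starts with any excluded prefix, skip it
    if (excludePrefixes.length > 0 && excludePrefixes.some((p) => u.startsWith(p))) return false
    return true
}

// shouldCrawl('https://example.com/docs/intro', ['https://example.com/docs'], ['https://example.com/docs/archive'])     === true
// shouldCrawl('https://example.com/docs/archive/old', ['https://example.com/docs'], ['https://example.com/docs/archive']) === false
```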
106 changes: 93 additions & 13 deletions packages/components/nodes/documentloaders/Cheerio/Cheerio.ts
@@ -1,10 +1,11 @@
import { INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { SelectorType } from 'cheerio'
import { parse } from 'css-what'
import { Document } from 'langchain/document'
import { CheerioWebBaseLoader, WebBaseLoaderParams } from 'langchain/document_loaders/web/cheerio'
import { TextSplitter } from 'langchain/text_splitter'
import { test } from 'linkifyjs'
import { parse } from 'css-what'
import { webCrawl, xmlScrape } from '../../../src'
import { SelectorType } from 'cheerio'
import { INode, INodeData, INodeParams } from '../../../src/Interface'

class Cheerio_DocumentLoaders implements INode {
label: string
@@ -76,6 +77,23 @@ class Cheerio_DocumentLoaders implements INode {
optional: true,
additionalParams: true
},
{
label: 'Base URL Prefixes',
name: 'urlFilter',
type: 'string',
description: 'Comma-delimited list. If specified, only links that start with one of these URL prefixes will be retrieved. (Web Crawl only)',
optional: true,
additionalParams: true
},
{
label: 'Excluded URL Prefixes',
name: 'exUrlFilter',
type: 'string',
description:
"Delimited by comma. If specified, only links that don't start with this URL will be retrieved. (Web Crawl only)",
optional: true,
additionalParams: true
},
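For example, the two new fields might be filled like this (hypothetical values only, not part of the diff). Entries should include the protocol and be separated by commas with no surrounding spaces, since the loader lowercases the whole string and splits on ',' without trimming each entry.

```ts
// Illustrative node inputs for the new fields (example values)
const inputs = {
    urlFilter: 'https://example.com/docs,https://example.com/blog', // crawl only these sections
    exUrlFilter: 'https://example.com/docs/archive' // but skip the archive subtree
}
```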
{
label: 'Metadata',
name: 'metadata',
@@ -106,18 +124,53 @@ class Cheerio_DocumentLoaders implements INode {
params['selector'] = selector
}

async function cheerioLoader(url: string): Promise<any> {
const baseUrlFilters = ((nodeData.inputs?.urlFilter as string)?.trim()?.toLowerCase() || '').split(',').filter((x) => !!x)
const exBaseUrlFilter = ((nodeData.inputs?.exUrlFilter as string)?.trim()?.toLowerCase() || '').split(',').filter((x) => !!x)

console.info(`Prefix urls: ${baseUrlFilters.join(',')}`)
console.info(`Excluded urls: ${exBaseUrlFilter.join(',')}`)

const loadedDocUrls = new Set<string>()

async function cheerioLoader(url: string): Promise<Document[]> {
try {
let docs = []
let docs = [] as Document[]
if (!!baseUrlFilters?.length && !baseUrlFilters.some((baseUrl) => url.toLowerCase().startsWith(baseUrl))) {
console.info(`scraping - skipping url ${url} because it does not start with ${baseUrlFilters}`)
return docs
}

if (!!exBaseUrlFilter?.length && exBaseUrlFilter.some((exBaseUrl) => url.toLowerCase().startsWith(exBaseUrl))) {
console.info(`scraping - skipping url ${url} because it starts with ${exBaseUrlFilter}`)
return docs
}

const loader = new CheerioWebBaseLoader(url, params)
console.info(`scraping - loading url ${url}`)
if (textSplitter) {
docs = await loader.loadAndSplit(textSplitter)
} else {
docs = await loader.load()
}
return docs

const newDocs = docs.filter(
(doc) =>
!loadedDocUrls.has(doc.metadata.source) &&
(!baseUrlFilters || baseUrlFilters.some((baseUrl) => doc.metadata.source.toLowerCase().startsWith(baseUrl))) &&
(!exBaseUrlFilter || !exBaseUrlFilter.some((exBaseUrl) => doc.metadata.source.toLowerCase().startsWith(exBaseUrl)))
)

newDocs
.map((doc) => doc.metadata.source)
.forEach((docUrl) => {
if (!loadedDocUrls.has(docUrl)) loadedDocUrls.add(docUrl)
})

console.info(`scraping - loaded ${newDocs.length} new docs from ${url}`)
return newDocs
} catch (err) {
if (process.env.DEBUG === 'true') console.error(`error in CheerioWebBaseLoader: ${err.message}, on page: ${url}`)
console.error(`error in CheerioWebBaseLoader: ${err.message}, on page: ${url}`)
return []
}
}
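Stripped of logging, the de-duplication added to `cheerioLoader` amounts to keeping one document per `metadata.source` across all crawled pages. A compact sketch of that pattern, assuming `Document` from `langchain/document`:

```ts
import { Document } from 'langchain/document'

// Keep only documents whose source URL has not been returned before (sketch of the dedup above)
const seenSources = new Set<string>()
function dedupeBySource(docs: Document[]): Document[] {
    return docs.filter((doc) => {
        const source = String(doc.metadata.source ?? '')
        if (seenSources.has(source)) return false
        seenSources.add(source)
        return true
    })
}
```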

@@ -126,13 +179,33 @@ class Cheerio_DocumentLoaders implements INode {
if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`)
if (!limit) limit = '10'
else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0')
const pages: string[] =
relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit))
console.info(`scrape limit: ${limit}`)
console.info(`scraping url: ${url}`)
let pages: string[] =
relativeLinksMethod === 'webCrawl'
? await webCrawl(url, parseInt(limit), baseUrlFilters, exBaseUrlFilter)
: await xmlScrape(url, parseInt(limit))

if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
if (!pages || pages.length === 0) throw new Error('No relative links found')
for (const page of pages) {
docs.push(...(await cheerioLoader(page)))
if (!pages || !Array.isArray(pages) || pages.length === 0) {
console.warn(`No relative links found for ${url}`)
return
}

if (!!limit && parseInt(limit) > 0) {
console.info(`scraping limit to ${limit}`)
pages = pages.slice(0, parseInt(limit)) // limit docs to be returned
}

try {
console.info(`scraping found ${pages.length} pages: ${pages.join(', ')}`)
for (const page of pages) {
docs.push(...(await cheerioLoader(page)))
}
} catch (err) {
console.error(`error in CheerioWebBaseLoader: ${err.message}, on page: ${url}`)
}

if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`)
} else {
docs = await cheerioLoader(url)
@@ -141,6 +214,7 @@ class Cheerio_DocumentLoaders implements INode {
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
let finaldocs = []

for (const doc of docs) {
const newdoc = {
...doc,
@@ -154,6 +228,12 @@ class Cheerio_DocumentLoaders implements INode {
return finaldocs
}

console.info(`scraped ${docs.length} docs from ${url}`)
if (!!limit && parseInt(limit) > 0 && docs.length > parseInt(limit)) {
console.info(`scraped docs limiting to ${limit}`)
docs = docs.slice(0, parseInt(limit)) // limit docs to be returned
}

return docs
}
}
13 changes: 4 additions & 9 deletions packages/components/nodes/vectorstores/Pinecone/Pinecone.ts
@@ -1,8 +1,8 @@
import { flatten } from 'lodash'
import { Pinecone } from '@pinecone-database/pinecone'
import { PineconeLibArgs, PineconeStore } from 'langchain/vectorstores/pinecone'
import { Embeddings } from 'langchain/embeddings/base'
import { Document } from 'langchain/document'
import { Embeddings } from 'langchain/embeddings/base'
import { PineconeLibArgs, PineconeStore } from 'langchain/vectorstores/pinecone'
import { flatten } from 'lodash'
import { ICommonObject, INode, INodeData, INodeOutputsValue, INodeParams } from '../../../src/Interface'
import { getBaseClasses, getCredentialData, getCredentialParam } from '../../../src/utils'

@@ -113,12 +113,7 @@ class Pinecone_VectorStores implements INode {
const pineconeIndex = client.Index(index)

const flattenDocs = docs && docs.length ? flatten(docs) : []
const finalDocs = []
for (let i = 0; i < flattenDocs.length; i += 1) {
if (flattenDocs[i] && flattenDocs[i].pageContent) {
finalDocs.push(new Document(flattenDocs[i]))
}
}
const finalDocs = flattenDocs.filter((doc) => !!doc?.pageContent).map((doc) => new Document(doc))

const obj: PineconeLibArgs = {
pineconeIndex
65 changes: 52 additions & 13 deletions packages/components/src/utils.ts
@@ -1,14 +1,15 @@
import axios from 'axios'
import { load } from 'cheerio'
import { AES, enc } from 'crypto-js'
import * as fs from 'fs'
import * as path from 'path'
import { JSDOM } from 'jsdom'
import { z } from 'zod'
import { ChatMessageHistory } from 'langchain/memory'
import { AIMessage, HumanMessage, BaseMessage } from 'langchain/schema'
import * as path from 'path'
import { DataSource } from 'typeorm'
import { z } from 'zod'
import { ICommonObject, IDatabaseEntity, IMessage, INodeData } from './Interface'
import { AES, enc } from 'crypto-js'
import { ChatMessageHistory } from 'langchain/memory'
import { AIMessage, HumanMessage, BaseMessage } from 'langchain/schema'

Check failure on line 12 in packages/components/src/utils.ts (GitHub Actions / build, ubuntu-latest, 18.15.0): 'AIMessage' is already defined
Check failure on line 12 in packages/components/src/utils.ts (GitHub Actions / build, ubuntu-latest, 18.15.0): 'HumanMessage' is already defined
Check failure on line 12 in packages/components/src/utils.ts (GitHub Actions / build, ubuntu-latest, 18.15.0): 'BaseMessage' is already defined

export const numberOrExpressionRegex = '^(\\d+\\.?\\d*|{{.*}})$' //return true if string consists only numbers OR expression {{}}
export const notEmptyRegex = '(.|\\s)*\\S(.|\\s)*' //return true if string is not empty or blank
@@ -300,15 +301,21 @@
* @param {string} urlString
* @returns {string}
*/
function normalizeURL(urlString: string): string {
function normalizeURL(urlString: string, removeBookmark?: boolean): string {
const urlObj = new URL(urlString)
const hostPath = urlObj.hostname + urlObj.pathname
let hostPath = urlObj.hostname + urlObj.pathname
if (hostPath.length > 0 && hostPath.slice(-1) == '/') {
// handling trailing slash
return hostPath.slice(0, -1)
}
if (removeBookmark && urlString.includes('#')) {
// handling bookmark
hostPath = hostPath.substring(0, hostPath.indexOf('#'))
}

return hostPath
}
const LARGE_FILE_EXTENSIONS = ['zip', 'tar', 'rar', 'jar', 'arj', 'gz'] //todo: add full listing
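To illustrate the check that uses this list: the crawler takes the last path segment of the normalized URL, reads everything after the final dot as the extension, and returns early instead of fetching when it looks like a large archive. A small worked example (illustrative values, not the diff code):

```ts
// Illustrative walk-through of the large-file check applied in crawl() (example values)
const normalizedUrl = 'https://example.com/downloads/data.zip'
const lastSegment = normalizedUrl.substring(normalizedUrl.lastIndexOf('/') + 1) // 'data.zip'
const dotPos = lastSegment.lastIndexOf('.')
const urlExt = dotPos > -1 && dotPos < lastSegment.length - 1 ? lastSegment.substring(dotPos + 1) : '' // 'zip'
// LARGE_FILE_EXTENSIONS.includes(urlExt) === true, so this URL is skipped rather than fetched
```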

/**
* Recursive crawl using normalizeURL and getURLsFromHTML
@@ -318,23 +325,51 @@
* @param {number} limit
* @returns {Promise<string[]>}
*/
async function crawl(baseURL: string, currentURL: string, pages: string[], limit: number): Promise<string[]> {
async function crawl(
baseURL: string,
currentURL: string,
pages: string[],
limit: number,
prefixUrls?: string[],
exPrefixUrls?: string[]
): Promise<string[]> {
const baseURLObj = new URL(baseURL)
const currentURLObj = new URL(currentURL)

if (limit !== 0 && pages.length === limit) return pages
if (limit > 0 && pages.length >= limit) {
console.info(`crawl limit reached: ${limit}`)
return pages
}

if (baseURLObj.hostname !== currentURLObj.hostname) return pages

const normalizeCurrentURL = baseURLObj.protocol + '//' + normalizeURL(currentURL)
const normalizeCurrentURL = baseURLObj.protocol + '//' + normalizeURL(currentURL, true)

const lastSec = normalizeCurrentURL.substring(normalizeCurrentURL.lastIndexOf('/') + 1)
const dotPos = lastSec.lastIndexOf('.')
const urlExt = dotPos > -1 && dotPos < lastSec.length - 1 ? lastSec.substring(dotPos + 1) : ''

// fix iterable error when crawling a zip file, most likely caused by a timeout
if (!!urlExt && LARGE_FILE_EXTENSIONS.includes(urlExt)) return pages

if (!!prefixUrls?.length && !prefixUrls.some((prefixUrl) => normalizeCurrentURL.toLowerCase().startsWith(prefixUrl))) {
console.info(`skipping url ${normalizeCurrentURL} because it does not start with any required prefix urls.`)
return pages
}

if (!!exPrefixUrls?.length && exPrefixUrls.some((exPrefixUrl) => normalizeCurrentURL.toLowerCase().startsWith(exPrefixUrl))) {
console.info(`skipping url ${normalizeCurrentURL} because it starts with one or more excluded prefix urls.`)
return pages
}

if (pages.includes(normalizeCurrentURL)) {
return pages
}

pages.push(normalizeCurrentURL)

if (process.env.DEBUG === 'true') console.info(`actively crawling ${currentURL}`)
try {
console.info(`crawling ${currentURL}`)
const resp = await fetch(currentURL)

if (resp.status > 399) {
@@ -349,13 +384,17 @@
}

const htmlBody = await resp.text()
console.info(`crawled ${currentURL}`)
const nextURLs = getURLsFromHTML(htmlBody, baseURL)
for (const nextURL of nextURLs) {
pages = await crawl(baseURL, nextURL, pages, limit)
}
} catch (err) {
if (process.env.DEBUG === 'true') console.error(`error in fetch url: ${err.message}, on page: ${currentURL}`)
console.error(`error in fetch url: ${err.message}, on page: ${currentURL}`)
}

console.info(`crawled ${pages.length} pages so far, limit: ${limit}`)

return pages
}

@@ -365,10 +404,10 @@
* @param {number} limit
* @returns {Promise<string[]>}
*/
export async function webCrawl(stringURL: string, limit: number): Promise<string[]> {
export async function webCrawl(stringURL: string, limit: number, baseUrls?: string[], exBaseUrls?: string[]): Promise<string[]> {
const URLObj = new URL(stringURL)
const modifyURL = stringURL.slice(-1) === '/' ? stringURL.slice(0, -1) : stringURL
return await crawl(URLObj.protocol + '//' + URLObj.hostname, modifyURL, [], limit)
return await crawl(URLObj.protocol + '//' + URLObj.hostname, modifyURL, [], limit, baseUrls, exBaseUrls)
}
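A hypothetical call using the extended `webCrawl` signature (values are illustrative; the prefix lists are compared case-insensitively against `protocol + '//' + hostname + pathname` of each discovered link):

```ts
// Illustrative usage of webCrawl with the new optional filter arguments (inside an async function)
const pages = await webCrawl(
    'https://example.com/docs', // start URL
    25, // crawl at most 25 pages
    ['https://example.com/docs'], // only follow links under /docs
    ['https://example.com/docs/v1'] // but skip a hypothetical /docs/v1 subtree
)
```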

export function getURLsFromXML(xmlBody: string, limit: number): string[] {