Add URL filtering to Cheerio scraper. Also fix multiple issues with link limit enforcement. #1417

Open · wants to merge 10 commits into base: main
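In short, the PR lets the Cheerio web crawl keep only links that start with one of the configured base URL prefixes and drop links that start with any excluded prefix. Below is a minimal sketch of that rule, not the PR code itself; the helper name `shouldCrawl` is invented for illustration, and both prefix lists are assumed to be already lowercased, since the loader lowercases its inputs.

```ts
// Sketch of the include/exclude prefix rule this PR applies during web crawling (illustrative only)
function shouldCrawl(url: string, includePrefixes: string[], excludePrefixes: string[]): boolean {
    const u = url.toLowerCase()
    // If any include prefixes are configured, the URL must start with at least one of them
    if (includePrefixes.length > 0 && !includePrefixes.some((p) => u.startsWith(p))) return false
    // If the URL starts with any excluded prefix, skip it
    if (excludePrefixes.length > 0 && excludePrefixes.some((p) => u.startsWith(p))) return false
    return true
}

// shouldCrawl('https://example.com/docs/intro', ['https://example.com/docs'], ['https://example.com/docs/archive'])     === true
// shouldCrawl('https://example.com/docs/archive/old', ['https://example.com/docs'], ['https://example.com/docs/archive']) === false
```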
106 changes: 93 additions & 13 deletions packages/components/nodes/documentloaders/Cheerio/Cheerio.ts
@@ -1,10 +1,11 @@
import { INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { SelectorType } from 'cheerio'
import { parse } from 'css-what'
import { Document } from 'langchain/document'
import { CheerioWebBaseLoader, WebBaseLoaderParams } from 'langchain/document_loaders/web/cheerio'
import { TextSplitter } from 'langchain/text_splitter'
import { test } from 'linkifyjs'
import { parse } from 'css-what'
import { webCrawl, xmlScrape } from '../../../src'
import { SelectorType } from 'cheerio'
import { INode, INodeData, INodeParams } from '../../../src/Interface'

class Cheerio_DocumentLoaders implements INode {
label: string
@@ -76,6 +77,23 @@ class Cheerio_DocumentLoaders implements INode {
optional: true,
additionalParams: true
},
{
label: 'Base URL Prefixes',
name: 'urlFilter',
type: 'string',
description: 'Comma-delimited list. If specified, only links that start with one of these URL prefixes will be retrieved. (Web Crawl only)',
optional: true,
additionalParams: true
},
{
label: 'Excluded URL Prefixes',
name: 'exUrlFilter',
type: 'string',
description:
"Delimited by comma. If specified, only links that don't start with this URL will be retrieved. (Web Crawl only)",
optional: true,
additionalParams: true
},
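For example, the two new fields might be filled like this (hypothetical values only, not part of the diff). Entries should include the protocol and be separated by commas with no surrounding spaces, since the loader lowercases the whole string and splits on ',' without trimming each entry.

```ts
// Illustrative node inputs for the new fields (example values)
const inputs = {
    urlFilter: 'https://example.com/docs,https://example.com/blog', // crawl only these sections
    exUrlFilter: 'https://example.com/docs/archive' // but skip the archive subtree
}
```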
{
label: 'Metadata',
name: 'metadata',
@@ -106,18 +124,53 @@ class Cheerio_DocumentLoaders implements INode {
params['selector'] = selector
}

async function cheerioLoader(url: string): Promise<any> {
const baseUrlFilters = ((nodeData.inputs?.urlFilter as string)?.trim()?.toLowerCase() || '').split(',').filter((x) => !!x)
const exBaseUrlFilter = ((nodeData.inputs?.exUrlFilter as string)?.trim()?.toLowerCase() || '').split(',').filter((x) => !!x)

console.info(`Prefix urls: ${baseUrlFilters.join(',')}`)
console.info(`Excluded urls: ${exBaseUrlFilter.join(',')}`)

const loadedDocUrls = new Set<string>()

async function cheerioLoader(url: string): Promise<Document[]> {
try {
let docs = []
let docs = [] as Document[]
if (!!baseUrlFilters?.length && !baseUrlFilters.some((baseUrl) => url.toLowerCase().startsWith(baseUrl))) {
console.info(`scraping - skipping url ${url} because it does not start with ${baseUrlFilters}`)
return docs
}

if (!!exBaseUrlFilter?.length && exBaseUrlFilter.some((exBaseUrl) => url.toLowerCase().startsWith(exBaseUrl))) {
console.info(`scraping - skipping url ${url} because it starts with ${exBaseUrlFilter}`)
return docs
}

const loader = new CheerioWebBaseLoader(url, params)
console.info(`scraping - loading url ${url}`)
if (textSplitter) {
docs = await loader.loadAndSplit(textSplitter)
} else {
docs = await loader.load()
}
return docs

const newDocs = docs.filter(
(doc) =>
!loadedDocUrls.has(doc.metadata.source) &&
(!baseUrlFilters || baseUrlFilters.some((baseUrl) => doc.metadata.source.toLowerCase().startsWith(baseUrl))) &&
(!exBaseUrlFilter || !exBaseUrlFilter.some((exBaseUrl) => doc.metadata.source.toLowerCase().startsWith(exBaseUrl)))
)

newDocs
.map((doc) => doc.metadata.source)
.forEach((docUrl) => {
if (!loadedDocUrls.has(docUrl)) loadedDocUrls.add(docUrl)
})

console.info(`scraping - loaded ${newDocs.length} new docs from ${url}`)
return newDocs
} catch (err) {
if (process.env.DEBUG === 'true') console.error(`error in CheerioWebBaseLoader: ${err.message}, on page: ${url}`)
console.error(`error in CheerioWebBaseLoader: ${err.message}, on page: ${url}`)
return []
}
}
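Stripped of logging, the de-duplication added to `cheerioLoader` amounts to keeping one document per `metadata.source` across all crawled pages. A compact sketch of that pattern, assuming `Document` from `langchain/document`:

```ts
import { Document } from 'langchain/document'

// Keep only documents whose source URL has not been returned before (sketch of the dedup above)
const seenSources = new Set<string>()
function dedupeBySource(docs: Document[]): Document[] {
    return docs.filter((doc) => {
        const source = String(doc.metadata.source ?? '')
        if (seenSources.has(source)) return false
        seenSources.add(source)
        return true
    })
}
```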

@@ -126,13 +179,33 @@ class Cheerio_DocumentLoaders implements INode {
if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`)
if (!limit) limit = '10'
else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0')
const pages: string[] =
relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit))
console.info(`scrape limit: ${limit}`)
console.info(`scraping url: ${url}`)
let pages: string[] =
relativeLinksMethod === 'webCrawl'
? await webCrawl(url, parseInt(limit), baseUrlFilters, exBaseUrlFilter)
: await xmlScrape(url, parseInt(limit))

if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
if (!pages || pages.length === 0) throw new Error('No relative links found')
for (const page of pages) {
docs.push(...(await cheerioLoader(page)))
if (!pages || !Array.isArray(pages) || pages.length === 0) {
console.warn(`No relative links found for ${url}`)
return
}

if (!!limit && parseInt(limit) > 0) {
console.info(`scraping limit to ${limit}`)
pages = pages.slice(0, parseInt(limit)) // limit docs to be returned
}

try {
console.info(`scraping found ${pages.length} pages: ${pages.join(', ')}`)
for (const page of pages) {
docs.push(...(await cheerioLoader(page)))
}
} catch (err) {
console.error(`error in CheerioWebBaseLoader: ${err.message}, on page: ${url}`)
}

if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`)
} else {
docs = await cheerioLoader(url)
@@ -141,6 +214,7 @@ class Cheerio_DocumentLoaders implements INode {
if (metadata) {
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
let finaldocs = []

for (const doc of docs) {
const newdoc = {
...doc,
@@ -154,6 +228,12 @@ class Cheerio_DocumentLoaders implements INode {
return finaldocs
}

console.info(`scraped ${docs.length} docs from ${url}`)
if (!!limit && parseInt(limit) > 0 && docs.length > parseInt(limit)) {
console.info(`scraped docs limiting to ${limit}`)
docs = docs.slice(0, parseInt(limit)) // limit docs to be returned
}

return docs
}
}
13 changes: 4 additions & 9 deletions packages/components/nodes/vectorstores/Pinecone/Pinecone.ts
@@ -1,8 +1,8 @@
import { flatten } from 'lodash'
import { Pinecone } from '@pinecone-database/pinecone'
import { PineconeLibArgs, PineconeStore } from 'langchain/vectorstores/pinecone'
import { Embeddings } from 'langchain/embeddings/base'
import { Document } from 'langchain/document'
import { Embeddings } from 'langchain/embeddings/base'
import { PineconeLibArgs, PineconeStore } from 'langchain/vectorstores/pinecone'
import { flatten } from 'lodash'
import { ICommonObject, INode, INodeData, INodeOutputsValue, INodeParams } from '../../../src/Interface'
import { getBaseClasses, getCredentialData, getCredentialParam } from '../../../src/utils'

@@ -113,12 +113,7 @@ class Pinecone_VectorStores implements INode {
const pineconeIndex = client.Index(index)

const flattenDocs = docs && docs.length ? flatten(docs) : []
const finalDocs = []
for (let i = 0; i < flattenDocs.length; i += 1) {
if (flattenDocs[i] && flattenDocs[i].pageContent) {
finalDocs.push(new Document(flattenDocs[i]))
}
}
const finalDocs = flattenDocs.filter((doc) => !!doc?.pageContent).map((doc) => new Document(doc))

const obj: PineconeLibArgs = {
pineconeIndex
65 changes: 52 additions & 13 deletions packages/components/src/utils.ts
@@ -1,14 +1,15 @@
import axios from 'axios'
import { load } from 'cheerio'
import { AES, enc } from 'crypto-js'
import * as fs from 'fs'
import * as path from 'path'
import { JSDOM } from 'jsdom'
import { z } from 'zod'
import { ChatMessageHistory } from 'langchain/memory'
import { AIMessage, HumanMessage, BaseMessage } from 'langchain/schema'
import * as path from 'path'
import { DataSource } from 'typeorm'
import { z } from 'zod'
import { ICommonObject, IDatabaseEntity, IMessage, INodeData } from './Interface'
import { AES, enc } from 'crypto-js'
import { ChatMessageHistory } from 'langchain/memory'
import { AIMessage, HumanMessage, BaseMessage } from 'langchain/schema'

Check failure on line 12 in packages/components/src/utils.ts (GitHub Actions / build, ubuntu-latest, 18.15.0): 'AIMessage' is already defined
Check failure on line 12 in packages/components/src/utils.ts (GitHub Actions / build, ubuntu-latest, 18.15.0): 'HumanMessage' is already defined
Check failure on line 12 in packages/components/src/utils.ts (GitHub Actions / build, ubuntu-latest, 18.15.0): 'BaseMessage' is already defined

export const numberOrExpressionRegex = '^(\\d+\\.?\\d*|{{.*}})$' //return true if string consists only numbers OR expression {{}}
export const notEmptyRegex = '(.|\\s)*\\S(.|\\s)*' //return true if string is not empty or blank
@@ -300,15 +301,21 @@
* @param {string} urlString
* @returns {string}
*/
function normalizeURL(urlString: string): string {
function normalizeURL(urlString: string, removeBookmark?: boolean): string {
const urlObj = new URL(urlString)
const hostPath = urlObj.hostname + urlObj.pathname
let hostPath = urlObj.hostname + urlObj.pathname
if (hostPath.length > 0 && hostPath.slice(-1) == '/') {
// handling trailing slash
return hostPath.slice(0, -1)
}
if (removeBookmark && urlString.includes('#')) {
// handling bookmark
hostPath = hostPath.substring(0, hostPath.indexOf('#'))
}

return hostPath
}
const LARGE_FILE_EXTENSIONS = ['zip', 'tar', 'rar', 'jar', 'arj', 'gz'] //todo: add full listing
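To illustrate the check that uses this list: the crawler takes the last path segment of the normalized URL, reads everything after the final dot as the extension, and returns early instead of fetching when it looks like a large archive. A small worked example (illustrative values, not the diff code):

```ts
// Illustrative walk-through of the large-file check applied in crawl() (example values)
const normalizedUrl = 'https://example.com/downloads/data.zip'
const lastSegment = normalizedUrl.substring(normalizedUrl.lastIndexOf('/') + 1) // 'data.zip'
const dotPos = lastSegment.lastIndexOf('.')
const urlExt = dotPos > -1 && dotPos < lastSegment.length - 1 ? lastSegment.substring(dotPos + 1) : '' // 'zip'
// LARGE_FILE_EXTENSIONS.includes(urlExt) === true, so this URL is skipped rather than fetched
```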

/**
* Recursive crawl using normalizeURL and getURLsFromHTML
@@ -318,23 +325,51 @@
* @param {number} limit
* @returns {Promise<string[]>}
*/
async function crawl(baseURL: string, currentURL: string, pages: string[], limit: number): Promise<string[]> {
async function crawl(
baseURL: string,
currentURL: string,
pages: string[],
limit: number,
prefixUrls?: string[],
exPrefixUrls?: string[]
): Promise<string[]> {
const baseURLObj = new URL(baseURL)
const currentURLObj = new URL(currentURL)

if (limit !== 0 && pages.length === limit) return pages
if (limit > 0 && pages.length >= limit) {
console.info(`crawl limit reached: ${limit}`)
return pages
}

if (baseURLObj.hostname !== currentURLObj.hostname) return pages

const normalizeCurrentURL = baseURLObj.protocol + '//' + normalizeURL(currentURL)
const normalizeCurrentURL = baseURLObj.protocol + '//' + normalizeURL(currentURL, true)

const lastSec = normalizeCurrentURL.substring(normalizeCurrentURL.lastIndexOf('/') + 1)
const dotPos = lastSec.lastIndexOf('.')
const urlExt = dotPos > -1 && dotPos < lastSec.length - 1 ? lastSec.substring(dotPos + 1) : ''

// fix iterable error when crawling a zip file, most likely caused by a timeout
if (!!urlExt && LARGE_FILE_EXTENSIONS.includes(urlExt)) return pages

if (!!prefixUrls?.length && !prefixUrls.some((prefixUrl) => normalizeCurrentURL.toLowerCase().startsWith(prefixUrl))) {
console.info(`skipping url ${normalizeCurrentURL} because it does not start with any required prefix urls.`)
return pages
}

if (!!exPrefixUrls?.length && exPrefixUrls.some((exPrefixUrl) => normalizeCurrentURL.toLowerCase().startsWith(exPrefixUrl))) {
console.info(`skipping url ${normalizeCurrentURL} because it starts with one or more excluded prefix urls.`)
return pages
}

if (pages.includes(normalizeCurrentURL)) {
return pages
}

pages.push(normalizeCurrentURL)

if (process.env.DEBUG === 'true') console.info(`actively crawling ${currentURL}`)
try {
console.info(`crawling ${currentURL}`)
const resp = await fetch(currentURL)

if (resp.status > 399) {
@@ -349,13 +384,17 @@
}

const htmlBody = await resp.text()
console.info(`crawled ${currentURL}`)
const nextURLs = getURLsFromHTML(htmlBody, baseURL)
for (const nextURL of nextURLs) {
pages = await crawl(baseURL, nextURL, pages, limit)
}
} catch (err) {
if (process.env.DEBUG === 'true') console.error(`error in fetch url: ${err.message}, on page: ${currentURL}`)
console.error(`error in fetch url: ${err.message}, on page: ${currentURL}`)
}

console.info(`crawled ${pages.length} pages so far, limit: ${limit}`)

return pages
}

@@ -365,10 +404,10 @@
* @param {number} limit
* @returns {Promise<string[]>}
*/
export async function webCrawl(stringURL: string, limit: number): Promise<string[]> {
export async function webCrawl(stringURL: string, limit: number, baseUrls?: string[], exBaseUrls?: string[]): Promise<string[]> {
const URLObj = new URL(stringURL)
const modifyURL = stringURL.slice(-1) === '/' ? stringURL.slice(0, -1) : stringURL
return await crawl(URLObj.protocol + '//' + URLObj.hostname, modifyURL, [], limit)
return await crawl(URLObj.protocol + '//' + URLObj.hostname, modifyURL, [], limit, baseUrls, exBaseUrls)
}
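A hypothetical call using the extended `webCrawl` signature (values are illustrative; the prefix lists are compared case-insensitively against `protocol + '//' + hostname + pathname` of each discovered link):

```ts
// Illustrative usage of webCrawl with the new optional filter arguments (inside an async function)
const pages = await webCrawl(
    'https://example.com/docs', // start URL
    25, // crawl at most 25 pages
    ['https://example.com/docs'], // only follow links under /docs
    ['https://example.com/docs/v1'] // but skip a hypothetical /docs/v1 subtree
)
```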

export function getURLsFromXML(xmlBody: string, limit: number): string[] {