From 392855c7dc31dad6ac6823e06b65a7d968f6e28f Mon Sep 17 00:00:00 2001
From: Richard Casemore
Date: Fri, 25 Aug 2023 04:31:01 +0100
Subject: [PATCH] Ported Recursive URL Loader: From Python to TypeScript with
 Concurrency Support (#2379)

* Recursive loader based on the python implementation with concurrency

* Added inheritence from BaseDocumentLoader and implements DocumentLoader

* Fixed linting

* Update entry point

* export options

* Move to just document_loaders/recursive_url, clean up code

* Adds install instructions

* More docs updates

* Fix typo

---------

Co-authored-by: jacoblee93
---
 .../web_loaders/recursive_url_loader.md       |  67 ++++++
 langchain/.gitignore                          |   3 +
 langchain/package.json                        |   8 +
 langchain/scripts/create-entrypoints.js       |   2 +
 .../tests/recursive_url.int.test.ts           |  33 +++
 .../src/document_loaders/web/recursive_url.ts | 197 ++++++++++++++++++
 langchain/src/load/import_constants.ts        |   1 +
 langchain/src/load/import_type.d.ts           |   3 +
 langchain/tsconfig.json                       |   1 +
 9 files changed, 315 insertions(+)
 create mode 100644 docs/extras/modules/data_connection/document_loaders/integrations/web_loaders/recursive_url_loader.md
 create mode 100644 langchain/src/document_loaders/tests/recursive_url.int.test.ts
 create mode 100644 langchain/src/document_loaders/web/recursive_url.ts

diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/web_loaders/recursive_url_loader.md b/docs/extras/modules/data_connection/document_loaders/integrations/web_loaders/recursive_url_loader.md
new file mode 100644
index 000000000000..d76522c542db
--- /dev/null
+++ b/docs/extras/modules/data_connection/document_loaders/integrations/web_loaders/recursive_url_loader.md
@@ -0,0 +1,67 @@
+---
+sidebar_class_name: node-only
+hide_table_of_contents: true
+---
+
+# Recursive URL Loader
+
+When loading content from a website, we may want to load all URLs on a page.
+
+For example, let's look at the [LangChainJS introduction](https://js.langchain.com/docs/get_started/introduction) docs.
+
+This has many interesting child pages that we may want to load, split, and later retrieve in bulk.
+
+The challenge is traversing the tree of child pages and assembling a list!
+
+We do this using the RecursiveUrlLoader.
+
+This also gives us the flexibility to exclude some children, customize the extractor, and more.
+
+## Setup
+
+To get started, you'll need to install the [`jsdom`](https://www.npmjs.com/package/jsdom) package:
+
+```bash npm2yarn
+npm i jsdom
+```
+
+We also suggest adding a package like [`html-to-text`](https://www.npmjs.com/package/html-to-text) or
+[`@mozilla/readability`](https://www.npmjs.com/package/@mozilla/readability) for extracting the raw text from the page.
+
+```bash npm2yarn
+npm i html-to-text
+```
+
+## Usage
+
+```typescript
+import { compile } from "html-to-text";
+import { RecursiveUrlLoader } from "langchain/document_loaders/web/recursive_url";
+
+const url = "https://js.langchain.com/docs/get_started/introduction";
+
+const compiledConvert = compile({ wordwrap: 130 }); // returns (text: string) => string;
+
+const loader = new RecursiveUrlLoader(url, {
+  extractor: compiledConvert,
+  maxDepth: 1,
+  excludeDirs: ["https://js.langchain.com/docs/api/"],
+});
+
+const docs = await loader.load();
+```
+
+## Options
+
+```typescript
+interface Options {
+  excludeDirs?: string[]; // webpage directories to exclude.
+  extractor?: (text: string) => string; // a function to extract the text of the document from the webpage. It is recommended to use tools like html-to-text to extract the text. By default, it just returns the page as it is.
+  maxDepth?: number; // the maximum depth to crawl. By default, it is set to 2. If you need to crawl the whole website, set it to a number that is large enough.
+  timeout?: number; // the timeout for each request, in milliseconds. By default, it is set to 10000 (10 seconds).
+  preventOutside?: boolean; // whether to prevent crawling outside the root url. By default, it is set to true.
+  callerOptions?: AsyncCallerConstructorParams; // the options to call the AsyncCaller, for example setting max concurrency (default is 64)
+}
+```
+
+However, since it's hard to perform a perfect filter, you may still see some irrelevant results. You can filter the returned documents yourself, if needed. Most of the time, the returned results are good enough.
diff --git a/langchain/.gitignore b/langchain/.gitignore
index 1dc5222cbb4a..cdd1ee1fa69c 100644
--- a/langchain/.gitignore
+++ b/langchain/.gitignore
@@ -289,6 +289,9 @@ document_loaders/web/notiondb.d.ts
 document_loaders/web/notionapi.cjs
 document_loaders/web/notionapi.js
 document_loaders/web/notionapi.d.ts
+document_loaders/web/recursive_url.cjs
+document_loaders/web/recursive_url.js
+document_loaders/web/recursive_url.d.ts
 document_loaders/web/s3.cjs
 document_loaders/web/s3.js
 document_loaders/web/s3.d.ts
diff --git a/langchain/package.json b/langchain/package.json
index 514c52d39738..dc61a6b3f499 100644
--- a/langchain/package.json
+++ b/langchain/package.json
@@ -301,6 +301,9 @@
     "document_loaders/web/notionapi.cjs",
     "document_loaders/web/notionapi.js",
     "document_loaders/web/notionapi.d.ts",
+    "document_loaders/web/recursive_url.cjs",
+    "document_loaders/web/recursive_url.js",
+    "document_loaders/web/recursive_url.d.ts",
     "document_loaders/web/s3.cjs",
     "document_loaders/web/s3.js",
     "document_loaders/web/s3.d.ts",
@@ -1544,6 +1547,11 @@
       "import": "./document_loaders/web/notionapi.js",
       "require": "./document_loaders/web/notionapi.cjs"
     },
+    "./document_loaders/web/recursive_url": {
+      "types": "./document_loaders/web/recursive_url.d.ts",
+      "import": "./document_loaders/web/recursive_url.js",
+      "require": "./document_loaders/web/recursive_url.cjs"
+    },
     "./document_loaders/web/s3": {
       "types": "./document_loaders/web/s3.d.ts",
       "import": "./document_loaders/web/s3.js",
diff --git a/langchain/scripts/create-entrypoints.js b/langchain/scripts/create-entrypoints.js
index 7701ee1f9e3c..625334c08c02 100644
--- a/langchain/scripts/create-entrypoints.js
+++ b/langchain/scripts/create-entrypoints.js
@@ -120,6 +120,7 @@ const entrypoints = {
   "document_loaders/web/github": "document_loaders/web/github",
   "document_loaders/web/notiondb": "document_loaders/web/notiondb",
   "document_loaders/web/notionapi": "document_loaders/web/notionapi",
+  "document_loaders/web/recursive_url": "document_loaders/web/recursive_url",
   "document_loaders/web/s3": "document_loaders/web/s3",
   "document_loaders/web/sonix_audio": "document_loaders/web/sonix_audio",
   "document_loaders/web/confluence": "document_loaders/web/confluence",
@@ -306,6 +307,7 @@ const requiresOptionalDependency = [
   "document_loaders/web/github",
   "document_loaders/web/notiondb",
   "document_loaders/web/notionapi",
+  "document_loaders/web/recursive_url",
   "document_loaders/web/s3",
   "document_loaders/web/sonix_audio",
   "document_loaders/web/confluence",
diff --git a/langchain/src/document_loaders/tests/recursive_url.int.test.ts b/langchain/src/document_loaders/tests/recursive_url.int.test.ts
new file mode 100644
index 000000000000..d8da2a71b6b5
--- /dev/null
+++ b/langchain/src/document_loaders/tests/recursive_url.int.test.ts
@@ -0,0 +1,33 @@
+/* eslint-disable no-process-env */
+/* eslint-disable @typescript-eslint/no-non-null-assertion */
+import { test } from "@jest/globals";
+import { compile } from "html-to-text";
+import { RecursiveUrlLoader } from "../web/recursive_url.js";
+
+describe("RecursiveUrlLoader", () => {
+  test("loading valid url", async () => {
+    const url = "https://js.langchain.com/docs/get_started/introduction";
+
+    const compiledConvert = compile({ wordwrap: 130 }); // returns (text: string) => string;
+
+    const loader = new RecursiveUrlLoader(url, {
+      extractor: compiledConvert,
+      maxDepth: 1,
+      excludeDirs: ["https://js.langchain.com/docs/api/"],
+    });
+
+    const docs = await loader.load();
+    expect(docs.length).toBeGreaterThan(0);
+    expect(docs[0].pageContent).toContain("LangChain");
+  });
+
+  test("loading invalid url", async () => {
+    const url = "https://this.url.is.invalid/this/is/a/test";
+    const loader = new RecursiveUrlLoader(url, {
+      maxDepth: 1,
+      preventOutside: true,
+    });
+    const docs = await loader.load();
+    expect(docs.length).toBe(0);
+  });
+});
diff --git a/langchain/src/document_loaders/web/recursive_url.ts b/langchain/src/document_loaders/web/recursive_url.ts
new file mode 100644
index 000000000000..59474da26765
--- /dev/null
+++ b/langchain/src/document_loaders/web/recursive_url.ts
@@ -0,0 +1,197 @@
+import { JSDOM } from "jsdom";
+import { Document } from "../../document.js";
+import { AsyncCaller } from "../../util/async_caller.js";
+import { BaseDocumentLoader, DocumentLoader } from "../base.js";
+
+export interface RecursiveUrlLoaderOptions {
+  excludeDirs?: string[];
+  extractor?: (text: string) => string;
+  maxDepth?: number;
+  timeout?: number;
+  preventOutside?: boolean;
+  callerOptions?: ConstructorParameters<typeof AsyncCaller>[0];
+}
+
+export class RecursiveUrlLoader
+  extends BaseDocumentLoader
+  implements DocumentLoader
+{
+  private caller: AsyncCaller;
+
+  private url: string;
+
+  private excludeDirs: string[];
+
+  private extractor: (text: string) => string;
+
+  private maxDepth: number;
+
+  private timeout: number;
+
+  private preventOutside: boolean;
+
+  constructor(url: string, options: RecursiveUrlLoaderOptions) {
+    super();
+
+    this.caller = new AsyncCaller({
+      maxConcurrency: 64,
+      maxRetries: 0,
+      ...options.callerOptions,
+    });
+
+    this.url = url;
+    this.excludeDirs = options.excludeDirs ?? [];
+    this.extractor = options.extractor ?? ((s: string) => s);
+    this.maxDepth = options.maxDepth ?? 2;
+    this.timeout = options.timeout ?? 10000;
+    this.preventOutside = options.preventOutside ?? true;
+  }
+
+  private async fetchWithTimeout(
+    resource: string,
+    options: { timeout: number } & RequestInit
+  ): Promise<Response> {
+    const { timeout, ...rest } = options;
+    return this.caller.call(() =>
+      fetch(resource, { ...rest, signal: AbortSignal.timeout(timeout) })
+    );
+  }
+
+  private getChildLinks(html: string, baseUrl: string): Array<string> {
+    const allLinks = Array.from(
+      new JSDOM(html).window.document.querySelectorAll("a")
+    ).map((a) => a.href);
+    const absolutePaths = [];
+    // eslint-disable-next-line no-script-url
+    const invalidPrefixes = ["javascript:", "mailto:", "#"];
+    const invalidSuffixes = [
+      ".css",
+      ".js",
+      ".ico",
+      ".png",
+      ".jpg",
+      ".jpeg",
+      ".gif",
+      ".svg",
+    ];
+
+    for (const link of allLinks) {
+      if (
+        invalidPrefixes.some((prefix) => link.startsWith(prefix)) ||
+        invalidSuffixes.some((suffix) => link.endsWith(suffix))
+      )
+        continue;
+
+      if (link.startsWith("http")) {
+        const isAllowed = !this.preventOutside || link.startsWith(baseUrl);
+        if (isAllowed) absolutePaths.push(link);
+      } else if (link.startsWith("//")) {
+        const base = new URL(baseUrl);
+        absolutePaths.push(base.protocol + link);
+      } else {
+        const newLink = new URL(link, baseUrl).href;
+        absolutePaths.push(newLink);
+      }
+    }
+
+    return Array.from(new Set(absolutePaths));
+  }
+
+  private extractMetadata(rawHtml: string, url: string) {
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    const metadata: Record<string, any> = { source: url };
+    const { document } = new JSDOM(rawHtml).window;
+
+    const title = document.getElementsByTagName("title")[0];
+    if (title) {
+      metadata.title = title.textContent;
+    }
+
+    const description = document.querySelector("meta[name=description]");
+    if (description) {
+      metadata.description = description.getAttribute("content");
+    }
+
+    const html = document.getElementsByTagName("html")[0];
+    if (html) {
+      metadata.language = html.getAttribute("lang");
+    }
+
+    return metadata;
+  }
+
+  private async getUrlAsDoc(url: string): Promise<Document | null> {
+    let res;
+    try {
+      res = await this.fetchWithTimeout(url, { timeout: this.timeout });
+      res = await res.text();
+    } catch (e) {
+      return null;
+    }
+
+    return {
+      pageContent: this.extractor(res),
+      metadata: this.extractMetadata(res, url),
+    };
+  }
+
+  private async getChildUrlsRecursive(
+    inputUrl: string,
+    visited: Set<string> = new Set<string>(),
+    depth = 0
+  ): Promise<Document[]> {
+    if (depth > this.maxDepth) return [];
+
+    let url = inputUrl;
+    if (!inputUrl.endsWith("/")) url += "/";
+
+    const isExcluded = this.excludeDirs.some((exDir) => url.startsWith(exDir));
+    if (isExcluded) return [];
+
+    let res;
+    try {
+      res = await this.fetchWithTimeout(url, { timeout: this.timeout });
+      res = await res.text();
+    } catch (e) {
+      return [];
+    }
+
+    const childUrls: string[] = this.getChildLinks(res, url);
+
+    const results = await Promise.all(
+      childUrls.map((childUrl) =>
+        (async () => {
+          if (visited.has(childUrl)) return null;
+          visited.add(childUrl);
+
+          const childDoc = await this.getUrlAsDoc(childUrl);
+          if (!childDoc) return null;
+
+          if (childUrl.endsWith("/")) {
+            const childUrlResponses = await this.getChildUrlsRecursive(
+              childUrl,
+              visited,
+              depth + 1
+            );
+            return [childDoc, ...childUrlResponses];
+          }
+
+          return [childDoc];
+        })()
+      )
+    );
+
+    return results.flat().filter((docs) => docs !== null) as Document[];
+  }
+
+  async load(): Promise<Document[]> {
+    const rootDoc = await this.getUrlAsDoc(this.url);
+    if (!rootDoc) return [];
+
+    const docs = [rootDoc];
+    docs.push(
+      ...(await this.getChildUrlsRecursive(this.url, new Set([this.url])))
+    );
+    return docs;
+  }
+}
diff --git a/langchain/src/load/import_constants.ts b/langchain/src/load/import_constants.ts
index 21ca1eb61bb4..fb1eca46e417 100644
--- a/langchain/src/load/import_constants.ts
+++ b/langchain/src/load/import_constants.ts
@@ -67,6 +67,7 @@ export const optionalImportEntrypoints = [
   "langchain/document_loaders/web/github",
   "langchain/document_loaders/web/notiondb",
   "langchain/document_loaders/web/notionapi",
+  "langchain/document_loaders/web/recursive_url",
   "langchain/document_loaders/web/s3",
   "langchain/document_loaders/web/sonix_audio",
   "langchain/document_loaders/web/confluence",
diff --git a/langchain/src/load/import_type.d.ts b/langchain/src/load/import_type.d.ts
index 35079aa07851..79c469e2199f 100644
--- a/langchain/src/load/import_type.d.ts
+++ b/langchain/src/load/import_type.d.ts
@@ -199,6 +199,9 @@ export interface OptionalImportMap {
   "langchain/document_loaders/web/notionapi"?:
     | typeof import("../document_loaders/web/notionapi.js")
    | Promise<typeof import("../document_loaders/web/notionapi.js")>;
+  "langchain/document_loaders/web/recursive_url"?:
+    | typeof import("../document_loaders/web/recursive_url.js")
+    | Promise<typeof import("../document_loaders/web/recursive_url.js")>;
   "langchain/document_loaders/web/s3"?:
     | typeof import("../document_loaders/web/s3.js")
     | Promise<typeof import("../document_loaders/web/s3.js")>;
diff --git a/langchain/tsconfig.json b/langchain/tsconfig.json
index 10cdcb5735aa..ed32ca455ba2 100644
--- a/langchain/tsconfig.json
+++ b/langchain/tsconfig.json
@@ -125,6 +125,7 @@
     "src/document_loaders/web/github.ts",
     "src/document_loaders/web/notiondb.ts",
     "src/document_loaders/web/notionapi.ts",
+    "src/document_loaders/web/recursive_url.ts",
     "src/document_loaders/web/s3.ts",
     "src/document_loaders/web/sonix_audio.ts",
     "src/document_loaders/web/confluence.ts",