Ported Recursive URL Loader: From Python to TypeScript with Concurrency Support (#2379)

* Recursive loader based on the Python implementation with concurrency

* Added inheritance from BaseDocumentLoader and implements DocumentLoader

* Fixed linting

* Update entry point

* export options

* Move to just document_loaders/recursive_url, clean up code

* Adds install instructions

* More docs updates

* Fix typo

---------

Co-authored-by: jacoblee93 <jacoblee93@gmail.com>
skarard and jacoblee93 authored Aug 25, 2023
1 parent 925fafb commit 392855c
Showing 9 changed files with 315 additions and 0 deletions.
@@ -0,0 +1,67 @@
---
sidebar_class_name: node-only
hide_table_of_contents: true
---

# Recursive URL Loader

When loading content from a website, we may want to load all of the URLs on a page.

For example, let's look at the [LangChainJS introduction](https://js.langchain.com/docs/get_started/introduction) docs.

This has many interesting child pages that we may want to load, split, and later retrieve in bulk.

The challenge is traversing the tree of child pages and assembling a list!

We do this using the `RecursiveUrlLoader`.

This also gives us the flexibility to exclude some children, customize the extractor, and more.
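Conceptually, the loader performs a depth-limited traversal with a visited set so that each page is fetched at most once. Here is a simplified sketch of that idea over an in-memory link graph (an illustration only, not the loader's actual implementation):

```typescript
// A toy link graph standing in for fetching pages and extracting their links.
const linkGraph: Record<string, string[]> = {
  "/docs/": ["/docs/intro/", "/docs/install/"],
  "/docs/intro/": ["/docs/"],
  "/docs/install/": [],
};

// Depth-limited crawl: stop past maxDepth, skip already-visited pages.
function crawl(
  url: string,
  maxDepth: number,
  depth = 0,
  visited = new Set<string>()
): string[] {
  if (depth > maxDepth || visited.has(url)) return [];
  visited.add(url);
  const children = (linkGraph[url] ?? []).flatMap((child) =>
    crawl(child, maxDepth, depth + 1, visited)
  );
  return [url, ...children];
}

console.log(crawl("/docs/", 1)); // ["/docs/", "/docs/intro/", "/docs/install/"]
```

The real loader does the same thing, but each "lookup" is an HTTP fetch and links are parsed out of the returned HTML.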

## Setup

To get started, you'll need to install the [`jsdom`](https://www.npmjs.com/package/jsdom) package:

```bash npm2yarn
npm i jsdom
```

We also suggest adding a package like [`html-to-text`](https://www.npmjs.com/package/html-to-text) or
[`@mozilla/readability`](https://www.npmjs.com/package/@mozilla/readability) for extracting the raw text from the page.

```bash npm2yarn
npm i html-to-text
```

## Usage

```typescript
import { compile } from "html-to-text";
import { RecursiveUrlLoader } from "langchain/document_loaders/web/recursive_url";

const url = "https://js.langchain.com/docs/get_started/introduction";

const compiledConvert = compile({ wordwrap: 130 }); // returns (text: string) => string;

const loader = new RecursiveUrlLoader(url, {
  extractor: compiledConvert,
  maxDepth: 1,
  excludeDirs: ["https://js.langchain.com/docs/api/"],
});

const docs = await loader.load();
```
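Each returned `Document` has a `pageContent` string and a `metadata` object containing `source` (the page's URL), plus `title`, `description`, and `language` when the page's HTML provides them. As an illustration, you can index the results by source URL (the sample data below is hand-written, not real loader output):

```typescript
// Minimal stand-in for the Document shape the loader returns.
interface Doc {
  pageContent: string;
  metadata: Record<string, unknown>;
}

// Build a Map from source URL to page text for quick lookup.
const bySource = (docs: Doc[]): Map<string, string> =>
  new Map(docs.map((d) => [d.metadata.source as string, d.pageContent]));

const sample: Doc[] = [
  {
    pageContent: "Intro text",
    metadata: { source: "https://example.com/docs/intro/" },
  },
];

console.log(bySource(sample).get("https://example.com/docs/intro/")); // "Intro text"
```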

## Options

```typescript
interface Options {
  excludeDirs?: string[]; // Webpage directories to exclude.
  extractor?: (text: string) => string; // A function to extract text from the raw page. By default, the page is returned as-is; using a tool like html-to-text is recommended.
  maxDepth?: number; // The maximum depth to crawl. Defaults to 2. To crawl a whole website, set this to a sufficiently large number.
  timeout?: number; // The timeout for each request, in milliseconds. Defaults to 10000 (10 seconds).
  preventOutside?: boolean; // Whether to prevent crawling outside the root URL. Defaults to true.
  callerOptions?: AsyncCallerConstructorParams; // Options for the underlying AsyncCaller, e.g. maxConcurrency (defaults to 64).
}
```
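To make the `excludeDirs` and `preventOutside` semantics concrete, here is a minimal sketch of the prefix checks they imply (these helpers are stand-ins written for illustration, not the loader's internals, though they mirror its behavior):

```typescript
const rootUrl = "https://js.langchain.com/docs/";
const excludeDirs = ["https://js.langchain.com/docs/api/"];

// excludeDirs: a URL is skipped if it starts with any excluded directory.
const isExcluded = (url: string) =>
  excludeDirs.some((dir) => url.startsWith(dir));

// preventOutside: a URL is skipped if it does not start with the root URL.
const isOutside = (url: string) => !url.startsWith(rootUrl);

console.log(isExcluded("https://js.langchain.com/docs/api/classes/")); // true
console.log(isOutside("https://python.langchain.com/docs/")); // true
console.log(isExcluded("https://js.langchain.com/docs/get_started/")); // false
```

Note that both checks are simple string-prefix comparisons, so excluded directories should be given as full URL prefixes.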

However, since it's hard to filter perfectly, you may still see some irrelevant documents in the results. You can filter the returned documents yourself if needed; most of the time, the results are good enough as-is.
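For example, a simple post-filter might keep only documents whose source URL lives under a docs path. The `Document` shape `{ pageContent, metadata }` matches what the loader returns; the sample URLs here are made up:

```typescript
// Minimal stand-in for the Document shape the loader returns.
interface Doc {
  pageContent: string;
  metadata: { source: string };
}

const docs: Doc[] = [
  { pageContent: "intro", metadata: { source: "https://example.com/docs/intro/" } },
  { pageContent: "legal", metadata: { source: "https://example.com/legal/" } },
];

// Keep only documents crawled from under /docs/.
const filtered = docs.filter((d) => d.metadata.source.includes("/docs/"));
console.log(filtered.length); // 1
```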
3 changes: 3 additions & 0 deletions langchain/.gitignore
@@ -289,6 +289,9 @@ document_loaders/web/notiondb.d.ts
document_loaders/web/notionapi.cjs
document_loaders/web/notionapi.js
document_loaders/web/notionapi.d.ts
document_loaders/web/recursive_url.cjs
document_loaders/web/recursive_url.js
document_loaders/web/recursive_url.d.ts
document_loaders/web/s3.cjs
document_loaders/web/s3.js
document_loaders/web/s3.d.ts
8 changes: 8 additions & 0 deletions langchain/package.json
@@ -301,6 +301,9 @@
    "document_loaders/web/notionapi.cjs",
    "document_loaders/web/notionapi.js",
    "document_loaders/web/notionapi.d.ts",
    "document_loaders/web/recursive_url.cjs",
    "document_loaders/web/recursive_url.js",
    "document_loaders/web/recursive_url.d.ts",
    "document_loaders/web/s3.cjs",
    "document_loaders/web/s3.js",
    "document_loaders/web/s3.d.ts",
@@ -1544,6 +1547,11 @@
      "import": "./document_loaders/web/notionapi.js",
      "require": "./document_loaders/web/notionapi.cjs"
    },
    "./document_loaders/web/recursive_url": {
      "types": "./document_loaders/web/recursive_url.d.ts",
      "import": "./document_loaders/web/recursive_url.js",
      "require": "./document_loaders/web/recursive_url.cjs"
    },
    "./document_loaders/web/s3": {
      "types": "./document_loaders/web/s3.d.ts",
      "import": "./document_loaders/web/s3.js",
2 changes: 2 additions & 0 deletions langchain/scripts/create-entrypoints.js
@@ -120,6 +120,7 @@ const entrypoints = {
  "document_loaders/web/github": "document_loaders/web/github",
  "document_loaders/web/notiondb": "document_loaders/web/notiondb",
  "document_loaders/web/notionapi": "document_loaders/web/notionapi",
  "document_loaders/web/recursive_url": "document_loaders/web/recursive_url",
  "document_loaders/web/s3": "document_loaders/web/s3",
  "document_loaders/web/sonix_audio": "document_loaders/web/sonix_audio",
  "document_loaders/web/confluence": "document_loaders/web/confluence",
@@ -306,6 +307,7 @@ const requiresOptionalDependency = [
  "document_loaders/web/github",
  "document_loaders/web/notiondb",
  "document_loaders/web/notionapi",
  "document_loaders/web/recursive_url",
  "document_loaders/web/s3",
  "document_loaders/web/sonix_audio",
  "document_loaders/web/confluence",
33 changes: 33 additions & 0 deletions langchain/src/document_loaders/tests/recursive_url.int.test.ts
@@ -0,0 +1,33 @@
/* eslint-disable no-process-env */
/* eslint-disable @typescript-eslint/no-non-null-assertion */
import { test } from "@jest/globals";
import { compile } from "html-to-text";
import { RecursiveUrlLoader } from "../web/recursive_url.js";

describe("RecursiveUrlLoader", () => {
  test("loading valid url", async () => {
    const url = "https://js.langchain.com/docs/get_started/introduction";

    const compiledConvert = compile({ wordwrap: 130 }); // returns (text: string) => string;

    const loader = new RecursiveUrlLoader(url, {
      extractor: compiledConvert,
      maxDepth: 1,
      excludeDirs: ["https://js.langchain.com/docs/api/"],
    });

    const docs = await loader.load();
    expect(docs.length).toBeGreaterThan(0);
    expect(docs[0].pageContent).toContain("LangChain");
  });

  test("loading invalid url", async () => {
    const url = "https://this.url.is.invalid/this/is/a/test";
    const loader = new RecursiveUrlLoader(url, {
      maxDepth: 1,
      preventOutside: true,
    });
    const docs = await loader.load();
    expect(docs.length).toBe(0);
  });
});
197 changes: 197 additions & 0 deletions langchain/src/document_loaders/web/recursive_url.ts
@@ -0,0 +1,197 @@
import { JSDOM } from "jsdom";
import { Document } from "../../document.js";
import { AsyncCaller } from "../../util/async_caller.js";
import { BaseDocumentLoader, DocumentLoader } from "../base.js";

export interface RecursiveUrlLoaderOptions {
  excludeDirs?: string[];
  extractor?: (text: string) => string;
  maxDepth?: number;
  timeout?: number;
  preventOutside?: boolean;
  callerOptions?: ConstructorParameters<typeof AsyncCaller>[0];
}

export class RecursiveUrlLoader
  extends BaseDocumentLoader
  implements DocumentLoader
{
  private caller: AsyncCaller;

  private url: string;

  private excludeDirs: string[];

  private extractor: (text: string) => string;

  private maxDepth: number;

  private timeout: number;

  private preventOutside: boolean;

  constructor(url: string, options: RecursiveUrlLoaderOptions) {
    super();

    this.caller = new AsyncCaller({
      maxConcurrency: 64,
      maxRetries: 0,
      ...options.callerOptions,
    });

    this.url = url;
    this.excludeDirs = options.excludeDirs ?? [];
    this.extractor = options.extractor ?? ((s: string) => s);
    this.maxDepth = options.maxDepth ?? 2;
    this.timeout = options.timeout ?? 10000;
    this.preventOutside = options.preventOutside ?? true;
  }

  private async fetchWithTimeout(
    resource: string,
    options: { timeout: number } & RequestInit
  ): Promise<Response> {
    const { timeout, ...rest } = options;
    return this.caller.call(() =>
      fetch(resource, { ...rest, signal: AbortSignal.timeout(timeout) })
    );
  }

  private getChildLinks(html: string, baseUrl: string): Array<string> {
    const allLinks = Array.from(
      new JSDOM(html).window.document.querySelectorAll("a")
    ).map((a) => a.href);
    const absolutePaths = [];
    // eslint-disable-next-line no-script-url
    const invalidPrefixes = ["javascript:", "mailto:", "#"];
    const invalidSuffixes = [
      ".css",
      ".js",
      ".ico",
      ".png",
      ".jpg",
      ".jpeg",
      ".gif",
      ".svg",
    ];

    for (const link of allLinks) {
      if (
        invalidPrefixes.some((prefix) => link.startsWith(prefix)) ||
        invalidSuffixes.some((suffix) => link.endsWith(suffix))
      )
        continue;

      if (link.startsWith("http")) {
        const isAllowed = !this.preventOutside || link.startsWith(baseUrl);
        if (isAllowed) absolutePaths.push(link);
      } else if (link.startsWith("//")) {
        const base = new URL(baseUrl);
        absolutePaths.push(base.protocol + link);
      } else {
        const newLink = new URL(link, baseUrl).href;
        absolutePaths.push(newLink);
      }
    }

    return Array.from(new Set(absolutePaths));
  }

  private extractMetadata(rawHtml: string, url: string) {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const metadata: Record<string, any> = { source: url };
    const { document } = new JSDOM(rawHtml).window;

    const title = document.getElementsByTagName("title")[0];
    if (title) {
      metadata.title = title.textContent;
    }

    const description = document.querySelector("meta[name=description]");
    if (description) {
      metadata.description = description.getAttribute("content");
    }

    const html = document.getElementsByTagName("html")[0];
    if (html) {
      metadata.language = html.getAttribute("lang");
    }

    return metadata;
  }

  private async getUrlAsDoc(url: string): Promise<Document | null> {
    let res;
    try {
      res = await this.fetchWithTimeout(url, { timeout: this.timeout });
      res = await res.text();
    } catch (e) {
      return null;
    }

    return {
      pageContent: this.extractor(res),
      metadata: this.extractMetadata(res, url),
    };
  }

  private async getChildUrlsRecursive(
    inputUrl: string,
    visited: Set<string> = new Set<string>(),
    depth = 0
  ): Promise<Document[]> {
    if (depth > this.maxDepth) return [];

    let url = inputUrl;
    if (!inputUrl.endsWith("/")) url += "/";

    const isExcluded = this.excludeDirs.some((exDir) => url.startsWith(exDir));
    if (isExcluded) return [];

    let res;
    try {
      res = await this.fetchWithTimeout(url, { timeout: this.timeout });
      res = await res.text();
    } catch (e) {
      return [];
    }

    const childUrls: string[] = this.getChildLinks(res, url);

    const results = await Promise.all(
      childUrls.map((childUrl) =>
        (async () => {
          if (visited.has(childUrl)) return null;
          visited.add(childUrl);

          const childDoc = await this.getUrlAsDoc(childUrl);
          if (!childDoc) return null;

          if (childUrl.endsWith("/")) {
            const childUrlResponses = await this.getChildUrlsRecursive(
              childUrl,
              visited,
              depth + 1
            );
            return [childDoc, ...childUrlResponses];
          }

          return [childDoc];
        })()
      )
    );

    return results.flat().filter((docs) => docs !== null) as Document[];
  }

  async load(): Promise<Document[]> {
    const rootDoc = await this.getUrlAsDoc(this.url);
    if (!rootDoc) return [];

    const docs = [rootDoc];
    docs.push(
      ...(await this.getChildUrlsRecursive(this.url, new Set([this.url])))
    );
    return docs;
  }
}
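The `fetchWithTimeout` helper above leans on `AbortSignal.timeout()` (available in Node 17.3+): the signal aborts automatically after the given number of milliseconds, causing the in-flight `fetch` to reject. The same pattern works for any abortable operation; here is a standalone sketch (the `withTimeout` and `slowOp` names are invented for illustration):

```typescript
// Run an abortable operation under a deadline using AbortSignal.timeout.
const withTimeout = <T>(
  ms: number,
  run: (signal: AbortSignal) => Promise<T>
): Promise<T> => run(AbortSignal.timeout(ms));

// A slow operation that cooperates with the abort signal.
const slowOp = (signal: AbortSignal): Promise<string> =>
  new Promise((resolve, reject) => {
    const timer = setTimeout(() => resolve("done"), 5000);
    signal.addEventListener("abort", () => {
      clearTimeout(timer);
      reject(signal.reason); // a "TimeoutError" DOMException once the deadline passes
    });
  });

withTimeout(50, slowOp).catch((err) => console.log(err.name)); // logs "TimeoutError"
```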
1 change: 1 addition & 0 deletions langchain/src/load/import_constants.ts
@@ -67,6 +67,7 @@ export const optionalImportEntrypoints = [
  "langchain/document_loaders/web/github",
  "langchain/document_loaders/web/notiondb",
  "langchain/document_loaders/web/notionapi",
  "langchain/document_loaders/web/recursive_url",
  "langchain/document_loaders/web/s3",
  "langchain/document_loaders/web/sonix_audio",
  "langchain/document_loaders/web/confluence",
3 changes: 3 additions & 0 deletions langchain/src/load/import_type.d.ts
@@ -199,6 +199,9 @@ export interface OptionalImportMap {
  "langchain/document_loaders/web/notionapi"?:
    | typeof import("../document_loaders/web/notionapi.js")
    | Promise<typeof import("../document_loaders/web/notionapi.js")>;
  "langchain/document_loaders/web/recursive_url"?:
    | typeof import("../document_loaders/web/recursive_url.js")
    | Promise<typeof import("../document_loaders/web/recursive_url.js")>;
  "langchain/document_loaders/web/s3"?:
    | typeof import("../document_loaders/web/s3.js")
    | Promise<typeof import("../document_loaders/web/s3.js")>;
1 change: 1 addition & 0 deletions langchain/tsconfig.json
@@ -125,6 +125,7 @@
    "src/document_loaders/web/github.ts",
    "src/document_loaders/web/notiondb.ts",
    "src/document_loaders/web/notionapi.ts",
    "src/document_loaders/web/recursive_url.ts",
    "src/document_loaders/web/s3.ts",
    "src/document_loaders/web/sonix_audio.ts",
    "src/document_loaders/web/confluence.ts",
