From 4d55d1e3179653294ce98b7625fa7143e1597f5b Mon Sep 17 00:00:00 2001
From: Jacob Lee <jacoblee93@gmail.com>
Date: Wed, 31 May 2023 08:11:57 -0700
Subject: [PATCH] Adds splitters for different programming and markup languages
 (#1469)

* Adds an HTML text splitter

* Formatting

* Adds docs page

* Adds CodeTextSplitter with support for popular languages

* Improve text splitting by preserving separators

* Fix formatting

* Merge and fix tests

* Update docs

* Factor individual language splitters into fromLanguage method, add docs
---
 .../indexes/text_splitters/examples/code.mdx  |  38 ++
 .../indexes/text_splitters/examples/latex.mdx |  32 --
 .../text_splitters/examples/markdown.mdx      |  61 ---
 .../examples/recursive_character.mdx          |   2 +-
 examples/src/indexes/html_text_splitter.ts    |  74 ++++
 .../src/indexes/javascript_text_splitter.ts   |  54 +++
 examples/src/indexes/latex_text_splitter.ts   |  57 +--
 examples/src/indexes/python_text_splitter.ts  |  36 ++
 .../src/tests/code_text_splitter.test.ts      | 289 ++++++++++++
 langchain/src/tests/text_splitter.test.ts     |  55 +++
 langchain/src/text_splitter.ts                | 416 +++++++++++++++---
 11 files changed, 937 insertions(+), 177 deletions(-)
 create mode 100644 docs/docs/modules/indexes/text_splitters/examples/code.mdx
 delete mode 100644 docs/docs/modules/indexes/text_splitters/examples/latex.mdx
 delete mode 100644 docs/docs/modules/indexes/text_splitters/examples/markdown.mdx
 create mode 100644 examples/src/indexes/html_text_splitter.ts
 create mode 100644 examples/src/indexes/javascript_text_splitter.ts
 create mode 100644 examples/src/indexes/python_text_splitter.ts
 create mode 100644 langchain/src/tests/code_text_splitter.test.ts
diff --git a/docs/docs/modules/indexes/text_splitters/examples/code.mdx b/docs/docs/modules/indexes/text_splitters/examples/code.mdx
new file mode 100644
index 000000000000..33ca60c51826
--- /dev/null
+++ b/docs/docs/modules/indexes/text_splitters/examples/code.mdx
@@ -0,0 +1,38 @@
+---
+hide_table_of_contents: true
+---
+
+# Code and Markup Text Splitters
+
+LangChain supports a variety of different markup and programming language-specific text splitters to split your text based on language-specific syntax.
+This results in more semantically self-contained chunks that are more useful to a vector store or other retriever.
+Popular languages like JavaScript, Python, and Rust are supported, as well as LaTeX, HTML, and Markdown.
+
+## Usage
+
+Initialize a standard `RecursiveCharacterTextSplitter` with the `fromLanguage` factory method. Below are some examples for various languages.
+
+## JavaScript
+
+import CodeBlock from "@theme/CodeBlock";
+import JSExample from "@examples/indexes/javascript_text_splitter.ts";
+
+<CodeBlock language="typescript">{JSExample}</CodeBlock>
+
+## Python
+
+import PythonExample from "@examples/indexes/python_text_splitter.ts";
+
+<CodeBlock language="typescript">{PythonExample}</CodeBlock>
+
+## HTML
+
+import HTMLExample from "@examples/indexes/html_text_splitter.ts";
+
+<CodeBlock language="typescript">{HTMLExample}</CodeBlock>
+
+## LaTeX
+
+import LatexExample from "@examples/indexes/latex_text_splitter.ts";
+
+<CodeBlock language="typescript">{LatexExample}</CodeBlock>
diff --git a/docs/docs/modules/indexes/text_splitters/examples/latex.mdx b/docs/docs/modules/indexes/text_splitters/examples/latex.mdx
deleted file mode 100644
index bea3a1d9578b..000000000000
--- a/docs/docs/modules/indexes/text_splitters/examples/latex.mdx
+++ /dev/null
@@ -1,32 +0,0 @@
----
-hide_table_of_contents: true
----
-
-# `LatexTextSplitter`
-
-If you want to load documents in Latex format then try out the `LatexTextSplitter`. This class will split your content into documents based on the Latex syntax.
-For example, given this Latex input:
-
-```latex
-\begin{document}
-\title{🦜️🔗 LangChain}
-⚡ Building applications with LLMs through composability ⚡
-
-\section{Quick Install}
-
-\begin{verbatim}
-Hopefully this code block isn't split
-yarn add langchain
-\end{verbatim}
-
-As an open source project in a rapidly developing field, we are extremely open to contributions.
-
-\end{document}
-```
-
-The `LatexTextSplitter` will split the content into the following documents:
-
-import CodeBlock from "@theme/CodeBlock";
-import Example from "@examples/indexes/latex_text_splitter.ts";
-
-<CodeBlock language="typescript">{Example}</CodeBlock>
diff --git a/docs/docs/modules/indexes/text_splitters/examples/markdown.mdx b/docs/docs/modules/indexes/text_splitters/examples/markdown.mdx
deleted file mode 100644
index e5c07ffcadf4..000000000000
--- a/docs/docs/modules/indexes/text_splitters/examples/markdown.mdx
+++ /dev/null
@@ -1,61 +0,0 @@
----
-hide_table_of_contents: true
----
-
-# `MarkdownTextSplitter`
-
-If your content is in Markdown format then `MarkdownTextSplitter`. This class will split your content into documents based on the Markdown headers. For example, if you have the following Markdown content:
-
-```markdown
-# Header 1
-
-This is some content.
-
-## Header 2
-
-This is some more content.
-
-# Header 3
-
-This is even more content.
-```
-
-Then the `MarkdownTextSplitter` will split the content into three documents:
-
-```typescript
-import { MarkdownTextSplitter } from "langchain/text_splitter";
-
-const text = `# Header 1
-
-This is some content.
-
-## Header 2
-
-This is some more content.
-
-# Header 3
-
-This is even more content.`;
-
-const splitter = new MarkdownTextSplitter();
-
-const output = await splitter.createDocuments([text], {
-  metadata: "something",
-});
-/*
-[
-  {
-    "pageContent": "# Header 1\n\nThis is some content.",
-    "metadata": "something"
-  },
-  {
-    "pageContent": "## Header 2\n\nThis is some more content.",
-    "metadata": "something"
-  },
-  {
-    "pageContent": "# Header 3\n\nThis is even more content.",
-    "metadata": "something"
-  }
-]
-*/
-```
diff --git a/docs/docs/modules/indexes/text_splitters/examples/recursive_character.mdx b/docs/docs/modules/indexes/text_splitters/examples/recursive_character.mdx
index b1073c09f652..4fa2ac4574ce 100644
--- a/docs/docs/modules/indexes/text_splitters/examples/recursive_character.mdx
+++ b/docs/docs/modules/indexes/text_splitters/examples/recursive_character.mdx
@@ -2,7 +2,7 @@
 hide_table_of_contents: true
 ---
 
-# `RecursiveCharacterTextSplitter`
+# RecursiveCharacterTextSplitter
 
 The recommended TextSplitter is the `RecursiveCharacterTextSplitter`. This will split documents recursively by different characters - starting with `"\n\n"`, then `"\n"`, then `" "`. This is nice because it will try to keep all the semantically relevant content in the same place for as long as possible.
 
diff --git a/examples/src/indexes/html_text_splitter.ts b/examples/src/indexes/html_text_splitter.ts
new file mode 100644
index 000000000000..69bcc795aa90
--- /dev/null
+++ b/examples/src/indexes/html_text_splitter.ts
@@ -0,0 +1,74 @@
+import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
+
+const text = `<!DOCTYPE html>
+<html>
+  <head>
+    <title>🦜️🔗 LangChain</title>
+    <style>
+      body {
+        font-family: Arial, sans-serif;
+      }
+      h1 {
+        color: darkblue;
+      }
+    </style>
+  </head>
+  <body>
+    <div>
+      <h1>🦜️🔗 LangChain</h1>
+      <p>⚡ Building applications with LLMs through composability ⚡</p>
+    </div>
+    <div>
+      As an open source project in a rapidly developing field, we are extremely open to contributions.
+    </div>
+  </body>
+</html>`;
+
+const splitter = RecursiveCharacterTextSplitter.fromLanguage("html", {
+  chunkSize: 175,
+  chunkOverlap: 20,
+});
+const output = await splitter.createDocuments([text]);
+
+console.log(output);
+
+/*
+  [
+    Document {
+      pageContent: '<!DOCTYPE html>\n<html>',
+      metadata: { loc: [Object] }
+    },
+    Document {
+      pageContent: '<head>\n    <title>🦜️🔗 LangChain</title>',
+      metadata: { loc: [Object] }
+    },
+    Document {
+      pageContent: '<style>\n' +
+        '      body {\n' +
+        '        font-family: Arial, sans-serif;\n' +
+        '      }\n' +
+        '      h1 {\n' +
+        '        color: darkblue;\n' +
+        '      }\n' +
+        '    </style>\n' +
+        '  </head>',
+      metadata: { loc: [Object] }
+    },
+    Document {
+      pageContent: '<body>\n' +
+        '    <div>\n' +
+        '      <h1>🦜️🔗 LangChain</h1>\n' +
+        '      <p>⚡ Building applications with LLMs through composability ⚡</p>\n' +
+        '    </div>',
+      metadata: { loc: [Object] }
+    },
+    Document {
+      pageContent: '<div>\n' +
+        '      As an open source project in a rapidly developing field, we are extremely open to contributions.\n' +
+        '    </div>\n' +
+        '  </body>\n' +
+        '</html>',
+      metadata: { loc: [Object] }
+    }
+  ]
+*/
diff --git a/examples/src/indexes/javascript_text_splitter.ts b/examples/src/indexes/javascript_text_splitter.ts
new file mode 100644
index 000000000000..bc466e35a540
--- /dev/null
+++ b/examples/src/indexes/javascript_text_splitter.ts
@@ -0,0 +1,54 @@
+import {
+  SupportedTextSplitterLanguages,
+  RecursiveCharacterTextSplitter,
+} from "langchain/text_splitter";
+
+console.log(SupportedTextSplitterLanguages); // Array of supported languages
+
+/*
+  [
+    'cpp',      'go',
+    'java',     'js',
+    'php',      'proto',
+    'python',   'rst',
+    'ruby',     'rust',
+    'scala',    'swift',
+    'markdown', 'latex',
+    'html'
+  ]
+*/
+
+const jsCode = `function helloWorld() {
+  console.log("Hello, World!");
+}
+// Call the function
+helloWorld();`;
+
+const splitter = RecursiveCharacterTextSplitter.fromLanguage("js", {
+  chunkSize: 32,
+  chunkOverlap: 0,
+});
+const jsOutput = await splitter.createDocuments([jsCode]);
+
+console.log(jsOutput);
+
+/*
+  [
+    Document {
+      pageContent: 'function helloWorld() {',
+      metadata: { loc: [Object] }
+    },
+    Document {
+      pageContent: 'console.log("Hello, World!");',
+      metadata: { loc: [Object] }
+    },
+    Document {
+      pageContent: '}\n// Call the function',
+      metadata: { loc: [Object] }
+    },
+    Document {
+      pageContent: 'helloWorld();',
+      metadata: { loc: [Object] }
+    }
+  ]
+*/
diff --git a/examples/src/indexes/latex_text_splitter.ts b/examples/src/indexes/latex_text_splitter.ts
index 0b467ac445f3..7fb29f48aa8a 100644
--- a/examples/src/indexes/latex_text_splitter.ts
+++ b/examples/src/indexes/latex_text_splitter.ts
@@ -1,4 +1,4 @@
-import { LatexTextSplitter } from "langchain/text_splitter";
+import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
 
 const text = `\\begin{document}
 \\title{🦜️🔗 LangChain}
@@ -15,7 +15,7 @@ As an open source project in a rapidly developing field, we are extremely open t
 
 \\end{document}`;
 
-const splitter = new LatexTextSplitter({
+const splitter = RecursiveCharacterTextSplitter.fromLanguage("latex", {
   chunkSize: 100,
   chunkOverlap: 0,
 });
@@ -24,30 +24,31 @@ const output = await splitter.createDocuments([text]);
 console.log(output);
 
 /*
-[
-  Document {
-    pageContent: '\\begin{document}\n' +
-      '\\title{🦜️🔗 LangChain}\n' +
-      '⚡ Building applications with LLMs through composability ⚡',
-    metadata: { loc: [Object] }
-  },
-  Document {
-    pageContent: 'Quick Install}',
-    metadata: { loc: [Object] }
-  },
-  Document {
-    pageContent: "Hopefully this code block isn't split\n" +
-      'yarn add langchain\n' +
-      '\\end{verbatim}\n' +
-      '\n' +
-      'As an open source project in a rapidly',
-    metadata: { loc: [Object] }
-  },
-  Document {
-    pageContent: 'developing field, we are extremely open to contributions.\n' +
-      '\n' +
-      '\\end{document}',
-    metadata: { loc: [Object] }
-  }
-]
+  [
+    Document {
+      pageContent: '\\begin{document}\n' +
+        '\\title{🦜️🔗 LangChain}\n' +
+        '⚡ Building applications with LLMs through composability ⚡',
+      metadata: { loc: [Object] }
+    },
+    Document {
+      pageContent: '\\section{Quick Install}',
+      metadata: { loc: [Object] }
+    },
+    Document {
+      pageContent: '\\begin{verbatim}\n' +
+        "Hopefully this code block isn't split\n" +
+        'yarn add langchain\n' +
+        '\\end{verbatim}',
+      metadata: { loc: [Object] }
+    },
+    Document {
+      pageContent: 'As an open source project in a rapidly developing field, we are extremely open to contributions.',
+      metadata: { loc: [Object] }
+    },
+    Document {
+      pageContent: '\\end{document}',
+      metadata: { loc: [Object] }
+    }
+  ]
 */
diff --git a/examples/src/indexes/python_text_splitter.ts b/examples/src/indexes/python_text_splitter.ts
new file mode 100644
index 000000000000..6dbac98f1870
--- /dev/null
+++ b/examples/src/indexes/python_text_splitter.ts
@@ -0,0 +1,36 @@
+import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
+
+const pythonCode = `def hello_world():
+  print("Hello, World!")
+# Call the function
+hello_world()`;
+
+const splitter = RecursiveCharacterTextSplitter.fromLanguage("python", {
+  chunkSize: 32,
+  chunkOverlap: 0,
+});
+
+const pythonOutput = await splitter.createDocuments([pythonCode]);
+
+console.log(pythonOutput);
+
+/*
+  [
+    Document {
+      pageContent: 'def hello_world():',
+      metadata: { loc: [Object] }
+    },
+    Document {
+      pageContent: 'print("Hello, World!")',
+      metadata: { loc: [Object] }
+    },
+    Document {
+      pageContent: '# Call the function',
+      metadata: { loc: [Object] }
+    },
+    Document {
+      pageContent: 'hello_world()',
+      metadata: { loc: [Object] }
+    }
+  ]
+*/
diff --git a/langchain/src/tests/code_text_splitter.test.ts b/langchain/src/tests/code_text_splitter.test.ts
new file mode 100644
index 000000000000..2899e09b6eb2
--- /dev/null
+++ b/langchain/src/tests/code_text_splitter.test.ts
@@ -0,0 +1,289 @@
+import { test, expect } from "@jest/globals";
+import { RecursiveCharacterTextSplitter } from "../text_splitter.js";
+
+test("Python code splitter", async () => {
+  const splitter = RecursiveCharacterTextSplitter.fromLanguage("python", {
+    chunkSize: 16,
+    chunkOverlap: 0,
+  });
+  const code = `def hello_world():
+  print("Hello, World!")
+# Call the function
+hello_world()`;
+  const chunks = await splitter.splitText(code);
+  expect(chunks).toStrictEqual([
+    "def",
+    "hello_world():",
+    'print("Hello,',
+    'World!")',
+    "# Call the",
+    "function",
+    "hello_world()",
+  ]);
+});
+
+test("Golang code splitter", async () => {
+  const splitter = RecursiveCharacterTextSplitter.fromLanguage("go", {
+    chunkSize: 16,
+    chunkOverlap: 0,
+  });
+  const code = `package main
+import "fmt"
+func helloWorld() {
+    fmt.Println("Hello, World!")
+}
+func main() {
+    helloWorld()
+}`;
+  const chunks = await splitter.splitText(code);
+  expect(chunks).toStrictEqual([
+    "package main",
+    'import "fmt"',
+    "func",
+    "helloWorld() {",
+    'fmt.Println("He',
+    "llo,",
+    'World!")',
+    "}",
+    "func main() {",
+    "helloWorld()",
+    "}",
+  ]);
+});
+
+test("RST code splitter", async () => {
+  const splitter = RecursiveCharacterTextSplitter.fromLanguage("rst", {
+    chunkSize: 16,
+    chunkOverlap: 0,
+  });
+  const code = `Sample Document
+===============
+Section
+-------
+This is the content of the section.
+Lists
+-----
+- Item 1
+- Item 2
+- Item 3`;
+  const chunks = await splitter.splitText(code);
+  expect(chunks).toStrictEqual([
+    "Sample Document",
+    "===============",
+    "Section\n-------",
+    "This is the",
+    "content of the",
+    "section.",
+    "Lists\n-----",
+    "- Item 1",
+    "- Item 2",
+    "- Item 3",
+  ]);
+});
+
+test("Proto code splitter", async () => {
+  const splitter = RecursiveCharacterTextSplitter.fromLanguage("proto", {
+    chunkSize: 16,
+    chunkOverlap: 0,
+  });
+  const code = `syntax = "proto3";
+package example;
+message Person {
+    string name = 1;
+    int32 age = 2;
+    repeated string hobbies = 3;
+}`;
+  const chunks = await splitter.splitText(code);
+  expect(chunks).toStrictEqual([
+    "syntax =",
+    '"proto3";',
+    "package",
+    "example;",
+    "message Person",
+    "{",
+    "string name",
+    "= 1;",
+    "int32 age =",
+    "2;",
+    "repeated",
+    "string hobbies",
+    "= 3;",
+    "}",
+  ]);
+});
+
+test("JS code splitter", async () => {
+  const splitter = RecursiveCharacterTextSplitter.fromLanguage("js", {
+    chunkSize: 16,
+    chunkOverlap: 0,
+  });
+  const code = `function helloWorld() {
+  console.log("Hello, World!");
+}
+// Call the function
+helloWorld();`;
+  const chunks = await splitter.splitText(code);
+  expect(chunks).toStrictEqual([
+    "function",
+    "helloWorld() {",
+    'console.log("He',
+    "llo,",
+    'World!");',
+    "}",
+    "// Call the",
+    "function",
+    "helloWorld();",
+  ]);
+});
+
+test("Java code splitter", async () => {
+  const splitter = RecursiveCharacterTextSplitter.fromLanguage("java", {
+    chunkSize: 16,
+    chunkOverlap: 0,
+  });
+  const code = `public class HelloWorld {
+  public static void main(String[] args) {
+      System.out.println("Hello, World!");
+  }
+}`;
+  const chunks = await splitter.splitText(code);
+  expect(chunks).toStrictEqual([
+    "public class",
+    "HelloWorld {",
+    "public static",
+    "void",
+    "main(String[]",
+    "args) {",
+    "System.out.prin",
+    'tln("Hello,',
+    'World!");',
+    "}\n}",
+  ]);
+});
+
+test("CPP code splitter", async () => {
+  const splitter = RecursiveCharacterTextSplitter.fromLanguage("cpp", {
+    chunkSize: 16,
+    chunkOverlap: 0,
+  });
+  const code = `#include <iostream>
+int main() {
+    std::cout << "Hello, World!" << std::endl;
+    return 0;
+}`;
+  const chunks = await splitter.splitText(code);
+  expect(chunks).toStrictEqual([
+    "#include",
+    "<iostream>",
+    "int main() {",
+    "std::cout",
+    '<< "Hello,',
+    'World!" <<',
+    "std::endl;",
+    "return 0;\n}",
+  ]);
+});
+
+test("Scala code splitter", async () => {
+  const splitter = RecursiveCharacterTextSplitter.fromLanguage("scala", {
+    chunkSize: 16,
+    chunkOverlap: 0,
+  });
+  const code = `object HelloWorld {
+  def main(args: Array[String]): Unit = {
+    println("Hello, World!")
+  }
+}`;
+  const chunks = await splitter.splitText(code);
+  expect(chunks).toStrictEqual([
+    "object",
+    "HelloWorld {",
+    "def",
+    "main(args:",
+    "Array[String]):",
+    "Unit = {",
+    'println("Hello,',
+    'World!")',
+    "}\n}",
+  ]);
+});
+
+test("Ruby code splitter", async () => {
+  const splitter = RecursiveCharacterTextSplitter.fromLanguage("ruby", {
+    chunkSize: 16,
+    chunkOverlap: 0,
+  });
+  const code = `def hello_world
+  puts "Hello, World!"
+end
+hello_world`;
+  const chunks = await splitter.splitText(code);
+  expect(chunks).toStrictEqual([
+    "def hello_world",
+    'puts "Hello,',
+    'World!"',
+    "end\nhello_world",
+  ]);
+});
+
+test("PHP code splitter", async () => {
+  const splitter = RecursiveCharacterTextSplitter.fromLanguage("php", {
+    chunkSize: 16,
+    chunkOverlap: 0,
+  });
+  const code = `<?php
+function hello_world() {
+    echo "Hello, World!";
+}
+hello_world();
+?>`;
+  const chunks = await splitter.splitText(code);
+  expect(chunks).toStrictEqual([
+    "<?php",
+    "function",
+    "hello_world() {",
+    "echo",
+    '"Hello,',
+    'World!";',
+    "}",
+    "hello_world();",
+    "?>",
+  ]);
+});
+
+test("Swift code splitter", async () => {
+  const splitter = RecursiveCharacterTextSplitter.fromLanguage("swift", {
+    chunkSize: 16,
+    chunkOverlap: 0,
+  });
+  const code = `func helloWorld() {
+  print("Hello, World!")
+}
+helloWorld()`;
+  const chunks = await splitter.splitText(code);
+  expect(chunks).toStrictEqual([
+    "func",
+    "helloWorld() {",
+    'print("Hello,',
+    'World!")',
+    "}\nhelloWorld()",
+  ]);
+});
+
+test("Rust code splitter", async () => {
+  const splitter = RecursiveCharacterTextSplitter.fromLanguage("rust", {
+    chunkSize: 16,
+    chunkOverlap: 0,
+  });
+  const code = `fn main() {
+  println!("Hello, World!");
+}`;
+  const chunks = await splitter.splitText(code);
+  expect(chunks).toStrictEqual([
+    "fn main() {",
+    'println!("Hello',
+    ",",
+    'World!");',
+    "}",
+  ]);
+});
diff --git a/langchain/src/tests/text_splitter.test.ts b/langchain/src/tests/text_splitter.test.ts
index 1f15ed3f707a..bc229da515ca 100644
--- a/langchain/src/tests/text_splitter.test.ts
+++ b/langchain/src/tests/text_splitter.test.ts
@@ -253,6 +253,61 @@ As an open source project in a rapidly developing field, we are extremely open t
   expect(output).toEqual(expectedOutput);
 });
 
+test("Test HTML text splitter", async () => {
+  const text = `<!DOCTYPE html>
+<html>
+  <head>
+    <title>🦜️🔗 LangChain</title>
+    <style>
+      body {
+        font-family: Arial, sans-serif;
+      }
+      h1 {
+        color: darkblue;
+      }
+    </style>
+  </head>
+  <body>
+    <div>
+      <h1>🦜️🔗 LangChain</h1>
+      <p>⚡ Building applications with LLMs through composability ⚡</p>
+    </div>
+    <div>
+      As an open source project in a rapidly developing field, we are extremely open to contributions.
+    </div>
+  </body>
+</html>`;
+  const splitter = RecursiveCharacterTextSplitter.fromLanguage("html", {
+    chunkSize: 175,
+    chunkOverlap: 20,
+  });
+  const output = await splitter.splitText(text);
+
+  const expectedOutput = [
+    "<!DOCTYPE html>\n<html>",
+    "<head>\n    <title>🦜️🔗 LangChain</title>",
+    `<style>\n      body {
+        font-family: Arial, sans-serif;
+      }
+      h1 {
+        color: darkblue;
+      }
+    </style>
+  </head>`,
+    `<body>
+    <div>
+      <h1>🦜️🔗 LangChain</h1>
+      <p>⚡ Building applications with LLMs through composability ⚡</p>
+    </div>`,
+    `<div>
+      As an open source project in a rapidly developing field, we are extremely open to contributions.
+    </div>
+  </body>
+</html>`,
+  ];
+  expect(output).toEqual(expectedOutput);
+});
+
 test("Test lines loc on iterative text splitter.", async () => {
   const text = `Hi.\nI'm Harrison.\n\nHow?\na\nb`;
   const splitter = new RecursiveCharacterTextSplitter({
diff --git a/langchain/src/text_splitter.ts b/langchain/src/text_splitter.ts
index 3fd1acbc1b51..e3b2aa09bdb2 100644
--- a/langchain/src/text_splitter.ts
+++ b/langchain/src/text_splitter.ts
@@ -206,6 +206,27 @@ export interface RecursiveCharacterTextSplitterParams
   separators: string[];
 }
 
+export const SupportedTextSplitterLanguages = [
+  "cpp",
+  "go",
+  "java",
+  "js",
+  "php",
+  "proto",
+  "python",
+  "rst",
+  "ruby",
+  "rust",
+  "scala",
+  "swift",
+  "markdown",
+  "latex",
+  "html",
+] as const;
+
+export type SupportedTextSplitterLanguage =
+  (typeof SupportedTextSplitterLanguages)[number];
+
 export class RecursiveCharacterTextSplitter
   extends TextSplitter
   implements RecursiveCharacterTextSplitterParams
@@ -270,6 +291,336 @@ export class RecursiveCharacterTextSplitter
   async splitText(text: string): Promise<string[]> {
     return this._splitText(text, this.separators);
   }
+
+  static fromLanguage(
+    language: SupportedTextSplitterLanguage,
+    options?: Partial<RecursiveCharacterTextSplitterParams>
+  ) {
+    return new RecursiveCharacterTextSplitter({
+      ...options, // no-op when omitted; separators below always win
+      separators:
+        RecursiveCharacterTextSplitter.getSeparatorsForLanguage(language),
+    });
+  }
+
+  static getSeparatorsForLanguage(language: SupportedTextSplitterLanguage) {
+    if (language === "cpp") {
+      return [
+        // Split along class definitions
+        "\nclass ",
+        // Split along function definitions
+        "\nvoid ",
+        "\nint ",
+        "\nfloat ",
+        "\ndouble ",
+        // Split along control flow statements
+        "\nif ",
+        "\nfor ",
+        "\nwhile ",
+        "\nswitch ",
+        "\ncase ",
+        // Split by the normal type of lines
+        "\n\n",
+        "\n",
+        " ",
+        "",
+      ];
+    } else if (language === "go") {
+      return [
+        // Split along function definitions
+        "\nfunc ",
+        "\nvar ",
+        "\nconst ",
+        "\ntype ",
+        // Split along control flow statements
+        "\nif ",
+        "\nfor ",
+        "\nswitch ",
+        "\ncase ",
+        // Split by the normal type of lines
+        "\n\n",
+        "\n",
+        " ",
+        "",
+      ];
+    } else if (language === "java") {
+      return [
+        // Split along class definitions
+        "\nclass ",
+        // Split along method definitions
+        "\npublic ",
+        "\nprotected ",
+        "\nprivate ",
+        "\nstatic ",
+        // Split along control flow statements
+        "\nif ",
+        "\nfor ",
+        "\nwhile ",
+        "\nswitch ",
+        "\ncase ",
+        // Split by the normal type of lines
+        "\n\n",
+        "\n",
+        " ",
+        "",
+      ];
+    } else if (language === "js") {
+      return [
+        // Split along function definitions
+        "\nfunction ",
+        "\nconst ",
+        "\nlet ",
+        "\nvar ",
+        "\nclass ",
+        // Split along control flow statements
+        "\nif ",
+        "\nfor ",
+        "\nwhile ",
+        "\nswitch ",
+        "\ncase ",
+        "\ndefault ",
+        // Split by the normal type of lines
+        "\n\n",
+        "\n",
+        " ",
+        "",
+      ];
+    } else if (language === "php") {
+      return [
+        // Split along function definitions
+        "\nfunction ",
+        // Split along class definitions
+        "\nclass ",
+        // Split along control flow statements
+        "\nif ",
+        "\nforeach ",
+        "\nwhile ",
+        "\ndo ",
+        "\nswitch ",
+        "\ncase ",
+        // Split by the normal type of lines
+        "\n\n",
+        "\n",
+        " ",
+        "",
+      ];
+    } else if (language === "proto") {
+      return [
+        // Split along message definitions
+        "\nmessage ",
+        // Split along service definitions
+        "\nservice ",
+        // Split along enum definitions
+        "\nenum ",
+        // Split along option definitions
+        "\noption ",
+        // Split along import statements
+        "\nimport ",
+        // Split along syntax declarations
+        "\nsyntax ",
+        // Split by the normal type of lines
+        "\n\n",
+        "\n",
+        " ",
+        "",
+      ];
+    } else if (language === "python") {
+      return [
+        // First, try to split along class and function definitions
+        "\nclass ",
+        "\ndef ",
+        "\n\tdef ",
+        // Now split by the normal type of lines
+        "\n\n",
+        "\n",
+        " ",
+        "",
+      ];
+    } else if (language === "rst") {
+      return [
+        // Split along section titles
+        "\n===\n",
+        "\n---\n",
+        "\n***\n",
+        // Split along directive markers
+        "\n.. ",
+        // Split by the normal type of lines
+        "\n\n",
+        "\n",
+        " ",
+        "",
+      ];
+    } else if (language === "ruby") {
+      return [
+        // Split along method definitions
+        "\ndef ",
+        "\nclass ",
+        // Split along control flow statements
+        "\nif ",
+        "\nunless ",
+        "\nwhile ",
+        "\nfor ",
+        "\ndo ",
+        "\nbegin ",
+        "\nrescue ",
+        // Split by the normal type of lines
+        "\n\n",
+        "\n",
+        " ",
+        "",
+      ];
+    } else if (language === "rust") {
+      return [
+        // Split along function definitions
+        "\nfn ",
+        "\nconst ",
+        "\nlet ",
+        // Split along control flow statements
+        "\nif ",
+        "\nwhile ",
+        "\nfor ",
+        "\nloop ",
+        "\nmatch ",
+        "\nconst ",
+        // Split by the normal type of lines
+        "\n\n",
+        "\n",
+        " ",
+        "",
+      ];
+    } else if (language === "scala") {
+      return [
+        // Split along class definitions
+        "\nclass ",
+        "\nobject ",
+        // Split along method definitions
+        "\ndef ",
+        "\nval ",
+        "\nvar ",
+        // Split along control flow statements
+        "\nif ",
+        "\nfor ",
+        "\nwhile ",
+        "\nmatch ",
+        "\ncase ",
+        // Split by the normal type of lines
+        "\n\n",
+        "\n",
+        " ",
+        "",
+      ];
+    } else if (language === "swift") {
+      return [
+        // Split along function definitions
+        "\nfunc ",
+        // Split along class definitions
+        "\nclass ",
+        "\nstruct ",
+        "\nenum ",
+        // Split along control flow statements
+        "\nif ",
+        "\nfor ",
+        "\nwhile ",
+        "\ndo ",
+        "\nswitch ",
+        "\ncase ",
+        // Split by the normal type of lines
+        "\n\n",
+        "\n",
+        " ",
+        "",
+      ];
+    } else if (language === "markdown") {
+      return [
+        // First, try to split along Markdown headings (starting with level 2)
+        "\n## ",
+        "\n### ",
+        "\n#### ",
+        "\n##### ",
+        "\n###### ",
+        // Note the alternative syntax for headings (below) is not handled here
+        // Heading level 2
+        // ---------------
+        // End of code block
+        "```\n\n",
+        // Horizontal lines
+        "\n\n***\n\n",
+        "\n\n---\n\n",
+        "\n\n___\n\n",
+        // Note that horizontal lines written with *more than three* of
+        // *, -, or _ (e.g. "*****") are not handled by this splitter
+        "\n\n",
+        "\n",
+        " ",
+        "",
+      ];
+    } else if (language === "latex") {
+      return [
+        // First, try to split along Latex sections
+        "\n\\chapter{",
+        "\n\\section{",
+        "\n\\subsection{",
+        "\n\\subsubsection{",
+
+        // Now split by environments
+        "\n\\begin{enumerate}",
+        "\n\\begin{itemize}",
+        "\n\\begin{description}",
+        "\n\\begin{list}",
+        "\n\\begin{quote}",
+        "\n\\begin{quotation}",
+        "\n\\begin{verse}",
+        "\n\\begin{verbatim}",
+
+        // Now split by math environments
+        "\n\\begin{align}",
+        "$$",
+        "$",
+
+        // Now split by the normal type of lines
+        "\n\n",
+        "\n",
+        " ",
+        "",
+      ];
+    } else if (language === "html") {
+      return [
+        // First, try to split along HTML tags
+        "<body>",
+        "<div>",
+        "<p>",
+        "<br>",
+        "<li>",
+        "<h1>",
+        "<h2>",
+        "<h3>",
+        "<h4>",
+        "<h5>",
+        "<h6>",
+        "<span>",
+        "<table>",
+        "<tr>",
+        "<td>",
+        "<th>",
+        "<ul>",
+        "<ol>",
+        "<header>",
+        "<footer>",
+        "<nav>",
+        // Head
+        "<head>",
+        "<style>",
+        "<script>",
+        "<meta>",
+        "<title>",
+        // Normal type of lines
+        " ",
+        "",
+      ];
+    } else {
+      throw new Error(`Language ${language} is not supported.`);
+    }
+  }
 }
 
 export interface TokenTextSplitterParams extends TextSplitterParams {
@@ -336,32 +687,12 @@ export class MarkdownTextSplitter
   extends RecursiveCharacterTextSplitter
   implements MarkdownTextSplitterParams
 {
-  separators: string[] = [
-    // First, try to split along Markdown headings (starting with level 2)
-    "\n## ",
-    "\n### ",
-    "\n#### ",
-    "\n##### ",
-    "\n###### ",
-    // Note the alternative syntax for headings (below) is not handled here
-    // Heading level 2
-    // ---------------
-    // End of code block
-    "```\n\n",
-    // Horizontal lines
-    "\n\n***\n\n",
-    "\n\n---\n\n",
-    "\n\n___\n\n",
-    // Note that this splitter doesn't handle horizontal lines defined
-    // by *three or more* of ***, ---, or ___, but this is not handled
-    "\n\n",
-    "\n",
-    " ",
-    "",
-  ];
-
   constructor(fields?: Partial<MarkdownTextSplitterParams>) {
-    super(fields);
+    super({
+      ...fields,
+      separators:
+        RecursiveCharacterTextSplitter.getSeparatorsForLanguage("markdown"),
+    });
   }
 }
 
@@ -371,36 +702,11 @@ export class LatexTextSplitter
   extends RecursiveCharacterTextSplitter
   implements LatexTextSplitterParams
 {
-  separators: string[] = [
-    // First, try to split along Latex sections
-    "\n\\chapter{",
-    "\n\\section{",
-    "\n\\subsection{",
-    "\n\\subsubsection{",
-
-    // Now split by environments
-    "\n\\begin{enumerate}",
-    "\n\\begin{itemize}",
-    "\n\\begin{description}",
-    "\n\\begin{list}",
-    "\n\\begin{quote}",
-    "\n\\begin{quotation}",
-    "\n\\begin{verse}",
-    "\n\\begin{verbatim}",
-
-    // Now split by math environments
-    "\n\\begin{align}",
-    "$$",
-    "$",
-
-    // Now split by the normal type of lines
-    "\n\n",
-    "\n",
-    " ",
-    "",
-  ];
-
   constructor(fields?: Partial<LatexTextSplitterParams>) {
-    super(fields);
+    super({
+      ...fields,
+      separators:
+        RecursiveCharacterTextSplitter.getSeparatorsForLanguage("latex"),
+    });
   }
 }