From 4d55d1e3179653294ce98b7625fa7143e1597f5b Mon Sep 17 00:00:00 2001 From: Jacob Lee Date: Wed, 31 May 2023 08:11:57 -0700 Subject: [PATCH] Adds splitters for different programming and markup languages (#1469) * Adds a HTML text splitter * Formatting * Adds docs page * Adds CodeTextSplitter with support for popular languages * Improve text splitting by preserving separators * Fix formatting * Merge and fix tests * Update docs * Factor individual language splitters into fromLanguage method, add docs --- .../indexes/text_splitters/examples/code.mdx | 38 ++ .../indexes/text_splitters/examples/latex.mdx | 32 -- .../text_splitters/examples/markdown.mdx | 61 --- .../examples/recursive_character.mdx | 2 +- examples/src/indexes/html_text_splitter.ts | 74 ++++ .../src/indexes/javascript_text_splitter.ts | 54 +++ examples/src/indexes/latex_text_splitter.ts | 57 +-- examples/src/indexes/python_text_splitter.ts | 36 ++ .../src/tests/code_text_splitter.test.ts | 289 ++++++++++++ langchain/src/tests/text_splitter.test.ts | 55 +++ langchain/src/text_splitter.ts | 416 +++++++++++++++--- 11 files changed, 937 insertions(+), 177 deletions(-) create mode 100644 docs/docs/modules/indexes/text_splitters/examples/code.mdx delete mode 100644 docs/docs/modules/indexes/text_splitters/examples/latex.mdx delete mode 100644 docs/docs/modules/indexes/text_splitters/examples/markdown.mdx create mode 100644 examples/src/indexes/html_text_splitter.ts create mode 100644 examples/src/indexes/javascript_text_splitter.ts create mode 100644 examples/src/indexes/python_text_splitter.ts create mode 100644 langchain/src/tests/code_text_splitter.test.ts diff --git a/docs/docs/modules/indexes/text_splitters/examples/code.mdx b/docs/docs/modules/indexes/text_splitters/examples/code.mdx new file mode 100644 index 000000000000..33ca60c51826 --- /dev/null +++ b/docs/docs/modules/indexes/text_splitters/examples/code.mdx @@ -0,0 +1,38 @@ +--- +hide_table_of_contents: true +--- + +# Code and Markup Text 
Splitters + +LangChain supports a variety of different markup and programming language-specific text splitters to split your text based on language-specific syntax. +This results in more semantically self-contained chunks that are more useful to a vector store or other retriever. +Popular languages like JavaScript, Python, and Rust are supported as well as Latex, HTML, and Markdown. + +## Usage + +Initialize a standard `RecursiveCharacterTextSplitter` with the `fromLanguage` factory method. Below are some examples for various languages. + +## JavaScript + +import CodeBlock from "@theme/CodeBlock"; +import JSExample from "@examples/indexes/javascript_text_splitter.ts"; + +<CodeBlock language="typescript">{JSExample}</CodeBlock> + +## Python + +import PythonExample from "@examples/indexes/python_text_splitter.ts"; + +<CodeBlock language="typescript">{PythonExample}</CodeBlock> + +## HTML + +import HTMLExample from "@examples/indexes/html_text_splitter.ts"; + +<CodeBlock language="typescript">{HTMLExample}</CodeBlock> + +## Latex + +import LatexExample from "@examples/indexes/latex_text_splitter.ts"; + +<CodeBlock language="typescript">{LatexExample}</CodeBlock> diff --git a/docs/docs/modules/indexes/text_splitters/examples/latex.mdx b/docs/docs/modules/indexes/text_splitters/examples/latex.mdx deleted file mode 100644 index bea3a1d9578b..000000000000 --- a/docs/docs/modules/indexes/text_splitters/examples/latex.mdx +++ /dev/null @@ -1,32 +0,0 @@ ---- -hide_table_of_contents: true ---- - -# `LatexTextSplitter` - -If you want to load documents in Latex format then try out the `LatexTextSplitter`. This class will split your content into documents based on the Latex syntax. -For example, given this Latex input: - -```latex -\begin{document} -\title{🦜️🔗 LangChain} -⚡ Building applications with LLMs through composability ⚡ - -\section{Quick Install} - -\begin{verbatim} -Hopefully this code block isn't split -yarn add langchain -\end{verbatim} - -As an open source project in a rapidly developing field, we are extremely open to contributions. 
- -\end{document} -``` - -The `LatexTextSplitter` will split the content into the following documents: - -import CodeBlock from "@theme/CodeBlock"; -import Example from "@examples/indexes/latex_text_splitter.ts"; - -{Example} diff --git a/docs/docs/modules/indexes/text_splitters/examples/markdown.mdx b/docs/docs/modules/indexes/text_splitters/examples/markdown.mdx deleted file mode 100644 index e5c07ffcadf4..000000000000 --- a/docs/docs/modules/indexes/text_splitters/examples/markdown.mdx +++ /dev/null @@ -1,61 +0,0 @@ ---- -hide_table_of_contents: true ---- - -# `MarkdownTextSplitter` - -If your content is in Markdown format then `MarkdownTextSplitter`. This class will split your content into documents based on the Markdown headers. For example, if you have the following Markdown content: - -```markdown -# Header 1 - -This is some content. - -## Header 2 - -This is some more content. - -# Header 3 - -This is even more content. -``` - -Then the `MarkdownTextSplitter` will split the content into three documents: - -```typescript -import { MarkdownTextSplitter } from "langchain/text_splitter"; - -const text = `# Header 1 - -This is some content. - -## Header 2 - -This is some more content. 
- -# Header 3 - -This is even more content.`; - -const splitter = new MarkdownTextSplitter(); - -const output = await splitter.createDocuments([text], { - metadata: "something", -}); -/* -[ - { - "pageContent": "# Header 1\n\nThis is some content.", - "metadata": "something" - }, - { - "pageContent": "## Header 2\n\nThis is some more content.", - "metadata": "something" - }, - { - "pageContent": "# Header 3\n\nThis is even more content.", - "metadata": "something" - } -] -*/ -``` diff --git a/docs/docs/modules/indexes/text_splitters/examples/recursive_character.mdx b/docs/docs/modules/indexes/text_splitters/examples/recursive_character.mdx index b1073c09f652..4fa2ac4574ce 100644 --- a/docs/docs/modules/indexes/text_splitters/examples/recursive_character.mdx +++ b/docs/docs/modules/indexes/text_splitters/examples/recursive_character.mdx @@ -2,7 +2,7 @@ hide_table_of_contents: true --- -# `RecursiveCharacterTextSplitter` +# RecursiveCharacterTextSplitter The recommended TextSplitter is the `RecursiveCharacterTextSplitter`. This will split documents recursively by different characters - starting with `"\n\n"`, then `"\n"`, then `" "`. This is nice because it will try to keep all the semantically relevant content in the same place for as long as possible. diff --git a/examples/src/indexes/html_text_splitter.ts b/examples/src/indexes/html_text_splitter.ts new file mode 100644 index 000000000000..69bcc795aa90 --- /dev/null +++ b/examples/src/indexes/html_text_splitter.ts @@ -0,0 +1,74 @@ +import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"; + +const text = ` + + + 🦜️🔗 LangChain + + + +
+

🦜️🔗 LangChain

+

⚡ Building applications with LLMs through composability ⚡

+
+
+ As an open source project in a rapidly developing field, we are extremely open to contributions. +
+ +`; + +const splitter = RecursiveCharacterTextSplitter.fromLanguage("html", { + chunkSize: 175, + chunkOverlap: 20, +}); +const output = await splitter.createDocuments([text]); + +console.log(output); + +/* + [ + Document { + pageContent: '\n', + metadata: { loc: [Object] } + }, + Document { + pageContent: '\n 🦜️🔗 LangChain', + metadata: { loc: [Object] } + }, + Document { + pageContent: '\n' + + ' ', + metadata: { loc: [Object] } + }, + Document { + pageContent: '\n' + + '
\n' + + '

🦜️🔗 LangChain

\n' + + '

⚡ Building applications with LLMs through composability ⚡

\n' + + '
', + metadata: { loc: [Object] } + }, + Document { + pageContent: '
\n' + + ' As an open source project in a rapidly developing field, we are extremely open to contributions.\n' + + '
\n' + + ' \n' + + '', + metadata: { loc: [Object] } + } + ] +*/ diff --git a/examples/src/indexes/javascript_text_splitter.ts b/examples/src/indexes/javascript_text_splitter.ts new file mode 100644 index 000000000000..bc466e35a540 --- /dev/null +++ b/examples/src/indexes/javascript_text_splitter.ts @@ -0,0 +1,54 @@ +import { + SupportedTextSplitterLanguages, + RecursiveCharacterTextSplitter, +} from "langchain/text_splitter"; + +console.log(SupportedTextSplitterLanguages); // Array of supported languages + +/* + [ + 'cpp', 'go', + 'java', 'js', + 'php', 'proto', + 'python', 'rst', + 'ruby', 'rust', + 'scala', 'swift', + 'markdown', 'latex', + 'html' + ] +*/ + +const jsCode = `function helloWorld() { + console.log("Hello, World!"); +} +// Call the function +helloWorld();`; + +const splitter = RecursiveCharacterTextSplitter.fromLanguage("js", { + chunkSize: 32, + chunkOverlap: 0, +}); +const jsOutput = await splitter.createDocuments([jsCode]); + +console.log(jsOutput); + +/* + [ + Document { + pageContent: 'function helloWorld() {', + metadata: { loc: [Object] } + }, + Document { + pageContent: 'console.log("Hello, World!");', + metadata: { loc: [Object] } + }, + Document { + pageContent: '}\n// Call the function', + metadata: { loc: [Object] } + }, + Document { + pageContent: 'helloWorld();', + metadata: { loc: [Object] } + } + ] +*/ diff --git a/examples/src/indexes/latex_text_splitter.ts b/examples/src/indexes/latex_text_splitter.ts index 0b467ac445f3..7fb29f48aa8a 100644 --- a/examples/src/indexes/latex_text_splitter.ts +++ b/examples/src/indexes/latex_text_splitter.ts @@ -1,4 +1,4 @@ -import { LatexTextSplitter } from "langchain/text_splitter"; +import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"; const text = `\\begin{document} \\title{🦜️🔗 LangChain} @@ -15,7 +15,7 @@ As an open source project in a rapidly developing field, we are extremely open t \\end{document}`; -const splitter = new LatexTextSplitter({ +const splitter = 
RecursiveCharacterTextSplitter.fromLanguage("latex", { chunkSize: 100, chunkOverlap: 0, }); @@ -24,30 +24,31 @@ const output = await splitter.createDocuments([text]); console.log(output); /* -[ - Document { - pageContent: '\\begin{document}\n' + - '\\title{🦜️🔗 LangChain}\n' + - '⚡ Building applications with LLMs through composability ⚡', - metadata: { loc: [Object] } - }, - Document { - pageContent: 'Quick Install}', - metadata: { loc: [Object] } - }, - Document { - pageContent: "Hopefully this code block isn't split\n" + - 'yarn add langchain\n' + - '\\end{verbatim}\n' + - '\n' + - 'As an open source project in a rapidly', - metadata: { loc: [Object] } - }, - Document { - pageContent: 'developing field, we are extremely open to contributions.\n' + - '\n' + - '\\end{document}', - metadata: { loc: [Object] } - } -] + [ + Document { + pageContent: '\\begin{document}\n' + + '\\title{🦜️🔗 LangChain}\n' + + '⚡ Building applications with LLMs through composability ⚡', + metadata: { loc: [Object] } + }, + Document { + pageContent: '\\section{Quick Install}', + metadata: { loc: [Object] } + }, + Document { + pageContent: '\\begin{verbatim}\n' + + "Hopefully this code block isn't split\n" + + 'yarn add langchain\n' + + '\\end{verbatim}', + metadata: { loc: [Object] } + }, + Document { + pageContent: 'As an open source project in a rapidly developing field, we are extremely open to contributions.', + metadata: { loc: [Object] } + }, + Document { + pageContent: '\\end{document}', + metadata: { loc: [Object] } + } + ] */ diff --git a/examples/src/indexes/python_text_splitter.ts b/examples/src/indexes/python_text_splitter.ts new file mode 100644 index 000000000000..6dbac98f1870 --- /dev/null +++ b/examples/src/indexes/python_text_splitter.ts @@ -0,0 +1,36 @@ +import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"; + +const pythonCode = `def hello_world(): + print("Hello, World!") +# Call the function +hello_world()`; + +const splitter = 
RecursiveCharacterTextSplitter.fromLanguage("python", { + chunkSize: 32, + chunkOverlap: 0, +}); + +const pythonOutput = await splitter.createDocuments([pythonCode]); + +console.log(pythonOutput); + +/* + [ + Document { + pageContent: 'def hello_world():', + metadata: { loc: [Object] } + }, + Document { + pageContent: 'print("Hello, World!")', + metadata: { loc: [Object] } + }, + Document { + pageContent: '# Call the function', + metadata: { loc: [Object] } + }, + Document { + pageContent: 'hello_world()', + metadata: { loc: [Object] } + } + ] +*/ diff --git a/langchain/src/tests/code_text_splitter.test.ts b/langchain/src/tests/code_text_splitter.test.ts new file mode 100644 index 000000000000..2899e09b6eb2 --- /dev/null +++ b/langchain/src/tests/code_text_splitter.test.ts @@ -0,0 +1,289 @@ +import { test, expect } from "@jest/globals"; +import { RecursiveCharacterTextSplitter } from "../text_splitter.js"; + +test("Python code splitter", async () => { + const splitter = RecursiveCharacterTextSplitter.fromLanguage("python", { + chunkSize: 16, + chunkOverlap: 0, + }); + const code = `def hello_world(): + print("Hello, World!") +# Call the function +hello_world()`; + const chunks = await splitter.splitText(code); + expect(chunks).toStrictEqual([ + "def", + "hello_world():", + 'print("Hello,', + 'World!")', + "# Call the", + "function", + "hello_world()", + ]); +}); + +test("Golang code splitter", async () => { + const splitter = RecursiveCharacterTextSplitter.fromLanguage("go", { + chunkSize: 16, + chunkOverlap: 0, + }); + const code = `package main +import "fmt" +func helloWorld() { + fmt.Println("Hello, World!") +} +func main() { + helloWorld() +}`; + const chunks = await splitter.splitText(code); + expect(chunks).toStrictEqual([ + "package main", + 'import "fmt"', + "func", + "helloWorld() {", + 'fmt.Println("He', + "llo,", + 'World!")', + "}", + "func main() {", + "helloWorld()", + "}", + ]); +}); + +test("RST code splitter", async () => { + const splitter = 
RecursiveCharacterTextSplitter.fromLanguage("rst", { + chunkSize: 16, + chunkOverlap: 0, + }); + const code = `Sample Document +=============== +Section +------- +This is the content of the section. +Lists +----- +- Item 1 +- Item 2 +- Item 3`; + const chunks = await splitter.splitText(code); + expect(chunks).toStrictEqual([ + "Sample Document", + "===============", + "Section\n-------", + "This is the", + "content of the", + "section.", + "Lists\n-----", + "- Item 1", + "- Item 2", + "- Item 3", + ]); +}); + +test("Proto code splitter", async () => { + const splitter = RecursiveCharacterTextSplitter.fromLanguage("proto", { + chunkSize: 16, + chunkOverlap: 0, + }); + const code = `syntax = "proto3"; +package example; +message Person { + string name = 1; + int32 age = 2; + repeated string hobbies = 3; +}`; + const chunks = await splitter.splitText(code); + expect(chunks).toStrictEqual([ + "syntax =", + '"proto3";', + "package", + "example;", + "message Person", + "{", + "string name", + "= 1;", + "int32 age =", + "2;", + "repeated", + "string hobbies", + "= 3;", + "}", + ]); +}); + +test("JS code splitter", async () => { + const splitter = RecursiveCharacterTextSplitter.fromLanguage("js", { + chunkSize: 16, + chunkOverlap: 0, + }); + const code = `function helloWorld() { + console.log("Hello, World!"); +} +// Call the function +helloWorld();`; + const chunks = await splitter.splitText(code); + expect(chunks).toStrictEqual([ + "function", + "helloWorld() {", + 'console.log("He', + "llo,", + 'World!");', + "}", + "// Call the", + "function", + "helloWorld();", + ]); +}); + +test("Java code splitter", async () => { + const splitter = RecursiveCharacterTextSplitter.fromLanguage("java", { + chunkSize: 16, + chunkOverlap: 0, + }); + const code = `public class HelloWorld { + public static void main(String[] args) { + System.out.println("Hello, World!"); + } +}`; + const chunks = await splitter.splitText(code); + expect(chunks).toStrictEqual([ + "public class", + 
"HelloWorld {", + "public static", + "void", + "main(String[]", + "args) {", + "System.out.prin", + 'tln("Hello,', + 'World!");', + "}\n}", + ]); +}); + +test("CPP code splitter", async () => { + const splitter = RecursiveCharacterTextSplitter.fromLanguage("cpp", { + chunkSize: 16, + chunkOverlap: 0, + }); + const code = `#include +int main() { + std::cout << "Hello, World!" << std::endl; + return 0; +}`; + const chunks = await splitter.splitText(code); + expect(chunks).toStrictEqual([ + "#include", + "", + "int main() {", + "std::cout", + '<< "Hello,', + 'World!" <<', + "std::endl;", + "return 0;\n}", + ]); +}); + +test("Scala code splitter", async () => { + const splitter = RecursiveCharacterTextSplitter.fromLanguage("scala", { + chunkSize: 16, + chunkOverlap: 0, + }); + const code = `object HelloWorld { + def main(args: Array[String]): Unit = { + println("Hello, World!") + } +}`; + const chunks = await splitter.splitText(code); + expect(chunks).toStrictEqual([ + "object", + "HelloWorld {", + "def", + "main(args:", + "Array[String]):", + "Unit = {", + 'println("Hello,', + 'World!")', + "}\n}", + ]); +}); + +test("Ruby code splitter", async () => { + const splitter = RecursiveCharacterTextSplitter.fromLanguage("ruby", { + chunkSize: 16, + chunkOverlap: 0, + }); + const code = `def hello_world + puts "Hello, World!" 
+end +hello_world`; + const chunks = await splitter.splitText(code); + expect(chunks).toStrictEqual([ + "def hello_world", + 'puts "Hello,', + 'World!"', + "end\nhello_world", + ]); +}); + +test("PHP code splitter", async () => { + const splitter = RecursiveCharacterTextSplitter.fromLanguage("php", { + chunkSize: 16, + chunkOverlap: 0, + }); + const code = ``; + const chunks = await splitter.splitText(code); + expect(chunks).toStrictEqual([ + "", + ]); +}); + +test("Swift code splitter", async () => { + const splitter = RecursiveCharacterTextSplitter.fromLanguage("swift", { + chunkSize: 16, + chunkOverlap: 0, + }); + const code = `func helloWorld() { + print("Hello, World!") +} +helloWorld()`; + const chunks = await splitter.splitText(code); + expect(chunks).toStrictEqual([ + "func", + "helloWorld() {", + 'print("Hello,', + 'World!")', + "}\nhelloWorld()", + ]); +}); + +test("Rust code splitter", async () => { + const splitter = RecursiveCharacterTextSplitter.fromLanguage("rust", { + chunkSize: 16, + chunkOverlap: 0, + }); + const code = `fn main() { + println!("Hello, World!"); +}`; + const chunks = await splitter.splitText(code); + expect(chunks).toStrictEqual([ + "fn main() {", + 'println!("Hello', + ",", + 'World!");', + "}", + ]); +}); diff --git a/langchain/src/tests/text_splitter.test.ts b/langchain/src/tests/text_splitter.test.ts index 1f15ed3f707a..bc229da515ca 100644 --- a/langchain/src/tests/text_splitter.test.ts +++ b/langchain/src/tests/text_splitter.test.ts @@ -253,6 +253,61 @@ As an open source project in a rapidly developing field, we are extremely open t expect(output).toEqual(expectedOutput); }); +test("Test HTML text splitter", async () => { + const text = ` + + + 🦜️🔗 LangChain + + + +
+

🦜️🔗 LangChain

+

⚡ Building applications with LLMs through composability ⚡

+
+
+ As an open source project in a rapidly developing field, we are extremely open to contributions. +
+ +`; + const splitter = RecursiveCharacterTextSplitter.fromLanguage("html", { + chunkSize: 175, + chunkOverlap: 20, + }); + const output = await splitter.splitText(text); + + const expectedOutput = [ + "\n", + "\n 🦜️🔗 LangChain", + ` + `, + ` +
+

🦜️🔗 LangChain

+

⚡ Building applications with LLMs through composability ⚡

+
`, + `
+ As an open source project in a rapidly developing field, we are extremely open to contributions. +
+ +`, + ]; + expect(output).toEqual(expectedOutput); +}); + test("Test lines loc on iterative text splitter.", async () => { const text = `Hi.\nI'm Harrison.\n\nHow?\na\nb`; const splitter = new RecursiveCharacterTextSplitter({ diff --git a/langchain/src/text_splitter.ts b/langchain/src/text_splitter.ts index 3fd1acbc1b51..e3b2aa09bdb2 100644 --- a/langchain/src/text_splitter.ts +++ b/langchain/src/text_splitter.ts @@ -206,6 +206,27 @@ export interface RecursiveCharacterTextSplitterParams separators: string[]; } +export const SupportedTextSplitterLanguages = [ + "cpp", + "go", + "java", + "js", + "php", + "proto", + "python", + "rst", + "ruby", + "rust", + "scala", + "swift", + "markdown", + "latex", + "html", +] as const; + +export type SupportedTextSplitterLanguage = + (typeof SupportedTextSplitterLanguages)[number]; + export class RecursiveCharacterTextSplitter extends TextSplitter implements RecursiveCharacterTextSplitterParams @@ -270,6 +291,336 @@ export class RecursiveCharacterTextSplitter async splitText(text: string): Promise { return this._splitText(text, this.separators); } + + static fromLanguage( + language: SupportedTextSplitterLanguage, + options: Partial + ) { + return new RecursiveCharacterTextSplitter({ + ...options, + separators: + RecursiveCharacterTextSplitter.getSeparatorsForLanguage(language), + }); + } + + static getSeparatorsForLanguage(language: SupportedTextSplitterLanguage) { + if (language === "cpp") { + return [ + // Split along class definitions + "\nclass ", + // Split along function definitions + "\nvoid ", + "\nint ", + "\nfloat ", + "\ndouble ", + // Split along control flow statements + "\nif ", + "\nfor ", + "\nwhile ", + "\nswitch ", + "\ncase ", + // Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ]; + } else if (language === "go") { + return [ + // Split along function definitions + "\nfunc ", + "\nvar ", + "\nconst ", + "\ntype ", + // Split along control flow statements + "\nif ", + "\nfor ", + "\nswitch 
", + "\ncase ", + // Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ]; + } else if (language === "java") { + return [ + // Split along class definitions + "\nclass ", + // Split along method definitions + "\npublic ", + "\nprotected ", + "\nprivate ", + "\nstatic ", + // Split along control flow statements + "\nif ", + "\nfor ", + "\nwhile ", + "\nswitch ", + "\ncase ", + // Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ]; + } else if (language === "js") { + return [ + // Split along function definitions + "\nfunction ", + "\nconst ", + "\nlet ", + "\nvar ", + "\nclass ", + // Split along control flow statements + "\nif ", + "\nfor ", + "\nwhile ", + "\nswitch ", + "\ncase ", + "\ndefault ", + // Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ]; + } else if (language === "php") { + return [ + // Split along function definitions + "\nfunction ", + // Split along class definitions + "\nclass ", + // Split along control flow statements + "\nif ", + "\nforeach ", + "\nwhile ", + "\ndo ", + "\nswitch ", + "\ncase ", + // Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ]; + } else if (language === "proto") { + return [ + // Split along message definitions + "\nmessage ", + // Split along service definitions + "\nservice ", + // Split along enum definitions + "\nenum ", + // Split along option definitions + "\noption ", + // Split along import statements + "\nimport ", + // Split along syntax declarations + "\nsyntax ", + // Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ]; + } else if (language === "python") { + return [ + // First, try to split along class definitions + "\nclass ", + "\ndef ", + "\n\tdef ", + // Now split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ]; + } else if (language === "rst") { + return [ + // Split along section titles + "\n===\n", + "\n---\n", + "\n***\n", + // Split along directive markers + "\n.. 
", + // Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ]; + } else if (language === "ruby") { + return [ + // Split along method definitions + "\ndef ", + "\nclass ", + // Split along control flow statements + "\nif ", + "\nunless ", + "\nwhile ", + "\nfor ", + "\ndo ", + "\nbegin ", + "\nrescue ", + // Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ]; + } else if (language === "rust") { + return [ + // Split along function definitions + "\nfn ", + "\nconst ", + "\nlet ", + // Split along control flow statements + "\nif ", + "\nwhile ", + "\nfor ", + "\nloop ", + "\nmatch ", + "\nconst ", + // Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ]; + } else if (language === "scala") { + return [ + // Split along class definitions + "\nclass ", + "\nobject ", + // Split along method definitions + "\ndef ", + "\nval ", + "\nvar ", + // Split along control flow statements + "\nif ", + "\nfor ", + "\nwhile ", + "\nmatch ", + "\ncase ", + // Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ]; + } else if (language === "swift") { + return [ + // Split along function definitions + "\nfunc ", + // Split along class definitions + "\nclass ", + "\nstruct ", + "\nenum ", + // Split along control flow statements + "\nif ", + "\nfor ", + "\nwhile ", + "\ndo ", + "\nswitch ", + "\ncase ", + // Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ]; + } else if (language === "markdown") { + return [ + // First, try to split along Markdown headings (starting with level 2) + "\n## ", + "\n### ", + "\n#### ", + "\n##### ", + "\n###### ", + // Note the alternative syntax for headings (below) is not handled here + // Heading level 2 + // --------------- + // End of code block + "```\n\n", + // Horizontal lines + "\n\n***\n\n", + "\n\n---\n\n", + "\n\n___\n\n", + // Note that this splitter doesn't handle horizontal lines defined + // by *three or more* of ***, ---, or ___, but this is not handled + 
"\n\n", + "\n", + " ", + "", + ]; + } else if (language === "latex") { + return [ + // First, try to split along Latex sections + "\n\\chapter{", + "\n\\section{", + "\n\\subsection{", + "\n\\subsubsection{", + + // Now split by environments + "\n\\begin{enumerate}", + "\n\\begin{itemize}", + "\n\\begin{description}", + "\n\\begin{list}", + "\n\\begin{quote}", + "\n\\begin{quotation}", + "\n\\begin{verse}", + "\n\\begin{verbatim}", + + // Now split by math environments + "\n\\begin{align}", + "$$", + "$", + + // Now split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ]; + } else if (language === "html") { + return [ + // First, try to split along HTML tags + "", + "
", + "

", + "
", + "

  • ", + "

    ", + "

    ", + "

    ", + "

    ", + "

    ", + "
    ", + "", + "", + "", + "
    ", + "", + "
      ", + "
        ", + "
        ", + "