Preserve separators in recursive character text splitters (#1472)

* Improve text splitting by preserving separators
* Fix formatting
langchain-ai · May 31, 2023 · 64cd89e · 64cd89e · vercel · May 31, 2023
1 parent 93636bd
commit 64cd89e
Show file tree

Hide file tree

Showing 2 changed files with 72 additions and 41 deletions.
diff --git a/langchain/src/tests/text_splitter.test.ts b/langchain/src/tests/text_splitter.test.ts
@@ -163,18 +163,19 @@ Bye!\n\n-H.`;
     "Harrison.",
     "How? Are?",
     "You?",
-    "Okay then f",
+    "Okay then",
     "f f f f.",
     "This is a",
-    "a weird",
+    "weird",
     "text to",
-    "write, but",
-    "gotta test",
-    "the",
-    "splittingg",
-    "ggg",
+    "write,",
+    "but gotta",
+    "test the",
+    "splitting",
+    "gggg",
     "some how.",
-    "Bye!\n\n-H.",
+    "Bye!",
+    "-H.",
   ];
   expect(output).toEqual(expectedOutput);
 });
@@ -211,39 +212,42 @@ test("Test markdown text splitter.", async () => {
     chunkOverlap: 0,
   });
   const output = await splitter.splitText(text);
+
   const expectedOutput = [
     "# 🦜️🔗 LangChain\n\n⚡ Building applications with LLMs through composability ⚡",
-    "Quick Install\n\n```bash\n# Hopefully this code block isn't split\npip install langchain",
+    "## Quick Install\n\n```bash\n# Hopefully this code block isn't split\npip install langchain",
+    "```",
     "As an open source project in a rapidly developing field, we are extremely open to contributions.",
   ];
   expect(output).toEqual(expectedOutput);
 });
 
 test("Test latex text splitter.", async () => {
   const text = `\\begin{document}
-  \\title{🦜️🔗 LangChain}
-  ⚡ Building applications with LLMs through composability ⚡
+\\title{🦜️🔗 LangChain}
+⚡ Building applications with LLMs through composability ⚡
 
-  \\section{Quick Install}
+\\section{Quick Install}
 
-  \\begin{verbatim}
-  Hopefully this code block isn't split
-  yarn add langchain
-  \\end{verbatim}
+\\begin{verbatim}
+Hopefully this code block isn't split
+yarn add langchain
+\\end{verbatim}
 
-  As an open source project in a rapidly developing field, we are extremely open to contributions.
+As an open source project in a rapidly developing field, we are extremely open to contributions.
 
-  \\end{document}`;
+\\end{document}`;
   const splitter = new LatexTextSplitter({
     chunkSize: 100,
     chunkOverlap: 0,
   });
   const output = await splitter.splitText(text);
 
   const expectedOutput = [
-    "\\begin{document}\n  \\title{🦜️🔗 LangChain}\n  ⚡ Building applications with LLMs through composability ⚡",
-    "\\section{Quick Install}\n\n  \\begin{verbatim}\n  Hopefully this code block isn't split\n  yarn add langchain",
-    "\\end{verbatim}\n\n  As an open source project in a rapidly developing field, we are extremely open to contributions.",
+    "\\begin{document}\n\\title{🦜️🔗 LangChain}\n⚡ Building applications with LLMs through composability ⚡",
+    "\\section{Quick Install}",
+    "\\begin{verbatim}\nHopefully this code block isn't split\nyarn add langchain\n\\end{verbatim}",
+    "As an open source project in a rapidly developing field, we are extremely open to contributions.",
     "\\end{document}",
   ];
   expect(output).toEqual(expectedOutput);

diff --git a/langchain/src/text_splitter.ts b/langchain/src/text_splitter.ts
@@ -5,6 +5,7 @@ import { getEncoding } from "./util/tiktoken.js";
 export interface TextSplitterParams {
   chunkSize: number;
   chunkOverlap: number;
+  keepSeparator: boolean;
 }
 
 export type TextSplitterChunkHeaderOptions = {
@@ -18,16 +19,37 @@ export abstract class TextSplitter implements TextSplitterParams {
 
   chunkOverlap = 200;
 
+  keepSeparator = false;
+
   constructor(fields?: Partial<TextSplitterParams>) {
     this.chunkSize = fields?.chunkSize ?? this.chunkSize;
     this.chunkOverlap = fields?.chunkOverlap ?? this.chunkOverlap;
+    this.keepSeparator = fields?.keepSeparator ?? this.keepSeparator;
     if (this.chunkOverlap >= this.chunkSize) {
       throw new Error("Cannot have chunkOverlap >= chunkSize");
     }
   }
 
   abstract splitText(text: string): Promise<string[]>;
 
+  protected splitOnSeparator(text: string, separator: string): string[] {
+    let splits;
+    if (separator) {
+      if (this.keepSeparator) {
+        const regexEscapedSeparator = separator.replace(
+          /[/\-\\^$*+?.()|[\]{}]/g,
+          "\\$&"
+        );
+        splits = text.split(new RegExp(`(?=${regexEscapedSeparator})`));
+      } else {
+        splits = text.split(separator);
+      }
+    } else {
+      splits = text.split("");
+    }
+    return splits.filter((s) => s !== "");
+  }
+
   async createDocuments(
     texts: string[],
     // eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -174,13 +196,8 @@ export class CharacterTextSplitter
 
   async splitText(text: string): Promise<string[]> {
     // First we naively split the large input into a bunch of smaller ones.
-    let splits: string[];
-    if (this.separator) {
-      splits = text.split(this.separator);
-    } else {
-      splits = text.split("");
-    }
-    return this.mergeSplits(splits, this.separator);
+    const splits = this.splitOnSeparator(text, this.separator);
+    return this.mergeSplits(splits, this.keepSeparator ? "" : this.separator);
   }
 }
 
@@ -198,53 +215,61 @@ export class RecursiveCharacterTextSplitter
   constructor(fields?: Partial<RecursiveCharacterTextSplitterParams>) {
     super(fields);
     this.separators = fields?.separators ?? this.separators;
+    this.keepSeparator = fields?.keepSeparator ?? true;
   }
 
-  async splitText(text: string): Promise<string[]> {
+  private async _splitText(text: string, separators: string[]) {
     const finalChunks: string[] = [];
 
     // Get appropriate separator to use
-    let separator: string = this.separators[this.separators.length - 1];
-    for (const s of this.separators) {
+    let separator: string = separators[separators.length - 1];
+    let newSeparators;
+    for (let i = 0; i < separators.length; i += 1) {
+      const s = separators[i];
       if (s === "") {
         separator = s;
         break;
       }
       if (text.includes(s)) {
         separator = s;
+        newSeparators = separators.slice(i + 1);
         break;
       }
     }
 
     // Now that we have the separator, split the text
-    let splits: string[];
-    if (separator) {
-      splits = text.split(separator);
-    } else {
-      splits = text.split("");
-    }
+    const splits = this.splitOnSeparator(text, separator);
 
     // Now go merging things, recursively splitting longer texts.
     let goodSplits: string[] = [];
+    const _separator = this.keepSeparator ? "" : separator;
     for (const s of splits) {
       if (s.length < this.chunkSize) {
         goodSplits.push(s);
       } else {
         if (goodSplits.length) {
-          const mergedText = this.mergeSplits(goodSplits, separator);
+          const mergedText = this.mergeSplits(goodSplits, _separator);
           finalChunks.push(...mergedText);
           goodSplits = [];
         }
-        const otherInfo = await this.splitText(s);
-        finalChunks.push(...otherInfo);
+        if (!newSeparators) {
+          finalChunks.push(s);
+        } else {
+          const otherInfo = await this._splitText(s, newSeparators);
+          finalChunks.push(...otherInfo);
+        }
       }
     }
     if (goodSplits.length) {
-      const mergedText = this.mergeSplits(goodSplits, separator);
+      const mergedText = this.mergeSplits(goodSplits, _separator);
       finalChunks.push(...mergedText);
     }
     return finalChunks;
   }
+
+  async splitText(text: string): Promise<string[]> {
+    return this._splitText(text, this.separators);
+  }
 }
 
 export interface TokenTextSplitterParams extends TextSplitterParams {
@@ -369,6 +394,8 @@ export class LatexTextSplitter
     "$",
 
     // Now split by the normal type of lines
+    "\n\n",
+    "\n",
     " ",
     "",
   ];