Skip to content

Commit

Permalink
Preserve separators in recursive character text splitters (#1472)
Browse files Browse the repository at this point in the history
* Improve text splitting by preserving separators

* Fix formatting
  • Loading branch information
jacoblee93 authored May 31, 2023
1 parent 93636bd commit 64cd89e
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 41 deletions.
46 changes: 25 additions & 21 deletions langchain/src/tests/text_splitter.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -163,18 +163,19 @@ Bye!\n\n-H.`;
"Harrison.",
"How? Are?",
"You?",
"Okay then f",
"Okay then",
"f f f f.",
"This is a",
"a weird",
"weird",
"text to",
"write, but",
"gotta test",
"the",
"splittingg",
"ggg",
"write,",
"but gotta",
"test the",
"splitting",
"gggg",
"some how.",
"Bye!\n\n-H.",
"Bye!",
"-H.",
];
expect(output).toEqual(expectedOutput);
});
Expand Down Expand Up @@ -211,39 +212,42 @@ test("Test markdown text splitter.", async () => {
chunkOverlap: 0,
});
const output = await splitter.splitText(text);

const expectedOutput = [
"# 🦜️🔗 LangChain\n\n⚡ Building applications with LLMs through composability ⚡",
"Quick Install\n\n```bash\n# Hopefully this code block isn't split\npip install langchain",
"## Quick Install\n\n```bash\n# Hopefully this code block isn't split\npip install langchain",
"```",
"As an open source project in a rapidly developing field, we are extremely open to contributions.",
];
expect(output).toEqual(expectedOutput);
});

test("Test latex text splitter.", async () => {
const text = `\\begin{document}
\\title{🦜️🔗 LangChain}
⚡ Building applications with LLMs through composability ⚡
\\title{🦜️🔗 LangChain}
⚡ Building applications with LLMs through composability ⚡
\\section{Quick Install}
\\section{Quick Install}
\\begin{verbatim}
Hopefully this code block isn't split
yarn add langchain
\\end{verbatim}
\\begin{verbatim}
Hopefully this code block isn't split
yarn add langchain
\\end{verbatim}
As an open source project in a rapidly developing field, we are extremely open to contributions.
As an open source project in a rapidly developing field, we are extremely open to contributions.
\\end{document}`;
\\end{document}`;
const splitter = new LatexTextSplitter({
chunkSize: 100,
chunkOverlap: 0,
});
const output = await splitter.splitText(text);

const expectedOutput = [
"\\begin{document}\n \\title{🦜️🔗 LangChain}\n ⚡ Building applications with LLMs through composability ⚡",
"\\section{Quick Install}\n\n \\begin{verbatim}\n Hopefully this code block isn't split\n yarn add langchain",
"\\end{verbatim}\n\n As an open source project in a rapidly developing field, we are extremely open to contributions.",
"\\begin{document}\n\\title{🦜️🔗 LangChain}\n⚡ Building applications with LLMs through composability ⚡",
"\\section{Quick Install}",
"\\begin{verbatim}\nHopefully this code block isn't split\nyarn add langchain\n\\end{verbatim}",
"As an open source project in a rapidly developing field, we are extremely open to contributions.",
"\\end{document}",
];
expect(output).toEqual(expectedOutput);
Expand Down
67 changes: 47 additions & 20 deletions langchain/src/text_splitter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import { getEncoding } from "./util/tiktoken.js";
/**
 * Configuration shared by the text splitters in this module.
 */
export interface TextSplitterParams {
/**
 * Chunk size limit. The `TextSplitter` constructor requires it to be
 * strictly greater than `chunkOverlap`; the recursive splitter compares
 * each piece's string `length` against it.
 */
chunkSize: number;
/**
 * Must be strictly smaller than `chunkSize`, otherwise construction throws.
 * NOTE(review): presumably the amount of text repeated between consecutive
 * chunks; confirm against `mergeSplits`, which is not visible here.
 */
chunkOverlap: number;
/**
 * When true, `splitOnSeparator` splits on a zero-width lookahead so the
 * separator text stays in the resulting pieces instead of being removed.
 */
keepSeparator: boolean;
}

export type TextSplitterChunkHeaderOptions = {
Expand All @@ -18,16 +19,37 @@ export abstract class TextSplitter implements TextSplitterParams {

chunkOverlap = 200;

keepSeparator = false;

/**
 * Layers caller-supplied settings over the field defaults, then checks
 * that the overlap is strictly smaller than the chunk size.
 *
 * @param fields Optional subset of splitter settings. Entries that are
 *   missing or nullish leave the corresponding field at its current value.
 * @throws Error if the resulting `chunkOverlap` is greater than or equal
 *   to `chunkSize`.
 */
constructor(fields?: Partial<TextSplitterParams>) {
  const overrides: Partial<TextSplitterParams> = fields ?? {};

  this.chunkSize = overrides.chunkSize ?? this.chunkSize;
  this.chunkOverlap = overrides.chunkOverlap ?? this.chunkOverlap;
  this.keepSeparator = overrides.keepSeparator ?? this.keepSeparator;

  // Same condition as `chunkOverlap >= chunkSize`, written from the other side.
  if (this.chunkSize <= this.chunkOverlap) {
    throw new Error("Cannot have chunkOverlap >= chunkSize");
  }
}

/**
 * Splits `text` into an array of strings. Each concrete splitter supplies
 * its own splitting strategy.
 *
 * @param text The input text to split.
 * @returns A promise resolving to the resulting pieces.
 */
abstract splitText(text: string): Promise<string[]>;

/**
 * Cuts `text` apart at every occurrence of `separator` and drops any
 * empty pieces from the result.
 *
 * - Empty `separator`: the text is split with `split("")`.
 * - `keepSeparator` off: the separator is removed from the output.
 * - `keepSeparator` on: the split happens at the position just before each
 *   separator, so the separator text remains at the start of the piece
 *   that follows it.
 *
 * @param text The text to split.
 * @param separator Literal separator string; may be empty.
 * @returns The non-empty pieces, in their original order.
 */
protected splitOnSeparator(text: string, separator: string): string[] {
  const withoutEmpties = (pieces: string[]): string[] =>
    pieces.filter((piece) => piece !== "");

  if (!separator) {
    return withoutEmpties(text.split(""));
  }

  if (!this.keepSeparator) {
    return withoutEmpties(text.split(separator));
  }

  // Escape regex metacharacters so the separator is matched literally.
  const literalPattern = separator.replace(/[/\-\\^$*+?.()|[\]{}]/g, "\\$&");
  // A lookahead consumes no characters, so nothing is removed by the split.
  const boundary = new RegExp(`(?=${literalPattern})`);
  return withoutEmpties(text.split(boundary));
}

async createDocuments(
texts: string[],
// eslint-disable-next-line @typescript-eslint/no-explicit-any
Expand Down Expand Up @@ -174,13 +196,8 @@ export class CharacterTextSplitter

async splitText(text: string): Promise<string[]> {
// First we naively split the large input into a bunch of smaller ones.
let splits: string[];
if (this.separator) {
splits = text.split(this.separator);
} else {
splits = text.split("");
}
return this.mergeSplits(splits, this.separator);
const splits = this.splitOnSeparator(text, this.separator);
return this.mergeSplits(splits, this.keepSeparator ? "" : this.separator);
}
}

Expand All @@ -198,53 +215,61 @@ export class RecursiveCharacterTextSplitter
/**
 * Builds a recursive splitter. Shared settings are handled by the parent
 * constructor; this one then applies the separator list and the
 * separator-retention flag.
 *
 * @param fields Optional settings. `separators` falls back to the current
 *   field value when absent; `keepSeparator` falls back to `true` here,
 *   overriding whatever the parent constructor assigned.
 */
constructor(fields?: Partial<RecursiveCharacterTextSplitterParams>) {
  super(fields);

  const overrides: Partial<RecursiveCharacterTextSplitterParams> =
    fields ?? {};
  this.separators = overrides.separators ?? this.separators;
  this.keepSeparator = overrides.keepSeparator ?? true;
}

async splitText(text: string): Promise<string[]> {
private async _splitText(text: string, separators: string[]) {
const finalChunks: string[] = [];

// Get appropriate separator to use
let separator: string = this.separators[this.separators.length - 1];
for (const s of this.separators) {
let separator: string = separators[separators.length - 1];
let newSeparators;
for (let i = 0; i < separators.length; i += 1) {
const s = separators[i];
if (s === "") {
separator = s;
break;
}
if (text.includes(s)) {
separator = s;
newSeparators = separators.slice(i + 1);
break;
}
}

// Now that we have the separator, split the text
let splits: string[];
if (separator) {
splits = text.split(separator);
} else {
splits = text.split("");
}
const splits = this.splitOnSeparator(text, separator);

// Now go merging things, recursively splitting longer texts.
let goodSplits: string[] = [];
const _separator = this.keepSeparator ? "" : separator;
for (const s of splits) {
if (s.length < this.chunkSize) {
goodSplits.push(s);
} else {
if (goodSplits.length) {
const mergedText = this.mergeSplits(goodSplits, separator);
const mergedText = this.mergeSplits(goodSplits, _separator);
finalChunks.push(...mergedText);
goodSplits = [];
}
const otherInfo = await this.splitText(s);
finalChunks.push(...otherInfo);
if (!newSeparators) {
finalChunks.push(s);
} else {
const otherInfo = await this._splitText(s, newSeparators);
finalChunks.push(...otherInfo);
}
}
}
if (goodSplits.length) {
const mergedText = this.mergeSplits(goodSplits, separator);
const mergedText = this.mergeSplits(goodSplits, _separator);
finalChunks.push(...mergedText);
}
return finalChunks;
}

/**
 * Public entry point: splits `text` by delegating to `_splitText` with the
 * full configured `separators` list.
 *
 * @param text The input text to split.
 * @returns A promise resolving to the chunks produced by `_splitText`.
 */
async splitText(text: string): Promise<string[]> {
return this._splitText(text, this.separators);
}
}

export interface TokenTextSplitterParams extends TextSplitterParams {
Expand Down Expand Up @@ -369,6 +394,8 @@ export class LatexTextSplitter
"$",

// Now split by the normal type of lines
"\n\n",
"\n",
" ",
"",
];
Expand Down

1 comment on commit 64cd89e

@vercel
Copy link

@vercel vercel bot commented on 64cd89e May 31, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.