/**
 * Utility helpers. Only the member introduced by this patch
 * (`isContentPage`, plus the constants it relies on) is shown here.
 */
export class GBUtil {
  /**
   * Headings that mark a page as front/back matter rather than body text.
   * They are matched against the FIRST line of a page only, so a body page
   * that merely mentions "index" or "references" later on is not rejected.
   * Patterns without a trailing `$` are prefix matches (e.g. "Appendix B").
   */
  private static readonly NON_CONTENT_HEADINGS: readonly RegExp[] = [
    /^index$/i,
    /^contents$/i,
    /^table of contents$/i,
    /^appendix/i,
    /^glossary$/i,
    /^bibliography$/i,
    /^references$/i,
    /^acknowledgments?$/i,
    /^copyright/i,
    /^about the author/i
  ];

  /** A page must contain MORE than this many whitespace-separated words. */
  private static readonly MIN_CONTENT_WORDS = 10;

  /**
   * Heuristically decides whether a chunk of extracted document text is real
   * body content, as opposed to a blank page, a page number, a dot-leader
   * (table-of-contents style) page, or a front/back-matter section.
   *
   * A page is rejected when any of the following holds:
   *  - it is not a string, or is empty/whitespace-only;
   *  - ignoring whitespace, it contains a run of 10 or more dots;
   *  - ignoring whitespace, it consists solely of digits;
   *  - it has 10 or fewer whitespace-separated words;
   *  - its first line matches one of the non-content headings.
   *
   * @param text Raw text of one page/chunk.
   * @returns `true` when the text looks like body content, `false` otherwise.
   */
  public static isContentPage(text: string): boolean {
    // Callers pass loader output; tolerate a missing value instead of throwing.
    if (typeof text !== 'string') {
      return false;
    }

    const trimmed = text.trim();
    if (trimmed.length === 0) {
      return false;
    }

    // Whitespace is removed first so spaced leaders (". . . . .") and
    // digit groups split across lines are still recognised.
    const compact = trimmed.replace(/\s+/g, '');
    const hasDotLeader = /\.{10,}/.test(compact);
    const isDigitsOnly = /^\d+$/.test(compact);
    if (hasDotLeader || isDigitsOnly) {
      return false;
    }

    const words = trimmed.match(/\S+/g);
    if (words === null || words.length <= GBUtil.MIN_CONTENT_WORDS) {
      return false;
    }

    // The anchored heading patterns can only ever match a short string, so
    // they are applied to the page's first line instead of the whole page
    // (against the whole page they could never fire once the word-count
    // check above has passed).
    const lineBreak = trimmed.search(/[\r\n]/);
    const heading = (lineBreak === -1 ? trimmed : trimmed.slice(0, lineBreak)).trim();

    return !GBUtil.NON_CONTENT_HEADINGS.some(pattern => pattern.test(heading));
  }
}