Skip to content

Commit

Permalink
fix(llm.gblib): Talk to data local db use fix.
Browse files Browse the repository at this point in the history
  • Loading branch information
rodrigorodriguez committed Nov 24, 2024
1 parent 5b69a12 commit 29ddb89
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 0 deletions.
7 changes: 7 additions & 0 deletions packages/llm.gblib/services/ChatServices.ts
Original file line number Diff line number Diff line change
Expand Up @@ -196,11 +196,18 @@ export class ChatServices {
const doc = uniqueDocuments[filePaths];
const metadata = doc.metadata;
const filename = path.basename(metadata.source);

if (!GBUtil.isContentPage(doc.pageContent)){
continue;
}

let page = 0;
if (metadata.source.endsWith('.pdf')) {
page = await ChatServices.findPageForText(metadata.source, doc.pageContent);
}



output = `${output}\n\n\n\nUse also the following context which is coming from Source Document: ${filename} at page: ${page ? page : 'entire document'
}
(you will fill the JSON sources collection field later),
Expand Down
43 changes: 43 additions & 0 deletions src/util.ts
Original file line number Diff line number Diff line change
Expand Up @@ -360,4 +360,47 @@ export class GBUtil {
const randomDelay = Math.floor(Math.random() * (max - min + 1) + min) * 1000;
await new Promise(resolve => setTimeout(resolve, randomDelay));
}

/**
 * Heuristically decides whether a chunk of extracted document text is real
 * content, as opposed to front/back matter or layout residue.
 *
 * The text is rejected (returns `false`) when any of the following holds:
 * - it is not a string, or is empty / whitespace-only;
 * - with all whitespace removed, it consists solely of digits;
 * - with all whitespace removed, it contains a run of ten or more dots
 *   (the dot leaders typically found in a table of contents);
 * - it has ten or fewer whitespace-separated words;
 * - its first line is a known non-content heading (index, contents,
 *   glossary, references, ...) or starts with "appendix", "copyright" or
 *   "about the author".
 *
 * @param text Raw text of the page or chunk to classify.
 * @returns `true` when the text looks like content worth using.
 */
public static isContentPage(text: string): boolean {
  // Page content may be missing at runtime despite the declared type;
  // treat that as a non-content page instead of throwing on `.trim()`.
  if (typeof text !== 'string') {
    return false;
  }

  const trimmed = text.trim();
  if (trimmed.length === 0) {
    return false;
  }

  // Whitespace is removed first so spaced leaders (". . . . .") and page
  // numbers spread over several lines are detected too.
  const compact = trimmed.replace(/\s+/g, '');
  const isNumbersPage = /^\d+$/.test(compact);
  const hasDotLeaders = /\.{10,}/.test(compact);
  if (isNumbersPage || hasDotLeaders) {
    return false;
  }

  // Require more than ten words before considering the page content.
  const wordCount = trimmed.split(/\s+/).length;
  if (wordCount <= 10) {
    return false;
  }

  // Headings that indicate non-content pages.
  const nonContentPatterns = [
    /^index$/i,
    /^contents$/i,
    /^table of contents$/i,
    /^appendix/i,
    /^glossary$/i,
    /^bibliography$/i,
    /^references$/i,
    /^acknowledgments?$/i,
    /^copyright/i,
    /^about the author/i
  ];

  // The patterns are matched against the first line only. Matching the
  // exact-match patterns (`^index$`, ...) against the whole page could never
  // succeed for a page that already passed the word-count check above.
  const [firstLine = ''] = trimmed.split(/\r\n|\r|\n/, 1);
  const heading = firstLine.trim();

  return !nonContentPatterns.some(pattern => pattern.test(heading));
}



}

0 comments on commit 29ddb89

Please sign in to comment.