Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

♻️ Refactor SQL chunk processing to reduce memory errors #699

Merged
merged 6 commits into the base branch from the feature branch
Feb 20, 2025
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .changeset/cyan-crabs-kiss.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
"@liam-hq/db-structure": patch
"@liam-hq/erd-core": patch
---

♻️ Refactor SQL chunk processing to reduce memory errors.

Increases the likelihood of processing larger `.sql` files without encountering memory errors.
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,15 @@ import {
export const createParserTestCases = (
userTable: (override?: Partial<Table>) => DBStructure,
) => ({
normal: userTable({
columns: {
name: aColumn({
name: 'name',
type: 'varchar',
notNull: true,
}),
},
}),
'table comment': userTable({
comment: 'store our users.',
}),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -254,4 +254,20 @@ describe(processor, () => {
expect(result).toEqual({ value, errors })
})
})

describe('Long statement (exceeds 500 lines, surpassing CHUNK_SIZE)', () => {
  it('parses without errors', async () => {
    // A run of 500 blank lines inside one CREATE TABLE pushes the single
    // statement past the processor's chunk size, so the chunked reader must
    // stitch the statement back together before parsing.
    const fiveHundredNewlines = '\n'.repeat(500)
    const sql = /* sql */ `
      CREATE TABLE users (
        id BIGSERIAL PRIMARY KEY,
        name VARCHAR(255) NOT NULL
        ${fiveHundredNewlines}
      );
    `

    const result = await processor(sql)

    // The oversized statement should still yield the ordinary "users" table
    // and produce no parse errors.
    expect(result.value).toEqual(parserTestCases.normal)
    expect(result.errors).toEqual([])
  })
})
})
64 changes: 52 additions & 12 deletions frontend/packages/db-structure/src/parser/sql/postgresql/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,26 +6,66 @@ import { mergeDBStructures } from './mergeDBStructures.js'
import { parse } from './parser.js'
import { processSQLInChunks } from './processSQLInChunks.js'

export const processor: Processor = async (str: string) => {
const dbStructure: DBStructure = { tables: {}, relationships: {} }
const CHUNK_SIZE = 1000
const errors: ProcessError[] = []
/**
* Processes SQL statements and constructs a database structure.
*/
export const processor: Processor = async (sql: string) => {
const dbSchema: DBStructure = { tables: {}, relationships: {} }

// Number of lines to process in a single chunk.
// While a chunk size of around 1000 might work, running it on db/structure.sql
// from https://gitlab.com/gitlab-org/gitlab-foss resulted in a memory error.
// Keep this in mind when considering any adjustments.
const CHUNK_SIZE = 500

const parseErrors: ProcessError[] = []

const errors = await processSQLInChunks(sql, CHUNK_SIZE, async (chunk) => {
let readOffset: number | null = null
let errorOffset: number | null = null
const errors: ProcessError[] = []

await processSQLInChunks(str, CHUNK_SIZE, async (chunk) => {
const { parse_tree, error: parseError } = await parse(chunk)

if (parse_tree.stmts.length > 0 && parseError !== null) {
throw new Error('UnexpectedCondition')
}

if (parseError !== null) {
errors.push(new UnexpectedTokenWarningError(parseError.message))
errorOffset = parseError.cursorpos
return [errorOffset, readOffset, errors]
}

const { value: converted, errors: convertErrors } = convertToDBStructure(
parse_tree.stmts,
)
if (convertErrors !== null) {
errors.push(...convertErrors)
let isLastStatementComplete = true
const statementCount = parse_tree.stmts.length

if (statementCount > 0) {
const lastStmt = parse_tree.stmts[statementCount - 1]
if (lastStmt?.stmt_len === undefined) {
isLastStatementComplete = false
if (lastStmt?.stmt_location === undefined) {
throw new Error('UnexpectedCondition')
}
readOffset = lastStmt?.stmt_location - 1
}
}

mergeDBStructures(dbStructure, converted)
const { value: convertedSchema, errors: conversionErrors } =
convertToDBStructure(
isLastStatementComplete
? parse_tree.stmts
: parse_tree.stmts.slice(0, -1),
)

if (conversionErrors !== null) {
parseErrors.push(...conversionErrors)
}

mergeDBStructures(dbSchema, convertedSchema)

return [errorOffset, readOffset, errors]
})

return { value: dbStructure, errors }
return { value: dbSchema, errors: parseErrors.concat(errors) }
}
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import { describe, expect, it, vi } from 'vitest'
import { UnexpectedTokenWarningError } from '../../errors.js'
import { processSQLInChunks } from './processSQLInChunks.js'

describe(processSQLInChunks, () => {
describe('processSQLInChunks', () => {
it('should split input by newline and process each chunk', async () => {
const input = 'SELECT 1;\nSELECT 2;\nSELECT 3;'
const chunkSize = 2
const callback = vi.fn()
const callback = vi.fn().mockResolvedValue([null, null, []])

await processSQLInChunks(input, chunkSize, callback)

Expand All @@ -18,7 +19,7 @@ describe(processSQLInChunks, () => {
it('should handle chunks correctly to avoid invalid SQL syntax', async () => {
const input = 'SELECT 1;\nSELECT 2;\nSELECT 3;\nSELECT 4;'
const chunkSize = 3
const callback = vi.fn()
const callback = vi.fn().mockResolvedValue([null, null, []])

await processSQLInChunks(input, chunkSize, callback)

Expand All @@ -30,7 +31,7 @@ describe(processSQLInChunks, () => {
it('should handle input with no newlines correctly', async () => {
const input = 'SELECT 1; SELECT 2; SELECT 3;'
const chunkSize = 1
const callback = vi.fn()
const callback = vi.fn().mockResolvedValue([null, null, []])

await processSQLInChunks(input, chunkSize, callback)

Expand All @@ -41,24 +42,126 @@ describe(processSQLInChunks, () => {
it('should handle empty input correctly', async () => {
const input = ''
const chunkSize = 1
const callback = vi.fn()
const callback = vi.fn().mockResolvedValue([null, null, []])

await processSQLInChunks(input, chunkSize, callback)

expect(callback).not.toHaveBeenCalled()
})
})

describe('processSQLInChunks, partially consuming chunk lines', () => {
// Helper function to create a mock callback that asserts the query and returns the expected value.
type SQLCallbackResult = [
errorOffset: number | null,
readOffset: number | null,
errors: UnexpectedTokenWarningError[],
]
const createMockCallback = (
expectedCalls: [query: string, result: SQLCallbackResult][],
) => {
const callback = vi.fn().mockImplementation(async (query) => {
const callIndex = callback.mock.calls.length - 1
const expectedCall = expectedCalls[callIndex]

if (!expectedCall) {
throw new Error(
`Unexpected callback call at index ${callIndex} with query:\n${query}`,
)
}

const [expectedQuery, returnValue] = expectedCall
expect(query).toBe(expectedQuery)
return returnValue
})

it('should correctly process SQL chunks while ignoring semicolons in comment lines starting with "--"', async () => {
const input =
'SELECT 1;\nSELECT 2;\n-- This is a comment line; additional text here should be ignored.\nSELECT 3;\nSELECT 4;'
return callback
}

it('should correctly handle readOffset by partially consuming chunk lines', async () => {
const input = `SELECT 1;
SELECT 2;
SELECT 3, -- partial statement
4;`
const chunkSize = 3
const callback = vi.fn()
const expectedCalls: [query: string, result: SQLCallbackResult][] = [
// [query, [errorOffset, readOffset, errors]]

await processSQLInChunks(input, chunkSize, callback)
// 1st: Processes "SELECT 1;", "SELECT 2;", and "SELECT 3,".
// Since "SELECT 3," is an incomplete statement, the parser returns readOffset at the end of the second statement (position 19).
[
'SELECT 1;\nSELECT 2;\nSELECT 3, -- partial statement',
[null, 19, []],
],
// 2nd: Processes the remaining part starting from "SELECT 3".
['SELECT 3, -- partial statement\n4;', [null, null, []]],
]

expect(callback).toHaveBeenCalledTimes(2)
expect(callback).toHaveBeenCalledWith('SELECT 1;\nSELECT 2;\nSELECT 3;')
expect(callback).toHaveBeenCalledWith('SELECT 4;')
const callback = createMockCallback(expectedCalls)
const errors = await processSQLInChunks(input, chunkSize, callback)

expect(errors).toEqual([])
expect(callback).toHaveBeenCalledTimes(expectedCalls.length)
})

it('should correctly handle errorOffset by partially consuming chunk lines', async () => {
// Test case where a statement (a 6-line CREATE TABLE statement) exceeds the chunk size (3 lines).
//
// NOTE:
// This is a real-world scenario inspired by GitLab's `db/structure.sql`.
// While we set CHUNK_SIZE to 500 in practice, GitLab's `db/structure.sql` contains some
// `CREATE TABLE` statements that span over 500 lines.
// Reference:
// https://gitlab.com/gitlab-org/gitlab-foss/-/blob/b94eb57589bd639078b783c296642e68dfb91780/db/structure.sql#L6897-7506
//
// For simplicity, this test uses CHUNK_SIZE = 3 and a CREATE TABLE statement with only 6 lines.
const input = `SELECT 1;
SELECT 2;
CREATE TABLE t1 (
c1 int,
c2 int,
c3 int,
c4 int
);`
const chunkSize = 3
const error = new UnexpectedTokenWarningError('')
const expectedCalls: [query: string, result: SQLCallbackResult][] = [
// [query, [errorOffset, readOffset, errors]]

// 1st: Reads the first three lines. "CREATE TABLE t1 (" is incomplete, so the parser returns an errorOffset at position 38.
['SELECT 1;\nSELECT 2;\nCREATE TABLE t1 (', [38, null, [error]]],
// 2nd: Retries processing only the first two lines.
['SELECT 1;\nSELECT 2;', [null, null, []]],
// 3rd: Attempts to process lines 3-5, hitting the same issue.
['CREATE TABLE t1 (\n c1 int,\n c2 int,', [38, null, [error]]],
// 4th: Shrinking mode: Attempts with lines 3-4.
['CREATE TABLE t1 (\n c1 int,', [28, null, [error]]],
// 5th: Shrinking mode: Tries with line 3 only.
['CREATE TABLE t1 (', [18, null, [error]]],
// 6th: Reverts to previous attempt (lines 3-5). Note: This is redundant and could be optimized.
['CREATE TABLE t1 (\n c1 int,\n c2 int,', [38, null, [error]]],
// 7th: Expanding mode: Tries lines 3-6.
[
'CREATE TABLE t1 (\n c1 int,\n c2 int,\n c3 int,',
[48, null, [error]],
],
// 8th: Expanding mode: Tries lines 3-7.
[
'CREATE TABLE t1 (\n c1 int,\n c2 int,\n c3 int,\n c4 int',
[57, null, [error]],
],
// 9th: Expanding mode: Tries lines 3-8 (complete statement).
[
'CREATE TABLE t1 (\n c1 int,\n c2 int,\n c3 int,\n c4 int\n);',
[null, null, []],
],
]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Test case is easy to understand 👍🏻


const callback = createMockCallback(expectedCalls)
const errors = await processSQLInChunks(input, chunkSize, callback)

expect(errors).toEqual([])
expect(callback).toHaveBeenCalledTimes(expectedCalls.length)
})
})
})
Loading
Loading