Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

♻️ Refactor SQL chunk processing to reduce memory errors #699

Merged
merged 6 commits into the base branch from the feature branch
Feb 20, 2025
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .changeset/cyan-crabs-kiss.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
"@liam-hq/db-structure": patch
"@liam-hq/erd-core": patch
---

♻️ Refactor SQL chunk processing to reduce memory errors.

Increases the likelihood of processing larger `.sql` files without encountering memory errors.
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,15 @@ import {
export const createParserTestCases = (
userTable: (override?: Partial<Table>) => DBStructure,
) => ({
normal: userTable({
columns: {
name: aColumn({
name: 'name',
type: 'varchar',
notNull: true,
}),
},
}),
'table comment': userTable({
comment: 'store our users.',
}),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -254,4 +254,20 @@ describe(processor, () => {
expect(result).toEqual({ value, errors })
})
})

describe('Long statement (exceeds 500 lines, surpassing CHUNK_SIZE)', () => {
  it('parses without errors', async () => {
    // A run of 500 blank lines inside one CREATE TABLE pushes the single
    // statement past the processor's chunk size, so the chunked reader must
    // stitch the statement back together before parsing.
    const fiveHundredNewlines = '\n'.repeat(500)
    const sql = /* sql */ `
      CREATE TABLE users (
        id BIGSERIAL PRIMARY KEY,
        name VARCHAR(255) NOT NULL
        ${fiveHundredNewlines}
      );
    `

    const result = await processor(sql)

    // The oversized statement should still yield the ordinary "users" table
    // and produce no parse errors.
    expect(result.value).toEqual(parserTestCases.normal)
    expect(result.errors).toEqual([])
  })
})
})
64 changes: 52 additions & 12 deletions frontend/packages/db-structure/src/parser/sql/postgresql/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,26 +6,66 @@ import { mergeDBStructures } from './mergeDBStructures.js'
import { parse } from './parser.js'
import { processSQLInChunks } from './processSQLInChunks.js'

export const processor: Processor = async (str: string) => {
const dbStructure: DBStructure = { tables: {}, relationships: {} }
const CHUNK_SIZE = 1000
const errors: ProcessError[] = []
/**
* Processes SQL statements and constructs a database structure.
*/
export const processor: Processor = async (sql: string) => {
const dbSchema: DBStructure = { tables: {}, relationships: {} }

// Number of lines to process in a single chunk.
// While a chunk size of around 1000 might work, running it on db/structure.sql
// from https://gitlab.com/gitlab-org/gitlab-foss resulted in a memory error.
// Keep this in mind when considering any adjustments.
const CHUNK_SIZE = 500

const parseErrors: ProcessError[] = []

const errors = await processSQLInChunks(sql, CHUNK_SIZE, async (chunk) => {
let readOffset: number | null = null
let errorOffset: number | null = null
const errors: ProcessError[] = []

await processSQLInChunks(str, CHUNK_SIZE, async (chunk) => {
const { parse_tree, error: parseError } = await parse(chunk)

if (parse_tree.stmts.length > 0 && parseError !== null) {
throw new Error('UnexpectedCondition')
}

if (parseError !== null) {
errors.push(new UnexpectedTokenWarningError(parseError.message))
errorOffset = parseError.cursorpos
return [errorOffset, readOffset, errors]
}

const { value: converted, errors: convertErrors } = convertToDBStructure(
parse_tree.stmts,
)
if (convertErrors !== null) {
errors.push(...convertErrors)
let isLastStatementComplete = true
const statementCount = parse_tree.stmts.length

if (statementCount > 0) {
const lastStmt = parse_tree.stmts[statementCount - 1]
if (lastStmt?.stmt_len === undefined) {
isLastStatementComplete = false
if (lastStmt?.stmt_location === undefined) {
throw new Error('UnexpectedCondition')
}
readOffset = lastStmt?.stmt_location - 1
}
}

mergeDBStructures(dbStructure, converted)
const { value: convertedSchema, errors: conversionErrors } =
convertToDBStructure(
isLastStatementComplete
? parse_tree.stmts
: parse_tree.stmts.slice(0, -1),
)

if (conversionErrors !== null) {
parseErrors.push(...conversionErrors)
}

mergeDBStructures(dbSchema, convertedSchema)

return [errorOffset, readOffset, errors]
})

return { value: dbStructure, errors }
return { value: dbSchema, errors: parseErrors.concat(errors) }
}
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import { describe, expect, it, vi } from 'vitest'
import { UnexpectedTokenWarningError } from '../../errors.js'
import { processSQLInChunks } from './processSQLInChunks.js'

describe(processSQLInChunks, () => {
describe('processSQLInChunks', () => {
it('should split input by newline and process each chunk', async () => {
const input = 'SELECT 1;\nSELECT 2;\nSELECT 3;'
const chunkSize = 2
const callback = vi.fn()
const callback = vi.fn().mockResolvedValue([null, null, []])

await processSQLInChunks(input, chunkSize, callback)

Expand All @@ -18,7 +19,7 @@ describe(processSQLInChunks, () => {
it('should handle chunks correctly to avoid invalid SQL syntax', async () => {
const input = 'SELECT 1;\nSELECT 2;\nSELECT 3;\nSELECT 4;'
const chunkSize = 3
const callback = vi.fn()
const callback = vi.fn().mockResolvedValue([null, null, []])

await processSQLInChunks(input, chunkSize, callback)

Expand All @@ -30,7 +31,7 @@ describe(processSQLInChunks, () => {
it('should handle input with no newlines correctly', async () => {
const input = 'SELECT 1; SELECT 2; SELECT 3;'
const chunkSize = 1
const callback = vi.fn()
const callback = vi.fn().mockResolvedValue([null, null, []])

await processSQLInChunks(input, chunkSize, callback)

Expand All @@ -41,24 +42,126 @@ describe(processSQLInChunks, () => {
it('should handle empty input correctly', async () => {
const input = ''
const chunkSize = 1
const callback = vi.fn()
const callback = vi.fn().mockResolvedValue([null, null, []])

await processSQLInChunks(input, chunkSize, callback)

expect(callback).not.toHaveBeenCalled()
})
})

describe('processSQLInChunks, partially consuming chunk lines', () => {
// Helper function to create a mock callback that asserts the query and returns the expected value.
type SQLCallbackResult = [
errorOffset: number | null,
readOffset: number | null,
errors: UnexpectedTokenWarningError[],
]
const createMockCallback = (
expectedCalls: [query: string, result: SQLCallbackResult][],
) => {
const callback = vi.fn().mockImplementation(async (query) => {
const callIndex = callback.mock.calls.length - 1
const expectedCall = expectedCalls[callIndex]

if (!expectedCall) {
throw new Error(
`Unexpected callback call at index ${callIndex} with query:\n${query}`,
)
}

const [expectedQuery, returnValue] = expectedCall
expect(query).toBe(expectedQuery)
return returnValue
})

it('should correctly process SQL chunks while ignoring semicolons in comment lines starting with "--"', async () => {
const input =
'SELECT 1;\nSELECT 2;\n-- This is a comment line; additional text here should be ignored.\nSELECT 3;\nSELECT 4;'
return callback
}

it('should correctly handle readOffset by partially consuming chunk lines', async () => {
const input = `SELECT 1;
SELECT 2;
SELECT 3, -- partial statement
4;`
const chunkSize = 3
const callback = vi.fn()
const expectedCalls: [query: string, result: SQLCallbackResult][] = [
// [query, [errorOffset, readOffset, errors]]

await processSQLInChunks(input, chunkSize, callback)
// 1st: Processes "SELECT 1;", "SELECT 2;", and "SELECT 3,".
// Since "SELECT 3," is an incomplete statement, the parser returns readOffset at the end of the second statement (position 19).
[
'SELECT 1;\nSELECT 2;\nSELECT 3, -- partial statement',
[null, 19, []],
],
// 2nd: Processes the remaining part starting from "SELECT 3".
['SELECT 3, -- partial statement\n4;', [null, null, []]],
]

expect(callback).toHaveBeenCalledTimes(2)
expect(callback).toHaveBeenCalledWith('SELECT 1;\nSELECT 2;\nSELECT 3;')
expect(callback).toHaveBeenCalledWith('SELECT 4;')
const callback = createMockCallback(expectedCalls)
const errors = await processSQLInChunks(input, chunkSize, callback)

expect(errors).toEqual([])
expect(callback).toHaveBeenCalledTimes(expectedCalls.length)
})

it('should correctly handle errorOffset by partially consuming chunk lines', async () => {
// Test case where a statement (a 6-line CREATE TABLE statement) exceeds the chunk size (3 lines).
//
// NOTE:
// This is a real-world scenario inspired by GitLab's `db/structure.sql`.
// While we set CHUNK_SIZE to 500 in practice, GitLab's `db/structure.sql` contains some
// `CREATE TABLE` statements that span over 500 lines.
// Reference:
// https://gitlab.com/gitlab-org/gitlab-foss/-/blob/b94eb57589bd639078b783c296642e68dfb91780/db/structure.sql#L6897-7506
//
// For simplicity, this test uses CHUNK_SIZE = 3 and a CREATE TABLE statement with only 6 lines.
const input = `SELECT 1;
SELECT 2;
CREATE TABLE t1 (
c1 int,
c2 int,
c3 int,
c4 int
);`
const chunkSize = 3
const error = new UnexpectedTokenWarningError('')
const expectedCalls: [query: string, result: SQLCallbackResult][] = [
// [query, [errorOffset, readOffset, errors]]

// 1st: Reads the first three lines. "CREATE TABLE t1 (" is incomplete, so the parser returns an errorOffset at position 38.
['SELECT 1;\nSELECT 2;\nCREATE TABLE t1 (', [38, null, [error]]],
// 2nd: Retries processing only the first two lines.
['SELECT 1;\nSELECT 2;', [null, null, []]],
// 3rd: Attempts to process lines 3-5, hitting the same issue.
['CREATE TABLE t1 (\n c1 int,\n c2 int,', [38, null, [error]]],
// 4th: Shrinking mode: Attempts with lines 3-4.
['CREATE TABLE t1 (\n c1 int,', [28, null, [error]]],
// 5th: Shrinking mode: Tries with line 3 only.
['CREATE TABLE t1 (', [18, null, [error]]],
// 6th: Reverts to previous attempt (lines 3-5). Note: This is redundant and could be optimized.
['CREATE TABLE t1 (\n c1 int,\n c2 int,', [38, null, [error]]],
// 7th: Expanding mode: Tries lines 3-6.
[
'CREATE TABLE t1 (\n c1 int,\n c2 int,\n c3 int,',
[48, null, [error]],
],
// 8th: Expanding mode: Tries lines 3-7.
[
'CREATE TABLE t1 (\n c1 int,\n c2 int,\n c3 int,\n c4 int',
[57, null, [error]],
],
// 9th: Expanding mode: Tries lines 3-8 (complete statement).
[
'CREATE TABLE t1 (\n c1 int,\n c2 int,\n c3 int,\n c4 int\n);',
[null, null, []],
],
]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Test case is easy to understand 👍🏻


const callback = createMockCallback(expectedCalls)
const errors = await processSQLInChunks(input, chunkSize, callback)

expect(errors).toEqual([])
expect(callback).toHaveBeenCalledTimes(expectedCalls.length)
})
})
})
Loading
Loading