From 464d594e385b121050092432529c8c07ccf584f7 Mon Sep 17 00:00:00 2001
From: Martin Fleck <mfleck@eclipsesource.com>
Date: Fri, 6 Sep 2024 11:25:02 +0200
Subject: [PATCH 1/3] Introduce tokenizing options for full and partial mode

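Add an optional tokenizing mode to the lexer's tokenize method
- Full: The lexer receives the complete text to tokenize (default)
- Partial: The lexer receives only a portion of the text to tokenize

In partial mode, the indentation-aware lexer does not auto-complete the
input with the remaining dedent tokens. The completion parser now
tokenizes its input in partial mode.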
---
 packages/langium/src/parser/indentation-aware.ts     | 12 +++++++-----
 packages/langium/src/parser/langium-parser.ts        |  2 +-
 packages/langium/src/parser/lexer.ts                 | 10 ++++++++--
 .../langium/test/parser/indentation-aware.test.ts    | 12 ++++++++++++
 4 files changed, 28 insertions(+), 8 deletions(-)
diff --git a/packages/langium/src/parser/indentation-aware.ts b/packages/langium/src/parser/indentation-aware.ts
index 531fbc76d..d4c3abf5e 100644
--- a/packages/langium/src/parser/indentation-aware.ts
+++ b/packages/langium/src/parser/indentation-aware.ts
@@ -7,11 +7,11 @@
 import type { CustomPatternMatcherFunc, TokenType, IToken, IMultiModeLexerDefinition, TokenVocabulary } from 'chevrotain';
 import type { Grammar, TerminalRule } from '../languages/generated/ast.js';
 import type { LexingReport, TokenBuilderOptions } from './token-builder.js';
-import type { LexerResult } from './lexer.js';
+import type { LexerResult, TokenizeOptions } from './lexer.js';
 import type { LangiumCoreServices } from '../services.js';
 import { createToken, createTokenInstance, Lexer } from 'chevrotain';
 import { DefaultTokenBuilder } from './token-builder.js';
-import { DefaultLexer, isTokenTypeArray } from './lexer.js';
+import { DEFAULT_TOKENIZE_OPTIONS, DefaultLexer, isTokenTypeArray } from './lexer.js';
 
 type IndentationAwareDelimiter<TokenName extends string> = [begin: TokenName, end: TokenName];
 
@@ -402,13 +402,15 @@ export class IndentationAwareLexer extends DefaultLexer {
         }
     }
 
-    override tokenize(text: string): LexerResult {
+    override tokenize(text: string, options: TokenizeOptions = DEFAULT_TOKENIZE_OPTIONS): LexerResult {
         const result = super.tokenize(text);
 
         // consuming all remaining dedents and remove them as they might not be serializable
         const report = result.report as IndentationLexingReport;
-        const remainingDedents = report.remainingDedents;
-        result.tokens.push(...remainingDedents);
+        if (options?.mode !== 'partial') {
+            // auto-complete document with remaining dedents; a missing mode counts as 'full', only 'partial' opts out
+            result.tokens.push(...report.remainingDedents);
+        }
         report.remainingDedents = [];
 
         // remove any "indent-dedent" pair with an empty body as these are typically
diff --git a/packages/langium/src/parser/langium-parser.ts b/packages/langium/src/parser/langium-parser.ts
index 44292780b..4d431e3aa 100644
--- a/packages/langium/src/parser/langium-parser.ts
+++ b/packages/langium/src/parser/langium-parser.ts
@@ -527,7 +527,7 @@ export class LangiumCompletionParser extends AbstractLangiumParser {
 
     parse(input: string): CompletionParserResult {
         this.resetState();
-        const tokens = this.lexer.tokenize(input);
+        const tokens = this.lexer.tokenize(input, { mode: 'partial' });
         this.tokens = tokens.tokens;
         this.wrapper.input = [...this.tokens];
         this.mainRule.call(this.wrapper, {});
diff --git a/packages/langium/src/parser/lexer.ts b/packages/langium/src/parser/lexer.ts
index a45109e82..bf6c0e299 100644
--- a/packages/langium/src/parser/lexer.ts
+++ b/packages/langium/src/parser/lexer.ts
@@ -25,9 +25,15 @@ export interface LexerResult {
     report?: LexingReport;
 }
 
+export interface TokenizeOptions {
+    mode: 'full' | 'partial';
+}
+
+export const DEFAULT_TOKENIZE_OPTIONS: TokenizeOptions = { mode: 'full' };
+
 export interface Lexer {
     readonly definition: TokenTypeDictionary;
-    tokenize(text: string): LexerResult;
+    tokenize(text: string, options?: TokenizeOptions): LexerResult;
 }
 
 export class DefaultLexer implements Lexer {
@@ -52,7 +58,7 @@ export class DefaultLexer implements Lexer {
         return this.tokenTypes;
     }
 
-    tokenize(text: string): LexerResult {
+    tokenize(text: string, _options: TokenizeOptions = DEFAULT_TOKENIZE_OPTIONS): LexerResult {
         const chevrotainResult = this.chevrotainLexer.tokenize(text);
         return {
             tokens: chevrotainResult.tokens,
diff --git a/packages/langium/test/parser/indentation-aware.test.ts b/packages/langium/test/parser/indentation-aware.test.ts
index 1b1f51d7a..bf9683dee 100644
--- a/packages/langium/test/parser/indentation-aware.test.ts
+++ b/packages/langium/test/parser/indentation-aware.test.ts
@@ -193,6 +193,18 @@ describe('IndentationAwareLexer', () => {
         expect(dedent.tokenType.name).toBe('DEDENT');
     });
 
+    test('should NOT add remaining dedents to the end if partial tokenizing', async () => {
+        const lexer = await getLexer(sampleGrammar);
+        const { tokens } = lexer.tokenize(expandToString`
+        // single-line comment
+        {
+            name`, { mode: 'partial' }); // partial mode: the input ends inside a still-open indentation
+        expect(tokens).toHaveLength(3); // L_BRAC, INDENT, id -- no DEDENT is appended at the end
+
+        const [/* L_BRAC */, indent, /* id */] = tokens;
+        expect(indent.tokenType.name).toBe('INDENT'); // the indentation is opened but intentionally not closed
+    });
+
     test('should not return any tokens for empty input', async () => {
         const lexer = await getLexer(sampleGrammar);
         const { tokens } = lexer.tokenize('');

From 14abdc4cf1c06c8a20b87dd3fd38ed742eb8514d Mon Sep 17 00:00:00 2001
From: Martin Fleck <mfleck@eclipsesource.com>
Date: Fri, 6 Sep 2024 13:30:28 +0200
Subject: [PATCH 2/3] Consider PR feedback

- Adapt a few method names
- Add missing parameter JSDoc
- Fix wrong whitespace in Lexer constructor
- Introduce specific types for unions
- Add completion test
- Use an empty string instead of the token name as the token image if
  there is no whitespace match, as the image length is important for
  offset calculation
---
 .../langium/src/parser/indentation-aware.ts   | 25 +++++++++----------
 packages/langium/src/parser/lexer.ts          |  8 +++---
 packages/langium/src/parser/token-builder.ts  |  9 ++++---
 .../src/validation/document-validator.ts      | 14 +++++------
 .../src/validation/validation-registry.ts     |  4 ++-
 .../test/parser/indentation-aware.test.ts     | 23 ++++++++++++++++-
 6 files changed, 55 insertions(+), 28 deletions(-)

diff --git a/packages/langium/src/parser/indentation-aware.ts b/packages/langium/src/parser/indentation-aware.ts
index d4c3abf5e..2ff40dec6 100644
--- a/packages/langium/src/parser/indentation-aware.ts
+++ b/packages/langium/src/parser/indentation-aware.ts
@@ -179,11 +179,11 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
         }
     }
 
-    override popLexingReport(text: string): IndentationLexingReport {
-        const result = super.popLexingReport(text);
+    override flushLexingReport(text: string): IndentationLexingReport {
+        const result = super.flushLexingReport(text);
         return {
             ...result,
-            remainingDedents: this.popRemainingDedents(text),
+            remainingDedents: this.flushRemainingDedents(text),
         };
     }
 
@@ -203,9 +203,12 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
      *
      * @param text The full input string.
      * @param offset The current position at which to attempt a match
+     * @param tokens Previously scanned tokens
+     * @param groups Token Groups
      * @returns The current and previous indentation levels and the matched whitespace
      */
-    protected matchWhitespace(text: string, offset: number, _tokens: IToken[], _groups: Record<string, IToken[]>): { currIndentLevel: number, prevIndentLevel: number, match: RegExpExecArray | null } {
+    // eslint-disable-next-line @typescript-eslint/no-unused-vars
+    protected matchWhitespace(text: string, offset: number, tokens: IToken[], groups: Record<string, IToken[]>): { currIndentLevel: number, prevIndentLevel: number, match: RegExpExecArray | null } {
         this.whitespaceRegExp.lastIndex = offset;
         const match = this.whitespaceRegExp.exec(text);
         return {
@@ -251,12 +254,10 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
      *
      * @param text The full input string.
      * @param offset The offset at which to attempt a match
-     * @param tokens Previously scanned Tokens
+     * @param tokens Previously scanned tokens
      * @param groups Token Groups
      */
     protected indentMatcher(text: string, offset: number, tokens: IToken[], groups: Record<string, IToken[]>): ReturnType<CustomPatternMatcherFunc> {
-        const { indentTokenName } = this.options;
-
         if (!this.isStartOfLine(text, offset)) {
             return null;
         }
@@ -274,7 +275,7 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
         const indentToken = this.createIndentationTokenInstance(
             this.indentTokenType,
             text,
-            match?.[0] ?? indentTokenName,
+            match?.[0] ?? '',
             offset,
         );
         tokens.push(indentToken);
@@ -288,12 +289,10 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
      *
      * @param text The full input string.
      * @param offset The offset at which to attempt a match
-     * @param tokens Previously scanned Tokens
+     * @param tokens Previously scanned tokens
      * @param groups Token Groups
      */
     protected dedentMatcher(text: string, offset: number, tokens: IToken[], groups: Record<string, IToken[]>): ReturnType<CustomPatternMatcherFunc> {
-        const { dedentTokenName } = this.options;
-
         if (!this.isStartOfLine(text, offset)) {
             return null;
         }
@@ -327,7 +326,7 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
             const token = this.createIndentationTokenInstance(
                 this.dedentTokenType,
                 text,
-                match?.[0] ?? dedentTokenName,
+                match?.[0] ?? '',
                 offset,
             );
             tokens.push(token);
@@ -362,7 +361,7 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
      * @param text Full text that was tokenized
      * @returns Remaining dedent tokens to match all previous indents at the end of the file
      */
-    popRemainingDedents(text: string): IToken[] {
+    flushRemainingDedents(text: string): IToken[] {
         const remainingDedents: IToken[] = [];
         while (this.indentationStack.length > 1) {
             remainingDedents.push(
diff --git a/packages/langium/src/parser/lexer.ts b/packages/langium/src/parser/lexer.ts
index bf6c0e299..28fd8c87a 100644
--- a/packages/langium/src/parser/lexer.ts
+++ b/packages/langium/src/parser/lexer.ts
@@ -25,8 +25,10 @@ export interface LexerResult {
     report?: LexingReport;
 }
 
+export type TokenizeMode = 'full' | 'partial';
+
 export interface TokenizeOptions {
-    mode: 'full' | 'partial';
+    mode: TokenizeMode;
 }
 
 export const DEFAULT_TOKENIZE_OPTIONS: TokenizeOptions = { mode: 'full' };
@@ -42,7 +44,7 @@ export class DefaultLexer implements Lexer {
     protected tokenBuilder: TokenBuilder;
     protected tokenTypes: TokenTypeDictionary;
 
-    constructor( services: LangiumCoreServices) {
+    constructor(services: LangiumCoreServices) {
         this.tokenBuilder = services.parser.TokenBuilder;
         const tokens = this.tokenBuilder.buildTokens(services.Grammar, {
             caseInsensitive: services.LanguageMetaData.caseInsensitive
@@ -64,7 +66,7 @@ export class DefaultLexer implements Lexer {
             tokens: chevrotainResult.tokens,
             errors: chevrotainResult.errors,
             hidden: chevrotainResult.groups.hidden ?? [],
-            report: this.tokenBuilder.popLexingReport?.(text)
+            report: this.tokenBuilder.flushLexingReport?.(text)
         };
     }
 
diff --git a/packages/langium/src/parser/token-builder.ts b/packages/langium/src/parser/token-builder.ts
index 9407c9c71..a2d8c2952 100644
--- a/packages/langium/src/parser/token-builder.ts
+++ b/packages/langium/src/parser/token-builder.ts
@@ -25,7 +25,7 @@ export interface TokenBuilder {
      *
      * @param text The text that was tokenized.
      */
-    popLexingReport?(text: string): LexingReport;
+    flushLexingReport?(text: string): LexingReport;
 }
 
 /**
@@ -36,8 +36,10 @@ export interface LexingReport {
     diagnostics: LexingDiagnostic[];
 }
 
+export type LexingDiagnosticSeverity = 'error' | 'warning' | 'info' | 'hint';
+
 export interface LexingDiagnostic extends ILexingError {
-    severity?: 'error' | 'warning' | 'info' | 'hint';
+    severity?: LexingDiagnosticSeverity;
 }
 
 export class DefaultTokenBuilder implements TokenBuilder {
@@ -64,7 +66,8 @@ export class DefaultTokenBuilder implements TokenBuilder {
         return tokens;
     }
 
-    popLexingReport(_text: string): LexingReport {
+    // eslint-disable-next-line @typescript-eslint/no-unused-vars
+    flushLexingReport(text: string): LexingReport {
         return { diagnostics: this.popDiagnostics() };
     }
 
diff --git a/packages/langium/src/validation/document-validator.ts b/packages/langium/src/validation/document-validator.ts
index 8c4ae0850..804cb4765 100644
--- a/packages/langium/src/validation/document-validator.ts
+++ b/packages/langium/src/validation/document-validator.ts
@@ -11,14 +11,14 @@ import type { ParseResult } from '../parser/langium-parser.js';
 import type { LangiumCoreServices } from '../services.js';
 import type { AstNode, CstNode } from '../syntax-tree.js';
 import type { LangiumDocument } from '../workspace/documents.js';
-import type { DiagnosticData, DiagnosticInfo, ValidationAcceptor, ValidationCategory, ValidationRegistry } from './validation-registry.js';
+import type { DiagnosticData, DiagnosticInfo, ValidationAcceptor, ValidationCategory, ValidationRegistry, ValidationSeverity } from './validation-registry.js';
 import { CancellationToken } from '../utils/cancellation.js';
 import { findNodeForKeyword, findNodeForProperty } from '../utils/grammar-utils.js';
 import { streamAst } from '../utils/ast-utils.js';
 import { tokenToRange } from '../utils/cst-utils.js';
 import { interruptAndCheck, isOperationCancelled } from '../utils/promise-utils.js';
 import { diagnosticData } from './validation-registry.js';
-import type { LexingDiagnostic } from '../parser/token-builder.js';
+import type { LexingDiagnostic, LexingDiagnosticSeverity } from '../parser/token-builder.js';
 
 export interface ValidationOptions {
     /**
@@ -100,7 +100,7 @@ export class DefaultDocumentValidator implements DocumentValidator {
     protected processLexingErrors(parseResult: ParseResult, diagnostics: Diagnostic[], _options: ValidationOptions): void {
         const lexerDiagnostics = [...parseResult.lexerErrors, ...parseResult.lexerReport?.diagnostics ?? []] as LexingDiagnostic[];
         for (const lexerDiagnostic of lexerDiagnostics) {
-            const severity = lexerDiagnostic?.severity ?? 'error';
+            const severity = lexerDiagnostic.severity ?? 'error';
             const diagnostic: Diagnostic = {
                 severity: toDiagnosticSeverity(severity),
                 range: {
@@ -180,7 +180,7 @@ export class DefaultDocumentValidator implements DocumentValidator {
 
     protected async validateAst(rootNode: AstNode, options: ValidationOptions, cancelToken = CancellationToken.None): Promise<Diagnostic[]> {
         const validationItems: Diagnostic[] = [];
-        const acceptor: ValidationAcceptor = <N extends AstNode>(severity: 'error' | 'warning' | 'info' | 'hint', message: string, info: DiagnosticInfo<N>) => {
+        const acceptor: ValidationAcceptor = <N extends AstNode>(severity: ValidationSeverity, message: string, info: DiagnosticInfo<N>) => {
             validationItems.push(this.toDiagnostic(severity, message, info));
         };
 
@@ -194,7 +194,7 @@ export class DefaultDocumentValidator implements DocumentValidator {
         return validationItems;
     }
 
-    protected toDiagnostic<N extends AstNode>(severity: 'error' | 'warning' | 'info' | 'hint', message: string, info: DiagnosticInfo<N, string>): Diagnostic {
+    protected toDiagnostic<N extends AstNode>(severity: ValidationSeverity, message: string, info: DiagnosticInfo<N, string>): Diagnostic {
         return {
             message,
             range: getDiagnosticRange(info),
@@ -233,7 +233,7 @@ export function getDiagnosticRange<N extends AstNode>(info: DiagnosticInfo<N, st
     return cstNode.range;
 }
 
-export function toDiagnosticSeverity(severity: 'error' | 'warning' | 'info' | 'hint'): DiagnosticSeverity {
+export function toDiagnosticSeverity(severity: LexingDiagnosticSeverity): DiagnosticSeverity {
     switch (severity) {
         case 'error':
             return 1; // according to vscode-languageserver-types/lib/esm/main.js#DiagnosticSeverity.Error
@@ -248,7 +248,7 @@ export function toDiagnosticSeverity(severity: 'error' | 'warning' | 'info' | 'h
     }
 }
 
-export function toDiagnosticData(severity: 'error' | 'warning' | 'info' | 'hint'): DiagnosticData {
+export function toDiagnosticData(severity: LexingDiagnosticSeverity): DiagnosticData {
     switch (severity) {
         case 'error':
             return diagnosticData(DocumentValidator.LexingError);
diff --git a/packages/langium/src/validation/validation-registry.ts b/packages/langium/src/validation/validation-registry.ts
index e6ae570fa..5c3d7acc1 100644
--- a/packages/langium/src/validation/validation-registry.ts
+++ b/packages/langium/src/validation/validation-registry.ts
@@ -57,7 +57,9 @@ export function diagnosticData(code: string): DiagnosticData {
     return { code };
 }
 
-export type ValidationAcceptor = <N extends AstNode>(severity: 'error' | 'warning' | 'info' | 'hint', message: string, info: DiagnosticInfo<N>) => void
+export type ValidationSeverity = 'error' | 'warning' | 'info' | 'hint';
+
+export type ValidationAcceptor = <N extends AstNode>(severity: ValidationSeverity, message: string, info: DiagnosticInfo<N>) => void
 
 export type ValidationCheck<T extends AstNode = AstNode> = (node: T, accept: ValidationAcceptor, cancelToken: CancellationToken) => MaybePromise<void>;
 
diff --git a/packages/langium/test/parser/indentation-aware.test.ts b/packages/langium/test/parser/indentation-aware.test.ts
index bf9683dee..558235570 100644
--- a/packages/langium/test/parser/indentation-aware.test.ts
+++ b/packages/langium/test/parser/indentation-aware.test.ts
@@ -11,7 +11,7 @@ import { EmptyFileSystem, IndentationAwareLexer, IndentationAwareTokenBuilder }
 import { createLangiumGrammarServices, createServicesForGrammar } from 'langium/grammar';
 import type { LangiumServices, PartialLangiumServices } from 'langium/lsp';
 import { expandToString } from 'langium/generate';
-import { parseHelper } from 'langium/test';
+import { expectCompletion, parseHelper } from 'langium/test';
 import type { IMultiModeLexerDefinition } from 'chevrotain';
 
 const grammarServices = createLangiumGrammarServices(EmptyFileSystem).grammar;
@@ -401,6 +401,27 @@ describe('IndentationAware parsing', () => {
         expect(return2.value).toBe(true);
     });
 
+    test('should offer correct auto-completion parsing', async () => {
+        const text = expandToString`
+        <|>if true:
+            <|>return true
+        <|>else:
+            <|>if false:
+                <|>return true
+                <|>return false
+            <|>return true
+        `;
+
+        const services = await createIndentationAwareServices(sampleGrammar);
+        const completion = expectCompletion(services);
+        await completion({ text, index: 0, expectedItems: ['if', 'return'] });
+        await completion({ text, index: 1, expectedItems: ['if', 'return'] });
+        await completion({ text, index: 2, expectedItems: ['else'] });
+        await completion({ text, index: 3, expectedItems: ['if', 'return'] });
+        await completion({ text, index: 4, expectedItems: ['if', 'return'] });
+        await completion({ text, index: 5, expectedItems: ['if', 'return'] });
+        await completion({ text, index: 6, expectedItems: ['if', 'return'] });
+    });
 });
 
 type Statement = If | Return;

From a4d2cb3fb30ebd3b3cd4f5b2f41f0a1a97594632 Mon Sep 17 00:00:00 2001
From: Martin Fleck <mfleck@eclipsesource.com>
Date: Fri, 6 Sep 2024 16:52:25 +0200
Subject: [PATCH 3/3] PR feedback

- Ensure column-index is 1-based to avoid error
- Make properties of optional options also optional
- Mark test case as failing
---
 packages/langium/src/parser/indentation-aware.ts       | 2 +-
 packages/langium/src/parser/lexer.ts                   | 2 +-
 packages/langium/test/parser/indentation-aware.test.ts | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/packages/langium/src/parser/indentation-aware.ts b/packages/langium/src/parser/indentation-aware.ts
index 2ff40dec6..3891f585e 100644
--- a/packages/langium/src/parser/indentation-aware.ts
+++ b/packages/langium/src/parser/indentation-aware.ts
@@ -315,7 +315,7 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, Key
                 offset,
                 length: match?.[0]?.length ?? 0,
                 line: this.getLineNumber(text, offset),
-                column: 0
+                column: 1
             });
             return null;
         }
diff --git a/packages/langium/src/parser/lexer.ts b/packages/langium/src/parser/lexer.ts
index 28fd8c87a..fedfad6fc 100644
--- a/packages/langium/src/parser/lexer.ts
+++ b/packages/langium/src/parser/lexer.ts
@@ -28,7 +28,7 @@ export interface LexerResult {
 export type TokenizeMode = 'full' | 'partial';
 
 export interface TokenizeOptions {
-    mode: TokenizeMode;
+    mode?: TokenizeMode;
 }
 
 export const DEFAULT_TOKENIZE_OPTIONS: TokenizeOptions = { mode: 'full' };
diff --git a/packages/langium/test/parser/indentation-aware.test.ts b/packages/langium/test/parser/indentation-aware.test.ts
index 558235570..766e6194f 100644
--- a/packages/langium/test/parser/indentation-aware.test.ts
+++ b/packages/langium/test/parser/indentation-aware.test.ts
@@ -401,7 +401,7 @@ describe('IndentationAware parsing', () => {
         expect(return2.value).toBe(true);
     });
 
-    test('should offer correct auto-completion parsing', async () => {
+    test.fails('should offer correct auto-completion parsing', async () => {
         const text = expandToString`
         <|>if true:
             <|>return true
@@ -415,6 +415,7 @@ describe('IndentationAware parsing', () => {
         const services = await createIndentationAwareServices(sampleGrammar);
         const completion = expectCompletion(services);
         await completion({ text, index: 0, expectedItems: ['if', 'return'] });
+        // PR 1669: the lines below currently fail as the completion provider may wrongly assume that all whitespace tokens are hidden
         await completion({ text, index: 1, expectedItems: ['if', 'return'] });
         await completion({ text, index: 2, expectedItems: ['else'] });
         await completion({ text, index: 3, expectedItems: ['if', 'return'] });