Improve and abstract Cursor design. Start scanner

This commit is contained in:
Yura Dupyn 2026-02-06 00:38:16 +01:00
parent d382b16e6d
commit d5f9777711
8 changed files with 713 additions and 476 deletions

View file

@ -1,318 +1,85 @@
import { char, isWhitespace, isDigit } from './source_text';
import { CARRIAGE_RETURN, char, NEW_LINE, SPACE, TAB } from './source_text';
import type { SourceText, Span, SourceLocation, CodePoint, StringIndex, CodePointIndex } from './source_text';
// Reports whether `c` equals the code point of one of the single-character
// symbols: # $ @ ( ) { } , .
// NOTE(review): in this view the function's closing brace does not directly
// follow the `);` below — confirm the brace placement against the real file.
function isSymbolChar(c: CodePoint): boolean {
return (
c === char("#") ||
c === char("$") ||
c === char("@") ||
c === char("(") ||
c === char(")") ||
c === char("{") ||
c === char("}") ||
c === char(",") ||
c === char(".")
);
import { isDigit, isWhitespace, scanNumber, scanString } from './cursor';
import type { Cursor, CursorState, GenericScanError, NumberError, StringError } from './cursor';
import { Result } from '../result';
// === Language Specific Stuff ===
// Single-character strings that end a raw identifier (see `scanRawIdentifier`).
// `as const` keeps the literal types so `Delimiter` can be derived from the array.
const DELIMITER_CHARS = ["(", ")", "{", "}", ".", ",", "@", "$", "#", '"', "\\"] as const;
// Union of the literal strings listed in DELIMITER_CHARS.
export type Delimiter = typeof DELIMITER_CHARS[number];
// The same delimiters converted with `char(...)`, for membership tests against a CodePoint.
const DELIMITER_SET: Set<CodePoint> = new Set(DELIMITER_CHARS.map(c => char(c)));
// The keyword spellings, as a union of string literals.
export type Keyword = "let" | "fn" | "match" | "apply" | "=" | "|" | "!";
/**
 * Scans a raw identifier starting at the cursor's current position and returns its text.
 *
 * Consumption stops at EOF, at a delimiter, at whitespace, or at the start of a
 * `//` line comment. A lone `/` is still part of the identifier.
 * Does NOT create a token (the caller decides the token type). Returns an empty
 * string when nothing could be consumed.
 */
function scanRawIdentifier(cursor: Cursor): string {
  const start = cursor.currentIndex;
  const slash = char("/");

  cursor.consumeWhile(c => {
    if (DELIMITER_SET.has(c) || isWhitespace(c)) return false;
    // A comment opener is two characters long, so one character of lookahead is needed.
    // Assumes the predicate is evaluated on the current, not-yet-consumed character,
    // so that `peek(1)` is the character right after it — TODO confirm in `cursor.ts`.
    if (c === slash && cursor.peek(1) === slash) return false;
    return true;
  });

  return cursor.text.sliceByCp(start, cursor.currentIndex);
}
/** True when `char` is neither whitespace nor one of the symbol characters. */
function isIdentifierChar(char: CodePoint): boolean {
  if (isWhitespace(char)) {
    return false;
  }
  return !isSymbolChar(char);
}
// Errors that scanning an expression can report (it is the error side of
// `scanExprStart`'s `Result`). Two variants are declared here, discriminated by
// `tag`; `NumberError` and `StringError` are imported from `./cursor`.
export type ExprScanError =
| { tag: "UnexpectedCharacter", char: CodePoint, span: Span }
| { tag: "InvalidIdentifier", text: string, reason: string, span: Span }
| NumberError
| StringError;
// NOTE(review): `Keyword` is also declared earlier in this view (with an extra "!"
// member); only one of the two declarations can remain in the real file.
export type Keyword = "let" | "fn" | "match" | "apply" | "=" | "|";
// Single-character symbol spellings. Note that this name shadows the global
// `Symbol` type inside this module.
export type Symbol = "#" | "$" | "@" | "(" | ")" | "{" | "}" | "," | ".";
// === Scanner ===
// NOTE(review): two declaration headers (`Token` and `ExprStartToken`) appear back
// to back here, so the variants below cannot be attributed to one of them from
// this view — confirm against the real file.
// Every variant is discriminated by `tag` and carries the `span` it was read from.
export type Token =
export type ExprStartToken =
| { tag: "number", value: number, span: Span }
| { tag: "string", text: string, span: Span }
| { tag: "identifier", text: string, span: Span }
| { tag: "function_name", name: string, span: Span }
| { tag: "variable_use", name: string, span: Span }
| { tag: "tag", name: string, span: Span }
| { tag: "tagged", name: string, span: Span } // TODO: This may be a bit weird. Actually we can lookahead and see if the next char after the identifier is NOT-AN-EXPRESSION start.
| { tag: "keyword", kw: Keyword, span: Span }
| { tag: "symbol", sym: Symbol, span: Span }
| { tag: "EOF", span: Span }
// Span-less description of a token's kind. The `symbol` and `keyword` kinds
// additionally say which symbol/keyword in `value`; the other kinds carry only `tag`.
export namespace TokenKind {
export type T =
| { tag: "number" }
| { tag: "string" }
| { tag: "identifier" }
| { tag: "symbol", value: Symbol }
| { tag: "keyword", value: Keyword }
| { tag: "EOF" }
}
// TODO: Move this back to `cursor.ts`
// Error values thrown by the `Scanner` scanning methods. Variants are
// discriminated by `tag`, and each one carries the `span` it refers to.
export type LexError =
| { tag: "UnexpectedCharacter", char: CodePoint, span: Span }
| { tag: "UnexpectedEOF", span: Span }
| { tag: "ExpectedNumber", span: Span }
| { tag: "InvalidNumber", text: string, reason: string, span: Span }
| { tag: "InvalidEscape", reason: string, span: Span };
// Skips whitespace and `//` line comments at the cursor and returns the running
// total of what was consumed.
// NOTE(review): in this view the body of this function is interleaved with
// unrelated `Scanner` class code — confirm against the real file.
function skipWhitespaceAndComments(cursor: Cursor): number {
let totalConsumed = 0;
export class Scanner {
private i: CodePointIndex = 0;
private line = 1;
private column = 1;
while (true) {
// 1. Consume standard whitespace (spaces, tabs, newlines)
const wsCount = cursor.consumeWhile(isWhitespace);
totalConsumed += wsCount;
// Track previous char to handle \r\n correctly
private lastCharWasCR = false;
// 2. Check for Line Comment start ('//')
const c = cursor.peek();
const nextC = cursor.peek(1);
constructor(private readonly text: SourceText) {}
if (c === char('/') && nextC === char('/')) {
// Found comment start. Consume the '//' markers
cursor.next();
cursor.next();
totalConsumed += 2;
/** Reports whether the read position has reached (or passed) the end of the text. */
eof(): boolean {
  return this.text.length <= this.i;
}
/**
 * Returns the code point `n` positions ahead of the read position without
 * consuming it, or `undefined` when there is no entry at that position.
 */
private peek(n: number = 0): CodePoint | undefined {
  const entry = this.text.chars[this.i + n];
  return entry?.char;
}
// Consumes one code point and returns it (`undefined` when nothing is left),
// updating `line` / `column` on the way. `\r`, `\n` and the pair `\r\n` each
// advance the line counter exactly once.
// NOTE(review): the "Consume everything until the next newline" comment and the
// `cursor.consumeWhile(...)` / `totalConsumed` lines inside the `\r` branch use
// names that are not declared in this method — they look interleaved from other
// code in this view; confirm against the real file.
private next(): CodePoint | undefined {
const ref = this.text.chars[this.i];
if (!ref) return undefined;
const c = ref.char;
this.i++;
if (c === 0x0A /* \n */) {
if (!this.lastCharWasCR) {
this.line++;
this.column = 1;
} else {
// We just saw \r, so this \n is part of \r\n.
// We already bumped the line count on \r.
// Just reset the flag.
this.lastCharWasCR = false;
}
} else if (c === 0x0D /* \r */) {
this.line++;
this.column = 1;
this.lastCharWasCR = true;
// Consume everything until the next newline (or EOF).
// Note: We do NOT consume the newline itself here.
// We let the NEXT iteration of the 'while(true)' loop catch
// the newline as standard whitespace.
const commentContentLength = cursor.consumeWhile(c => c !== NEW_LINE && c !== CARRIAGE_RETURN);
totalConsumed += commentContentLength;
} else {
this.column++;
this.lastCharWasCR = false;
}
return c;
}
/**
 * Returns the `offset` stored for the entry at the read position, falling back
 * to the length of the source string when there is no such entry.
 */
private currentOffset(): StringIndex {
  const here = this.text.chars[this.i];
  return here?.offset ?? this.text.source.length;
}
/** Snapshots the current read index together with the tracked line and column. */
private currentLocation(): SourceLocation {
  const { i: index, line, column } = this;
  return { index, line, column };
}
/**
 * Builds a span that begins at `start` and ends at the current read position.
 *
 * `start`/`end` are taken from the stored offsets (the source string length is
 * used when there is no entry at that index); `line`/`column` are those of `start`.
 */
private makeSpan(start: SourceLocation): Span {
  const { index, line, column } = start;
  const firstEntry = this.text.chars[index];
  const begin = firstEntry?.offset ?? this.text.source.length;
  const finish = this.currentOffset();
  return { start: begin, end: finish, line, column };
}
/**
 * Consumes code points for as long as `pred` accepts the upcoming one.
 *
 * @returns how many code points were consumed (0 when the first one is rejected
 *          or the text is already exhausted).
 */
private consumeWhile(pred: (c: CodePoint) => boolean): number {
  let consumed = 0;
  for (;;) {
    if (this.eof()) return consumed;
    const upcoming = this.peek();
    if (upcoming === undefined) return consumed;
    if (!pred(upcoming)) return consumed;
    this.next();
    consumed += 1;
  }
}
/**
 * Consumes and returns the upcoming code point if `pred` accepts it.
 *
 * @throws the supplied `error` value when nothing is left or `pred` rejects the
 *         upcoming code point; nothing is consumed in that case.
 */
private expect(
  pred: (c: CodePoint) => boolean,
  error: LexError
): CodePoint {
  const upcoming = this.peek();
  if (upcoming !== undefined && pred(upcoming)) {
    this.next();
    return upcoming;
  }
  throw error;
}
/**
 * Consumes the upcoming code point only when it is exactly `c`.
 *
 * @returns `true` if a code point was consumed, `false` otherwise.
 */
private match(c: CodePoint): boolean {
  if (this.peek() !== c) {
    return false;
  }
  this.next();
  return true;
}
/** Consumes every consecutive whitespace code point at the read position. */
private skipWhitespace(): void {
  this.consumeWhile((c) => isWhitespace(c));
}
// === Main Scanners ===
/**
 * Scans a number literal at the read position.
 *
 * Grammar: optional(`-`) digits optional(`.` digits)
 *
 * @returns a `number` token whose span covers the whole literal.
 * @throws LexError `ExpectedNumber` when no digit follows the optional sign;
 *         `InvalidNumber` with reason "MissingFractionalDigits" when a `.` is not
 *         followed by a digit (the span starts at the `.`), or with reason
 *         "NotFinite" when the text does not convert to a finite number.
 */
private scanNumber(): Token {
  const startNumberLocation = this.currentLocation();
  // Text consumed since the start of the literal; reported on InvalidNumber errors.
  const consumedText = (): string =>
    this.text.sliceByCp(startNumberLocation.index, this.i);

  // 1. Optional sign
  if (this.peek() === char("-")) {
    this.next();
  }

  // 2. Integer part: at least one digit is required.
  if (this.consumeWhile(isDigit) === 0) {
    const error: LexError = {
      tag: "ExpectedNumber",
      span: this.makeSpan(startNumberLocation),
    };
    throw error;
  }

  // 3. Optional fractional part: a `.` must be followed by at least one digit.
  if (this.peek() === char(".")) {
    const dotLocation = this.currentLocation();
    this.next(); // consume '.'
    if (this.consumeWhile(isDigit) === 0) {
      const error: LexError = {
        tag: "InvalidNumber",
        text: consumedText(),
        reason: "MissingFractionalDigits",
        span: this.makeSpan(dotLocation),
      };
      throw error;
    }
  }

  const text = consumedText();
  const value = Number(text);
  // Very long digit runs convert to Infinity; reject them instead of producing a token.
  if (!Number.isFinite(value)) {
    const error: LexError = {
      tag: "InvalidNumber",
      text,
      reason: "NotFinite",
      span: this.makeSpan(startNumberLocation),
    };
    throw error;
  }

  return {
    tag: "number",
    value,
    span: this.makeSpan(startNumberLocation),
  };
}
// Scans a double-quoted string literal and returns a `string` token holding the
// decoded text. Handles the escapes \n \r \t \\ \0 \" and \u{HEX}; throws a
// LexError on EOF before the closing quote or on a malformed/unknown escape.
// NOTE(review): this view is garbled around the closing-quote branch: `wsCount`
// is not declared in this method, and the backslash test sits inside the
// closing-quote `if` as shown — confirm against the real file.
private scanString(): Token {
const start = this.currentLocation();
// We assume the caller checked the opening quote '"'
// NOTE(review): the error object below is built without the `char` field that
// the `UnexpectedCharacter` variant of `LexError` declares.
this.expect(c => c === char('"'), <LexError>{ tag: "UnexpectedCharacter", span: this.makeSpan(start) });
let value = ""; // The actual string content
while (true) {
if (this.eof()) {
throw <LexError>{ tag: "UnexpectedEOF", span: this.makeSpan(start) };
}
const c = this.peek();
// 1. End of string
if (c === char('"')) {
this.next(); // consume closing quote
// We are not at a comment.
// If we also didn't consume any whitespace in step 1, we are truly done.
if (wsCount === 0) {
break;
}
if (c === char('\\')) {
// 2. Escape Sequences
const escapeStart = this.currentLocation();
this.next(); // consume backslash
const escaped = this.peek();
switch (escaped) {
case char('n'): value += '\n'; this.next(); break;
case char('r'): value += '\r'; this.next(); break;
case char('t'): value += '\t'; this.next(); break;
case char('\\'): value += '\\'; this.next(); break;
// NOTE(review): unlike the sibling cases this one does not call this.next(),
// so the `0` is still unconsumed when the loop continues.
case char("0"): value += "\0"; break;
case char('"'): value += '"'; this.next(); break;
// Unicode Escape: \u{XXXX}
case char('u'): {
this.next(); // consume 'u'
// Expect '{'
const braceStart = this.currentLocation();
if (this.peek() !== char('{')) {
throw <LexError>{ tag: "InvalidEscape", reason: "Expected '{' after \\u", span: this.makeSpan(braceStart) };
}
this.next(); // consume '{'
// Consume Hex Digits
const hexStart = this.i;
const hexCount = this.consumeWhile(c =>
(c >= char('0') && c <= char('9')) ||
(c >= char('a') && c <= char('f')) ||
(c >= char('A') && c <= char('F'))
);
if (hexCount === 0) {
throw <LexError>{ tag: "InvalidEscape", reason: "Expected hex digits in \\u{...}", span: this.makeSpan(braceStart) };
}
// Expect '}'
if (this.peek() !== char("}")) {
throw <LexError>{ tag: "InvalidEscape", reason: "Expected '}' closing unicode escape", span: this.makeSpan(braceStart) };
}
this.next(); // consume '}'
// Convert & Append
const hexStr = this.text.sliceByCp(hexStart, hexStart + hexCount);
const codePoint = parseInt(hexStr, 16);
// String.fromCodePoint throws a RangeError above 0x10FFFF, so reject it first.
if (codePoint > 0x10FFFF) {
throw <LexError>{ tag: "InvalidEscape", reason: "Invalid Unicode Code Point (max 0x10FFFF)", span: this.makeSpan(braceStart) };
}
value += String.fromCodePoint(codePoint);
break;
}
default:
throw <LexError>{
tag: "InvalidEscape",
reason: `UnknownEscapeSequence`,
span: this.makeSpan(escapeStart)
};
}
} else {
// 3. Regular character
// Optimization: consume chunks of non-special chars for speed?
// For now, char-by-char is fine.
this.next();
// Note: We use ! because we checked EOF at loop start
value += String.fromCodePoint(c!);
}
}
return {
tag: "string",
text: value,
span: this.makeSpan(start)
};
}
return totalConsumed;
}
/**
 * Placeholder for the scanner entry point that, per its name and return type,
 * is meant to read the token starting an expression.
 *
 * Not implemented yet: `cursor` is ignored and the number 0, typed as `any`,
 * is returned.
 */
export function scanExprStart(cursor: Cursor): Result<ExprStartToken, ExprScanError> {
  // TODO
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- stub until implemented
  const placeholder: any = 0;
  return placeholder;
}
// TODO: Need a Token to TokenKind function
// TODO: Need is_start_of_expression(token): boolean