scrowl/src/lang/parser/cursor.ts
2026-02-07 10:43:30 +01:00

333 lines
10 KiB
TypeScript

import { char, NEW_LINE, CARRIAGE_RETURN, DOT, DIGIT_0, DIGIT_9, LOWERCASE_a, LOWERCASE_f, UPPERCASE_A, UPPERCASE_F, SPACE, TAB } from './source_text';
import type { SourceText, Span, SourceLocation, CodePoint, StringIndex, CodePointIndex } from './source_text';
import { Result } from '../result';
/**
 * Snapshot of a `Cursor`'s position-tracking fields.
 * Produced by `Cursor.save()` and accepted by `Cursor.restore()`.
 */
export type CursorState = {
// Index of the current code point within `text.chars`.
index: CodePointIndex,
// Line number of the current position (a fresh cursor starts at 1).
line: number,
// Column number of the current position (a fresh cursor starts at 1).
column: number,
// True when the most recently consumed code point was a carriage return.
lastCharWasCR: boolean,
}
/**
 * A forward-moving reader over a `SourceText`.
 *
 * The cursor advances one code point at a time and keeps a running line and
 * column (both starting at 1). `\n`, `\r` and the pair `\r\n` each count as
 * exactly one line break; every other code point advances the column by one.
 * Use `save()` / `restore()` to backtrack.
 */
export class Cursor {
  private index: CodePointIndex = 0;
  private line: number = 1;
  private column: number = 1;
  // Track previous char to handle \r\n correctly
  private lastCharWasCR: boolean = false;

  constructor(readonly text: SourceText) {}

  /** Captures the current position so it can later be passed to `restore()`. */
  save(): CursorState {
    return { index: this.index, line: this.line, column: this.column, lastCharWasCR: this.lastCharWasCR };
  }

  /** Rewinds (or fast-forwards) the cursor to a state previously returned by `save()`. */
  restore({ index, line, column, lastCharWasCR }: CursorState) {
    this.index = index;
    this.line = line;
    this.column = column;
    this.lastCharWasCR = lastCharWasCR;
  }

  /** Returns true once the cursor has moved past the last code point. */
  eof(): boolean {
    return this.index >= this.text.length;
  }

  /**
   * Returns the code point `n` positions ahead of the cursor without
   * consuming anything, or `undefined` if that position is out of range.
   */
  peek(n: number = 0): CodePoint | undefined {
    return this.text.chars[this.index + n]?.char;
  }

  /**
   * Consumes and returns the current code point, updating line/column.
   * Returns `undefined` (and changes nothing) at end of input.
   */
  next(): CodePoint | undefined {
    const ref = this.text.chars[this.index];
    if (!ref) return undefined;
    const c = ref.char;
    this.index++;
    if (c === NEW_LINE) {
      if (!this.lastCharWasCR) {
        this.line++;
        this.column = 1;
      } else {
        // We just saw \r, so this \n is part of \r\n.
        // We already bumped the line count on \r.
        // Just reset the flag.
        this.lastCharWasCR = false;
      }
    } else if (c === CARRIAGE_RETURN) {
      this.line++;
      this.column = 1;
      this.lastCharWasCR = true;
    } else {
      this.column++;
      this.lastCharWasCR = false;
    }
    return c;
  }

  /** The cursor's position as a code-point index. */
  get currentIndex(): CodePointIndex {
    return this.index;
  }

  // TODO: unicode-index ~> string-offset, make that into a separate function.
  /**
   * Returns the `offset` recorded for the current code point, or
   * `text.source.length` when the cursor is at end of input.
   */
  currentOffset(): StringIndex {
    return this.text.chars[this.index]?.offset ?? this.text.source.length;
  }

  /** Returns the current index, line and column as a `SourceLocation`. */
  currentLocation(): SourceLocation {
    return { index: this.index, line: this.line, column: this.column };
  }

  /** Builds a span running from `start` to the cursor's current location. */
  makeSpan(start: SourceLocation): Span {
    return {
      start,
      end: this.currentLocation(),
    };
  }

  /**
   * Consumes code points for as long as `pred` accepts them.
   * @returns the number of code points consumed.
   */
  consumeWhile(pred: (c: CodePoint) => boolean): number {
    let count = 0;
    while (!this.eof()) {
      const c = this.peek();
      if (c === undefined || !pred(c)) break;
      this.next();
      count++;
    }
    return count;
  }

  // Helper to check for exact char matches quickly
  /** Consumes the current code point and returns true only if it equals `c`. */
  match(c: CodePoint): boolean {
    if (this.peek() === c) {
      this.next();
      return true;
    }
    return false;
  }

  /**
   * Checks if the next characters match the string.
   * If yes, consumes them and returns true.
   * If no, touches nothing and returns false.
   *
   * The comparison walks `str` by code point, not by UTF-16 code unit, so
   * patterns containing characters outside the BMP line up with the cursor's
   * code-point indexing.
   */
  matchString(str: string): boolean {
    // Pass 1: look ahead without consuming. Iterating a string yields whole
    // code points; peek() past the end yields undefined, which never matches.
    let lookahead = 0;
    for (const ch of str) {
      if (this.peek(lookahead) !== ch.codePointAt(0)) return false;
      lookahead++;
    }
    // Pass 2: we must call next() to correctly update line/col tracking.
    // We already know it matches, so we just burn through.
    for (let i = 0; i < lookahead; i++) {
      this.next();
    }
    return true;
  }
}
// === Basic Scanners/Predicates ===
/**
 * Reports whether a code point is one of the four whitespace characters the
 * scanner recognises: space, tab, line feed or carriage return.
 */
export function isWhitespace(char: CodePoint): boolean {
  switch (char) {
    case SPACE:
    case TAB:
    case NEW_LINE:
    case CARRIAGE_RETURN:
      return true;
    default:
      return false;
  }
}
/** Reports whether a code point lies in the ASCII range `0`..`9`. */
export function isDigit(char: CodePoint): boolean {
  const atLeastZero = DIGIT_0 <= char;
  const atMostNine = char <= DIGIT_9;
  return atLeastZero && atMostNine;
}
/**
 * Scan errors that are not tied to one particular literal kind.
 * - `UnexpectedCharacter`: a code point other than the expected one was found;
 *   `char` carries the code point that was seen.
 * - `UnexpectedEOF`: the input ended while more was required.
 */
export type GenericScanError =
| { tag: "UnexpectedCharacter", char: CodePoint, span: Span }
| { tag: "UnexpectedEOF", span: Span }
/**
 * Errors reported by `scanNumber`.
 * - `ExpectedNumber`: no digit was found where the integer part should start.
 * - `InvalidNumber` / `MissingFractionalDigits`: a `.` was not followed by a digit.
 * - `InvalidNumber` / `NotFinite`: the scanned text converted to a non-finite number.
 */
export type NumberError =
| { tag: "ExpectedNumber", span: Span }
| { tag: "InvalidNumber", reason: "NotFinite" | "MissingFractionalDigits", span: Span }
/**
 * Scans a decimal number literal starting at the cursor.
 *
 * Grammar:
 *   number := optional(`-`) digits optional(`.` digits)
 *
 * @param cursor Reader positioned at the first character of the candidate number.
 * @returns On success, the numeric value and the span covering the whole
 *   literal. On failure, one of:
 *   - `ExpectedNumber` when no digit follows the optional sign (span runs from
 *     the start of the attempt to where scanning stopped);
 *   - `InvalidNumber` / `MissingFractionalDigits` when a `.` is not followed by
 *     a digit (span starts at the `.`);
 *   - `InvalidNumber` / `NotFinite` when the text converts to a non-finite value.
 *
 * The cursor is not rewound on failure; it stays wherever scanning stopped.
 */
export function scanNumber(cursor: Cursor): Result<{ value: number, span: Span }, NumberError> {
  const startNumberLocation = cursor.currentLocation();

  // 1. Optional Sign
  if (cursor.peek() === char("-")) {
    cursor.next();
  }

  // 2. Integer Part
  const integerPartDigitCount = cursor.consumeWhile(isDigit);
  if (integerPartDigitCount === 0) {
    return Result.error({
      tag: "ExpectedNumber",
      span: cursor.makeSpan(startNumberLocation),
    });
  }

  // 3. Fractional Part
  if (cursor.peek() === DOT) {
    const dotLocation = cursor.currentLocation();
    cursor.next(); // consume '.'
    const fracPartDigitCount = cursor.consumeWhile(isDigit);
    if (fracPartDigitCount === 0) {
      return Result.error({
        tag: "InvalidNumber",
        reason: "MissingFractionalDigits",
        span: cursor.makeSpan(dotLocation),
      });
    }
  }

  // 4. Convert the exact text that was consumed.
  const text = cursor.text.sliceByCp(startNumberLocation.index, cursor.currentIndex);
  const value = Number(text);
  if (!Number.isFinite(value)) {
    // Only reachable through overflow: the text is known to be sign + digits.
    return Result.error({
      tag: "InvalidNumber",
      reason: "NotFinite",
      span: cursor.makeSpan(startNumberLocation),
    });
  }

  return Result.ok({
    value,
    span: cursor.makeSpan(startNumberLocation),
  });
}
/**
 * String-literal-specific scan error: a backslash escape inside the literal
 * was malformed. `reason` says what was wrong with it.
 */
export type StringError =
| { tag: "InvalidEscape", reason: EscapeErrorReason, span: Span };
/**
 * The specific way an escape sequence in a string literal was invalid.
 * The `Unicode*` variants all concern the `\u{...}` form.
 */
export type EscapeErrorReason =
| { tag: "UnknownEscapeSequence", char: CodePoint } // e.g. \k
| { tag: "UnicodeMissingBrace" } // \u without {
| { tag: "UnicodeNoDigits" } // \u{}
| { tag: "UnicodeUnclosed" } // \u{FF without }
| { tag: "UnicodeOverflow", value: number }; // \u{110000}
/**
 * Scans a double-quoted string literal starting at the cursor and decodes its
 * escape sequences.
 *
 * Supported escapes: `\n`, `\r`, `\t`, `\\`, `\0`, `\"` and `\u{HEX}` (one or
 * more hex digits, value at most 0x10FFFF). Every other code point, including
 * raw line breaks, is copied into the value unchanged.
 *
 * @param cursor Reader positioned at the opening `"`.
 * @returns On success, the decoded string and the span from the opening to the
 *   closing quote (inclusive). On failure, one of:
 *   - `UnexpectedEOF` when the input is empty, ends before the closing quote,
 *     or ends immediately after a backslash;
 *   - `UnexpectedCharacter` when the first code point is not `"`;
 *   - `InvalidEscape` with a reason describing the malformed escape.
 *
 * The cursor is not rewound on failure; it stays wherever scanning stopped.
 * Note that `\u{...}` values are only checked against the 0x10FFFF upper
 * bound; surrogate values are not rejected.
 */
export function scanString(cursor: Cursor): Result<{ value: string, span: Span }, StringError | GenericScanError> {
  // Built once per call rather than once per `\u{...}` escape.
  const isHexDigit = (cp: CodePoint): boolean =>
    (cp >= DIGIT_0 && cp <= DIGIT_9) ||
    (cp >= LOWERCASE_a && cp <= LOWERCASE_f) ||
    (cp >= UPPERCASE_A && cp <= UPPERCASE_F);

  const start = cursor.currentLocation();
  const firstChar = cursor.peek();
  if (firstChar === undefined) {
    return Result.error({ tag: "UnexpectedEOF", span: cursor.makeSpan(start) });
  }
  if (firstChar !== char('"')) {
    return Result.error({ tag: "UnexpectedCharacter", char: firstChar, span: cursor.makeSpan(start) });
  }
  cursor.next(); // consume opening quote

  let value = ""; // The actual string content
  while (true) {
    const c = cursor.peek();
    if (c === undefined || cursor.eof()) {
      // Ran out of input before the closing quote.
      return Result.error({ tag: "UnexpectedEOF", span: cursor.makeSpan(start) });
    }

    // 1. End of string
    if (c === char('"')) {
      cursor.next(); // consume closing quote
      break;
    }

    if (c === char('\\')) {
      // 2. Escape Sequences
      const escapeStart = cursor.currentLocation();
      cursor.next(); // consume backslash
      const escaped = cursor.peek();
      if (escaped === undefined) {
        // A trailing backslash means the literal is unterminated, not that an
        // unknown escape character was used.
        return Result.error({ tag: "UnexpectedEOF", span: cursor.makeSpan(start) });
      }
      switch (escaped) {
        case char('n'): value += '\n'; cursor.next(); break;
        case char('r'): value += '\r'; cursor.next(); break;
        case char('t'): value += '\t'; cursor.next(); break;
        case char('\\'): value += '\\'; cursor.next(); break;
        case char("0"): value += "\0"; cursor.next(); break;
        case char('"'): value += '"'; cursor.next(); break;
        // Unicode Escape: \u{XXXX}
        case char('u'): {
          cursor.next(); // consume 'u'
          // Expect '{'. Errors inside the unicode escape are reported with a
          // span that begins here, at the expected brace position.
          const braceStart = cursor.currentLocation();
          if (cursor.peek() !== char('{')) {
            return Result.error({ tag: "InvalidEscape", reason: { tag: "UnicodeMissingBrace" }, span: cursor.makeSpan(braceStart) });
          }
          cursor.next(); // consume '{'
          // Consume Hex Digits
          const hexStart = cursor.currentIndex;
          const hexCount = cursor.consumeWhile(isHexDigit);
          if (hexCount === 0) {
            return Result.error({ tag: "InvalidEscape", reason: { tag: "UnicodeNoDigits" }, span: cursor.makeSpan(braceStart) });
          }
          // Expect '}'
          if (cursor.peek() !== char("}")) {
            return Result.error({ tag: "InvalidEscape", reason: { tag: "UnicodeUnclosed" }, span: cursor.makeSpan(braceStart) });
          }
          cursor.next(); // consume '}'
          // Convert & Append
          const hexStr = cursor.text.sliceByCp(hexStart, hexStart + hexCount);
          const codePoint = parseInt(hexStr, 16);
          if (codePoint > 0x10FFFF) {
            return Result.error({ tag: "InvalidEscape", reason: { tag: "UnicodeOverflow", value: codePoint }, span: cursor.makeSpan(braceStart) });
          }
          value += String.fromCodePoint(codePoint);
          break;
        }
        default:
          return Result.error({
            tag: "InvalidEscape",
            reason: { tag: "UnknownEscapeSequence", char: escaped },
            span: cursor.makeSpan(escapeStart)
          });
      }
    } else {
      // 3. Regular character
      // Optimization: consume chunks of non-special chars for speed?
      // For now, char-by-char is fine.
      cursor.next();
      value += String.fromCodePoint(c);
    }
  }

  return Result.ok({
    value,
    span: cursor.makeSpan(start)
  });
}
// TODO: rendering of errors
// function renderStringError(err: StringError): string {
// switch (err.tag) {
// case "ExpectedQuote": return "Expected a string starting with \"";
// case "UnexpectedEOF": return "Unterminated string literal";
// case "InvalidEscape":
// const k = err.kind;
// switch (k.tag) {
// case "Unknown":
// return `Unknown escape sequence '\\${String.fromCodePoint(k.char)}'`;
// case "UnicodeMissingBrace":
// return "Unicode escape must start with '{', e.g. \\u{1F600}";
// case "UnicodeNoDigits":
// return "Empty unicode escape \\u{}";
// case "UnicodeUnclosed":
// return "Expected '}' to close unicode escape";
// case "UnicodeOverflow":
// return `Unicode code point 0x${k.value.toString(16)} is too large (max 0x10FFFF)`;
// }
// }
// }