import {
  char,
  NEW_LINE,
  CARRIAGE_RETURN,
  DOT,
  DIGIT_0,
  DIGIT_9,
  LOWERCASE_a,
  LOWERCASE_f,
  UPPERCASE_A,
  UPPERCASE_F,
  SPACE,
  TAB,
} from './source_text';
import type { SourceText, Span, SourceLocation, CodePoint, StringIndex, CodePointIndex } from './source_text';
import { Result } from '../result';

/** Snapshot of a {@link Cursor}'s position, produced by `save()` and consumed by `restore()`. */
export type CursorState = {
  index: CodePointIndex,
  line: number,
  column: number,
  lastCharWasCR: boolean,
}

/**
 * Forward-only reader over a `SourceText` that advances one code point at a
 * time and keeps a line/column position in sync with what has been consumed.
 *
 * Line and column both start at 1. `\n`, `\r` and `\r\n` each count as a
 * single line break.
 */
export class Cursor {
  private index: CodePointIndex = 0;
  private line: number = 1;
  private column: number = 1;

  // Track previous char to handle \r\n correctly
  private lastCharWasCR: boolean = false;

  constructor(readonly text: SourceText) {}

  /** Captures the current position so it can later be handed to `restore()`. */
  save(): CursorState {
    return {
      index: this.index,
      line: this.line,
      column: this.column,
      lastCharWasCR: this.lastCharWasCR,
    };
  }

  /** Rewinds (or fast-forwards) the cursor to a position previously returned by `save()`. */
  restore({ index, line, column, lastCharWasCR }: CursorState): void {
    this.index = index;
    this.line = line;
    this.column = column;
    this.lastCharWasCR = lastCharWasCR;
  }

  /** True once the index has reached `text.length`. */
  eof(): boolean {
    return this.index >= this.text.length;
  }

  /**
   * Returns the code point `n` positions ahead of the cursor without consuming
   * anything, or `undefined` when that position has no entry in `text.chars`.
   */
  peek(n: number = 0): CodePoint | undefined {
    return this.text.chars[this.index + n]?.char;
  }

  /**
   * Consumes and returns the current code point, updating line/column.
   * Returns `undefined` (and changes nothing) when there is nothing left.
   */
  next(): CodePoint | undefined {
    const ref = this.text.chars[this.index];
    if (!ref) return undefined;

    const c = ref.char;
    this.index++;

    if (c === NEW_LINE) {
      if (!this.lastCharWasCR) {
        this.line++;
        this.column = 1;
      } else {
        // We just saw \r, so this \n is part of \r\n.
        // We already bumped the line count on \r.
        // Just reset the flag.
        this.lastCharWasCR = false;
      }
    } else if (c === CARRIAGE_RETURN) {
      this.line++;
      this.column = 1;
      this.lastCharWasCR = true;
    } else {
      this.column++;
      this.lastCharWasCR = false;
    }

    return c;
  }

  /** Index of the next code point to be consumed. */
  get currentIndex(): CodePointIndex {
    return this.index;
  }

  // TODO: unicode-index ~> string-offset, make that into a separate function.
  /**
   * The `offset` recorded for the current code point, or `text.source.length`
   * when the cursor is past the last entry of `text.chars`.
   */
  currentOffset(): StringIndex {
    return this.text.chars[this.index]?.offset ?? this.text.source.length;
  }

  /** The current index/line/column as a `SourceLocation`. */
  currentLocation(): SourceLocation {
    return { index: this.index, line: this.line, column: this.column };
  }

  /** Builds a span running from `start` to the cursor's current location. */
  makeSpan(start: SourceLocation): Span {
    return {
      start,
      end: this.currentLocation(),
    };
  }

  /**
   * Consumes code points for as long as `pred` accepts them.
   *
   * @returns the number of code points consumed (0 if the first one is rejected or at EOF).
   */
  consumeWhile(pred: (c: CodePoint) => boolean): number {
    let count = 0;
    while (!this.eof()) {
      const c = this.peek();
      if (c === undefined || !pred(c)) break;
      this.next();
      count++;
    }
    return count;
  }

  /** Consumes the next code point if it equals `c`; returns whether it did. */
  match(c: CodePoint): boolean {
    if (this.peek() === c) {
      this.next();
      return true;
    }
    return false;
  }

  /**
   * Checks if the next characters match the string.
   * If yes, consumes them and returns true.
   * If no, touches nothing and returns false.
   */
  matchString(str: string): boolean {
    // The cursor is indexed by code point, so measure `str` in code points too.
    // `str.length` counts UTF-16 code units and over-counts any character
    // outside the BMP, which made such strings impossible to match.
    const codePointCount = Array.from(str).length;
    const end = this.index + codePointCount;

    // NOTE(review): assumes `text.length` is a code point count (it is compared
    // against a code point index in `eof()`) — confirm against SourceText.
    if (end > this.text.length) return false;
    if (this.text.sliceByCp(this.index, end) !== str) return false;

    for (let i = 0; i < codePointCount; i++) {
      // We must call next() to correctly update line/col tracking.
      // We already know it matches, so we just burn through.
      this.next();
    }
    return true;
  }
}

// === Basic Scanners/Predicates ===

/** True for space, tab, line feed and carriage return. */
export function isWhitespace(char: CodePoint): boolean {
  return char === SPACE || char === TAB || char === NEW_LINE || char === CARRIAGE_RETURN;
}

/** True for the ASCII decimal digits `0`–`9`. */
export function isDigit(char: CodePoint): boolean {
  return char >= DIGIT_0 && char <= DIGIT_9;
}

/** True for ASCII hexadecimal digits: `0`–`9`, `a`–`f`, `A`–`F`. */
function isHexDigit(c: CodePoint): boolean {
  return (c >= DIGIT_0 && c <= DIGIT_9)
    || (c >= LOWERCASE_a && c <= LOWERCASE_f)
    || (c >= UPPERCASE_A && c <= UPPERCASE_F);
}

export type GenericScanError =
  | { tag: "UnexpectedCharacter", char: CodePoint, span: Span }
  | { tag: "UnexpectedEOF", span: Span }

export type NumberError =
  | { tag: "ExpectedNumber", span: Span }
  | { tag: "InvalidNumber", reason: "NotFinite" | "MissingFractionalDigits", span: Span }

/**
 * Scans a decimal number at the cursor.
 *
 *     number := optional(`-`) digits optional(`.` digits)
 *
 * On failure the cursor is left wherever scanning stopped; callers that need
 * to backtrack should `save()` beforehand and `restore()` afterwards.
 *
 * @returns the parsed value and its span, or
 *   - `ExpectedNumber` when no integer digits follow the optional sign,
 *   - `InvalidNumber/MissingFractionalDigits` when a `.` is not followed by a digit
 *     (span starts at the `.`),
 *   - `InvalidNumber/NotFinite` when `Number(...)` of the scanned text is not finite.
 */
export function scanNumber(cursor: Cursor): Result<{ value: number, span: Span }, NumberError> {
  const startNumberLocation = cursor.currentLocation();

  // 1. Optional Sign
  cursor.match(char("-"));

  // 2. Integer Part
  const integerPartDigitCount = cursor.consumeWhile(isDigit);
  if (integerPartDigitCount === 0) {
    return Result.error({
      tag: "ExpectedNumber",
      span: cursor.makeSpan(startNumberLocation),
    });
  }

  // 3. Fractional Part
  if (cursor.peek() === DOT) {
    const dotLocation = cursor.currentLocation();
    cursor.next(); // consume '.'

    const fracPartDigitCount = cursor.consumeWhile(isDigit);
    if (fracPartDigitCount === 0) {
      return Result.error({
        tag: "InvalidNumber",
        reason: "MissingFractionalDigits",
        span: cursor.makeSpan(dotLocation),
      });
    }
  }

  const text = cursor.text.sliceByCp(startNumberLocation.index, cursor.currentIndex);
  const value = Number(text);

  if (!Number.isFinite(value)) {
    return Result.error({
      tag: "InvalidNumber",
      reason: "NotFinite",
      span: cursor.makeSpan(startNumberLocation),
    });
  }

  return Result.ok({
    value,
    span: cursor.makeSpan(startNumberLocation),
  });
}

export type StringError =
  | { tag: "InvalidEscape", reason: EscapeErrorReason, span: Span };

export type EscapeErrorReason =
  | { tag: "UnknownEscapeSequence", char: CodePoint } // e.g. \k
  | { tag: "UnicodeMissingBrace" }                    // \u without {
  | { tag: "UnicodeNoDigits" }                        // \u{}
  | { tag: "UnicodeUnclosed" }                        // \u{FF without }
  | { tag: "UnicodeOverflow", value: number };        // \u{110000}

/**
 * Scans a double-quoted string literal at the cursor and decodes its escapes.
 *
 * Supported escapes: `\n`, `\r`, `\t`, `\\`, `\0`, `\"` and `\u{HEX}` with a
 * code point no greater than 0x10FFFF.
 *
 * @returns the decoded string content (without the quotes) and the span of the
 *   whole literal, or
 *   - `UnexpectedEOF` when the input is empty or ends before the closing quote
 *     (including directly after a backslash),
 *   - `UnexpectedCharacter` when the first code point is not `"`,
 *   - `InvalidEscape` with a specific reason for a malformed escape sequence.
 */
export function scanString(cursor: Cursor): Result<{ value: string, span: Span }, StringError | GenericScanError> {
  const start = cursor.currentLocation();

  const firstChar = cursor.peek();
  if (firstChar === undefined) {
    return Result.error({ tag: "UnexpectedEOF", span: cursor.makeSpan(start) });
  }
  if (firstChar !== char('"')) {
    return Result.error({ tag: "UnexpectedCharacter", char: firstChar, span: cursor.makeSpan(start) });
  }
  cursor.next(); // consume opening quote

  let value = ""; // The actual string content

  while (true) {
    const c = cursor.peek();
    if (cursor.eof() || c === undefined) {
      return Result.error({ tag: "UnexpectedEOF", span: cursor.makeSpan(start) });
    }

    // 1. End of string
    if (c === char('"')) {
      cursor.next(); // consume closing quote
      break;
    }

    if (c === char('\\')) {
      // 2. Escape Sequences
      const escapeStart = cursor.currentLocation();
      cursor.next(); // consume backslash

      const escaped = cursor.peek();
      if (escaped === undefined) {
        // Input ended right after the backslash: the literal is unterminated,
        // there is no escape character to report as unknown.
        return Result.error({ tag: "UnexpectedEOF", span: cursor.makeSpan(start) });
      }

      switch (escaped) {
        case char('n'): value += '\n'; cursor.next(); break;
        case char('r'): value += '\r'; cursor.next(); break;
        case char('t'): value += '\t'; cursor.next(); break;
        case char('\\'): value += '\\'; cursor.next(); break;
        case char("0"): value += "\0"; cursor.next(); break;
        case char('"'): value += '"'; cursor.next(); break;

        // Unicode Escape: \u{XXXX}
        case char('u'): {
          cursor.next(); // consume 'u'

          // Expect '{'
          const braceStart = cursor.currentLocation();
          if (cursor.peek() !== char('{')) {
            return Result.error({
              tag: "InvalidEscape",
              reason: { tag: "UnicodeMissingBrace" },
              span: cursor.makeSpan(braceStart),
            });
          }
          cursor.next(); // consume '{'

          // Consume Hex Digits
          const hexStart = cursor.currentIndex;
          const hexCount = cursor.consumeWhile(isHexDigit);
          if (hexCount === 0) {
            return Result.error({
              tag: "InvalidEscape",
              reason: { tag: "UnicodeNoDigits" },
              span: cursor.makeSpan(braceStart),
            });
          }

          // Expect '}'
          if (cursor.peek() !== char("}")) {
            return Result.error({
              tag: "InvalidEscape",
              reason: { tag: "UnicodeUnclosed" },
              span: cursor.makeSpan(braceStart),
            });
          }
          cursor.next(); // consume '}'

          // Convert & Append
          const hexStr = cursor.text.sliceByCp(hexStart, hexStart + hexCount);
          const codePoint = parseInt(hexStr, 16);
          if (codePoint > 0x10FFFF) {
            return Result.error({
              tag: "InvalidEscape",
              reason: { tag: "UnicodeOverflow", value: codePoint },
              span: cursor.makeSpan(braceStart),
            });
          }
          value += String.fromCodePoint(codePoint);
          break;
        }

        default:
          return Result.error({
            tag: "InvalidEscape",
            reason: { tag: "UnknownEscapeSequence", char: escaped },
            span: cursor.makeSpan(escapeStart),
          });
      }
    } else {
      // 3. Regular character
      // Optimization: consume chunks of non-special chars for speed?
      // For now, char-by-char is fine.
      cursor.next();
      value += String.fromCodePoint(c);
    }
  }

  return Result.ok({ value, span: cursor.makeSpan(start) });
}

// TODO: rendering of errors
// NOTE(review): the sketch below uses names (`err.kind`, "Unknown", "ExpectedQuote")
// that do not match the types declared above (`reason`, "UnknownEscapeSequence");
// reconcile them when this is implemented.
// function renderStringError(err: StringError): string {
//   switch (err.tag) {
//     case "ExpectedQuote": return "Expected a string starting with \"";
//     case "UnexpectedEOF": return "Unterminated string literal";
//     case "InvalidEscape":
//       const k = err.kind;
//       switch (k.tag) {
//         case "Unknown":
//           return `Unknown escape sequence '\\${String.fromCodePoint(k.char)}'`;
//         case "UnicodeMissingBrace":
//           return "Unicode escape must start with '{', e.g. \\u{1F600}";
//         case "UnicodeNoDigits":
//           return "Empty unicode escape \\u{}";
//         case "UnicodeUnclosed":
//           return "Expected '}' to close unicode escape";
//         case "UnicodeOverflow":
//           return `Unicode code point 0x${k.value.toString(16)} is too large (max 0x10FFFF)`;
//       }
//   }
// }