333 lines
10 KiB
TypeScript
333 lines
10 KiB
TypeScript
import { char, NEW_LINE, CARRIAGE_RETURN, DOT, DIGIT_0, DIGIT_9, LOWERCASE_a, LOWERCASE_f, UPPERCASE_A, UPPERCASE_F, SPACE, TAB } from './source_text';
|
|
import type { SourceText, Span, SourceLocation, CodePoint, StringIndex, CodePointIndex } from './source_text';
|
|
import { Result } from '../result';
|
|
|
|
/**
 * Snapshot of a `Cursor`'s position, as returned by `Cursor.save()` and
 * accepted by `Cursor.restore()`.
 */
export type CursorState = {
  /** Index of the next unread entry in the source text. */
  index: CodePointIndex;
  /** Line number at `index`; a fresh cursor starts on line 1. */
  line: number;
  /** Column number at `index`; reset to 1 at every line break. */
  column: number;
  /** Whether the most recently consumed character was a carriage return. */
  lastCharWasCR: boolean;
};
|
|
|
|
/**
 * A reading position over a `SourceText` that tracks line and column while
 * it advances.
 *
 * Lines and columns start at 1. `\n`, `\r` and the pair `\r\n` each count as
 * exactly one line break. The position can be captured with `save()` and
 * rewound with `restore()`.
 */
export class Cursor {
  private index: CodePointIndex = 0;
  private line: number = 1;
  private column: number = 1;
  // Track previous char to handle \r\n correctly
  private lastCharWasCR: boolean = false;

  constructor(readonly text: SourceText) {}

  /** Captures the current position so it can later be passed to `restore()`. */
  save(): CursorState {
    return {
      index: this.index,
      line: this.line,
      column: this.column,
      lastCharWasCR: this.lastCharWasCR,
    };
  }

  /** Rewinds (or fast-forwards) the cursor to a state obtained from `save()`. */
  restore({ index, line, column, lastCharWasCR }: CursorState) {
    this.index = index;
    this.line = line;
    this.column = column;
    this.lastCharWasCR = lastCharWasCR;
  }

  /** Returns true once the index has reached `text.length`. */
  eof(): boolean {
    return this.index >= this.text.length;
  }

  /**
   * Returns the code point `n` entries ahead of the current position without
   * consuming anything, or `undefined` when that position holds no entry.
   */
  peek(n: number = 0): CodePoint | undefined {
    return this.text.chars[this.index + n]?.char;
  }

  /**
   * Consumes one code point and updates the line/column bookkeeping.
   *
   * @returns the consumed code point, or `undefined` (consuming nothing) when
   *          there is no entry at the current index.
   */
  next(): CodePoint | undefined {
    const ref = this.text.chars[this.index];
    if (!ref) return undefined;

    const c = ref.char;
    this.index++;

    if (c === NEW_LINE) {
      if (this.lastCharWasCR) {
        // This \n completes a \r\n pair; the line was already bumped on \r.
        this.lastCharWasCR = false;
      } else {
        this.line++;
        this.column = 1;
      }
    } else if (c === CARRIAGE_RETURN) {
      this.line++;
      this.column = 1;
      this.lastCharWasCR = true;
    } else {
      this.column++;
      this.lastCharWasCR = false;
    }

    return c;
  }

  /** The index of the next unread entry. */
  get currentIndex(): CodePointIndex {
    return this.index;
  }

  // TODO: unicode-index ~> string-offset, make that into a separate function.
  /**
   * Returns the `offset` recorded for the current entry, or
   * `text.source.length` when the cursor is past the last entry.
   */
  currentOffset(): StringIndex {
    return this.text.chars[this.index]?.offset ?? this.text.source.length;
  }

  /** Returns the current index, line and column as a `SourceLocation`. */
  currentLocation(): SourceLocation {
    return { index: this.index, line: this.line, column: this.column };
  }

  /** Builds a span running from `start` to the current location. */
  makeSpan(start: SourceLocation): Span {
    return {
      start,
      end: this.currentLocation(),
    };
  }

  /**
   * Consumes code points for as long as `pred` accepts the next one.
   *
   * @returns how many code points were consumed.
   */
  consumeWhile(pred: (c: CodePoint) => boolean): number {
    let count = 0;
    while (!this.eof()) {
      const c = this.peek();
      if (c === undefined || !pred(c)) break;
      this.next();
      count++;
    }
    return count;
  }

  /**
   * Consumes the next code point if it equals `c`.
   *
   * @returns true when a code point was consumed, false otherwise.
   */
  match(c: CodePoint): boolean {
    if (this.peek() !== c) return false;
    this.next();
    return true;
  }

  /**
   * Checks whether the upcoming characters spell out `str`.
   * If yes, consumes them and returns true.
   * If no, touches nothing and returns false.
   */
  matchString(str: string): boolean {
    // The cursor is indexed by code point, so measure `str` in code points.
    // `str.length` counts UTF-16 code units and over-counts any character
    // outside the Basic Multilingual Plane.
    const codePointCount = Array.from(str).length;

    if (this.index + codePointCount > this.text.length) return false;

    const slice = this.text.sliceByCp(this.index, this.index + codePointCount);
    if (slice !== str) return false;

    // Go through next() so line/column tracking stays correct; the content
    // is already known to match.
    for (let i = 0; i < codePointCount; i++) {
      this.next();
    }
    return true;
  }
}
|
|
|
|
// === Basic Scanners/Predicates ===
|
|
/**
 * Tells whether a code point is one of the four whitespace characters the
 * scanners recognise: space, tab, line feed or carriage return.
 */
export function isWhitespace(char: CodePoint): boolean {
  switch (char) {
    case SPACE:
    case TAB:
    case NEW_LINE:
    case CARRIAGE_RETURN:
      return true;
    default:
      return false;
  }
}
|
|
|
|
/** Tells whether a code point lies in the inclusive range `DIGIT_0`..`DIGIT_9`. */
export function isDigit(char: CodePoint): boolean {
  const notBelowZero = DIGIT_0 <= char;
  return notBelowZero && char <= DIGIT_9;
}
|
|
|
|
/** Errors that any scanner may report regardless of the token it is reading. */
export type GenericScanError =
  /** A character was found that cannot start or continue the token. */
  | { tag: "UnexpectedCharacter"; char: CodePoint; span: Span }
  /** The input ended before the token was complete. */
  | { tag: "UnexpectedEOF"; span: Span };
|
|
|
|
/** Errors reported by `scanNumber`. */
export type NumberError =
  /** No digit was found where the integer part should begin. */
  | { tag: "ExpectedNumber"; span: Span }
  /** Digits were found, but the literal as a whole is not acceptable. */
  | {
      tag: "InvalidNumber";
      reason: "NotFinite" | "MissingFractionalDigits";
      span: Span;
    };
|
|
|
|
/**
 * Scans a decimal number literal at the cursor.
 *
 * Grammar:
 *
 *     number := optional(`-`) digits optional(`.` digits)
 *
 * @param cursor positioned at the first character of the candidate literal.
 * @returns on success, the numeric value and the span of the whole literal.
 *   On failure:
 *   - `ExpectedNumber` when no digit follows the optional sign;
 *   - `InvalidNumber` / `MissingFractionalDigits` when a `.` is not followed
 *     by a digit (the span starts at the `.`);
 *   - `InvalidNumber` / `NotFinite` when `Number(...)` of the literal text is
 *     not finite.
 *
 * The cursor is not rewound on failure; it stays wherever scanning stopped.
 */
export function scanNumber(cursor: Cursor): Result<{ value: number, span: Span }, NumberError> {
  const startNumberLocation = cursor.currentLocation();

  // 1. Optional sign
  cursor.match(char("-"));

  // 2. Integer part (at least one digit is required)
  if (cursor.consumeWhile(isDigit) === 0) {
    return Result.error({
      tag: "ExpectedNumber",
      span: cursor.makeSpan(startNumberLocation),
    });
  }

  // 3. Fractional part: a '.' commits us to at least one more digit
  if (cursor.peek() === DOT) {
    const dotLocation = cursor.currentLocation();
    cursor.next(); // consume '.'

    if (cursor.consumeWhile(isDigit) === 0) {
      return Result.error({
        tag: "InvalidNumber",
        reason: "MissingFractionalDigits",
        span: cursor.makeSpan(dotLocation),
      });
    }
  }

  // 4. Convert the exact text that was consumed
  const span = cursor.makeSpan(startNumberLocation);
  const text = cursor.text.sliceByCp(startNumberLocation.index, cursor.currentIndex);
  const value = Number(text);

  if (!Number.isFinite(value)) {
    return Result.error({ tag: "InvalidNumber", reason: "NotFinite", span });
  }
  return Result.ok({ value, span });
}
|
|
|
|
/** Errors specific to string literals, reported by `scanString`. */
export type StringError = {
  tag: "InvalidEscape";
  /** What exactly is wrong with the escape sequence. */
  reason: EscapeErrorReason;
  span: Span;
};
|
|
|
|
/** The ways an escape sequence inside a string literal can be malformed. */
export type EscapeErrorReason =
  /** The character after the backslash is not a known escape, e.g. `\k`. */
  | { tag: "UnknownEscapeSequence"; char: CodePoint }
  /** `\u` that is not followed by `{`. */
  | { tag: "UnicodeMissingBrace" }
  /** `\u{}` with no hex digits between the braces. */
  | { tag: "UnicodeNoDigits" }
  /** `\u{FF` with no closing `}`. */
  | { tag: "UnicodeUnclosed" }
  /** The value exceeds 0x10FFFF, e.g. `\u{110000}`. */
  | { tag: "UnicodeOverflow"; value: number };
|
|
|
|
/** Tells whether a code point is an ASCII hex digit (`0-9`, `a-f`, `A-F`). */
function isHexDigit(c: CodePoint): boolean {
  return (
    (c >= DIGIT_0 && c <= DIGIT_9) ||
    (c >= LOWERCASE_a && c <= LOWERCASE_f) ||
    (c >= UPPERCASE_A && c <= UPPERCASE_F)
  );
}

/**
 * Scans a double-quoted string literal at the cursor and decodes its escapes.
 *
 * Recognised escapes: `\n`, `\r`, `\t`, `\\`, `\0`, `\"` and `\u{HEX}`.
 * Every other character up to the closing quote is copied verbatim.
 *
 * @param cursor positioned at the opening `"`.
 * @returns on success, the decoded content (without the quotes) and the span
 *   from the opening quote through the closing quote. On failure:
 *   - `UnexpectedEOF` when the input ends before the opening quote, before the
 *     closing quote, or directly after a backslash;
 *   - `UnexpectedCharacter` when the first character is not `"`;
 *   - `InvalidEscape` with a reason describing the malformed escape. For an
 *     unknown escape the span starts at the backslash; for the `\u` errors it
 *     starts right after the `u`.
 *
 * The cursor is not rewound on failure; it stays wherever scanning stopped.
 */
export function scanString(cursor: Cursor): Result<{ value: string, span: Span }, StringError | GenericScanError> {
  const start = cursor.currentLocation();

  const firstChar = cursor.peek();
  if (firstChar === undefined) {
    return Result.error({ tag: "UnexpectedEOF", span: cursor.makeSpan(start) });
  }
  if (firstChar !== char('"')) {
    return Result.error({ tag: "UnexpectedCharacter", char: firstChar, span: cursor.makeSpan(start) });
  }
  cursor.next(); // consume opening quote

  let value = ""; // The decoded string content

  while (true) {
    const c = cursor.peek();
    if (cursor.eof() || c === undefined) {
      // Ran out of input before the closing quote.
      return Result.error({ tag: "UnexpectedEOF", span: cursor.makeSpan(start) });
    }

    // 1. End of string
    if (c === char('"')) {
      cursor.next(); // consume closing quote
      break;
    }

    // 2. Regular character
    if (c !== char('\\')) {
      cursor.next();
      value += String.fromCodePoint(c);
      continue;
    }

    // 3. Escape sequence
    const escapeStart = cursor.currentLocation();
    cursor.next(); // consume backslash

    const escaped = cursor.peek();
    if (escaped === undefined) {
      // A backslash was the last character of the input: there is no escape
      // character to complain about, the literal is simply unterminated.
      return Result.error({ tag: "UnexpectedEOF", span: cursor.makeSpan(start) });
    }

    switch (escaped) {
      case char('n'): value += '\n'; cursor.next(); break;
      case char('r'): value += '\r'; cursor.next(); break;
      case char('t'): value += '\t'; cursor.next(); break;
      case char('\\'): value += '\\'; cursor.next(); break;
      case char("0"): value += "\0"; cursor.next(); break;
      case char('"'): value += '"'; cursor.next(); break;

      // Unicode escape: \u{XXXX}
      case char('u'): {
        cursor.next(); // consume 'u'

        // Expect '{'
        const braceStart = cursor.currentLocation();
        if (cursor.peek() !== char('{')) {
          return Result.error({ tag: "InvalidEscape", reason: { tag: "UnicodeMissingBrace" }, span: cursor.makeSpan(braceStart) });
        }
        cursor.next(); // consume '{'

        // Consume hex digits
        const hexStart = cursor.currentIndex;
        const hexCount = cursor.consumeWhile(isHexDigit);
        if (hexCount === 0) {
          return Result.error({ tag: "InvalidEscape", reason: { tag: "UnicodeNoDigits" }, span: cursor.makeSpan(braceStart) });
        }

        // Expect '}'
        if (cursor.peek() !== char("}")) {
          return Result.error({ tag: "InvalidEscape", reason: { tag: "UnicodeUnclosed" }, span: cursor.makeSpan(braceStart) });
        }
        cursor.next(); // consume '}'

        // Convert & append
        const hexStr = cursor.text.sliceByCp(hexStart, hexStart + hexCount);
        const codePoint = parseInt(hexStr, 16);

        // String.fromCodePoint throws above this limit, so reject it here.
        // NOTE: values in the surrogate range are not rejected.
        if (codePoint > 0x10FFFF) {
          return Result.error({ tag: "InvalidEscape", reason: { tag: "UnicodeOverflow", value: codePoint }, span: cursor.makeSpan(braceStart) });
        }

        value += String.fromCodePoint(codePoint);
        break;
      }

      default:
        // The offending character is left unconsumed.
        return Result.error({
          tag: "InvalidEscape",
          reason: { tag: "UnknownEscapeSequence", char: escaped },
          span: cursor.makeSpan(escapeStart),
        });
    }
  }

  return Result.ok({
    value,
    span: cursor.makeSpan(start),
  });
}
|
|
|
|
// TODO: rendering of errors
// function renderStringError(err: StringError | GenericScanError): string {
//   switch (err.tag) {
//     case "UnexpectedCharacter": return "Expected a string starting with \"";
//     case "UnexpectedEOF": return "Unterminated string literal";
//     case "InvalidEscape":
//       const k = err.reason;
//       switch (k.tag) {
//         case "UnknownEscapeSequence":
//           return `Unknown escape sequence '\\${String.fromCodePoint(k.char)}'`;
//         case "UnicodeMissingBrace":
//           return "Unicode escape must start with '{', e.g. \\u{1F600}";
//         case "UnicodeNoDigits":
//           return "Empty unicode escape \\u{}";
//         case "UnicodeUnclosed":
//           return "Expected '}' to close unicode escape";
//         case "UnicodeOverflow":
//           return `Unicode code point 0x${k.value.toString(16)} is too large (max 0x10FFFF)`;
//       }
//   }
// }
|