Improve and abstract Cursor design. Start scanner
This commit is contained in:
parent
d382b16e6d
commit
d5f9777711
8 changed files with 713 additions and 476 deletions
|
|
@ -1,318 +1,85 @@
|
|||
|
||||
import { char, isWhitespace, isDigit } from './source_text';
|
||||
import { CARRIAGE_RETURN, char, NEW_LINE, SPACE, TAB } from './source_text';
|
||||
import type { SourceText, Span, SourceLocation, CodePoint, StringIndex, CodePointIndex } from './source_text';
|
||||
|
||||
function isSymbolChar(c: CodePoint): boolean {
|
||||
return (
|
||||
c === char("#") ||
|
||||
c === char("$") ||
|
||||
c === char("@") ||
|
||||
c === char("(") ||
|
||||
c === char(")") ||
|
||||
c === char("{") ||
|
||||
c === char("}") ||
|
||||
c === char(",") ||
|
||||
c === char(".")
|
||||
);
|
||||
import { isDigit, isWhitespace, scanNumber, scanString } from './cursor';
|
||||
import type { Cursor, CursorState, GenericScanError, NumberError, StringError } from './cursor';
|
||||
import { Result } from '../result';
|
||||
|
||||
// === Language Specific Stuff ===
|
||||
|
||||
const DELIMITER_CHARS = ["(", ")", "{", "}", ".", ",", "@", "$", "#", '"', "\\"] as const;
|
||||
export type Delimiter = typeof DELIMITER_CHARS[number];
|
||||
const DELIMITER_SET: Set<CodePoint> = new Set(DELIMITER_CHARS.map(c => char(c)));
|
||||
|
||||
export type Keyword = "let" | "fn" | "match" | "apply" | "=" | "|" | "!";
|
||||
|
||||
// Returns the raw string. Does NOT create a token (the caller decides the token type).
|
||||
/**
 * Consumes a raw identifier from `cursor` and returns its text.
 *
 * Code points are consumed for as long as they are neither a member of
 * `DELIMITER_SET` nor whitespace. No token is built here: the caller decides
 * how the returned text should be classified.
 */
function scanRawIdentifier(cursor: Cursor): string {
  const begin = cursor.currentIndex;
  // TODO: Comments should also end an identifier, but a comment opener is two
  // code points long, so this one-code-point predicate cannot detect it yet.
  cursor.consumeWhile(cp => !DELIMITER_SET.has(cp) && !isWhitespace(cp));
  const end = cursor.currentIndex;
  return cursor.text.sliceByCp(begin, end);
}
|
||||
|
||||
/**
 * Tells whether a code point may appear inside an identifier:
 * it must be neither whitespace nor one of the symbol characters.
 */
function isIdentifierChar(char: CodePoint): boolean {
  // Whitespace is checked first; symbol characters are only tested afterwards.
  if (isWhitespace(char)) {
    return false;
  }
  return !isSymbolChar(char);
}
|
||||
export type ExprScanError =
|
||||
| { tag: "UnexpectedCharacter", char: CodePoint, span: Span }
|
||||
| { tag: "InvalidIdentifier", text: string, reason: string, span: Span }
|
||||
| NumberError
|
||||
| StringError;
|
||||
|
||||
export type Keyword = "let" | "fn" | "match" | "apply" | "=" | "|";
|
||||
export type Symbol = "#" | "$" | "@" | "(" | ")" | "{" | "}" | "," | ".";
|
||||
|
||||
// === Scanner ===
|
||||
export type Token =
|
||||
export type ExprStartToken =
|
||||
| { tag: "number", value: number, span: Span }
|
||||
| { tag: "string", text: string, span: Span }
|
||||
| { tag: "identifier", text: string, span: Span }
|
||||
| { tag: "function_name", name: string, span: Span }
|
||||
| { tag: "variable_use", name: string, span: Span }
|
||||
| { tag: "tag", name: string, span: Span }
|
||||
| { tag: "tagged", name: string, span: Span } // TODO: This may be a bit weird. Actually we can lookahead and see if the next char after the identifier is NOT-AN-EXPRESSION start.
|
||||
| { tag: "keyword", kw: Keyword, span: Span }
|
||||
| { tag: "symbol", sym: Symbol, span: Span }
|
||||
| { tag: "EOF", span: Span }
|
||||
|
||||
export namespace TokenKind {
|
||||
export type T =
|
||||
| { tag: "number" }
|
||||
| { tag: "string" }
|
||||
| { tag: "identifier" }
|
||||
| { tag: "symbol", value: Symbol }
|
||||
| { tag: "keyword", value: Keyword }
|
||||
| { tag: "EOF" }
|
||||
}
|
||||
// TODO: Move this back to `cursor.ts`
|
||||
|
||||
export type LexError =
|
||||
| { tag: "UnexpectedCharacter", char: CodePoint, span: Span }
|
||||
| { tag: "UnexpectedEOF", span: Span }
|
||||
| { tag: "ExpectedNumber", span: Span }
|
||||
| { tag: "InvalidNumber", text: string, reason: string, span: Span }
|
||||
| { tag: "InvalidEscape", reason: string, span: Span };
|
||||
function skipWhitespaceAndComments(cursor: Cursor): number {
|
||||
let totalConsumed = 0;
|
||||
|
||||
export class Scanner {
|
||||
private i: CodePointIndex = 0;
|
||||
private line = 1;
|
||||
private column = 1;
|
||||
while (true) {
|
||||
// 1. Consume standard whitespace (spaces, tabs, newlines)
|
||||
const wsCount = cursor.consumeWhile(isWhitespace);
|
||||
totalConsumed += wsCount;
|
||||
|
||||
// Track previous char to handle \r\n correctly
|
||||
private lastCharWasCR = false;
|
||||
// 2. Check for Line Comment start ('//')
|
||||
const c = cursor.peek();
|
||||
const nextC = cursor.peek(1);
|
||||
|
||||
constructor(private readonly text: SourceText) {}
|
||||
if (c === char('/') && nextC === char('/')) {
|
||||
// Found comment start. Consume the '//' markers
|
||||
cursor.next();
|
||||
cursor.next();
|
||||
totalConsumed += 2;
|
||||
|
||||
/** Reports whether the code-point index `i` has reached or passed `text.length`. */
eof(): boolean {
  const position = this.i;
  return position >= this.text.length;
}
|
||||
|
||||
/**
 * Looks at the code point `n` entries ahead of the current index without consuming it.
 *
 * @param n lookahead distance; 0 (the default) is the current entry.
 * @returns the entry's `char`, or `undefined` when `text.chars` has no entry there.
 */
private peek(n: number = 0): CodePoint | undefined {
  const entry = this.text.chars[this.i + n];
  return entry?.char;
}
|
||||
|
||||
private next(): CodePoint | undefined {
|
||||
const ref = this.text.chars[this.i];
|
||||
if (!ref) return undefined;
|
||||
|
||||
const c = ref.char;
|
||||
this.i++;
|
||||
|
||||
if (c === 0x0A /* \n */) {
|
||||
if (!this.lastCharWasCR) {
|
||||
this.line++;
|
||||
this.column = 1;
|
||||
} else {
|
||||
// We just saw \r, so this \n is part of \r\n.
|
||||
// We already bumped the line count on \r.
|
||||
// Just reset the flag.
|
||||
this.lastCharWasCR = false;
|
||||
}
|
||||
} else if (c === 0x0D /* \r */) {
|
||||
this.line++;
|
||||
this.column = 1;
|
||||
this.lastCharWasCR = true;
|
||||
// Consume everything until the next newline (or EOF).
|
||||
// Note: We do NOT consume the newline itself here.
|
||||
// We let the NEXT iteration of the 'while(true)' loop catch
|
||||
// the newline as standard whitespace.
|
||||
const commentContentLength = cursor.consumeWhile(c => c !== NEW_LINE && c !== CARRIAGE_RETURN);
|
||||
totalConsumed += commentContentLength;
|
||||
} else {
|
||||
this.column++;
|
||||
this.lastCharWasCR = false;
|
||||
}
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Returns the `offset` recorded for the entry at the current index,
 * falling back to `text.source.length` when there is no such entry.
 */
private currentOffset(): StringIndex {
  const entry = this.text.chars[this.i];
  const offset = entry?.offset;
  return offset ?? this.text.source.length;
}
|
||||
|
||||
/** Snapshots the scanner position as `{ index, line, column }`. */
private currentLocation(): SourceLocation {
  const { i: index, line, column } = this;
  return { index, line, column };
}
|
||||
|
||||
/**
 * Builds the span that runs from `start` up to the current scanner position.
 *
 * @param start location captured (via `currentLocation`) where the span begins.
 * @returns a span whose `start`/`end` are offsets and whose `line`/`column`
 *          are copied from `start`. A start index with no entry in
 *          `text.chars` maps to `text.source.length`.
 */
private makeSpan(start: SourceLocation): Span {
  const { index, line, column } = start;
  const firstEntry = this.text.chars[index];
  return {
    start: firstEntry?.offset ?? this.text.source.length,
    end: this.currentOffset(),
    line,
    column,
  };
}
|
||||
|
||||
/**
 * Consumes code points for as long as `pred` accepts the next one.
 *
 * @param pred test applied to each upcoming code point before it is consumed.
 * @returns how many code points were consumed.
 */
private consumeWhile(pred: (c: CodePoint) => boolean): number {
  let consumed = 0;
  for (;;) {
    if (this.eof()) {
      return consumed;
    }
    const head = this.peek();
    // Stop at the first code point that is missing or rejected by the predicate.
    if (head === undefined || !pred(head)) {
      return consumed;
    }
    this.next();
    consumed += 1;
  }
}
|
||||
|
||||
/**
 * Consumes the next code point if `pred` accepts it.
 *
 * @param pred test the next code point must pass.
 * @param error value thrown as-is when there is no next code point or `pred` rejects it.
 * @returns the consumed code point.
 * @throws the supplied `error`.
 */
private expect(
  pred: (c: CodePoint) => boolean,
  error: LexError
): CodePoint {
  const head = this.peek();
  if (head !== undefined && pred(head)) {
    this.next();
    return head;
  }
  throw error;
}
|
||||
|
||||
// Helper to check for exact char matches quickly
|
||||
/**
 * Consumes the next code point only when it equals `c`.
 *
 * @returns `true` if a code point was consumed, `false` otherwise.
 */
private match(c: CodePoint): boolean {
  const hit = this.peek() === c;
  if (hit) {
    this.next();
  }
  return hit;
}
|
||||
|
||||
/** Advances past any run of whitespace code points; the consumed count is discarded. */
private skipWhitespace() {
  this.consumeWhile(cp => isWhitespace(cp));
}
|
||||
|
||||
// === Main Scanners ===
|
||||
|
||||
/**
 * Scans a numeric literal that starts at the current position.
 *
 * Grammar:
 *   number := optional(`-`) digits optional(`.` digits)
 *
 * @returns a `number` token whose span runs from the first consumed code
 *          point to the position after the last one.
 * @throws LexError `ExpectedNumber` when no digit follows the optional sign.
 * @throws LexError `InvalidNumber` (reason `MissingFractionalDigits`) when a
 *         `.` is not followed by at least one digit.
 * @throws LexError `InvalidNumber` (reason `NotFinite`) when the literal
 *         converts to a non-finite value.
 */
private scanNumber(): Token {
  const startNumberLocation = this.currentLocation();

  // 1. Optional sign
  if (this.peek() === char("-")) {
    this.next();
  }

  // 2. Integer part: at least one digit is required.
  if (this.consumeWhile(isDigit) === 0) {
    // Errors are built as declared `LexError` values (not `<LexError>` assertions)
    // so the compiler checks that every required field is present.
    const error: LexError = {
      tag: "ExpectedNumber",
      span: this.makeSpan(startNumberLocation),
    };
    throw error;
  }

  // 3. Optional fractional part
  if (this.peek() === char(".")) {
    const dotLocation = this.currentLocation();

    this.next(); // consume '.'

    if (this.consumeWhile(isDigit) === 0) {
      const error: LexError = {
        tag: "InvalidNumber",
        // Everything consumed so far, e.g. `12.`
        text: this.text.sliceByCp(startNumberLocation.index, this.i),
        reason: "MissingFractionalDigits",
        span: this.makeSpan(dotLocation),
      };
      throw error;
    }
  }

  const text = this.text.sliceByCp(startNumberLocation.index, this.i);
  const value = Number(text);

  // A long enough digit run converts to Infinity; reject it instead of
  // producing a token with a non-finite value.
  if (!Number.isFinite(value)) {
    const error: LexError = {
      tag: "InvalidNumber",
      text,
      reason: "NotFinite",
      span: this.makeSpan(startNumberLocation),
    };
    throw error;
  }

  return {
    tag: "number",
    value,
    span: this.makeSpan(startNumberLocation),
  };
}
|
||||
|
||||
private scanString(): Token {
|
||||
const start = this.currentLocation();
|
||||
// We assume the caller checked the opening quote '"'
|
||||
this.expect(c => c === char('"'), <LexError>{ tag: "UnexpectedCharacter", span: this.makeSpan(start) });
|
||||
|
||||
let value = ""; // The actual string content
|
||||
|
||||
while (true) {
|
||||
if (this.eof()) {
|
||||
throw <LexError>{ tag: "UnexpectedEOF", span: this.makeSpan(start) };
|
||||
}
|
||||
|
||||
const c = this.peek();
|
||||
|
||||
// 1. End of string
|
||||
if (c === char('"')) {
|
||||
this.next(); // consume closing quote
|
||||
// We are not at a comment.
|
||||
// If we also didn't consume any whitespace in step 1, we are truly done.
|
||||
if (wsCount === 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (c === char('\\')) {
|
||||
// 2. Escape Sequences
|
||||
const escapeStart = this.currentLocation();
|
||||
this.next(); // consume backslash
|
||||
const escaped = this.peek();
|
||||
|
||||
switch (escaped) {
|
||||
case char('n'): value += '\n'; this.next(); break;
|
||||
case char('r'): value += '\r'; this.next(); break;
|
||||
case char('t'): value += '\t'; this.next(); break;
|
||||
case char('\\'): value += '\\'; this.next(); break;
|
||||
case char("0"): value += "\0"; break;
|
||||
case char('"'): value += '"'; this.next(); break;
|
||||
// Unicode Escape: \u{XXXX}
|
||||
case char('u'): {
|
||||
this.next(); // consume 'u'
|
||||
|
||||
// Expect '{'
|
||||
const braceStart = this.currentLocation();
|
||||
if (this.peek() !== char('{')) {
|
||||
throw <LexError>{ tag: "InvalidEscape", reason: "Expected '{' after \\u", span: this.makeSpan(braceStart) };
|
||||
}
|
||||
this.next(); // consume '{'
|
||||
|
||||
// Consume Hex Digits
|
||||
const hexStart = this.i;
|
||||
const hexCount = this.consumeWhile(c =>
|
||||
(c >= char('0') && c <= char('9')) ||
|
||||
(c >= char('a') && c <= char('f')) ||
|
||||
(c >= char('A') && c <= char('F'))
|
||||
);
|
||||
|
||||
if (hexCount === 0) {
|
||||
throw <LexError>{ tag: "InvalidEscape", reason: "Expected hex digits in \\u{...}", span: this.makeSpan(braceStart) };
|
||||
}
|
||||
|
||||
// Expect '}'
|
||||
if (this.peek() !== char("}")) {
|
||||
throw <LexError>{ tag: "InvalidEscape", reason: "Expected '}' closing unicode escape", span: this.makeSpan(braceStart) };
|
||||
}
|
||||
this.next(); // consume '}'
|
||||
|
||||
// Convert & Append
|
||||
const hexStr = this.text.sliceByCp(hexStart, hexStart + hexCount);
|
||||
const codePoint = parseInt(hexStr, 16);
|
||||
|
||||
if (codePoint > 0x10FFFF) {
|
||||
throw <LexError>{ tag: "InvalidEscape", reason: "Invalid Unicode Code Point (max 0x10FFFF)", span: this.makeSpan(braceStart) };
|
||||
}
|
||||
|
||||
value += String.fromCodePoint(codePoint);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
throw <LexError>{
|
||||
tag: "InvalidEscape",
|
||||
reason: `UnknownEscapeSequence`,
|
||||
span: this.makeSpan(escapeStart)
|
||||
};
|
||||
}
|
||||
} else {
|
||||
// 3. Regular character
|
||||
// Optimization: consume chunks of non-special chars for speed?
|
||||
// For now, char-by-char is fine.
|
||||
this.next();
|
||||
// Note: We use ! because we checked EOF at loop start
|
||||
value += String.fromCodePoint(c!);
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
tag: "string",
|
||||
text: value,
|
||||
span: this.makeSpan(start)
|
||||
};
|
||||
}
|
||||
|
||||
return totalConsumed;
|
||||
}
|
||||
|
||||
/**
 * Placeholder for scanning the token that starts an expression.
 *
 * Not implemented yet: `cursor` is ignored and the value returned is the
 * number 0 cast through `any`, not a real `Result`.
 */
export function scanExprStart(cursor: Cursor): Result<ExprStartToken, ExprScanError> {
  // TODO: implement; the cast only satisfies the declared return type.
  const placeholder: any = 0;
  return placeholder;
}
|
||||
|
||||
// TODO: Need a Token to TokenKind function
|
||||
// TODO: Need is_start_of_expression(token): boolean
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue