Sketch basic scanner

This commit is contained in:
Yura Dupyn 2026-02-05 13:21:44 +01:00
parent 99cd517a58
commit eb6ade5a3d
7 changed files with 836 additions and 6 deletions

368
src/parser/scanner.ts Normal file
View file

@ -0,0 +1,368 @@
import { char, isWhitespace, isDigit } from './source_text';
import type { SourceText, Span, SourceLocation, CodePoint, StringIndex, CodePointIndex } from './source_text';
/**
 * Reports whether `c` is one of the single-character punctuation symbols:
 * `#`, `$`, `@`, `(`, `)`, `{`, `}`, `,` or `.`.
 */
function isSymbolChar(c: CodePoint): boolean {
  switch (c) {
    case char("#"):
    case char("$"):
    case char("@"):
    case char("("):
    case char(")"):
    case char("{"):
    case char("}"):
    case char(","):
    case char("."):
      return true;
    default:
      return false;
  }
}
/**
 * Reports whether a code point may appear inside an identifier: anything that
 * is neither whitespace nor one of the symbol characters.
 */
function isIdentifierChar(char: CodePoint): boolean {
  // Whitespace always terminates an identifier.
  if (isWhitespace(char)) {
    return false;
  }
  return !isSymbolChar(char);
}
// Spellings carried by `keyword` tokens (see `Token`).
// Note that `=` and `|` are classified as keywords here rather than as symbols.
export type Keyword = "let" | "fn" | "match" | "apply" | "=" | "|";
// Single-character punctuation carried by `symbol` tokens; these are the same
// nine characters that `isSymbolChar` accepts.
// NOTE: inside this module the name shadows the global `Symbol` type.
export type Symbol = "#" | "$" | "@" | "(" | ")" | "{" | "}" | "," | ".";
// === Scanner ===
// A lexical token, discriminated by `tag`. Every variant records the `span`
// of source text it covers.
//   number     - `value` is the numeric value of the literal
//   string     - `text` is the string content (escape sequences already decoded
//                by `Scanner.scanString`)
//   identifier - `text` is the identifier text
//   keyword    - `kw` is one of the `Keyword` spellings
//   symbol     - `sym` is one of the `Symbol` characters
//   EOF        - end-of-input marker; carries no payload
export type Token =
| { tag: "number", value: number, span: Span }
| { tag: "string", text: string, span: Span }
| { tag: "identifier", text: string, span: Span }
| { tag: "keyword", kw: Keyword, span: Span }
| { tag: "symbol", sym: Symbol, span: Span }
| { tag: "EOF", span: Span }
// Describes a *kind* of token without a span or literal payload.
// `TokenKind.T` mirrors the tags of `Token`; only the `symbol` and `keyword`
// kinds carry a `value` saying which symbol / keyword is meant.
export namespace TokenKind {
export type T =
| { tag: "number" }
| { tag: "string" }
| { tag: "identifier" }
| { tag: "symbol", value: Symbol }
| { tag: "keyword", value: Keyword }
| { tag: "EOF" }
}
// Errors thrown (as plain objects) by `Scanner`, discriminated by `tag`.
// Every variant carries the `span` the error refers to.
//   UnexpectedCharacter - `char` is the offending code point
//   UnexpectedEOF       - input ended early (e.g. an unterminated string literal)
//   ExpectedNumber      - a number was required but no digits were found
//   InvalidNumber       - `text` is the offending literal text; `reason` is a short
//                         code such as "MissingFractionalDigits" or "NotFinite"
//   InvalidEscape       - bad escape sequence inside a string literal; `reason`
//                         describes what was wrong
export type LexError =
| { tag: "UnexpectedCharacter", char: CodePoint, span: Span }
| { tag: "UnexpectedEOF", span: Span }
| { tag: "ExpectedNumber", span: Span }
| { tag: "InvalidNumber", text: string, reason: string, span: Span }
| { tag: "InvalidEscape", reason: string, span: Span };
/**
 * Hand-written scanner over a `SourceText`.
 *
 * The scanner keeps a cursor (`i`) into `text.chars` plus a 1-based
 * line/column position, and builds the token scanners (`scanNumber`,
 * `scanString`) on top of a few cursor primitives (`peek`, `next`,
 * `consumeWhile`, ...).
 *
 * Failures are reported by throwing a plain `LexError` object (not an `Error`
 * instance). The cursor is NOT rewound when a scan fails.
 */
export class Scanner {
  /** Index into `text.chars` of the next unread code point. */
  private i: CodePointIndex = 0;
  /** Line of the next unread code point; starts at 1. */
  private line = 1;
  /** Column of the next unread code point; starts at 1. */
  private column = 1;
  // Track previous char to handle \r\n correctly
  private lastCharWasCR = false;

  constructor(private readonly text: SourceText) {}

  /** True once the cursor has reached `text.length`. */
  eof(): boolean {
    return this.i >= this.text.length;
  }

  /**
   * Identity function whose only purpose is to type-check an error literal
   * against `LexError` (missing or misspelled fields become compile errors,
   * which a type assertion would silently hide).
   */
  private static lexError(error: LexError): LexError {
    return error;
  }

  /** ASCII hex digit test (`0-9`, `a-f`, `A-F`) used by `\u{...}` escapes. */
  private static isHexDigit(c: CodePoint): boolean {
    return (
      (c >= char("0") && c <= char("9")) ||
      (c >= char("a") && c <= char("f")) ||
      (c >= char("A") && c <= char("F"))
    );
  }

  /** Code point `n` positions ahead of the cursor, or `undefined` past the end. */
  private peek(n: number = 0): CodePoint | undefined {
    return this.text.chars[this.i + n]?.char;
  }

  /**
   * Consumes and returns the code point under the cursor, updating
   * line/column. `\n`, `\r` and `\r\n` each count as exactly one line break.
   * Returns `undefined` (and does not move) when there is nothing to consume.
   */
  private next(): CodePoint | undefined {
    const ref = this.text.chars[this.i];
    if (!ref) return undefined;
    const c = ref.char;
    this.i++;
    if (c === 0x0a /* \n */) {
      if (!this.lastCharWasCR) {
        this.line++;
        this.column = 1;
      } else {
        // This \n completes a \r\n pair; the line was already bumped on \r.
        this.lastCharWasCR = false;
      }
    } else if (c === 0x0d /* \r */) {
      this.line++;
      this.column = 1;
      this.lastCharWasCR = true;
    } else {
      this.column++;
      this.lastCharWasCR = false;
    }
    return c;
  }

  /** String offset of the cursor; `text.source.length` when past the last code point. */
  private currentOffset(): StringIndex {
    return this.text.chars[this.i]?.offset ?? this.text.source.length;
  }

  /** Snapshot of the cursor (code point index, line, column). */
  private currentLocation(): SourceLocation {
    return { index: this.i, line: this.line, column: this.column };
  }

  /**
   * Builds the span running from `start` up to (not including) the cursor.
   * `start`/`end` are string offsets; `line`/`column` are those of `start`.
   */
  private makeSpan(start: SourceLocation): Span {
    const startOffset =
      this.text.chars[start.index]?.offset ?? this.text.source.length;
    return {
      start: startOffset,
      end: this.currentOffset(),
      line: start.line,
      column: start.column,
    };
  }

  /** Consumes code points while `pred` holds; returns how many were consumed. */
  private consumeWhile(pred: (c: CodePoint) => boolean): number {
    let count = 0;
    while (!this.eof()) {
      const c = this.peek();
      if (c === undefined || !pred(c)) break;
      this.next();
      count++;
    }
    return count;
  }

  /**
   * Consumes and returns the next code point if it satisfies `pred`;
   * otherwise throws `error`. Note the caller builds `error` eagerly.
   */
  private expect(
    pred: (c: CodePoint) => boolean,
    error: LexError
  ): CodePoint {
    const c = this.peek();
    if (c === undefined || !pred(c)) {
      throw error;
    }
    this.next();
    return c;
  }

  // Helper to check for exact char matches quickly
  private match(c: CodePoint): boolean {
    if (this.peek() === c) {
      this.next();
      return true;
    }
    return false;
  }

  private skipWhitespace() {
    this.consumeWhile(isWhitespace);
  }

  // === Main Scanners ===

  /**
   * Scans a number literal:
   *
   *     number := optional(`-`) digits optional(`.` digits)
   *
   * @returns a `number` token spanning the literal.
   * @throws `ExpectedNumber` when no integer digits are present.
   * @throws `InvalidNumber` ("MissingFractionalDigits") when a `.` is not
   *         followed by digits, or ("NotFinite") when the value overflows.
   */
  private scanNumber(): Token {
    const start = this.currentLocation();

    // 1. Optional sign
    if (this.peek() === char("-")) {
      this.next();
    }

    // 2. Integer part
    if (this.consumeWhile(isDigit) === 0) {
      throw Scanner.lexError({
        tag: "ExpectedNumber",
        span: this.makeSpan(start),
      });
    }

    // 3. Fractional part
    if (this.peek() === char(".")) {
      const dotLocation = this.currentLocation();
      this.next(); // consume '.'
      if (this.consumeWhile(isDigit) === 0) {
        throw Scanner.lexError({
          tag: "InvalidNumber",
          text: this.text.sliceByCp(start.index, this.i),
          reason: "MissingFractionalDigits",
          span: this.makeSpan(dotLocation),
        });
      }
    }

    const text = this.text.sliceByCp(start.index, this.i);
    const value = Number(text);
    if (!Number.isFinite(value)) {
      throw Scanner.lexError({
        tag: "InvalidNumber",
        text,
        reason: "NotFinite",
        span: this.makeSpan(start),
      });
    }
    return { tag: "number", value, span: this.makeSpan(start) };
  }

  /**
   * Scans a double-quoted string literal starting at the opening quote and
   * decodes its escape sequences.
   *
   * @returns a `string` token whose `text` is the decoded content.
   * @throws `UnexpectedCharacter` when the cursor is not on a `"`.
   * @throws `UnexpectedEOF` when input ends before the closing quote
   *         (including when there is no input left for the opening quote).
   * @throws `InvalidEscape` for a malformed or unknown escape sequence.
   */
  private scanString(): Token {
    const start = this.currentLocation();

    const opening = this.peek();
    if (opening === undefined) {
      // No code point to report, so this cannot be an UnexpectedCharacter.
      throw Scanner.lexError({ tag: "UnexpectedEOF", span: this.makeSpan(start) });
    }
    if (opening !== char('"')) {
      throw Scanner.lexError({
        tag: "UnexpectedCharacter",
        char: opening,
        span: this.makeSpan(start),
      });
    }
    this.next(); // consume opening quote

    let value = ""; // decoded string content
    while (true) {
      const c = this.peek();
      if (c === undefined || this.eof()) {
        throw Scanner.lexError({ tag: "UnexpectedEOF", span: this.makeSpan(start) });
      }
      if (c === char('"')) {
        this.next(); // consume closing quote
        break;
      }
      if (c === char("\\")) {
        value += this.scanEscape();
      } else {
        this.next();
        value += String.fromCodePoint(c);
      }
    }

    return { tag: "string", text: value, span: this.makeSpan(start) };
  }

  /**
   * Scans one escape sequence with the cursor on the backslash and returns
   * the text it denotes. Supported: `\n \r \t \\ \0 \"` and `\u{HEX}`.
   *
   * @throws `InvalidEscape` ("UnknownEscapeSequence") for anything else,
   *         including a backslash at end of input.
   */
  private scanEscape(): string {
    const escapeStart = this.currentLocation();
    this.next(); // consume backslash

    let decoded: string;
    switch (this.peek()) {
      case char("n"): decoded = "\n"; break;
      case char("r"): decoded = "\r"; break;
      case char("t"): decoded = "\t"; break;
      case char("\\"): decoded = "\\"; break;
      case char("0"): decoded = "\0"; break;
      case char('"'): decoded = '"'; break;
      case char("u"):
        this.next(); // consume 'u'
        return this.scanUnicodeEscapeBody();
      default:
        throw Scanner.lexError({
          tag: "InvalidEscape",
          reason: `UnknownEscapeSequence`,
          span: this.makeSpan(escapeStart),
        });
    }
    // Every simple escape is exactly one character long; consume it here so
    // no case can forget to (the `\0` case used to leave its `0` behind).
    this.next();
    return decoded;
  }

  /**
   * Scans the `{HEX}` part of a `\u{HEX}` escape (cursor just after the `u`)
   * and returns the denoted character.
   *
   * @throws `InvalidEscape` when a brace or the hex digits are missing, or
   *         when the value exceeds 0x10FFFF. Error spans start after the `u`.
   */
  private scanUnicodeEscapeBody(): string {
    // 1. Expect '{'
    const braceStart = this.currentLocation();
    if (this.peek() !== char("{")) {
      throw Scanner.lexError({
        tag: "InvalidEscape",
        reason: "Expected '{' after \\u",
        span: this.makeSpan(braceStart),
      });
    }
    this.next(); // consume '{'

    // 2. Consume hex digits
    const hexStart = this.i;
    const hexCount = this.consumeWhile(Scanner.isHexDigit);
    if (hexCount === 0) {
      throw Scanner.lexError({
        tag: "InvalidEscape",
        reason: "Expected hex digits in \\u{...}",
        span: this.makeSpan(braceStart),
      });
    }

    // 3. Expect '}'
    if (this.peek() !== char("}")) {
      throw Scanner.lexError({
        tag: "InvalidEscape",
        reason: "Expected '}' closing unicode escape",
        span: this.makeSpan(braceStart),
      });
    }
    this.next(); // consume '}'

    // 4. Convert
    const hexStr = this.text.sliceByCp(hexStart, hexStart + hexCount);
    const codePoint = parseInt(hexStr, 16);
    if (codePoint > 0x10ffff) {
      throw Scanner.lexError({
        tag: "InvalidEscape",
        reason: "Invalid Unicode Code Point (max 0x10FFFF)",
        span: this.makeSpan(braceStart),
      });
    }
    return String.fromCodePoint(codePoint);
  }
}
// TODO: Need a Token to TokenKind function
// TODO: Need is_start_of_expression(token): boolean
// identifier -> true
// symbol # -> true
// symbol $ -> true
// symbol @ -> true
// symbol ( -> true
// symbol { -> true // This is actually context-dependent: sometimes it is the start of a binding context, { params . body } or { let-params . body }, and sometimes it is a record. But this function is only going to be used in the first context.
// symbol _ -> false
// number -> true
// string -> true
// keyword let -> true
// keyword fn -> true
// keyword apply -> true
// keyword = -> false
// keyword | -> false
// EOF -> false
//
// TODO: function that matches a token with a token_type (returns bool)
// TODO: forbidden characters are
// '('
// ')'
// '{'
// '}'
// '.'
// ','
// '|'
// '$'
// '#'
// '@'
// '"'
// ' '
// '\r'
// '\t'
// '\n'
// TODO: need function is_forbidden_char
// === scanner functions ===
// TODO: whitespace - consumes whitespace
// TODO: comment - consumes token
// TODO: raw_identifier - consumes raw identifier - then we can decide whether that was a keyword or an identifier
// TODO: string - consumes string like "foo bar\njfjdsajfksd"
// TODO: number - consumes number like 123123 or 000123 or 23919233.123
//
// TODO: token - gives next token