Sketch basic scanner
This commit is contained in:
parent
99cd517a58
commit
eb6ade5a3d
7 changed files with 836 additions and 6 deletions
368
src/parser/scanner.ts
Normal file
368
src/parser/scanner.ts
Normal file
|
|
@ -0,0 +1,368 @@
|
|||
|
||||
import { char, isWhitespace, isDigit } from './source_text';
|
||||
import type { SourceText, Span, SourceLocation, CodePoint, StringIndex, CodePointIndex } from './source_text';
|
||||
|
||||
/**
 * Reports whether `c` is one of the single-character symbols of the language:
 * `#`, `$`, `@`, `(`, `)`, `{`, `}`, `,` or `.`.
 *
 * @param c - Code point to classify.
 * @returns `true` when `c` equals the code point of one of the symbols above.
 */
function isSymbolChar(c: CodePoint): boolean {
  // Cases are compared with strict equality, top to bottom, exactly like a
  // chain of `c === char(...) || ...` tests.
  switch (c) {
    case char("#"):
    case char("$"):
    case char("@"):
    case char("("):
    case char(")"):
    case char("{"):
    case char("}"):
    case char(","):
    case char("."):
      return true;
    default:
      return false;
  }
}
|
||||
|
||||
/**
 * Reports whether `c` may appear inside an identifier: any code point that is
 * neither whitespace (per `isWhitespace`) nor a symbol character (per
 * `isSymbolChar`).
 *
 * @param c - Code point to classify.
 * @returns `true` when `c` is not whitespace and not a symbol character.
 */
function isIdentifierChar(c: CodePoint): boolean {
  // The parameter is intentionally not named `char`: that name would shadow
  // the `char` helper imported from './source_text'.
  return !isWhitespace(c) && !isSymbolChar(c);
}
|
||||
|
||||
/** Reserved words of the language; `=` and `|` are classified as keywords too. */
export type Keyword =
  | "let"
  | "fn"
  | "match"
  | "apply"
  | "="
  | "|";
|
||||
/**
 * Single-character punctuation tokens. The members mirror the characters
 * accepted by `isSymbolChar`.
 */
export type Symbol =
  | "#"
  | "$"
  | "@"
  | "("
  | ")"
  | "{"
  | "}"
  | ","
  | ".";
|
||||
|
||||
// === Scanner ===
|
||||
/**
 * A lexical token, discriminated by `tag`. Every variant carries the `span`
 * of source text it was read from.
 */
export type Token =
  /** Numeric literal; `value` is the parsed number. */
  | { tag: "number"; value: number; span: Span }
  /** String literal; `text` is the string content. */
  | { tag: "string"; text: string; span: Span }
  /** Identifier; `text` is its spelling. */
  | { tag: "identifier"; text: string; span: Span }
  /** One of the `Keyword` values. */
  | { tag: "keyword"; kw: Keyword; span: Span }
  /** One of the `Symbol` values. */
  | { tag: "symbol"; sym: Symbol; span: Span }
  /** End of input. */
  | { tag: "EOF"; span: Span };
|
||||
|
||||
/**
 * Describes a category of token without a span or literal payload.
 * The `symbol` and `keyword` kinds still name the specific symbol or keyword.
 */
export namespace TokenKind {
  export type T =
    | { tag: "number" }
    | { tag: "string" }
    | { tag: "identifier" }
    | { tag: "symbol"; value: Symbol }
    | { tag: "keyword"; value: Keyword }
    | { tag: "EOF" };
}
|
||||
|
||||
/**
 * Errors reported while scanning, discriminated by `tag`. Every variant
 * carries the `span` the error refers to.
 */
export type LexError =
  /** A character that is not valid at this position; `char` is the offending code point. */
  | { tag: "UnexpectedCharacter"; char: CodePoint; span: Span }
  /** The input ended in the middle of a token. */
  | { tag: "UnexpectedEOF"; span: Span }
  /** A number was required but no digits were found. */
  | { tag: "ExpectedNumber"; span: Span }
  /** A malformed numeric literal; `text` is the literal's source text, `reason` a short code. */
  | { tag: "InvalidNumber"; text: string; reason: string; span: Span }
  /** A malformed escape sequence inside a string literal. */
  | { tag: "InvalidEscape"; reason: string; span: Span };
|
||||
|
||||
/**
 * Character-level scanner over a `SourceText`.
 *
 * The scanner walks `text.chars` one code point at a time and keeps line and
 * column counters (both start at 1) for the next unread code point. Lexical
 * errors are reported by throwing a plain `LexError` object (not an `Error`
 * instance), so callers must catch and inspect the `tag` field.
 */
export class Scanner {
  /** Index into `text.chars` of the next code point to be read. */
  private i: CodePointIndex = 0;
  /** Line of the next code point; starts at 1. */
  private line = 1;
  /** Column of the next code point; reset to 1 after every line break. */
  private column = 1;

  // Track previous char to handle \r\n correctly
  private lastCharWasCR = false;

  constructor(private readonly text: SourceText) {}

  /** Returns `true` when the cursor is at or past `text.length`. */
  eof(): boolean {
    return this.i >= this.text.length;
  }

  /**
   * Returns the code point `n` positions ahead of the cursor without
   * consuming anything, or `undefined` when that position has no entry.
   */
  private peek(n: number = 0): CodePoint | undefined {
    return this.text.chars[this.i + n]?.char;
  }

  /**
   * Consumes and returns the code point under the cursor, updating the
   * line/column counters. Returns `undefined` (and consumes nothing) when
   * there is no code point left.
   *
   * `\n`, `\r` and `\r\n` each count as a single line break.
   */
  private next(): CodePoint | undefined {
    const ref = this.text.chars[this.i];
    if (!ref) return undefined;

    const c = ref.char;
    this.i++;

    if (c === 0x0A /* \n */) {
      if (!this.lastCharWasCR) {
        this.line++;
        this.column = 1;
      } else {
        // We just saw \r, so this \n is part of \r\n.
        // We already bumped the line count on \r.
        // Just reset the flag.
        this.lastCharWasCR = false;
      }
    } else if (c === 0x0D /* \r */) {
      this.line++;
      this.column = 1;
      this.lastCharWasCR = true;
    } else {
      this.column++;
      this.lastCharWasCR = false;
    }

    return c;
  }

  /**
   * Offset (taken from `text.chars[i].offset`) of the code point under the
   * cursor, or `text.source.length` when the cursor is past the last entry.
   */
  private currentOffset(): StringIndex {
    return this.text.chars[this.i]?.offset ?? this.text.source.length;
  }

  /** Snapshot of the cursor: code-point index plus current line and column. */
  private currentLocation(): SourceLocation {
    return { index: this.i, line: this.line, column: this.column };
  }

  /**
   * Builds the span running from `start` up to (not including) the current
   * cursor position. `line` and `column` describe the start of the span.
   */
  private makeSpan(start: SourceLocation): Span {
    const startOffset =
      this.text.chars[start.index]?.offset ?? this.text.source.length;
    const endOffset = this.currentOffset();

    return {
      start: startOffset,
      end: endOffset,
      line: start.line,
      column: start.column,
    };
  }

  /**
   * Consumes code points for as long as `pred` accepts them.
   *
   * @returns The number of code points consumed (possibly 0).
   */
  private consumeWhile(pred: (c: CodePoint) => boolean): number {
    let count = 0;
    while (!this.eof()) {
      const c = this.peek();
      if (c === undefined || !pred(c)) break;
      this.next();
      count++;
    }
    return count;
  }

  /**
   * Consumes and returns the next code point if `pred` accepts it.
   *
   * @throws The supplied `error` when the input is exhausted or `pred`
   *   rejects the next code point. Nothing is consumed in that case.
   */
  private expect(
    pred: (c: CodePoint) => boolean,
    error: LexError
  ): CodePoint {
    const c = this.peek();
    if (c === undefined || !pred(c)) {
      throw error;
    }
    this.next();
    return c;
  }

  // Helper to check for exact char matches quickly
  /** Consumes the next code point if it equals `c`; reports whether it did. */
  private match(c: CodePoint): boolean {
    if (this.peek() === c) {
      this.next();
      return true;
    }
    return false;
  }

  /** Skips over any run of whitespace (as defined by `isWhitespace`). */
  private skipWhitespace() {
    this.consumeWhile(isWhitespace);
  }

  // === Main Scanners ===

  /**
   * Scans a numeric literal starting at the cursor.
   *
   *   number := optional(`-`) digits optional(`.` digits)
   *
   * @returns A `number` token whose span covers the whole literal.
   * @throws `ExpectedNumber` when no digit follows the optional sign.
   * @throws `InvalidNumber` (`MissingFractionalDigits`) when a `.` is not
   *   followed by a digit, or (`NotFinite`) when the literal does not convert
   *   to a finite number.
   */
  private scanNumber(): Token {
    const startNumberLocation = this.currentLocation();

    // 1. Optional Sign
    this.match(char("-"));

    // 2. Integer Part
    if (this.consumeWhile(isDigit) === 0) {
      const error: LexError = {
        tag: "ExpectedNumber",
        span: this.makeSpan(startNumberLocation),
      };
      throw error;
    }

    // 3. Fractional Part
    if (this.peek() === char(".")) {
      const dotLocation = this.currentLocation();

      this.next(); // consume '.'

      if (this.consumeWhile(isDigit) === 0) {
        const error: LexError = {
          tag: "InvalidNumber",
          // Source text consumed so far, including the dangling '.'.
          text: this.text.sliceByCp(startNumberLocation.index, this.i),
          reason: "MissingFractionalDigits",
          span: this.makeSpan(dotLocation),
        };
        throw error;
      }
    }

    const text = this.text.sliceByCp(startNumberLocation.index, this.i);
    const value = Number(text);

    if (!Number.isFinite(value)) {
      const error: LexError = {
        tag: "InvalidNumber",
        text,
        reason: "NotFinite",
        span: this.makeSpan(startNumberLocation),
      };
      throw error;
    }

    return {
      tag: "number",
      value,
      span: this.makeSpan(startNumberLocation),
    };
  }

  /**
   * Scans a double-quoted string literal starting at the cursor.
   *
   * Supported escapes: `\n`, `\r`, `\t`, `\v`, `\0`, `\\`, `\"` and
   * `\u{HEX}`.
   *
   * @returns A `string` token whose `text` is the decoded content and whose
   *   span covers the literal including both quotes.
   * @throws `UnexpectedCharacter` when the cursor is not on a `"`.
   * @throws `UnexpectedEOF` when the input ends before the closing quote
   *   (or before the opening one).
   * @throws `InvalidEscape` for an unknown or malformed escape sequence.
   */
  private scanString(): Token {
    const start = this.currentLocation();

    // The opening quote. `UnexpectedCharacter` needs the offending code
    // point, which does not exist at end of input, so that case is reported
    // as `UnexpectedEOF` instead.
    const opening = this.peek();
    const openingError: LexError =
      opening === undefined
        ? { tag: "UnexpectedEOF", span: this.makeSpan(start) }
        : { tag: "UnexpectedCharacter", char: opening, span: this.makeSpan(start) };
    this.expect(c => c === char('"'), openingError);

    let value = ""; // The actual string content

    while (true) {
      const c = this.peek();
      if (this.eof() || c === undefined) {
        const error: LexError = { tag: "UnexpectedEOF", span: this.makeSpan(start) };
        throw error;
      }

      // 1. End of string
      if (c === char('"')) {
        this.next(); // consume closing quote
        break;
      }

      // 2. Escape sequences
      if (c === char("\\")) {
        value += this.scanEscape();
        continue;
      }

      // 3. Regular character
      this.next();
      value += String.fromCodePoint(c);
    }

    return {
      tag: "string",
      text: value,
      span: this.makeSpan(start),
    };
  }

  /**
   * Scans one escape sequence; the cursor must be on the backslash.
   * Every branch consumes the complete sequence before returning.
   *
   * @returns The text the escape sequence stands for.
   * @throws `InvalidEscape` for an unknown escape character (including end
   *   of input right after the backslash) or a malformed `\u{...}`.
   */
  private scanEscape(): string {
    const escapeStart = this.currentLocation();
    this.next(); // consume backslash

    switch (this.peek()) {
      case char("n"): this.next(); return "\n";
      case char("r"): this.next(); return "\r";
      case char("t"): this.next(); return "\t";
      case char("v"): this.next(); return "\v";
      case char("0"): this.next(); return "\0";
      case char("\\"): this.next(); return "\\";
      case char('"'): this.next(); return '"';
      case char("u"):
        this.next(); // consume 'u'
        return this.scanUnicodeEscape();
      default: {
        const error: LexError = {
          tag: "InvalidEscape",
          reason: `UnknownEscapeSequence`,
          span: this.makeSpan(escapeStart),
        };
        throw error;
      }
    }
  }

  /**
   * Scans the `{HEX}` part of a `\u{HEX}` escape; the cursor must be just
   * past the `u`.
   *
   * @returns The string for the code point named by the hex digits.
   * @throws `InvalidEscape` when the braces or digits are missing, or when
   *   the value exceeds 0x10FFFF.
   */
  private scanUnicodeEscape(): string {
    const braceStart = this.currentLocation();
    const invalid = (reason: string): LexError => ({
      tag: "InvalidEscape",
      reason,
      span: this.makeSpan(braceStart),
    });

    // 1. Expect '{'
    if (this.peek() !== char("{")) {
      throw invalid("Expected '{' after \\u");
    }
    this.next(); // consume '{'

    // 2. Consume Hex Digits
    const hexStart = this.i;
    const hexCount = this.consumeWhile(c =>
      (c >= char("0") && c <= char("9")) ||
      (c >= char("a") && c <= char("f")) ||
      (c >= char("A") && c <= char("F"))
    );
    if (hexCount === 0) {
      throw invalid("Expected hex digits in \\u{...}");
    }

    // 3. Expect '}'
    if (this.peek() !== char("}")) {
      throw invalid("Expected '}' closing unicode escape");
    }
    this.next(); // consume '}'

    // 4. Convert
    const hexStr = this.text.sliceByCp(hexStart, hexStart + hexCount);
    const codePoint = parseInt(hexStr, 16);

    // String.fromCodePoint throws a RangeError above this bound, so reject
    // it here with a proper LexError first.
    if (codePoint > 0x10FFFF) {
      throw invalid("Invalid Unicode Code Point (max 0x10FFFF)");
    }

    return String.fromCodePoint(codePoint);
  }
}
|
||||
|
||||
|
||||
// TODO: Need a Token to TokenKind function
|
||||
// TODO: Need is_start_of_expression(token): boolean
|
||||
// identifier -> true
|
||||
// symbol # -> true
|
||||
// symbol $ -> true
|
||||
// symbol @ -> true
|
||||
// symbol ( -> true
|
||||
// symbol { -> true // this is actually context-dependent. Sometimes it's the start of a binding context { params . body } or { let-params . body }, and sometimes it is a record. But this function is going to be used only in the first context
|
||||
// symbol _ -> false
|
||||
// number -> true
|
||||
// string -> true
|
||||
// keyword let -> true
|
||||
// keyword fn -> true
|
||||
// keyword apply -> true
|
||||
// keyword = -> false
|
||||
// keyword | -> false
|
||||
// EOF -> false
|
||||
//
|
||||
// TODO: function that matches a token with a token_type (returns bool)
|
||||
|
||||
// TODO: forbidden characters are
|
||||
// '('
|
||||
// ')'
|
||||
// '{'
|
||||
// '}'
|
||||
// '.'
|
||||
// ','
|
||||
// '|'
|
||||
// '$'
|
||||
// '#'
|
||||
// '@'
|
||||
// '"'
|
||||
// ' '
|
||||
// '\r'
|
||||
// '\t'
|
||||
// '\n'
|
||||
// TODO: need function is_forbidden_char
|
||||
|
||||
|
||||
|
||||
// === scanner functions ===
|
||||
// TODO: whitespace - consumes whitespace
|
||||
// TODO: comment - consumes token
|
||||
// TODO: raw_identifier - consumes raw identifier - then we can decide whether that was a keyword or an identifier
|
||||
// TODO: string - consumes string like "foo bar\njfjdsajfksd"
|
||||
// TODO: number - consumes number like 123123 or 000123 or 23919233.123
|
||||
//
|
||||
// TODO: token - gives next token
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue