Sketch basic scanner

This commit is contained in:
Yura Dupyn 2026-02-05 13:21:44 +01:00
parent 99cd517a58
commit eb6ade5a3d
7 changed files with 836 additions and 6 deletions

158
package-lock.json generated
View file

@ -28,6 +28,7 @@
"eslint": "^9.39.2",
"eslint-plugin-import": "^2.32.0",
"sass-embedded": "^1.97.3",
"ts-node": "^10.9.2",
"typescript": "^5.9.3",
"vite": "^7.3.1"
}
@ -39,6 +40,30 @@
"dev": true,
"license": "(Apache-2.0 AND BSD-3-Clause)"
},
"node_modules/@cspotcode/source-map-support": {
"version": "0.8.1",
"resolved": "https://registry.npmjs.org/@cspotcode/source-map-support/-/source-map-support-0.8.1.tgz",
"integrity": "sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw==",
"dev": true,
"license": "MIT",
"dependencies": {
"@jridgewell/trace-mapping": "0.3.9"
},
"engines": {
"node": ">=12"
}
},
"node_modules/@cspotcode/source-map-support/node_modules/@jridgewell/trace-mapping": {
"version": "0.3.9",
"resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.9.tgz",
"integrity": "sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ==",
"dev": true,
"license": "MIT",
"dependencies": {
"@jridgewell/resolve-uri": "^3.0.3",
"@jridgewell/sourcemap-codec": "^1.4.10"
}
},
"node_modules/@electron-forge/cli": {
"version": "7.11.1",
"resolved": "https://registry.npmjs.org/@electron-forge/cli/-/cli-7.11.1.tgz",
@ -2615,6 +2640,34 @@
"node": ">= 10"
}
},
"node_modules/@tsconfig/node10": {
"version": "1.0.12",
"resolved": "https://registry.npmjs.org/@tsconfig/node10/-/node10-1.0.12.tgz",
"integrity": "sha512-UCYBaeFvM11aU2y3YPZ//O5Rhj+xKyzy7mvcIoAjASbigy8mHMryP5cK7dgjlz2hWxh1g5pLw084E0a/wlUSFQ==",
"dev": true,
"license": "MIT"
},
"node_modules/@tsconfig/node12": {
"version": "1.0.11",
"resolved": "https://registry.npmjs.org/@tsconfig/node12/-/node12-1.0.11.tgz",
"integrity": "sha512-cqefuRsh12pWyGsIoBKJA9luFu3mRxCA+ORZvA4ktLSzIuCUtWVxGIuXigEwO5/ywWFMZ2QEGKWvkZG1zDMTag==",
"dev": true,
"license": "MIT"
},
"node_modules/@tsconfig/node14": {
"version": "1.0.3",
"resolved": "https://registry.npmjs.org/@tsconfig/node14/-/node14-1.0.3.tgz",
"integrity": "sha512-ysT8mhdixWK6Hw3i1V2AeRqZ5WfXg1G43mqoYlM2nc6388Fq5jcXyr5mRsqViLx/GJYdoL0bfXD8nmF+Zn/Iow==",
"dev": true,
"license": "MIT"
},
"node_modules/@tsconfig/node16": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/@tsconfig/node16/-/node16-1.0.4.tgz",
"integrity": "sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA==",
"dev": true,
"license": "MIT"
},
"node_modules/@types/cacheable-request": {
"version": "6.0.3",
"resolved": "https://registry.npmjs.org/@types/cacheable-request/-/cacheable-request-6.0.3.tgz",
@ -3248,6 +3301,19 @@
"acorn": "^6.0.0 || ^7.0.0 || ^8.0.0"
}
},
"node_modules/acorn-walk": {
"version": "8.3.4",
"resolved": "https://registry.npmjs.org/acorn-walk/-/acorn-walk-8.3.4.tgz",
"integrity": "sha512-ueEepnujpqee2o5aIYnvHU6C0A42MNdsIDeqy5BydrkuC5R1ZuUFnm27EeFJGoEHJQgn3uleRvmTXaJgfXbt4g==",
"dev": true,
"license": "MIT",
"dependencies": {
"acorn": "^8.11.0"
},
"engines": {
"node": ">=0.4.0"
}
},
"node_modules/agent-base": {
"version": "6.0.2",
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-6.0.2.tgz",
@ -3389,6 +3455,13 @@
"url": "https://github.com/chalk/ansi-styles?sponsor=1"
}
},
"node_modules/arg": {
"version": "4.1.3",
"resolved": "https://registry.npmjs.org/arg/-/arg-4.1.3.tgz",
"integrity": "sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA==",
"dev": true,
"license": "MIT"
},
"node_modules/argparse": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz",
@ -4190,6 +4263,13 @@
"dev": true,
"license": "MIT"
},
"node_modules/create-require": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/create-require/-/create-require-1.1.1.tgz",
"integrity": "sha512-dcKFX3jn0MpIaXjisoRvexIJVEKzaq7z2rZKxf+MSr9TkdmHmsU4m2lcLojrj/FHl8mk5VxMmYA+ftRkP/3oKQ==",
"dev": true,
"license": "MIT"
},
"node_modules/cross-dirname": {
"version": "0.1.0",
"resolved": "https://registry.npmjs.org/cross-dirname/-/cross-dirname-0.1.0.tgz",
@ -4421,6 +4501,16 @@
"license": "MIT",
"optional": true
},
"node_modules/diff": {
"version": "4.0.4",
"resolved": "https://registry.npmjs.org/diff/-/diff-4.0.4.tgz",
"integrity": "sha512-X07nttJQkwkfKfvTPG/KSnE2OMdcUCao6+eXF3wmnIQRn2aPAHH3VxDbDOdegkd6JbPsXqShpvEOHfAT+nCNwQ==",
"dev": true,
"license": "BSD-3-Clause",
"engines": {
"node": ">=0.3.1"
}
},
"node_modules/dir-compare": {
"version": "4.2.0",
"resolved": "https://registry.npmjs.org/dir-compare/-/dir-compare-4.2.0.tgz",
@ -7701,6 +7791,13 @@
"node": ">=12"
}
},
"node_modules/make-error": {
"version": "1.3.6",
"resolved": "https://registry.npmjs.org/make-error/-/make-error-1.3.6.tgz",
"integrity": "sha512-s8UhlNe7vPKomQhC1qFelMokr/Sc3AgNbso3n74mVPA5LTZwkB9NlXf4XPamLxJE8h0gh73rM94xvwRT2CVInw==",
"dev": true,
"license": "ISC"
},
"node_modules/make-fetch-happen": {
"version": "10.2.1",
"resolved": "https://registry.npmjs.org/make-fetch-happen/-/make-fetch-happen-10.2.1.tgz",
@ -10779,6 +10876,50 @@
"typescript": ">=4.8.4"
}
},
"node_modules/ts-node": {
"version": "10.9.2",
"resolved": "https://registry.npmjs.org/ts-node/-/ts-node-10.9.2.tgz",
"integrity": "sha512-f0FFpIdcHgn8zcPSbf1dRevwt047YMnaiJM3u2w2RewrB+fob/zePZcrOyQoLMMO7aBIddLcQIEK5dYjkLnGrQ==",
"dev": true,
"license": "MIT",
"dependencies": {
"@cspotcode/source-map-support": "^0.8.0",
"@tsconfig/node10": "^1.0.7",
"@tsconfig/node12": "^1.0.7",
"@tsconfig/node14": "^1.0.0",
"@tsconfig/node16": "^1.0.2",
"acorn": "^8.4.1",
"acorn-walk": "^8.1.1",
"arg": "^4.1.0",
"create-require": "^1.1.0",
"diff": "^4.0.1",
"make-error": "^1.1.1",
"v8-compile-cache-lib": "^3.0.1",
"yn": "3.1.1"
},
"bin": {
"ts-node": "dist/bin.js",
"ts-node-cwd": "dist/bin-cwd.js",
"ts-node-esm": "dist/bin-esm.js",
"ts-node-script": "dist/bin-script.js",
"ts-node-transpile-only": "dist/bin-transpile.js",
"ts-script": "dist/bin-script-deprecated.js"
},
"peerDependencies": {
"@swc/core": ">=1.2.50",
"@swc/wasm": ">=1.2.50",
"@types/node": "*",
"typescript": ">=2.7"
},
"peerDependenciesMeta": {
"@swc/core": {
"optional": true
},
"@swc/wasm": {
"optional": true
}
}
},
"node_modules/tsconfig-paths": {
"version": "3.15.0",
"resolved": "https://registry.npmjs.org/tsconfig-paths/-/tsconfig-paths-3.15.0.tgz",
@ -11041,6 +11182,13 @@
"dev": true,
"license": "MIT"
},
"node_modules/v8-compile-cache-lib": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz",
"integrity": "sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==",
"dev": true,
"license": "MIT"
},
"node_modules/validate-npm-package-license": {
"version": "3.0.4",
"resolved": "https://registry.npmjs.org/validate-npm-package-license/-/validate-npm-package-license-3.0.4.tgz",
@ -11558,6 +11706,16 @@
"fd-slicer": "~1.1.0"
}
},
"node_modules/yn": {
"version": "3.1.1",
"resolved": "https://registry.npmjs.org/yn/-/yn-3.1.1.tgz",
"integrity": "sha512-Ux4ygGWsu2c7isFWe8Yu1YluJmqVhxqK2cLXNQA5AcC3QfbGNpM7fu0Y8b/z16pXLnFxZYvWhd3fhBY9DLmC6Q==",
"dev": true,
"license": "MIT",
"engines": {
"node": ">=6"
}
},
"node_modules/yocto-queue": {
"version": "0.1.0",
"resolved": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-0.1.0.tgz",

View file

@ -30,6 +30,7 @@
"eslint": "^9.39.2",
"eslint-plugin-import": "^2.32.0",
"sass-embedded": "^1.97.3",
"ts-node": "^10.9.2",
"typescript": "^5.9.3",
"vite": "^7.3.1"
},

View file

@ -1,4 +1,3 @@
Syntax is designed to be easily parsable by a recursive-descent/parser-combinators - not necessarily designed for intuitive human use.
It is inspired by syntax like LISP, but instead of `(f a b c)` we'd write `f(a, b, c)`.
Note that we allow fairly permissive identifiers like
@ -32,21 +31,28 @@ $
@
.
,
|
(
)
{
}
```
Then there are keywords, which can occur as substrings, but can't just be the keywords (TODO: Not sure if perhaps `|` we could allow as a keyword
Then there are keywords, which can occur as substrings, but can't just be the keywords
```
let
fn
match
=
|
```
Note that `=` and `|` are treated as keywords, not symbols. So these can occur in identifiers.
This has strange consequences, for example
```
x=123
```
is a valid identifier (since there are no spaces) - and this may cause huge confusion for the user when using record syntax or pattern matching on records.
Even though this is borderline evil, I've decided for simplicity's sake to accept this. I'll just have to make sure syntax errors are really good.
Examples of expressions in the language (except the top-level function definitions which are here just for illustration)
```
@ -242,7 +248,5 @@ So for example when parsing `#foo 123`, we parse the tag, the remainder is `123`
so we conclude that this has to be a tagged value we're parsing, so the next thing is an expression.
But for a case like `#foo, ...`, we see that the next token is `,`, so we must have just parsed a tag - not a tagged value.
TODO:
Also:
We don't yet have any convenient syntax for creating new records from old (like an immutable update syntax).

169
src/parser/scanner.test.ts Normal file
View file

@ -0,0 +1,169 @@
import { SourceText } from "./source_text";
import { Scanner, Token } from "./scanner";
// === Test Harness Utilities ===
// ANSI colour escape sequences used to colour the test report on the terminal.
const ANSI_PREFIX = "\x1b[";
const RED = ANSI_PREFIX + "31m";
const GREEN = ANSI_PREFIX + "32m";
const RESET = ANSI_PREFIX + "0m";
/**
 * Minimal assertion helper for this framework-free test script.
 * Does nothing when `condition` holds; otherwise throws an `Error` carrying `message`.
 */
function assert(condition: boolean, message: string) {
  if (condition) {
    return;
  }
  throw new Error(message);
}
/**
 * Asserts that `token` has the tag `expectedTag` and, when `expectedValue` is given,
 * that its payload equals it. The payload is read from `value` when the token has
 * that property, otherwise from `text`; tokens with neither compare as `undefined`.
 */
function assertToken(token: Token, expectedTag: string, expectedValue?: any) {
  const tagMatches = token.tag === expectedTag;
  assert(tagMatches, `Expected tag '${expectedTag}', got '${token.tag}'`);

  // No payload expectation: the tag check above is the whole assertion.
  if (expectedValue === undefined) {
    return;
  }

  let actualValue: unknown;
  if ("value" in token) {
    actualValue = token.value;
  } else if ("text" in token) {
    actualValue = token.text;
  }
  const valueMatches = actualValue === expectedValue;
  assert(valueMatches, `Expected value '${expectedValue}', got '${actualValue}'`);
}
// TODO: Rewrite this once `scanToken()` is implemented.
/**
 * Scans one token from `source` using a fresh Scanner.
 *
 * The scanner has no public dispatch entry point yet, so this reaches its private
 * `scanString` / `scanNumber` methods through an `any` cast. Input whose first
 * non-whitespace character is `"` is routed to `scanString`; everything else goes
 * to `scanNumber`.
 */
function scanOne(source: string): Token {
  const scanner = new Scanner(new SourceText(source)) as any;
  const looksLikeString = source.trim().startsWith('"');
  return looksLikeString ? scanner.scanString() : scanner.scanNumber();
}
/** Integer literals, with and without a leading minus sign, scan to number tokens. */
function test_integers() {
  const cases: Array<[string, number]> = [
    ["123", 123],
    ["-500", -500],
  ];
  for (const [source, expected] of cases) {
    assertToken(scanOne(source), "number", expected);
  }
  console.log(`${GREEN}✔ Integers passed${RESET}`);
}
/** Literals with a fractional part, with and without a leading minus sign, scan to number tokens. */
function test_floats() {
  const cases: Array<[string, number]> = [
    ["3.14159", 3.14159],
    ["-0.001", -0.001],
  ];
  for (const [source, expected] of cases) {
    assertToken(scanOne(source), "number", expected);
  }
  console.log(`${GREEN}✔ Floats passed${RESET}`);
}
/**
 * Malformed numbers must be rejected with the right error tag:
 * `1.` (trailing dot) -> InvalidNumber, `.5` (no leading digit) -> ExpectedNumber.
 *
 * The "did it throw at all" check is made outside the `try` block. Throwing the
 * sentinel inside the `try` would send it to the same `catch`, and a scanner that
 * wrongly accepts the input would then be reported as a wrong-tag failure.
 */
function test_number_errors() {
  const cases: Array<{ source: string; expectedTag: string }> = [
    { source: "1.", expectedTag: "InvalidNumber" },
    { source: ".5", expectedTag: "ExpectedNumber" },
  ];

  for (const { source, expectedTag } of cases) {
    let caught: unknown;
    let didThrow = false;
    try {
      scanOne(source);
    } catch (e: unknown) {
      caught = e;
      didThrow = true;
    }

    assert(didThrow, `Should have thrown error for '${source}'`);
    // Scanner errors are plain tagged objects, so read `tag` defensively.
    const actualTag = (caught as { tag?: unknown } | null | undefined)?.tag;
    assert(actualTag === expectedTag, `Expected ${expectedTag} error for '${source}'`);
  }

  console.log(`${GREEN}✔ Number errors passed${RESET}`);
}
/** Plain string literals, including the empty string, scan to string tokens with the quotes removed. */
function test_basic_strings() {
  const cases: Array<[string, string]> = [
    ['"hello world"', "hello world"],
    ['""', ""], // Empty string
  ];
  for (const [source, expected] of cases) {
    assertToken(scanOne(source), "string", expected);
  }
  console.log(`${GREEN}✔ Basic strings passed${RESET}`);
}
/**
 * Backslash escapes (\n, \t, \", \\) are decoded into the characters they denote.
 * The doubled backslashes below put a single backslash into the scanned source.
 */
function test_string_escapes() {
  const cases: Array<[string, string]> = [
    ['"line1\\nline2"', "line1\nline2"],
    ['"col1\\tcol2"', "col1\tcol2"],
    ['"quote: \\" slash: \\\\"', 'quote: " slash: \\'],
  ];
  for (const [source, expected] of cases) {
    assertToken(scanOne(source), "string", expected);
  }
  console.log(`${GREEN}✔ String escapes passed${RESET}`);
}
/** `\u{HEX}` escapes are decoded to the code point they name. */
function test_unicode_escapes() {
  // U+1F680 (rocket) lies outside the BMP.
  const rocket = scanOne('"\\u{1F680}"');
  assertToken(rocket, "string", "🚀");

  // Two consecutive escapes: U+41 'A' followed by U+42 'B'.
  const letters = scanOne('"\\u{41}\\u{42}"');
  assertToken(letters, "string", "AB");

  // Largest valid code point: only checked for "does not throw".
  scanOne('"\\u{10FFFF}"');

  console.log(`${GREEN}✔ Unicode escapes passed${RESET}`);
}
/**
 * Line numbers advance once per line break, whether it is CRLF or LF.
 *
 * Source under test:
 *   line 1: 123   (terminated by CRLF)
 *   line 2: 456   (terminated by LF)
 *   line 3: "foo"
 */
function test_line_counting() {
  const code = "123\r\n456\n\"foo\"";
  // One scanner is shared across all three tokens (scanOne would create a new one each
  // time); `any` gives access to the private scan methods.
  const scanner: any = new Scanner(new SourceText(code));

  const first = scanner.scanNumber();
  assert(first.value === 123, "Line 1 value wrong");
  assert(first.span.line === 1, "Line 1 line# wrong");

  // Whitespace is skipped by hand because nextToken() does not exist yet.
  scanner.skipWhitespace();

  const second = scanner.scanNumber();
  assert(second.value === 456, "Line 2 value wrong");
  assert(second.span.line === 2, "Line 2 line# wrong");

  scanner.skipWhitespace();

  const third = scanner.scanString();
  assert(third.text === "foo", "Line 3 value wrong");
  assert(third.span.line === 3, "Line 3 line# wrong");

  console.log(`${GREEN}✔ Line counting passed${RESET}`);
}
// === Run All ===
/**
 * Runs every scanner test in order. The first failure is printed in red and the
 * process exits with status 1; otherwise a green summary line is printed.
 */
function run_all_tests() {
  console.log("Running Scanner Tests...\n");
  const suites: Array<() => void> = [
    test_integers,
    test_floats,
    test_number_errors,
    test_basic_strings,
    test_string_escapes,
    test_unicode_escapes,
    test_line_counting,
  ];
  try {
    for (const suite of suites) {
      suite();
    }
    console.log(`\n${GREEN}ALL TESTS PASSED${RESET}`);
  } catch (e: any) {
    console.error(`\n${RED}TEST FAILED:${RESET}`);
    // Scanner errors are plain objects without `message`; print them as-is.
    console.error(e.message || e);
    process.exit(1);
  }
}
run_all_tests();

368
src/parser/scanner.ts Normal file
View file

@ -0,0 +1,368 @@
import { char, isWhitespace, isDigit } from './source_text';
import type { SourceText, Span, SourceLocation, CodePoint, StringIndex, CodePointIndex } from './source_text';
/** Returns true when `c` is one of the single-character symbol tokens: `# $ @ ( ) { } , .` */
function isSymbolChar(c: CodePoint): boolean {
  const symbols = ["#", "$", "@", "(", ")", "{", "}", ",", "."];
  return symbols.some((symbol) => c === char(symbol));
}
/** A code point may appear in an identifier when it is neither whitespace nor a symbol character. */
function isIdentifierChar(char: CodePoint): boolean {
  if (isWhitespace(char)) {
    return false;
  }
  return !isSymbolChar(char);
}
/** Reserved words of the language. Note that `=` and `|` are modelled as keywords, not as symbols. */
export type Keyword = "let" | "fn" | "match" | "apply" | "=" | "|";
/**
 * Single-character punctuation tokens; the same nine characters that `isSymbolChar` accepts.
 * NOTE(review): inside this module the alias shadows the global `Symbol` type.
 */
export type Symbol = "#" | "$" | "@" | "(" | ")" | "{" | "}" | "," | ".";
// === Scanner ===
/**
 * A lexed token, as a union discriminated by `tag`. Every variant carries a `span`.
 * The payload field depends on the variant: `value` for numbers, `text` for strings
 * and identifiers, `kw` for keywords, `sym` for symbols; `EOF` has no payload.
 */
export type Token =
| { tag: "number", value: number, span: Span }
| { tag: "string", text: string, span: Span }
| { tag: "identifier", text: string, span: Span }
| { tag: "keyword", kw: Keyword, span: Span }
| { tag: "symbol", sym: Symbol, span: Span }
| { tag: "EOF", span: Span }
/**
 * Span-free description of a token: the same tags as `Token`, without `span` or
 * literal payloads. Symbol and keyword kinds record which symbol/keyword in `value`
 * (whereas `Token` uses `sym` / `kw` for those fields).
 */
export namespace TokenKind {
export type T =
| { tag: "number" }
| { tag: "string" }
| { tag: "identifier" }
| { tag: "symbol", value: Symbol }
| { tag: "keyword", value: Keyword }
| { tag: "EOF" }
}
/**
 * Lexing failures, as a union discriminated by `tag`. Every variant carries the
 * `span` the error refers to. The Scanner throws these as plain objects, so
 * callers should inspect `tag` rather than rely on `instanceof Error`.
 */
export type LexError =
| { tag: "UnexpectedCharacter", char: CodePoint, span: Span }
| { tag: "UnexpectedEOF", span: Span }
| { tag: "ExpectedNumber", span: Span }
| { tag: "InvalidNumber", text: string, reason: string, span: Span }
| { tag: "InvalidEscape", reason: string, span: Span };
/**
 * Hand-written lexer over a `SourceText`.
 *
 * The scanner keeps a cursor (`i`, a code-point index into `text.chars`) together
 * with the 1-based line and column of that cursor. Only the number and string
 * scanners exist so far; they are private and there is no token dispatch yet.
 *
 * Failures are reported by throwing a plain `LexError` object (not an `Error`),
 * so callers should inspect `error.tag`.
 */
export class Scanner {
  /** Index into `text.chars` of the next unread code point. */
  private i: CodePointIndex = 0;
  /** 1-based line of the next unread code point. */
  private line = 1;
  /** 1-based column of the next unread code point. */
  private column = 1;
  /** True right after consuming `\r`, so a directly following `\n` is not counted as a second line break. */
  private lastCharWasCR = false;

  constructor(private readonly text: SourceText) {}

  /**
   * Identity helper that type-checks a `LexError` literal at the throw site.
   * (A `<LexError>{...}` cast would silently accept literals with missing fields.)
   */
  private static lexError(error: LexError): LexError {
    return error;
  }

  /** True for the ASCII hex digits 0-9, a-f and A-F. */
  private static isHexDigit(c: CodePoint): boolean {
    return (
      (c >= char("0") && c <= char("9")) ||
      (c >= char("a") && c <= char("f")) ||
      (c >= char("A") && c <= char("F"))
    );
  }

  /** True once every code point has been consumed. */
  eof(): boolean {
    return this.i >= this.text.length;
  }

  /** Returns the code point `n` positions ahead of the cursor without consuming it, or `undefined` past the end. */
  private peek(n: number = 0): CodePoint | undefined {
    return this.text.chars[this.i + n]?.char;
  }

  /**
   * Consumes and returns the code point under the cursor (`undefined` at end of input),
   * updating line and column. `\n`, `\r` and `\r\n` each count as exactly one line break.
   */
  private next(): CodePoint | undefined {
    const ref = this.text.chars[this.i];
    if (!ref) return undefined;

    const c = ref.char;
    this.i++;

    if (c === 0x0a /* \n */) {
      if (this.lastCharWasCR) {
        // Second half of \r\n: the line counter was already bumped on the \r.
        this.lastCharWasCR = false;
      } else {
        this.line++;
        this.column = 1;
      }
    } else if (c === 0x0d /* \r */) {
      this.line++;
      this.column = 1;
      this.lastCharWasCR = true;
    } else {
      this.column++;
      this.lastCharWasCR = false;
    }
    return c;
  }

  /** UTF-16 offset of the cursor in `text.source` (the source length at end of input). */
  private currentOffset(): StringIndex {
    return this.text.chars[this.i]?.offset ?? this.text.source.length;
  }

  /** Snapshot of the cursor position, used later as the start of a span. */
  private currentLocation(): SourceLocation {
    return { index: this.i, line: this.line, column: this.column };
  }

  /**
   * Builds the span from `start` up to (not including) the cursor.
   * `line` / `column` describe the start of the span.
   */
  private makeSpan(start: SourceLocation): Span {
    const startOffset =
      this.text.chars[start.index]?.offset ?? this.text.source.length;
    return {
      start: startOffset,
      end: this.currentOffset(),
      line: start.line,
      column: start.column,
    };
  }

  /** Consumes code points while `pred` holds and returns how many were consumed. */
  private consumeWhile(pred: (c: CodePoint) => boolean): number {
    let count = 0;
    while (!this.eof()) {
      const c = this.peek();
      if (c === undefined || !pred(c)) break;
      this.next();
      count++;
    }
    return count;
  }

  /**
   * Consumes and returns the next code point if it satisfies `pred`;
   * otherwise throws `error` (also at end of input).
   */
  private expect(
    pred: (c: CodePoint) => boolean,
    error: LexError
  ): CodePoint {
    const c = this.peek();
    if (c === undefined || !pred(c)) {
      throw error;
    }
    this.next();
    return c;
  }

  /** Consumes the next code point and returns true if it equals `c`; otherwise leaves the cursor alone. */
  private match(c: CodePoint): boolean {
    if (this.peek() === c) {
      this.next();
      return true;
    }
    return false;
  }

  /** Skips spaces, tabs and line breaks. */
  private skipWhitespace() {
    this.consumeWhile(isWhitespace);
  }

  // === Main Scanners ===

  /**
   * Scans a number literal.
   *
   *   number := optional(`-`) digits optional(`.` digits)
   *
   * @throws LexError `ExpectedNumber` when no digit follows the optional sign;
   *   `InvalidNumber` when a `.` is not followed by a digit ("MissingFractionalDigits")
   *   or the value is not finite ("NotFinite").
   */
  private scanNumber(): Token {
    const start = this.currentLocation();

    // 1. Optional sign
    this.match(char("-"));

    // 2. Integer part (at least one digit is required)
    if (this.consumeWhile(isDigit) === 0) {
      throw Scanner.lexError({
        tag: "ExpectedNumber",
        span: this.makeSpan(start),
      });
    }

    // 3. Optional fractional part
    if (this.peek() === char(".")) {
      const dotLocation = this.currentLocation();
      this.next(); // consume '.'
      if (this.consumeWhile(isDigit) === 0) {
        throw Scanner.lexError({
          tag: "InvalidNumber",
          text: this.text.sliceByCp(start.index, this.i),
          reason: "MissingFractionalDigits",
          span: this.makeSpan(dotLocation),
        });
      }
    }

    const text = this.text.sliceByCp(start.index, this.i);
    const value = Number(text);
    if (!Number.isFinite(value)) {
      throw Scanner.lexError({
        tag: "InvalidNumber",
        text,
        reason: "NotFinite",
        span: this.makeSpan(start),
      });
    }

    return { tag: "number", value, span: this.makeSpan(start) };
  }

  /**
   * Scans a double-quoted string literal and decodes its escape sequences.
   * Supported escapes: `\n \r \t \\ \0 \"` and `\u{HEX}`.
   * The cursor must be on the opening quote.
   *
   * @throws LexError `UnexpectedEOF` when input ends before the opening or closing quote;
   *   `UnexpectedCharacter` when the cursor is not on `"`;
   *   `InvalidEscape` for unknown or malformed escapes.
   */
  private scanString(): Token {
    const start = this.currentLocation();

    // The caller is expected to have checked for the opening quote; verify anyway.
    const opening = this.peek();
    if (opening === undefined) {
      throw Scanner.lexError({ tag: "UnexpectedEOF", span: this.makeSpan(start) });
    }
    this.expect((c) => c === char('"'), {
      tag: "UnexpectedCharacter",
      char: opening,
      span: this.makeSpan(start),
    });

    let value = ""; // decoded string content
    while (true) {
      const c = this.peek();
      if (c === undefined) {
        throw Scanner.lexError({ tag: "UnexpectedEOF", span: this.makeSpan(start) });
      }

      // 1. End of string
      if (c === char('"')) {
        this.next(); // consume closing quote
        break;
      }

      // 2. Regular character
      if (c !== char("\\")) {
        this.next();
        value += String.fromCodePoint(c);
        continue;
      }

      // 3. Escape sequence
      const escapeStart = this.currentLocation();
      this.next(); // consume backslash
      switch (this.peek()) {
        case char("n"): value += "\n"; this.next(); break;
        case char("r"): value += "\r"; this.next(); break;
        case char("t"): value += "\t"; this.next(); break;
        case char("\\"): value += "\\"; this.next(); break;
        // The '0' must be consumed too, otherwise it is re-read as a literal "0".
        case char("0"): value += "\0"; this.next(); break;
        case char('"'): value += '"'; this.next(); break;
        case char("u"): value += this.scanUnicodeEscape(); break;
        default:
          // Also reached when input ends right after the backslash.
          throw Scanner.lexError({
            tag: "InvalidEscape",
            reason: `UnknownEscapeSequence`,
            span: this.makeSpan(escapeStart),
          });
      }
    }

    return { tag: "string", text: value, span: this.makeSpan(start) };
  }

  /**
   * Scans the `u{HEX}` part of a `\u{HEX}` escape (the backslash is already consumed,
   * the cursor is on `u`) and returns the decoded character.
   *
   * @throws LexError `InvalidEscape` when the braces or hex digits are missing, or
   *   the value exceeds 0x10FFFF.
   */
  private scanUnicodeEscape(): string {
    this.next(); // consume 'u'

    // 1. Expect '{'
    const braceStart = this.currentLocation();
    if (!this.match(char("{"))) {
      throw Scanner.lexError({ tag: "InvalidEscape", reason: "Expected '{' after \\u", span: this.makeSpan(braceStart) });
    }

    // 2. Consume hex digits
    const hexStart = this.i;
    const hexCount = this.consumeWhile(Scanner.isHexDigit);
    if (hexCount === 0) {
      throw Scanner.lexError({ tag: "InvalidEscape", reason: "Expected hex digits in \\u{...}", span: this.makeSpan(braceStart) });
    }

    // 3. Expect '}'
    if (!this.match(char("}"))) {
      throw Scanner.lexError({ tag: "InvalidEscape", reason: "Expected '}' closing unicode escape", span: this.makeSpan(braceStart) });
    }

    // 4. Convert
    const hexStr = this.text.sliceByCp(hexStart, hexStart + hexCount);
    const codePoint = parseInt(hexStr, 16);
    if (codePoint > 0x10ffff) {
      throw Scanner.lexError({ tag: "InvalidEscape", reason: "Invalid Unicode Code Point (max 0x10FFFF)", span: this.makeSpan(braceStart) });
    }
    return String.fromCodePoint(codePoint);
  }
}
// TODO: Need a Token to TokenKind function
// TODO: Need is_start_of_expression(token): boolean
// identifier -> true
// symbol # -> true
// symbol $ -> true
// symbol @ -> true
// symbol ( -> true
// symbol { -> true // this is actually context dependent. Sometimes its a start of a binding context { params . body } or { let-params . body }, and sometimes it is a record. But this function is gonna be used only in the first context
// symbol _ -> false
// number -> true
// string -> true
// keyword let -> true
// keyword fn -> true
// keyword apply -> true
// keyword = -> false
// keyword | -> false
// EOF -> false
//
// TODO: function that matches a token with a token_type (returns bool)
// TODO: forbidden characters are
// '('
// ')'
// '{'
// '}'
// '.'
// ','
// '|'
// '$'
// '#'
// '@'
// '"'
// ' '
// '\r'
// '\t'
// '\n'
// TODO: need function is_forbidden_char
// === scanner functions ===
// TODO: whitespace - consumes whitespace
// TODO: comment - consumes token
// TODO: raw_identifier - consumes raw identifier - then we can decide whether that was a keyword or an identifier
// TODO: string - consumes string like "foo bar\njfjdsajfksd"
// TODO: number - consumes number like 123123 or 000123 or 23919233.123
//
// TODO: token - gives next token

125
src/parser/source_text.ts Normal file
View file

@ -0,0 +1,125 @@
// === Char type ===
/** Position inside a JavaScript string, counted in UTF-16 code units. */
export type StringIndex = number; // UTF-16 index into string
/** Position inside an array of code points (e.g. `SourceText.chars`). */
export type CodePointIndex = number; // index into array of code-points
/** A Unicode code point, as returned by `String.prototype.codePointAt`. */
export type CodePoint = number; // could also name it `UnicodeCodePoint`. Basically for `s: string` we have `s.codePointAt(i: index): char`.
/**
 * Returns the Unicode code point of the first character of `c`.
 * Intended for character literals such as `char("#")`; any text after the first
 * code point is ignored.
 *
 * @param c A non-empty string.
 * @returns The code point at index 0 (an astral character yields its full code point).
 * @throws RangeError If `c` is empty. `codePointAt(0)` would return `undefined`
 *   there, which would otherwise escape as a value typed `CodePoint`.
 */
export function char(c: string): CodePoint {
  const codePoint = c.codePointAt(0);
  if (codePoint === undefined) {
    throw new RangeError("char() requires a non-empty string");
  }
  return codePoint;
}
/** One code point of the source together with the UTF-16 offset at which it starts in `SourceText.source`. */
export type CodePointRef = {
char: CodePoint,
offset: StringIndex,
};
// === Source Text ===
/**
 * Source code pre-split into Unicode code points.
 *
 * The raw input is NFC-normalised first; `source`, and every offset stored in
 * `chars`, refers to the normalised string, not to the raw input.
 */
export class SourceText {
  /** The NFC-normalised source string. */
  readonly source: string;
  // TODO: Later you can try to change this to two `Uint32Array`s - one for codepoints (each 20 bit but whatever), the other for pointers to original string.
  //
  /** Every code point of `source`, paired with its UTF-16 offset into `source`. */
  readonly chars: CodePointRef[];
  /** Code-point index at which each line begins; entry 0 is line 1. */
  readonly lineStarts: CodePointIndex[];

  /**
   * Normalises `rawSource` and indexes it by code point and by line.
   * `\n`, a lone `\r` and `\r\n` each terminate exactly one line.
   */
  constructor(rawSource: string) {
    const source = rawSource.normalize("NFC");
    this.source = source;
    this.chars = [];
    this.lineStarts = [0]; // Line 1 always starts at index 0

    let i = 0;
    while (i < source.length) {
      const cp = source.codePointAt(i) as CodePoint;
      const cpIndex = this.chars.length;
      this.chars.push({ char: cp, offset: i });
      // Code points above the BMP occupy two UTF-16 code units.
      i += cp > 0xffff ? 2 : 1;

      if (cp === 0x0a /* \n */) {
        // The code point after the newline starts a new line.
        this.lineStarts.push(cpIndex + 1);
      } else if (cp === 0x0d /* \r */) {
        // For \r\n the line start is recorded when the \n is reached on the next
        // iteration; only a lone \r ends the line here.
        const nextIsNL = i < source.length && source.codePointAt(i) === 0x0a;
        if (!nextIsNL) {
          this.lineStarts.push(cpIndex + 1);
        }
      }
    }
  }

  /** Number of code points in the source. */
  get length(): number {
    return this.chars.length;
  }

  /**
   * Returns the text between code-point indices `start` (inclusive) and `end`
   * (exclusive). An out-of-range `start` yields `""`; an `end` at or past the
   * last code point runs to the end of the source.
   */
  sliceByCp(start: number, end: number): string {
    const startRef = this.chars[start];
    // Handle out of bounds gracefully
    if (!startRef) return "";
    const startOff = startRef.offset;
    const endOff = end < this.chars.length
      ? this.chars[end].offset
      : this.source.length;
    return this.source.slice(startOff, endOff);
  }

  // Converts a linear Code Point Index into SourceLocation
  // getLocation(index: CodePointIndex): SourceLocation {
  // // TODO: can be implemented either by a linear or binary search.
  // return (0 as any);
  // }

  /**
   * Returns the text of line `line` (1-based) without its line terminator,
   * or `""` when there is no such line.
   */
  getLineText(line: number): string {
    const lineIndex = line - 1;
    if (lineIndex < 0 || lineIndex >= this.lineStarts.length) return "";

    const startCp = this.lineStarts[lineIndex];
    let endCp = lineIndex + 1 < this.lineStarts.length
      ? this.lineStarts[lineIndex + 1]
      : this.chars.length;

    // Drop the terminator (\n, \r or \r\n). A line break can only occur at the very
    // end of a line, so trimming trailing CR/LF never removes line content.
    while (endCp > startCp) {
      const last = this.chars[endCp - 1].char;
      if (last !== 0x0a && last !== 0x0d) break;
      endCp--;
    }
    return this.sliceByCp(startCp, endCp);
  }
}
/** Convenience factory: wraps `s` in a new `SourceText`. */
export function sourceText(s: string) {
  const wrapped = new SourceText(s);
  return wrapped;
}
/** True for the four whitespace code points the lexer recognises: space, tab, LF and CR. */
export function isWhitespace(char: CodePoint): boolean {
  switch (char) {
    case 0x20: // space
    case 0x09: // tab
    case 0x0a: // line feed
    case 0x0d: // carriage return
      return true;
    default:
      return false;
  }
}
/** True for the ASCII decimal digits `0`-`9`. */
export function isDigit(char: CodePoint): boolean {
  const ZERO = 0x30;
  const NINE = 0x39;
  return ZERO <= char && char <= NINE;
}
/**
 * A region of the source text. `start` and `end` are UTF-16 offsets into the
 * source string; `line` and `column` locate the beginning of the region.
 * NOTE(review): the Scanner sets `end` to the offset of the first unread code
 * point, i.e. exclusive — confirm other producers follow the same convention.
 */
export type Span = {
start: StringIndex,
end: StringIndex,
line: number,
column: number,
}
/** A cursor position: a code-point index plus the human-readable line and column it corresponds to. */
export type SourceLocation = {
index: CodePointIndex;
line: number; // 1-based
column: number; // 1-based
};

View file

@ -10,3 +10,8 @@ npm install electron-squirrel-startup
npm install -D sass-embedded
# Tests
npx ts-node src/parser/scanner.test.ts