Sketch basic scanner
This commit is contained in:
parent
99cd517a58
commit
eb6ade5a3d
7 changed files with 836 additions and 6 deletions
158
package-lock.json
generated
158
package-lock.json
generated
|
|
@ -28,6 +28,7 @@
|
|||
"eslint": "^9.39.2",
|
||||
"eslint-plugin-import": "^2.32.0",
|
||||
"sass-embedded": "^1.97.3",
|
||||
"ts-node": "^10.9.2",
|
||||
"typescript": "^5.9.3",
|
||||
"vite": "^7.3.1"
|
||||
}
|
||||
|
|
@ -39,6 +40,30 @@
|
|||
"dev": true,
|
||||
"license": "(Apache-2.0 AND BSD-3-Clause)"
|
||||
},
|
||||
"node_modules/@cspotcode/source-map-support": {
|
||||
"version": "0.8.1",
|
||||
"resolved": "https://registry.npmjs.org/@cspotcode/source-map-support/-/source-map-support-0.8.1.tgz",
|
||||
"integrity": "sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@jridgewell/trace-mapping": "0.3.9"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/@cspotcode/source-map-support/node_modules/@jridgewell/trace-mapping": {
|
||||
"version": "0.3.9",
|
||||
"resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.9.tgz",
|
||||
"integrity": "sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@jridgewell/resolve-uri": "^3.0.3",
|
||||
"@jridgewell/sourcemap-codec": "^1.4.10"
|
||||
}
|
||||
},
|
||||
"node_modules/@electron-forge/cli": {
|
||||
"version": "7.11.1",
|
||||
"resolved": "https://registry.npmjs.org/@electron-forge/cli/-/cli-7.11.1.tgz",
|
||||
|
|
@ -2615,6 +2640,34 @@
|
|||
"node": ">= 10"
|
||||
}
|
||||
},
|
||||
"node_modules/@tsconfig/node10": {
|
||||
"version": "1.0.12",
|
||||
"resolved": "https://registry.npmjs.org/@tsconfig/node10/-/node10-1.0.12.tgz",
|
||||
"integrity": "sha512-UCYBaeFvM11aU2y3YPZ//O5Rhj+xKyzy7mvcIoAjASbigy8mHMryP5cK7dgjlz2hWxh1g5pLw084E0a/wlUSFQ==",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@tsconfig/node12": {
|
||||
"version": "1.0.11",
|
||||
"resolved": "https://registry.npmjs.org/@tsconfig/node12/-/node12-1.0.11.tgz",
|
||||
"integrity": "sha512-cqefuRsh12pWyGsIoBKJA9luFu3mRxCA+ORZvA4ktLSzIuCUtWVxGIuXigEwO5/ywWFMZ2QEGKWvkZG1zDMTag==",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@tsconfig/node14": {
|
||||
"version": "1.0.3",
|
||||
"resolved": "https://registry.npmjs.org/@tsconfig/node14/-/node14-1.0.3.tgz",
|
||||
"integrity": "sha512-ysT8mhdixWK6Hw3i1V2AeRqZ5WfXg1G43mqoYlM2nc6388Fq5jcXyr5mRsqViLx/GJYdoL0bfXD8nmF+Zn/Iow==",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@tsconfig/node16": {
|
||||
"version": "1.0.4",
|
||||
"resolved": "https://registry.npmjs.org/@tsconfig/node16/-/node16-1.0.4.tgz",
|
||||
"integrity": "sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA==",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@types/cacheable-request": {
|
||||
"version": "6.0.3",
|
||||
"resolved": "https://registry.npmjs.org/@types/cacheable-request/-/cacheable-request-6.0.3.tgz",
|
||||
|
|
@ -3248,6 +3301,19 @@
|
|||
"acorn": "^6.0.0 || ^7.0.0 || ^8.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/acorn-walk": {
|
||||
"version": "8.3.4",
|
||||
"resolved": "https://registry.npmjs.org/acorn-walk/-/acorn-walk-8.3.4.tgz",
|
||||
"integrity": "sha512-ueEepnujpqee2o5aIYnvHU6C0A42MNdsIDeqy5BydrkuC5R1ZuUFnm27EeFJGoEHJQgn3uleRvmTXaJgfXbt4g==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"acorn": "^8.11.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=0.4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/agent-base": {
|
||||
"version": "6.0.2",
|
||||
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-6.0.2.tgz",
|
||||
|
|
@ -3389,6 +3455,13 @@
|
|||
"url": "https://github.com/chalk/ansi-styles?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/arg": {
|
||||
"version": "4.1.3",
|
||||
"resolved": "https://registry.npmjs.org/arg/-/arg-4.1.3.tgz",
|
||||
"integrity": "sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA==",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/argparse": {
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz",
|
||||
|
|
@ -4190,6 +4263,13 @@
|
|||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/create-require": {
|
||||
"version": "1.1.1",
|
||||
"resolved": "https://registry.npmjs.org/create-require/-/create-require-1.1.1.tgz",
|
||||
"integrity": "sha512-dcKFX3jn0MpIaXjisoRvexIJVEKzaq7z2rZKxf+MSr9TkdmHmsU4m2lcLojrj/FHl8mk5VxMmYA+ftRkP/3oKQ==",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/cross-dirname": {
|
||||
"version": "0.1.0",
|
||||
"resolved": "https://registry.npmjs.org/cross-dirname/-/cross-dirname-0.1.0.tgz",
|
||||
|
|
@ -4421,6 +4501,16 @@
|
|||
"license": "MIT",
|
||||
"optional": true
|
||||
},
|
||||
"node_modules/diff": {
|
||||
"version": "4.0.4",
|
||||
"resolved": "https://registry.npmjs.org/diff/-/diff-4.0.4.tgz",
|
||||
"integrity": "sha512-X07nttJQkwkfKfvTPG/KSnE2OMdcUCao6+eXF3wmnIQRn2aPAHH3VxDbDOdegkd6JbPsXqShpvEOHfAT+nCNwQ==",
|
||||
"dev": true,
|
||||
"license": "BSD-3-Clause",
|
||||
"engines": {
|
||||
"node": ">=0.3.1"
|
||||
}
|
||||
},
|
||||
"node_modules/dir-compare": {
|
||||
"version": "4.2.0",
|
||||
"resolved": "https://registry.npmjs.org/dir-compare/-/dir-compare-4.2.0.tgz",
|
||||
|
|
@ -7701,6 +7791,13 @@
|
|||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/make-error": {
|
||||
"version": "1.3.6",
|
||||
"resolved": "https://registry.npmjs.org/make-error/-/make-error-1.3.6.tgz",
|
||||
"integrity": "sha512-s8UhlNe7vPKomQhC1qFelMokr/Sc3AgNbso3n74mVPA5LTZwkB9NlXf4XPamLxJE8h0gh73rM94xvwRT2CVInw==",
|
||||
"dev": true,
|
||||
"license": "ISC"
|
||||
},
|
||||
"node_modules/make-fetch-happen": {
|
||||
"version": "10.2.1",
|
||||
"resolved": "https://registry.npmjs.org/make-fetch-happen/-/make-fetch-happen-10.2.1.tgz",
|
||||
|
|
@ -10779,6 +10876,50 @@
|
|||
"typescript": ">=4.8.4"
|
||||
}
|
||||
},
|
||||
"node_modules/ts-node": {
|
||||
"version": "10.9.2",
|
||||
"resolved": "https://registry.npmjs.org/ts-node/-/ts-node-10.9.2.tgz",
|
||||
"integrity": "sha512-f0FFpIdcHgn8zcPSbf1dRevwt047YMnaiJM3u2w2RewrB+fob/zePZcrOyQoLMMO7aBIddLcQIEK5dYjkLnGrQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@cspotcode/source-map-support": "^0.8.0",
|
||||
"@tsconfig/node10": "^1.0.7",
|
||||
"@tsconfig/node12": "^1.0.7",
|
||||
"@tsconfig/node14": "^1.0.0",
|
||||
"@tsconfig/node16": "^1.0.2",
|
||||
"acorn": "^8.4.1",
|
||||
"acorn-walk": "^8.1.1",
|
||||
"arg": "^4.1.0",
|
||||
"create-require": "^1.1.0",
|
||||
"diff": "^4.0.1",
|
||||
"make-error": "^1.1.1",
|
||||
"v8-compile-cache-lib": "^3.0.1",
|
||||
"yn": "3.1.1"
|
||||
},
|
||||
"bin": {
|
||||
"ts-node": "dist/bin.js",
|
||||
"ts-node-cwd": "dist/bin-cwd.js",
|
||||
"ts-node-esm": "dist/bin-esm.js",
|
||||
"ts-node-script": "dist/bin-script.js",
|
||||
"ts-node-transpile-only": "dist/bin-transpile.js",
|
||||
"ts-script": "dist/bin-script-deprecated.js"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@swc/core": ">=1.2.50",
|
||||
"@swc/wasm": ">=1.2.50",
|
||||
"@types/node": "*",
|
||||
"typescript": ">=2.7"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@swc/core": {
|
||||
"optional": true
|
||||
},
|
||||
"@swc/wasm": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/tsconfig-paths": {
|
||||
"version": "3.15.0",
|
||||
"resolved": "https://registry.npmjs.org/tsconfig-paths/-/tsconfig-paths-3.15.0.tgz",
|
||||
|
|
@ -11041,6 +11182,13 @@
|
|||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/v8-compile-cache-lib": {
|
||||
"version": "3.0.1",
|
||||
"resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz",
|
||||
"integrity": "sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/validate-npm-package-license": {
|
||||
"version": "3.0.4",
|
||||
"resolved": "https://registry.npmjs.org/validate-npm-package-license/-/validate-npm-package-license-3.0.4.tgz",
|
||||
|
|
@ -11558,6 +11706,16 @@
|
|||
"fd-slicer": "~1.1.0"
|
||||
}
|
||||
},
|
||||
"node_modules/yn": {
|
||||
"version": "3.1.1",
|
||||
"resolved": "https://registry.npmjs.org/yn/-/yn-3.1.1.tgz",
|
||||
"integrity": "sha512-Ux4ygGWsu2c7isFWe8Yu1YluJmqVhxqK2cLXNQA5AcC3QfbGNpM7fu0Y8b/z16pXLnFxZYvWhd3fhBY9DLmC6Q==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=6"
|
||||
}
|
||||
},
|
||||
"node_modules/yocto-queue": {
|
||||
"version": "0.1.0",
|
||||
"resolved": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-0.1.0.tgz",
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@
|
|||
"eslint": "^9.39.2",
|
||||
"eslint-plugin-import": "^2.32.0",
|
||||
"sass-embedded": "^1.97.3",
|
||||
"ts-node": "^10.9.2",
|
||||
"typescript": "^5.9.3",
|
||||
"vite": "^7.3.1"
|
||||
},
|
||||
|
|
|
|||
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
The syntax is designed to be easy to parse with a recursive-descent parser or parser combinators; it is not necessarily designed for intuitive human use.
|
||||
It is inspired by syntax like LISP, but instead of `(f a b c)` we'd write `f(a, b, c)`.
|
||||
Note that we allow fairly permissive identifiers like
|
||||
|
|
@ -32,21 +31,28 @@ $
|
|||
@
|
||||
.
|
||||
,
|
||||
|
|
||||
(
|
||||
)
|
||||
{
|
||||
}
|
||||
```
|
||||
|
||||
Then there are keywords, which can occur as substrings, but can't just be the keywords (TODO: Not sure if perhaps `|` we could allow as a keyword
|
||||
Then there are keywords. A keyword may appear as a substring of an identifier, but an identifier cannot consist of exactly a keyword.
|
||||
```
|
||||
let
|
||||
fn
|
||||
match
|
||||
=
|
||||
|
|
||||
```
|
||||
Note that `=` and `|` are treated as keywords, not symbols. So these can occur in identifiers.
|
||||
|
||||
This has strange consequences, for example
|
||||
```
|
||||
x=123
|
||||
```
|
||||
is a valid identifier (since there are no spaces) - and this may cause huge confusion for the user when using record syntax or pattern matching on records.
|
||||
Even though this is borderline evil, I've decided for simplicity's sake to accept this. I'll just have to make sure syntax errors are really good.
|
||||
|
||||
Examples of expressions in the language (except the top-level function definitions which are here just for illustration)
|
||||
```
|
||||
|
|
@ -242,7 +248,5 @@ So for example when parsing `#foo 123`, we parse the tag, the remainder is `123`
|
|||
so we conclude that this has to be a tagged value we're parsing, so the next thing is an expression.
|
||||
But for a case like `#foo, ...`, we see that the next token is `,`, so we must have just parsed a tag - not a tagged value.
|
||||
|
||||
|
||||
|
||||
TODO:
|
||||
Also:
|
||||
We don't yet have any convenient syntax for creating new records from old (like an immutable update syntax).
|
||||
|
|
|
|||
169
src/parser/scanner.test.ts
Normal file
169
src/parser/scanner.test.ts
Normal file
|
|
@ -0,0 +1,169 @@
|
|||
import { SourceText } from "./source_text";
|
||||
import { Scanner, Token } from "./scanner";
|
||||
|
||||
// === Test Harness Utilities ===
|
||||
const RED = "\x1b[31m";
|
||||
const GREEN = "\x1b[32m";
|
||||
const RESET = "\x1b[0m";
|
||||
|
||||
/**
 * Minimal assertion helper for this hand-rolled test harness.
 * Throws an `Error` carrying `message` when `condition` is false.
 */
function assert(condition: boolean, message: string) {
  if (condition) return;
  throw new Error(message);
}
|
||||
|
||||
/**
 * Asserts that `token` has the expected tag and, when `expectedValue` is
 * given, the expected payload. The payload is read from `value` when the
 * token has that property, otherwise from `text`.
 */
function assertToken(token: Token, expectedTag: string, expectedValue?: any) {
  assert(token.tag === expectedTag, `Expected tag '${expectedTag}', got '${token.tag}'`);
  if (expectedValue === undefined) return;

  let actualValue;
  if ("value" in token) {
    actualValue = token.value;
  } else if ("text" in token) {
    actualValue = token.text;
  }
  assert(actualValue === expectedValue, `Expected value '${expectedValue}', got '${actualValue}'`);
}
|
||||
|
||||
// TODO: Rewrite this once `scanToken()` is implemented.
/**
 * Scans a single token from `source` with a fresh scanner.
 *
 * There is no token dispatcher yet, so this reaches the scanner's private
 * `scanString` / `scanNumber` methods through an `any` cast: sources whose
 * trimmed text starts with a double quote are scanned as strings, everything
 * else as a number.
 */
function scanOne(source: string): Token {
  const scanner = new Scanner(new SourceText(source)) as any;
  const looksLikeString = source.trim().startsWith('"');
  return looksLikeString ? scanner.scanString() : scanner.scanNumber();
}
|
||||
|
||||
/** Positive and negative integer literals scan to `number` tokens. */
function test_integers() {
  const cases: Array<[string, number]> = [
    ["123", 123],
    ["-500", -500],
  ];
  for (const [source, expected] of cases) {
    assertToken(scanOne(source), "number", expected);
  }

  console.log(`${GREEN}✔ Integers passed${RESET}`);
}
|
||||
|
||||
/** Literals with a fractional part scan to `number` tokens. */
function test_floats() {
  const cases: Array<[string, number]> = [
    ["3.14159", 3.14159],
    ["-0.001", -0.001],
  ];
  for (const [source, expected] of cases) {
    assertToken(scanOne(source), "number", expected);
  }

  console.log(`${GREEN}✔ Floats passed${RESET}`);
}
|
||||
|
||||
/**
 * Malformed number literals must be rejected with the right error tag.
 *
 * The thrown value is captured first and inspected afterwards, so that a
 * scanner which wrongly accepts the input is reported as "Should have thrown"
 * instead of that message being swallowed by the test's own `catch`.
 */
function test_number_errors() {
  // Returns whatever scanOne throws for `source`, or undefined if it returns normally.
  const thrownBy = (source: string): unknown => {
    try {
      scanOne(source);
    } catch (e: unknown) {
      return e;
    }
    return undefined;
  };

  // Reads the `tag` property of a thrown value, if it has one.
  const tagOf = (e: unknown): unknown =>
    typeof e === "object" && e !== null && "tag" in e ? (e as { tag: unknown }).tag : undefined;

  // Trailing dot: digits are required after '.'
  const trailingDot = thrownBy("1.");
  assert(trailingDot !== undefined, "Should have thrown error for '1.'");
  assert(tagOf(trailingDot) === "InvalidNumber", "Expected InvalidNumber error for '1.'");

  // No leading digit before '.'
  const leadingDot = thrownBy(".5");
  assert(leadingDot !== undefined, "Should have thrown error for '.5'");
  assert(tagOf(leadingDot) === "ExpectedNumber", "Expected ExpectedNumber error for '.5'");

  console.log(`${GREEN}✔ Number errors passed${RESET}`);
}
|
||||
|
||||
/** Plain and empty string literals scan to `string` tokens. */
function test_basic_strings() {
  const cases: Array<[string, string]> = [
    ['"hello world"', "hello world"],
    ['""', ""], // Empty string
  ];
  for (const [source, expected] of cases) {
    assertToken(scanOne(source), "string", expected);
  }

  console.log(`${GREEN}✔ Basic strings passed${RESET}`);
}
|
||||
|
||||
/** Backslash escapes (\n, \t, \", \\) are decoded inside string literals. */
function test_string_escapes() {
  const cases: Array<[string, string]> = [
    ['"line1\\nline2"', "line1\nline2"],
    ['"col1\\tcol2"', "col1\tcol2"],
    ['"quote: \\" slash: \\\\"', 'quote: " slash: \\'],
  ];
  for (const [source, expected] of cases) {
    assertToken(scanOne(source), "string", expected);
  }

  console.log(`${GREEN}✔ String escapes passed${RESET}`);
}
|
||||
|
||||
/** `\u{...}` escapes are decoded to the corresponding code points. */
function test_unicode_escapes() {
  // Rocket emoji: 🚀 (U+1F680)
  const t1 = scanOne('"\\u{1F680}"');
  assertToken(t1, "string", "🚀");

  // Two escapes: A (U+41) and B (U+42)
  const t2 = scanOne('"\\u{41}\\u{42}"');
  assertToken(t2, "string", "AB");

  // Max valid unicode: previously only scanned, now the decoded value is checked too.
  const t3 = scanOne('"\\u{10FFFF}"');
  assertToken(t3, "string", String.fromCodePoint(0x10FFFF));

  console.log(`${GREEN}✔ Unicode escapes passed${RESET}`);
}
|
||||
|
||||
/**
 * Line numbers advance correctly across mixed line endings.
 *
 * Input: `123` + CRLF, `456` + LF, `"foo"`. One scanner is driven by hand
 * (private methods via an `any` cast) because `scanOne` creates a new scanner
 * per call and there is no token dispatcher to skip whitespace for us.
 */
function test_line_counting() {
  const scanner = new Scanner(new SourceText("123\r\n456\n\"foo\"")) as any;

  const first = scanner.scanNumber();
  assert(first.value === 123, "Line 1 value wrong");
  assert(first.span.line === 1, "Line 1 line# wrong");

  scanner.skipWhitespace();

  const second = scanner.scanNumber();
  assert(second.value === 456, "Line 2 value wrong");
  assert(second.span.line === 2, "Line 2 line# wrong");

  scanner.skipWhitespace();

  const third = scanner.scanString();
  assert(third.text === "foo", "Line 3 value wrong");
  assert(third.span.line === 3, "Line 3 line# wrong");

  console.log(`${GREEN}✔ Line counting passed${RESET}`);
}
|
||||
|
||||
// === Run All ===
|
||||
/**
 * Runs every scanner test in order. The first failure is printed and the
 * process exits with status 1; otherwise a success banner is printed.
 */
function run_all_tests() {
  console.log("Running Scanner Tests...\n");

  const suite = [
    test_integers,
    test_floats,
    test_number_errors,
    test_basic_strings,
    test_string_escapes,
    test_unicode_escapes,
    test_line_counting,
  ];

  try {
    for (const runTest of suite) {
      runTest();
    }
    console.log(`\n${GREEN}ALL TESTS PASSED${RESET}`);
  } catch (e: any) {
    console.error(`\n${RED}TEST FAILED:${RESET}`);
    console.error(e.message || e);
    process.exit(1);
  }
}
|
||||
|
||||
run_all_tests();
|
||||
368
src/parser/scanner.ts
Normal file
368
src/parser/scanner.ts
Normal file
|
|
@ -0,0 +1,368 @@
|
|||
|
||||
import { char, isWhitespace, isDigit } from './source_text';
|
||||
import type { SourceText, Span, SourceLocation, CodePoint, StringIndex, CodePointIndex } from './source_text';
|
||||
|
||||
/** True when `c` is one of the single-character symbols: # $ @ ( ) { } , . */
function isSymbolChar(c: CodePoint): boolean {
  switch (c) {
    case char("#"):
    case char("$"):
    case char("@"):
    case char("("):
    case char(")"):
    case char("{"):
    case char("}"):
    case char(","):
    case char("."):
      return true;
    default:
      return false;
  }
}
|
||||
|
||||
/** An identifier character is anything that is neither whitespace nor a symbol character. */
function isIdentifierChar(char: CodePoint): boolean {
  return !(isWhitespace(char) || isSymbolChar(char));
}
|
||||
|
||||
/** Text carried by a `keyword` token. `=` and `|` are listed here as keywords rather than in `Symbol`. */
export type Keyword = "let" | "fn" | "match" | "apply" | "=" | "|";
|
||||
/**
 * Text carried by a `symbol` token.
 * NOTE(review): this name shadows the built-in `Symbol` type inside this module — confirm that is intended.
 */
export type Symbol = "#" | "$" | "@" | "(" | ")" | "{" | "}" | "," | ".";
|
||||
|
||||
// === Scanner ===
|
||||
/** A lexical token, discriminated by `tag`; every variant carries its source `span`. */
export type Token =
  | { tag: "number"; value: number; span: Span }
  | { tag: "string"; text: string; span: Span }
  | { tag: "identifier"; text: string; span: Span }
  | { tag: "keyword"; kw: Keyword; span: Span }
  | { tag: "symbol"; sym: Symbol; span: Span }
  | { tag: "EOF"; span: Span };
|
||||
|
||||
/** Token shapes without source positions; symbol and keyword kinds carry the specific text in `value`. */
export namespace TokenKind {
  export type T =
    | { tag: "number" }
    | { tag: "string" }
    | { tag: "identifier" }
    | { tag: "symbol"; value: Symbol }
    | { tag: "keyword"; value: Keyword }
    | { tag: "EOF" };
}
|
||||
|
||||
/** Errors thrown by the scanner, discriminated by `tag`; `span` locates the offending text. */
export type LexError =
  | { tag: "UnexpectedCharacter"; char: CodePoint; span: Span }
  | { tag: "UnexpectedEOF"; span: Span }
  | { tag: "ExpectedNumber"; span: Span }
  | { tag: "InvalidNumber"; text: string; reason: string; span: Span }
  | { tag: "InvalidEscape"; reason: string; span: Span };
|
||||
|
||||
/**
 * Hand-written scanner over a `SourceText`.
 *
 * The scanner walks the text one code point at a time and tracks a 1-based
 * line and column. CR, LF and CRLF each count as a single line break.
 * Failures are reported by throwing a `LexError` object (not an `Error`).
 */
export class Scanner {
  /** Index of the next unread code point. */
  private i: CodePointIndex = 0;
  private line = 1;
  private column = 1;

  // Track previous char to handle \r\n correctly
  private lastCharWasCR = false;

  constructor(private readonly text: SourceText) {}

  /** True once every code point has been consumed. */
  eof(): boolean {
    return this.i >= this.text.length;
  }

  /** Returns the code point `n` positions ahead without consuming it, or undefined past the end. */
  private peek(n: number = 0): CodePoint | undefined {
    return this.text.chars[this.i + n]?.char;
  }

  /** Consumes one code point, updating line/column; returns undefined at end of input. */
  private next(): CodePoint | undefined {
    const ref = this.text.chars[this.i];
    if (!ref) return undefined;

    const c = ref.char;
    this.i++;

    if (c === 0x0A /* \n */) {
      // An LF directly after a CR is the second half of CRLF: the line was
      // already bumped when the CR was consumed.
      if (!this.lastCharWasCR) {
        this.line++;
        this.column = 1;
      }
      this.lastCharWasCR = false;
    } else if (c === 0x0D /* \r */) {
      this.line++;
      this.column = 1;
      this.lastCharWasCR = true;
    } else {
      this.column++;
      this.lastCharWasCR = false;
    }

    return c;
  }

  /** String offset of the next unread code point, or the source length at end of input. */
  private currentOffset(): StringIndex {
    return this.text.chars[this.i]?.offset ?? this.text.source.length;
  }

  /** Snapshot of the current position. */
  private currentLocation(): SourceLocation {
    return { index: this.i, line: this.line, column: this.column };
  }

  /** Builds a span from `start` up to (not including) the current position; line/column are those of `start`. */
  private makeSpan(start: SourceLocation): Span {
    const startOffset =
      this.text.chars[start.index]?.offset ?? this.text.source.length;

    return {
      start: startOffset,
      end: this.currentOffset(),
      line: start.line,
      column: start.column,
    };
  }

  /** Identity helper so thrown object literals are type-checked against `LexError`. */
  private lexError(error: LexError): LexError {
    return error;
  }

  /** Consumes code points while `pred` holds; returns how many were consumed. */
  private consumeWhile(pred: (c: CodePoint) => boolean): number {
    let count = 0;
    for (let c = this.peek(); c !== undefined && pred(c); c = this.peek()) {
      this.next();
      count++;
    }
    return count;
  }

  /** Consumes and returns the next code point if it satisfies `pred`; otherwise throws `error`. */
  private expect(
    pred: (c: CodePoint) => boolean,
    error: LexError
  ): CodePoint {
    const c = this.peek();
    if (c === undefined || !pred(c)) {
      throw error;
    }
    this.next();
    return c;
  }

  // Helper to check for exact char matches quickly
  private match(c: CodePoint): boolean {
    if (this.peek() === c) {
      this.next();
      return true;
    }
    return false;
  }

  private skipWhitespace() {
    this.consumeWhile(isWhitespace);
  }

  // === Main Scanners ===

  /**
   * Scans a number literal.
   *
   *   number := optional(`-`) digits optional(`.` digits)
   *
   * @throws LexError `ExpectedNumber` when no integer digits are present;
   *   `InvalidNumber` when a `.` is not followed by digits or the value is
   *   not finite.
   */
  private scanNumber(): Token {
    const start = this.currentLocation();

    // 1. Optional sign
    this.match(char("-"));

    // 2. Integer part
    if (this.consumeWhile(isDigit) === 0) {
      throw this.lexError({ tag: "ExpectedNumber", span: this.makeSpan(start) });
    }

    // 3. Fractional part: a '.' must be followed by at least one digit
    const dotLocation = this.currentLocation();
    if (this.match(char(".")) && this.consumeWhile(isDigit) === 0) {
      throw this.lexError({
        tag: "InvalidNumber",
        text: this.text.sliceByCp(start.index, this.i),
        reason: "MissingFractionalDigits",
        span: this.makeSpan(dotLocation),
      });
    }

    const text = this.text.sliceByCp(start.index, this.i);
    const value = Number(text);

    if (!Number.isFinite(value)) {
      throw this.lexError({
        tag: "InvalidNumber",
        text,
        reason: "NotFinite",
        span: this.makeSpan(start),
      });
    }

    return { tag: "number", value, span: this.makeSpan(start) };
  }

  /**
   * Scans a double-quoted string literal and decodes its escapes.
   *
   * @throws LexError `UnexpectedEOF` when the input ends before the opening
   *   or closing quote; `UnexpectedCharacter` when the next code point is not
   *   `"`; `InvalidEscape` for a malformed escape sequence.
   */
  private scanString(): Token {
    const start = this.currentLocation();

    const opening = this.peek();
    if (opening === undefined) {
      throw this.lexError({ tag: "UnexpectedEOF", span: this.makeSpan(start) });
    }
    this.expect(c => c === char('"'), {
      tag: "UnexpectedCharacter",
      char: opening,
      span: this.makeSpan(start),
    });

    let value = ""; // The decoded string content

    for (;;) {
      const c = this.peek();
      if (c === undefined) {
        throw this.lexError({ tag: "UnexpectedEOF", span: this.makeSpan(start) });
      }

      if (c === char('"')) {
        this.next(); // consume closing quote
        break;
      }

      if (c === char("\\")) {
        value += this.scanEscape();
      } else {
        // Regular character. Char-by-char is fine for now.
        this.next();
        value += String.fromCodePoint(c);
      }
    }

    return { tag: "string", text: value, span: this.makeSpan(start) };
  }

  /**
   * Scans one escape sequence, starting at the backslash, and returns the
   * text it denotes. Every branch consumes the escape's designator character.
   */
  private scanEscape(): string {
    const escapeStart = this.currentLocation();
    this.next(); // consume backslash

    switch (this.peek()) {
      case char("n"): this.next(); return "\n";
      case char("r"): this.next(); return "\r";
      case char("t"): this.next(); return "\t";
      case char("\\"): this.next(); return "\\";
      // Previously the '0' was not consumed, so it was appended again as a literal "0".
      case char("0"): this.next(); return "\0";
      case char('"'): this.next(); return '"';
      // Previously a stub that fell through into the \u branch; now a vertical tab.
      case char("v"): this.next(); return "\v";
      // Unicode Escape: \u{XXXX}
      case char("u"): this.next(); return this.scanUnicodeEscapeBody();
      default:
        throw this.lexError({
          tag: "InvalidEscape",
          reason: `UnknownEscapeSequence`,
          span: this.makeSpan(escapeStart),
        });
    }
  }

  /** Scans the `{hex}` part of a `\u{hex}` escape (the `\u` is already consumed) and returns the character. */
  private scanUnicodeEscapeBody(): string {
    // 1. Expect '{'
    const braceStart = this.currentLocation();
    if (!this.match(char("{"))) {
      throw this.lexError({ tag: "InvalidEscape", reason: "Expected '{' after \\u", span: this.makeSpan(braceStart) });
    }

    // 2. Consume hex digits
    const hexStart = this.i;
    const hexCount = this.consumeWhile(Scanner.isHexDigit);
    if (hexCount === 0) {
      throw this.lexError({ tag: "InvalidEscape", reason: "Expected hex digits in \\u{...}", span: this.makeSpan(braceStart) });
    }
    const hexStr = this.text.sliceByCp(hexStart, this.i);

    // 3. Expect '}'
    if (!this.match(char("}"))) {
      throw this.lexError({ tag: "InvalidEscape", reason: "Expected '}' closing unicode escape", span: this.makeSpan(braceStart) });
    }

    // 4. Convert
    const codePoint = parseInt(hexStr, 16);
    if (codePoint > 0x10FFFF) {
      throw this.lexError({ tag: "InvalidEscape", reason: "Invalid Unicode Code Point (max 0x10FFFF)", span: this.makeSpan(braceStart) });
    }
    return String.fromCodePoint(codePoint);
  }

  /** True for 0-9, a-f and A-F. */
  private static isHexDigit(c: CodePoint): boolean {
    return (
      (c >= char("0") && c <= char("9")) ||
      (c >= char("a") && c <= char("f")) ||
      (c >= char("A") && c <= char("F"))
    );
  }
}
|
||||
|
||||
|
||||
// TODO: Need a Token to TokenKind function
|
||||
// TODO: Need is_start_of_expression(token): boolean
|
||||
// identifier -> true
|
||||
// symbol # -> true
|
||||
// symbol $ -> true
|
||||
// symbol @ -> true
|
||||
// symbol ( -> true
|
||||
// symbol { -> true // this is actually context dependent. Sometimes its a start of a binding context { params . body } or { let-params . body }, and sometimes it is a record. But this function is gonna be used only in the first context
|
||||
// symbol _ -> false
|
||||
// number -> true
|
||||
// string -> true
|
||||
// keyword let -> true
|
||||
// keyword fn -> true
|
||||
// keyword apply -> true
|
||||
// keyword = -> false
|
||||
// keyword | -> false
|
||||
// EOF -> false
|
||||
//
|
||||
// TODO: function that matches a token with a token_type (returns bool)
|
||||
|
||||
// TODO: forbidden characters are
|
||||
// '('
|
||||
// ')'
|
||||
// '{'
|
||||
// '}'
|
||||
// '.'
|
||||
// ','
|
||||
// '|'
|
||||
// '$'
|
||||
// '#'
|
||||
// '@'
|
||||
// '"'
|
||||
// ' '
|
||||
// '\r'
|
||||
// '\t'
|
||||
// '\n'
|
||||
// TODO: need function is_forbidden_char
|
||||
|
||||
|
||||
|
||||
// === scanner functions ===
|
||||
// TODO: whitespace - consumes whitespace
|
||||
// TODO: comment - consumes token
|
||||
// TODO: raw_identifier - consumes raw identifier - then we can decide whether that was a keyword or an identifier
|
||||
// TODO: string - consumes string like "foo bar\njfjdsajfksd"
|
||||
// TODO: number - consumes number like 123123 or 000123 or 23919233.123
|
||||
//
|
||||
// TODO: token - gives next token
|
||||
|
||||
125
src/parser/source_text.ts
Normal file
125
src/parser/source_text.ts
Normal file
|
|
@ -0,0 +1,125 @@
|
|||
|
||||
// === Char type ===
|
||||
export type StringIndex = number; // UTF-16 index into string
|
||||
export type CodePointIndex = number; // index into array of code-points
|
||||
export type CodePoint = number; // could also name it `UnicodeCodePoint`. Basically for `s: string` we have `s.codePointAt(i: index): char`.
|
||||
|
||||
/**
 * Returns the Unicode code point of the first character of `c`.
 *
 * @param c a non-empty string; only its first code point is read
 * @returns the code point at index 0
 * @throws RangeError when `c` is empty (previously `undefined` leaked out as a `CodePoint`)
 */
export function char(c: string): CodePoint {
  const cp = c.codePointAt(0);
  if (cp === undefined) {
    throw new RangeError("char() requires a non-empty string");
  }
  return cp;
}
|
||||
|
||||
/** One code point of the source together with the string offset it was read from. */
export type CodePointRef = {
  char: CodePoint;
  offset: StringIndex;
};
|
||||
|
||||
// === Source Text ===
|
||||
/**
 * A source string pre-split into Unicode code points.
 *
 * The input is NFC-normalized first; `source` holds the normalized text and
 * every `offset` in `chars` is an index into that normalized string.
 */
export class SourceText {
  readonly source: string;
  // TODO: Later you can try to change this to two `Uint32Array`s - one for codepoints (each 20 bit but whatever), the other for pointers to original string.
  //
  readonly chars: CodePointRef[];

  // Stores the CodePointIndex where each line begins
  readonly lineStarts: CodePointIndex[];

  /**
   * @param rawSource text to index; it is NFC-normalized before indexing
   */
  constructor(rawSource: string) {
    const source = rawSource.normalize('NFC');

    this.source = source;
    this.chars = [];
    this.lineStarts = [0]; // Line 1 always starts at index 0

    let offset = 0;
    while (offset < source.length) {
      const cp = source.codePointAt(offset);
      if (cp === undefined) break; // cannot happen while offset < length
      const cpIndex = this.chars.length;
      this.chars.push({ char: cp, offset });

      // Code points above the BMP occupy two UTF-16 units.
      offset += cp > 0xFFFF ? 2 : 1;

      // === Newline Logic ===
      if (cp === 0x0A /* \n */) {
        // Found a newline, the NEXT char starts a new line
        this.lineStarts.push(cpIndex + 1);
      } else if (cp === 0x0D /* \r */) {
        // A CR directly followed by LF is CRLF: the LF records the line start
        // on the next iteration. A lone CR is a line break by itself.
        if (source.codePointAt(offset) !== 0x0A) {
          this.lineStarts.push(cpIndex + 1);
        }
      }
    }
  }

  /** Number of code points in the source. */
  get length(): number {
    return this.chars.length;
  }

  /**
   * Returns the text between two code point indices.
   *
   * @param start first code point index (inclusive)
   * @param end code point index to stop at (exclusive); values past the end
   *   are treated as end of source
   * @returns the slice, or "" when `start` is out of bounds or `end <= start`
   */
  sliceByCp(start: number, end: number): string {
    const startRef = this.chars[start];
    // Handle out of bounds gracefully
    if (!startRef || end <= start) return "";

    const endOff = this.chars[end]?.offset ?? this.source.length;
    return this.source.slice(startRef.offset, endOff);
  }

  // Converts a linear Code Point Index into SourceLocation
  // getLocation(index: CodePointIndex): SourceLocation {
  //   // TODO: can be implemented either by a linear or binary search.
  //   return (0 as any);
  // }

  /**
   * Returns the text of a line without its line terminator.
   *
   * @param line 1-based line number
   * @returns the line's text, or "" when `line` is out of range
   */
  getLineText(line: number): string {
    const lineIndex = line - 1;
    if (lineIndex < 0 || lineIndex >= this.lineStarts.length) return "";

    const startCp = this.lineStarts[lineIndex];
    if (startCp === undefined) return "";

    const nextStart = this.lineStarts[lineIndex + 1];
    if (nextStart === undefined) {
      // Last line: there is no terminator to strip.
      return this.sliceByCp(startCp, this.chars.length);
    }

    // The code point just before the next line start is the terminating LF or lone CR.
    let endCp = nextStart - 1;
    // For CRLF also drop the CR, which the original left at the end of the line.
    if (
      endCp > startCp &&
      this.chars[endCp]?.char === 0x0A &&
      this.chars[endCp - 1]?.char === 0x0D
    ) {
      endCp--;
    }

    return this.sliceByCp(startCp, endCp);
  }
}
|
||||
|
||||
/** Convenience factory: wraps `s` in a new `SourceText`. */
export function sourceText(s: string) {
  const wrapped = new SourceText(s);
  return wrapped;
}
|
||||
|
||||
|
||||
/** True for space, horizontal tab, line feed and carriage return. */
export function isWhitespace(char: CodePoint): boolean {
  switch (char) {
    case 0x20: // space
    case 0x09: // horizontal tab
    case 0x0A: // line feed
    case 0x0D: // carriage return
      return true;
    default:
      return false;
  }
}
|
||||
|
||||
/** True for the ASCII digits 0-9. */
export function isDigit(char: CodePoint): boolean {
  const zero = 0x30;
  const nine = 0x39;
  return zero <= char && char <= nine;
}
|
||||
|
||||
/** A range of the source as string offsets, plus a line and column. */
export type Span = {
  start: StringIndex;
  end: StringIndex;
  line: number;
  column: number;
};
|
||||
|
||||
/** A position in the source: code point index plus 1-based line and column. */
export type SourceLocation = {
  index: CodePointIndex;
  /** 1-based */
  line: number;
  /** 1-based */
  column: number;
};
|
||||
|
||||
|
|
@ -10,3 +10,8 @@ npm install electron-squirrel-startup
|
|||
npm install -D sass-embedded
|
||||
|
||||
|
||||
# Tests
|
||||
|
||||
npx ts-node src/parser/scanner.test.ts
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue