From eb6ade5a3deb4a75b313fe672aea41542ea115cb Mon Sep 17 00:00:00 2001 From: Yura Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Thu, 5 Feb 2026 13:21:44 +0100 Subject: [PATCH] Sketch basic scanner --- package-lock.json | 158 ++++++++++++++++ package.json | 1 + src/SYNTAX.md | 16 +- src/parser/scanner.test.ts | 169 +++++++++++++++++ src/parser/scanner.ts | 368 +++++++++++++++++++++++++++++++++++++ src/parser/source_text.ts | 125 +++++++++++++ tmp_repl/tmp_repl.md | 5 + 7 files changed, 836 insertions(+), 6 deletions(-) create mode 100644 src/parser/scanner.test.ts create mode 100644 src/parser/scanner.ts create mode 100644 src/parser/source_text.ts diff --git a/package-lock.json b/package-lock.json index 0228ac6..b1241a5 100644 --- a/package-lock.json +++ b/package-lock.json @@ -28,6 +28,7 @@ "eslint": "^9.39.2", "eslint-plugin-import": "^2.32.0", "sass-embedded": "^1.97.3", + "ts-node": "^10.9.2", "typescript": "^5.9.3", "vite": "^7.3.1" } @@ -39,6 +40,30 @@ "dev": true, "license": "(Apache-2.0 AND BSD-3-Clause)" }, + "node_modules/@cspotcode/source-map-support": { + "version": "0.8.1", + "resolved": "https://registry.npmjs.org/@cspotcode/source-map-support/-/source-map-support-0.8.1.tgz", + "integrity": "sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@jridgewell/trace-mapping": "0.3.9" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/@cspotcode/source-map-support/node_modules/@jridgewell/trace-mapping": { + "version": "0.3.9", + "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.9.tgz", + "integrity": "sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@jridgewell/resolve-uri": "^3.0.3", + "@jridgewell/sourcemap-codec": "^1.4.10" + } + }, "node_modules/@electron-forge/cli": { "version": "7.11.1", 
"resolved": "https://registry.npmjs.org/@electron-forge/cli/-/cli-7.11.1.tgz", @@ -2615,6 +2640,34 @@ "node": ">= 10" } }, + "node_modules/@tsconfig/node10": { + "version": "1.0.12", + "resolved": "https://registry.npmjs.org/@tsconfig/node10/-/node10-1.0.12.tgz", + "integrity": "sha512-UCYBaeFvM11aU2y3YPZ//O5Rhj+xKyzy7mvcIoAjASbigy8mHMryP5cK7dgjlz2hWxh1g5pLw084E0a/wlUSFQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/@tsconfig/node12": { + "version": "1.0.11", + "resolved": "https://registry.npmjs.org/@tsconfig/node12/-/node12-1.0.11.tgz", + "integrity": "sha512-cqefuRsh12pWyGsIoBKJA9luFu3mRxCA+ORZvA4ktLSzIuCUtWVxGIuXigEwO5/ywWFMZ2QEGKWvkZG1zDMTag==", + "dev": true, + "license": "MIT" + }, + "node_modules/@tsconfig/node14": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/@tsconfig/node14/-/node14-1.0.3.tgz", + "integrity": "sha512-ysT8mhdixWK6Hw3i1V2AeRqZ5WfXg1G43mqoYlM2nc6388Fq5jcXyr5mRsqViLx/GJYdoL0bfXD8nmF+Zn/Iow==", + "dev": true, + "license": "MIT" + }, + "node_modules/@tsconfig/node16": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/@tsconfig/node16/-/node16-1.0.4.tgz", + "integrity": "sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/cacheable-request": { "version": "6.0.3", "resolved": "https://registry.npmjs.org/@types/cacheable-request/-/cacheable-request-6.0.3.tgz", @@ -3248,6 +3301,19 @@ "acorn": "^6.0.0 || ^7.0.0 || ^8.0.0" } }, + "node_modules/acorn-walk": { + "version": "8.3.4", + "resolved": "https://registry.npmjs.org/acorn-walk/-/acorn-walk-8.3.4.tgz", + "integrity": "sha512-ueEepnujpqee2o5aIYnvHU6C0A42MNdsIDeqy5BydrkuC5R1ZuUFnm27EeFJGoEHJQgn3uleRvmTXaJgfXbt4g==", + "dev": true, + "license": "MIT", + "dependencies": { + "acorn": "^8.11.0" + }, + "engines": { + "node": ">=0.4.0" + } + }, "node_modules/agent-base": { "version": "6.0.2", "resolved": 
"https://registry.npmjs.org/agent-base/-/agent-base-6.0.2.tgz", @@ -3389,6 +3455,13 @@ "url": "https://github.com/chalk/ansi-styles?sponsor=1" } }, + "node_modules/arg": { + "version": "4.1.3", + "resolved": "https://registry.npmjs.org/arg/-/arg-4.1.3.tgz", + "integrity": "sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA==", + "dev": true, + "license": "MIT" + }, "node_modules/argparse": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", @@ -4190,6 +4263,13 @@ "dev": true, "license": "MIT" }, + "node_modules/create-require": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/create-require/-/create-require-1.1.1.tgz", + "integrity": "sha512-dcKFX3jn0MpIaXjisoRvexIJVEKzaq7z2rZKxf+MSr9TkdmHmsU4m2lcLojrj/FHl8mk5VxMmYA+ftRkP/3oKQ==", + "dev": true, + "license": "MIT" + }, "node_modules/cross-dirname": { "version": "0.1.0", "resolved": "https://registry.npmjs.org/cross-dirname/-/cross-dirname-0.1.0.tgz", @@ -4421,6 +4501,16 @@ "license": "MIT", "optional": true }, + "node_modules/diff": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/diff/-/diff-4.0.4.tgz", + "integrity": "sha512-X07nttJQkwkfKfvTPG/KSnE2OMdcUCao6+eXF3wmnIQRn2aPAHH3VxDbDOdegkd6JbPsXqShpvEOHfAT+nCNwQ==", + "dev": true, + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.3.1" + } + }, "node_modules/dir-compare": { "version": "4.2.0", "resolved": "https://registry.npmjs.org/dir-compare/-/dir-compare-4.2.0.tgz", @@ -7701,6 +7791,13 @@ "node": ">=12" } }, + "node_modules/make-error": { + "version": "1.3.6", + "resolved": "https://registry.npmjs.org/make-error/-/make-error-1.3.6.tgz", + "integrity": "sha512-s8UhlNe7vPKomQhC1qFelMokr/Sc3AgNbso3n74mVPA5LTZwkB9NlXf4XPamLxJE8h0gh73rM94xvwRT2CVInw==", + "dev": true, + "license": "ISC" + }, "node_modules/make-fetch-happen": { "version": "10.2.1", "resolved": "https://registry.npmjs.org/make-fetch-happen/-/make-fetch-happen-10.2.1.tgz", @@ 
-10779,6 +10876,50 @@ "typescript": ">=4.8.4" } }, + "node_modules/ts-node": { + "version": "10.9.2", + "resolved": "https://registry.npmjs.org/ts-node/-/ts-node-10.9.2.tgz", + "integrity": "sha512-f0FFpIdcHgn8zcPSbf1dRevwt047YMnaiJM3u2w2RewrB+fob/zePZcrOyQoLMMO7aBIddLcQIEK5dYjkLnGrQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@cspotcode/source-map-support": "^0.8.0", + "@tsconfig/node10": "^1.0.7", + "@tsconfig/node12": "^1.0.7", + "@tsconfig/node14": "^1.0.0", + "@tsconfig/node16": "^1.0.2", + "acorn": "^8.4.1", + "acorn-walk": "^8.1.1", + "arg": "^4.1.0", + "create-require": "^1.1.0", + "diff": "^4.0.1", + "make-error": "^1.1.1", + "v8-compile-cache-lib": "^3.0.1", + "yn": "3.1.1" + }, + "bin": { + "ts-node": "dist/bin.js", + "ts-node-cwd": "dist/bin-cwd.js", + "ts-node-esm": "dist/bin-esm.js", + "ts-node-script": "dist/bin-script.js", + "ts-node-transpile-only": "dist/bin-transpile.js", + "ts-script": "dist/bin-script-deprecated.js" + }, + "peerDependencies": { + "@swc/core": ">=1.2.50", + "@swc/wasm": ">=1.2.50", + "@types/node": "*", + "typescript": ">=2.7" + }, + "peerDependenciesMeta": { + "@swc/core": { + "optional": true + }, + "@swc/wasm": { + "optional": true + } + } + }, "node_modules/tsconfig-paths": { "version": "3.15.0", "resolved": "https://registry.npmjs.org/tsconfig-paths/-/tsconfig-paths-3.15.0.tgz", @@ -11041,6 +11182,13 @@ "dev": true, "license": "MIT" }, + "node_modules/v8-compile-cache-lib": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz", + "integrity": "sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==", + "dev": true, + "license": "MIT" + }, "node_modules/validate-npm-package-license": { "version": "3.0.4", "resolved": "https://registry.npmjs.org/validate-npm-package-license/-/validate-npm-package-license-3.0.4.tgz", @@ -11558,6 +11706,16 @@ "fd-slicer": "~1.1.0" } }, + "node_modules/yn": { + "version": 
"3.1.1", + "resolved": "https://registry.npmjs.org/yn/-/yn-3.1.1.tgz", + "integrity": "sha512-Ux4ygGWsu2c7isFWe8Yu1YluJmqVhxqK2cLXNQA5AcC3QfbGNpM7fu0Y8b/z16pXLnFxZYvWhd3fhBY9DLmC6Q==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/yocto-queue": { "version": "0.1.0", "resolved": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-0.1.0.tgz", diff --git a/package.json b/package.json index c2fd6ba..0170c7b 100644 --- a/package.json +++ b/package.json @@ -30,6 +30,7 @@ "eslint": "^9.39.2", "eslint-plugin-import": "^2.32.0", "sass-embedded": "^1.97.3", + "ts-node": "^10.9.2", "typescript": "^5.9.3", "vite": "^7.3.1" }, diff --git a/src/SYNTAX.md b/src/SYNTAX.md index 4ae6747..2842344 100644 --- a/src/SYNTAX.md +++ b/src/SYNTAX.md @@ -1,4 +1,3 @@ - Syntax is designed to be easily parsable by a recursive-descent/parser-combinators - not necessarily designed for intuitive human use. It is inspired by syntax like LISP, but instead of `(f a b c)` we'd write `f(a, b, c)`. Note that we allow fairly permissive identifiers like @@ -32,21 +31,28 @@ $ @ . , -| ( ) { } ``` -Then there are keywords, which can occur as substrings, but can't just be the keywords (TODO: Not sure if perhaps `|` we could allow as a keyword +Then there are keywords, which can occur as substrings, but can't just be the keywords ``` let fn match = +| ``` +Note that `=` and `|` are treated as keywords, not symbols. So these can occur in identifiers. +This has strange consequences, for example +``` +x=123 +``` +is a valid identifier (since there are no spaces) - and this may cause huge confusion for the user when using record syntax or pattern matching on records. +Even though this is borderline evil, I've decided for simplicity's sake to accept this. I'll just have to make sure syntax errors are really good. 
Examples of expressions in the language (except the top-level function definitions which are here just for illustration) ``` @@ -242,7 +248,5 @@ So for example when parsing `#foo 123`, we parse the tag, the remainder is `123` so we conclude that this has to be a tagged value we're parsing, so the next thing is an expression. But for a case like `#foo, ...`, we see that the next token is `,`, so we must have just parsed a tag - not a tagged value. - - -TODO: +Also: We don't yet have any convinient syntax for creating new records from old (like an immutable update syntax). diff --git a/src/parser/scanner.test.ts b/src/parser/scanner.test.ts new file mode 100644 index 0000000..847a6ed --- /dev/null +++ b/src/parser/scanner.test.ts @@ -0,0 +1,169 @@ +import { SourceText } from "./source_text"; +import { Scanner, Token } from "./scanner"; + +// === Test Harness Utilities === +const RED = "\x1b[31m"; +const GREEN = "\x1b[32m"; +const RESET = "\x1b[0m"; + +function assert(condition: boolean, message: string) { + if (!condition) { + throw new Error(message); + } +} + +function assertToken(token: Token, expectedTag: string, expectedValue?: any) { + assert(token.tag === expectedTag, `Expected tag '${expectedTag}', got '${token.tag}'`); + if (expectedValue !== undefined) { + // Check 'value' for numbers, 'text' for strings + const actualValue = "value" in token ? token.value : "text" in token ? token.text : undefined; + assert(actualValue === expectedValue, `Expected value '${expectedValue}', got '${actualValue}'`); + } +} + +// TODO: Rewrite this once `scanToken()` is implemented. +function scanOne(source: string): Token { + const src = new SourceText(source); + const scanner = new Scanner(src); + // We assume your scanner has a nextToken() method exposed, + // or you make the specific scan methods public for testing. + // Since you likely only expose nextToken() eventually, let's cheat + // and cast to any to access private methods for unit testing specific parts. 
+ // OR: You can just expose 'scanNumber' as public for now. + + // For this test, I will assume we are calling the private methods via 'any' + // to strictly unit test them without the dispatch logic. + if (source.trim().startsWith('"')) return (scanner as any).scanString(); + return (scanner as any).scanNumber(); +} + +function test_integers() { + const t1 = scanOne("123"); + assertToken(t1, "number", 123); + + const t2 = scanOne("-500"); + assertToken(t2, "number", -500); + + console.log(`${GREEN}✔ Integers passed${RESET}`); +} + +function test_floats() { + const t1 = scanOne("3.14159"); + assertToken(t1, "number", 3.14159); + + const t2 = scanOne("-0.001"); + assertToken(t2, "number", -0.001); + + console.log(`${GREEN}✔ Floats passed${RESET}`); +} + +function test_number_errors() { + try { + scanOne("1."); // Should fail (trailing dot) + throw new Error("Should have thrown error for '1.'"); + } catch (e: any) { + assert(e.tag === "InvalidNumber", "Expected InvalidNumber error for '1.'"); + } + + try { + scanOne(".5"); // Should fail (no leading digit) + throw new Error("Should have thrown error for '.5'"); + } catch (e: any) { + assert(e.tag === "ExpectedNumber", "Expected ExpectedNumber error for '.5'"); + } + + console.log(`${GREEN}✔ Number errors passed${RESET}`); +} + +function test_basic_strings() { + const t1 = scanOne('"hello world"'); + assertToken(t1, "string", "hello world"); + + const t2 = scanOne('""'); // Empty string + assertToken(t2, "string", ""); + + console.log(`${GREEN}✔ Basic strings passed${RESET}`); +} + +function test_string_escapes() { + const t1 = scanOne('"line1\\nline2"'); + assertToken(t1, "string", "line1\nline2"); + + const t2 = scanOne('"col1\\tcol2"'); + assertToken(t2, "string", "col1\tcol2"); + + const t3 = scanOne('"quote: \\" slash: \\\\"'); + assertToken(t3, "string", 'quote: " slash: \\'); + + console.log(`${GREEN}✔ String escapes passed${RESET}`); +} + +function test_unicode_escapes() { + // Rocket emoji: 🚀 (U+1F680) + 
const t1 = scanOne('"\\u{1F680}"'); + assertToken(t1, "string", "🚀"); + + // Two escapes: A (U+41) and B (U+42) + const t2 = scanOne('"\\u{41}\\u{42}"'); + assertToken(t2, "string", "AB"); + + // Max valid unicode + scanOne('"\\u{10FFFF}"'); + + console.log(`${GREEN}✔ Unicode escapes passed${RESET}`); +} + +function test_line_counting() { + // Mixed line endings: + // Line 1: 123 (CRLF) + // Line 2: 456 (LF) + // Line 3: "foo" + const code = "123\r\n456\n\"foo\""; + const src = new SourceText(code); + const scanner = new Scanner(src); + + // We need to implement a mini-loop here since scanOne creates new scanners + // 123 + let tok = (scanner as any).scanNumber(); + assert(tok.value === 123, "Line 1 value wrong"); + assert(tok.span.line === 1, "Line 1 line# wrong"); + + // consume whitespace manually since we are bypassing nextToken() + (scanner as any).skipWhitespace(); + + // 456 + tok = (scanner as any).scanNumber(); + assert(tok.value === 456, "Line 2 value wrong"); + assert(tok.span.line === 2, "Line 2 line# wrong"); + + (scanner as any).skipWhitespace(); + + // "foo" + tok = (scanner as any).scanString(); + assert(tok.text === "foo", "Line 3 value wrong"); + assert(tok.span.line === 3, "Line 3 line# wrong"); + + console.log(`${GREEN}✔ Line counting passed${RESET}`); +} + +// === Run All === +function run_all_tests() { + console.log("Running Scanner Tests...\n"); + try { + test_integers(); + test_floats(); + test_number_errors(); + test_basic_strings(); + test_string_escapes(); + test_unicode_escapes(); + test_line_counting(); + + console.log(`\n${GREEN}ALL TESTS PASSED${RESET}`); + } catch (e: any) { + console.error(`\n${RED}TEST FAILED:${RESET}`); + console.error(e.message || e); + process.exit(1); + } +} + +run_all_tests(); diff --git a/src/parser/scanner.ts b/src/parser/scanner.ts new file mode 100644 index 0000000..a3fefd7 --- /dev/null +++ b/src/parser/scanner.ts @@ -0,0 +1,368 @@ + +import { char, isWhitespace, isDigit } from './source_text'; +import 
type { SourceText, Span, SourceLocation, CodePoint, StringIndex, CodePointIndex } from './source_text'; + +function isSymbolChar(c: CodePoint): boolean { + return ( + c === char("#") || + c === char("$") || + c === char("@") || + c === char("(") || + c === char(")") || + c === char("{") || + c === char("}") || + c === char(",") || + c === char(".") + ); +} + +function isIdentifierChar(char: CodePoint): boolean { + return !isWhitespace(char) && !isSymbolChar(char); +} + +export type Keyword = "let" | "fn" | "match" | "apply" | "=" | "|"; +export type Symbol = "#" | "$" | "@" | "(" | ")" | "{" | "}" | "," | "."; + +// === Scanner === +export type Token = + | { tag: "number", value: number, span: Span } + | { tag: "string", text: string, span: Span } + | { tag: "identifier", text: string, span: Span } + | { tag: "keyword", kw: Keyword, span: Span } + | { tag: "symbol", sym: Symbol, span: Span } + | { tag: "EOF", span: Span } + +export namespace TokenKind { + export type T = + | { tag: "number" } + | { tag: "string" } + | { tag: "identifier" } + | { tag: "symbol", value: Symbol } + | { tag: "keyword", value: Keyword } + | { tag: "EOF" } +} + +export type LexError = + | { tag: "UnexpectedCharacter", char: CodePoint, span: Span } + | { tag: "UnexpectedEOF", span: Span } + | { tag: "ExpectedNumber", span: Span } + | { tag: "InvalidNumber", text: string, reason: string, span: Span } + | { tag: "InvalidEscape", reason: string, span: Span }; + +export class Scanner { + private i: CodePointIndex = 0; + private line = 1; + private column = 1; + + // Track previous char to handle \r\n correctly + private lastCharWasCR = false; + + constructor(private readonly text: SourceText) {} + + eof(): boolean { + return this.i >= this.text.length; + } + + private peek(n: number = 0): CodePoint | undefined { + return this.text.chars[this.i + n]?.char; + } + + private next(): CodePoint | undefined { + const ref = this.text.chars[this.i]; + if (!ref) return undefined; + + const c = ref.char; 
+ this.i++; + + if (c === 0x0A /* \n */) { + if (!this.lastCharWasCR) { + this.line++; + this.column = 1; + } else { + // We just saw \r, so this \n is part of \r\n. + // We already bumped the line count on \r. + // Just reset the flag. + this.lastCharWasCR = false; + } + } else if (c === 0x0D /* \r */) { + this.line++; + this.column = 1; + this.lastCharWasCR = true; + } else { + this.column++; + this.lastCharWasCR = false; + } + + return c; + } + + + private currentOffset(): StringIndex { + return this.text.chars[this.i]?.offset ?? this.text.source.length; + } + + private currentLocation(): SourceLocation { + return { index: this.i, line: this.line, column: this.column }; + } + + private makeSpan(start: SourceLocation): Span { + const startOffset = + this.text.chars[start.index]?.offset ?? this.text.source.length; + const endOffset = this.currentOffset(); + + return { + start: startOffset, + end: endOffset, + line: start.line, + column: start.column, + }; + } + + private consumeWhile(pred: (c: CodePoint) => boolean): number { + let count = 0; + while (!this.eof()) { + const c = this.peek(); + if (c === undefined || !pred(c)) break; + this.next(); + count++; + } + return count; + } + + private expect( + pred: (c: CodePoint) => boolean, + error: LexError + ): CodePoint { + const c = this.peek(); + if (c === undefined || !pred(c)) { + throw error; + } + this.next(); + return c; + } + + // Helper to check for exact char matches quickly + private match(c: CodePoint): boolean { + if (this.peek() === c) { + this.next(); + return true; + } + return false; + } + + private skipWhitespace() { + this.consumeWhile(isWhitespace); + } + + // === Main Scanners === + + private scanNumber(): Token { + // number := + // | optional(`-`) digits optional(`.` digits) + + const startNumberLocation = this.currentLocation(); + + let c: CodePoint; + + // 1. Optional Sign + c = this.peek(); + if (c === char("-")) { + this.next(); + } + + // 2. 
Integer Part + c = this.peek(); + const integerPartDigitCount = this.consumeWhile(isDigit); + if (integerPartDigitCount === 0) { + throw { + tag: "ExpectedNumber", + span: this.makeSpan(startNumberLocation), + }; + } + + // 3. Fractional Part + if (this.peek() === char(".")) { + const dotLocation = this.currentLocation(); + + this.next(); // consume '.' + + const fracPartDigitCount = this.consumeWhile(isDigit); + if (fracPartDigitCount === 0) { + throw { + tag: "InvalidNumber", + reason: "MissingFractionalDigits", + span: this.makeSpan(dotLocation), + }; + } + } + + + const text = this.text.sliceByCp(startNumberLocation.index, this.i); + const value = Number(text); + + if (!Number.isFinite(value)) { + throw { + tag: "InvalidNumber", + reason: "NotFinite", + span: this.makeSpan(startNumberLocation), + }; + } + return { + tag: "number", + value, + span: this.makeSpan(startNumberLocation), + }; + } + + private scanString(): Token { + const start = this.currentLocation(); + // We assume the caller checked the opening quote '"' + this.expect(c => c === char('"'), { tag: "UnexpectedCharacter", span: this.makeSpan(start) }); + + let value = ""; // The actual string content + + while (true) { + if (this.eof()) { + throw { tag: "UnexpectedEOF", span: this.makeSpan(start) }; + } + + const c = this.peek(); + + // 1. End of string + if (c === char('"')) { + this.next(); // consume closing quote + break; + } + + if (c === char('\\')) { + // 2. 
Escape Sequences + const escapeStart = this.currentLocation(); + this.next(); // consume backslash + const escaped = this.peek(); + + switch (escaped) { + case char('n'): value += '\n'; this.next(); break; + case char('r'): value += '\r'; this.next(); break; + case char('t'): value += '\t'; this.next(); break; + case char('\\'): value += '\\'; this.next(); break; + case char("0"): value += "\0"; break; + case char('"'): value += '"'; this.next(); break; + // Unicode Escape: \u{XXXX} + case char('v'): { + const braceStart = 123123; + } + case char('u'): { + this.next(); // consume 'u' + + // 1. Expect '{' + const braceStart = this.currentLocation(); + if (this.peek() !== char('{')) { + throw { tag: "InvalidEscape", reason: "Expected '{' after \\u", span: this.makeSpan(braceStart) }; + } + this.next(); // consume '{' + + // 2. Consume Hex Digits + const hexStart = this.i; + const hexCount = this.consumeWhile(c => + (c >= char('0') && c <= char('9')) || + (c >= char('a') && c <= char('f')) || + (c >= char('A') && c <= char('F')) + ); + + if (hexCount === 0) { + throw { tag: "InvalidEscape", reason: "Expected hex digits in \\u{...}", span: this.makeSpan(braceStart) }; + } + + // 3. Expect '}' + if (this.peek() !== char("}")) { + throw { tag: "InvalidEscape", reason: "Expected '}' closing unicode escape", span: this.makeSpan(braceStart) }; + } + this.next(); // consume '}' + + // 4. Convert & Append + const hexStr = this.text.sliceByCp(hexStart, hexStart + hexCount); + const codePoint = parseInt(hexStr, 16); + + if (codePoint > 0x10FFFF) { + throw { tag: "InvalidEscape", reason: "Invalid Unicode Code Point (max 0x10FFFF)", span: this.makeSpan(braceStart) }; + } + + value += String.fromCodePoint(codePoint); + break; + } + default: + throw { + tag: "InvalidEscape", + reason: `UnknownEscapeSequence`, + span: this.makeSpan(escapeStart) + }; + } + } else { + // 3. Regular character + // Optimization: consume chunks of non-special chars for speed? 
+ // For now, char-by-char is fine. + this.next(); + // Note: We use ! because we checked EOF at loop start + value += String.fromCodePoint(c!); + } + } + + return { + tag: "string", + text: value, + span: this.makeSpan(start) + }; + } + +} + + +// TODO: Need a Token to TokenKind function +// TODO: Need is_start_of_expression(token): boolean +// identifier -> true +// symbol # -> true +// symbol $ -> true +// symbol @ -> true +// symbol ( -> true +// symbol { -> true // this is actually context dependent. Sometimes its a start of a binding context { params . body } or { let-params . body }, and sometimes it is a record. But this function is gonna be used only in the first context +// symbol _ -> false +// number -> true +// string -> true +// keyword let -> true +// keyword fn -> true +// keyword apply -> true +// keyword = -> false +// keyword | -> false +// EOF -> false +// +// TODO: function that matches a token with a token_type (returns bool) + +// TODO: forbidden characters are +// '(' +// ')' +// '{' +// '}' +// '.' +// ',' +// '|' +// '$' +// '#' +// '@' +// '"' +// ' ' +// '\r' +// '\t' +// '\n' +// TODO: need function is_forbidden_char + + + +// === scanner functions === +// TODO: whitespace - consumes whitespace +// TODO: comment - consumes token +// TODO: raw_identifier - consumes raw identifier - then we can decide whether that was a keyword or an identifier +// TODO: string - consumes string like "foo bar\njfjdsajfksd" +// TODO: number - consumes number like 123123 or 000123 or 23919233.123 +// +// TODO: token - gives next token + diff --git a/src/parser/source_text.ts b/src/parser/source_text.ts new file mode 100644 index 0000000..9614f97 --- /dev/null +++ b/src/parser/source_text.ts @@ -0,0 +1,125 @@ + +// === Char type === +export type StringIndex = number; // UTF-16 index into string +export type CodePointIndex = number; // index into array of code-points +export type CodePoint = number; // could also name it `UnicodeCodePoint`. 
Basically for `s: string` we have `s.codePointAt(i: index): char`. + +export function char(c: string): CodePoint { + return c.codePointAt(0) +} + +export type CodePointRef = { + char: CodePoint, + offset: StringIndex, +}; + +// === Source Text === +export class SourceText { + readonly source: string; + // TODO: Later you can try to change this to two `Uint32Array`s - one for codepoints (each 20 bit but whatever), the other for pointers to original string. + // + readonly chars: CodePointRef[]; + + // Stores the CodePointIndex where each line begins + readonly lineStarts: CodePointIndex[]; + + constructor(rawSource: string) { + const source = rawSource.normalize('NFC'); + + this.source = source; + this.chars = []; + this.lineStarts = [0]; // Line 1 always starts at index 0 + + let i = 0; + while (i < source.length) { + const char = source.codePointAt(i) as CodePoint; + const cpIndex = this.chars.length; + this.chars.push({ char: char, offset: i }); + + const size =(char > 0xFFFF ? 2 : 1); + i += size; + + // === Newline Logic === + // 0x0A is '\n', 0x0D is '\r' + if (char === 0x0A) { + // Found a newline, the NEXT char starts a new line + this.lineStarts.push(cpIndex + 1); + } + // Handle CR (Classic Mac) or CRLF start + else if (char === 0x0D) { + // Check if the next char is '\n' (CRLF) + // We peek ahead in the raw string to see if we need to skip the \n for line counting purposes + // or just treat this as a newline. + const nextIsNL = i < source.length && source.codePointAt(i) === 0x0A; + if (!nextIsNL) { + // Only push if it's NOT CRLF. If it is CRLF, the loop handles the \n next. + this.lineStarts.push(cpIndex + 1); + } + } + } + } + + get length(): number { + return this.chars.length; + } + + sliceByCp(start: number, end: number): string { + const startRef = this.chars[start]; + // Handle out of bounds gracefully + if (!startRef) return ""; + + const startOff = startRef.offset; + const endOff = end < this.chars.length + ? 
this.chars[end].offset + : this.source.length; + + return this.source.slice(startOff, endOff); + } + + // Converts a linear Code Point Index into SourceLocation + // getLocation(index: CodePointIndex): SourceLocation { + // // TODO: can be implemented either by a linear or binary search. + // return (0 as any); + // } + + // Returns the full text of a specific line (1-based index) + getLineText(line: number): string { + const lineIndex = line - 1; + if (lineIndex < 0 || lineIndex >= this.lineStarts.length) return ""; + + const startCp = this.lineStarts[lineIndex]; + const endCp = (lineIndex + 1 < this.lineStarts.length) + ? this.lineStarts[lineIndex + 1] - 1 // -1 to exclude the newline char itself + : this.chars.length; + + // TODO: Consider removing \r or \n from the end if they exist. + return this.sliceByCp(startCp, endCp); + } +} + +export function sourceText(s: string) { + return new SourceText(s); +} + + +export function isWhitespace(char: CodePoint): boolean { + return char === 0x20 || char === 0x09 || char === 0x0A || char === 0x0D; +} + +export function isDigit(char: CodePoint): boolean { + return char >= 0x30 && char <= 0x39; +} + +export type Span = { + start: StringIndex, + end: StringIndex, + line: number, + column: number, +} + +export type SourceLocation = { + index: CodePointIndex; + line: number; // 1-based + column: number; // 1-based +}; + diff --git a/tmp_repl/tmp_repl.md b/tmp_repl/tmp_repl.md index e28a033..3cc888d 100644 --- a/tmp_repl/tmp_repl.md +++ b/tmp_repl/tmp_repl.md @@ -10,3 +10,8 @@ npm install electron-squirrel-startup npm install -D sass-embedded +# Tests + +npx ts-node src/parser/scanner.test.ts + +