Improve and abstract Cursor design. Start scanner

This commit is contained in:
Yura Dupyn 2026-02-06 00:38:16 +01:00
parent d382b16e6d
commit d5f9777711
8 changed files with 713 additions and 476 deletions

47
src/parser/SCANNER.md Normal file
View file

@ -0,0 +1,47 @@
Suppose that we have the parser for expressions implemented, and it is run on some string that is expected to be an expression. To do this correctly, the parser needs to decide which sub-parser to activate - so we use the scanner to determine what sort of expression we may be looking at.
It can either be
| number
| string
| variable-use
| tag
| tagged-value
| tuple
| record
| top-level function call
| match
| let binding
| lambda-abstraction
| lambda-application
The syntax is designed in such a way that we don't need to do a deep-lookahead.
Basically in the scanner we try the following in this exact order to determine what to parse next:
- peek one character
- digit?, yes -> number
- symbol `-`?, yes -> peek at the next character,
- digit? -> number
- not-a-digit? -> the only possibility is that we're looking at an identifier that happens to start with `-`, and we discuss how to handle identifiers at the end of this process...
- symbol `"`?, yes -> string
- symbol `$`?, yes -> variable-use
- symbol `#`?, yes ->
- we have either a tag or a tagged-value (we don't know which yet... ). At a first glance it seems it is not scanner's job to determine which - more of a parser's job.
TODO:
But actually we can do this in scanner. After we scan the tag-name, we can do a lookahead and see if the next char is NOT-EXPRESSION-START, which is possible to do in our grammar.
- symbol `(`?, yes -> tuple
- symbol `{`?, yes -> record
- here the only possibilities are: a top-level function call, some construct that starts syntactically with a keyword, or an error. So the scanner needs to attempt a raw-identifier scan.
- keyword? easy
- match? -> match-expression
- let? -> let-expression
- fn? -> lambda-abstraction
- apply? -> lambda-application
- not-a-keyword? then we need to validate:
It is either a valid identifier (in this specific case function-name) or invalid identifier.
In case it is an invalid identifier, it would be nice to also predict typical errors (like are we looking at punctuation?) - but this is a bit hard - maybe this shouldn't be the job of the scanner - parser will have more context in general. In this case, we're just expecting a top-level function call. But this kind of analysis will have to be done for other invalid identifiers...
raw-identifier scan just consumes any character until it encounters a delimiter `$ # @ ( ) { } " , . \` or whitespace (I include comments as whitespace here).
The job of a validator for a raw identifier is to decide whether we're looking at a keyword or an actual valid identifier.

237
src/parser/cursor.test.ts Normal file
View file

@ -0,0 +1,237 @@
import { SourceText } from "./source_text";
import { Cursor, scanString, scanNumber } from "./cursor";
import { Result } from "../result";
// === Test Harness Utilities ===
const RED = "\x1b[31m";
const GREEN = "\x1b[32m";
const RESET = "\x1b[0m";
/**
 * Minimal assertion helper for this hand-rolled test harness.
 * Does nothing when `condition` holds; otherwise throws an `Error` carrying `message`.
 */
function assert(condition: boolean, message: string) {
  if (condition) return;
  throw new Error(message);
}
/**
 * Asserts that a scan result succeeded and that its payload's `.value` field
 * is strictly equal (`===`) to `expectedValue`.
 *
 * Throws an `Error` describing the failure (including the error's `reason`,
 * when it has one) if the result is an error.
 */
function assertOk<T>(result: Result<T, any>, expectedValue: any) {
  if (result.tag !== "error") {
    // Both number and string scan results expose the scanned data as `.value`.
    const payload = result.value as any;
    const got = payload.value;
    assert(got === expectedValue, `Expected value '${expectedValue}', got '${got}'`);
    return;
  }

  // Structured errors get their reason pretty-printed into the message.
  const failure = result.error as any;
  let reasonSuffix = "";
  if (failure.reason) {
    reasonSuffix = ` (Reason: ${JSON.stringify(failure.reason)})`;
  }
  throw new Error(`Expected Ok, got Error: ${failure.tag}${reasonSuffix}`);
}
/**
 * Asserts that a scan result failed with the error tag `expectedTag`.
 *
 * When `expectedReason` is supplied, the error's `reason` is checked too:
 * object reasons are compared through their JSON serialisation, anything
 * else with strict equality.
 */
function assertError(result: Result<any, any>, expectedTag: string, expectedReason?: string | object) {
  if (result.tag === "ok") {
    const okValue = (result.value as any).value;
    throw new Error(`Expected Error '${expectedTag}', but got Ok with value: ${okValue}`);
  }

  const failure = result.error;
  assert(failure.tag === expectedTag, `Expected error tag '${expectedTag}', got '${failure.tag}'`);

  // No reason requested: the tag check above is all there is to verify.
  if (expectedReason === undefined) return;

  const gotReason = (failure as any).reason;
  if (typeof expectedReason !== 'object') {
    assert(gotReason === expectedReason, `Expected reason '${expectedReason}', got '${gotReason}'`);
    return;
  }

  // Structured reasons (e.g. unicode escape errors): compare "deeply-ish" via JSON.
  const wantJson = JSON.stringify(expectedReason);
  const gotJson = JSON.stringify(gotReason);
  assert(gotJson === wantJson, `Expected reason ${wantJson}, got ${gotJson}`);
}
// === Number Tests ===
/** Checks that positive and negative integer literals scan to their numeric values. */
function test_integers() {
  const cases: Array<[string, number]> = [
    ["123", 123],
    ["-500", -500],
  ];
  for (const [literal, expected] of cases) {
    const scanned = scanNumber(new Cursor(new SourceText(literal)));
    assertOk(scanned, expected);
  }
  console.log(`${GREEN}✔ Integers passed${RESET}`);
}
/** Checks that positive and negative decimal literals scan to their numeric values. */
function test_floats() {
  const cases: Array<[string, number]> = [
    ["3.14159", 3.14159],
    ["-0.001", -0.001],
  ];
  for (const [literal, expected] of cases) {
    const scanned = scanNumber(new Cursor(new SourceText(literal)));
    assertOk(scanned, expected);
  }
  console.log(`${GREEN}✔ Floats passed${RESET}`);
}
/**
 * Checks the error results of `scanNumber` for malformed number literals:
 * a trailing dot, a missing integer part, and a lone minus sign.
 */
function test_number_errors() {
  // 1. Trailing dot: the `.` is consumed but no fractional digits follow.
  const c1 = new Cursor(new SourceText("1."));
  const r1 = scanNumber(c1);
  assertError(r1, "InvalidNumber", "MissingFractionalDigits");

  // 2. No leading digit (.5): the integer part is mandatory, so scanning
  //    fails before the dot is even considered.
  const c2 = new Cursor(new SourceText(".5"));
  const r2 = scanNumber(c2);
  assertError(r2, "ExpectedNumber");

  // 3. Saw a sign but no digits ("-" on its own), which is a hard error.
  const c3 = new Cursor(new SourceText("-"));
  const r3 = scanNumber(c3);
  assertError(r3, "ExpectedNumber");

  console.log(`${GREEN}✔ Number errors passed${RESET}`);
}
// === String Tests ===
/** Checks that a plain string literal and the empty string literal scan to their contents. */
function test_basic_strings() {
  const cases: Array<[string, string]> = [
    ['"hello world"', "hello world"],
    ['""', ""], // Empty string
  ];
  for (const [literal, expected] of cases) {
    const scanned = scanString(new Cursor(new SourceText(literal)));
    assertOk(scanned, expected);
  }
  console.log(`${GREEN}✔ Basic strings passed${RESET}`);
}
/**
 * Checks that the single-character escapes (`\n`, `\t`, `\"`, `\\`, `\0`)
 * are decoded into the corresponding characters.
 */
function test_string_escapes() {
  const cases: Array<[string, string]> = [
    ['"line1\\nline2"', "line1\nline2"],
    ['"col1\\tcol2"', "col1\tcol2"],
    ['"quote: \\" slash: \\\\"', 'quote: " slash: \\'],
    // Null byte test
    ['"null\\0byte"', "null\0byte"],
  ];
  for (const [literal, expected] of cases) {
    const scanned = scanString(new Cursor(new SourceText(literal)));
    assertOk(scanned, expected);
  }
  console.log(`${GREEN}✔ String escapes passed${RESET}`);
}
/**
 * Checks `\u{...}` escapes: successful decoding (including an astral code
 * point) and each of the structured failure reasons.
 */
function test_unicode_escapes() {
  const scan = (literal: string) => scanString(new Cursor(new SourceText(literal)));

  // Rocket emoji: 🚀 (U+1F680)
  assertOk(scan('"\\u{1F680}"'), "🚀");

  // Two escapes back to back
  assertOk(scan('"\\u{41}\\u{42}"'), "AB");

  // Error: `\u` not followed by an opening brace
  assertError(scan('"\\u1F680"'), "InvalidEscape", { tag: "UnicodeMissingBrace" });

  // Error: braces with no hex digits inside
  assertError(scan('"\\u{}"'), "InvalidEscape", { tag: "UnicodeNoDigits" });

  // Error: value above the Unicode maximum. The reason carries the offending
  // value, so it is inspected field by field.
  const overflow = scan('"\\u{110000}"');
  if (overflow.tag === 'ok') throw new Error("Should have failed overflow");
  const overflowError = overflow.error as any;
  assert(overflowError.tag === "InvalidEscape", "Wrong tag");
  assert(overflowError.reason.tag === "UnicodeOverflow", "Wrong reason tag");
  assert(overflowError.reason.value === 0x110000, "Wrong overflow value");

  console.log(`${GREEN}✔ Unicode escapes passed${RESET}`);
}
/**
 * Drives one cursor across three lines with mixed line endings and checks
 * that each token's span reports the line it started on.
 *
 *   Line 1: 123   (terminated by CRLF)
 *   Line 2: 456   (terminated by LF)
 *   Line 3: "foo"
 */
function test_cursor_tracking() {
  const cursor = new Cursor(new SourceText("123\r\n456\n\"foo\""));
  // The span records where the token started, not where the cursor is now.
  const spanOf = (scanned: unknown) => (scanned as any).value.span;

  // Line 1: the number 123
  const first = scanNumber(cursor);
  assertOk(first, 123);
  const firstSpan = spanOf(first);
  assert(firstSpan.line === 1, "Line 1 line# wrong");
  assert(firstSpan.column === 1, "Line 1 col# wrong");

  // Stand in for the scanner's whitespace skipping: step over "\r\n" by hand.
  cursor.next(); // \r
  cursor.next(); // \n

  // Line 2: the number 456
  const second = scanNumber(cursor);
  assertOk(second, 456);
  assert(spanOf(second).line === 2, "Line 2 line# wrong");

  // Step over the single "\n".
  cursor.next();

  // Line 3: the string "foo"
  const third = scanString(cursor);
  assertOk(third, "foo");
  assert(spanOf(third).line === 3, "Line 3 line# wrong");

  console.log(`${GREEN}✔ Cursor tracking passed${RESET}`);
}
// === Run All ===
/**
 * Runs every test in order. The first failure is reported in red and
 * terminates the process with exit code 1; otherwise a green summary is printed.
 */
function run_all_tests() {
  console.log("Running Scanner Tests...\n");
  const suite: Array<() => void> = [
    test_integers,
    test_floats,
    test_number_errors,
    test_basic_strings,
    test_string_escapes,
    test_unicode_escapes,
    test_cursor_tracking,
  ];
  try {
    for (const runTest of suite) {
      runTest();
    }
    console.log(`\n${GREEN}ALL TESTS PASSED${RESET}`);
  } catch (e: any) {
    console.error(`\n${RED}TEST FAILED:${RESET}`);
    console.error(e.message || e);
    process.exit(1);
  }
}

run_all_tests();

338
src/parser/cursor.ts Normal file
View file

@ -0,0 +1,338 @@
import { char, NEW_LINE, CARRIAGE_RETURN, DOT, DIGIT_0, DIGIT_9, LOWERCASE_a, LOWERCASE_f, UPPERCASE_A, UPPERCASE_F, SPACE, TAB } from './source_text';
import type { SourceText, Span, SourceLocation, CodePoint, StringIndex, CodePointIndex } from './source_text';
import { Result } from '../result';
/**
 * Snapshot of a `Cursor`'s position, as returned by `Cursor.save()` and
 * accepted by `Cursor.restore()`.
 */
export type CursorState = {
// Position in the source text, counted in code points.
index: CodePointIndex,
// Line number at that position (a fresh Cursor starts at 1).
line: number,
// Column number at that position (a fresh Cursor starts at 1).
column: number,
// True when the most recently consumed character was `\r`, so that a
// following `\n` is treated as the second half of a CRLF pair.
lastCharWasCR: boolean,
}
/**
 * A forward-moving read position over a `SourceText`.
 *
 * The cursor advances one code point at a time and keeps line/column
 * information up to date as it goes. `\n`, `\r` and `\r\n` each count as a
 * single line break. Positions can be captured with `save()` and rolled back
 * with `restore()`.
 */
export class Cursor {
  private index: CodePointIndex = 0;
  private line: number = 1;
  private column: number = 1;
  // Track previous char to handle \r\n correctly
  private lastCharWasCR: boolean = false;

  constructor(readonly text: SourceText) {}

  /** Captures the current position so it can later be passed to `restore()`. */
  save(): CursorState {
    return { index: this.index, line: this.line, column: this.column, lastCharWasCR: this.lastCharWasCR };
  }

  /** Moves the cursor back (or forward) to a position previously captured by `save()`. */
  restore({ index, line, column, lastCharWasCR }: CursorState) {
    this.index = index;
    this.line = line;
    this.column = column;
    this.lastCharWasCR = lastCharWasCR;
  }

  /** True once every code point of the text has been consumed. */
  eof(): boolean {
    return this.index >= this.text.length;
  }

  /**
   * Returns the code point `n` positions ahead of the cursor without
   * consuming anything, or `undefined` when that position is outside the text.
   */
  peek(n: number = 0): CodePoint | undefined {
    return this.text.chars[this.index + n]?.char;
  }

  /**
   * Consumes and returns the code point under the cursor, updating the
   * line/column bookkeeping. Returns `undefined` (and consumes nothing) at
   * end of input.
   */
  next(): CodePoint | undefined {
    const ref = this.text.chars[this.index];
    if (!ref) return undefined;

    const c = ref.char;
    this.index++;

    if (c === NEW_LINE) {
      if (!this.lastCharWasCR) {
        this.line++;
        this.column = 1;
      } else {
        // We just saw \r, so this \n is part of \r\n.
        // We already bumped the line count on \r.
        // Just reset the flag.
        this.lastCharWasCR = false;
      }
    } else if (c === CARRIAGE_RETURN) {
      this.line++;
      this.column = 1;
      this.lastCharWasCR = true;
    } else {
      this.column++;
      this.lastCharWasCR = false;
    }
    return c;
  }

  /** Current position as a code-point index. */
  get currentIndex(): CodePointIndex {
    return this.index;
  }

  /**
   * Offset of the current position in the original source string; at end of
   * input this is the length of the source string.
   */
  currentOffset(): StringIndex {
    return this.text.chars[this.index]?.offset ?? this.text.source.length;
  }

  /** Current position as an `{ index, line, column }` triple. */
  currentLocation(): SourceLocation {
    return { index: this.index, line: this.line, column: this.column };
  }

  /**
   * Builds a span that starts at `start` and ends at the cursor's current
   * position. `start`/`end` are source-string offsets; `line`/`column` are
   * those of the start location.
   */
  makeSpan(start: SourceLocation): Span {
    const startOffset =
      this.text.chars[start.index]?.offset ?? this.text.source.length;
    const endOffset = this.currentOffset();
    return {
      start: startOffset,
      end: endOffset,
      line: start.line,
      column: start.column,
    };
  }

  /**
   * Consumes code points for as long as `pred` accepts them.
   * @returns the number of code points consumed.
   */
  consumeWhile(pred: (c: CodePoint) => boolean): number {
    let count = 0;
    while (!this.eof()) {
      const c = this.peek();
      if (c === undefined || !pred(c)) break;
      this.next();
      count++;
    }
    return count;
  }

  /**
   * Consumes the next code point if it equals `c`.
   * @returns true when a code point was consumed.
   */
  match(c: CodePoint): boolean {
    if (this.peek() === c) {
      this.next();
      return true;
    }
    return false;
  }

  /**
   * Checks whether the upcoming code points spell out `str`.
   * If yes, consumes them and returns true.
   * If no, touches nothing and returns false.
   *
   * The comparison is done per code point: `str.length` counts UTF-16 units
   * and would over-count any character outside the BMP (e.g. emoji), while
   * the cursor's index counts code points.
   */
  matchString(str: string): boolean {
    const wanted = Array.from(str, (ch) => ch.codePointAt(0));
    for (let i = 0; i < wanted.length; i++) {
      // peek() yields undefined past the end, which never equals a code point.
      if (this.peek(i) !== wanted[i]) return false;
    }
    for (let i = 0; i < wanted.length; i++) {
      // We must call next() to correctly update line/col tracking.
      // We already know it matches, so we just burn through.
      this.next();
    }
    return true;
  }
}
// === Basic Scanners/Predicates ===
/** True for the four whitespace code points: space, tab, line feed and carriage return. */
export function isWhitespace(char: CodePoint): boolean {
  switch (char) {
    case SPACE:
    case TAB:
    case NEW_LINE:
    case CARRIAGE_RETURN:
      return true;
    default:
      return false;
  }
}
/** True when the code point lies in the inclusive range `DIGIT_0`..`DIGIT_9`. */
export function isDigit(char: CodePoint): boolean {
  const atLeastZero = char >= DIGIT_0;
  return atLeastZero && char <= DIGIT_9;
}
/**
 * Errors that are not specific to one kind of literal.
 * - `UnexpectedCharacter`: a different character was found than the one required
 *   at this point (e.g. `scanString` called on input that does not start with `"`).
 * - `UnexpectedEOF`: the input ended where more characters were required.
 */
export type GenericScanError =
| { tag: "UnexpectedCharacter", char: CodePoint, span: Span }
| { tag: "UnexpectedEOF", span: Span }
/**
 * Errors produced by `scanNumber`.
 * - `ExpectedNumber`: no digit was found where the integer part must start.
 * - `InvalidNumber`: digits were found but the literal is malformed:
 *   `MissingFractionalDigits` (a `.` with no digits after it) or
 *   `NotFinite` (the literal does not convert to a finite JS number).
 */
export type NumberError =
| { tag: "ExpectedNumber", span: Span }
| { tag: "InvalidNumber", reason: "NotFinite" | "MissingFractionalDigits", span: Span }
/**
 * Scans a number literal starting at the cursor.
 *
 * Grammar:
 *   number := optional(`-`) digits optional(`.` digits)
 *
 * @returns on success, the numeric value and the span of the whole literal.
 * On failure:
 *   - `ExpectedNumber` when no digit follows the optional sign,
 *   - `InvalidNumber` / `MissingFractionalDigits` when a `.` has no digits
 *     after it (the span starts at the dot),
 *   - `InvalidNumber` / `NotFinite` when the text does not convert to a
 *     finite number.
 *
 * The cursor is NOT rewound on failure: whatever was consumed before the
 * error was detected (for example a leading `-`) stays consumed.
 */
export function scanNumber(cursor: Cursor): Result<{ value: number, span: Span }, NumberError> {
  const startNumberLocation = cursor.currentLocation();

  // 1. Optional Sign
  if (cursor.peek() === char("-")) {
    cursor.next();
  }

  // 2. Integer Part (at least one digit is mandatory)
  const integerPartDigitCount = cursor.consumeWhile(isDigit);
  if (integerPartDigitCount === 0) {
    return Result.error({
      tag: "ExpectedNumber",
      span: cursor.makeSpan(startNumberLocation),
    });
  }

  // 3. Fractional Part (optional, but a dot must be followed by digits)
  if (cursor.peek() === DOT) {
    const dotLocation = cursor.currentLocation();
    cursor.next(); // consume '.'
    const fracPartDigitCount = cursor.consumeWhile(isDigit);
    if (fracPartDigitCount === 0) {
      return Result.error({
        tag: "InvalidNumber",
        reason: "MissingFractionalDigits",
        span: cursor.makeSpan(dotLocation),
      });
    }
  }

  // 4. Convert the consumed text; extremely long literals can overflow to Infinity.
  const text = cursor.text.sliceByCp(startNumberLocation.index, cursor.currentIndex);
  const value = Number(text);
  if (!Number.isFinite(value)) {
    return Result.error({
      tag: "InvalidNumber",
      reason: "NotFinite",
      span: cursor.makeSpan(startNumberLocation),
    });
  }

  return Result.ok({
    value,
    span: cursor.makeSpan(startNumberLocation),
  });
}
/**
 * Error specific to string literals: a backslash escape inside the string is
 * malformed. `reason` says what is wrong with it.
 */
export type StringError =
| { tag: "InvalidEscape", reason: EscapeErrorReason, span: Span };
/** The ways a backslash escape inside a string literal can be malformed. */
export type EscapeErrorReason =
| { tag: "UnknownEscapeSequence", char: CodePoint } // backslash followed by an unsupported character, e.g. \k
| { tag: "UnicodeMissingBrace" } // \u not followed by {
| { tag: "UnicodeNoDigits" } // no hex digits after \u{ , e.g. \u{}
| { tag: "UnicodeUnclosed" } // hex digits not followed by } , e.g. \u{FF
| { tag: "UnicodeOverflow", value: number }; // value above 0x10FFFF, e.g. \u{110000}; `value` is the parsed number
/**
 * Scans a double-quoted string literal starting at the cursor and decodes
 * its escape sequences.
 *
 * Supported escapes: `\n`, `\r`, `\t`, `\\`, `\0`, `\"` and `\u{HEX}` with a
 * value of at most 0x10FFFF.
 *
 * @returns on success, the decoded contents and the span of the whole
 * literal including both quotes. On failure:
 *   - `UnexpectedEOF` when the input is empty, the closing quote is missing,
 *     or the input ends right after a backslash,
 *   - `UnexpectedCharacter` when the first character is not `"`,
 *   - `InvalidEscape` with a structured reason for a malformed escape.
 *
 * The cursor is not rewound on failure.
 */
export function scanString(cursor: Cursor): Result<{ value: string, span: Span }, StringError | GenericScanError> {
  const start = cursor.currentLocation();

  const firstChar = cursor.peek();
  if (firstChar === undefined) {
    return Result.error({ tag: "UnexpectedEOF", span: cursor.makeSpan(start) });
  }
  if (firstChar !== char('"')) {
    return Result.error({ tag: "UnexpectedCharacter", char: firstChar, span: cursor.makeSpan(start) });
  }
  cursor.next(); // consume opening quote

  let value = ""; // The actual string content

  while (true) {
    const c = cursor.peek();
    if (c === undefined || cursor.eof()) {
      // Ran out of input before the closing quote.
      return Result.error({ tag: "UnexpectedEOF", span: cursor.makeSpan(start) });
    }

    // 1. End of string
    if (c === char('"')) {
      cursor.next(); // consume closing quote
      break;
    }

    // 2. Regular character
    if (c !== char('\\')) {
      cursor.next();
      value += String.fromCodePoint(c);
      continue;
    }

    // 3. Escape Sequences
    const escapeStart = cursor.currentLocation();
    cursor.next(); // consume backslash

    const escaped = cursor.peek();
    if (escaped === undefined) {
      // A backslash was the last character of the input: the literal is
      // unterminated, and there is no character to report as "unknown".
      return Result.error({ tag: "UnexpectedEOF", span: cursor.makeSpan(start) });
    }

    switch (escaped) {
      case char('n'): value += '\n'; cursor.next(); break;
      case char('r'): value += '\r'; cursor.next(); break;
      case char('t'): value += '\t'; cursor.next(); break;
      case char('\\'): value += '\\'; cursor.next(); break;
      case char("0"): value += "\0"; cursor.next(); break;
      case char('"'): value += '"'; cursor.next(); break;

      // Unicode Escape: \u{XXXX}
      case char('u'): {
        cursor.next(); // consume 'u'

        // Expect '{'
        const braceStart = cursor.currentLocation();
        if (cursor.peek() !== char('{')) {
          return Result.error({ tag: "InvalidEscape", reason: { tag: "UnicodeMissingBrace" }, span: cursor.makeSpan(braceStart) });
        }
        cursor.next(); // consume '{'

        // Consume Hex Digits
        const hexStart = cursor.currentIndex;
        const hexCount = cursor.consumeWhile(h =>
          (h >= DIGIT_0 && h <= DIGIT_9) ||
          (h >= LOWERCASE_a && h <= LOWERCASE_f) ||
          (h >= UPPERCASE_A && h <= UPPERCASE_F)
        );
        if (hexCount === 0) {
          return Result.error({ tag: "InvalidEscape", reason: { tag: "UnicodeNoDigits" }, span: cursor.makeSpan(braceStart) });
        }

        // Expect '}'
        if (cursor.peek() !== char("}")) {
          return Result.error({ tag: "InvalidEscape", reason: { tag: "UnicodeUnclosed" }, span: cursor.makeSpan(braceStart) });
        }
        cursor.next(); // consume '}'

        // Convert & Append
        const hexStr = cursor.text.sliceByCp(hexStart, hexStart + hexCount);
        const codePoint = parseInt(hexStr, 16);
        if (codePoint > 0x10FFFF) {
          return Result.error({ tag: "InvalidEscape", reason: { tag: "UnicodeOverflow", value: codePoint }, span: cursor.makeSpan(braceStart) });
        }
        // NOTE(review): surrogate values (0xD800-0xDFFF) are accepted here and
        // produce a lone surrogate in the result - confirm whether that is intended.
        value += String.fromCodePoint(codePoint);
        break;
      }

      default:
        return Result.error({
          tag: "InvalidEscape",
          reason: { tag: "UnknownEscapeSequence", char: escaped },
          span: cursor.makeSpan(escapeStart)
        });
    }
  }

  return Result.ok({
    value,
    span: cursor.makeSpan(start)
  });
}
// TODO: rendering of errors
// (Sketch only - tag and field names follow the StringError / GenericScanError / EscapeErrorReason types declared in this file.)
// function renderStringError(err: StringError | GenericScanError): string {
// switch (err.tag) {
// case "UnexpectedCharacter": return "Expected a string starting with \"";
// case "UnexpectedEOF": return "Unterminated string literal";
// case "InvalidEscape":
// const k = err.reason;
// switch (k.tag) {
// case "UnknownEscapeSequence":
// return `Unknown escape sequence '\\${String.fromCodePoint(k.char)}'`;
// case "UnicodeMissingBrace":
// return "Unicode escape must start with '{', e.g. \\u{1F600}";
// case "UnicodeNoDigits":
// return "Empty unicode escape \\u{}";
// case "UnicodeUnclosed":
// return "Expected '}' to close unicode escape";
// case "UnicodeOverflow":
// return `Unicode code point 0x${k.value.toString(16)} is too large (max 0x10FFFF)`;
// }
// }
// }

View file

@ -1,169 +0,0 @@
import { SourceText } from "./source_text";
import { Scanner, Token } from "./scanner";
// === Test Harness Utilities ===
const RED = "\x1b[31m";
const GREEN = "\x1b[32m";
const RESET = "\x1b[0m";
/**
 * Tiny assertion helper for the test harness: throws an `Error` with
 * `message` unless `condition` is true.
 */
function assert(condition: boolean, message: string) {
  if (condition) {
    return;
  }
  throw new Error(message);
}
/**
 * Asserts that `token` has the tag `expectedTag`. When `expectedValue` is
 * given, also asserts that the token's payload is strictly equal to it; the
 * payload is read from `value` if the token has that field, otherwise from `text`.
 */
function assertToken(token: Token, expectedTag: string, expectedValue?: any) {
  assert(token.tag === expectedTag, `Expected tag '${expectedTag}', got '${token.tag}'`);
  if (expectedValue === undefined) return;

  // Check 'value' for numbers, 'text' for strings
  let actualValue: unknown;
  if ("value" in token) {
    actualValue = token.value;
  } else if ("text" in token) {
    actualValue = token.text;
  }
  assert(actualValue === expectedValue, `Expected value '${expectedValue}', got '${actualValue}'`);
}
// TODO: Rewrite this once `scanToken()` is implemented.
/**
 * Scans a single token out of `source` with a fresh Scanner.
 *
 * There is no token dispatch yet, so this reaches into the Scanner's private
 * methods (through an `any` cast) to unit test them directly: input whose
 * first non-blank character is `"` goes to `scanString`, everything else to
 * `scanNumber`.
 */
function scanOne(source: string): Token {
  const scanner = new Scanner(new SourceText(source)) as any;
  const looksLikeString = source.trim().startsWith('"');
  return looksLikeString ? scanner.scanString() : scanner.scanNumber();
}
/** Checks that positive and negative integer literals scan to number tokens. */
function test_integers() {
  const cases: Array<[string, number]> = [
    ["123", 123],
    ["-500", -500],
  ];
  for (const [literal, expected] of cases) {
    assertToken(scanOne(literal), "number", expected);
  }
  console.log(`${GREEN}✔ Integers passed${RESET}`);
}
/** Checks that positive and negative decimal literals scan to number tokens. */
function test_floats() {
  const cases: Array<[string, number]> = [
    ["3.14159", 3.14159],
    ["-0.001", -0.001],
  ];
  for (const [literal, expected] of cases) {
    assertToken(scanOne(literal), "number", expected);
  }
  console.log(`${GREEN}✔ Floats passed${RESET}`);
}
/**
 * Checks that malformed number literals make the scanner throw an error
 * object carrying the expected `tag`.
 */
function test_number_errors() {
  const cases = [
    // Should fail (trailing dot)
    {
      input: "1.",
      tag: "InvalidNumber",
      notThrown: "Should have thrown error for '1.'",
      wrongTag: "Expected InvalidNumber error for '1.'",
    },
    // Should fail (no leading digit)
    {
      input: ".5",
      tag: "ExpectedNumber",
      notThrown: "Should have thrown error for '.5'",
      wrongTag: "Expected ExpectedNumber error for '.5'",
    },
  ];
  for (const tc of cases) {
    try {
      scanOne(tc.input);
      throw new Error(tc.notThrown);
    } catch (e: any) {
      assert(e.tag === tc.tag, tc.wrongTag);
    }
  }
  console.log(`${GREEN}✔ Number errors passed${RESET}`);
}
/** Checks that a plain string literal and the empty string literal scan to string tokens. */
function test_basic_strings() {
  const cases: Array<[string, string]> = [
    ['"hello world"', "hello world"],
    ['""', ""], // Empty string
  ];
  for (const [literal, expected] of cases) {
    assertToken(scanOne(literal), "string", expected);
  }
  console.log(`${GREEN}✔ Basic strings passed${RESET}`);
}
/** Checks that `\n`, `\t`, `\"` and `\\` are decoded inside string literals. */
function test_string_escapes() {
  const cases: Array<[string, string]> = [
    ['"line1\\nline2"', "line1\nline2"],
    ['"col1\\tcol2"', "col1\tcol2"],
    ['"quote: \\" slash: \\\\"', 'quote: " slash: \\'],
  ];
  for (const [literal, expected] of cases) {
    assertToken(scanOne(literal), "string", expected);
  }
  console.log(`${GREEN}✔ String escapes passed${RESET}`);
}
/** Checks `\u{...}` escapes, including an astral code point and the maximum valid value. */
function test_unicode_escapes() {
  // Rocket emoji: 🚀 (U+1F680)
  const rocket = scanOne('"\\u{1F680}"');
  assertToken(rocket, "string", "🚀");

  // Two escapes: A (U+41) and B (U+42)
  const letters = scanOne('"\\u{41}\\u{42}"');
  assertToken(letters, "string", "AB");

  // Max valid unicode: only required not to throw.
  scanOne('"\\u{10FFFF}"');

  console.log(`${GREEN}✔ Unicode escapes passed${RESET}`);
}
/**
 * Drives one scanner across three lines with mixed line endings and checks
 * the line number recorded in each token's span.
 *
 *   Line 1: 123   (terminated by CRLF)
 *   Line 2: 456   (terminated by LF)
 *   Line 3: "foo"
 */
function test_line_counting() {
  const code = "123\r\n456\n\"foo\"";
  // scanOne() builds a new scanner per call, so a single scanner is driven by
  // hand here, reaching its private methods through an `any` cast.
  const scanner = new Scanner(new SourceText(code)) as any;

  // 123
  const first = scanner.scanNumber();
  assert(first.value === 123, "Line 1 value wrong");
  assert(first.span.line === 1, "Line 1 line# wrong");

  // consume whitespace manually since we are bypassing nextToken()
  scanner.skipWhitespace();

  // 456
  const second = scanner.scanNumber();
  assert(second.value === 456, "Line 2 value wrong");
  assert(second.span.line === 2, "Line 2 line# wrong");

  scanner.skipWhitespace();

  // "foo"
  const third = scanner.scanString();
  assert(third.text === "foo", "Line 3 value wrong");
  assert(third.span.line === 3, "Line 3 line# wrong");

  console.log(`${GREEN}✔ Line counting passed${RESET}`);
}
// === Run All ===
/**
 * Runs every test in order. The first failure is reported in red and
 * terminates the process with exit code 1; otherwise a green summary is printed.
 */
function run_all_tests() {
  console.log("Running Scanner Tests...\n");
  const suite: Array<() => void> = [
    test_integers,
    test_floats,
    test_number_errors,
    test_basic_strings,
    test_string_escapes,
    test_unicode_escapes,
    test_line_counting,
  ];
  try {
    for (const runTest of suite) {
      runTest();
    }
    console.log(`\n${GREEN}ALL TESTS PASSED${RESET}`);
  } catch (e: any) {
    console.error(`\n${RED}TEST FAILED:${RESET}`);
    console.error(e.message || e);
    process.exit(1);
  }
}

run_all_tests();

View file

@ -1,318 +1,85 @@
import { char, isWhitespace, isDigit } from './source_text';
import { CARRIAGE_RETURN, char, NEW_LINE, SPACE, TAB } from './source_text';
import type { SourceText, Span, SourceLocation, CodePoint, StringIndex, CodePointIndex } from './source_text';
function isSymbolChar(c: CodePoint): boolean {
return (
c === char("#") ||
c === char("$") ||
c === char("@") ||
c === char("(") ||
c === char(")") ||
c === char("{") ||
c === char("}") ||
c === char(",") ||
c === char(".")
);
import { isDigit, isWhitespace, scanNumber, scanString } from './cursor';
import type { Cursor, CursorState, GenericScanError, NumberError, StringError } from './cursor';
import { Result } from '../result';
// === Language Specific Stuff ===
// Characters that end a raw identifier: scanRawIdentifier stops in front of any of them.
const DELIMITER_CHARS = ["(", ")", "{", "}", ".", ",", "@", "$", "#", '"', "\\"] as const;
// Union of the delimiter characters above, as string literal types.
export type Delimiter = typeof DELIMITER_CHARS[number];
// The same delimiters converted with char() for membership tests against scanned code points.
const DELIMITER_SET: Set<CodePoint> = new Set(DELIMITER_CHARS.map(c => char(c)));
export type Keyword = "let" | "fn" | "match" | "apply" | "=" | "|" | "!";
/**
 * Consumes a raw identifier: every character up to (not including) the next
 * delimiter, whitespace character, or end of input.
 *
 * Returns the consumed text, which may be empty. It does NOT build a token;
 * the caller decides what kind of token the text represents.
 *
 * TODO: comments are meant to count as whitespace here, but a comment opener
 * is two characters long, so this one-character predicate cannot detect it yet.
 */
function scanRawIdentifier(cursor: Cursor): string {
  const firstIndex = cursor.currentIndex;
  const belongsToIdentifier = (c: CodePoint): boolean =>
    !DELIMITER_SET.has(c) && !isWhitespace(c);
  cursor.consumeWhile(belongsToIdentifier);
  const endIndex = cursor.currentIndex;
  return cursor.text.sliceByCp(firstIndex, endIndex);
}
function isIdentifierChar(char: CodePoint): boolean {
return !isWhitespace(char) && !isSymbolChar(char);
}
/**
 * Errors that scanning the start of an expression can report: an unexpected
 * character, an invalid identifier (with the offending text and a reason),
 * or any error from the number and string scanners.
 */
export type ExprScanError =
| { tag: "UnexpectedCharacter", char: CodePoint, span: Span }
| { tag: "InvalidIdentifier", text: string, reason: string, span: Span }
| NumberError
| StringError;
export type Keyword = "let" | "fn" | "match" | "apply" | "=" | "|";
export type Symbol = "#" | "$" | "@" | "(" | ")" | "{" | "}" | "," | ".";
// === Scanner ===
export type Token =
export type ExprStartToken =
| { tag: "number", value: number, span: Span }
| { tag: "string", text: string, span: Span }
| { tag: "identifier", text: string, span: Span }
| { tag: "function_name", name: string, span: Span }
| { tag: "variable_use", name: string, span: Span }
| { tag: "tag", name: string, span: Span }
| { tag: "tagged", name: string, span: Span } // TODO: This may be a bit weird. Actually we can lookahead and see if the next char after the identifier is NOT-AN-EXPRESSION start.
| { tag: "keyword", kw: Keyword, span: Span }
| { tag: "symbol", sym: Symbol, span: Span }
| { tag: "EOF", span: Span }
export namespace TokenKind {
export type T =
| { tag: "number" }
| { tag: "string" }
| { tag: "identifier" }
| { tag: "symbol", value: Symbol }
| { tag: "keyword", value: Keyword }
| { tag: "EOF" }
}
// TODO: Move this back to `cursor.ts`
export type LexError =
| { tag: "UnexpectedCharacter", char: CodePoint, span: Span }
| { tag: "UnexpectedEOF", span: Span }
| { tag: "ExpectedNumber", span: Span }
| { tag: "InvalidNumber", text: string, reason: string, span: Span }
| { tag: "InvalidEscape", reason: string, span: Span };
function skipWhitespaceAndComments(cursor: Cursor): number {
let totalConsumed = 0;
export class Scanner {
private i: CodePointIndex = 0;
private line = 1;
private column = 1;
while (true) {
// 1. Consume standard whitespace (spaces, tabs, newlines)
const wsCount = cursor.consumeWhile(isWhitespace);
totalConsumed += wsCount;
// Track previous char to handle \r\n correctly
private lastCharWasCR = false;
// 2. Check for Line Comment start ('//')
const c = cursor.peek();
const nextC = cursor.peek(1);
constructor(private readonly text: SourceText) {}
if (c === char('/') && nextC === char('/')) {
// Found comment start. Consume the '//' markers
cursor.next();
cursor.next();
totalConsumed += 2;
eof(): boolean {
return this.i >= this.text.length;
}
private peek(n: number = 0): CodePoint | undefined {
return this.text.chars[this.i + n]?.char;
}
private next(): CodePoint | undefined {
const ref = this.text.chars[this.i];
if (!ref) return undefined;
const c = ref.char;
this.i++;
if (c === 0x0A /* \n */) {
if (!this.lastCharWasCR) {
this.line++;
this.column = 1;
} else {
// We just saw \r, so this \n is part of \r\n.
// We already bumped the line count on \r.
// Just reset the flag.
this.lastCharWasCR = false;
}
} else if (c === 0x0D /* \r */) {
this.line++;
this.column = 1;
this.lastCharWasCR = true;
// Consume everything until the next newline (or EOF).
// Note: We do NOT consume the newline itself here.
// We let the NEXT iteration of the 'while(true)' loop catch
// the newline as standard whitespace.
const commentContentLength = cursor.consumeWhile(c => c !== NEW_LINE && c !== CARRIAGE_RETURN);
totalConsumed += commentContentLength;
} else {
this.column++;
this.lastCharWasCR = false;
}
return c;
}
private currentOffset(): StringIndex {
return this.text.chars[this.i]?.offset ?? this.text.source.length;
}
private currentLocation(): SourceLocation {
return { index: this.i, line: this.line, column: this.column };
}
private makeSpan(start: SourceLocation): Span {
const startOffset =
this.text.chars[start.index]?.offset ?? this.text.source.length;
const endOffset = this.currentOffset();
return {
start: startOffset,
end: endOffset,
line: start.line,
column: start.column,
};
}
private consumeWhile(pred: (c: CodePoint) => boolean): number {
let count = 0;
while (!this.eof()) {
const c = this.peek();
if (c === undefined || !pred(c)) break;
this.next();
count++;
}
return count;
}
private expect(
pred: (c: CodePoint) => boolean,
error: LexError
): CodePoint {
const c = this.peek();
if (c === undefined || !pred(c)) {
throw error;
}
this.next();
return c;
}
// Helper to check for exact char matches quickly
private match(c: CodePoint): boolean {
if (this.peek() === c) {
this.next();
return true;
}
return false;
}
private skipWhitespace() {
this.consumeWhile(isWhitespace);
}
// === Main Scanners ===
private scanNumber(): Token {
// number :=
// | optional(`-`) digits optional(`.` digits)
const startNumberLocation = this.currentLocation();
let c: CodePoint;
// 1. Optional Sign
c = this.peek();
if (c === char("-")) {
this.next();
}
// 2. Integer Part
c = this.peek();
const integerPartDigitCount = this.consumeWhile(isDigit);
if (integerPartDigitCount === 0) {
throw <LexError>{
tag: "ExpectedNumber",
span: this.makeSpan(startNumberLocation),
};
}
// 3. Fractional Part
if (this.peek() === char(".")) {
const dotLocation = this.currentLocation();
this.next(); // consume '.'
const fracPartDigitCount = this.consumeWhile(isDigit);
if (fracPartDigitCount === 0) {
throw <LexError>{
tag: "InvalidNumber",
reason: "MissingFractionalDigits",
span: this.makeSpan(dotLocation),
};
}
}
const text = this.text.sliceByCp(startNumberLocation.index, this.i);
const value = Number(text);
if (!Number.isFinite(value)) {
throw <LexError>{
tag: "InvalidNumber",
reason: "NotFinite",
span: this.makeSpan(startNumberLocation),
};
}
return {
tag: "number",
value,
span: this.makeSpan(startNumberLocation),
};
}
private scanString(): Token {
const start = this.currentLocation();
// We assume the caller checked the opening quote '"'
this.expect(c => c === char('"'), <LexError>{ tag: "UnexpectedCharacter", span: this.makeSpan(start) });
let value = ""; // The actual string content
while (true) {
if (this.eof()) {
throw <LexError>{ tag: "UnexpectedEOF", span: this.makeSpan(start) };
}
const c = this.peek();
// 1. End of string
if (c === char('"')) {
this.next(); // consume closing quote
// We are not at a comment.
// If we also didn't consume any whitespace in step 1, we are truly done.
if (wsCount === 0) {
break;
}
if (c === char('\\')) {
// 2. Escape Sequences
const escapeStart = this.currentLocation();
this.next(); // consume backslash
const escaped = this.peek();
switch (escaped) {
case char('n'): value += '\n'; this.next(); break;
case char('r'): value += '\r'; this.next(); break;
case char('t'): value += '\t'; this.next(); break;
case char('\\'): value += '\\'; this.next(); break;
case char("0"): value += "\0"; break;
case char('"'): value += '"'; this.next(); break;
// Unicode Escape: \u{XXXX}
case char('u'): {
this.next(); // consume 'u'
// Expect '{'
const braceStart = this.currentLocation();
if (this.peek() !== char('{')) {
throw <LexError>{ tag: "InvalidEscape", reason: "Expected '{' after \\u", span: this.makeSpan(braceStart) };
}
this.next(); // consume '{'
// Consume Hex Digits
const hexStart = this.i;
const hexCount = this.consumeWhile(c =>
(c >= char('0') && c <= char('9')) ||
(c >= char('a') && c <= char('f')) ||
(c >= char('A') && c <= char('F'))
);
if (hexCount === 0) {
throw <LexError>{ tag: "InvalidEscape", reason: "Expected hex digits in \\u{...}", span: this.makeSpan(braceStart) };
}
// Expect '}'
if (this.peek() !== char("}")) {
throw <LexError>{ tag: "InvalidEscape", reason: "Expected '}' closing unicode escape", span: this.makeSpan(braceStart) };
}
this.next(); // consume '}'
// Convert & Append
const hexStr = this.text.sliceByCp(hexStart, hexStart + hexCount);
const codePoint = parseInt(hexStr, 16);
if (codePoint > 0x10FFFF) {
throw <LexError>{ tag: "InvalidEscape", reason: "Invalid Unicode Code Point (max 0x10FFFF)", span: this.makeSpan(braceStart) };
}
value += String.fromCodePoint(codePoint);
break;
}
default:
throw <LexError>{
tag: "InvalidEscape",
reason: `UnknownEscapeSequence`,
span: this.makeSpan(escapeStart)
};
}
} else {
// 3. Regular character
// Optimization: consume chunks of non-special chars for speed?
// For now, char-by-char is fine.
this.next();
// Note: We use ! because we checked EOF at loop start
value += String.fromCodePoint(c!);
}
}
return {
tag: "string",
text: value,
span: this.makeSpan(start)
};
}
return totalConsumed;
}
/**
 * Intended entry point for scanning the token that starts an expression.
 *
 * NOT IMPLEMENTED YET: the body ignores `cursor` and returns the number 0
 * cast to `any`, which is not a valid `Result`. Do not call this until the
 * TODO below is resolved.
 */
export function scanExprStart(cursor: Cursor): Result<ExprStartToken, ExprScanError> {
// TODO
return (0 as any);
}
// TODO: Need a Token to TokenKind function
// TODO: Need is_start_of_expression(token): boolean

View file

@ -17,7 +17,6 @@ export type CodePointRef = {
export class SourceText {
readonly source: string;
// TODO: Later you can try to change this to two `Uint32Array`s - one for codepoints (each 20 bit but whatever), the other for pointers to original string.
//
readonly chars: CodePointRef[];
// Stores the CodePointIndex where each line begins
@ -40,17 +39,16 @@ export class SourceText {
i += size;
// === Newline Logic ===
// 0x0A is '\n', 0x0D is '\r'
if (char === 0x0A) {
if (char === NEW_LINE) {
// Found a newline, the NEXT char starts a new line
this.lineStarts.push(cpIndex + 1);
}
// Handle CR (Classic Mac) or CRLF start
else if (char === 0x0D) {
else if (char === CARRIAGE_RETURN) {
// Check if the next char is '\n' (CRLF)
// We peek ahead in the raw string to see if we need to skip the \n for line counting purposes
// or just treat this as a newline.
const nextIsNL = i < source.length && source.codePointAt(i) === 0x0A;
const nextIsNL = i < source.length && source.codePointAt(i) === NEW_LINE;
if (!nextIsNL) {
// Only push if it's NOT CRLF. If it is CRLF, the loop handles the \n next.
this.lineStarts.push(cpIndex + 1);
@ -101,15 +99,6 @@ export function sourceText(s: string) {
return new SourceText(s);
}
/** True for space (0x20), tab (0x09), line feed (0x0A) and carriage return (0x0D). */
export function isWhitespace(char: CodePoint): boolean {
  switch (char) {
    case 0x20: // space
    case 0x09: // tab
    case 0x0A: // \n
    case 0x0D: // \r
      return true;
    default:
      return false;
  }
}
/** True when the code point is an ASCII digit, i.e. in the inclusive range 0x30 ('0') to 0x39 ('9'). */
export function isDigit(char: CodePoint): boolean {
  const atLeastZero = char >= 0x30;
  return atLeastZero && char <= 0x39;
}
export type Span = {
start: StringIndex,
end: StringIndex,
@ -123,3 +112,21 @@ export type SourceLocation = {
column: number; // 1-based
};
// Named code points, so scanning code can compare against names instead of raw hex values.
// Whitespace characters
export const NEW_LINE: CodePoint = char('\n');
export const CARRIAGE_RETURN: CodePoint = char('\r');
export const SPACE: CodePoint = char(' ');
export const TAB: CodePoint = char('\t');
// Digit boundaries: first and last decimal digit, for inclusive range checks
export const DIGIT_0: CodePoint = char('0');
export const DIGIT_9: CodePoint = char('9');
// The '.' character (not a digit boundary; grouped here with the digits)
export const DOT: CodePoint = char('.');
// Hex boundaries: first and last hex letter in each case, for inclusive range checks
export const LOWERCASE_a: CodePoint = char('a');
export const UPPERCASE_A: CodePoint = char('A');
export const LOWERCASE_f: CodePoint = char('f');
export const UPPERCASE_F: CodePoint = char('F');

10
src/result.ts Normal file
View file

@ -0,0 +1,10 @@
/**
 * Outcome of an operation that can fail without throwing: either
 * `{ tag: "ok", value }` or `{ tag: "error", error }`.
 * Discriminate on `tag` before reading `value` or `error`.
 */
export type Result<T, E> =
  | { tag: "ok", value: T }
  | { tag: "error", error: E }

/** Constructors for the two `Result` variants. */
export namespace Result {
  /** Wraps `value` in the successful variant. */
  export function ok<T, E>(value: T): Result<T, E> {
    const success: Result<T, E> = { tag: "ok", value: value };
    return success;
  }

  /** Wraps `error` in the failed variant. */
  export function error<T, E>(error: E): Result<T, E> {
    const failure: Result<T, E> = { tag: "error", error: error };
    return failure;
  }
}

View file

@ -12,6 +12,6 @@ npm install -D sass-embedded
# Tests
npx ts-node src/parser/scanner.test.ts
npx ts-node src/parser/cursor.test.ts