Improve and abstract Cursor design. Start scanner

2026-02-06 00:38:16 +01:00 · 2026-02-06 00:38:16 +01:00 · d5f9777711
commit d5f9777711
parent d382b16e6d
8 changed files with 713 additions and 476 deletions
--- a/src/parser/SCANNER.md
+++ b/src/parser/SCANNER.md
@ -0,0 +1,47 @@
+Suppose the parser for expressions is already implemented and is run on some string that is expected to be an expression. To parse it correctly, the parser needs to decide which sub-parser to activate - so we use the scanner to determine what sort of expression we may be looking at.
+
+It can either be
+| number
+| string
+| variable-use
+| tag
+| tagged-value
+| tuple
+| record
+
+| top-level function call
+| match
+| let binding
+| lambda-abstraction
+| lambda-application
+
+
+The syntax is designed in such a way that we don't need to do a deep-lookahead.
+
+Basically in the scanner we try the following in this exact order to determine what to parse next:
+- peek one character
+- digit?, yes -> number
+- symbol `-`?, peek at the next character,
+  - digit? -> number
+  - not-a-digit? -> the only possibility is that we're looking at an identifier that happens to start with `-`, and we discuss how to handle identifiers at the end of this process...
+- symbol `"`?, yes -> string
+- symbol `$`?, yes -> variable-use
+- symbol `#`?, yes ->
+  - we have either a tag or a tagged-value (we don't know which yet...). At first glance, deciding which seems to be the parser's job rather than the scanner's.
+    TODO:
+    But actually we can do this in scanner. After we scan the tag-name, we can do a lookahead and see if the next char is NOT-EXPRESSION-START, which is possible to do in our grammar.
+- symbol `(`?, yes -> tuple
+- symbol `{`?, yes -> record
+- here the only possibilities are: a top-level function call, some construct that starts syntactically with a keyword, or an error. So the scanner needs to attempt a raw-identifier scan.
+- keyword? easy
+  - match? -> match-expression
+  - let? -> let-expression
+  - fn? -> lambda-abstraction
+  - apply? -> lambda-application
+- not-a-keyword? then we need to validate:
+  It is either a valid identifier (in this specific case function-name) or invalid identifier.
+  In case it is an invalid identifier, it would be nice to also predict typical errors (like are we looking at punctuation?) - but this is a bit hard - maybe this shouldn't be the job of the scanner - parser will have more context in general. In this case, we're just expecting a top-level function call. But this kind of analysis will have to be done for other invalid identifiers...
+
+raw-identifier scan just consumes any character until it encounters a delimiter `$ # @ ( ) { } " , . \` or whitespace (I include comments as whitespace here).
+
+The job of a validator for a raw identifier is to decide whether we're looking at a keyword or an actual valid identifier.
--- a/src/parser/cursor.test.ts
+++ b/src/parser/cursor.test.ts
@ -0,0 +1,237 @@
+import { SourceText } from "./source_text";
+import { Cursor, scanString, scanNumber } from "./cursor";
+import { Result } from "../result";
+
+// === Test Harness Utilities ===
+const RED = "\x1b[31m";
+const GREEN = "\x1b[32m";
+const RESET = "\x1b[0m";
+
+/** Throws an Error carrying `message` unless `condition` holds. */
+function assert(condition: boolean, message: string) {
+  if (condition) return;
+  throw new Error(message);
+}
+
+/**
+ * Asserts that `result` is the "ok" variant and that the `.value` field of its
+ * payload is strictly equal (`===`) to `expectedValue`.
+ * Throws an Error naming the scan error's tag (and reason, if any) otherwise.
+ */
+function assertOk<T>(result: Result<T, any>, expectedValue: any) {
+  if (result.tag !== "error") {
+    // Both number and string scan results expose their payload as `.value`.
+    const actual = (result.value as any).value;
+    assert(actual === expectedValue, `Expected value '${expectedValue}', got '${actual}'`);
+    return;
+  }
+
+  // Structured errors may carry a `reason`; include it in the message when present.
+  const err = result.error as any;
+  const reason = err.reason ? ` (Reason: ${JSON.stringify(err.reason)})` : "";
+  throw new Error(`Expected Ok, got Error: ${err.tag}${reason}`);
+}
+
+/**
+ * Asserts that `result` is the "error" variant with tag `expectedTag`.
+ * When `expectedReason` is given, the error's `reason` must match it as well:
+ * object reasons are compared through their JSON serialization, anything else
+ * with `===`.
+ */
+function assertError(result: Result<any, any>, expectedTag: string, expectedReason?: string | object) {
+  if (result.tag === "ok") {
+    throw new Error(`Expected Error '${expectedTag}', but got Ok with value: ${(result.value as any).value}`);
+  }
+
+  const err = result.error;
+  assert(err.tag === expectedTag, `Expected error tag '${expectedTag}', got '${err.tag}'`);
+
+  // No reason requested: the tag check above is all there is to verify.
+  if (expectedReason === undefined) return;
+
+  const actualReason = (err as any).reason;
+
+  if (typeof expectedReason !== 'object') {
+    assert(actualReason === expectedReason,
+           `Expected reason '${expectedReason}', got '${actualReason}'`);
+    return;
+  }
+
+  // Structured reasons (e.g. Unicode escape errors) are compared deeply-ish via JSON.
+  const expectedJson = JSON.stringify(expectedReason);
+  const actualJson = JSON.stringify(actualReason);
+  assert(actualJson === expectedJson,
+         `Expected reason ${expectedJson}, got ${actualJson}`);
+}
+
+// === Number Tests ===
+
+// Integer literals, with and without a leading minus sign.
+function test_integers() {
+  const cases: Array<[string, number]> = [
+    ["123", 123],
+    ["-500", -500],
+  ];
+
+  for (const [input, expected] of cases) {
+    const cursor = new Cursor(new SourceText(input));
+    assertOk(scanNumber(cursor), expected);
+  }
+
+  console.log(`${GREEN}✔ Integers passed${RESET}`);
+}
+
+// Literals with a fractional part, positive and negative.
+function test_floats() {
+  const cases: Array<[string, number]> = [
+    ["3.14159", 3.14159],
+    ["-0.001", -0.001],
+  ];
+
+  for (const [input, expected] of cases) {
+    const cursor = new Cursor(new SourceText(input));
+    assertOk(scanNumber(cursor), expected);
+  }
+
+  console.log(`${GREEN}✔ Floats passed${RESET}`);
+}
+
+// Malformed number literals must be reported with the right error tag/reason.
+function test_number_errors() {
+  // 1. Trailing dot: a `.` must be followed by at least one digit.
+  const c1 = new Cursor(new SourceText("1."));
+  const r1 = scanNumber(c1);
+  assertError(r1, "InvalidNumber", "MissingFractionalDigits");
+
+  // 2. No leading digit (.5): the integer part is mandatory, so scanning
+  //    stops before the dot with zero digits consumed.
+  const c2 = new Cursor(new SourceText(".5"));
+  const r2 = scanNumber(c2);
+  assertError(r2, "ExpectedNumber");
+
+  // 3. Sign without any digits is a hard error.
+  const c3 = new Cursor(new SourceText("-")); // Just a minus
+  const r3 = scanNumber(c3);
+  assertError(r3, "ExpectedNumber");
+
+  console.log(`${GREEN}✔ Number errors passed${RESET}`);
+}
+
+// === String Tests ===
+
+// Plain string literals without escapes, including the empty string.
+function test_basic_strings() {
+  const cases: Array<[string, string]> = [
+    ['"hello world"', "hello world"],
+    ['""', ""], // Empty string
+  ];
+
+  for (const [input, expected] of cases) {
+    const cursor = new Cursor(new SourceText(input));
+    assertOk(scanString(cursor), expected);
+  }
+
+  console.log(`${GREEN}✔ Basic strings passed${RESET}`);
+}
+
+// Single-character escape sequences: \n, \t, \", \\ and \0.
+function test_string_escapes() {
+  const cases: Array<[string, string]> = [
+    ['"line1\\nline2"', "line1\nline2"],
+    ['"col1\\tcol2"', "col1\tcol2"],
+    ['"quote: \\" slash: \\\\"', 'quote: " slash: \\'],
+    ['"null\\0byte"', "null\0byte"], // Null byte test
+  ];
+
+  for (const [input, expected] of cases) {
+    const cursor = new Cursor(new SourceText(input));
+    assertOk(scanString(cursor), expected);
+  }
+
+  console.log(`${GREEN}✔ String escapes passed${RESET}`);
+}
+
+// `\u{...}` escapes: valid code points, the upper boundary, and every error reason.
+function test_unicode_escapes() {
+  // Rocket emoji: 🚀 (U+1F680)
+  const c1 = new Cursor(new SourceText('"\\u{1F680}"'));
+  assertOk(scanString(c1), "🚀");
+
+  // Two escapes: A (U+41) and B (U+42)
+  const c2 = new Cursor(new SourceText('"\\u{41}\\u{42}"'));
+  assertOk(scanString(c2), "AB");
+
+  // Max valid code point: the overflow check is `> 0x10FFFF`, so this must pass
+  const cMax = new Cursor(new SourceText('"\\u{10FFFF}"'));
+  assertOk(scanString(cMax), String.fromCodePoint(0x10FFFF));
+
+  // Error: Missing Brace
+  const c3 = new Cursor(new SourceText('"\\u1F680"'));
+  assertError(scanString(c3), "InvalidEscape", { tag: "UnicodeMissingBrace" });
+
+  // Error: Empty
+  const c4 = new Cursor(new SourceText('"\\u{}"'));
+  assertError(scanString(c4), "InvalidEscape", { tag: "UnicodeNoDigits" });
+
+  // Error: Unclosed (hex digits not followed by '}')
+  const cUnclosed = new Cursor(new SourceText('"\\u{41"'));
+  assertError(scanString(cUnclosed), "InvalidEscape", { tag: "UnicodeUnclosed" });
+
+  // Error: Overflow (the reason carries the offending value)
+  const c5 = new Cursor(new SourceText('"\\u{110000}"'));
+  assertError(scanString(c5), "InvalidEscape", { tag: "UnicodeOverflow", value: 0x110000 });
+
+  console.log(`${GREEN}✔ Unicode escapes passed${RESET}`);
+}
+
+// Line/column tracking across mixed line endings with a single shared cursor.
+function test_cursor_tracking() {
+  // Mixed line endings:
+  // Line 1: 123 (CRLF)
+  // Line 2: 456 (LF)
+  // Line 3: "foo"
+  const code = "123\r\n456\n\"foo\"";
+  const src = new SourceText(code);
+  const cursor = new Cursor(src);
+
+  // 1. Scan 123
+  const r1 = scanNumber(cursor);
+  assertOk(r1, 123);
+  // The span records where the token STARTED, not where the cursor is now.
+  const span1 = (r1 as any).value.span;
+  assert(span1.line === 1, "Line 1 line# wrong");
+  assert(span1.column === 1, "Line 1 col# wrong");
+
+  // 2. Skip Whitespace (Scanner logic simulation)
+  // We need to manually skip \r\n
+  cursor.next(); // \r
+  cursor.next(); // \n
+
+  // 3. Scan 456 — CRLF must count as ONE line break and leave the column at 1
+  const r2 = scanNumber(cursor);
+  assertOk(r2, 456);
+  const span2 = (r2 as any).value.span;
+  assert(span2.line === 2, "Line 2 line# wrong");
+  assert(span2.column === 1, "Line 2 col# wrong");
+
+  // 4. Skip \n
+  cursor.next();
+
+  // 5. Scan "foo" — a bare LF resets the column as well
+  const r3 = scanString(cursor);
+  assertOk(r3, "foo");
+  const span3 = (r3 as any).value.span;
+  assert(span3.line === 3, "Line 3 line# wrong");
+  assert(span3.column === 1, "Line 3 col# wrong");
+
+  console.log(`${GREEN}✔ Cursor tracking passed${RESET}`);
+}
+
+// === Run All ===
+// Runs every test in order. The first failure is printed and the process
+// exits with status 1; otherwise a success banner is printed.
+function run_all_tests() {
+  console.log("Running Scanner Tests...\n");
+  try {
+    test_integers();
+    test_floats();
+    test_number_errors();
+    test_basic_strings();
+    test_string_escapes();
+    test_unicode_escapes();
+    test_cursor_tracking();
+
+    console.log(`\n${GREEN}ALL TESTS PASSED${RESET}`);
+  } catch (e: unknown) {
+    console.error(`\n${RED}TEST FAILED:${RESET}`);
+    // Prefer the message of a thrown Error; fall back to the raw thrown value.
+    console.error(e instanceof Error && e.message ? e.message : e);
+    process.exit(1);
+  }
+}
+
+run_all_tests();
--- a/src/parser/cursor.ts
+++ b/src/parser/cursor.ts
@ -0,0 +1,338 @@
+import { char, NEW_LINE, CARRIAGE_RETURN, DOT, DIGIT_0, DIGIT_9, LOWERCASE_a, LOWERCASE_f, UPPERCASE_A, UPPERCASE_F, SPACE, TAB } from './source_text';
+import type { SourceText, Span, SourceLocation, CodePoint, StringIndex, CodePointIndex } from './source_text';
+import { Result } from '../result';
+ 
+export type CursorState = {
+  index: CodePointIndex,
+  line: number,
+  column: number,
+  lastCharWasCR: boolean,
+}
+
+/**
+ * A reading position over a `SourceText`, advanced one code point at a time.
+ *
+ * Besides the code point index, the cursor tracks a 1-based line and column.
+ * `\n`, `\r` and `\r\n` each count as a single line break. The whole position
+ * can be captured with `save()` and reinstated with `restore()` for backtracking.
+ */
+export class Cursor {
+  private index: CodePointIndex = 0;
+  private line: number = 1;
+  private column: number = 1;
+  // Set right after consuming `\r`, so that an immediately following `\n`
+  // is treated as the second half of `\r\n` and not as another line break.
+  private lastCharWasCR: boolean = false;
+
+  constructor(readonly text: SourceText) {}
+
+  /** Returns a snapshot of the current position (index, line, column, CR flag). */
+  save(): CursorState {
+    return { index: this.index, line: this.line, column: this.column, lastCharWasCR: this.lastCharWasCR };
+  }
+
+  /** Moves the cursor back (or forward) to a snapshot produced by `save()`. */
+  restore({ index, line, column, lastCharWasCR }: CursorState) {
+    this.index = index;
+    this.line = line;
+    this.column = column;
+    this.lastCharWasCR = lastCharWasCR;
+  }
+
+  /** True once the index has reached `text.length`. */
+  eof(): boolean {
+    return this.index >= this.text.length;
+  }
+
+  /**
+   * Returns the code point `n` positions ahead of the cursor without consuming
+   * anything, or `undefined` when that position holds no character.
+   */
+  peek(n: number = 0): CodePoint | undefined {
+    return this.text.chars[this.index + n]?.char;
+  }
+
+  /**
+   * Consumes one code point, updates line/column tracking, and returns the
+   * consumed code point. Returns `undefined` (and changes nothing) when there
+   * is no character at the current index.
+   */
+  next(): CodePoint | undefined {
+    const ref = this.text.chars[this.index];
+    if (!ref) return undefined;
+
+    const c = ref.char;
+    this.index++;
+
+    if (c === NEW_LINE) {
+      if (!this.lastCharWasCR) {
+        this.line++;
+        this.column = 1;
+      } else {
+        // This `\n` completes a `\r\n` pair: the line was already bumped
+        // (and the column reset) when the `\r` was consumed.
+        this.lastCharWasCR = false;
+      }
+    } else if (c === CARRIAGE_RETURN) {
+      this.line++;
+      this.column = 1;
+      this.lastCharWasCR = true;
+    } else {
+      this.column++;
+      this.lastCharWasCR = false;
+    }
+
+    return c;
+  }
+
+  /** The current position as a code point index. */
+  get currentIndex(): CodePointIndex {
+    return this.index;
+  }
+
+  /**
+   * The `offset` recorded for the character at the cursor, or the length of
+   * the underlying source string when the cursor is past the last character.
+   */
+  currentOffset(): StringIndex {
+    return this.text.chars[this.index]?.offset ?? this.text.source.length;
+  }
+
+  /** The current index together with the tracked line and column. */
+  currentLocation(): SourceLocation {
+    return { index: this.index, line: this.line, column: this.column };
+  }
+
+  /**
+   * Builds a span that begins at `start` and ends at the cursor's current
+   * offset. The span's line and column are those of `start`.
+   */
+  makeSpan(start: SourceLocation): Span {
+    const startOffset =
+      this.text.chars[start.index]?.offset ?? this.text.source.length;
+    const endOffset = this.currentOffset();
+
+    return {
+      start: startOffset,
+      end: endOffset,
+      line: start.line,
+      column: start.column,
+    };
+  }
+
+  /**
+   * Consumes code points for as long as `pred` accepts them and input remains.
+   * Returns how many code points were consumed.
+   */
+  consumeWhile(pred: (c: CodePoint) => boolean): number {
+    let count = 0;
+    while (!this.eof()) {
+      const c = this.peek();
+      if (c === undefined || !pred(c)) break;
+      this.next();
+      count++;
+    }
+    return count;
+  }
+
+  /**
+   * Consumes the next code point and returns true if it equals `c`;
+   * otherwise consumes nothing and returns false.
+   */
+  match(c: CodePoint): boolean {
+    if (this.peek() === c) {
+      this.next();
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * Checks whether the upcoming characters spell out `str`.
+   * If yes, consumes them and returns true.
+   * If no, touches nothing and returns false.
+   */
+  matchString(str: string): boolean {
+    // The cursor advances by code points, while `str.length` counts UTF-16
+    // code units; the two differ for characters outside the BMP. Count code
+    // points explicitly so such strings can match.
+    const codePointCount = Array.from(str).length;
+    const end = this.index + codePointCount;
+    if (end > this.text.length) return false;
+
+    if (this.text.sliceByCp(this.index, end) !== str) return false;
+
+    // Advance through next() so that line/column tracking stays correct.
+    for (let i = 0; i < codePointCount; i++) {
+      this.next();
+    }
+    return true;
+  }
+}
+
+// === Basic Scanners/Predicates ===
+/** True for the four whitespace code points: space, tab, line feed and carriage return. */
+export function isWhitespace(char: CodePoint): boolean {
+  switch (char) {
+    case SPACE:
+    case TAB:
+    case NEW_LINE:
+    case CARRIAGE_RETURN:
+      return true;
+    default:
+      return false;
+  }
+}
+
+/** True when `char` lies in the inclusive range DIGIT_0..DIGIT_9. */
+export function isDigit(char: CodePoint): boolean {
+  const notBelowZero = DIGIT_0 <= char;
+  const notAboveNine = char <= DIGIT_9;
+  return notBelowZero && notAboveNine;
+}
+
+export type GenericScanError =
+  | { tag: "UnexpectedCharacter", char: CodePoint, span: Span }
+  | { tag: "UnexpectedEOF", span: Span }
+
+export type NumberError =
+  | { tag: "ExpectedNumber", span: Span }
+  | { tag: "InvalidNumber", reason: "NotFinite" | "MissingFractionalDigits", span: Span }
+
+/**
+ * Scans a number literal starting at the cursor.
+ *
+ *   number := optional(`-`) digits optional(`.` digits)
+ *
+ * On success returns the numeric value and the span of the whole literal.
+ *
+ * Errors:
+ *  - `ExpectedNumber`: no digit follows the optional sign.
+ *  - `InvalidNumber` / `MissingFractionalDigits`: a `.` is not followed by a
+ *    digit; the span starts at the `.`.
+ *  - `InvalidNumber` / `NotFinite`: the literal text does not convert to a
+ *    finite number.
+ *
+ * The cursor is not rewound on error; callers that need to backtrack should
+ * use `cursor.save()` / `cursor.restore()`.
+ */
+export function scanNumber(cursor: Cursor): Result<{ value: number, span: Span }, NumberError> {
+  const startNumberLocation = cursor.currentLocation();
+
+  // 1. Optional sign
+  cursor.match(char("-"));
+
+  // 2. Integer part (at least one digit is mandatory)
+  const integerPartDigitCount = cursor.consumeWhile(isDigit);
+  if (integerPartDigitCount === 0) {
+    return Result.error({
+      tag: "ExpectedNumber",
+      span: cursor.makeSpan(startNumberLocation),
+    });
+  }
+
+  // 3. Optional fractional part
+  if (cursor.peek() === DOT) {
+    const dotLocation = cursor.currentLocation();
+
+    cursor.next(); // consume '.'
+
+    const fracPartDigitCount = cursor.consumeWhile(isDigit);
+    if (fracPartDigitCount === 0) {
+      return Result.error({
+        tag: "InvalidNumber",
+        reason: "MissingFractionalDigits",
+        span: cursor.makeSpan(dotLocation),
+      });
+    }
+  }
+
+  // 4. Convert the consumed text
+  const text = cursor.text.sliceByCp(startNumberLocation.index, cursor.currentIndex);
+  const value = Number(text);
+
+  if (!Number.isFinite(value)) {
+    return Result.error({
+      tag: "InvalidNumber",
+      reason: "NotFinite",
+      span: cursor.makeSpan(startNumberLocation),
+    });
+  }
+  return Result.ok({
+    value,
+    span: cursor.makeSpan(startNumberLocation),
+  });
+}
+
+export type StringError =
+  | { tag: "InvalidEscape", reason: EscapeErrorReason, span: Span };
+
+export type EscapeErrorReason = 
+  | { tag: "UnknownEscapeSequence", char: CodePoint } // e.g. \k
+  | { tag: "UnicodeMissingBrace" }                    // \u without {
+  | { tag: "UnicodeNoDigits" }                        // \u{}
+  | { tag: "UnicodeUnclosed" }                        // \u{FF without }
+  | { tag: "UnicodeOverflow", value: number };        // \u{110000}
+
+// Accepts 0-9, a-f and A-F. Hoisted so the predicate is not re-created for every `\u{...}` escape.
+function isHexDigit(c: CodePoint): boolean {
+  return (
+    (c >= DIGIT_0 && c <= DIGIT_9) ||
+    (c >= LOWERCASE_a && c <= LOWERCASE_f) ||
+    (c >= UPPERCASE_A && c <= UPPERCASE_F)
+  );
+}
+
+/**
+ * Scans a double-quoted string literal starting at the cursor.
+ *
+ * Supported escapes: `\n`, `\r`, `\t`, `\\`, `\0`, `\"` and `\u{HEX}`.
+ * On success returns the decoded content (without the quotes) and the span
+ * of the whole literal, quotes included.
+ *
+ * Errors:
+ *  - `UnexpectedEOF`: the input ends before the literal starts, before the
+ *    closing quote, or right after a backslash.
+ *  - `UnexpectedCharacter`: the first character is not `"`.
+ *  - `InvalidEscape`: an unknown escape character or a malformed `\u{...}`;
+ *    see `EscapeErrorReason` for the individual reasons.
+ *
+ * The cursor is not rewound on error.
+ */
+export function scanString(cursor: Cursor): Result<{ value: string, span: Span }, StringError | GenericScanError> {
+  const start = cursor.currentLocation();
+
+  const firstChar = cursor.peek();
+  if (firstChar === undefined) {
+    return Result.error({ tag: "UnexpectedEOF", span: cursor.makeSpan(start) });
+  }
+  if (firstChar !== char('"')) {
+    return Result.error({ tag: "UnexpectedCharacter", char: firstChar, span: cursor.makeSpan(start) });
+  }
+  cursor.next(); // consume opening quote
+
+  let value = ""; // The decoded string content
+
+  while (true) {
+    const c = cursor.peek();
+    if (c === undefined || cursor.eof()) {
+      // Ran out of input before the closing quote.
+      return Result.error({ tag: "UnexpectedEOF", span: cursor.makeSpan(start) });
+    }
+
+    // 1. End of string
+    if (c === char('"')) {
+      cursor.next(); // consume closing quote
+      break;
+    }
+
+    // 2. Regular character
+    if (c !== char('\\')) {
+      cursor.next();
+      value += String.fromCodePoint(c);
+      continue;
+    }
+
+    // 3. Escape sequence
+    const escapeStart = cursor.currentLocation();
+    cursor.next(); // consume backslash
+    const escaped = cursor.peek();
+
+    if (escaped === undefined) {
+      // The input ends right after the backslash: this is an unterminated
+      // string, not an unknown escape (there is no escape character to report).
+      return Result.error({ tag: "UnexpectedEOF", span: cursor.makeSpan(start) });
+    }
+
+    switch (escaped) {
+      case char('n'): value += '\n'; cursor.next(); break;
+      case char('r'): value += '\r'; cursor.next(); break;
+      case char('t'): value += '\t'; cursor.next(); break;
+      case char('\\'): value += '\\'; cursor.next(); break;
+      case char("0"): value += "\0"; cursor.next(); break;
+      case char('"'): value += '"'; cursor.next(); break;
+      // Unicode Escape: \u{XXXX}
+      case char('u'): {
+        cursor.next(); // consume 'u'
+
+        // Expect '{'. All Unicode escape errors below span from this point.
+        const braceStart = cursor.currentLocation();
+        if (cursor.peek() !== char('{')) {
+          return Result.error({ tag: "InvalidEscape", reason: { tag: "UnicodeMissingBrace" }, span: cursor.makeSpan(braceStart) });
+        }
+        cursor.next(); // consume '{'
+
+        // Consume hex digits
+        const hexStart = cursor.currentIndex;
+        const hexCount = cursor.consumeWhile(isHexDigit);
+
+        if (hexCount === 0) {
+          return Result.error({ tag: "InvalidEscape", reason: { tag: "UnicodeNoDigits" }, span: cursor.makeSpan(braceStart) });
+        }
+
+        // Expect '}'
+        if (cursor.peek() !== char("}")) {
+          return Result.error({ tag: "InvalidEscape", reason: { tag: "UnicodeUnclosed" }, span: cursor.makeSpan(braceStart) });
+        }
+        cursor.next(); // consume '}'
+
+        // Convert & append
+        const hexStr = cursor.text.sliceByCp(hexStart, hexStart + hexCount);
+        const codePoint = parseInt(hexStr, 16);
+
+        // Guard before String.fromCodePoint, which throws a RangeError above 0x10FFFF.
+        if (codePoint > 0x10FFFF) {
+          return Result.error({ tag: "InvalidEscape", reason: { tag: "UnicodeOverflow", value: codePoint }, span: cursor.makeSpan(braceStart) });
+        }
+
+        value += String.fromCodePoint(codePoint);
+        break;
+      }
+      default:
+        return Result.error({
+          tag: "InvalidEscape",
+          reason: { tag: "UnknownEscapeSequence", char: escaped },
+          span: cursor.makeSpan(escapeStart)
+        });
+    }
+  }
+
+  return Result.ok({
+    value,
+    span: cursor.makeSpan(start)
+  });
+}
+
+// TODO: rendering of errors
+// function renderStringError(err: StringError): string {
+//   switch (err.tag) {
+//     case "ExpectedQuote": return "Expected a string starting with \"";
+//     case "UnexpectedEOF": return "Unterminated string literal";
+//     case "InvalidEscape": 
+//       const k = err.kind;
+//       switch (k.tag) {
+//         case "Unknown": 
+//           return `Unknown escape sequence '\\${String.fromCodePoint(k.char)}'`;
+//         case "UnicodeMissingBrace": 
+//           return "Unicode escape must start with '{', e.g. \\u{1F600}";
+//         case "UnicodeNoDigits": 
+//           return "Empty unicode escape \\u{}";
+//         case "UnicodeUnclosed": 
+//           return "Expected '}' to close unicode escape";
+//         case "UnicodeOverflow": 
+//           return `Unicode code point 0x${k.value.toString(16)} is too large (max 0x10FFFF)`;
+//       }
+//   }
+// }
--- a/src/parser/scanner.test.ts
+++ b/src/parser/scanner.test.ts
@ -1,169 +0,0 @@
-import { SourceText } from "./source_text";
-import { Scanner, Token } from "./scanner";
-
-// === Test Harness Utilities ===
-const RED = "\x1b[31m";
-const GREEN = "\x1b[32m";
-const RESET = "\x1b[0m";
-
-function assert(condition: boolean, message: string) {
-  if (!condition) {
-    throw new Error(message);
-  }
-}
-
-function assertToken(token: Token, expectedTag: string, expectedValue?: any) {
-  assert(token.tag === expectedTag, `Expected tag '${expectedTag}', got '${token.tag}'`);
-  if (expectedValue !== undefined) {
-    // Check 'value' for numbers, 'text' for strings
-    const actualValue = "value" in token ? token.value : "text" in token ? token.text : undefined;
-    assert(actualValue === expectedValue, `Expected value '${expectedValue}', got '${actualValue}'`);
-  }
-}
-
-// TODO: Rewrite this once `scanToken()` is implemented.
-function scanOne(source: string): Token {
-  const src = new SourceText(source);
-  const scanner = new Scanner(src);
-  // We assume your scanner has a nextToken() method exposed, 
-  // or you make the specific scan methods public for testing.
-  // Since you likely only expose nextToken() eventually, let's cheat 
-  // and cast to any to access private methods for unit testing specific parts.
-  // OR: You can just expose 'scanNumber' as public for now.
-  
-  // For this test, I will assume we are calling the private methods via 'any' 
-  // to strictly unit test them without the dispatch logic.
-  if (source.trim().startsWith('"')) return (scanner as any).scanString();
-  return (scanner as any).scanNumber();
-}
-
-function test_integers() {
-  const t1 = scanOne("123");
-  assertToken(t1, "number", 123);
-
-  const t2 = scanOne("-500");
-  assertToken(t2, "number", -500);
-
-  console.log(`${GREEN}✔ Integers passed${RESET}`);
-}
-
-function test_floats() {
-  const t1 = scanOne("3.14159");
-  assertToken(t1, "number", 3.14159);
-
-  const t2 = scanOne("-0.001");
-  assertToken(t2, "number", -0.001);
-
-  console.log(`${GREEN}✔ Floats passed${RESET}`);
-}
-
-function test_number_errors() {
-  try {
-    scanOne("1."); // Should fail (trailing dot)
-    throw new Error("Should have thrown error for '1.'");
-  } catch (e: any) {
-    assert(e.tag === "InvalidNumber", "Expected InvalidNumber error for '1.'");
-  }
-
-  try {
-    scanOne(".5"); // Should fail (no leading digit)
-    throw new Error("Should have thrown error for '.5'");
-  } catch (e: any) {
-    assert(e.tag === "ExpectedNumber", "Expected ExpectedNumber error for '.5'");
-  }
-
-  console.log(`${GREEN}✔ Number errors passed${RESET}`);
-}
-
-function test_basic_strings() {
-  const t1 = scanOne('"hello world"');
-  assertToken(t1, "string", "hello world");
-
-  const t2 = scanOne('""'); // Empty string
-  assertToken(t2, "string", "");
-
-  console.log(`${GREEN}✔ Basic strings passed${RESET}`);
-}
-
-function test_string_escapes() {
-  const t1 = scanOne('"line1\\nline2"');
-  assertToken(t1, "string", "line1\nline2");
-
-  const t2 = scanOne('"col1\\tcol2"');
-  assertToken(t2, "string", "col1\tcol2");
-
-  const t3 = scanOne('"quote: \\" slash: \\\\"');
-  assertToken(t3, "string", 'quote: " slash: \\');
-
-  console.log(`${GREEN}✔ String escapes passed${RESET}`);
-}
-
-function test_unicode_escapes() {
-  // Rocket emoji: 🚀 (U+1F680)
-  const t1 = scanOne('"\\u{1F680}"');
-  assertToken(t1, "string", "🚀");
-
-  // Two escapes: A (U+41) and B (U+42)
-  const t2 = scanOne('"\\u{41}\\u{42}"');
-  assertToken(t2, "string", "AB");
-
-  // Max valid unicode
-  scanOne('"\\u{10FFFF}"'); 
-
-  console.log(`${GREEN}✔ Unicode escapes passed${RESET}`);
-}
-
-function test_line_counting() {
-  // Mixed line endings: 
-  // Line 1: 123 (CRLF)
-  // Line 2: 456 (LF)
-  // Line 3: "foo"
-  const code = "123\r\n456\n\"foo\"";
-  const src = new SourceText(code);
-  const scanner = new Scanner(src);
-
-  // We need to implement a mini-loop here since scanOne creates new scanners
-  // 123
-  let tok = (scanner as any).scanNumber();
-  assert(tok.value === 123, "Line 1 value wrong");
-  assert(tok.span.line === 1, "Line 1 line# wrong");
-
-  // consume whitespace manually since we are bypassing nextToken()
-  (scanner as any).skipWhitespace();
-
-  // 456
-  tok = (scanner as any).scanNumber();
-  assert(tok.value === 456, "Line 2 value wrong");
-  assert(tok.span.line === 2, "Line 2 line# wrong");
-  
-  (scanner as any).skipWhitespace();
-
-  // "foo"
-  tok = (scanner as any).scanString();
-  assert(tok.text === "foo", "Line 3 value wrong");
-  assert(tok.span.line === 3, "Line 3 line# wrong");
-
-  console.log(`${GREEN}✔ Line counting passed${RESET}`);
-}
-
-// === Run All ===
-function run_all_tests() {
-  console.log("Running Scanner Tests...\n");
-  try {
-    test_integers();
-    test_floats();
-    test_number_errors();
-    test_basic_strings();
-    test_string_escapes();
-    test_unicode_escapes();
-    test_line_counting();
-    
-    console.log(`\n${GREEN}ALL TESTS PASSED${RESET}`);
-  } catch (e: any) {
-    console.error(`\n${RED}TEST FAILED:${RESET}`);
-    console.error(e.message || e);
-    process.exit(1);
-  }
-}
-
-run_all_tests();
--- a/src/parser/scanner.ts
+++ b/src/parser/scanner.ts
@ -1,318 +1,85 @@

-import { char, isWhitespace, isDigit } from './source_text';
+import { CARRIAGE_RETURN, char, NEW_LINE, SPACE, TAB } from './source_text';
 import type { SourceText, Span, SourceLocation, CodePoint, StringIndex, CodePointIndex } from './source_text';
- 
-function isSymbolChar(c: CodePoint): boolean {
-  return (
-    c === char("#") ||
-    c === char("$") ||
-    c === char("@") ||
-    c === char("(") ||
-    c === char(")") ||
-    c === char("{") ||
-    c === char("}") ||
-    c === char(",") ||
-    c === char(".")
-  );
+import { isDigit, isWhitespace, scanNumber, scanString } from './cursor';
+import type { Cursor, CursorState, GenericScanError, NumberError, StringError } from './cursor';
+import { Result } from '../result';
+
+// === Language Specific Stuff ===
+
+const DELIMITER_CHARS = ["(", ")", "{", "}", ".", ",", "@", "$", "#", '"', "\\"] as const;
+export type Delimiter = typeof DELIMITER_CHARS[number];
+const DELIMITER_SET: Set<CodePoint> = new Set(DELIMITER_CHARS.map(c => char(c)));
+
+export type Keyword = "let" | "fn" | "match" | "apply" | "=" | "|" | "!";
+
+// Returns the raw text that was consumed. Does NOT create a token (the caller decides the token type).
+function scanRawIdentifier(cursor: Cursor): string {
+  const start = cursor.currentIndex;
+  // Consume until EOF, a delimiter, or whitespace. Comment starts are NOT detected here yet (see TODO).
+  // TODO: Stop at comments too. A comment starts with two characters, so it needs a lookahead that this one-character predicate cannot express.
+  cursor.consumeWhile(c => !(DELIMITER_SET.has(c) || isWhitespace(c)));
+  return cursor.text.sliceByCp(start, cursor.currentIndex);
 }

-function isIdentifierChar(char: CodePoint): boolean {
-  return !isWhitespace(char) && !isSymbolChar(char);
-}
+export type ExprScanError = 
+  | { tag: "UnexpectedCharacter", char: CodePoint, span: Span }
+  | { tag: "InvalidIdentifier", text: string, reason: string, span: Span }
+  | NumberError 
+  | StringError;

-export type Keyword = "let" | "fn" | "match" | "apply" | "=" | "|";
-export type Symbol = "#" | "$" | "@" | "(" | ")" | "{" | "}" | "," | ".";
-
-// === Scanner ===
-export type Token =
+export type ExprStartToken =
  | { tag: "number", value: number, span: Span }
  | { tag: "string", text: string, span: Span }
-  | { tag: "identifier", text: string, span: Span }
+  | { tag: "function_name", name: string, span: Span }
+  | { tag: "variable_use", name: string, span: Span }
+  | { tag: "tag", name: string, span: Span }
+  | { tag: "tagged", name: string, span: Span } // TODO: This may be a bit weird. Actually we can lookahead and see if the next char after the identifier is NOT-AN-EXPRESSION start.
  | { tag: "keyword", kw: Keyword, span: Span }
-  | { tag: "symbol", sym: Symbol, span: Span }
  | { tag: "EOF", span: Span }

-export namespace TokenKind {
-  export type T =
-    | { tag: "number" }
-    | { tag: "string" }
-    | { tag: "identifier" }
-    | { tag: "symbol", value: Symbol }
-    | { tag: "keyword", value: Keyword }
-    | { tag: "EOF" }
-}
+// TODO: Move this back to `cursor.ts`

-export type LexError =
-  | { tag: "UnexpectedCharacter", char: CodePoint, span: Span }
-  | { tag: "UnexpectedEOF", span: Span }
-  | { tag: "ExpectedNumber", span: Span }
-  | { tag: "InvalidNumber", text: string, reason: string, span: Span }
-  | { tag: "InvalidEscape", reason: string, span: Span };
+function skipWhitespaceAndComments(cursor: Cursor): number {
+  let totalConsumed = 0;

-export class Scanner {
-  private i: CodePointIndex = 0;
-  private line = 1;
-  private column = 1;
+  while (true) {
+    // 1. Consume standard whitespace (spaces, tabs, newlines)
+    const wsCount = cursor.consumeWhile(isWhitespace);
+    totalConsumed += wsCount;

-  // Track previous char to handle \r\n correctly
-  private lastCharWasCR = false;
+    // 2. Check for Line Comment start ('//')
+    const c = cursor.peek();
+    const nextC = cursor.peek(1);

-  constructor(private readonly text: SourceText) {}
+    if (c === char('/') && nextC === char('/')) {
+      // Found comment start. Consume the '//' markers
+      cursor.next();
+      cursor.next();
+      totalConsumed += 2;

-  eof(): boolean {
-    return this.i >= this.text.length;
-  }
-
-  private peek(n: number = 0): CodePoint | undefined {
-    return this.text.chars[this.i + n]?.char;
-  }
-
-  private next(): CodePoint | undefined {
-    const ref = this.text.chars[this.i];
-    if (!ref) return undefined;
-
-    const c = ref.char;
-    this.i++;
-
-    if (c === 0x0A /* \n */) {
-      if (!this.lastCharWasCR) {
-        this.line++;
-        this.column = 1;
-      } else {
-        // We just saw \r, so this \n is part of \r\n. 
-        // We already bumped the line count on \r.
-        // Just reset the flag.
-        this.lastCharWasCR = false;
-      }
-    } else if (c === 0x0D /* \r */) {
-      this.line++;
-      this.column = 1;
-      this.lastCharWasCR = true;
+      // Consume everything until the next newline (or EOF).
+      // Note: We do NOT consume the newline itself here.
+      // We let the NEXT iteration of the 'while(true)' loop catch 
+      // the newline as standard whitespace.
+      const commentContentLength = cursor.consumeWhile(c => c !== NEW_LINE && c !== CARRIAGE_RETURN);
+      totalConsumed += commentContentLength;
    } else {
-      this.column++;
-      this.lastCharWasCR = false;
-    }
-
-    return c;
-  }
-
-
-  private currentOffset(): StringIndex {
-    return this.text.chars[this.i]?.offset ?? this.text.source.length;
-  }
-
-  private currentLocation(): SourceLocation {
-    return { index: this.i, line: this.line, column: this.column };
-  }
-
-  private makeSpan(start: SourceLocation): Span {
-    const startOffset =
-      this.text.chars[start.index]?.offset ?? this.text.source.length;
-    const endOffset = this.currentOffset();
-
-    return {
-      start: startOffset,
-      end: endOffset,
-      line: start.line,
-      column: start.column,
-    };
-  }
-
-  private consumeWhile(pred: (c: CodePoint) => boolean): number {
-    let count = 0;
-    while (!this.eof()) {
-      const c = this.peek();
-      if (c === undefined || !pred(c)) break;
-      this.next();
-      count++;
-    }
-    return count;
-  }
-
-  private expect(
-    pred: (c: CodePoint) => boolean,
-    error: LexError
-  ): CodePoint {
-    const c = this.peek();
-    if (c === undefined || !pred(c)) {
-      throw error;
-    }
-    this.next();
-    return c;
-  }
-
-  // Helper to check for exact char matches quickly
-  private match(c: CodePoint): boolean {
-    if (this.peek() === c) {
-        this.next();
-        return true;
-    }
-    return false;
-  }
-
-  private skipWhitespace() {
-    this.consumeWhile(isWhitespace);
-  }
-
-  // === Main Scanners ===
-
-  private scanNumber(): Token {
-    // number :=
-    //   | optional(`-`) digits optional(`.` digits)
-
-    const startNumberLocation = this.currentLocation();
-
-    let c: CodePoint;
-
-    // 1. Optional Sign
-    c = this.peek();
-    if (c === char("-")) {
-      this.next();
-    }
-
-    // 2. Integer Part
-    c = this.peek();
-    const integerPartDigitCount = this.consumeWhile(isDigit);
-    if (integerPartDigitCount === 0) {
-      throw <LexError>{
-        tag: "ExpectedNumber",
-        span: this.makeSpan(startNumberLocation),
-      };
-    }
-
-    // 3. Fractional Part
-    if (this.peek() === char(".")) {
-      const dotLocation = this.currentLocation();
-
-      this.next(); // consume '.'
-
-      const fracPartDigitCount = this.consumeWhile(isDigit);
-      if (fracPartDigitCount === 0) {
-        throw <LexError>{
-          tag: "InvalidNumber",
-          reason: "MissingFractionalDigits",
-          span: this.makeSpan(dotLocation),
-        };
-      }
-    }
-
-
-    const text = this.text.sliceByCp(startNumberLocation.index, this.i);
-    const value = Number(text);
-
-    if (!Number.isFinite(value)) {
-      throw <LexError>{
-        tag: "InvalidNumber",
-        reason: "NotFinite",
-        span: this.makeSpan(startNumberLocation),
-      };
-    }
-    return {
-      tag: "number",
-      value,
-      span: this.makeSpan(startNumberLocation),
-    };
-  }
-
-  private scanString(): Token {
-    const start = this.currentLocation();
-    // We assume the caller checked the opening quote '"'
-    this.expect(c => c === char('"'), <LexError>{ tag: "UnexpectedCharacter", span: this.makeSpan(start) });
-
-    let value = ""; // The actual string content
-    
-    while (true) {
-      if (this.eof()) {
-        throw <LexError>{ tag: "UnexpectedEOF", span: this.makeSpan(start) };
-      }
-
-      const c = this.peek();
-
-      // 1. End of string
-      if (c === char('"')) {
-        this.next(); // consume closing quote
+      // We are not at a comment.
+      // If we also didn't consume any whitespace in step 1, we are truly done.
+      if (wsCount === 0) {
        break;
      }
-
-      if (c === char('\\')) {
-        // 2. Escape Sequences
-        const escapeStart = this.currentLocation();
-        this.next(); // consume backslash
-        const escaped = this.peek();
-        
-        switch (escaped) {
-            case char('n'): value += '\n'; this.next(); break;
-            case char('r'): value += '\r'; this.next(); break;
-            case char('t'): value += '\t'; this.next(); break;
-            case char('\\'): value += '\\'; this.next(); break;
-            case char("0"): value += "\0"; break;
-            case char('"'): value += '"'; this.next(); break;
-            // Unicode Escape: \u{XXXX}
-            case char('u'): {
-              this.next(); // consume 'u'
-              
-              // Expect '{'
-              const braceStart = this.currentLocation();
-              if (this.peek() !== char('{')) {
-                  throw <LexError>{ tag: "InvalidEscape", reason: "Expected '{' after \\u", span: this.makeSpan(braceStart) };
-              }
-              this.next(); // consume '{'
-
-              // Consume Hex Digits
-              const hexStart = this.i;
-              const hexCount = this.consumeWhile(c => 
-                  (c >= char('0') && c <= char('9')) ||
-                  (c >= char('a') && c <= char('f')) ||
-                  (c >= char('A') && c <= char('F'))
-              );
-
-              if (hexCount === 0) {
-                   throw <LexError>{ tag: "InvalidEscape", reason: "Expected hex digits in \\u{...}", span: this.makeSpan(braceStart) };
-              }
-
-              // Expect '}'
-              if (this.peek() !== char("}")) {
-                  throw <LexError>{ tag: "InvalidEscape", reason: "Expected '}' closing unicode escape", span: this.makeSpan(braceStart) };
-              }
-              this.next(); // consume '}'
-
-              // Convert & Append
-              const hexStr = this.text.sliceByCp(hexStart, hexStart + hexCount);
-              const codePoint = parseInt(hexStr, 16);
-
-              if (codePoint > 0x10FFFF) {
-                   throw <LexError>{ tag: "InvalidEscape", reason: "Invalid Unicode Code Point (max 0x10FFFF)", span: this.makeSpan(braceStart) };
-              }
-
-              value += String.fromCodePoint(codePoint);
-              break;
-            }
-            default:
-              throw <LexError>{ 
-                tag: "InvalidEscape", 
-                reason: `UnknownEscapeSequence`, 
-                span: this.makeSpan(escapeStart) 
-              };
-        }
-      } else {
-        // 3. Regular character
-        // Optimization: consume chunks of non-special chars for speed?
-        // For now, char-by-char is fine.
-        this.next();
-        // Note: We use ! because we checked EOF at loop start
-        value += String.fromCodePoint(c!); 
-      }
    }
-
-    return {
-      tag: "string",
-      text: value,
-      span: this.makeSpan(start)
-    };
  }

+  return totalConsumed;
 }

+export function scanExprStart(cursor: Cursor): Result<ExprStartToken, ExprScanError> {
+  // TODO: Not implemented yet. The value returned below is a placeholder (`0` cast to `any`), not a real `Result`.
+  return (0 as any);
+}

 // TODO: Need a Token to TokenKind function
 // TODO: Need is_start_of_expression(token): boolean
--- a/src/parser/source_text.ts
+++ b/src/parser/source_text.ts
@ -17,7 +17,6 @@ export type CodePointRef = {
 export class SourceText {
  readonly source: string;
  // TODO: Later you can try to change this to two `Uint32Array`s - one for code points (each needs at most 21 bits, so 32 bits is plenty), the other for pointers into the original string.
-  //
  readonly chars: CodePointRef[];

  // Stores the CodePointIndex where each line begins
@ -40,17 +39,16 @@ export class SourceText {
      i += size;

      // === Newline Logic ===
-      // 0x0A is '\n', 0x0D is '\r'
-      if (char === 0x0A) {
+      if (char === NEW_LINE) {
        // Found a newline, the NEXT char starts a new line
        this.lineStarts.push(cpIndex + 1);
      }
      // Handle CR (Classic Mac) or CRLF start
-      else if (char === 0x0D) {
+      else if (char === CARRIAGE_RETURN) {
        // Check if the next char is '\n' (CRLF)
        // We peek ahead in the raw string to see if we need to skip the \n for line counting purposes
        // or just treat this as a newline.
-        const nextIsNL = i < source.length && source.codePointAt(i) === 0x0A;
+        const nextIsNL = i < source.length && source.codePointAt(i) === NEW_LINE;
        if (!nextIsNL) {
            // Only push if it's NOT CRLF. If it is CRLF, the loop handles the \n next.
            this.lineStarts.push(cpIndex + 1);
@ -101,15 +99,6 @@ export function sourceText(s: string) {
  return new SourceText(s);
 }

-
-export function isWhitespace(char: CodePoint): boolean {
-  return char === 0x20 || char === 0x09 || char === 0x0A || char === 0x0D;
-}
-
-export function isDigit(char: CodePoint): boolean {
-  return char >= 0x30 && char <= 0x39;
-}
-
 export type Span = {
  start: StringIndex,
  end: StringIndex,
@ -123,3 +112,21 @@ export type SourceLocation = {
  column: number; // 1-based
 };

+// Whitespace
+export const NEW_LINE: CodePoint = char('\n');
+export const CARRIAGE_RETURN: CodePoint = char('\r');
+export const SPACE: CodePoint = char(' ');
+export const TAB: CodePoint = char('\t');
+
+// Digit Boundaries
+export const DIGIT_0: CodePoint = char('0');
+export const DIGIT_9: CodePoint = char('9');
+
+export const DOT: CodePoint = char('.');
+
+// Hex Boundaries
+export const LOWERCASE_a: CodePoint = char('a');
+export const UPPERCASE_A: CodePoint = char('A');
+export const LOWERCASE_f: CodePoint = char('f');
+export const UPPERCASE_F: CodePoint = char('F');
+
--- a/src/result.ts
+++ b/src/result.ts
@ -0,0 +1,10 @@
+
+export type Result<T, E> =
+  | { tag: "ok", value: T }
+  | { tag: "error", error: E }
+
+export namespace Result {
+  export function ok<T, E>(value: T): Result<T, E> { return { tag: "ok", value } }
+  export function error<T, E>(error: E): Result<T, E> { return { tag: "error", error } }
+}
+
--- a/tmp_repl/tmp_repl.md
+++ b/tmp_repl/tmp_repl.md
@ -12,6 +12,6 @@ npm install -D sass-embedded

 # Tests

-npx ts-node src/parser/scanner.test.ts
+npx ts-node src/parser/cursor.test.ts