Improve and abstract Cursor design. Start scanner

2026-02-06 00:38:16 +01:00 · 2026-02-06 00:38:16 +01:00 · d5f9777711
commit d5f9777711
parent d382b16e6d
8 changed files with 713 additions and 476 deletions
--- a/src/parser/scanner.ts
+++ b/src/parser/scanner.ts
@ -1,318 +1,85 @@

-import { char, isWhitespace, isDigit } from './source_text';
+import { CARRIAGE_RETURN, char, NEW_LINE, SPACE, TAB } from './source_text';
 import type { SourceText, Span, SourceLocation, CodePoint, StringIndex, CodePointIndex } from './source_text';
- 
-function isSymbolChar(c: CodePoint): boolean {
-  return (
-    c === char("#") ||
-    c === char("$") ||
-    c === char("@") ||
-    c === char("(") ||
-    c === char(")") ||
-    c === char("{") ||
-    c === char("}") ||
-    c === char(",") ||
-    c === char(".")
-  );
+import { isDigit, isWhitespace, scanNumber, scanString } from './cursor';
+import type { Cursor, CursorState, GenericScanError, NumberError, StringError } from './cursor';
+import { Result } from '../result';
+
+// === Language Specific Stuff ===
+
+const DELIMITER_CHARS = ["(", ")", "{", "}", ".", ",", "@", "$", "#", '"', "\\"] as const;
+export type Delimiter = typeof DELIMITER_CHARS[number];
+const DELIMITER_SET: Set<CodePoint> = new Set(DELIMITER_CHARS.map(c => char(c)));
+
+export type Keyword = "let" | "fn" | "match" | "apply" | "=" | "|" | "!";
+
+// Scans a raw identifier starting at the cursor's current index and returns its text as a string. Does NOT create a token (the caller decides the token type).
+function scanRawIdentifier(cursor: Cursor): string {
+  const start = cursor.currentIndex;
+  // Consume code points until a delimiter or whitespace is reached (consumeWhile presumably also stops at EOF; confirm in cursor.ts).
+  // TODO: Also stop at a comment start. A comment opens with the two-character sequence "//", which this single-code-point predicate cannot detect, and "/" is not a delimiter.
+  cursor.consumeWhile(c => !(DELIMITER_SET.has(c) || isWhitespace(c)));
+  return cursor.text.sliceByCp(start, cursor.currentIndex);
 }

-function isIdentifierChar(char: CodePoint): boolean {
-  return !isWhitespace(char) && !isSymbolChar(char);
-}
+export type ExprScanError = 
+  | { tag: "UnexpectedCharacter", char: CodePoint, span: Span }
+  | { tag: "InvalidIdentifier", text: string, reason: string, span: Span }
+  | NumberError 
+  | StringError;

-export type Keyword = "let" | "fn" | "match" | "apply" | "=" | "|";
-export type Symbol = "#" | "$" | "@" | "(" | ")" | "{" | "}" | "," | ".";
-
-// === Scanner ===
-export type Token =
+export type ExprStartToken =
   | { tag: "number", value: number, span: Span }
   | { tag: "string", text: string, span: Span }
-  | { tag: "identifier", text: string, span: Span }
+  | { tag: "function_name", name: string, span: Span }
+  | { tag: "variable_use", name: string, span: Span }
+  | { tag: "tag", name: string, span: Span }
+  | { tag: "tagged", name: string, span: Span } // TODO: Revisit this variant. It could possibly be decided by lookahead: check whether the character after the identifier is NOT the start of an expression.
   | { tag: "keyword", kw: Keyword, span: Span }
-  | { tag: "symbol", sym: Symbol, span: Span }
   | { tag: "EOF", span: Span }

-export namespace TokenKind {
-  export type T =
-    | { tag: "number" }
-    | { tag: "string" }
-    | { tag: "identifier" }
-    | { tag: "symbol", value: Symbol }
-    | { tag: "keyword", value: Keyword }
-    | { tag: "EOF" }
-}
+// TODO: Move this (the `skipWhitespaceAndComments` helper below) back to `cursor.ts`

-export type LexError =
-  | { tag: "UnexpectedCharacter", char: CodePoint, span: Span }
-  | { tag: "UnexpectedEOF", span: Span }
-  | { tag: "ExpectedNumber", span: Span }
-  | { tag: "InvalidNumber", text: string, reason: string, span: Span }
-  | { tag: "InvalidEscape", reason: string, span: Span };
+function skipWhitespaceAndComments(cursor: Cursor): number {
+  let totalConsumed = 0;

-export class Scanner {
-  private i: CodePointIndex = 0;
-  private line = 1;
-  private column = 1;
+  while (true) {
+    // 1. Consume standard whitespace (spaces, tabs, newlines)
+    const wsCount = cursor.consumeWhile(isWhitespace);
+    totalConsumed += wsCount;

-  // Track previous char to handle \r\n correctly
-  private lastCharWasCR = false;
+    // 2. Check for Line Comment start ('//')
+    const c = cursor.peek();
+    const nextC = cursor.peek(1);

-  constructor(private readonly text: SourceText) {}
+    if (c === char('/') && nextC === char('/')) {
+      // Found comment start. Consume the '//' markers
+      cursor.next();
+      cursor.next();
+      totalConsumed += 2;

-  eof(): boolean {
-    return this.i >= this.text.length;
-  }
-
-  private peek(n: number = 0): CodePoint | undefined {
-    return this.text.chars[this.i + n]?.char;
-  }
-
-  private next(): CodePoint | undefined {
-    const ref = this.text.chars[this.i];
-    if (!ref) return undefined;
-
-    const c = ref.char;
-    this.i++;
-
-    if (c === 0x0A /* \n */) {
-      if (!this.lastCharWasCR) {
-        this.line++;
-        this.column = 1;
-      } else {
-        // We just saw \r, so this \n is part of \r\n. 
-        // We already bumped the line count on \r.
-        // Just reset the flag.
-        this.lastCharWasCR = false;
-      }
-    } else if (c === 0x0D /* \r */) {
-      this.line++;
-      this.column = 1;
-      this.lastCharWasCR = true;
+      // Consume everything until the next newline (or EOF).
+      // Note: We do NOT consume the newline itself here.
+      // We let the NEXT iteration of the 'while(true)' loop catch 
+      // the newline as standard whitespace.
+      const commentContentLength = cursor.consumeWhile(c => c !== NEW_LINE && c !== CARRIAGE_RETURN);
+      totalConsumed += commentContentLength;
    } else {
-      this.column++;
-      this.lastCharWasCR = false;
-    }
-
-    return c;
-  }
-
-
-  private currentOffset(): StringIndex {
-    return this.text.chars[this.i]?.offset ?? this.text.source.length;
-  }
-
-  private currentLocation(): SourceLocation {
-    return { index: this.i, line: this.line, column: this.column };
-  }
-
-  private makeSpan(start: SourceLocation): Span {
-    const startOffset =
-      this.text.chars[start.index]?.offset ?? this.text.source.length;
-    const endOffset = this.currentOffset();
-
-    return {
-      start: startOffset,
-      end: endOffset,
-      line: start.line,
-      column: start.column,
-    };
-  }
-
-  private consumeWhile(pred: (c: CodePoint) => boolean): number {
-    let count = 0;
-    while (!this.eof()) {
-      const c = this.peek();
-      if (c === undefined || !pred(c)) break;
-      this.next();
-      count++;
-    }
-    return count;
-  }
-
-  private expect(
-    pred: (c: CodePoint) => boolean,
-    error: LexError
-  ): CodePoint {
-    const c = this.peek();
-    if (c === undefined || !pred(c)) {
-      throw error;
-    }
-    this.next();
-    return c;
-  }
-
-  // Helper to check for exact char matches quickly
-  private match(c: CodePoint): boolean {
-    if (this.peek() === c) {
-        this.next();
-        return true;
-    }
-    return false;
-  }
-
-  private skipWhitespace() {
-    this.consumeWhile(isWhitespace);
-  }
-
-  // === Main Scanners ===
-
-  private scanNumber(): Token {
-    // number :=
-    //   | optional(`-`) digits optional(`.` digits)
-
-    const startNumberLocation = this.currentLocation();
-
-    let c: CodePoint;
-
-    // 1. Optional Sign
-    c = this.peek();
-    if (c === char("-")) {
-      this.next();
-    }
-
-    // 2. Integer Part
-    c = this.peek();
-    const integerPartDigitCount = this.consumeWhile(isDigit);
-    if (integerPartDigitCount === 0) {
-      throw <LexError>{
-        tag: "ExpectedNumber",
-        span: this.makeSpan(startNumberLocation),
-      };
-    }
-
-    // 3. Fractional Part
-    if (this.peek() === char(".")) {
-      const dotLocation = this.currentLocation();
-
-      this.next(); // consume '.'
-
-      const fracPartDigitCount = this.consumeWhile(isDigit);
-      if (fracPartDigitCount === 0) {
-        throw <LexError>{
-          tag: "InvalidNumber",
-          reason: "MissingFractionalDigits",
-          span: this.makeSpan(dotLocation),
-        };
-      }
-    }
-
-
-    const text = this.text.sliceByCp(startNumberLocation.index, this.i);
-    const value = Number(text);
-
-    if (!Number.isFinite(value)) {
-      throw <LexError>{
-        tag: "InvalidNumber",
-        reason: "NotFinite",
-        span: this.makeSpan(startNumberLocation),
-      };
-    }
-    return {
-      tag: "number",
-      value,
-      span: this.makeSpan(startNumberLocation),
-    };
-  }
-
-  private scanString(): Token {
-    const start = this.currentLocation();
-    // We assume the caller checked the opening quote '"'
-    this.expect(c => c === char('"'), <LexError>{ tag: "UnexpectedCharacter", span: this.makeSpan(start) });
-
-    let value = ""; // The actual string content
-    
-    while (true) {
-      if (this.eof()) {
-        throw <LexError>{ tag: "UnexpectedEOF", span: this.makeSpan(start) };
-      }
-
-      const c = this.peek();
-
-      // 1. End of string
-      if (c === char('"')) {
-        this.next(); // consume closing quote
+      // We are not at a comment.
+      // If we also didn't consume any whitespace in step 1, we are truly done.
+      if (wsCount === 0) {
        break;
      }
-
-      if (c === char('\\')) {
-        // 2. Escape Sequences
-        const escapeStart = this.currentLocation();
-        this.next(); // consume backslash
-        const escaped = this.peek();
-        
-        switch (escaped) {
-            case char('n'): value += '\n'; this.next(); break;
-            case char('r'): value += '\r'; this.next(); break;
-            case char('t'): value += '\t'; this.next(); break;
-            case char('\\'): value += '\\'; this.next(); break;
-            case char("0"): value += "\0"; break;
-            case char('"'): value += '"'; this.next(); break;
-            // Unicode Escape: \u{XXXX}
-            case char('u'): {
-              this.next(); // consume 'u'
-              
-              // Expect '{'
-              const braceStart = this.currentLocation();
-              if (this.peek() !== char('{')) {
-                  throw <LexError>{ tag: "InvalidEscape", reason: "Expected '{' after \\u", span: this.makeSpan(braceStart) };
-              }
-              this.next(); // consume '{'
-
-              // Consume Hex Digits
-              const hexStart = this.i;
-              const hexCount = this.consumeWhile(c => 
-                  (c >= char('0') && c <= char('9')) ||
-                  (c >= char('a') && c <= char('f')) ||
-                  (c >= char('A') && c <= char('F'))
-              );
-
-              if (hexCount === 0) {
-                   throw <LexError>{ tag: "InvalidEscape", reason: "Expected hex digits in \\u{...}", span: this.makeSpan(braceStart) };
-              }
-
-              // Expect '}'
-              if (this.peek() !== char("}")) {
-                  throw <LexError>{ tag: "InvalidEscape", reason: "Expected '}' closing unicode escape", span: this.makeSpan(braceStart) };
-              }
-              this.next(); // consume '}'
-
-              // Convert & Append
-              const hexStr = this.text.sliceByCp(hexStart, hexStart + hexCount);
-              const codePoint = parseInt(hexStr, 16);
-
-              if (codePoint > 0x10FFFF) {
-                   throw <LexError>{ tag: "InvalidEscape", reason: "Invalid Unicode Code Point (max 0x10FFFF)", span: this.makeSpan(braceStart) };
-              }
-
-              value += String.fromCodePoint(codePoint);
-              break;
-            }
-            default:
-              throw <LexError>{ 
-                tag: "InvalidEscape", 
-                reason: `UnknownEscapeSequence`, 
-                span: this.makeSpan(escapeStart) 
-              };
-        }
-      } else {
-        // 3. Regular character
-        // Optimization: consume chunks of non-special chars for speed?
-        // For now, char-by-char is fine.
-        this.next();
-        // Note: We use ! because we checked EOF at loop start
-        value += String.fromCodePoint(c!); 
-      }
    }
-
-    return {
-      tag: "string",
-      text: value,
-      span: this.makeSpan(start)
-    };
  }

+  return totalConsumed;
 }

+export function scanExprStart(cursor: Cursor): Result<ExprStartToken, ExprScanError> {
+  // TODO: Not implemented yet. The placeholder below returns 0 cast to `any`, which is not a valid Result; callers must not rely on it.
+  return (0 as any);
+}

 // TODO: Need a Token to TokenKind function
 // TODO: Need is_start_of_expression(token): boolean