Sketch basic scanner

2026-02-05 13:21:44 +01:00 · 2026-02-05 13:21:44 +01:00 · eb6ade5a3d
commit eb6ade5a3d
parent 99cd517a58
7 changed files with 836 additions and 6 deletions
--- a/src/parser/scanner.ts
+++ b/src/parser/scanner.ts
@ -0,0 +1,368 @@
+
+import { char, isWhitespace, isDigit } from './source_text';
+import type { SourceText, Span, SourceLocation, CodePoint, StringIndex, CodePointIndex } from './source_text';
+ 
+/**
+ * Reports whether `c` is one of the single-character punctuation symbols:
+ * `#`, `$`, `@`, `(`, `)`, `{`, `}`, `,` or `.`.
+ */
+function isSymbolChar(c: CodePoint): boolean {
+  // `switch` compares with `===` and evaluates the case expressions lazily,
+  // top to bottom, stopping at the first match.
+  switch (c) {
+    case char("#"):
+    case char("$"):
+    case char("@"):
+    case char("("):
+    case char(")"):
+    case char("{"):
+    case char("}"):
+    case char(","):
+    case char("."):
+      return true;
+    default:
+      return false;
+  }
+}
+
+/**
+ * Reports whether `c` may appear inside an identifier: any code point that is
+ * neither whitespace nor one of the punctuation symbols.
+ *
+ * The parameter is deliberately not called `char`, so it does not shadow the
+ * imported `char()` helper inside this function.
+ */
+function isIdentifierChar(c: CodePoint): boolean {
+  return !isWhitespace(c) && !isSymbolChar(c);
+}
+
+/** Keyword spellings. Note that `=` and `|` are classified as keywords here, not as symbols. */
+export type Keyword = "let" | "fn" | "match" | "apply" | "=" | "|";
+/**
+ * Single-character punctuation symbols; the same nine characters that
+ * `isSymbolChar` accepts.
+ * NOTE(review): inside this module the name shadows the global `Symbol` type.
+ */
+export type Symbol = "#" | "$" | "@" | "(" | ")" | "{" | "}" | "," | ".";
+
+// === Scanner ===
+/**
+ * A scanned token, discriminated by `tag`. Every variant carries the `span`
+ * it covers; the payload field depends on the variant (`value` for numbers,
+ * `text` for strings and identifiers, `kw` for keywords, `sym` for symbols).
+ */
+export type Token =
+  | { tag: "number", value: number, span: Span }
+  | { tag: "string", text: string, span: Span }
+  | { tag: "identifier", text: string, span: Span }
+  | { tag: "keyword", kw: Keyword, span: Span }
+  | { tag: "symbol", sym: Symbol, span: Span }
+  | { tag: "EOF", span: Span }
+
+/**
+ * Describes a kind of token without a span or literal payload.
+ * Symbol and keyword kinds name the specific symbol/keyword in `value`
+ * (whereas `Token` stores them in `sym` / `kw`).
+ */
+export namespace TokenKind {
+  export type T =
+    | { tag: "number" }
+    | { tag: "string" }
+    | { tag: "identifier" }
+    | { tag: "symbol", value: Symbol }
+    | { tag: "keyword", value: Keyword }
+    | { tag: "EOF" }
+}
+
+/**
+ * Errors thrown by the scanner, discriminated by `tag`. Every variant carries
+ * the `span` the error refers to.
+ * - `UnexpectedCharacter`: `char` is the offending code point.
+ * - `InvalidNumber`: `text` is the number's source text and `reason` a short
+ *   code (the scanner uses "MissingFractionalDigits" and "NotFinite").
+ * - `InvalidEscape`: `reason` explains what is wrong with a string escape.
+ */
+export type LexError =
+  | { tag: "UnexpectedCharacter", char: CodePoint, span: Span }
+  | { tag: "UnexpectedEOF", span: Span }
+  | { tag: "ExpectedNumber", span: Span }
+  | { tag: "InvalidNumber", text: string, reason: string, span: Span }
+  | { tag: "InvalidEscape", reason: string, span: Span };
+
+/**
+ * Hand-written scanner that walks a `SourceText` one code point at a time
+ * while tracking a 1-based line and column.
+ *
+ * Failures are reported by throwing a plain `LexError` object (not an `Error`
+ * instance). Every error is built through a `LexError`-typed declaration or
+ * helper rather than a type assertion, so the compiler checks that all
+ * required fields are present.
+ */
+export class Scanner {
+  // Index of the next unread code point in `text.chars`.
+  private i: CodePointIndex = 0;
+  private line = 1;
+  private column = 1;
+
+  // Set after consuming `\r` so that an immediately following `\n` is treated
+  // as the second half of `\r\n` instead of a second line break.
+  private lastCharWasCR = false;
+
+  constructor(private readonly text: SourceText) {}
+
+  /** True once the cursor has reached `text.length`. */
+  eof(): boolean {
+    return this.i >= this.text.length;
+  }
+
+  /** Returns the code point `n` positions ahead without consuming it, or `undefined` past the end. */
+  private peek(n = 0): CodePoint | undefined {
+    return this.text.chars[this.i + n]?.char;
+  }
+
+  /**
+   * Consumes and returns the current code point, or `undefined` at the end.
+   * Updates line/column; `\n`, `\r` and `\r\n` each count as one line break.
+   */
+  private next(): CodePoint | undefined {
+    const ref = this.text.chars[this.i];
+    if (!ref) return undefined;
+
+    const c = ref.char;
+    this.i++;
+
+    if (c === 0x0A /* \n */) {
+      if (!this.lastCharWasCR) {
+        this.line++;
+        this.column = 1;
+      } else {
+        // Second half of `\r\n`: the line was already bumped on `\r`.
+        this.lastCharWasCR = false;
+      }
+    } else if (c === 0x0D /* \r */) {
+      this.line++;
+      this.column = 1;
+      this.lastCharWasCR = true;
+    } else {
+      this.column++;
+      this.lastCharWasCR = false;
+    }
+
+    return c;
+  }
+
+  /** String offset of the current code point, or `text.source.length` at the end. */
+  private currentOffset(): StringIndex {
+    return this.text.chars[this.i]?.offset ?? this.text.source.length;
+  }
+
+  /** Snapshot of the cursor: code point index plus line and column. */
+  private currentLocation(): SourceLocation {
+    return { index: this.i, line: this.line, column: this.column };
+  }
+
+  /**
+   * Builds a span running from `start` up to (not including) the current
+   * position. `start`/`end` are string offsets; `line`/`column` are those of
+   * `start`.
+   */
+  private makeSpan(start: SourceLocation): Span {
+    const startOffset =
+      this.text.chars[start.index]?.offset ?? this.text.source.length;
+    const endOffset = this.currentOffset();
+
+    return {
+      start: startOffset,
+      end: endOffset,
+      line: start.line,
+      column: start.column,
+    };
+  }
+
+  /** Consumes code points while `pred` holds and returns how many were consumed. */
+  private consumeWhile(pred: (c: CodePoint) => boolean): number {
+    let count = 0;
+    while (!this.eof()) {
+      const c = this.peek();
+      if (c === undefined || !pred(c)) break;
+      this.next();
+      count++;
+    }
+    return count;
+  }
+
+  /**
+   * Consumes and returns the current code point if it satisfies `pred`;
+   * otherwise throws `error` without consuming anything.
+   */
+  private expect(
+    pred: (c: CodePoint) => boolean,
+    error: LexError
+  ): CodePoint {
+    const c = this.peek();
+    if (c === undefined || !pred(c)) {
+      throw error;
+    }
+    this.next();
+    return c;
+  }
+
+  /** Consumes the current code point if it equals `c`; reports whether it did. */
+  private match(c: CodePoint): boolean {
+    if (this.peek() === c) {
+      this.next();
+      return true;
+    }
+    return false;
+  }
+
+  /** Skips over any run of whitespace at the cursor. */
+  private skipWhitespace() {
+    this.consumeWhile(isWhitespace);
+  }
+
+  /** Builds an `InvalidEscape` error whose span runs from `start` to the current position. */
+  private invalidEscape(reason: string, start: SourceLocation): LexError {
+    return { tag: "InvalidEscape", reason, span: this.makeSpan(start) };
+  }
+
+  // === Main Scanners ===
+
+  /**
+   * Scans a number token:
+   *
+   *   number := optional(`-`) digits optional(`.` digits)
+   *
+   * @throws LexError `ExpectedNumber` when no digit follows the optional sign;
+   *   `InvalidNumber` when a `.` is not followed by digits
+   *   ("MissingFractionalDigits") or the value is not finite ("NotFinite").
+   */
+  private scanNumber(): Token {
+    const start = this.currentLocation();
+
+    // 1. Optional sign
+    this.match(char("-"));
+
+    // 2. Integer part
+    if (this.consumeWhile(isDigit) === 0) {
+      const error: LexError = {
+        tag: "ExpectedNumber",
+        span: this.makeSpan(start),
+      };
+      throw error;
+    }
+
+    // 3. Fractional part
+    const dotLocation = this.currentLocation();
+    if (this.match(char("."))) {
+      if (this.consumeWhile(isDigit) === 0) {
+        const error: LexError = {
+          tag: "InvalidNumber",
+          text: this.text.sliceByCp(start.index, this.i),
+          reason: "MissingFractionalDigits",
+          span: this.makeSpan(dotLocation),
+        };
+        throw error;
+      }
+    }
+
+    const text = this.text.sliceByCp(start.index, this.i);
+    const value = Number(text);
+
+    if (!Number.isFinite(value)) {
+      const error: LexError = {
+        tag: "InvalidNumber",
+        text,
+        reason: "NotFinite",
+        span: this.makeSpan(start),
+      };
+      throw error;
+    }
+
+    return {
+      tag: "number",
+      value,
+      span: this.makeSpan(start),
+    };
+  }
+
+  /**
+   * Scans a double-quoted string token; the cursor must be on the opening `"`.
+   * The token's `text` is the decoded content (escapes resolved, quotes
+   * dropped).
+   *
+   * @throws LexError `UnexpectedCharacter` / `UnexpectedEOF` when the cursor is
+   *   not on an opening quote, `UnexpectedEOF` when the input ends before the
+   *   closing quote, `InvalidEscape` for a malformed escape sequence.
+   */
+  private scanString(): Token {
+    const start = this.currentLocation();
+
+    // The caller is expected to have checked for the opening quote; if it did
+    // not, report what was actually found there.
+    const first = this.peek();
+    const openingError: LexError =
+      first === undefined
+        ? { tag: "UnexpectedEOF", span: this.makeSpan(start) }
+        : { tag: "UnexpectedCharacter", char: first, span: this.makeSpan(start) };
+    this.expect(c => c === char('"'), openingError);
+
+    let value = ""; // decoded string content
+
+    while (true) {
+      const c = this.peek();
+      if (this.eof() || c === undefined) {
+        const error: LexError = { tag: "UnexpectedEOF", span: this.makeSpan(start) };
+        throw error;
+      }
+
+      // 1. End of string
+      if (c === char('"')) {
+        this.next(); // consume closing quote
+        break;
+      }
+
+      // 2. Escape sequence
+      if (c === char("\\")) {
+        value += this.scanEscape();
+        continue;
+      }
+
+      // 3. Regular character
+      this.next();
+      value += String.fromCodePoint(c);
+    }
+
+    return {
+      tag: "string",
+      text: value,
+      span: this.makeSpan(start),
+    };
+  }
+
+  /**
+   * Scans one escape sequence; the cursor must be on the backslash. Consumes
+   * the whole sequence and returns the text it stands for.
+   *
+   * Supported: `\n`, `\r`, `\t`, `\v`, `\0`, `\\`, `\"` and `\u{HEX}`.
+   *
+   * @throws LexError `InvalidEscape` for any other character after the
+   *   backslash (including end of input).
+   */
+  private scanEscape(): string {
+    const escapeStart = this.currentLocation();
+    this.next(); // consume backslash
+
+    // Every branch consumes the escape letter itself, so the caller's loop
+    // never sees it again as a regular character.
+    switch (this.peek()) {
+      case char("n"): this.next(); return "\n";
+      case char("r"): this.next(); return "\r";
+      case char("t"): this.next(); return "\t";
+      case char("v"): this.next(); return "\v";
+      case char("0"): this.next(); return "\0";
+      case char("\\"): this.next(); return "\\";
+      case char('"'): this.next(); return '"';
+      case char("u"):
+        this.next(); // consume 'u'
+        return this.scanUnicodeEscapeBody();
+      default:
+        throw this.invalidEscape("UnknownEscapeSequence", escapeStart);
+    }
+  }
+
+  /**
+   * Scans the `{HEX}` part of a `\u{HEX}` escape; the cursor must be just past
+   * the `u`. Returns the string for that code point.
+   *
+   * @throws LexError `InvalidEscape` when a brace or the hex digits are
+   *   missing, or the value exceeds 0x10FFFF.
+   */
+  private scanUnicodeEscapeBody(): string {
+    // 1. Expect '{'
+    const braceStart = this.currentLocation();
+    if (!this.match(char("{"))) {
+      throw this.invalidEscape("Expected '{' after \\u", braceStart);
+    }
+
+    // 2. Consume hex digits
+    const hexStart = this.i;
+    const hexCount = this.consumeWhile(c =>
+      (c >= char("0") && c <= char("9")) ||
+      (c >= char("a") && c <= char("f")) ||
+      (c >= char("A") && c <= char("F"))
+    );
+    if (hexCount === 0) {
+      throw this.invalidEscape("Expected hex digits in \\u{...}", braceStart);
+    }
+
+    // 3. Expect '}'
+    if (!this.match(char("}"))) {
+      throw this.invalidEscape("Expected '}' closing unicode escape", braceStart);
+    }
+
+    // 4. Convert
+    const hexStr = this.text.sliceByCp(hexStart, hexStart + hexCount);
+    const codePoint = parseInt(hexStr, 16);
+    if (codePoint > 0x10FFFF) {
+      throw this.invalidEscape("Invalid Unicode Code Point (max 0x10FFFF)", braceStart);
+    }
+
+    return String.fromCodePoint(codePoint);
+  }
+
+}
+
+
+// TODO: Need a Token to TokenKind function
+// TODO: Need is_start_of_expression(token): boolean
+//   identifier -> true
+//   symbol # -> true
+//   symbol $ -> true
+//   symbol @ -> true
+//   symbol ( -> true
+//   symbol { -> true // this is actually context dependent. Sometimes it's the start of a binding context { params . body } or { let-params . body }, and sometimes it is a record. But this function is only going to be used in the first context.
+//   symbol _ -> false
+//   number   -> true
+//   string   -> true
+//   keyword let -> true
+//   keyword fn -> true
+//   keyword apply -> true
+//   keyword = -> false
+//   keyword | -> false
+//   EOF -> false
+//
+// TODO: function that matches a token with a token_type (returns bool)
+
+// TODO: forbidden characters are
+// '('
+// ')'
+// '{'
+// '}'
+// '.'
+// ','
+// '|'
+// '$'
+// '#'
+// '@'
+// '"'
+// ' '
+// '\r'
+// '\t'
+// '\n'
+// TODO: need function is_forbidden_char
+
+
+
+// === scanner functions ===
+// TODO: whitespace - consumes whitespace
+// TODO: comment - consumes token
+// TODO: raw_identifier - consumes raw identifier - then we can decide whether that was a keyword or an identifier
+// TODO: string - consumes string like "foo bar\njfjdsajfksd"
+// TODO: number - consumes number like 123123 or 000123 or 23919233.123
+//
+// TODO: token - gives next token
+