Finish parser

2026-02-06 18:55:21 +01:00 · 2026-02-06 18:55:21 +01:00 · 13a66f2d16
commit 13a66f2d16
parent d553a05f45
6 changed files with 772 additions and 58 deletions
--- a/src/parser/scanner.ts
+++ b/src/parser/scanner.ts
@ -6,7 +6,7 @@ import type { Cursor, CursorState, GenericScanError, NumberError, StringError }
 import { Result } from '../result';
 import { Expr } from 'src/value';

-function skipWhitespaceAndComments(cursor: Cursor): number {
+export function skipWhitespaceAndComments(cursor: Cursor): number {
  let totalConsumed = 0;

  while (true) {
@ -60,7 +60,13 @@ export type ExprScanError =
  | { tag: "InvalidIdentifier", text: string, kind: IdentifierKind, reason: IdentifierErrorReason, span: Span }

 // What kind of identifier were we trying to parse?
-export type IdentifierKind = "identifier" | "variable_use" | "tag_construction" | "function_call";
+export type IdentifierKind =
+  | "identifier"
+  | "variable_use"
+  | "tag_construction"
+  | "function_call"
+  | "pattern_binding"; // passed to identifier() by patternStart when scanning a pattern's binding name
+
 export type IdentifierErrorReason = 
  | { tag: "StartsWithDigit" }
  | { tag: "IsKeyword", kw: Keyword }
@ -73,12 +79,20 @@ export type ExprStartToken =
  | { tag: "function_name", name: string, span: Span }
  | { tag: "variable_use", name: string, span: Span }
  | { tag: "tag", name: string, span: Span }
-  | { tag: "tagged", name: string, span: Span } // TODO: This may be a bit weird. Actually we can lookahead and see if the next char after the identifier is NOT-AN-EXPRESSION start.
  | { tag: "tuple_start", span: Span }
  | { tag: "record_start", span: Span }
  | { tag: "keyword", kw: Keyword, span: Span }
+  // TODO: get rid of EOF
  | { tag: "EOF", span: Span }

+// The token that `patternStart` produces for the beginning of a pattern.
+// Every variant carries the source `span`; `pattern_binding` and `tag` also carry the scanned `name`.
+export type PatternStartToken =
+  | { tag: "pattern_binding", name: string, span: Span }
+  | { tag: "tag", name: string, span: Span }
+  | { tag: "tuple_start", span: Span }
+  | { tag: "record_start", span: Span }
+  // TODO: get rid of EOF
+  | { tag: "EOF", span: Span };
+
 // === Identifier Scanners ===

 // Returns the raw string.
@ -146,12 +160,14 @@ export function identifier(cursor: Cursor, kind: IdentifierKind): { name: string

 // === Literal Scanners ===
+// Thin wrapper over scanNumber: returns the { value, span } payload of an "ok" result,
+// otherwise throws the result's error (cast to ExprScanError).
 // throws ExprScanError
+// TODO: handle trailing whitespace
 function number(cursor: Cursor): { value: number, span: Span } {
  const res = scanNumber(cursor);
  if (res.tag === "ok") { return res.value; } else { throw (res.error as ExprScanError); }
 }

 // throws ExprScanError
+// TODO: handle trailing whitespace
 function string(cursor: Cursor): { value: string, span: Span } {
  const res = scanString(cursor);
  if (res.tag === "ok") { return res.value; } else { throw (res.error as ExprScanError); }
@ -159,10 +175,10 @@ function string(cursor: Cursor): { value: string, span: Span } {

 // === complex scanners ===

+// TODO: in exprStart/patternStart make sure whitespace is consumed after they successfully produce a token. Should we build it into the functions? Or should that be in `parser.ts`?
+
 // throws ExprScanError
 export function exprStart(cursor: Cursor): ExprStartToken {
-  skipWhitespaceAndComments(cursor);
-
  const start = cursor.currentLocation();
  if (cursor.eof()) {
    return { tag: "EOF", span: cursor.makeSpan(start) };
@ -220,53 +236,105 @@ export function exprStart(cursor: Cursor): ExprStartToken {
  }
 }

+// Scans the token that begins a pattern, starting at the cursor's current position.
+// This function itself makes no call to skipWhitespaceAndComments.
+//   end of input -> "EOF"             (nothing consumed)
+//   '('          -> "tuple_start"     (consumes the paren)
+//   '{'          -> "record_start"    (consumes the brace)
+//   '#'          -> "tag"             (consumes '#', then scans the name with identifier(..., 'tag_construction'))
+//   otherwise    -> "pattern_binding" (name scanned with identifier(..., 'pattern_binding'))
+// throws ExprScanError: an "UnexpectedCharacter" error is thrown when the next character is a
+// digit, '"' or '-'. NOTE(review): identifier() presumably throws on invalid names too; confirm.
+export function patternStart(cursor: Cursor): PatternStartToken {
+  const start = cursor.currentLocation();

-// TODO: Need a Token to TokenKind function
-// TODO: Need is_start_of_expression(token): boolean
-//   identifier -> true
-//   symbol # -> true
-//   symbol $ -> true
-//   symbol @ -> true
-//   symbol ( -> true
-//   symbol { -> true // this is actually context dependent. Sometimes its a start of a binding context { params . body } or { let-params . body }, and sometimes it is a record. But this function is gonna be used only in the first context 
-//   symbol _ -> false
-//   number   -> true
-//   string   -> true
-//   keyword let -> true
-//   keyword fn -> true
-//   keyword apply -> true
-//   keyword = -> false
-//   keyword | -> false
-//   EOF -> false
-//
-// TODO: function that matches a token with a token_type (returns bool)
+  if (cursor.eof()) {
+    return { tag: "EOF", span: cursor.makeSpan(start) };
+  }

-// TODO: forbidden characters are
-// '('
-// ')'
-// '{'
-// '}'
-// '.'
-// ','
-// '|'
-// '$'
-// '#'
-// '@'
-// '"'
-// ' '
-// '\r'
-// '\t'
-// '\n'
-// TODO: need function is_forbidden_char
+  const c = cursor.peek()!;
+  // === tuple ===
+  if (c === char('(')) {
+    cursor.next();
+    return { tag: "tuple_start", span: cursor.makeSpan(start) };
+  }

+  // === record ===
+  if (c === char('{')) {
+    cursor.next();
+    return { tag: "record_start", span: cursor.makeSpan(start) };
+  }

+  // === tag ===
+  if (c === char('#')) {
+    cursor.next();
+    const { name } = identifier(cursor, 'tag_construction');
+    return { tag: "tag", name, span: cursor.makeSpan(start) };
+  }

-// === scanner functions ===
-// TODO: whitespace - consumes whitespace
-// TODO: comment - consumes token
-// TODO: raw_identifier - consumes raw identifier - then we can decide whether that was a keyword or an identifier
-// TODO: string - consumes string like "foo bar\njfjdsajfksd"
-// TODO: number - consumes number like 123123 or 000123 or 23919233.123
-//
-// TODO: token - gives next token
+  // TODO: This is more subtle... -foo is a valid pattern name... I think I should restrict: can't have identifiers start with `-`? But then `-` itself can't be an identifier, which is a bit sad.
+  // TODO: This is gonna be different once we allow number/string literals as patterns.
+  if (isDigit(c) || c === char('"') || c === char('-')) {
+     throw {
+       tag: "UnexpectedCharacter",
+       char: c,
+       span: cursor.makeSpan(start)
+     } as ExprScanError;
+  }

+  // === pattern binding ===
+  const { name } = identifier(cursor, 'pattern_binding');
+  return { tag: "pattern_binding", name, span: cursor.makeSpan(start) };
+}
+
+/**
+ * Non-consuming lookahead: reports whether the input at the cursor begins an expression.
+ *
+ * Runs `exprStart` on the cursor and classifies the token it produces. The cursor
+ * state saved on entry is always restored, so the cursor is left where it started
+ * regardless of the outcome.
+ *
+ * @param cursor - cursor positioned where an expression might start
+ * @returns `true` for number, string, variable use, tag, tuple start, record start and
+ *   function name tokens, and for the keywords `let`, `fn`, `match` and `apply`;
+ *   `false` for every other keyword, for EOF, and when `exprStart` throws.
+ */
+export function isNextTokenExprStart(cursor: Cursor): boolean {
+  const state = cursor.save();
+  try {
+    const token = exprStart(cursor);
+
+    switch (token.tag) {
+      case "number":
+      case "string":
+      case "variable_use":
+      case "tag":
+      case "tuple_start":
+      case "record_start":
+      case "function_name": // e.g. my_func(x)
+        return true;
+
+      case "keyword":
+        // Only these keywords open an expression; `=`, `|`, `!` and any other keyword do not.
+        // Returning here (rather than falling through into the next case) keeps the switch
+        // free of implicit fallthrough.
+        return (
+          token.kw === "let" ||
+          token.kw === "fn" ||
+          token.kw === "match" ||
+          token.kw === "apply"
+        );
+
+      default: // "EOF" and anything else
+        return false;
+    }
+  } catch {
+    // A scan error means nothing valid starts here, so it is not an expression start.
+    return false;
+  } finally {
+    cursor.restore(state);
+  }
+}
+
+/**
+ * Non-consuming lookahead: reports whether the input at the cursor begins a product pattern.
+ *
+ * Runs `patternStart` and answers `true` when the token is a pattern binding, a tuple start
+ * or a record start. Any other token, or an error thrown while scanning, yields `false`.
+ * The cursor state saved on entry is restored before returning.
+ */
+export function isNextTokenProductPatternStart(cursor: Cursor): boolean {
+  const checkpoint = cursor.save();
+  try {
+    const { tag } = patternStart(cursor);
+    return tag === "pattern_binding" || tag === "tuple_start" || tag === "record_start";
+  } catch (_scanError) {
+    return false;
+  } finally {
+    cursor.restore(checkpoint);
+  }
+}