Finish parser

2026-02-06 18:55:21 +01:00 · 2026-02-06 18:55:21 +01:00 · 13a66f2d16
commit 13a66f2d16
parent d553a05f45
6 changed files with 772 additions and 58 deletions
--- a/src/parser/parser.ts
+++ b/src/parser/parser.ts
@ -0,0 +1,435 @@
+import { Expr, ExprBinding, FieldAssignment, FieldPattern, MatchBranch, Pattern, ProductPattern } from '../value';
+import { Cursor } from './cursor';
+import { ExprScanError, exprStart, ExprStartToken, identifier, isNextTokenExprStart, isNextTokenProductPatternStart, patternStart, PatternStartToken, skipWhitespaceAndComments } from './scanner';
+import { char, CodePoint, SourceText, Span } from './source_text';
+import { Result } from '../result';
+
+// CONVENTION: Every parser is responsible for consuming the whitespace/comments that follow whatever it has parsed.
+//             No parser is responsible for skipping whitespace/comments at its start - only the final, publicly exposed `parse` does that.
+
+const DELIMITER_COMMA = char(',');  // separates items in tuples, records, argument lists, let-bindings, fn parameters and tuple/record patterns
+const DELIMITER_PIPE = char('|');  // separates the branches of a `match` block
+const TERMINATOR_CLOSE_PAREN = char(')');  // ends tuples, call/apply argument lists and tuple patterns
+const TERMINATOR_CLOSE_BRACE = char('}');  // ends records, record patterns and `let`/`fn`/`match` blocks
+const TERMINATOR_DOT = char('.');  // ends the binding list of `let` and the parameter list of `fn`, right before the body
+
+// TODO: Errors have to have better contextual information
+export type ParseError =
+  | ExprScanError
+  | { tag: "UnexpectedToken", expected: string, span: Span }  // Generic mismatch; `expected` is a human-readable description (e.g. "Expression", "End of File")
+  | { tag: "UnexpectedTokenWhileParsingSequence", reason: "ExpectedDelimiterOrTerminator", received: CodePoint, expectedDelimiter: CodePoint, expectedTerminator: CodePoint, span: Span }  // An item of a delimited sequence was followed by neither the delimiter nor the terminator. NOTE(review): `received` is filled from `cursor.peek()`, presumably `undefined` at end of input - confirm
+
+  // === Specific Context Errors ===
+  | { tag: "ExpectedExpression", span: Span }          // Expected start of expr (e.g. hit EOF or keyword)
+  | { tag: "ExpectedFieldAssignmentSymbol", span: Span }  // Expected '=' in field assignment
+  | { tag: "ExpectedPatternAssignmentSymbol", span: Span }  // Expected '=' in pattern assignment
+  | { tag: "ExpectedPatternBindingSymbol", span: Span }  // Expected '.' in pattern binding
+  | { tag: "ExpectedFunctionCallStart", span: Span }   // Expected '(' after function name
+  | { tag: "ExpectedLetBlockOpen", span: Span }        // Expected '{' after 'let'
+  | { tag: "ExpectedLetBlockClose", span: Span }       // Expected '}' at end of 'let' expression
+  | { tag: "ExpectedMatchBlockOpen", span: Span }        // Expected '{' after 'match'
+  | { tag: "ExpectedMatchBlockClose", span: Span }        // Expected '}' at end of 'match' expression
+  | { tag: "ExpectedLambdaBlockOpen", span: Span }   // Expected '{' after `fn`
+  | { tag: "ExpectedLambdaBlockClose", span: Span }   // Expected '}' at end of `fn` expression
+  | { tag: "ExpectedApplyStart", span: Span }          // Expected '(' after 'apply'
+  | { tag: "ExpectedApplySeparator", span: Span }      // Expected '!' inside 'apply'
+  | { tag: "UnexpectedTagPattern", span: Span }       // Found #tag where product pattern expected
+  | { tag: "ExpectedPattern", span: Span }            // EOF or invalid start of pattern
+  | { tag: "ExpectedRecordField", span: Span };       // Expected identifier in record pattern
+
+// Tags of the context-specific `ParseError` variants, i.e. the ones that carry nothing but a `span`.
+// This list mirrors the "Specific Context Errors" section of `ParseError`; keep the two in sync.
+// TODO: Delete?
+export type Expectation =
+  | "ExpectedExpression"
+  | "ExpectedFieldAssignmentSymbol"
+  | "ExpectedPatternAssignmentSymbol"
+  | "ExpectedPatternBindingSymbol"
+  | "ExpectedFunctionCallStart"
+  | "ExpectedLetBlockOpen"
+  | "ExpectedLetBlockClose"
+  | "ExpectedMatchBlockOpen"
+  | "ExpectedMatchBlockClose"
+  | "ExpectedLambdaBlockOpen"
+  | "ExpectedLambdaBlockClose"
+  | "ExpectedApplyStart"
+  | "ExpectedApplySeparator"
+  | "UnexpectedTagPattern"
+  | "ExpectedPattern"
+  | "ExpectedRecordField";
+
+export type Parser<T> = (cursor: Cursor) => T  // A parser reads from the given cursor and returns the parsed value; the parsers in this file report failure by throwing a `ParseError` object
+
+// === Helpers ===
+// Takes one code point off the cursor and then skips the whitespace/comments that follow it.
+// Prefer this over calling `cursor.next()` directly - in most cases the trailing WS/comments have to go too.
+// Returns the consumed code point, or `undefined` when `cursor.next()` produced nothing (then nothing is skipped).
+function nextWS(cursor: Cursor): CodePoint | undefined {
+  const consumed = cursor.next();
+  if (consumed !== undefined) {
+    skipWhitespaceAndComments(cursor);
+  }
+  return consumed;
+}
+
+// Consumes `expected` (together with the whitespace/comments after it) when it is the next code point.
+// Returns true if the char was consumed, false if the next code point is something else.
+function tryConsume(cursor: Cursor, expected: CodePoint): boolean {
+  const isNext = cursor.peek() === expected;
+  if (isNext) {
+    nextWS(cursor);
+  }
+  return isNext;
+}
+
+// TODO: Perhaps when it comes to terminators, we should allow the user of this function to create better contextual errors?
+// Parses a sequence of `p` items separated by `delimiter` and closed by `terminator`.
+// The terminator is consumed as well (including the whitespace/comments after it).
+// Throws an "UnexpectedTokenWhileParsingSequence" error when an item is followed by
+// something that is neither the delimiter nor the terminator.
+function delimitedTerminalSequence<A>(cursor: Cursor, delimiter: CodePoint, terminator: CodePoint, p: Parser<A>): A[] {
+  // Taking `.` as the terminator and `,` as the delimiter, the accepted sequences are
+  //   list(p) :=
+  //     | p
+  //     | p , p
+  //     | p , p , p
+  //     | ...
+  //   delimitedTerminalSequence(p) :=
+  //     | .
+  //     | optional(,) list(p) optional(,) .
+  //
+  // Every use-case has a well-defined terminator character:
+  //   tuples:         ( a, b, c )           -> `)`
+  //   records:        { f0 = e0, f1 = e1 }  -> `}`
+  //   function call:  f(a, b, c)            -> `)`
+  //   let-binding:    let { p = e . body }  -> `.`
+  //   fn-abstraction: fn { p0, p1 . body }  -> `.`
+  //   fn-application: apply(e ! e0, e1, e2) -> `)`
+  const parsed: A[] = [];
+
+  // Empty sequence: the terminator comes right away.
+  if (cursor.peek() === terminator) {
+    nextWS(cursor);
+    return parsed;
+  }
+
+  // A single leading delimiter is tolerated.
+  if (cursor.peek() === delimiter) {
+    nextWS(cursor);
+  }
+
+  for (;;) {
+    // `p` is responsible for consuming the whitespace/comments after its item.
+    parsed.push(p(cursor));
+
+    const upcoming = cursor.peek();
+    const closesSequence = upcoming === terminator;
+
+    if (!closesSequence && upcoming !== delimiter) {
+      throw {
+        tag: "UnexpectedTokenWhileParsingSequence",
+        received: upcoming,
+        reason: "ExpectedDelimiterOrTerminator",
+        expectedDelimiter: delimiter,
+        expectedTerminator: terminator,
+        span: cursor.makeSpan(cursor.currentLocation()),
+      } as ParseError;
+    }
+
+    // Consume whichever of the two it was.
+    nextWS(cursor);
+    if (closesSequence) {
+      return parsed;
+    }
+
+    // A delimiter was consumed: it was a trailing one if the terminator follows,
+    // otherwise it was a separator and another item comes next.
+    if (cursor.peek() === terminator) {
+      nextWS(cursor);
+      return parsed;
+    }
+  }
+}
+
+// Scans the token that starts an expression and skips the whitespace/comments right after it.
+function exprStartToken(cursor: Cursor): ExprStartToken {
+  const startToken = exprStart(cursor);
+  skipWhitespaceAndComments(cursor);
+  return startToken;
+}
+
+// Scans the token that starts a pattern and skips the whitespace/comments right after it.
+function patternStartToken(cursor: Cursor): PatternStartToken {
+  const startToken = patternStart(cursor);
+  skipWhitespaceAndComments(cursor);
+  return startToken;
+}
+
+// === Expression Parsers ===
+
+// Parses one expression, dispatching on the token that starts it.
+//
+// The caller is expected to have skipped leading whitespace/comments already; the start token,
+// delimiters and terminators consumed here all have their trailing whitespace/comments skipped.
+//
+// Throws a `ParseError` object when the input does not form a valid expression.
+function expr(cursor: Cursor): Expr {
+  const token = exprStartToken(cursor);
+  // TODO: You need to include the spans and perhaps other meta-info.
+  // Every case that declares locals is wrapped in its own block, so the declarations
+  // are scoped to that case instead of leaking into the whole `switch`.
+  switch (token.tag) {
+    case "EOF":
+      throw {
+        tag: "UnexpectedToken",
+        expected: "Expression",
+        span: token.span
+      } as ParseError;
+    case "number":
+      return Expr.number(token.value);
+    case "string":
+      return Expr.string(token.text);
+    case "variable_use":
+      return Expr.var_use(token.name);
+    case "tag": {
+      // #true
+      // #foo e
+      if (isNextTokenExprStart(cursor)) {
+        const value = expr(cursor);
+        return Expr.tagged(token.name, value);
+      } else {
+        return Expr.tag(token.name);
+      }
+    }
+    case "tuple_start": {
+      // e.g. (a, b, c)
+      const items = delimitedTerminalSequence(cursor, DELIMITER_COMMA, TERMINATOR_CLOSE_PAREN, expr);
+      return Expr.tuple(items);
+    }
+    case "record_start": {
+      // e.g. { x = 1, y = 2 }
+      const fields = delimitedTerminalSequence(cursor, DELIMITER_COMMA, TERMINATOR_CLOSE_BRACE, fieldAssignment);
+      return Expr.record(fields);
+    }
+    case "function_name": {
+      // e.g. my_func(arg1, arg2)
+      // The `(` is consumed here; the sequence parser consumes the arguments and the closing `)`.
+      if (!tryConsume(cursor, char('('))) {
+        throw {
+          tag: "ExpectedFunctionCallStart",
+          span: cursor.makeSpan(cursor.currentLocation())
+        } as ParseError;
+      }
+      const args = delimitedTerminalSequence(cursor, DELIMITER_COMMA, TERMINATOR_CLOSE_PAREN, expr);
+      return Expr.call(token.name, args);
+    }
+    case "keyword":
+      switch (token.kw) {
+        case "let": {
+          // let { p0 = e0, p1 = e2 . body }
+          if (!tryConsume(cursor, char('{'))) {
+            throw {
+              tag: "ExpectedLetBlockOpen",
+              span: cursor.makeSpan(cursor.currentLocation())
+            } as ParseError;
+          }
+          // The `.` terminator of the binding list is consumed by the sequence parser.
+          const bindings = delimitedTerminalSequence(cursor, DELIMITER_COMMA, TERMINATOR_DOT, productPatternBinding);
+          const body = expr(cursor);
+
+          if (!tryConsume(cursor, TERMINATOR_CLOSE_BRACE)) {
+            throw {
+              tag: "ExpectedLetBlockClose",
+              span: cursor.makeSpan(cursor.currentLocation())
+            } as ParseError;
+          }
+          return Expr.let_(bindings, body);
+        }
+        case "fn": {
+          // fn { p0, p1, p2 . body }
+          if (!tryConsume(cursor, char('{'))) {
+            throw {
+              tag: "ExpectedLambdaBlockOpen",
+              span: cursor.makeSpan(cursor.currentLocation())
+            } as ParseError;
+          }
+
+          const parameters = delimitedTerminalSequence(cursor, DELIMITER_COMMA, TERMINATOR_DOT, productPattern);
+          const body = expr(cursor);
+          if (!tryConsume(cursor, TERMINATOR_CLOSE_BRACE)) {
+            throw {
+              tag: "ExpectedLambdaBlockClose",
+              span: cursor.makeSpan(cursor.currentLocation())
+            } as ParseError;
+          }
+          return Expr.lambda(parameters, body);
+        }
+        case "apply": {
+          // apply(e ! e0, e1, e2)
+          if (!tryConsume(cursor, char('('))) {
+            throw {
+              tag: "ExpectedApplyStart",
+              span: cursor.makeSpan(cursor.currentLocation())
+            } as ParseError;
+          }
+          const callee = expr(cursor);
+          if (!tryConsume(cursor, char('!'))) {
+            throw {
+              tag: "ExpectedApplySeparator",
+              span: cursor.makeSpan(cursor.currentLocation())
+            } as ParseError;
+          }
+
+          const args = delimitedTerminalSequence(cursor, DELIMITER_COMMA, TERMINATOR_CLOSE_PAREN, expr);
+          return Expr.apply(callee, args);
+        }
+        case "match": {
+          // match e { branch0 | branch1 | branch2 }
+          const arg = expr(cursor);
+          if (!tryConsume(cursor, char('{'))) {
+            throw {
+              tag: "ExpectedMatchBlockOpen",
+              span: cursor.makeSpan(cursor.currentLocation())
+            } as ParseError;
+          }
+
+          // The closing `}` is the terminator of the branch sequence, so it is consumed there.
+          const branches = delimitedTerminalSequence(cursor, DELIMITER_PIPE, TERMINATOR_CLOSE_BRACE, matchBranch);
+          return Expr.match(arg, branches);
+        }
+        case "=":
+        case "|":
+        case "!":
+          // These keywords CANNOT start an expression.
+          throw {
+            tag: "ExpectedExpression",
+            span: token.span
+          } as ParseError;
+      }
+  }
+}
+
+// Parses a single `match` branch: `p . body`.
+// Throws "ExpectedPatternBindingSymbol" when the pattern is not followed by `.`.
+function matchBranch(cursor: Cursor): MatchBranch {
+  const branchPattern = pattern(cursor);
+
+  const sawDot = tryConsume(cursor, char("."));
+  if (sawDot) {
+    const branchBody = expr(cursor);
+    return Expr.matchBranch(branchPattern, branchBody);
+  }
+
+  throw {
+    tag: "ExpectedPatternBindingSymbol",
+    span: cursor.makeSpan(cursor.currentLocation())
+  } as ParseError;
+}
+
+// Parses one binding of a `let` block: `p = e`, where `p` is a product pattern.
+// Throws "ExpectedPatternAssignmentSymbol" when the pattern is not followed by `=`.
+function productPatternBinding(cursor: Cursor): ExprBinding {
+  // TODO: There's a potential here to do a lot of work on nice errors.
+  // `p = e`
+  // here there could be problems like the pattern being just a variable that uses `=` as its part
+  // `x= = 123` is valid. Maybe in case of erroneous things like
+  // `x=123` - which just parses as an identifier - we should analyze the identifier and produce a suggestion? idk...
+  // or even...
+  // `x= 123` - which just parses as an identifier - we should analyze the identifier and produce a suggestion? idk...
+  // or
+  // `x =123` - this one technically is a sequence of two identifiers.
+  // Named `boundPattern` so the local does not shadow the `pattern` parser of this module.
+  const boundPattern = productPattern(cursor);
+
+  if (!tryConsume(cursor, char('='))) {
+    // A missing `=` is the *assignment* symbol error; "ExpectedPatternBindingSymbol" is reserved for a missing `.`.
+    throw {
+      tag: "ExpectedPatternAssignmentSymbol",
+      span: cursor.makeSpan(cursor.currentLocation())
+    } as ParseError;
+  }
+  const e = expr(cursor);
+  return Expr.exprBinding(boundPattern, e);
+}
+
+// Parses one field of a record expression: `f = e`.
+// Throws "ExpectedFieldAssignmentSymbol" when the field name is not followed by `=`.
+function fieldAssignment(cursor: Cursor): FieldAssignment {
+  // `f = e`
+  const { name } = identifier(cursor, 'identifier');
+  // `identifier` is a raw scanner call, so the whitespace/comments after the name are skipped here,
+  // the same way `exprStartToken`/`patternStartToken` do it for their scanner calls.
+  // NOTE(review): if the scanner already skips them, this call simply finds nothing to skip.
+  skipWhitespaceAndComments(cursor);
+
+  if (!tryConsume(cursor, char('='))) {
+    throw {
+      tag: "ExpectedFieldAssignmentSymbol", // Specific error for this context
+      span: cursor.makeSpan(cursor.currentLocation())
+    } as ParseError;
+  }
+
+  const value = expr(cursor);
+  return Expr.fieldAssignment(name, value);
+}
+
+// Parses a full pattern: either a tag pattern or a product pattern.
+//   x
+//   (x, y, z)
+//   ((x, y), z)
+//   { a = x, b = y }
+//   { a, b = y }
+//   #foo
+//   #foo x
+//   #foo (x, y)
+function pattern(cursor: Cursor): Pattern {
+  const token = patternStartToken(cursor);
+
+  // Anything that does not start with a tag is a plain product pattern.
+  if (token.tag !== "tag") {
+    return finishProductPattern(cursor, token);
+  }
+
+  // Standalone tag: #foo
+  if (!isNextTokenProductPatternStart(cursor)) {
+    return Pattern.tag(token.name);
+  }
+
+  // Tagged value, e.g. #foo x or #foo (a,b) - the payload must be a product pattern.
+  const payload = productPattern(cursor);
+  return Pattern.tagged(token.name, payload);
+}
+
+// Parses a product pattern:
+//   x
+//   (x, y, z)
+//   ((x, y), z)
+//   { a = x, b = y }
+//   { a, b = y }
+//
+// A tag pattern in this position is a syntax error, i.e. unexpected-tag-pattern
+// (a renderer could then explain that tag patterns can't be deeply nested).
+function productPattern(cursor: Cursor): ProductPattern {
+  return finishProductPattern(cursor, patternStartToken(cursor));
+}
+
+// Completes a product pattern whose start token has already been scanned.
+// Throws "ExpectedPattern" on EOF and "UnexpectedTagPattern" when the start token is a tag.
+function finishProductPattern(cursor: Cursor, token: PatternStartToken): ProductPattern {
+  switch (token.tag) {
+    // Tokens that cannot start a product pattern.
+    case "EOF":
+      throw { tag: "ExpectedPattern", span: token.span } as ParseError;
+
+    case "tag":
+      throw { tag: "UnexpectedTagPattern", span: token.span } as ParseError;
+
+    // foo
+    case "pattern_binding":
+      return ProductPattern.any(token.name);
+
+    // ( p1, p2 )
+    case "tuple_start": {
+      const elements = delimitedTerminalSequence(cursor, DELIMITER_COMMA, TERMINATOR_CLOSE_PAREN, productPattern);
+      return ProductPattern.tuple(elements);
+    }
+
+    // { a = p, b }
+    case "record_start": {
+      const fieldPatterns = delimitedTerminalSequence(cursor, DELIMITER_COMMA, TERMINATOR_CLOSE_BRACE, recordPatternField);
+      return ProductPattern.record(fieldPatterns);
+    }
+  }
+}
+
+// Parses one field of a record pattern: either `a = p` or the punned form `a`.
+function recordPatternField(cursor: Cursor): FieldPattern {
+  const { name } = identifier(cursor, 'identifier'); // Reuse existing identifier scanner
+  // `identifier` is a raw scanner call, so the whitespace/comments after the name are skipped here,
+  // the same way `exprStartToken`/`patternStartToken` do it for their scanner calls.
+  // NOTE(review): if the scanner already skips them, this call simply finds nothing to skip.
+  skipWhitespaceAndComments(cursor);
+
+  if (tryConsume(cursor, char('='))) {
+    const p = productPattern(cursor);
+    return ProductPattern.fieldPattern(name, p);
+  } else {
+    // Punning: { a } -> { a = a }
+    return ProductPattern.fieldPattern(name, ProductPattern.any(name));
+  }
+}
+
+
+// Public entry point: parses `input` as exactly one expression.
+// Leading whitespace/comments are skipped here; input left over after the expression
+// is reported as an "UnexpectedToken" error expecting "End of File".
+// Anything thrown while parsing is returned as the error side of the `Result`.
+export function parse(input: string): Result<Expr, ParseError> {
+  const cursor = new Cursor(new SourceText(input));
+
+  try {
+    skipWhitespaceAndComments(cursor);
+    const parsed = expr(cursor);
+
+    if (cursor.eof()) {
+      return Result.ok(parsed);
+    }
+
+    const leftoverInput = {
+      tag: "UnexpectedToken",
+      expected: "End of File",
+      span: cursor.makeSpan(cursor.currentLocation())
+    } as ParseError;
+    return Result.error(leftoverInput);
+  } catch (e) {
+    // TODO: This is a bit sketchy. We may be forced to have "checked" Exceptions for `ParseError` by wrapping it in something that has a proper tag.
+    // NOTE(review): every thrown value ends up here, including ones that are not parse errors.
+    return Result.error(e as ParseError);
+  }
+}