Prep for parser

Yura Dupyn 2026-02-05 00:23:27 +01:00
parent a478542c2a
commit 99cd517a58
3 changed files with 598 additions and 14 deletions

src/PARSER-PLAN.md (new file, 336 lines)

@@ -0,0 +1,336 @@
## Goal
Implement a correct parser for the language described in `SYNTAX.md`, producing the existing AST types (`Expr`, `Pattern`, `ProductPattern`, etc.).
Code quality is **not** the primary concern.
Correctness, clarity, and reasonable error messages are.
---
## Overall architecture
The parser is split into **two stages**:
1. **Lexing (tokenization)**
Converts source text into a stream of tokens, each with precise source location info.
2. **Parsing**
Consumes the token stream and constructs the AST using recursive-descent parsing.
This split is deliberate and should be preserved.
---
## Stage 1: Lexer (Tokenizer)
### Purpose
The lexer exists to:
* Normalize the input into a small set of token types
* Track **line / column / offset** precisely
* Make parsing simpler and more reliable
* Enable good error messages later
The lexer is intentionally **simple and dumb**:
* No semantic decisions
* No AST construction
* Minimal lookahead
---
### Unicode handling
The input may contain arbitrary Unicode (including emoji) inside identifiers and strings.
**Important rule**:
* Iterate over Unicode *code points*, not UTF-16 code units.
In TypeScript:
* Use `for (const ch of input)` or equivalent
* Do **not** index into strings with `input[i]`
Column counting:
* Increment column by **1 per code point**
* Exact visual width is not required
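The scanning loop can be sketched like this (a sketch only; the `Pos` type and generator shape are illustrative, not part of the plan):

```ts
// Sketch: code-point iteration with position tracking.
// Offsets and columns advance by 1 per code point, as described above.
type Pos = { offset: number; line: number; column: number };

function* codePoints(input: string): Generator<[string, Pos]> {
  let offset = 0, line = 1, column = 1;
  for (const ch of input) {   // `for...of` yields code points, not UTF-16 units
    yield [ch, { offset, line, column }];
    offset += 1;
    if (ch === "\n") { line += 1; column = 1; }
    else { column += 1; }
  }
}
```

For `"a😀\nb"` this yields four code points (the emoji counts as one), with `b` at offset 3, line 2, column 1.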
---
### Source positions and spans
All tokens must carry precise location information.
Suggested types (can be adjusted):
```ts
type Position = {
offset: number; // code-point index from start of input
line: number; // 1-based
column: number; // 1-based
};
type Span = {
start: Position;
end: Position;
};
```
Each token has a `span`.
---
### Token types
Suggested minimal token set:
```ts
type Token =
| { kind: "number"; value: number; span: Span }
| { kind: "string"; value: string; span: Span }
| { kind: "identifier"; value: string; span: Span }
| { kind: "keyword"; value: Keyword; span: Span }
| { kind: "symbol"; value: Symbol; span: Span }
| { kind: "eof"; span: Span };
```
Where:
```ts
type Keyword = "let" | "fn" | "match" | "apply" | "=" | "!" | "|";
type Symbol = "#" | "$" | "(" | ")" | "{" | "}" | "," | ".";
```
Notes:
* Operators like `+`, `==`, `<=`, `*` are **identifiers**
* `=` is treated as a keyword (same for `|`)
* Identifiers are parsed first, then checked against keywords
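That order can be sketched as follows (the set is illustrative; it mirrors the `Keyword` type above):

```ts
// Sketch: scan the longest identifier-like chunk first,
// then reclassify it as a keyword if it matches exactly.
const KEYWORDS = new Set(["let", "fn", "match", "apply", "=", "!", "|"]);

function classify(word: string): "keyword" | "identifier" {
  return KEYWORDS.has(word) ? "keyword" : "identifier";
}
```

So `=` lexes as a keyword, while `==` and `foo=` remain identifiers.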
---
### Lexer responsibilities
The lexer should:
* Skip whitespace (spaces, tabs, newlines)
* Track line and column numbers
* Emit tokens with correct spans
* Fail immediately on:
* Unterminated string literals
* Invalid characters
The lexer **should not**:
* Attempt error recovery
* Guess intent
* Validate grammar rules
---
## Stage 2: Parser
### Parsing strategy
Use **recursive-descent parsing**.
The grammar is:
* Context-free
* Non-left-recursive
* No precedence rules
* No implicit associativity
This makes recursive descent ideal.
---
### Parser state
The parser operates over:
```ts
class Parser {
tokens: Token[];
pos: number;
}
```
Helper methods are encouraged:
```ts
peek(): Token
advance(): Token
matchKeyword(kw: Keyword): boolean
matchSymbol(sym: Symbol): boolean
expectKeyword(kw: Keyword): Token
expectSymbol(sym: Symbol): Token
error(message: string, span?: Span): never
```
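A minimal sketch of how these helpers might fit together (the token shape is simplified to `kind`/`value`, and error handling is reduced to a plain throw):

```ts
// Sketch: token-cursor helpers; the real versions carry spans
// and the Keyword/Symbol types from the lexer section.
type Tok = { kind: string; value?: string };

class ParserSketch {
  constructor(private tokens: Tok[], private pos = 0) {}

  peek(): Tok { return this.tokens[this.pos]; }

  advance(): Tok { return this.tokens[this.pos++]; }

  matchKeyword(kw: string): boolean {
    const t = this.peek();
    if (t.kind === "keyword" && t.value === kw) { this.pos++; return true; }
    return false;
  }

  expectKeyword(kw: string): Tok {
    if (!this.matchKeyword(kw)) throw new Error(`expected keyword '${kw}'`);
    return this.tokens[this.pos - 1];
  }
}
```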
---
### Error handling
Error recovery is **not required**.
On error:
* Throw a `ParseError`
* Include:
* A clear message
* A span pointing to the offending token (or best approximation)
The goal is:
* One good error
* Accurate location
* No cascading failures
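A `ParseError` along these lines would do (a sketch; `Span` is the type suggested in the lexer section):

```ts
// Sketch: ParseError carries a message plus an optional span,
// and bakes the location into the message for convenience.
type Pos = { offset: number; line: number; column: number };
type Span = { start: Pos; end: Pos };

class ParseError extends Error {
  constructor(message: string, public span?: Span) {
    super(span ? `${message} at ${span.start.line}:${span.start.column}` : message);
    this.name = "ParseError";
  }
}
```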
---
### Expression parsing
There is **no precedence hierarchy**.
`parseExpr()` should:
* Look at the next token
* Dispatch to the correct parse function based on:
* keyword (e.g. `let`, `fn`, `match`, `apply`)
* symbol (e.g. `$`, `#`, `(`, `{`)
* identifier (e.g. top-level function call)
Order matters.
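The dispatch can be sketched as one decision over the next token (a sketch; the returned labels are illustrative names for the parse functions):

```ts
// Sketch: pick the parse function from the first token of an expression.
type Tok = { kind: string; value?: string };

function dispatchExpr(t: Tok): string {
  if (t.kind === "number" || t.kind === "string") return "literal";
  if (t.kind === "keyword") {
    if (t.value === "let") return "let-expr";
    if (t.value === "fn") return "lambda-abstraction";
    if (t.value === "match") return "match-expr";
    if (t.value === "apply") return "lambda-application";
  }
  if (t.kind === "symbol") {
    if (t.value === "$") return "var-use";
    if (t.value === "#") return "tag-or-tagged";
    if (t.value === "(") return "tuple-expr";
    if (t.value === "{") return "record-expr";
  }
  if (t.kind === "identifier") return "top-fn-call";
  throw new Error(`unexpected token: ${t.kind}`);
}
```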
---
### Important parsing rules
#### Variable use
```txt
$x
```
* `$` immediately followed by identifier
* No whitespace allowed
#### Tag expressions
```txt
#foo
#foo expr
```
Parsing rule:
* After `#tag`, look at the next token
* If the next token can start an expression **and is not a terminator** (`)`, `}`, `,`, `|`, `.`):
* Parse a `tagged-expr`
* Otherwise:
* Parse a `tag-expr`
This rule is intentional and should be implemented directly.
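The decision can be sketched directly (a sketch; the token shape is simplified, and `canStartExpr` mirrors the dispatch list above):

```ts
// Sketch: after lexing `#tag`, inspect the next token.
type Tok = { kind: string; value?: string };

const TERMINATORS = new Set([")", "}", ",", "|", "."]);

function canStartExpr(t: Tok): boolean {
  if (t.kind === "number" || t.kind === "string" || t.kind === "identifier") return true;
  if (t.kind === "keyword") return ["let", "fn", "match", "apply"].includes(t.value ?? "");
  if (t.kind === "symbol") return ["$", "#", "(", "{"].includes(t.value ?? "");
  return false; // eof and everything else
}

// true => parse a tagged-expr, false => this was a bare tag-expr
function isTaggedExpr(next: Tok): boolean {
  return canStartExpr(next) && !TERMINATORS.has(next.value ?? "");
}
```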
---
#### Tuples vs grouping
Parentheses always construct **tuples**.
```txt
()
(123)
(1, 2, 3)
```
Parentheses are **not** used for grouping expressions. So `(123)` is NOT the same as `123`.
---
#### Lists with separators
Many constructs use:
```txt
list-sep-by(p, sep)
```
This allows:
* Empty lists
* Optional leading separator
* Optional trailing separator
Implement a reusable helper that:
* Stops at a known terminator token
* Does not allow repeated separators without elements
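A sketch of such a helper over a simplified token array (`parseItem`, `sep`, and `end` are illustrative parameters; the real version would work over `Token`s):

```ts
// Sketch: list-sep-by with optional leading/trailing separator,
// stopping at a known terminator and rejecting doubled separators.
function listSepBy<T>(
  toks: string[],
  pos: { i: number },
  parseItem: (toks: string[], pos: { i: number }) => T,
  sep: string,
  end: string,
): T[] {
  const items: T[] = [];
  if (toks[pos.i] === sep) {               // optional leading separator
    pos.i++;
    if (toks[pos.i] === end) throw new Error("separator without element");
  }
  while (toks[pos.i] !== end) {
    items.push(parseItem(toks, pos));
    if (toks[pos.i] === sep) {
      pos.i++;
      if (toks[pos.i] === sep) throw new Error("repeated separator");
    } else if (toks[pos.i] !== end) {
      throw new Error(`expected '${sep}' or '${end}'`);
    }
  }
  return items;                            // caller consumes `end`
}
```

With this shape, lists with leading and trailing separators parse fine, while a separator with no elements or a doubled separator is rejected.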
---
### Parsing patterns
Patterns are parsed only in specific contexts:
* `match` branches
* `let` bindings
* lambda parameters
There are **two distinct pattern parsers**:
* `parsePattern()` — full patterns (including tags)
* `parseProductPattern()` — no tags allowed
These should be separate functions.
---
### AST construction
Parser functions should construct AST nodes directly, matching the existing AST types exactly.
If necessary, spans may be:
* Stored directly on AST nodes, or
* Discarded after parsing
Either is acceptable.
---
## Division of responsibility
**Lexer**:
* Characters → tokens
* Unicode-safe
* Tracks positions
**Parser**:
* Tokens → AST
* Grammar enforcement
* Context-sensitive decisions
* Error reporting
Do **not** merge these stages.
---
## Final notes
* Favor clarity over cleverness
* Favor explicit structure over abstraction
* Assume the grammar in `SYNTAX.md` is authoritative
* It is acceptable to tweak helper types or utilities if needed
Correct parsing is the goal. Performance and elegance are not.

src/SYNTAX.md (new file, 248 lines)

@@ -0,0 +1,248 @@
The syntax is designed to be easily parsable by a recursive-descent parser (or parser combinators), not necessarily for intuitive human use.
It is inspired by syntax like LISP, but instead of `(f a b c)` we'd write `f(a, b, c)`.
Note that we allow fairly permissive identifiers like
```
foo-bar
_asd123
foo☺
☺️
<
<=
==
+
*
-
empty?
bang-you-dead!
```
The following are not valid identifiers
```
123foo // starts with a digit
#foo // special symbol #
$bar // special symbol $
b@ar // special symbol @
b(ar // special symbol (
= // `=` is special only when it stands alone, so for example `==` is ok, and even `foo=` is ok.
```
Special symbols can't ever occur in any sort of way in an identifier. The following are special symbols
```
#
$
@
.
,
|
(
)
{
}
```
Then there are keywords, which may occur as substrings of identifiers, but an identifier can't be exactly a keyword. (TODO: not sure whether `|` could also be allowed as a keyword.)
```
let
fn
match
=
```
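The identifier rules above can be sketched as a predicate (illustrative only; the real lexer scans a chunk rather than validating whole strings):

```ts
// Sketch: is `s` a valid identifier per the rules above?
const SPECIAL = new Set(["#", "$", "@", ".", ",", "|", "(", ")", "{", "}"]);
const KEYWORDS = new Set(["let", "fn", "match", "="]);

function isIdentifier(s: string): boolean {
  if (s.length === 0 || /^[0-9]/.test(s)) return false; // can't start with a digit
  if (KEYWORDS.has(s)) return false;                    // a bare keyword is not an identifier
  for (const ch of s) {
    if (SPECIAL.has(ch) || /\s/.test(ch)) return false; // special symbols never occur inside
  }
  return true;
}
```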
Examples of expressions in the language (except the top-level function definitions which are here just for illustration)
```
// numbers
123
12312.154
// strings
"hello, world!"
"unicode◊yes☺"
// variable use
$x
// top-level function call
square(3)
pyth(3, 4)
+(16, 17) // even primitive operations are just function calls
+(*($x, $x), *($y, $y)) // x^2 + y^2
// top-level function definitions
fn square(x) { *($x, $x) }
fn pyth(x, y) { +(square($x), square($y)) }
// let-binding
let { foo = 123 . $foo }
let { x = 123, y = +($x, 1) . *($y, $y) } // i.e. y = (x + 1)^2
let { . 123 } // degenerate case
let {
, x = 123 // note the position of comma at the start (it is optional there)
, y = +($x, 1)
, z = *($x, $y)
. $z
}
// tuples
(1, 2, 3)
()
(123) // this is not the same as 123. Parentheses used like this are not grouping, so a 1-tuple is not identical to its element (in LISP `(f)` is likewise very different from `f`)
((123, 56), (78, 123))
(
"hello",
" ",
"world", // note the comma at the end (also optional)
)
( , "hello" , " ", "world" , ) // this is also technically valid syntax - the commas at the start and end are optional.
// records
{ x = 123, y = 512 }
{} // different from (). Perhaps in the future I'll make them equivalent or... disallow one of them. But right now these are different.
{ name = "Conan", position = { x = 5, y = 6 } }
// tags (zero-ary constructors)
#true
#false
#nil
#empty
#unit
#none
#whatever-you-want
// tagged values (unary constructors)
#just 123
#cons (10, #cons (20, #nil))
// match-statement and patterns
fn len(xs) {
match $xs {
| #nil . 0
| #cons (_x, xs) . +(1, len($xs))
}
}
fn is-some?(xs) {
match $xs {
| #some _ . #true
| #none . #false
}
}
// patterns
x
(x, y, z)
{ foo , bar }
{ foo = x, bar } // equivalent to { foo = x, bar = bar }
{ foo = _, bar = (x, y, z) }
// lambdas/anonymous-functions
fn { x . $x } // identity function
fn { x, y . +(*($x, $x), *($y, $y)) } // (x, y) => x^2 + y^2
// Actually the following is a valid lambda abstraction, since the product-patterns are allowed as the input parameters
fn { (x, (y, z)), { foo, bar = u } . +($x, $y, $z, $u) }
// application of lambdas
apply(e !)
apply(e ! e0)
apply(e ! e0, e1)
apply(e ! e0, e1, e2)
apply(apply(e ! e0) ! e1, e2)
apply(e ! apply(f ! e0, e1), e2)
```
More formal grammar
```
number := ...
string := ...
identifier := ...
variable-identifier := identifier
fn-identifier := identifier
field-identifier := identifier
tag-identifier := identifier
expr :=
| var-use
| let-expr
| top-fn-call
| tuple-expr
| record-expr
| tag-expr
| tagged-expr
| match-expr
| lambda-abstraction
| lambda-application
var-use := `$`identifier // note that here we don't allow a space between $ and the identifier
let-expr := `let` `{` list-sep-by(let-binding, `,`) `.` expr `}`
let-binding := product-pattern `=` expr
top-fn-call := identifier`(` args `)`
tuple-expr := `(` args `)`
record-expr := `{` list-sep-by(field, `,`) `}`
field := variable-identifier `=` expr
tag-expr := `#`tag-identifier // note how we don't allow a space between # and the identifier
tagged-expr := `#`tag-identifier expr
match-expr := `match` expr `{` list-sep-by(match-branch, `|`) `}`
match-branch := pattern `.` expr
lambda-abstraction := `fn` `{` params `.` expr `}`
lambda-application := `apply` `(` expr `!` args `)`
args := list-sep-by(expr, `,`)
params := list-sep-by(product-pattern, `,`)
// patterns
pattern :=
| `#`tag-identifier // note we don't allow a space between # and the identifier
| `#`tag-identifier pattern // tagged value pattern
| product-pattern
product-pattern :=
| variable-identifier // match-all pattern
| `(` list-sep-by(product-pattern, `,`) `)`
| `{` list-sep-by(field-pattern, `,`) `}`
field-pattern :=
| field-identifier
| field-identifier `=` product-pattern
// helpers
strict-list-sep-by<p, sep> :=
| p
| p sep p
| p sep p sep p
| ...
list-sep-by<p, sep> :=
| ''
| optional(sep) strict-list-sep-by(p, sep) optional(sep)
optional<p> :=
| ''
| p
```
Also:
A tagged expression is parsed when a #tag is followed by a token that can start an expression and is not a terminator.
So for example when parsing `#foo 123`, we parse the tag, the remainder is `123`, and we see that the next token is not a special symbol like `,` or `}` or `)` etc,
so we conclude that this has to be a tagged value we're parsing, so the next thing is an expression.
But for a case like `#foo, ...`, we see that the next token is `,`, so we must have just parsed a tag - not a tagged value.
TODO:
We don't yet have any convenient syntax for creating new records from old ones (like an immutable update syntax).


@@ -1,15 +1,15 @@
 // === Identifiers ===
-type VariableName = string
-type FunctionName = string
+export type VariableName = string
+export type FunctionName = string
 // type CellName = string
-type Tag = string
-type FieldName = string
+export type Tag = string
+export type FieldName = string
 // === Program ===
-type Timestamp = number;
+export type Timestamp = number;
-type Program = {
+export type Program = {
 function_definitions: Map<FunctionName, FunctionDefinition>,
 function_definition_order: FunctionName[],
 // TODO: Perhaps include the story and the environment?
@@ -34,11 +34,11 @@ type Program = {
 // | "dirty"
 // | "error"
-type FunctionDefinition =
+export type FunctionDefinition =
 | { tag: "user", def: UserFunctionDefinition }
 | { tag: "primitive", def: PrimitiveFunctionDefinition }
-type UserFunctionDefinition = {
+export type UserFunctionDefinition = {
 // Raw user input (authoritative)
 name: FunctionName,
 raw_parameters: string;
@@ -54,7 +54,7 @@ type UserFunctionDefinition = {
 last_modified_at: Timestamp;
 }
-type PrimitiveFunctionDefinition = {
+export type PrimitiveFunctionDefinition = {
 name: FunctionName,
 implementation: (args: Value[]) => Value,
 }
@@ -136,7 +136,7 @@ export namespace Program {
 // === Expressions ===
-type Expr =
+export type Expr =
 | { tag: "literal", literal: Literal }
 | { tag: "var_use", name: VariableName }
 // | { tag: "cell_ref", name: CellName }
@@ -150,21 +150,21 @@ type Expr =
 | { tag: "lambda", parameters: ProductPattern[], body: Expr }
 | { tag: "apply", callee: Expr, args: Expr[] }
-type Literal =
+export type Literal =
 | { tag: "number", value: number }
 | { tag: "string", value: string }
-type ExprBinding = {
+export type ExprBinding = {
 var: ProductPattern,
 expr: Expr,
 }
-type ProductPattern =
+export type ProductPattern =
 | { tag: "any", name: VariableName }
 | { tag: "tuple", patterns: ProductPattern[] }
 | { tag: "record", fields: { field_name: FieldName, pattern: ProductPattern }[] }
-type Pattern =
+export type Pattern =
 | ProductPattern
 | { tag: "tag", tag_name: Tag }
 | { tag: "tagged", tag_name: Tag, pattern: Pattern }