diff --git a/src/PARSER-PLAN.md b/src/PARSER-PLAN.md new file mode 100644 index 0000000..b7dda85 --- /dev/null +++ b/src/PARSER-PLAN.md @@ -0,0 +1,336 @@ +## Goal + +Implement a correct parser for the language described in `SYNTAX.md`, producing the existing AST types (`Expr`, `Pattern`, `ProductPattern`, etc.). + +Code quality is **not** the primary concern. +Correctness, clarity, and reasonable error messages are. + +--- + +## Overall architecture + +The parser is split into **two stages**: + +1. **Lexing (tokenization)** + Converts source text into a stream of tokens, each with precise source location info. +2. **Parsing** + Consumes the token stream and constructs the AST using recursive-descent parsing. + +This split is deliberate and should be preserved. + +--- + +## Stage 1: Lexer (Tokenizer) + +### Purpose + +The lexer exists to: + +* Normalize the input into a small set of token types +* Track **line / column / offset** precisely +* Make parsing simpler and more reliable +* Enable good error messages later + +The lexer is intentionally **simple and dumb**: + +* No semantic decisions +* No AST construction +* Minimal lookahead + +--- + +### Unicode handling + +The input may contain arbitrary Unicode (including emoji) inside identifiers and strings. + +**Important rule**: + +* Iterate over Unicode *code points*, not UTF-16 code units. + +In TypeScript: + +* Use `for (const ch of input)` or equivalent +* Do **not** index into strings with `input[i]` + +Column counting: + +* Increment column by **1 per code point** +* Exact visual width is not required + +--- + +### Source positions and spans + +All tokens must carry precise location information. + +Suggested types (can be adjusted): + +```ts +type Position = { + offset: number; // code-point index from start of input + line: number; // 1-based + column: number; // 1-based +}; + +type Span = { + start: Position; + end: Position; +}; +``` + +Each token has a `span`. 
+ +--- + +### Token types + +Suggested minimal token set: + +```ts +type Token = + | { kind: "number"; value: number; span: Span } + | { kind: "string"; value: string; span: Span } + | { kind: "identifier"; value: string; span: Span } + | { kind: "keyword"; value: Keyword; span: Span } + | { kind: "symbol"; value: Symbol; span: Span } + | { kind: "eof"; span: Span }; +``` + +Where: + +```ts +type Keyword = "let" | "fn" | "match" | "apply" | "=" | "!" | "|"; +type Symbol = "#" | "$" | "(" | ")" | "{" | "}" | "," | "."; +``` + +Notes: + +* Operators like `+`, `==`, `<=`, `*` are **identifiers** +* `=` is treated as a keyword (same for `|`) +* Identifiers are parsed first, then checked against keywords + +--- + +### Lexer responsibilities + +The lexer should: + +* Skip whitespace (spaces, tabs, newlines) +* Track line and column numbers +* Emit tokens with correct spans +* Fail immediately on: + + * Unterminated string literals + * Invalid characters + +The lexer **should not**: + +* Attempt error recovery +* Guess intent +* Validate grammar rules + +--- + +## Stage 2: Parser + +### Parsing strategy + +Use **recursive-descent parsing**. + +The grammar is: + +* Context-free +* Non-left-recursive +* No precedence rules +* No implicit associativity + +This makes recursive descent ideal. + +--- + +### Parser state + +The parser operates over: + +```ts +class Parser { + tokens: Token[]; + pos: number; +} +``` + +Helper methods are encouraged: + +```ts +peek(): Token +advance(): Token +matchKeyword(kw: Keyword): boolean +matchSymbol(sym: Symbol): boolean +expectKeyword(kw: Keyword): Token +expectSymbol(sym: Symbol): Token +error(message: string, span?: Span): never +``` + +--- + +### Error handling + +Error recovery is **not** required. 
+ +On error: + +* Throw a `ParseError` +* Include: + + * A clear message + * A span pointing to the offending token (or best approximation) + +The goal is: + +* One good error +* Accurate location +* No cascading failures + +--- + +### Expression parsing + +There is **no precedence hierarchy**. + +`parseExpr()` should: + +* Look at the next token +* Dispatch to the correct parse function based on: + + * keyword (e.g. `let`, `fn`, `match`, `apply`) + * symbol (e.g. `$`, `#`, `(`, `{`) + * identifier (e.g. top-level function call) + +Order matters. + +--- + +### Important parsing rules + +#### Variable use + +```txt +$x +``` + +* `$` immediately followed by identifier +* No whitespace allowed + +#### Tag expressions + +```txt +#foo +#foo expr +``` + +Parsing rule: + +* After `#tag`, look at the next token +* If the next token can start an expression **and is not a terminator** (`)`, `}`, `,`, `|`, `.`): + + * Parse a `tagged-expr` +* Otherwise: + + * Parse a `tag-expr` + +This rule is intentional and should be implemented directly. + +--- + +#### Tuples vs grouping + +Parentheses always construct **tuples**. + +```txt +() +(123) +(1, 2, 3) +``` + +Parentheses are **not** used for grouping expressions. So `(123)` is NOT the same as `123`. + +--- + +#### Lists with separators + +Many constructs use: + +```txt +list-sep-by(p, sep) +``` + +This allows: + +* Empty lists +* Optional leading separator +* Optional trailing separator + +Implement a reusable helper that: + +* Stops at a known terminator token +* Does not allow repeated separators without elements + +--- + +### Parsing patterns + +Patterns are parsed only in specific contexts: + +* `match` branches +* `let` bindings +* lambda parameters + +There are **two distinct pattern parsers**: + +* `parsePattern()` — full patterns (including tags) +* `parseProductPattern()` — no tags allowed + +These should be separate functions. 
+ +--- + +### AST construction + +Parser functions should construct AST nodes directly, matching the existing AST types exactly. + +If necessary, spans may be: + +* Stored directly on AST nodes, or +* Discarded after parsing + +Either is acceptable. + +--- + +## Division of responsibility + +**Lexer**: + +* Characters → tokens +* Unicode-safe +* Tracks positions + +**Parser**: + +* Tokens → AST +* Grammar enforcement +* Context-sensitive decisions +* Error reporting + +Do **not** merge these stages. + +--- + +## Final notes + +* Favor clarity over cleverness +* Favor explicit structure over abstraction +* Assume the grammar in `SYNTAX.md` is authoritative +* It is acceptable to tweak helper types or utilities if needed + +Correct parsing is the goal. Performance and elegance are not. diff --git a/src/SYNTAX.md b/src/SYNTAX.md new file mode 100644 index 0000000..4ae6747 --- /dev/null +++ b/src/SYNTAX.md @@ -0,0 +1,248 @@ + +Syntax is designed to be easily parsable by a recursive-descent/parser-combinators - not necessarily designed for intuitive human use. +It is inspired by syntax like LISP, but instead of `(f a b c)` we'd write `f(a, b, c)`. +Note that we allow fairly permissive identifiers like +``` +foo-bar +_asd123 +foo☺️ +☺️ +< +<= +== ++ +* +- +empty? +bang-you-dead! +``` +The following are not valid identifiers +``` +123foo // starts with a digit +#foo // special symbol # +$bar // special symbol $ +b@ar // special symbol @ +b(ar // special symbol ( += // only a special symbol by itself so for example `==` is ok, or even `foo=` is ok. +``` +Special symbols can't ever occur in any sort of way in an identifier. The following are special symbols +``` +# +$ +@ +. 
+, +| +( +) +{ +} +``` + +Then there are keywords, which can occur as substrings, but can't just be the keywords (TODO: Not sure if perhaps we could allow `|` as a keyword) +``` +let +fn +match += +``` + + +Examples of expressions in the language (except the top-level function definitions which are here just for illustration) +``` +// numbers +123 +12312.154 + +// strings +"hello, world!" + +"unicode◊yes☺️" + +// variable use +$x + +// top-level function call +square(3) +pyth(3, 4) ++(16, 17) // even primitive operations are just function calls ++(*($x, $x), *($y, $y)) // x^2 + y^2 + +// top-level function definitions +fn square(x) { *($x, $x) } +fn pyth(x, y) { +(square($x), square($y)) } + +// let-binding +let { foo = 123 . $foo } +let { x = 123, y = +($x, 1) . *($y, $y) } // i.e. y = (x + 1)^2 +let { . 123 } // degenerate case + +let { +, x = 123 // note the position of comma at the start (it is optional there) +, y = +($x, 1) +, z = *($x, $y) +. $z +} + +// tuples +(1, 2, 3) +() +(123) // this is not the same as 123. Parentheses used like this don't introduce redundancy. 1-tuple technically doesn't act as identity (in LISP `(f)` is also very different from `f`) +((123, 56), (78, 123)) + +( + "hello", + " ", + "world", // note the comma at the end (also optional) +) + +( , "hello" , " ", "world" , ) // this is also technically valid syntax - the commas at the start and end are optional. + +// records +{ x = 123, y = 512 } +{} // different from (). Perhaps in the future I'll make them equivalent or... disallow one of them. But right now these are different. +{ name = "Conan", position = { x = 5, y = 6 } } + +// tags (zero-ary constructors) +#true +#false +#nil +#empty +#unit +#none +#whatever-you-want + +// tagged values (unary constructors) +#just 123 +#cons (10, #cons (20, #nil)) + +// match-statement and patterns +fn len(xs) { + match $xs { + | #nil . 0 + | #cons (_x, xs) . +(1, len($xs)) + } +} + +fn is-some?(xs) { + match $xs { + | #some _ . #true + | #none . 
#false + } +} + +// patterns +x +(x, y, z) +{ foo , bar } +{ foo = x, bar } // equivalent to { foo = x, bar = bar } +{ foo = _, bar = (x, y, z) } + +// lambdas/anonymous-functions +fn { x . $x } // identity function +fn { x, y . +(*($x, $x), *($y, $y)) } // (x, y) => x^2 + y^2 + +// Actually the following is a valid lambda abstraction, since the product-patterns are allowed as the input parameters +fn { (x, (y, z)), { foo, bar = u } . +($x, $y, $z, $u) } + +// application of lambdas +apply(e !) +apply(e ! e0) +apply(e ! e0, e1) +apply(e ! e0, e1, e2) + +apply(apply(e ! e0) ! e1, e2) +apply(e ! apply(f ! e0, e1), e2) +``` + + +More formal grammar +``` +number := ... +string := ... + +identifier := ... + +variable-identifier := identifier +fn-identifier := identifier +field-identifier := identifier +tag-identifier := identifier + +expr := + | var-use + | let-expr + | top-fn-call + | tuple-expr + | record-expr + | tag-expr + | tagged-expr + | match-expr + | lambda-abstraction + | lambda-application + +var-use := `$`identifier // note that here we don't allow a space between $ and the identifier + +let-expr := `let` `{` list-sep-by(let-binding, `,`) `.` expr `}` +let-binding := product-pattern `=` expr + +top-fn-call := identifier`(` args `)` + +tuple-expr := `(` args `)` + +record-expr := `{` list-sep-by(field, `,`) `}` +field := variable-identifier `=` expr + +tag-expr := `#`tag-identifier // note how we don't allow a space between # and the identifier +tagged-expr := `#`tag-identifier expr + +match-expr := `match` expr `{` list-sep-by(match-branch, '|') `}` +match-branch := pattern `.` expr + +lambda-abstraction := `fn` `{` params `.` expr `}` +lambda-application := `apply` `(` expr `!` args `)` + +args := list-sep-by(expr, `,`) +params := list-sep-by(product-pattern, `,`) + +// patterns +pattern := + | `#`tag-identifier // note we don't allow a space between # and the identifier + | `#`tag-identifier pattern // tagged value pattern + | product-pattern + +product-pattern 
:= + | variable-identifier // match-all pattern + | `(` list-sep-by(product-pattern, `,`) `)` + | `{` list-sep-by(field-pattern, `,`) `}` + +field-pattern := + | field-identifier + | field-identifier `=` product-pattern + + +// helpers +strict-list-sep-by := + | p + | p sep p + | p sep p sep p + | ... + +list-sep-by := + | '' + | optional(sep) strict-list-sep-by(p, sep) optional(sep) + +optional

:= + | '' + | p +``` + +Also: +A tagged expression is parsed when a #tag is followed by a token that can start an expression and is not a terminator. +So for example when parsing `#foo 123`, we parse the tag, the remainder is `123`, and we see that the next token is not a special symbol like `,` or `}` or `)` etc, +so we conclude that this has to be a tagged value we're parsing, so the next thing is an expression. +But for a case like `#foo, ...`, we see that the next token is `,`, so we must have just parsed a tag - not a tagged value. + + + +TODO: +We don't yet have any convinient syntax for creating new records from old (like an immutable update syntax). diff --git a/src/value.ts b/src/value.ts index b7917ee..eb10a40 100644 --- a/src/value.ts +++ b/src/value.ts @@ -1,15 +1,15 @@ // === Identifiers === -type VariableName = string -type FunctionName = string +export type VariableName = string +export type FunctionName = string // type CellName = string -type Tag = string -type FieldName = string +export type Tag = string +export type FieldName = string // === Program === -type Timestamp = number; +export type Timestamp = number; -type Program = { +export type Program = { function_definitions: Map, function_definition_order: FunctionName[], // TODO: Perhaps include the story and the environment? 
@@ -34,11 +34,11 @@ type Program = { // | "dirty" // | "error" -type FunctionDefinition = +export type FunctionDefinition = | { tag: "user", def: UserFunctionDefinition } | { tag: "primitive", def: PrimitiveFunctionDefinition } -type UserFunctionDefinition = { +export type UserFunctionDefinition = { // Raw user input (authoritative) name: FunctionName, raw_parameters: string; @@ -54,7 +54,7 @@ type UserFunctionDefinition = { last_modified_at: Timestamp; } -type PrimitiveFunctionDefinition = { +export type PrimitiveFunctionDefinition = { name: FunctionName, implementation: (args: Value[]) => Value, } @@ -136,7 +136,7 @@ export namespace Program { // === Expressions === -type Expr = +export type Expr = | { tag: "literal", literal: Literal } | { tag: "var_use", name: VariableName } // | { tag: "cell_ref", name: CellName } @@ -150,21 +150,21 @@ type Expr = | { tag: "lambda", parameters: ProductPattern[], body: Expr } | { tag: "apply", callee: Expr, args: Expr[] } -type Literal = +export type Literal = | { tag: "number", value: number } | { tag: "string", value: string } -type ExprBinding = { +export type ExprBinding = { var: ProductPattern, expr: Expr, } -type ProductPattern = +export type ProductPattern = | { tag: "any", name: VariableName } | { tag: "tuple", patterns: ProductPattern[] } | { tag: "record", fields: { field_name: FieldName, pattern: ProductPattern }[] } -type Pattern = +export type Pattern = | ProductPattern | { tag: "tag", tag_name: Tag } | { tag: "tagged", tag_name: Tag, pattern: Pattern }