diff --git a/src/PARSER-PLAN.md b/src/PARSER-PLAN.md deleted file mode 100644 index b7dda85..0000000 --- a/src/PARSER-PLAN.md +++ /dev/null @@ -1,336 +0,0 @@ -## Goal - -Implement a correct parser for the language described in `SYNTAX.md`, producing the existing AST types (`Expr`, `Pattern`, `ProductPattern`, etc.). - -Code quality is **not** the primary concern. -Correctness, clarity, and reasonable error messages are. - ---- - -## Overall architecture - -The parser is split into **two stages**: - -1. **Lexing (tokenization)** - Converts source text into a stream of tokens, each with precise source location info. -2. **Parsing** - Consumes the token stream and constructs the AST using recursive-descent parsing. - -This split is deliberate and should be preserved. - ---- - -## Stage 1: Lexer (Tokenizer) - -### Purpose - -The lexer exists to: - -* Normalize the input into a small set of token types -* Track **line / column / offset** precisely -* Make parsing simpler and more reliable -* Enable good error messages later - -The lexer is intentionally **simple and dumb**: - -* No semantic decisions -* No AST construction -* Minimal lookahead - ---- - -### Unicode handling - -The input may contain arbitrary Unicode (including emoji) inside identifiers and strings. - -**Important rule**: - -* Iterate over Unicode *code points*, not UTF-16 code units. - -In TypeScript: - -* Use `for (const ch of input)` or equivalent -* Do **not** index into strings with `input[i]` - -Column counting: - -* Increment column by **1 per code point** -* Exact visual width is not required - ---- - -### Source positions and spans - -All tokens must carry precise location information. - -Suggested types (can be adjusted): - -```ts -type Position = { - offset: number; // code-point index from start of input - line: number; // 1-based - column: number; // 1-based -}; - -type Span = { - start: Position; - end: Position; -}; -``` - -Each token has a `span`. - ---- - -### Token types - -Suggested minimal token set: - -```ts -type Token = - | { kind: "number"; value: number; span: Span } - | { kind: "string"; value: string; span: Span } - | { kind: "identifier"; value: string; span: Span } - | { kind: "keyword"; value: Keyword; span: Span } - | { kind: "symbol"; value: Symbol; span: Span } - | { kind: "eof"; span: Span }; -``` - -Where: - -```ts -type Keyword = "let" | "fn" | "match" | "apply" | "=" | "!" | "|"; -type Symbol = "#" | "$" | "(" | ")" | "{" | "}" | "," | "."; -``` - -Notes: - -* Operators like `+`, `==`, `<=`, `*` are **identifiers** -* `=` is treated as a keyword (same ofr `|`) -* Identifiers are parsed first, then checked against keywords - ---- - -### Lexer responsibilities - -The lexer should: - -* Skip whitespace (spaces, tabs, newlines) -* Track line and column numbers -* Emit tokens with correct spans -* Fail immediately on: - - * Unterminated string literals - * Invalid characters - -The lexer **should not**: - -* Attempt error recovery -* Guess intent -* Validate grammar rules - ---- - -## Stage 2: Parser - -### Parsing strategy - -Use **recursive-descent parsing**. - -The grammar is: - -* Context-free -* Non-left-recursive -* No precedence rules -* No implicit associativity - -This makes recursive descent ideal. - ---- - -### Parser state - -The parser operates over: - -```ts -class Parser { - tokens: Token[]; - pos: number; -} -``` - -Helper methods are encouraged: - -```ts -peek(): Token -advance(): Token -matchKeyword(kw: Keyword): boolean -matchSymbol(sym: Symbol): boolean -expectKeyword(kw: Keyword): Token -expectSymbol(sym: Symbol): Token -error(message: string, span?: Span): never -``` - ---- - -### Error handling - -Error recovery is **not required**. - -On error: - -* Throw a `ParseError` -* Include: - - * A clear message - * A span pointing to the offending token (or best approximation) - -The goal is: - -* One good error -* Accurate location -* No cascading failures - ---- - -### Expression parsing - -There is **no precedence hierarchy**. - -`parseExpr()` should: - -* Look at the next token -* Dispatch to the correct parse function based on: - - * keyword (e.g. `let`, `fn`, `match`, `apply`) - * symbol (e.g. `$`, `#`, `(`, `{`) - * identifier (e.g. top-level function call) - -Order matters. - ---- - -### Important parsing rules - -#### Variable use - -```txt -$x -``` - -* `$` immediately followed by identifier -* No whitespace allowed - -#### Tag expressions - -```txt -#foo -#foo expr -``` - -Parsing rule: - -* After `#tag`, look at the next token -* If the next token can start an expression **and is not a terminator** (`)`, `}`, `,`, `|`, `.`): - - * Parse a `tagged-expr` -* Otherwise: - - * Parse a `tag-expr` - -This rule is intentional and should be implemented directly. - ---- - -#### Tuples vs grouping - -Parentheses always construct **tuples**. - -```txt -() -(123) -(1, 2, 3) -``` - -Parentheses are **not** used for grouping expressions. So `(123)` is NOT the same as `123`. - ---- - -#### Lists with separators - -Many constructs use: - -```txt -list-sep-by(p, sep) -``` - -This allows: - -* Empty lists -* Optional leading separator -* Optional trailing separator - -Implement a reusable helper that: - -* Stops at a known terminator token -* Does not allow repeated separators without elements - ---- - -### Parsing patterns - -Patterns are parsed only in specific contexts: - -* `match` branches -* `let` bindings -* lambda parameters - -There are **two distinct pattern parsers**: - -* `parsePattern()` — full patterns (including tags) -* `parseProductPattern()` — no tags allowed - -These should be separate functions. - ---- - -### AST construction - -Parser functions should construct AST nodes directly, matching the existing AST types exactly. - -If necessary, spans may be: - -* Stored directly on AST nodes, or -* Discarded after parsing - -Either is acceptable. - ---- - -## Division of responsibility - -**Lexer**: - -* Characters → tokens -* Unicode-safe -* Tracks positions - -**Parser**: - -* Tokens → AST -* Grammar enforcement -* Context-sensitive decisions -* Error reporting - -Do **not** merge these stages. - ---- - -## Final notes - -* Favor clarity over cleverness -* Favor explicit structure over abstraction -* Assume the grammar in `SYNTAX.md` is authoritative -* It is acceptable to tweak helper types or utilities if needed - -Correct parsing is the goal. Performance and elegance are not. diff --git a/src/SYNTAX.md b/src/lang/SYNTAX.md similarity index 100% rename from src/SYNTAX.md rename to src/lang/SYNTAX.md diff --git a/src/debug/expr_show.ts b/src/lang/debug/expr_show.ts similarity index 100% rename from src/debug/expr_show.ts rename to src/lang/debug/expr_show.ts diff --git a/src/debug/repl.ts b/src/lang/debug/repl.ts similarity index 100% rename from src/debug/repl.ts rename to src/lang/debug/repl.ts diff --git a/src/debug/value_show.ts b/src/lang/debug/value_show.ts similarity index 100% rename from src/debug/value_show.ts rename to src/lang/debug/value_show.ts diff --git a/src/parser/SCANNER.md b/src/lang/parser/SCANNER.md similarity index 100% rename from src/parser/SCANNER.md rename to src/lang/parser/SCANNER.md diff --git a/src/parser/cursor.test.ts b/src/lang/parser/cursor.test.ts similarity index 100% rename from src/parser/cursor.test.ts rename to src/lang/parser/cursor.test.ts diff --git a/src/parser/cursor.ts b/src/lang/parser/cursor.ts similarity index 100% rename from src/parser/cursor.ts rename to src/lang/parser/cursor.ts diff --git a/src/parser/parser.ts b/src/lang/parser/parser.ts similarity index 100% rename from src/parser/parser.ts rename to src/lang/parser/parser.ts diff --git a/src/parser/scanner.ts b/src/lang/parser/scanner.ts similarity index 96% rename from src/parser/scanner.ts rename to src/lang/parser/scanner.ts index 2a748bb..c1bd3c8 100644 --- a/src/parser/scanner.ts +++ b/src/lang/parser/scanner.ts @@ -1,10 +1,8 @@ -import { CARRIAGE_RETURN, char, NEW_LINE, SPACE, TAB } from './source_text'; -import type { SourceText, Span, SourceLocation, CodePoint, StringIndex, CodePointIndex } from './source_text'; +import { CARRIAGE_RETURN, char, NEW_LINE } from './source_text'; +import type { Span, CodePoint } from './source_text'; import { isDigit, isWhitespace, scanNumber, scanString } from './cursor'; -import type { Cursor, CursorState, GenericScanError, NumberError, StringError } from './cursor'; -import { Result } from '../result'; -import { Expr } from 'src/value'; +import type { Cursor, GenericScanError, NumberError, StringError } from './cursor'; export function skipWhitespaceAndComments(cursor: Cursor): number { let totalConsumed = 0; diff --git a/src/parser/source_text.ts b/src/lang/parser/source_text.ts similarity index 100% rename from src/parser/source_text.ts rename to src/lang/parser/source_text.ts diff --git a/src/result.ts b/src/lang/result.ts similarity index 100% rename from src/result.ts rename to src/lang/result.ts diff --git a/src/value.ts b/src/lang/value.ts similarity index 100% rename from src/value.ts rename to src/lang/value.ts diff --git a/tmp_repl/test.flux b/tmp_repl/test.flux index d1286fd..8a321e3 100644 --- a/tmp_repl/test.flux +++ b/tmp_repl/test.flux @@ -9,3 +9,5 @@ let { } } + + diff --git a/tmp_repl/tmp_repl.md b/tmp_repl/tmp_repl.md index f4d86eb..5bc61d2 100644 --- a/tmp_repl/tmp_repl.md +++ b/tmp_repl/tmp_repl.md @@ -15,7 +15,7 @@ npm install -D sass-embedded npx ts-node src/parser/cursor.test.ts -npx ts-node src/debug/repl.ts +npx ts-node src/lang/debug/repl.ts -npx ts-node src/debug/repl.ts tmp_repl/test.flux +npx ts-node src/lang/debug/repl.ts tmp_repl/test.flux