Finish parser
This commit is contained in:
parent
d553a05f45
commit
13a66f2d16
6 changed files with 772 additions and 58 deletions
|
|
@ -6,7 +6,7 @@ import type { Cursor, CursorState, GenericScanError, NumberError, StringError }
|
|||
import { Result } from '../result';
|
||||
import { Expr } from 'src/value';
|
||||
|
||||
function skipWhitespaceAndComments(cursor: Cursor): number {
|
||||
export function skipWhitespaceAndComments(cursor: Cursor): number {
|
||||
let totalConsumed = 0;
|
||||
|
||||
while (true) {
|
||||
|
|
@ -60,7 +60,13 @@ export type ExprScanError =
|
|||
| { tag: "InvalidIdentifier", text: string, kind: IdentifierKind, reason: IdentifierErrorReason, span: Span }
|
||||
|
||||
// What kind of identifier were we trying to parse?
|
||||
export type IdentifierKind = "identifier" | "variable_use" | "tag_construction" | "function_call";
|
||||
export type IdentifierKind =
|
||||
| "identifier"
|
||||
| "variable_use"
|
||||
| "tag_construction"
|
||||
| "function_call"
|
||||
| "pattern_binding";
|
||||
|
||||
export type IdentifierErrorReason =
|
||||
| { tag: "StartsWithDigit" }
|
||||
| { tag: "IsKeyword", kw: Keyword }
|
||||
|
|
@ -73,12 +79,20 @@ export type ExprStartToken =
|
|||
| { tag: "function_name", name: string, span: Span }
|
||||
| { tag: "variable_use", name: string, span: Span }
|
||||
| { tag: "tag", name: string, span: Span }
|
||||
| { tag: "tagged", name: string, span: Span } // TODO: This may be a bit weird. Actually we can lookahead and see if the next char after the identifier is NOT-AN-EXPRESSION start.
|
||||
| { tag: "tuple_start", span: Span }
|
||||
| { tag: "record_start", span: Span }
|
||||
| { tag: "keyword", kw: Keyword, span: Span }
|
||||
// TODO: get rid of EOF
|
||||
| { tag: "EOF", span: Span }
|
||||
|
||||
export type PatternStartToken =
|
||||
| { tag: "pattern_binding", name: string, span: Span }
|
||||
| { tag: "tag", name: string, span: Span }
|
||||
| { tag: "tuple_start", span: Span }
|
||||
| { tag: "record_start", span: Span }
|
||||
// TODO: get rid of EOF
|
||||
| { tag: "EOF", span: Span };
|
||||
|
||||
// === Identifier Scanners ===
|
||||
|
||||
// Returns the raw string.
|
||||
|
|
@ -146,12 +160,14 @@ export function identifier(cursor: Cursor, kind: IdentifierKind): { name: string
|
|||
|
||||
// === Literal Scanners ===
|
||||
/**
 * Scans a number literal at the cursor by delegating to `scanNumber`.
 *
 * @param cursor - The cursor handed to `scanNumber`.
 * @returns The value carried by the scan result when its tag is "ok".
 * @throws ExprScanError - the scan result's `error`, re-thrown as-is (cast to
 *   ExprScanError) whenever the result is not "ok".
 */
// TODO: handle trailing whitespace
function number(cursor: Cursor): { value: number, span: Span } {
  const scanned = scanNumber(cursor);

  // Guard clause: convert the Result-style failure into a thrown scan error.
  if (scanned.tag !== "ok") {
    throw (scanned.error as ExprScanError);
  }

  return scanned.value;
}
|
||||
|
||||
// throws ExprScanError
|
||||
// TODO: handle trailing whitespace
|
||||
function string(cursor: Cursor): { value: string, span: Span } {
|
||||
const res = scanString(cursor);
|
||||
if (res.tag === "ok") { return res.value; } else { throw (res.error as ExprScanError); }
|
||||
|
|
@ -159,10 +175,10 @@ function string(cursor: Cursor): { value: string, span: Span } {
|
|||
|
||||
// === complex scanners ===
|
||||
|
||||
// TODO: in exprStart/patternStart, make sure whitespace is consumed after they successfully produce a token. Should we build that into the functions? Or should that be in `parser.ts`?
|
||||
|
||||
// throws ExprScanError
|
||||
export function exprStart(cursor: Cursor): ExprStartToken {
|
||||
skipWhitespaceAndComments(cursor);
|
||||
|
||||
const start = cursor.currentLocation();
|
||||
if (cursor.eof()) {
|
||||
return { tag: "EOF", span: cursor.makeSpan(start) };
|
||||
|
|
@ -220,53 +236,105 @@ export function exprStart(cursor: Cursor): ExprStartToken {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Scans the token that begins a pattern at the cursor's current position.
 *
 * Dispatch is on the next character:
 *   - end of input     -> "EOF" (nothing is consumed)
 *   - `(`              -> "tuple_start"
 *   - `{`              -> "record_start"
 *   - `#`              -> "tag"; the name is scanned as a 'tag_construction' identifier
 *   - digit, `"`, `-`  -> rejected with an "UnexpectedCharacter" error
 *   - anything else    -> "pattern_binding"; scanned as a 'pattern_binding' identifier
 *
 * Every returned span starts at the location the cursor had on entry.
 *
 * NOTE(review): this function does not skip leading whitespace or comments
 * itself — confirm that callers do so before invoking it.
 *
 * @param cursor - The cursor to read from; it is advanced past the scanned token.
 * @returns The pattern-start token found at the cursor.
 * @throws ExprScanError - "UnexpectedCharacter" when the next character is a
 *   digit, `"` or `-`. Anything thrown by `identifier` propagates unchanged.
 *
 * Open design notes carried over from the original implementation:
 *
 * TODO: Need a Token to TokenKind function.
 * TODO: Need is_start_of_expression(token): boolean, with this table:
 *   identifier     -> true
 *   symbol #       -> true
 *   symbol $       -> true
 *   symbol @       -> true
 *   symbol (       -> true
 *   symbol {       -> true   (context dependent: sometimes it starts a binding
 *                             context `{ params . body }` / `{ let-params . body }`,
 *                             sometimes a record; the function would only be
 *                             used in the first context)
 *   symbol _       -> false
 *   number         -> true
 *   string         -> true
 *   keyword let    -> true
 *   keyword fn     -> true
 *   keyword apply  -> true
 *   keyword =      -> false
 *   keyword |      -> false
 *   EOF            -> false
 * TODO: function that matches a token with a token_type (returns bool).
 * TODO: need function is_forbidden_char; the forbidden characters are
 *   '('  ')'  '{'  '}'  '.'  ','  '|'  '$'  '#'  '@'  '"'  ' '  '\r'  '\t'  '\n'
 * TODO (scanner functions):
 *   whitespace      - consumes whitespace
 *   comment         - consumes token
 *   raw_identifier  - consumes raw identifier; afterwards we can decide whether
 *                     that was a keyword or an identifier
 *   string          - consumes string like "foo bar\njfjdsajfksd"
 *   number          - consumes number like 123123 or 000123 or 23919233.123
 *   token           - gives next token
 */
export function patternStart(cursor: Cursor): PatternStartToken {
  const origin = cursor.currentLocation();

  if (cursor.eof()) {
    return { tag: "EOF", span: cursor.makeSpan(origin) };
  }

  // Not at EOF (checked just above), hence the non-null assertion on peek().
  const first = cursor.peek()!;

  switch (first) {
    // === tuple ===
    case char('('): {
      cursor.next();
      return { tag: "tuple_start", span: cursor.makeSpan(origin) };
    }

    // === record ===
    case char('{'): {
      cursor.next();
      return { tag: "record_start", span: cursor.makeSpan(origin) };
    }

    // === tag ===
    case char('#'): {
      cursor.next();
      const tagName = identifier(cursor, 'tag_construction').name;
      return { tag: "tag", name: tagName, span: cursor.makeSpan(origin) };
    }
  }

  // TODO: This is more subtle... -foo is a valid pattern name... I think I should restrict: can't have identifiers start with `-`? But then `-` itself can't be an identifier, which is a bit sad.
  // TODO: This is gonna be different once we allow number/string literals as patterns.
  const startsLikeLiteral = isDigit(first) || first === char('"') || first === char('-');
  if (startsLikeLiteral) {
    throw {
      tag: "UnexpectedCharacter",
      char: first,
      span: cursor.makeSpan(origin)
    } as ExprScanError;
  }

  // === pattern binding ===
  const binding = identifier(cursor, 'pattern_binding');
  return { tag: "pattern_binding", name: binding.name, span: cursor.makeSpan(origin) };
}
|
||||
|
||||
/**
 * Non-consuming lookahead: reports whether the next token can begin an expression.
 *
 * The cursor state is saved on entry and restored in `finally`, so the cursor
 * is left where it was whether `exprStart` succeeds, returns a non-expression
 * token, or throws.
 *
 * @param cursor - The cursor to probe; its state is restored before returning.
 * @returns `true` for number, string, variable use, tag, tuple start, record
 *   start and function name tokens, and for the keywords `let`, `fn`, `match`
 *   and `apply`; `false` for every other keyword, for EOF, for any other
 *   token, and when `exprStart` throws.
 */
export function isNextTokenExprStart(cursor: Cursor): boolean {
  const state = cursor.save();
  try {
    const token = exprStart(cursor);

    switch (token.tag) {
      case "number":
      case "string":
      case "variable_use":
      case "tag":
      case "tuple_start":
      case "record_start":
      case "function_name": // e.g. my_func(x)
        return true;

      case "keyword":
        switch (token.kw) {
          case "let":
          case "fn":
          case "match":
          case "apply":
            return true;

          case "=":
          case "|":
          case "!":
            return false;

          // Explicit default: previously an unlisted keyword fell through
          // into the "EOF" case below and returned false only by accident.
          default:
            return false;
        }

      case "EOF":
      default:
        return false;
    }
  } catch (e) {
    // A scan error means there is no valid expression start here; this is a
    // pure lookahead, so the error is deliberately not propagated.
    return false;
  } finally {
    cursor.restore(state);
  }
}
|
||||
|
||||
/**
 * Non-consuming lookahead: reports whether the next token begins a product pattern.
 *
 * Runs `patternStart` on the cursor and then restores the saved cursor state
 * in `finally`, so the cursor is left where it was regardless of the outcome.
 *
 * @param cursor - The cursor to probe; its state is restored before returning.
 * @returns `true` when `patternStart` yields a "pattern_binding",
 *   "tuple_start" or "record_start" token; `false` for any other token
 *   (including "tag" and "EOF") and when `patternStart` throws.
 */
export function isNextTokenProductPatternStart(cursor: Cursor): boolean {
  const checkpoint = cursor.save();
  try {
    const { tag } = patternStart(cursor);
    return tag === "pattern_binding" || tag === "tuple_start" || tag === "record_start";
  } catch (e) {
    // Scan failure: whatever follows is not a product pattern start.
    return false;
  } finally {
    cursor.restore(checkpoint);
  }
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue