356 lines
9.9 KiB
TypeScript
356 lines
9.9 KiB
TypeScript
|
|
import { CARRIAGE_RETURN, char, NEW_LINE, SPACE, TAB } from './source_text';
|
|
import type { SourceText, Span, SourceLocation, CodePoint, StringIndex, CodePointIndex } from './source_text';
|
|
import { isDigit, isWhitespace, scanNumber, scanString } from './cursor';
|
|
import type { Cursor, CursorState, GenericScanError, NumberError, StringError } from './cursor';
|
|
import { Result } from '../result';
|
|
import { Expr } from 'src/value';
|
|
|
|
/**
 * Advances `cursor` past any mix of whitespace and `//` line comments.
 *
 * @param cursor - Cursor to advance in place.
 * @returns The sum of the counts reported by `cursor.consumeWhile`, plus 2 for
 *          every `//` marker that was consumed.
 */
export function skipWhitespaceAndComments(cursor: Cursor): number {
  const slash = char('/');
  let consumed = 0;

  for (;;) {
    // Plain whitespace first (whatever `isWhitespace` accepts).
    const blanks = cursor.consumeWhile(isWhitespace);
    consumed += blanks;

    const first = cursor.peek();
    const second = cursor.peek(1);
    const atLineComment = first === slash && second === slash;

    if (!atLineComment) {
      // Neither whitespace nor a comment made progress: nothing left to skip.
      if (blanks === 0) {
        return consumed;
      }
      continue;
    }

    // Consume the two '/' markers.
    cursor.next();
    cursor.next();
    consumed += 2;

    // Consume the comment body up to, but not including, the line terminator.
    // The terminator is picked up as ordinary whitespace on the next pass.
    consumed += cursor.consumeWhile(cp => !(cp === NEW_LINE || cp === CARRIAGE_RETURN));
  }
}
|
|
|
|
// === Language Specific Stuff ===
|
|
/** Characters that end a raw identifier (see `rawIdentifier`). */
const DELIMITER_CHARS = [
  "(", ")",
  "{", "}",
  ".", ",",
  "@", "$", "#",
  '"', "\\",
] as const;

/** Union of the delimiter characters, as string literal types. */
export type Delimiter = (typeof DELIMITER_CHARS)[number];

/** The delimiters converted with `char`, for membership tests on peeked values. */
const DELIMITER_SET = new Set<CodePoint>(DELIMITER_CHARS.map(delimiter => char(delimiter)));
|
|
|
|
/** Words and symbols the scanner reports as keywords rather than identifiers. */
const KEYWORD_LIST = [
  "let",
  "fn",
  "match",
  "apply",
  "=",
  "|",
  "!",
  ":",
] as const;

/** Union of the keyword spellings, as string literal types. */
export type Keyword = (typeof KEYWORD_LIST)[number];

/** Keyword spellings as a set of plain strings, so arbitrary scanned text can be looked up. */
const KEYWORD_SET = new Set<string>(KEYWORD_LIST);
|
|
|
|
// === Errors ===
|
|
|
|
/**
 * Error values thrown by the scanners in this module.
 *
 * These are plain tagged objects (not `Error` instances); the last member is
 * the identifier-validation error built by the identifier scanners.
 */
export type ExprScanError =
  | GenericScanError
  | NumberError
  | StringError
  | {
      tag: "InvalidIdentifier";
      /** The raw text that was scanned. */
      text: string;
      /** What the caller was trying to scan. */
      kind: IdentifierKind;
      /** Why the text was rejected. */
      reason: IdentifierErrorReason;
      span: Span;
    };
|
|
|
|
// What kind of identifier were we trying to parse?
|
|
/**
 * The syntactic role an identifier was being scanned for.
 *
 * The value is passed to the identifier scanners and copied into
 * `InvalidIdentifier` errors.
 */
export type IdentifierKind =
  | "variable_use"
  | "field_name"
  | "tag_construction"
  | "function_call"
  | "pattern_binding";
|
|
|
|
/** Why a scanned piece of text was rejected as an identifier. */
export type IdentifierErrorReason =
  /** Nothing was scanned before a delimiter, whitespace, comment, or end of input. */
  | { tag: "Empty" }
  /** The first character is a digit. */
  | { tag: "StartsWithDigit" }
  /** The text is one of the keywords, where a plain identifier was required. */
  | { tag: "IsKeyword"; kw: Keyword };
|
|
|
|
// === Tokens ===
|
|
|
|
/** Token produced by `exprStart`: the first token of an expression. */
export type ExprStartToken =
  | { tag: "number"; value: number; span: Span }
  | { tag: "string"; text: string; span: Span }
  | { tag: "function_name"; name: string; span: Span }
  | { tag: "variable_use"; name: string; span: Span }
  | { tag: "tag"; name: string; span: Span }
  | { tag: "tuple_start"; span: Span }
  | { tag: "keyword"; kw: Keyword; span: Span }
  // TODO: get rid of EOF
  | { tag: "EOF"; span: Span };
|
|
|
|
/** Token produced by `patternStart`: the first token of a pattern. */
export type PatternStartToken =
  | { tag: "pattern_binding"; name: string; span: Span }
  | { tag: "tag"; name: string; span: Span }
  | { tag: "tuple_start"; span: Span }
  | { tag: "keyword"; kw: Keyword; span: Span }
  // TODO: get rid of EOF
  | { tag: "EOF"; span: Span };
|
|
|
|
// === Identifier Scanners ===
|
|
|
|
// Returns the raw string.
|
|
/**
 * Consumes characters up to the next delimiter, whitespace, `//` comment start,
 * or end of input, and returns the consumed text without validating it.
 *
 * @param cursor - Cursor to advance in place.
 * @returns The text between the starting position and the stopping position;
 *          empty when the cursor was already at a stopping point.
 */
function rawIdentifier(cursor: Cursor): string {
  const begin = cursor.currentIndex;
  const slash = char('/');

  for (; !cursor.eof(); cursor.next()) {
    const cp = cursor.peek();

    const endsIdentifier = DELIMITER_SET.has(cp) || isWhitespace(cp);
    if (endsIdentifier) {
      break;
    }

    // A single '/' is part of the identifier; only '//' stops the scan.
    const startsComment = cp === slash && cursor.peek(1) === slash;
    if (startsComment) {
      break;
    }
  }

  return cursor.text.sliceByCp(begin, cursor.currentIndex);
}
|
|
|
|
// Scans raw identifier,
|
|
// checks if it is a keyword,
|
|
// if it ain't, validates it into a proper identifier.
|
|
/**
 * Scans a raw identifier and classifies it as a keyword or a validated identifier.
 *
 * @param cursor - Cursor to advance in place.
 * @param kind - Role the identifier is being scanned for; only used in errors.
 * @returns A `keyword` result when the text is a keyword, otherwise an `identifier` result.
 * @throws An `InvalidIdentifier` value (`ExprScanError`) with reason `Empty`
 *         when nothing was scanned, or `StartsWithDigit` when a non-keyword
 *         text begins with a digit.
 */
function identifierOrKeywordScanner(
  cursor: Cursor,
  kind: IdentifierKind,
):
  | { tag: "keyword", kw: Keyword, span: Span }
  | { tag: "identifier", name: string, span: Span } {
  const start = cursor.currentLocation();
  const text = rawIdentifier(cursor);
  const span = cursor.makeSpan(start);

  // Builds the error value for the text just scanned.
  const invalid = (reason: { tag: "Empty" } | { tag: "StartsWithDigit" }): ExprScanError => ({
    tag: "InvalidIdentifier",
    text,
    kind,
    reason,
    span,
  });

  if (text === "") {
    throw invalid({ tag: "Empty" });
  }

  // Keywords are checked before validation.
  if (KEYWORD_SET.has(text)) {
    return { tag: "keyword", kw: text as Keyword, span };
  }

  if (isDigit(char(text[0]))) {
    throw invalid({ tag: "StartsWithDigit" });
  }

  return { tag: "identifier", name: text, span };
}
|
|
|
|
/**
 * Scans an identifier in a position where a keyword is not acceptable.
 *
 * @param cursor - Cursor to advance in place.
 * @param kind - Role the identifier is being scanned for; only used in errors.
 * @returns The identifier's name and span.
 * @throws An `InvalidIdentifier` value (`ExprScanError`) with reason
 *         `IsKeyword` when the scanned text is a keyword; errors from
 *         `identifierOrKeywordScanner` propagate unchanged.
 */
export function identifierScanner(cursor: Cursor, kind: IdentifierKind): { name: string, span: Span } {
  const scanned = identifierOrKeywordScanner(cursor, kind);

  if (scanned.tag === "identifier") {
    const { name, span } = scanned;
    return { name, span };
  }

  const { kw, span } = scanned;
  const error: ExprScanError = {
    tag: "InvalidIdentifier",
    text: kw,
    kind,
    reason: { tag: "IsKeyword", kw },
    span,
  };
  throw error;
}
|
|
|
|
// === Literal Scanners ===
|
|
// throws ExprScanError
|
|
// TODO: handle trailing whitespace
|
|
/**
 * Runs `scanNumber` and unwraps its result.
 *
 * @param cursor - Cursor handed to `scanNumber`.
 * @returns The `value` of an `ok` result.
 * @throws The `error` of any non-`ok` result, as an `ExprScanError`.
 */
function number(cursor: Cursor): { value: number, span: Span } {
  const outcome = scanNumber(cursor);
  if (outcome.tag !== "ok") {
    throw (outcome.error as ExprScanError);
  }
  return outcome.value;
}
|
|
|
|
// throws ExprScanError
|
|
// TODO: handle trailing whitespace
|
|
/**
 * Runs `scanString` and unwraps its result.
 *
 * @param cursor - Cursor handed to `scanString`.
 * @returns The `value` of an `ok` result.
 * @throws The `error` of any non-`ok` result, as an `ExprScanError`.
 */
function string(cursor: Cursor): { value: string, span: Span } {
  const outcome = scanString(cursor);
  if (outcome.tag !== "ok") {
    throw (outcome.error as ExprScanError);
  }
  return outcome.value;
}
|
|
|
|
// === complex scanners ===
|
|
|
|
// TODO: in exprStart/patternStart, make sure whitespace is consumed after they successfully produce a token. Should we build it into the functions? Or should that be in `parser.ts`?
|
|
|
|
// throws ExprScanError
|
|
/**
 * Scans the token that begins an expression at the cursor's current position.
 *
 * Dispatch is on the first character: a digit (or `-` followed by a digit)
 * starts a number, `"` a string, `$` a variable use, `#` a tag, `(` a tuple;
 * anything else is scanned as a keyword or a function name.
 *
 * @param cursor - Cursor to advance in place.
 * @returns The scanned token, or an `EOF` token when the cursor is at end of input.
 * @throws `ExprScanError` values raised by the number, string and identifier scanners.
 */
export function exprStart(cursor: Cursor): ExprStartToken {
  const start = cursor.currentLocation();
  if (cursor.eof()) {
    return { tag: "EOF", span: cursor.makeSpan(start) };
  }

  const c = cursor.peek()!;

  // === numbers ===
  const startsNumber = isDigit(c) || (c === char('-') && isDigit(cursor.peek(1) ?? 0));
  if (startsNumber) {
    const literal = number(cursor);
    return { tag: "number", value: literal.value, span: literal.span };
  }

  switch (c) {
    // === strings ===
    case char('"'): {
      const literal = string(cursor);
      return { tag: "string", text: literal.value, span: literal.span };
    }

    // === variable use ===
    case char('$'): {
      cursor.next();
      const variable = identifierScanner(cursor, 'variable_use');
      // The span starts at the '$' sigil, not at the name.
      return { tag: "variable_use", name: variable.name, span: cursor.makeSpan(start) };
    }

    // === tags ===
    case char('#'): {
      cursor.next();
      const tagName = identifierScanner(cursor, 'tag_construction');
      // The span starts at the '#' sigil, not at the name.
      return { tag: "tag", name: tagName.name, span: cursor.makeSpan(start) };
    }

    // === tuples ===
    case char('('):
      cursor.next();
      return { tag: "tuple_start", span: cursor.makeSpan(start) };
  }

  // === keywords & identifiers ===
  // Anything else must be a keyword or a function call.
  const scanned = identifierOrKeywordScanner(cursor, 'function_call');
  return scanned.tag === "keyword"
    ? scanned
    : { tag: "function_name", name: scanned.name, span: scanned.span };
}
|
|
|
|
/**
 * Scans the token that begins a pattern at the cursor's current position.
 *
 * `(` starts a tuple pattern and `#` a tag pattern. A leading digit, `"` or
 * `-` is rejected, since literals are not accepted as patterns here. Anything
 * else is scanned as a keyword or a pattern binding.
 *
 * @param cursor - Cursor to advance in place.
 * @returns The scanned token, or an `EOF` token when the cursor is at end of input.
 * @throws An `UnexpectedCharacter` value for a leading digit, `"` or `-`, and
 *         `InvalidIdentifier` values raised by the identifier scanners (all
 *         cast to `ExprScanError`).
 */
export function patternStart(cursor: Cursor): PatternStartToken {
  const start = cursor.currentLocation();

  if (cursor.eof()) {
    return { tag: "EOF", span: cursor.makeSpan(start) };
  }

  const c = cursor.peek()!;

  // === tuple ===
  if (c === char('(')) {
    cursor.next();
    return { tag: "tuple_start", span: cursor.makeSpan(start) };
  }

  // === tag ===
  if (c === char('#')) {
    cursor.next();
    const { name } = identifierScanner(cursor, 'tag_construction');
    // The span starts at the '#' sigil, not at the name.
    return { tag: "tag", name, span: cursor.makeSpan(start) };
  }

  // TODO: This is more subtle... -foo is a valid pattern name... I think I should restrict: can't have identifiers start with `-`? But then `-` itself can't be an identifier, which is a bit sad.
  // TODO: This is gonna be different once we allow number/string literals as patterns.
  if (isDigit(c) || c === char('"') || c === char('-')) {
    throw {
      tag: "UnexpectedCharacter",
      char: c,
      span: cursor.makeSpan(start)
    } as ExprScanError;
  }

  // === pattern binding ===
  // Fallthrough: it must be a keyword or a pattern variable. The kind is
  // 'pattern_binding' so that an InvalidIdentifier error raised here reports
  // the right role instead of 'function_call'.
  const result = identifierOrKeywordScanner(cursor, 'pattern_binding');
  switch (result.tag) {
    case "keyword":
      return result;
    case "identifier":
      return { tag: "pattern_binding", name: result.name, span: result.span };
  }
}
|
|
|
|
/**
 * Lookahead: reports whether the token at the cursor can begin an expression.
 *
 * The cursor is always restored to its saved state before returning, so the
 * check consumes nothing. Any value thrown while scanning counts as "no".
 *
 * @param cursor - Cursor to probe; its state is saved and restored.
 * @returns `true` for number, string, variable-use, tag, tuple-start and
 *          function-name tokens and for the keywords `let`, `fn`, `match`,
 *          `apply` and `:`; `false` otherwise (including EOF and scan errors).
 */
export function isNextTokenExprStart(cursor: Cursor): boolean {
  const checkpoint = cursor.save();
  try {
    const token = exprStart(cursor);

    if (token.tag === "keyword") {
      const exprKeywords: readonly string[] = ["let", "fn", "match", "apply", ":"];
      return exprKeywords.includes(token.kw);
    }

    const exprTags: readonly string[] = [
      "number",
      "string",
      "variable_use",
      "tag",
      "tuple_start",
      "function_name", // e.g. my_func(x)
    ];
    return exprTags.includes(token.tag);
  } catch {
    return false;
  } finally {
    cursor.restore(checkpoint);
  }
}
|
|
|
|
/**
 * Lookahead: reports whether the token at the cursor can begin a product pattern.
 *
 * The cursor is always restored to its saved state before returning, so the
 * check consumes nothing. Any value thrown while scanning counts as "no".
 *
 * @param cursor - Cursor to probe; its state is saved and restored.
 * @returns `true` for pattern-binding and tuple-start tokens and for the `:`
 *          keyword; `false` otherwise (including tags, EOF and scan errors).
 */
export function isNextTokenProductPatternStart(cursor: Cursor): boolean {
  const checkpoint = cursor.save();
  try {
    const token = patternStart(cursor);

    if (token.tag === "keyword") {
      return token.kw === ":";
    }
    return token.tag === "pattern_binding" || token.tag === "tuple_start";
  } catch {
    return false;
  } finally {
    cursor.restore(checkpoint);
  }
}
|