Finish first scanning step

This commit is contained in:
Yura Dupyn 2026-02-06 12:31:10 +01:00
parent d5f9777711
commit d553a05f45
2 changed files with 180 additions and 38 deletions

View file

@ -4,41 +4,7 @@ import type { SourceText, Span, SourceLocation, CodePoint, StringIndex, CodePoin
import { isDigit, isWhitespace, scanNumber, scanString } from './cursor';
import type { Cursor, CursorState, GenericScanError, NumberError, StringError } from './cursor';
import { Result } from '../result';
// === Language Specific Stuff ===

/** Single characters that act as delimiters; a raw identifier stops in front of any of them. */
const DELIMITER_CHARS = [
  "(", ")",
  "{", "}",
  ".", ",",
  "@", "$", "#",
  '"', "\\",
] as const;

/** Union of the delimiter characters, derived from `DELIMITER_CHARS`. */
export type Delimiter = typeof DELIMITER_CHARS[number];

/** The delimiters converted through `char`, used for membership tests on code points. */
const DELIMITER_SET = new Set<CodePoint>(DELIMITER_CHARS.map((delimiter) => char(delimiter)));

/** Keyword spellings; the symbols `=`, `|` and `!` are keywords as well. */
export type Keyword =
  | "let"
  | "fn"
  | "match"
  | "apply"
  | "="
  | "|"
  | "!";
/**
 * Consumes a raw identifier from the cursor and returns its text.
 * No token is created here: the caller decides which token type the text becomes.
 */
function scanRawIdentifier(cursor: Cursor): string {
  const begin = cursor.currentIndex;

  // Keep consuming while the code point is neither a delimiter nor whitespace.
  // TODO: comments should end an identifier too, but they start with two characters,
  // so this single-code-point predicate cannot detect them.
  cursor.consumeWhile((cp) => !DELIMITER_SET.has(cp) && !isWhitespace(cp));

  const end = cursor.currentIndex;
  return cursor.text.sliceByCp(begin, end);
}
/** Errors reported while scanning the start of an expression. */
export type ExprScanError =
  // A character was found that the scanner cannot handle.
  | {
      tag: "UnexpectedCharacter";
      char: CodePoint;
      span: Span;
    }
  // Identifier text was read but rejected; `reason` is a free-form explanation.
  | {
      tag: "InvalidIdentifier";
      text: string;
      reason: string;
      span: Span;
    }
  // Errors from the number and string scanners are part of the union unchanged.
  | NumberError
  | StringError;
/** Tokens that can appear where an expression starts; every variant carries its source span. */
export type ExprStartToken =
  | { tag: "number"; value: number; span: Span }
  | { tag: "string"; text: string; span: Span }
  | { tag: "function_name"; name: string; span: Span }
  | { tag: "variable_use"; name: string; span: Span }
  | { tag: "tag"; name: string; span: Span }
  // TODO: `tagged` may be a bit weird. We could instead look ahead and check whether the
  // character after the identifier is NOT the start of an expression.
  | { tag: "tagged"; name: string; span: Span }
  | { tag: "keyword"; kw: Keyword; span: Span }
  | { tag: "EOF"; span: Span };
// TODO: Move this back to `cursor.ts`
import { Expr } from 'src/value';
function skipWhitespaceAndComments(cursor: Cursor): number {
let totalConsumed = 0;
@ -76,11 +42,185 @@ function skipWhitespaceAndComments(cursor: Cursor): number {
return totalConsumed;
}
export function scanExprStart(cursor: Cursor): Result<ExprStartToken, ExprScanError> {
// TODO
return (0 as any);
// === Language Specific Stuff ===

/** Single characters that act as delimiters; a raw identifier stops in front of any of them. */
const DELIMITER_CHARS = [
  "(", ")",
  "{", "}",
  ".", ",",
  "@", "$", "#",
  '"', "\\",
] as const;

/** Union of the delimiter characters, derived from `DELIMITER_CHARS`. */
export type Delimiter = typeof DELIMITER_CHARS[number];

/** The delimiters converted through `char`, used for membership tests on code points. */
const DELIMITER_SET = new Set<CodePoint>(DELIMITER_CHARS.map((delimiter) => char(delimiter)));

/** Every keyword spelling; the symbols `=`, `|` and `!` are keywords as well. */
const KEYWORD_LIST = [
  "let",
  "fn",
  "match",
  "apply",
  "=",
  "|",
  "!",
] as const;

/** Union of the keyword spellings, derived from `KEYWORD_LIST`. */
export type Keyword = typeof KEYWORD_LIST[number];

/** Keyword lookup by raw text; typed as `Set<string>` so arbitrary scanned text can be tested. */
const KEYWORD_SET = new Set<string>(KEYWORD_LIST);
// === Errors ===

/** Errors thrown by the expression scanners in this file. */
export type ExprScanError =
  | GenericScanError
  | NumberError
  | StringError
  | { tag: "InvalidIdentifier", text: string, kind: IdentifierKind, reason: IdentifierErrorReason, span: Span }

// What kind of identifier were we trying to parse?
export type IdentifierKind = "identifier" | "variable_use" | "tag_construction" | "function_call";

/** Why a piece of scanned text was rejected as an identifier. */
export type IdentifierErrorReason =
  // No identifier characters were found at all (e.g. `$` directly followed by whitespace).
  | { tag: "Empty" }
  | { tag: "StartsWithDigit" }
  | { tag: "IsKeyword", kw: Keyword }

// === Tokens ===

/** Tokens that can appear where an expression starts; every variant carries its source span. */
export type ExprStartToken =
  | { tag: "number", value: number, span: Span }
  | { tag: "string", text: string, span: Span }
  | { tag: "function_name", name: string, span: Span }
  | { tag: "variable_use", name: string, span: Span }
  | { tag: "tag", name: string, span: Span }
  | { tag: "tagged", name: string, span: Span } // TODO: This may be a bit weird. Actually we can lookahead and see if the next char after the identifier is NOT-AN-EXPRESSION start.
  | { tag: "tuple_start", span: Span }
  | { tag: "record_start", span: Span }
  | { tag: "keyword", kw: Keyword, span: Span }
  | { tag: "EOF", span: Span }

// === Identifier Scanners ===

/**
 * Consumes code points and returns the raw text that was consumed.
 * Stops at EOF, at a delimiter, at whitespace, or in front of a `//` sequence
 * (presumably the start of a line comment). The result may be the empty string.
 */
function rawIdentifier(cursor: Cursor): string {
  const start = cursor.currentIndex;
  while (!cursor.eof()) {
    const c = cursor.peek();

    if (DELIMITER_SET.has(c) || isWhitespace(c)) {
      break;
    }

    // A single `/` is an ordinary identifier character; only `//` ends the identifier.
    if (c === char('/') && cursor.peek(1) === char('/')) {
      break;
    }

    cursor.next();
  }
  return cursor.text.sliceByCp(start, cursor.currentIndex);
}

/**
 * Scans a raw identifier, checks whether it is a keyword and, if it is not,
 * validates it as a proper identifier.
 *
 * @param cursor Cursor positioned at the first character of the identifier.
 * @param kind What the caller is trying to parse; only used for error reporting.
 * @returns A `keyword` result when the text is a keyword, otherwise an `identifier` result.
 * @throws ExprScanError `InvalidIdentifier` with reason `Empty` when no identifier
 *   characters were consumed, or `StartsWithDigit` when the text begins with a digit.
 */
function identifierOrKeyword(
  cursor: Cursor,
  kind: IdentifierKind,
): { tag: "keyword", kw: Keyword, span: Span }
 | { tag: "identifier", name: string, span: Span } {
  const start = cursor.currentLocation();
  const text = rawIdentifier(cursor);
  const span = cursor.makeSpan(start);

  // Nothing was consumed: the cursor sits on EOF, whitespace, a delimiter or a comment.
  // Report it instead of inspecting `text[0]` (undefined here) or handing back an
  // empty-named identifier that did not advance the cursor.
  if (text.length === 0) {
    const error: ExprScanError = {
      tag: "InvalidIdentifier",
      text,
      kind,
      reason: { tag: "Empty" },
      span,
    };
    throw error;
  }

  if (KEYWORD_SET.has(text)) {
    return { tag: "keyword", kw: text as Keyword, span };
  }

  // validation
  if (isDigit(char(text.charAt(0)))) {
    const error: ExprScanError = {
      tag: "InvalidIdentifier",
      text,
      kind,
      reason: { tag: "StartsWithDigit" },
      span,
    };
    throw error;
  }

  return { tag: "identifier", name: text, span };
}
/**
 * Scans an identifier of the given kind and rejects keywords.
 *
 * @param cursor Cursor positioned at the first character of the identifier.
 * @param kind What the caller is trying to parse; copied into the error on failure.
 * @returns The identifier's name and span.
 * @throws ExprScanError `InvalidIdentifier` with reason `IsKeyword` when the scanned text
 *   is a keyword; anything thrown by `identifierOrKeyword` propagates unchanged.
 */
export function identifier(cursor: Cursor, kind: IdentifierKind): { name: string, span: Span } {
  const scanned = identifierOrKeyword(cursor, kind);

  if (scanned.tag !== "keyword") {
    return { name: scanned.name, span: scanned.span };
  }

  const { kw, span } = scanned;
  throw ({
    tag: "InvalidIdentifier",
    text: kw,
    kind,
    reason: { tag: "IsKeyword", kw },
    span,
  } as ExprScanError);
}
// === Literal Scanners ===
/**
 * Runs `scanNumber` and unwraps its result.
 * Returns the value of an `ok` result; otherwise throws the result's error as an `ExprScanError`.
 */
function number(cursor: Cursor): { value: number, span: Span } {
  const scanned = scanNumber(cursor);
  if (scanned.tag !== "ok") {
    throw (scanned.error as ExprScanError);
  }
  return scanned.value;
}
/**
 * Runs `scanString` and unwraps its result.
 * Returns the value of an `ok` result; otherwise throws the result's error as an `ExprScanError`.
 */
function string(cursor: Cursor): { value: string, span: Span } {
  const scanned = scanString(cursor);
  if (scanned.tag !== "ok") {
    throw (scanned.error as ExprScanError);
  }
  return scanned.value;
}
// === complex scanners ===
/**
 * Scans the token that starts an expression.
 *
 * Skips whitespace and comments first, then decides on the first code point:
 * a digit (or `-` followed by a digit) starts a number, `"` a string, `$` a variable use,
 * `#` a tag, `(` a tuple and `{` a record. Anything else is scanned as a keyword or a
 * function name. At end of input an `EOF` token is returned.
 *
 * @throws ExprScanError raised by the number, string and identifier scanners it delegates to.
 */
export function exprStart(cursor: Cursor): ExprStartToken {
  skipWhitespaceAndComments(cursor);
  const start = cursor.currentLocation();

  if (cursor.eof()) {
    return { tag: "EOF", span: cursor.makeSpan(start) };
  }

  const first = cursor.peek()!;

  // === numbers ===
  // A leading `-` only counts as part of a number when a digit follows it.
  const startsNumber = isDigit(first) || (first === char('-') && isDigit(cursor.peek(1) ?? 0));
  if (startsNumber) {
    const literal = number(cursor);
    return { tag: "number", value: literal.value, span: literal.span };
  }

  switch (first) {
    // === strings ===
    case char('"'): {
      const literal = string(cursor);
      return { tag: "string", text: literal.value, span: literal.span };
    }

    // === variable use ===
    case char('$'): {
      cursor.next();
      const scanned = identifier(cursor, 'variable_use');
      // The span starts at the sigil, not at the name.
      return { tag: "variable_use", name: scanned.name, span: cursor.makeSpan(start) };
    }

    // === tags ===
    case char('#'): {
      cursor.next();
      const scanned = identifier(cursor, 'tag_construction');
      return { tag: "tag", name: scanned.name, span: cursor.makeSpan(start) };
    }

    // === tuples ===
    case char('('): {
      cursor.next();
      return { tag: "tuple_start", span: cursor.makeSpan(start) };
    }

    // === records ===
    case char('{'): {
      cursor.next();
      return { tag: "record_start", span: cursor.makeSpan(start) };
    }
  }

  // === keywords & identifiers ===
  // Fallthrough: it must be a keyword or a function call.
  const word = identifierOrKeyword(cursor, 'function_call');
  if (word.tag === "keyword") {
    return word;
  }
  return { tag: "function_name", name: word.name, span: word.span };
}
// TODO: Need a Token to TokenKind function
// TODO: Need is_start_of_expression(token): boolean
// identifier -> true

View file

@ -211,6 +211,8 @@ export namespace Expr {
/** Builds a `call` expression from a function name and its argument expressions. */
export const call = (name: FunctionName, args: Expr[]): Expr => {
  return { tag: "call", name: name, args: args };
};

/** Builds a bare `tag` expression. */
export const tag = (tag_name: Tag): Expr => {
  return { tag: "tag", tag_name: tag_name };
};

/** Builds a `tagged` expression: a tag applied to an expression. */
export const tagged = (tag_name: Tag, expr: Expr): Expr => {
  return { tag: "tagged", tag_name: tag_name, expr: expr };
};

/** Builds a `tuple` expression from its element expressions. */
export const tuple = (exprs: Expr[]): Expr => {
  return { tag: "tuple", exprs: exprs };
};

/** Builds a `record` expression from its named fields. */
export const record = (fields: { name: FieldName, expr: Expr }[]): Expr => {
  return { tag: "record", fields: fields };
};

/** Builds a `match` expression over `arg` with the given pattern/body branches. */
export const match = (arg: Expr, branches: { pattern: Pattern; body: Expr }[]): Expr => {
  return { tag: "match", arg: arg, branches: branches };
};

/** Builds a `var_use` expression referring to a variable by name. */
export const var_use = (name: VariableName): Expr => {
  return { tag: "var_use", name: name };
};

/** Builds a `let` expression with the given bindings and body. */
export const let_ = (bindings: ExprBinding[], body: Expr): Expr => {
  return { tag: "let", bindings: bindings, body: body };
};