From d553a05f453ffc92a759d4259bc6a101dc6955dc Mon Sep 17 00:00:00 2001 From: Yura Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Fri, 6 Feb 2026 12:31:10 +0100 Subject: [PATCH] Finish first scanning step --- src/parser/scanner.ts | 216 ++++++++++++++++++++++++++++++++++-------- src/value.ts | 2 + 2 files changed, 180 insertions(+), 38 deletions(-) diff --git a/src/parser/scanner.ts b/src/parser/scanner.ts index 95ef6d6..6564ac6 100644 --- a/src/parser/scanner.ts +++ b/src/parser/scanner.ts @@ -4,41 +4,7 @@ import type { SourceText, Span, SourceLocation, CodePoint, StringIndex, CodePoin import { isDigit, isWhitespace, scanNumber, scanString } from './cursor'; import type { Cursor, CursorState, GenericScanError, NumberError, StringError } from './cursor'; import { Result } from '../result'; - -// === Language Specific Stuff === - -const DELIMITER_CHARS = ["(", ")", "{", "}", ".", ",", "@", "$", "#", '"', "\\"] as const; -export type Delimiter = typeof DELIMITER_CHARS[number]; -const DELIMITER_SET: Set = new Set(DELIMITER_CHARS.map(c => char(c))); - -export type Keyword = "let" | "fn" | "match" | "apply" | "=" | "|" | "!"; - -// Returns the raw string. Does NOT create a token (the caller decides the token type). -function scanRawIdentifier(cursor: Cursor): string { - const start = cursor.currentIndex; - // Consume until EOF or Delimiter or Whitespace (including comments) - // TODO: How to check for comments? They are special in that they must start with two characters. Fuck. - cursor.consumeWhile(c => !(DELIMITER_SET.has(c) || isWhitespace(c))); - return cursor.text.sliceByCp(start, cursor.currentIndex); -} - -export type ExprScanError = - | { tag: "UnexpectedCharacter", char: CodePoint, span: Span } - | { tag: "InvalidIdentifier", text: string, reason: string, span: Span } - | NumberError - | StringError; - -export type ExprStartToken = - | { tag: "number", value: number, span: Span } - | { tag: "string", text: string, span: Span } - | { tag: "function_name", name: string, span: Span } - | { tag: "variable_use", name: string, span: Span } - | { tag: "tag", name: string, span: Span } - | { tag: "tagged", name: string, span: Span } // TODO: This may be a bit weird. Actually we can lookahead and see if the next char after the identifier is NOT-AN-EXPRESSION start. - | { tag: "keyword", kw: Keyword, span: Span } - | { tag: "EOF", span: Span } - -// TODO: Move this back to `cursor.ts` +import { Expr } from 'src/value'; function skipWhitespaceAndComments(cursor: Cursor): number { let totalConsumed = 0; @@ -76,11 +42,185 @@ function skipWhitespaceAndComments(cursor: Cursor): number { return totalConsumed; } -export function scanExprStart(cursor: Cursor): Result { - // TODO - return (0 as any); +// === Language Specific Stuff === +const DELIMITER_CHARS = ["(", ")", "{", "}", ".", ",", "@", "$", "#", '"', "\\"] as const; +export type Delimiter = typeof DELIMITER_CHARS[number]; +const DELIMITER_SET: Set = new Set(DELIMITER_CHARS.map(c => char(c))); + +const KEYWORD_LIST = ["let" , "fn" , "match" , "apply" , "=" , "|" , "!"] as const; +export type Keyword = typeof KEYWORD_LIST[number]; +const KEYWORD_SET: Set = new Set(KEYWORD_LIST); + +// === Errors === + +export type ExprScanError = + | GenericScanError + | NumberError + | StringError + | { tag: "InvalidIdentifier", text: string, kind: IdentifierKind, reason: IdentifierErrorReason, span: Span } + +// What kind of identifier were we trying to parse? +export type IdentifierKind = "identifier" | "variable_use" | "tag_construction" | "function_call"; +export type IdentifierErrorReason = + | { tag: "StartsWithDigit" } + | { tag: "IsKeyword", kw: Keyword } + +// === Tokens === + +export type ExprStartToken = + | { tag: "number", value: number, span: Span } + | { tag: "string", text: string, span: Span } + | { tag: "function_name", name: string, span: Span } + | { tag: "variable_use", name: string, span: Span } + | { tag: "tag", name: string, span: Span } + | { tag: "tagged", name: string, span: Span } // TODO: This may be a bit weird. Actually we can lookahead and see if the next char after the identifier is NOT-AN-EXPRESSION start. + | { tag: "tuple_start", span: Span } + | { tag: "record_start", span: Span } + | { tag: "keyword", kw: Keyword, span: Span } + | { tag: "EOF", span: Span } + +// === Identifier Scanners === + +// Returns the raw string. +function rawIdentifier(cursor: Cursor): string { + const start = cursor.currentIndex; + while (!cursor.eof()) { + const c = cursor.peek(); + + if (DELIMITER_SET.has(c) || isWhitespace(c)) { + break; + } + + if (c === char('/') && cursor.peek(1) === char('/')) { + break; + } + cursor.next(); + } + return cursor.text.sliceByCp(start, cursor.currentIndex); } +// Scans raw identifier, +// checks if it is a keyword, +// if it ain't, validates it into a proper identifier. +function identifierOrKeyword( + cursor: Cursor, + kind: IdentifierKind, +): { tag: "keyword", kw: Keyword, span: Span } + | { tag: "identifier", name: string, span: Span } { + const start = cursor.currentLocation(); + const text = rawIdentifier(cursor); + const span = cursor.makeSpan(start); + + if (KEYWORD_SET.has(text)) { + return { tag: "keyword", kw: text as Keyword, span }; + } + + // validation + if (isDigit(char(text[0]))) { + throw ({ + tag: "InvalidIdentifier", + text, + kind, + reason: { tag: "StartsWithDigit" }, + span + } as ExprScanError); + } + return { tag: "identifier", name: text, span }; +} + +export function identifier(cursor: Cursor, kind: IdentifierKind): { name: string, span: Span } { + const res = identifierOrKeyword(cursor, kind); + + if (res.tag === "keyword") { + throw ({ + tag: "InvalidIdentifier", + text: res.kw, + kind, + reason: { tag: "IsKeyword", kw: res.kw }, + span: res.span + } as ExprScanError); + } + + return { name: res.name, span: res.span }; +} + +// === Literal Scanners === +// throws ExprScanError +function number(cursor: Cursor): { value: number, span: Span } { + const res = scanNumber(cursor); + if (res.tag === "ok") { return res.value; } else { throw (res.error as ExprScanError); } +} + +// throws ExprScanError +function string(cursor: Cursor): { value: string, span: Span } { + const res = scanString(cursor); + if (res.tag === "ok") { return res.value; } else { throw (res.error as ExprScanError); } +} + +// === complex scanners === + +// throws ExprScanError +export function exprStart(cursor: Cursor): ExprStartToken { + skipWhitespaceAndComments(cursor); + + const start = cursor.currentLocation(); + if (cursor.eof()) { + return { tag: "EOF", span: cursor.makeSpan(start) }; + } + + const c = cursor.peek()!; + + // === numbers === + if (isDigit(c) || (c === char('-') && isDigit(cursor.peek(1) ?? 0))) { + const { value, span } = number(cursor); + return { tag: "number", value: value, span }; + } + + // === strings === + if (c === char('"')) { + const { value, span } = string(cursor); + return { tag: "string", text: value, span }; + } + + // === variable use === + if (c === char('$')) { + cursor.next(); + const { name } = identifier(cursor, 'variable_use'); + return { tag: "variable_use", name, span: cursor.makeSpan(start) }; + } + + // === tags === + if (c === char('#')) { + cursor.next(); + const { name } = identifier(cursor, 'tag_construction'); + return { tag: "tag", name, span: cursor.makeSpan(start) }; + } + + + // === tuples === + if (c === char('(')) { + cursor.next(); + return { tag: "tuple_start", span: cursor.makeSpan(start) }; + } + + // === records === + if (c === char('{')) { + cursor.next(); + return { tag: "record_start", span: cursor.makeSpan(start) }; + } + + // === keywords & identifiers === + // Fallthrough: it must be a keyword or a function call + const result = identifierOrKeyword(cursor, 'function_call'); + switch (result.tag) { + case "keyword": + return result; + case "identifier": + return { tag: "function_name", name: result.name, span: result.span }; + } +} + + // TODO: Need a Token to TokenKind function // TODO: Need is_start_of_expression(token): boolean // identifier -> true diff --git a/src/value.ts b/src/value.ts index eb10a40..59aee8b 100644 --- a/src/value.ts +++ b/src/value.ts @@ -211,6 +211,8 @@ export namespace Expr { export const call = (name: FunctionName, args: Expr[]): Expr => ({ tag: "call", name, args, }); export const tag = (tag_name: Tag): Expr => ({ tag: "tag", tag_name, }); export const tagged = (tag_name: Tag, expr: Expr): Expr => ({ tag: "tagged", tag_name, expr, }); + export const tuple = (exprs: Expr[]): Expr => ({ tag: "tuple", exprs }); + export const record = (fields: { name: FieldName, expr: Expr }[]): Expr => ({ tag: "record", fields }); export const match = (arg: Expr, branches: { pattern: Pattern; body: Expr }[]): Expr => ({ tag: "match", arg, branches, }); export const var_use = (name: VariableName): Expr => ({ tag: "var_use", name, }); export const let_ = (bindings: ExprBinding[], body: Expr): Expr => ({ tag: "let", bindings, body, });