import { CARRIAGE_RETURN, char, NEW_LINE, SPACE, TAB } from './source_text';
import type { SourceText, Span, SourceLocation, CodePoint, StringIndex, CodePointIndex } from './source_text';
import { isDigit, isWhitespace, scanNumber, scanString } from './cursor';
import type { Cursor, CursorState, GenericScanError, NumberError, StringError } from './cursor';
import { Result } from '../result';
import { Expr } from 'src/value';

/**
 * Advances the cursor past any mix of whitespace and `//` line comments.
 *
 * @param cursor Cursor to advance; it is mutated in place.
 * @returns The sum of the counts reported by `cursor.consumeWhile` for the
 *   skipped whitespace and comment bodies, plus 2 for every `//` marker.
 */
export function skipWhitespaceAndComments(cursor: Cursor): number {
  let totalConsumed = 0;

  while (true) {
    // 1. Consume standard whitespace (whatever `isWhitespace` accepts).
    const wsCount = cursor.consumeWhile(isWhitespace);
    totalConsumed += wsCount;

    // 2. Check for a line-comment start ('//').
    const c = cursor.peek();
    const nextC = cursor.peek(1);

    if (c === char('/') && nextC === char('/')) {
      // Found a comment start. Consume the '//' markers.
      cursor.next();
      cursor.next();
      totalConsumed += 2;

      // Consume everything until the next newline (or EOF).
      // Note: we do NOT consume the newline itself here; the NEXT iteration
      // of this loop picks it up as ordinary whitespace.
      const commentContentLength = cursor.consumeWhile(
        cp => cp !== NEW_LINE && cp !== CARRIAGE_RETURN,
      );
      totalConsumed += commentContentLength;
    } else if (wsCount === 0) {
      // Not at a comment, and step 1 consumed nothing: we are truly done.
      break;
    }
  }

  return totalConsumed;
}

// === Language Specific Stuff ===

// Characters that terminate a raw identifier (see DELIMITER_SET usage).
const DELIMITER_CHARS = ["(", ")", "{", "}", ".", ",", "@", "$", "#", '"', "\\"] as const;
export type Delimiter = typeof DELIMITER_CHARS[number];
// Same delimiters, converted with `char` so they can be compared against cursor output.
const DELIMITER_SET: Set<CodePoint> = new Set(DELIMITER_CHARS.map(c => char(c)));

const KEYWORD_LIST = ["let", "fn", "match", "apply", "=", "|", "!", ":"] as const;
export type Keyword = typeof KEYWORD_LIST[number];
// Typed as Set<string> (not Set<Keyword>) so arbitrary identifier text can be looked up.
const KEYWORD_SET: Set<string> = new Set<string>(KEYWORD_LIST);

// === Errors ===

export type ExprScanError =
  | GenericScanError
  | NumberError
  | StringError
  | { tag: "InvalidIdentifier", text: string, kind: IdentifierKind, reason: IdentifierErrorReason, span: Span };

// What kind of identifier were we trying to parse?
export type IdentifierKind =
  | "variable_use"
  | "field_name"
  | "tag_construction"
  | "function_call"
  | "pattern_binding";

export type IdentifierErrorReason =
  | { tag: "Empty" }
  | { tag: "StartsWithDigit" }
  | { tag: "IsKeyword", kw: Keyword };

// === Tokens ===

export type ExprStartToken =
  | { tag: "number", value: number, span: Span }
  | { tag: "string", text: string, span: Span }
  | { tag: "function_name", name: string, span: Span }
  | { tag: "variable_use", name: string, span: Span }
  | { tag: "tag", name: string, span: Span }
  | { tag: "tuple_start", span: Span }
  | { tag: "keyword", kw: Keyword, span: Span }
  // TODO: get rid of EOF
  | { tag: "EOF", span: Span };

export type PatternStartToken =
  | { tag: "pattern_binding", name: string, span: Span }
  | { tag: "tag", name: string, span: Span }
  | { tag: "tuple_start", span: Span }
  | { tag: "keyword", kw: Keyword, span: Span }
  // TODO: get rid of EOF
  | { tag: "EOF", span: Span };

// === Identifier Scanners ===

// Returns the raw string.
/**
 * Consumes characters up to (but not including) the next delimiter,
 * whitespace character, `//` comment start, or end of input.
 *
 * No validation happens here; the result may be the empty string.
 *
 * @returns The consumed text, sliced from the cursor's source text.
 */
function rawIdentifier(cursor: Cursor): string {
  const start = cursor.currentIndex;
  while (!cursor.eof()) {
    const c = cursor.peek();
    if (DELIMITER_SET.has(c) || isWhitespace(c)) {
      break;
    }
    // A `//` starts a line comment, which also ends the identifier.
    if (c === char('/') && cursor.peek(1) === char('/')) {
      break;
    }
    cursor.next();
  }
  return cursor.text.sliceByCp(start, cursor.currentIndex);
}

/**
 * Scans a raw identifier, checks whether it is a keyword and, if it is not,
 * validates it as a proper identifier.
 *
 * @param kind What the caller was trying to parse; only used to label errors.
 * @returns A `keyword` token when the text is in `KEYWORD_SET`, otherwise an
 *   `identifier` token.
 * @throws An `InvalidIdentifier` `ExprScanError` (a plain object, not an
 *   `Error`) when the text is empty or its first character is a digit.
 */
function identifierOrKeywordScanner(
  cursor: Cursor,
  kind: IdentifierKind,
): { tag: "keyword", kw: Keyword, span: Span } | { tag: "identifier", name: string, span: Span } {
  const start = cursor.currentLocation();
  const text = rawIdentifier(cursor);
  const span = cursor.makeSpan(start);

  if (text.length === 0) {
    throw ({ tag: "InvalidIdentifier", text, kind, reason: { tag: "Empty" }, span } as ExprScanError);
  }

  if (KEYWORD_SET.has(text)) {
    return { tag: "keyword", kw: text as Keyword, span };
  }

  // validation
  if (isDigit(char(text[0]))) {
    throw ({ tag: "InvalidIdentifier", text, kind, reason: { tag: "StartsWithDigit" }, span } as ExprScanError);
  }

  return { tag: "identifier", name: text, span };
}

/**
 * Scans an identifier and rejects keywords.
 *
 * @param kind What the caller was trying to parse; only used to label errors.
 * @throws An `InvalidIdentifier` `ExprScanError` with reason `IsKeyword` when
 *   the scanned text is a keyword, plus anything `identifierOrKeywordScanner` throws.
 */
export function identifierScanner(cursor: Cursor, kind: IdentifierKind): { name: string, span: Span } {
  const res = identifierOrKeywordScanner(cursor, kind);
  if (res.tag === "keyword") {
    throw ({ tag: "InvalidIdentifier", text: res.kw, kind, reason: { tag: "IsKeyword", kw: res.kw }, span: res.span } as ExprScanError);
  }
  return { name: res.name, span: res.span };
}

// === Literal Scanners ===

/**
 * Unwraps the result of `scanNumber`: returns the value on `ok`, otherwise
 * throws the reported error as an `ExprScanError`.
 */
// TODO: handle trailing whitespace
function number(cursor: Cursor): { value: number, span: Span } {
  const res = scanNumber(cursor);
  if (res.tag === "ok") {
    return res.value;
  } else {
    throw (res.error as ExprScanError);
  }
}

/**
 * Unwraps the result of `scanString`: returns the value on `ok`, otherwise
 * throws the reported error as an `ExprScanError`.
 */
// TODO: handle trailing whitespace
function string(cursor: Cursor): { value: string, span: Span } {
  const res = scanString(cursor);
  if (res.tag === "ok") {
    return res.value;
  } else {
    throw (res.error as ExprScanError);
  }
}

// === complex scanners ===

// TODO: in exprStart/patternStart make sure whitespace is consumed after they successfully produce token. Should we build it in the functions? Or should that be in `parser.ts`?

/**
 * Scans the token that begins an expression, dispatching on the first character:
 * digit (or `-` followed by a digit) → number, `"` → string, `$` → variable use,
 * `#` → tag, `(` → tuple start; anything else is read as a keyword or function name.
 *
 * @returns The scanned token, or an `EOF` token when the cursor is at end of input.
 * @throws ExprScanError (as a plain object) when a sub-scanner rejects the input.
 */
export function exprStart(cursor: Cursor): ExprStartToken {
  const start = cursor.currentLocation();

  if (cursor.eof()) {
    return { tag: "EOF", span: cursor.makeSpan(start) };
  }

  const c = cursor.peek()!;

  // === numbers ===
  if (isDigit(c) || (c === char('-') && isDigit(cursor.peek(1) ?? 0))) {
    const { value, span } = number(cursor);
    return { tag: "number", value: value, span };
  }

  // === strings ===
  if (c === char('"')) {
    const { value, span } = string(cursor);
    return { tag: "string", text: value, span };
  }

  // === variable use ===
  if (c === char('$')) {
    cursor.next();
    const { name } = identifierScanner(cursor, 'variable_use');
    // Span starts at `start`, so it includes the leading `$`.
    return { tag: "variable_use", name, span: cursor.makeSpan(start) };
  }

  // === tags ===
  if (c === char('#')) {
    cursor.next();
    const { name } = identifierScanner(cursor, 'tag_construction');
    // Span starts at `start`, so it includes the leading `#`.
    return { tag: "tag", name, span: cursor.makeSpan(start) };
  }

  // === tuples ===
  if (c === char('(')) {
    cursor.next();
    return { tag: "tuple_start", span: cursor.makeSpan(start) };
  }

  // === keywords & identifiers ===
  // Fallthrough: it must be a keyword or a function call
  const result = identifierOrKeywordScanner(cursor, 'function_call');
  switch (result.tag) {
    case "keyword":
      return result;
    case "identifier":
      return { tag: "function_name", name: result.name, span: result.span };
  }
}

/**
 * Scans the token that begins a pattern: `(` → tuple start, `#` → tag;
 * anything else is read as a keyword or a pattern binding.
 *
 * @returns The scanned token, or an `EOF` token when the cursor is at end of input.
 * @throws An `UnexpectedCharacter` error object when the first character is a
 *   digit, `"` or `-`; otherwise whatever the identifier scanners throw.
 */
export function patternStart(cursor: Cursor): PatternStartToken {
  const start = cursor.currentLocation();

  if (cursor.eof()) {
    return { tag: "EOF", span: cursor.makeSpan(start) };
  }

  const c = cursor.peek()!;

  // === tuple ===
  if (c === char('(')) {
    cursor.next();
    return { tag: "tuple_start", span: cursor.makeSpan(start) };
  }

  // === tag ===
  if (c === char('#')) {
    cursor.next();
    const { name } = identifierScanner(cursor, 'tag_construction');
    return { tag: "tag", name, span: cursor.makeSpan(start) };
  }

  // TODO: This is more subtle... -foo is a valid pattern name... I think I should restrict: can't have identifiers start with `-`? But then `-` itself can't be an identifier, which is a bit sad.
  // TODO: This is gonna be different once we allow number/string literals as patterns.
  if (isDigit(c) || c === char('"') || c === char('-')) {
    throw { tag: "UnexpectedCharacter", char: c, span: cursor.makeSpan(start) } as ExprScanError;
  }

  // === pattern binding ===
  // Fallthrough: it must be a keyword or a pattern-variable.
  // Label errors as 'pattern_binding' so diagnostics describe what was actually being scanned.
  const result = identifierOrKeywordScanner(cursor, 'pattern_binding');
  switch (result.tag) {
    case "keyword":
      return result;
    case "identifier":
      return { tag: "pattern_binding", name: result.name, span: result.span };
  }
}

/**
 * Looks ahead, without consuming input, to decide whether the next token can
 * begin an expression.
 *
 * The cursor state is saved up front and restored in `finally`. Any error
 * thrown while scanning is treated as "not an expression start".
 */
export function isNextTokenExprStart(cursor: Cursor): boolean {
  const state = cursor.save();
  try {
    const token = exprStart(cursor);
    switch (token.tag) {
      case "number":
      case "string":
      case "variable_use":
      case "tag":
      case "tuple_start":
      case "function_name": // e.g. my_func(x)
        return true;
      case "keyword":
        // Every keyword is listed, so this inner switch always returns.
        switch (token.kw) {
          case "let":
          case "fn":
          case "match":
          case "apply":
          case ":":
            return true;
          case "=":
          case "|":
          case "!":
            return false;
        }
      case "EOF":
        return false;
      default:
        return false;
    }
  } catch {
    // Deliberate best effort: a scan failure just means "no".
    return false;
  } finally {
    cursor.restore(state);
  }
}

/**
 * Looks ahead, without consuming input, to decide whether the next token can
 * begin a product pattern: a pattern binding, a tuple start, or the `:` keyword.
 *
 * The cursor state is saved up front and restored in `finally`. Any error
 * thrown while scanning is treated as "not a product pattern start".
 */
export function isNextTokenProductPatternStart(cursor: Cursor): boolean {
  const state = cursor.save();
  try {
    const token = patternStart(cursor);
    switch (token.tag) {
      case "pattern_binding":
      case "tuple_start":
        return true;
      case "keyword":
        // Every keyword is listed, so this inner switch always returns.
        switch (token.kw) {
          case ":":
            return true;
          case "let":
          case "fn":
          case "match":
          case "apply":
          case "=":
          case "|":
          case "!":
            return false;
        }
      default:
        return false;
    }
  } catch {
    // Deliberate best effort: a scan failure just means "no".
    return false;
  } finally {
    cursor.restore(state);
  }
}