Finish first scanning step

2026-02-06 12:31:10 +01:00 · 2026-02-06 12:31:10 +01:00 · d553a05f45
commit d553a05f45
parent d5f9777711
2 changed files with 180 additions and 38 deletions
--- a/src/parser/scanner.ts
+++ b/src/parser/scanner.ts
@ -4,41 +4,7 @@ import type { SourceText, Span, SourceLocation, CodePoint, StringIndex, CodePoin
 import { isDigit, isWhitespace, scanNumber, scanString } from './cursor';
 import type { Cursor, CursorState, GenericScanError, NumberError, StringError } from './cursor';
 import { Result } from '../result';
-
-// === Language Specific Stuff ===
-
-const DELIMITER_CHARS = ["(", ")", "{", "}", ".", ",", "@", "$", "#", '"', "\\"] as const;
-export type Delimiter = typeof DELIMITER_CHARS[number];
-const DELIMITER_SET: Set<CodePoint> = new Set(DELIMITER_CHARS.map(c => char(c)));
-
-export type Keyword = "let" | "fn" | "match" | "apply" | "=" | "|" | "!";
-
-// Returns the raw string. Does NOT create a token (the caller decides the token type).
-function scanRawIdentifier(cursor: Cursor): string {
-  const start = cursor.currentIndex;
-  // Consume until EOF or Delimiter or Whitespace (including comments)
-  // TODO: How to check for comments? They are special in that they must start with two characters. Fuck.
-  cursor.consumeWhile(c => !(DELIMITER_SET.has(c) || isWhitespace(c)));
-  return cursor.text.sliceByCp(start, cursor.currentIndex);
-}
-
-export type ExprScanError = 
-  | { tag: "UnexpectedCharacter", char: CodePoint, span: Span }
-  | { tag: "InvalidIdentifier", text: string, reason: string, span: Span }
-  | NumberError 
-  | StringError;
-
-export type ExprStartToken =
-  | { tag: "number", value: number, span: Span }
-  | { tag: "string", text: string, span: Span }
-  | { tag: "function_name", name: string, span: Span }
-  | { tag: "variable_use", name: string, span: Span }
-  | { tag: "tag", name: string, span: Span }
-  | { tag: "tagged", name: string, span: Span } // TODO: This may be a bit weird. Actually we can lookahead and see if the next char after the identifier is NOT-AN-EXPRESSION start.
-  | { tag: "keyword", kw: Keyword, span: Span }
-  | { tag: "EOF", span: Span }
-
-// TODO: Move this back to `cursor.ts`
+import { Expr } from 'src/value';

 function skipWhitespaceAndComments(cursor: Cursor): number {
  let totalConsumed = 0;
@ -76,11 +42,185 @@ function skipWhitespaceAndComments(cursor: Cursor): number {
  return totalConsumed;
 }

-export function scanExprStart(cursor: Cursor): Result<ExprStartToken, ExprScanError> {
-  // TODO
-  return (0 as any);
+// === Language Specific Stuff ===
+// Characters that end a raw identifier wherever they appear.
+const DELIMITER_CHARS = [
+  "(", ")",
+  "{", "}",
+  ".", ",",
+  "@", "$", "#",
+  '"', "\\",
+] as const;
+// Union of the delimiter literals listed above.
+export type Delimiter = (typeof DELIMITER_CHARS)[number];
+// The same delimiters converted with `char`, used for membership tests while scanning.
+const DELIMITER_SET: Set<CodePoint> = new Set(
+  Array.from(DELIMITER_CHARS, (delimiter) => char(delimiter)),
+);
+
+// Texts that are scanned as keywords instead of identifiers.
+const KEYWORD_LIST = ["let", "fn", "match", "apply", "=", "|", "!"] as const;
+// Union of the keyword literals listed above.
+export type Keyword = (typeof KEYWORD_LIST)[number];
+// Keyword lookup by exact text.
+const KEYWORD_SET: Set<string> = new Set<string>(KEYWORD_LIST);
+
+// === Errors ===
+
+// Error raised when scanned text cannot be used as an identifier.
+// `kind` says what was being parsed, `reason` says why the text was rejected.
+type InvalidIdentifierError = {
+  tag: "InvalidIdentifier";
+  text: string;
+  kind: IdentifierKind;
+  reason: IdentifierErrorReason;
+  span: Span;
+};
+
+// Errors thrown by the expression scanners in this file.
+export type ExprScanError =
+  | InvalidIdentifierError
+  | GenericScanError
+  | NumberError
+  | StringError;
+
+// What kind of identifier were we trying to parse?
+export type IdentifierKind = "identifier" | "variable_use" | "tag_construction" | "function_call";
+// Why a piece of scanned text was rejected as an identifier.
+export type IdentifierErrorReason = 
+  | { tag: "Empty" } // nothing was consumed: the cursor sat on EOF, a delimiter, whitespace or a comment start
+  | { tag: "StartsWithDigit" }
+  | { tag: "IsKeyword", kw: Keyword }
+
+// === Tokens ===
+
+export type ExprStartToken =
+  | { tag: "number", value: number, span: Span }
+  | { tag: "string", text: string, span: Span }
+  | { tag: "function_name", name: string, span: Span }
+  | { tag: "variable_use", name: string, span: Span }
+  | { tag: "tag", name: string, span: Span }
+  | { tag: "tagged", name: string, span: Span } // TODO: This may be a bit weird. Actually we can lookahead and see if the next char after the identifier is NOT-AN-EXPRESSION start.
+  | { tag: "tuple_start", span: Span }
+  | { tag: "record_start", span: Span }
+  | { tag: "keyword", kw: Keyword, span: Span }
+  | { tag: "EOF", span: Span }
+
+// === Identifier Scanners ===
+
+// Consumes input up to (not including) the next delimiter, whitespace,
+// `//` comment start or EOF, and returns the consumed text.
+// Returns "" when the cursor already sits on one of those; callers must handle that.
+function rawIdentifier(cursor: Cursor): string {
+  const start = cursor.currentIndex;
+  while (!cursor.eof()) {
+    const c = cursor.peek();
+
+    if (DELIMITER_SET.has(c) || isWhitespace(c)) {
+      break;
+    }
+
+    // A comment needs two characters to start, so look one ahead.
+    if (c === char('/') && cursor.peek(1) === char('/')) {
+      break; 
+    }
+    cursor.next();
+  }
+  return cursor.text.sliceByCp(start, cursor.currentIndex);
 }

+// Scans raw identifier,
+// rejects it if nothing was consumed,
+// checks if it is a keyword,
+// if it ain't, validates it into a proper identifier.
+// `kind` is only used to label the error with what we were trying to parse.
+// throws ExprScanError
+function identifierOrKeyword(
+  cursor: Cursor,
+  kind: IdentifierKind,
+): { tag: "keyword", kw: Keyword, span: Span }
+ | { tag: "identifier", name: string, span: Span } {
+  const start = cursor.currentLocation();
+  const text = rawIdentifier(cursor);
+  const span = cursor.makeSpan(start);
+
+  // Nothing was consumed. Report it here instead of handing back an empty
+  // name (the cursor would not have advanced) or inspecting a first
+  // character that does not exist.
+  if (text.length === 0) {
+    throw ({
+      tag: "InvalidIdentifier",
+      text,
+      kind,
+      reason: { tag: "Empty" },
+      span
+    } as ExprScanError);
+  }
+
+  if (KEYWORD_SET.has(text)) {
+    return { tag: "keyword", kw: text as Keyword, span };
+  }
+
+  // validation
+  // `charAt` always yields a string; text is known to be non-empty here.
+  if (isDigit(char(text.charAt(0)))) {
+    throw ({ 
+      tag: "InvalidIdentifier", 
+      text, 
+      kind,
+      reason: { tag: "StartsWithDigit" },
+      span 
+    } as ExprScanError);
+  }
+  return { tag: "identifier", name: text, span };
+}
+
+// Scans an identifier at the cursor and refuses keywords.
+// `kind` is forwarded to the scanner and recorded in the error to say what was being parsed.
+// throws ExprScanError
+export function identifier(cursor: Cursor, kind: IdentifierKind): { name: string, span: Span } {
+  const scanned = identifierOrKeyword(cursor, kind);
+
+  if (scanned.tag === "identifier") {
+    const { name, span } = scanned;
+    return { name, span };
+  }
+
+  // A keyword appeared where an identifier is required.
+  const { kw, span } = scanned;
+  const error: ExprScanError = {
+    tag: "InvalidIdentifier",
+    text: kw,
+    kind,
+    reason: { tag: "IsKeyword", kw },
+    span,
+  };
+  throw error;
+}
+
+// === Literal Scanners ===
+// Runs scanNumber and unwraps its result: the value on "ok", otherwise the error is thrown.
+// throws ExprScanError
+function number(cursor: Cursor): { value: number, span: Span } {
+  const outcome = scanNumber(cursor);
+  if (outcome.tag !== "ok") {
+    throw (outcome.error as ExprScanError);
+  }
+  return outcome.value;
+}
+
+// Runs scanString and unwraps its result: the value on "ok", otherwise the error is thrown.
+// throws ExprScanError
+function string(cursor: Cursor): { value: string, span: Span } {
+  const outcome = scanString(cursor);
+  if (outcome.tag !== "ok") {
+    throw (outcome.error as ExprScanError);
+  }
+  return outcome.value;
+}
+
+// === complex scanners ===
+
+// Scans the token that begins an expression.
+// Whitespace and comments in front of it are skipped first. The first
+// character then decides which scanner runs; anything that is not a number,
+// string, `$`, `#`, `(` or `{` is scanned as a keyword or a function name.
+// throws ExprScanError
+export function exprStart(cursor: Cursor): ExprStartToken {
+  skipWhitespaceAndComments(cursor);
+
+  const start = cursor.currentLocation();
+  if (cursor.eof()) {
+    return { tag: "EOF", span: cursor.makeSpan(start) };
+  }
+
+  const first = cursor.peek()!;
+
+  // === numbers ===
+  // A digit, or a minus sign directly followed by a digit.
+  const startsNumber =
+    isDigit(first) || (first === char('-') && isDigit(cursor.peek(1) ?? 0));
+  if (startsNumber) {
+    const literal = number(cursor);
+    return { tag: "number", value: literal.value, span: literal.span };
+  }
+
+  switch (first) {
+    // === strings ===
+    case char('"'): {
+      const literal = string(cursor);
+      return { tag: "string", text: literal.value, span: literal.span };
+    }
+
+    // === variable use ===
+    case char('$'): {
+      cursor.next();
+      const { name } = identifier(cursor, 'variable_use');
+      // The span starts at the sigil, not at the name.
+      return { tag: "variable_use", name, span: cursor.makeSpan(start) };
+    }
+
+    // === tags ===
+    case char('#'): {
+      cursor.next();
+      const { name } = identifier(cursor, 'tag_construction');
+      // The span starts at the sigil, not at the name.
+      return { tag: "tag", name, span: cursor.makeSpan(start) };
+    }
+
+    // === tuples ===
+    case char('('): {
+      cursor.next();
+      return { tag: "tuple_start", span: cursor.makeSpan(start) };
+    }
+
+    // === records ===
+    case char('{'): {
+      cursor.next();
+      return { tag: "record_start", span: cursor.makeSpan(start) };
+    }
+
+    // === keywords & identifiers ===
+    // Everything else must be a keyword or a function call.
+    default: {
+      const scanned = identifierOrKeyword(cursor, 'function_call');
+      if (scanned.tag === "keyword") {
+        return scanned;
+      }
+      return { tag: "function_name", name: scanned.name, span: scanned.span };
+    }
+  }
+}
+
+
 // TODO: Need a Token to TokenKind function
 // TODO: Need is_start_of_expression(token): boolean
 //   identifier -> true
--- a/src/value.ts
+++ b/src/value.ts
@ -211,6 +211,8 @@ export namespace Expr {
  export const call = (name: FunctionName, args: Expr[]): Expr => ({ tag: "call", name, args, });
  export const tag = (tag_name: Tag): Expr => ({ tag: "tag", tag_name, });
  export const tagged = (tag_name: Tag, expr: Expr): Expr => ({ tag: "tagged", tag_name, expr, });
+  export const tuple = (exprs: Expr[]): Expr => ({ tag: "tuple", exprs });
+  export const record = (fields: { name: FieldName, expr: Expr }[]): Expr => ({ tag: "record", fields });
  export const match = (arg: Expr, branches: { pattern: Pattern; body: Expr }[]): Expr => ({ tag: "match", arg, branches, });
  export const var_use = (name: VariableName): Expr => ({ tag: "var_use", name, });
  export const let_ = (bindings: ExprBinding[], body: Expr): Expr => ({ tag: "let", bindings, body, });