scrowl/src/lang/parser/cursor.ts

import { char, NEW_LINE, CARRIAGE_RETURN, DOT, DIGIT_0, DIGIT_9, LOWERCASE_a, LOWERCASE_f, UPPERCASE_A, UPPERCASE_F, SPACE, TAB } from './source_text';
import type { SourceText, Span, SourceLocation, CodePoint, StringIndex, CodePointIndex } from './source_text';
import { Result } from '../result';

/**
 * Snapshot of a `Cursor`'s position, as produced by `Cursor.save()` and
 * consumed by `Cursor.restore()`. It carries every mutable field of the
 * cursor so that restoring it rewinds line/column tracking as well as the index.
 */
export type CursorState = {
  // Index of the next code point to be read.
  index: CodePointIndex,
  // Line counter at the time of the snapshot (a fresh cursor starts at 1).
  line: number,
  // Column counter at the time of the snapshot (a fresh cursor starts at 1).
  column: number,
  // Whether the most recently consumed code point was a carriage return;
  // needed so that a following line feed is not counted as a second line break.
  lastCharWasCR: boolean,
}

/**
 * A forward-moving reader over a `SourceText`, indexed by code point.
 *
 * Besides the current index it tracks a line and column (both starting at 1).
 * `\r`, `\n` and `\r\n` each count as exactly one line break; every other
 * code point advances the column by one.
 */
export class Cursor {
  private index: CodePointIndex = 0;
  private line: number = 1;
  private column: number = 1;
  // Track previous char to handle \r\n correctly
  private lastCharWasCR: boolean = false;

  constructor(readonly text: SourceText) {}

  /** Captures the full cursor position so it can be rewound with `restore`. */
  save(): CursorState {
    return { index: this.index, line: this.line, column: this.column, lastCharWasCR: this.lastCharWasCR };
  }

  /** Resets the cursor to a position previously captured with `save`. */
  restore({ index, line, column, lastCharWasCR }: CursorState): void {
    this.index = index;
    this.line = line;
    this.column = column;
    this.lastCharWasCR = lastCharWasCR;
  }

  /** True once the index has reached `text.length`. */
  eof(): boolean {
    return this.index >= this.text.length;
  }

  /**
   * Returns the code point `n` positions ahead of the cursor without
   * consuming anything, or `undefined` if there is no such position.
   */
  peek(n: number = 0): CodePoint | undefined {
    return this.text.chars[this.index + n]?.char;
  }

  /**
   * Consumes and returns the current code point, updating line/column.
   * Returns `undefined` (and changes nothing) when there is nothing left.
   */
  next(): CodePoint | undefined {
    const ref = this.text.chars[this.index];
    if (!ref) return undefined;

    const c = ref.char;
    this.index++;

    if (c === NEW_LINE) {
      if (!this.lastCharWasCR) {
        this.line++;
        this.column = 1;
      } else {
        // We just saw \r, so this \n is part of \r\n.
        // We already bumped the line count on \r.
        // Just reset the flag.
        this.lastCharWasCR = false;
      }
    } else if (c === CARRIAGE_RETURN) {
      this.line++;
      this.column = 1;
      this.lastCharWasCR = true;
    } else {
      this.column++;
      this.lastCharWasCR = false;
    }

    return c;
  }

  /** Index (in code points) of the next code point to be read. */
  get currentIndex(): CodePointIndex {
    return this.index;
  }

  // TODO: unicode-index ~> string-offset, make that into a separate function.
  /**
   * Returns the `offset` recorded for the current code point, or the length
   * of the underlying source string when the cursor is past the last one.
   */
  currentOffset(): StringIndex {
    return this.text.chars[this.index]?.offset ?? this.text.source.length;
  }

  /** Current index, line and column bundled as a `SourceLocation`. */
  currentLocation(): SourceLocation {
    return { index: this.index, line: this.line, column: this.column };
  }

  /** Builds a span from `start` up to the cursor's current location. */
  makeSpan(start: SourceLocation): Span {
    return {
      start,
      end: this.currentLocation(),
    };
  }

  /**
   * Consumes code points for as long as `pred` accepts them.
   * @returns how many code points were consumed.
   */
  consumeWhile(pred: (c: CodePoint) => boolean): number {
    let count = 0;
    while (!this.eof()) {
      const c = this.peek();
      if (c === undefined || !pred(c)) break;
      this.next();
      count++;
    }
    return count;
  }

  /**
   * Helper to check for exact char matches quickly: consumes the current
   * code point and returns true if it equals `c`, otherwise leaves the
   * cursor untouched and returns false.
   */
  match(c: CodePoint): boolean {
    if (this.peek() === c) {
      this.next();
      return true;
    }
    return false;
  }

  /**
   * Checks if the next characters match the string.
   * If yes, consumes them and returns true.
   * If no, touches nothing and returns false.
   *
   * The cursor is indexed by code point while `str.length` counts UTF-16
   * code units, so the width of the comparison window is derived from the
   * number of code points in `str`. Using `str.length` directly would make
   * any needle containing a surrogate pair (e.g. an emoji) fail to match.
   */
  matchString(str: string): boolean {
    const cpCount = Cursor.codePointLength(str);
    if (this.index + cpCount > this.text.length) return false;

    const slice = this.text.sliceByCp(this.index, this.index + cpCount);
    if (slice !== str) return false;

    for (let i = 0; i < cpCount; i++) {
      // We must call next() to correctly update line/col tracking.
      // We already know it matches, so we just burn through.
      this.next();
    }
    return true;
  }

  /** Number of code points in `str` (a surrogate pair counts once). */
  private static codePointLength(str: string): number {
    let count = 0;
    for (let i = 0; i < str.length; count++) {
      const cp = str.codePointAt(i);
      i += cp !== undefined && cp > 0xFFFF ? 2 : 1;
    }
    return count;
  }
}

// === Basic Scanners/Predicates ===
/**
 * Reports whether `char` is one of the four whitespace code points the
 * scanner recognises: space, tab, line feed or carriage return.
 */
export function isWhitespace(char: CodePoint): boolean {
  switch (char) {
    case SPACE:
    case TAB:
    case NEW_LINE:
    case CARRIAGE_RETURN:
      return true;
    default:
      return false;
  }
}

/** Reports whether `char` lies in the inclusive range `DIGIT_0`..`DIGIT_9`. */
export function isDigit(char: CodePoint): boolean {
  const atLeastZero = char >= DIGIT_0;
  return atLeastZero && char <= DIGIT_9;
}

/**
 * Errors that are not specific to one kind of literal.
 * Each variant carries the span of source the error refers to.
 */
export type GenericScanError =
  // A code point was found where a different one was required
  // (e.g. `scanString` uses this when the input does not start with `"`).
  | { tag: "UnexpectedCharacter", char: CodePoint, span: Span }
  // The input ended before the construct being scanned was complete.
  | { tag: "UnexpectedEOF", span: Span }

/** Errors reported by `scanNumber`. */
export type NumberError =
  // No digit was found where the integer part of a number must start.
  | { tag: "ExpectedNumber", span: Span }
  // The text looked like a number but is not acceptable:
  //   - "NotFinite": converting the text with `Number(...)` did not yield a finite value.
  //   - "MissingFractionalDigits": a `.` was not followed by at least one digit.
  | { tag: "InvalidNumber", reason: "NotFinite" | "MissingFractionalDigits", span: Span }

/**
 * Scans a decimal number literal starting at the cursor.
 *
 * Grammar:
 *   number := optional(`-`) digits optional(`.` digits)
 *
 * The cursor is not rewound on failure: whatever was consumed before the
 * error was detected (for example a leading `-`) stays consumed.
 *
 * @param cursor Cursor positioned at the first character of the candidate number.
 * @returns On success, the numeric value and the span of the literal. On failure:
 *   - `ExpectedNumber` if no digit follows the optional sign;
 *   - `InvalidNumber` / `MissingFractionalDigits` if a `.` is not followed by
 *     a digit (the span starts at the `.`);
 *   - `InvalidNumber` / `NotFinite` if `Number(text)` is not finite.
 */
export function scanNumber(cursor: Cursor): Result<{ value: number, span: Span }, NumberError> {
  const startNumberLocation = cursor.currentLocation();

  // 1. Optional Sign
  if (cursor.peek() === char("-")) {
    cursor.next();
  }

  // 2. Integer Part
  const integerPartDigitCount = cursor.consumeWhile(isDigit);
  if (integerPartDigitCount === 0) {
    return Result.error({
      tag: "ExpectedNumber",
      span: cursor.makeSpan(startNumberLocation),
    });
  }

  // 3. Fractional Part
  if (cursor.peek() === DOT) {
    const dotLocation = cursor.currentLocation();

    cursor.next(); // consume '.'

    const fracPartDigitCount = cursor.consumeWhile(isDigit);
    if (fracPartDigitCount === 0) {
      return Result.error({
        tag: "InvalidNumber",
        reason: "MissingFractionalDigits",
        span: cursor.makeSpan(dotLocation),
      });
    }
  }

  // Re-read exactly the code points that were consumed and let the runtime
  // do the numeric conversion; very long literals overflow to +/-Infinity.
  const text = cursor.text.sliceByCp(startNumberLocation.index, cursor.currentIndex);
  const value = Number(text);

  if (!Number.isFinite(value)) {
    return Result.error({
      tag: "InvalidNumber",
      reason: "NotFinite",
      span: cursor.makeSpan(startNumberLocation),
    });
  }
  return Result.ok({
    value,
    span: cursor.makeSpan(startNumberLocation),
  });
}

/**
 * String-literal-specific errors reported by `scanString`.
 * `reason` says what was wrong with the escape sequence; `span` locates it.
 */
export type StringError =
  | { tag: "InvalidEscape", reason: EscapeErrorReason, span: Span };

/**
 * Why a backslash escape inside a string literal was rejected.
 * The unicode variants all concern the `\u{XXXX}` form.
 */
export type EscapeErrorReason =
  | { tag: "UnknownEscapeSequence", char: CodePoint } // e.g. \k; `char` is the code point after the backslash
  | { tag: "UnicodeMissingBrace" }                    // \u without {
  | { tag: "UnicodeNoDigits" }                        // \u{}
  | { tag: "UnicodeUnclosed" }                        // \u{FF without }
  | { tag: "UnicodeOverflow", value: number };        // \u{110000}; `value` is the parsed number, above 0x10FFFF

/** True for the code points of `0-9`, `a-f` and `A-F`. */
function isHexDigit(c: CodePoint): boolean {
  return (
    (c >= DIGIT_0 && c <= DIGIT_9) ||
    (c >= LOWERCASE_a && c <= LOWERCASE_f) ||
    (c >= UPPERCASE_A && c <= UPPERCASE_F)
  );
}

/**
 * Scans a double-quoted string literal starting at the cursor and decodes
 * its escape sequences.
 *
 * Supported escapes: `\n`, `\r`, `\t`, `\\`, `\0`, `\"` and `\u{HEX}`
 * (one or more hex digits, value at most 0x10FFFF).
 *
 * The cursor is not rewound on failure: everything consumed up to the point
 * where the error was detected stays consumed.
 *
 * @param cursor Cursor positioned at the opening `"`.
 * @returns On success, the decoded string content and the span of the whole
 *   literal including both quotes. On failure:
 *   - `UnexpectedEOF` if the input is empty, ends before the closing quote,
 *     or ends immediately after a backslash;
 *   - `UnexpectedCharacter` if the first code point is not `"`;
 *   - `InvalidEscape` with an `EscapeErrorReason` for a malformed escape.
 */
export function scanString(cursor: Cursor): Result<{ value: string, span: Span }, StringError | GenericScanError> {
  const start = cursor.currentLocation();

  const firstChar = cursor.peek();
  if (firstChar === undefined) {
    return Result.error({ tag: "UnexpectedEOF", span: cursor.makeSpan(start) });
  }
  if (firstChar !== char('"')) {
    return Result.error({ tag: "UnexpectedCharacter", char: firstChar, span: cursor.makeSpan(start) });
  }
  cursor.next();

  let value = ""; // The actual string content

  while (true) {
    const c = cursor.peek();
    // Running out of input before the closing quote means the literal is unterminated.
    if (c === undefined || cursor.eof()) {
      return Result.error({ tag: "UnexpectedEOF", span: cursor.makeSpan(start) });
    }

    // 1. End of string
    if (c === char('"')) {
      cursor.next(); // consume closing quote
      break;
    }

    if (c === char('\\')) {
      // 2. Escape Sequences
      const escapeStart = cursor.currentLocation();
      cursor.next(); // consume backslash
      const escaped = cursor.peek();

      if (escaped === undefined) {
        // The input ended right after the backslash. There is no escape
        // character to report, so this is an unterminated literal rather
        // than an unknown escape sequence.
        return Result.error({ tag: "UnexpectedEOF", span: cursor.makeSpan(start) });
      }

      switch (escaped) {
        case char('n'): value += '\n'; cursor.next(); break;
        case char('r'): value += '\r'; cursor.next(); break;
        case char('t'): value += '\t'; cursor.next(); break;
        case char('\\'): value += '\\'; cursor.next(); break;
        case char("0"): value += "\0"; cursor.next(); break;
        case char('"'): value += '"'; cursor.next(); break;
        // Unicode Escape: \u{XXXX}
        case char('u'): {
          cursor.next(); // consume 'u'

          // Expect '{'
          const braceStart = cursor.currentLocation();
          if (cursor.peek() !== char('{')) {
            return Result.error({ tag: "InvalidEscape", reason: { tag: "UnicodeMissingBrace" }, span: cursor.makeSpan(braceStart) });
          }
          cursor.next(); // consume '{'

          // Consume Hex Digits
          const hexStart = cursor.currentIndex;
          const hexCount = cursor.consumeWhile(isHexDigit);

          if (hexCount === 0) {
            return Result.error({ tag: "InvalidEscape", reason: { tag: "UnicodeNoDigits" }, span: cursor.makeSpan(braceStart) });
          }

          // Expect '}'
          if (cursor.peek() !== char("}")) {
            return Result.error({ tag: "InvalidEscape", reason: { tag: "UnicodeUnclosed" }, span: cursor.makeSpan(braceStart) });
          }
          cursor.next(); // consume '}'

          // Convert & Append
          const hexStr = cursor.text.sliceByCp(hexStart, hexStart + hexCount);
          const codePoint = parseInt(hexStr, 16);

          // String.fromCodePoint throws above this limit, so reject it here.
          if (codePoint > 0x10FFFF) {
            return Result.error({ tag: "InvalidEscape", reason: { tag: "UnicodeOverflow", value: codePoint }, span: cursor.makeSpan(braceStart) });
          }

          value += String.fromCodePoint(codePoint);
          break;
        }
        default:
          // The offending character is deliberately left unconsumed.
          return Result.error({
            tag: "InvalidEscape",
            reason: { tag: "UnknownEscapeSequence", char: escaped },
            span: cursor.makeSpan(escapeStart)
          });
      }
    } else {
      // 3. Regular character
      // Optimization: consume chunks of non-special chars for speed?
      // For now, char-by-char is fine.
      cursor.next();
      value += String.fromCodePoint(c);
    }
  }

  return Result.ok({
    value,
    span: cursor.makeSpan(start)
  });
}

// TODO: rendering of errors
// Sketch only; names follow the StringError / GenericScanError / EscapeErrorReason types declared above.
// function renderStringError(err: StringError | GenericScanError): string {
//   switch (err.tag) {
//     case "UnexpectedCharacter": return "Expected a string starting with \"";
//     case "UnexpectedEOF": return "Unterminated string literal";
//     case "InvalidEscape": {
//       const r = err.reason;
//       switch (r.tag) {
//         case "UnknownEscapeSequence":
//           return `Unknown escape sequence '\\${String.fromCodePoint(r.char)}'`;
//         case "UnicodeMissingBrace":
//           return "Unicode escape must start with '{', e.g. \\u{1F600}";
//         case "UnicodeNoDigits":
//           return "Empty unicode escape \\u{}";
//         case "UnicodeUnclosed":
//           return "Expected '}' to close unicode escape";
//         case "UnicodeOverflow":
//           return `Unicode code point 0x${r.value.toString(16)} is too large (max 0x10FFFF)`;
//       }
//     }
//   }
// }