Sketch basic scanner

This commit is contained in:
Yura Dupyn 2026-02-05 13:21:44 +01:00
parent 99cd517a58
commit eb6ade5a3d
7 changed files with 836 additions and 6 deletions

125
src/parser/source_text.ts Normal file
View file

@ -0,0 +1,125 @@
// === Char type ===
// The three aliases below are all plain `number`; the type checker treats them as
// interchangeable. The separate names only document which kind of value is meant.
// Index of a UTF-16 code unit within a string.
export type StringIndex = number;
// Index into an array of code points (one slot per code point, not per UTF-16 code unit).
export type CodePointIndex = number;
// A Unicode code point value, i.e. what `s.codePointAt(i)` yields for `s: string`.
// Could also be named `UnicodeCodePoint`.
export type CodePoint = number;
/**
 * Returns the Unicode code point of the first character of `c`.
 *
 * Intended for writing code point constants readably, e.g. `char("a")`
 * instead of `0x61`. Only the first code point is inspected; any further
 * characters in `c` are ignored.
 *
 * @param c - A non-empty string.
 * @returns The code point at UTF-16 index 0 of `c`.
 * @throws RangeError if `c` is empty, since there is no code point to return.
 */
export function char(c: string): CodePoint {
  const codePoint = c.codePointAt(0);
  // `codePointAt` yields `undefined` only when index 0 is out of range,
  // i.e. for the empty string. Fail loudly instead of leaking `undefined`
  // through a `number`-typed return value.
  if (codePoint === undefined) {
    throw new RangeError("char() requires a non-empty string");
  }
  return codePoint;
}
// A code point together with the position it was read from.
export type CodePointRef = {
// The code point value.
char: CodePoint,
// UTF-16 index at which this code point starts in the string it was read from.
offset: StringIndex,
};
// === Source Text ===
/**
 * A source string decoded once into an array of Unicode code points, with a
 * table of line starts.
 *
 * The raw input is NFC-normalized first. Every offset stored in `chars`
 * refers to the normalized `source`, not to the string originally passed in.
 * Positions taken by the methods are code-point indices (indices into
 * `chars`), not UTF-16 indices.
 */
export class SourceText {
  /** The NFC-normalized source text. */
  readonly source: string;

  // TODO: Later you can try to change this to two `Uint32Array`s - one for codepoints
  // (each 20 bit but whatever), the other for pointers to original string.
  /** One entry per code point of `source`, in order of appearance. */
  readonly chars: CodePointRef[];

  /** Code-point index at which each line begins; entry 0 belongs to line 1. */
  readonly lineStarts: CodePointIndex[];

  /**
   * Normalizes `rawSource` to NFC, decodes it into code points and records
   * where every line starts. `\n`, a lone `\r`, and `\r\n` each count as a
   * single line terminator.
   *
   * @param rawSource - The text to scan.
   */
  constructor(rawSource: string) {
    const source = rawSource.normalize('NFC');
    this.source = source;
    this.chars = [];
    this.lineStarts = [0]; // Line 1 always starts at index 0

    let offset = 0;
    while (offset < source.length) {
      const codePoint = source.codePointAt(offset);
      // Cannot happen while `offset < source.length`; the check only narrows the type.
      if (codePoint === undefined) break;

      const cpIndex = this.chars.length;
      this.chars.push({ char: codePoint, offset });
      // Code points above the BMP occupy two UTF-16 code units.
      offset += codePoint > 0xFFFF ? 2 : 1;

      if (codePoint === 0x0A) {
        // '\n': the NEXT code point starts a new line.
        this.lineStarts.push(cpIndex + 1);
      } else if (codePoint === 0x0D) {
        // '\r': a line break on its own, unless it is the first half of "\r\n".
        // In that case the following '\n' iteration records the line start.
        const followedByLf = offset < source.length && source.codePointAt(offset) === 0x0A;
        if (!followedByLf) {
          this.lineStarts.push(cpIndex + 1);
        }
      }
    }
  }

  /** Number of code points in the source. */
  get length(): number {
    return this.chars.length;
  }

  /**
   * Returns the text between two code-point indices.
   *
   * @param start - Index of the first code point to include.
   * @param end - Index one past the last code point to include. Values at or
   *   beyond `length` mean "to the end of the source".
   * @returns The selected text, or `""` when `start` is out of range or the
   *   range is empty or inverted (including a negative `end`).
   */
  sliceByCp(start: number, end: number): string {
    const startRef = this.chars[start];
    // Handle out of bounds gracefully. `end <= start` also covers a negative
    // `end`, which would otherwise index `chars` out of range.
    if (startRef === undefined || end <= start) return "";

    const endRef = this.chars[end];
    const endOff = endRef !== undefined ? endRef.offset : this.source.length;
    return this.source.slice(startRef.offset, endOff);
  }

  // Converts a linear Code Point Index into SourceLocation
  // getLocation(index: CodePointIndex): SourceLocation {
  //   // TODO: can be implemented either by a linear or binary search.
  //   return (0 as any);
  // }

  /**
   * Returns the text of one line without its line terminator.
   *
   * @param line - 1-based line number.
   * @returns The line's text with the trailing `\n`, `\r` or `\r\n` removed,
   *   or `""` when `line` is out of range.
   */
  getLineText(line: number): string {
    const lineIndex = line - 1;
    if (lineIndex < 0 || lineIndex >= this.lineStarts.length) return "";

    const startCp = this.lineStarts[lineIndex];
    if (startCp === undefined) return "";

    const nextStart = this.lineStarts[lineIndex + 1];
    if (nextStart === undefined) {
      // Last line: there is no terminator to strip.
      return this.sliceByCp(startCp, this.chars.length);
    }

    // The code point just before the next line start is the terminator itself.
    let endCp = nextStart - 1;
    const terminator = this.chars[endCp];
    const beforeTerminator = this.chars[endCp - 1];
    const endsWithCrLf =
      endCp > startCp &&
      terminator !== undefined &&
      terminator.char === 0x0A &&
      beforeTerminator !== undefined &&
      beforeTerminator.char === 0x0D;
    if (endsWithCrLf) {
      // "\r\n" is a two-code-point terminator: drop the '\r' as well.
      endCp -= 1;
    }
    return this.sliceByCp(startCp, endCp);
  }
}
/**
 * Convenience factory: wraps `s` in a new `SourceText`, so call sites can
 * skip the `new` keyword.
 */
export function sourceText(s: string) {
  const text = new SourceText(s);
  return text;
}
/**
 * Tells whether `char` is one of the four ASCII whitespace code points this
 * module recognizes: space, horizontal tab, line feed, or carriage return.
 */
export function isWhitespace(char: CodePoint): boolean {
  switch (char) {
    case 0x20: // ' '
    case 0x09: // '\t'
    case 0x0A: // '\n'
    case 0x0D: // '\r'
      return true;
    default:
      return false;
  }
}
/**
 * Tells whether `char` is an ASCII decimal digit, i.e. lies in the inclusive
 * range '0' (0x30) through '9' (0x39).
 */
export function isDigit(char: CodePoint): boolean {
  const zero = 0x30; // '0'
  const nine = 0x39; // '9'
  return zero <= char && char <= nine;
}
// A region of source text together with a line/column pair.
// `start` and `end` are typed as StringIndex (UTF-16 indices), unlike
// SourceLocation, whose `index` is a CodePointIndex.
// NOTE(review): nothing visible here constructs a Span, so whether `end` is
// exclusive, and whether `line`/`column` are 1-based and describe `start`, is
// unconfirmed - verify against the scanner that produces these.
export type Span = {
start: StringIndex,
end: StringIndex,
line: number,
column: number,
}
// A position in the source, given both as a linear code-point index and as a
// line/column pair.
export type SourceLocation = {
index: CodePointIndex; // linear index, typed as an index into the code-point array
line: number; // 1-based
column: number; // 1-based
};