Sketch basic scanner
This commit is contained in:
parent
99cd517a58
commit
eb6ade5a3d
7 changed files with 836 additions and 6 deletions
125
src/parser/source_text.ts
Normal file
125
src/parser/source_text.ts
Normal file
|
|
@ -0,0 +1,125 @@
|
|||
|
||||
// === Char type ===
|
||||
/** Index into a JavaScript string, counted in UTF-16 code units. */
export type StringIndex = number; // UTF-16 index into string
|
||||
/** Index into an array of code points (as opposed to an index into the underlying string). */
export type CodePointIndex = number; // index into array of code-points
|
||||
/**
 * A Unicode code point, i.e. the kind of value `s.codePointAt(i)` yields for
 * a string `s`. Could also be named `UnicodeCodePoint`.
 */
export type CodePoint = number;
|
||||
|
||||
/**
 * Returns the Unicode code point of the first character of `c`.
 *
 * Intended for writing code-point constants readably, e.g. `char('a')`.
 * Only the first code point is inspected; any further characters in `c`
 * are ignored.
 *
 * @param c - A non-empty string.
 * @returns The code point at the start of `c`.
 * @throws RangeError if `c` is empty (there is no code point to return).
 */
export function char(c: string): CodePoint {
  // `codePointAt` yields `undefined` past the end of the string; surface that
  // as an error instead of leaking `undefined` through a `number` return type.
  const codePoint = c.codePointAt(0);
  if (codePoint === undefined) {
    throw new RangeError("char(): expected a non-empty string");
  }
  return codePoint;
}
|
||||
|
||||
/** A code point together with its position in the string it was read from. */
export type CodePointRef = {
  /** The code point value. */
  char: CodePoint;
  /** UTF-16 index into the string for this code point. */
  offset: StringIndex;
};
|
||||
|
||||
// === Source Text ===
|
||||
/**
 * A code-point-indexed view over a piece of source text.
 *
 * The raw input is NFC-normalized and decoded once into `chars`, where every
 * entry holds a code point and the UTF-16 offset at which it starts in the
 * normalized string. Line starts are recorded during the same pass, treating
 * `\n`, a lone `\r`, and `\r\n` each as a single line terminator.
 */
export class SourceText {
  /** The NFC-normalized source string; all `offset` values index into it. */
  readonly source: string;

  // TODO: Later you can try to change this to two `Uint32Array`s - one for codepoints (each 20 bit but whatever), the other for pointers to original string.
  /** One entry per code point of `source`, in order of appearance. */
  readonly chars: CodePointRef[];

  /** Stores the CodePointIndex where each line begins. */
  readonly lineStarts: CodePointIndex[];

  /**
   * Normalizes `rawSource` to NFC and builds the code-point and line tables.
   *
   * @param rawSource - The text to index.
   */
  constructor(rawSource: string) {
    const LF = 0x0a; // '\n'
    const CR = 0x0d; // '\r'

    const source = rawSource.normalize('NFC');

    this.source = source;
    this.chars = [];
    this.lineStarts = [0]; // Line 1 always starts at index 0

    let i = 0;
    while (i < source.length) {
      const codePoint = source.codePointAt(i);
      // Cannot happen while `i < source.length`; this only narrows the type.
      if (codePoint === undefined) break;

      const cpIndex = this.chars.length;
      this.chars.push({ char: codePoint, offset: i });

      // Code points above the BMP occupy two UTF-16 code units (a surrogate pair).
      i += codePoint > 0xffff ? 2 : 1;

      if (codePoint === LF) {
        // Found a newline, the NEXT char starts a new line.
        this.lineStarts.push(cpIndex + 1);
      } else if (codePoint === CR) {
        // A lone CR (classic Mac) ends the line by itself. For CRLF we record
        // nothing here: the LF handled on the next iteration ends the line.
        const nextIsLf = i < source.length && source.charCodeAt(i) === LF;
        if (!nextIsLf) {
          this.lineStarts.push(cpIndex + 1);
        }
      }
    }
  }

  /** Number of code points in the source (not the UTF-16 length). */
  get length(): number {
    return this.chars.length;
  }

  /**
   * Returns the text covered by the half-open code-point range `[start, end)`.
   *
   * Out-of-range arguments are handled without throwing: a `start` that does
   * not address a code point, or an `end` at or before `start` (including
   * negative values), yields `""`; an `end` past the last code point is
   * treated as the end of the source.
   *
   * @param start - Index of the first code point to include.
   * @param end - Index one past the last code point to include.
   * @returns The corresponding substring of `source`.
   */
  sliceByCp(start: number, end: number): string {
    const startRef = this.chars[start];
    // Handle out of bounds gracefully
    if (startRef === undefined) return "";

    // Empty or inverted range. This also catches a negative `end`, which
    // would otherwise index `chars` out of bounds.
    if (end <= start) return "";

    const endRef = this.chars[end];
    const endOff = endRef === undefined ? this.source.length : endRef.offset;

    return this.source.slice(startRef.offset, endOff);
  }

  // Converts a linear Code Point Index into SourceLocation
  // getLocation(index: CodePointIndex): SourceLocation {
  //   // TODO: can be implemented either by a linear or binary search.
  //   return (0 as any);
  // }

  /**
   * Returns the text of a specific line, without its line terminator.
   *
   * @param line - 1-based line number.
   * @returns The line's text with the trailing `\n`, `\r`, or `\r\n` removed,
   *   or `""` if `line` does not name an existing line.
   */
  getLineText(line: number): string {
    const LF = 0x0a; // '\n'
    const CR = 0x0d; // '\r'

    const lineIndex = line - 1;
    const startCp = this.lineStarts[lineIndex];
    // Covers line < 1, line past the last one, and non-integer input.
    if (startCp === undefined) return "";

    const nextStart = this.lineStarts[lineIndex + 1];
    if (nextStart === undefined) {
      // Last line: nothing follows it, so there is no terminator to strip.
      return this.sliceByCp(startCp, this.chars.length);
    }

    // -1 to exclude the terminator char that immediately precedes the next line.
    let endCp = nextStart - 1;

    // For CRLF the terminator is two code points: the line start was recorded
    // after the LF, so the CR is still inside the range and must be dropped too.
    const terminator = this.chars[endCp];
    const beforeTerminator = this.chars[endCp - 1];
    if (
      endCp > startCp &&
      terminator !== undefined &&
      terminator.char === LF &&
      beforeTerminator !== undefined &&
      beforeTerminator.char === CR
    ) {
      endCp -= 1;
    }

    return this.sliceByCp(startCp, endCp);
  }
}
|
||||
|
||||
/**
 * Convenience factory: wraps `s` in a new `SourceText`.
 *
 * @param s - The raw source string.
 * @returns A `SourceText` built from `s`.
 */
export function sourceText(s: string) {
  const wrapped = new SourceText(s);
  return wrapped;
}
|
||||
|
||||
|
||||
/**
 * Tells whether `char` is one of the four whitespace code points recognized
 * here: space, horizontal tab, line feed, or carriage return.
 *
 * @param char - The code point to classify.
 * @returns `true` for U+0020, U+0009, U+000A and U+000D; `false` otherwise.
 */
export function isWhitespace(char: CodePoint): boolean {
  switch (char) {
    case 0x20: // space
    case 0x09: // horizontal tab
    case 0x0a: // line feed
    case 0x0d: // carriage return
      return true;
    default:
      return false;
  }
}
|
||||
|
||||
/**
 * Tells whether `char` is an ASCII decimal digit (`0` through `9`).
 *
 * @param char - The code point to classify.
 * @returns `true` when `char` lies in U+0030..U+0039 inclusive.
 */
export function isDigit(char: CodePoint): boolean {
  const zero = 0x30; // '0'
  const nine = 0x39; // '9'
  if (char < zero) {
    return false;
  }
  return char <= nine;
}
|
||||
|
||||
/**
 * A region of source text plus a line/column position.
 *
 * NOTE(review): whether `end` is exclusive, and whether `line`/`column` are
 * 1-based and refer to `start`, is not established in this file — confirm
 * against the code that produces spans.
 */
export type Span = {
  /** String index where the span begins. */
  start: StringIndex;
  /** String index where the span ends. */
  end: StringIndex;
  /** Line number associated with the span. */
  line: number;
  /** Column number associated with the span. */
  column: number;
};
|
||||
|
||||
/** A position in source text expressed both linearly and as line/column. */
export type SourceLocation = {
  /** Linear position as a code-point index. */
  index: CodePointIndex;
  /** Line number, 1-based. */
  line: number;
  /** Column number, 1-based. */
  column: number;
};
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue