feat: optimize unoptimizable token types
msujew committed Oct 30, 2024
1 parent e1479df commit e1746f8
Showing 3 changed files with 178 additions and 26 deletions.
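
In short: token types whose possible first characters cannot be statically analyzed (unicode-flag regexps, custom pattern functions without a start_chars_hint) previously disabled the first-char optimization for the entire lexer mode. With this commit they are instead appended to every start-character bucket and kept in a new unoptimizedPatterns fallback list, so all other token types stay on the fast path. A minimal sketch of the scenario this enables, using chevrotain's public API (token names and input are illustrative, not part of the diff):

import { createToken, Lexer } from "chevrotain";

const Alpha = createToken({ name: "Alpha", pattern: "a" });
// The unicode flag defeats start-character analysis by regexp-to-ast,
// which before this commit forced the slow path for the whole mode.
const Emoji = createToken({ name: "Emoji", pattern: /😀/u });

const lexer = new Lexer([Alpha, Emoji], { positionTracking: "onlyOffset" });
// "a" still dispatches through the optimized charCode -> patterns map;
// Emoji rides along in every bucket and in the fallback list.
console.log(lexer.tokenize("a").tokens[0].tokenType === Alpha); // true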
73 changes: 57 additions & 16 deletions packages/chevrotain/src/scan/lexer.ts
@@ -66,6 +66,7 @@ export interface IAnalyzeResult {
emptyGroups: { [groupName: string]: IToken[] };
hasCustom: boolean;
canBeOptimized: boolean;
unoptimizedPatterns: IPatternConfig[];
}

export let SUPPORT_STICKY =
@@ -306,6 +307,7 @@ export function analyzeTokenTypes(
});

let canBeOptimized = true;
let unoptimizedPatterns: IPatternConfig[] = [];
let charCodeToPatternIdxToConfig: { [charCode: number]: IPatternConfig[] } =
[];

@@ -317,7 +319,12 @@
if (typeof currTokType.PATTERN === "string") {
const charCode = currTokType.PATTERN.charCodeAt(0);
const optimizedIdx = charCodeToOptimizedIndex(charCode);
addToMapOfArrays(result, optimizedIdx, patternIdxToConfig[idx]);
addToMapOfArrays(
result,
optimizedIdx,
patternIdxToConfig[idx],
unoptimizedPatterns,
);
} else if (isArray(currTokType.START_CHARS_HINT)) {
let lastOptimizedIdx: number;
forEach(currTokType.START_CHARS_HINT, (charOrInt) => {
@@ -336,49 +343,81 @@
result,
currOptimizedIdx,
patternIdxToConfig[idx],
unoptimizedPatterns,
);
}
});
} else if (isRegExp(currTokType.PATTERN)) {
if (currTokType.PATTERN.unicode) {
canBeOptimized = false;
forEach(Object.keys(result), (code) => {
addToMapOfArrays(
result,
Number(code),
patternIdxToConfig[idx],
unoptimizedPatterns,
);
});
unoptimizedPatterns.push(patternIdxToConfig[idx]);
if (options.ensureOptimizations) {
PRINT_ERROR(
`${failedOptimizationPrefixMsg}` +
`\tUnable to analyze < ${currTokType.PATTERN.toString()} > pattern.\n` +
"\tThe regexp unicode flag is not currently supported by the regexp-to-ast library.\n" +
"\tThis will disable the lexer's first char optimizations.\n" +
"\tThis reduces lexer performance.\n" +
"\tFor details See: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#UNICODE_OPTIMIZE",
);
}
canBeOptimized = false;
} else {
const optimizedCodes = getOptimizedStartCodesIndices(
currTokType.PATTERN,
options.ensureOptimizations,
);
/* istanbul ignore if */
// start code will only be empty given an empty regExp or failure of regexp-to-ast library
// the first should be a different validation and the second cannot be tested.
if (isEmpty(optimizedCodes)) {
// we cannot understand what codes may start possible matches
// The optimization correctness requires knowing start codes for ALL patterns.
// Not actually sure this is an error, no debug message
// instead, simply add the token to all known start characters
forEach(Object.keys(result), (code) => {
addToMapOfArrays(
result,
Number(code),
patternIdxToConfig[idx],
unoptimizedPatterns,
);
});
unoptimizedPatterns.push(patternIdxToConfig[idx]);
canBeOptimized = false;
} else {
forEach(optimizedCodes, (code) => {
addToMapOfArrays(
result,
code,
patternIdxToConfig[idx],
unoptimizedPatterns,
);
});
}
forEach(optimizedCodes, (code) => {
addToMapOfArrays(result, code, patternIdxToConfig[idx]);
});
}
} else {
if (options.ensureOptimizations) {
PRINT_ERROR(
`${failedOptimizationPrefixMsg}` +
`\tTokenType: <${currTokType.name}> is using a custom token pattern without providing <start_chars_hint> parameter.\n` +
"\tThis will disable the lexer's first char optimizations.\n" +
"\tThis reduces lexer performance.\n" +
"\tFor details See: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#CUSTOM_OPTIMIZE",
);
}
canBeOptimized = false;
forEach(Object.keys(result), (code) => {
addToMapOfArrays(
result,
Number(code),
patternIdxToConfig[idx],
unoptimizedPatterns,
);
});
unoptimizedPatterns.push(patternIdxToConfig[idx]);
}

return result;
Expand All @@ -389,11 +428,12 @@ export function analyzeTokenTypes(
}

return {
emptyGroups: emptyGroups,
patternIdxToConfig: patternIdxToConfig,
charCodeToPatternIdxToConfig: charCodeToPatternIdxToConfig,
hasCustom: hasCustom,
canBeOptimized: canBeOptimized,
emptyGroups,
patternIdxToConfig,
charCodeToPatternIdxToConfig,
hasCustom,
canBeOptimized,
unoptimizedPatterns,
};
}

@@ -1125,9 +1165,10 @@ function addToMapOfArrays<T>(
map: Record<number, T[]>,
key: number,
value: T,
initial: T[],
): void {
if (map[key] === undefined) {
map[key] = [value];
map[key] = [...initial, value];
} else {
map[key].push(value);
}
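Why addToMapOfArrays gained the initial parameter: a start-character bucket may be created after some token types were already found unanalyzable, and such a bucket must begin with those token types or declaration order (and therefore match priority) would silently change. A standalone sketch of the seeding behavior (string values stand in for IPatternConfig entries):

function addToMapOfArrays<T>(
  map: Record<number, T[]>,
  key: number,
  value: T,
  initial: T[],
): void {
  if (map[key] === undefined) {
    // New bucket: seed it with every previously seen unanalyzable pattern.
    map[key] = [...initial, value];
  } else {
    map[key].push(value);
  }
}

const buckets: Record<number, string[]> = {};
const unoptimized: string[] = [];

addToMapOfArrays(buckets, 97, "Alpha", unoptimized); // bucket for 'a'

// "Delta" cannot be analyzed: append it to all existing buckets,
// then remember it so every future bucket starts with it.
for (const code of Object.keys(buckets)) {
  addToMapOfArrays(buckets, Number(code), "Delta", unoptimized);
}
unoptimized.push("Delta");

addToMapOfArrays(buckets, 98, "Beta", unoptimized); // bucket for 'b'

console.log(buckets[97]); // ["Alpha", "Delta"]
console.log(buckets[98]); // ["Delta", "Beta"], ordering preserved
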
30 changes: 20 additions & 10 deletions packages/chevrotain/src/scan/lexer_public.ts
@@ -97,6 +97,7 @@ export class Lexer {
public lexerDefinitionWarning: ILexerDefinitionError[] = [];

protected patternIdxToConfig: Record<string, IPatternConfig[]> = {};
protected unoptimizedPatterns: Record<string, IPatternConfig[]> = {};
protected charCodeToPatternIdxToConfig: {
[modeName: string]: { [charCode: number]: IPatternConfig[] };
} = {};
@@ -261,6 +262,9 @@
this.charCodeToPatternIdxToConfig[currModName] =
currAnalyzeResult.charCodeToPatternIdxToConfig;

this.unoptimizedPatterns[currModName] =
currAnalyzeResult.unoptimizedPatterns;

this.emptyGroups = assign(
{},
this.emptyGroups,
@@ -344,6 +348,11 @@
});

this.TRACE_INIT("Failed Optimization Warnings", () => {
if (config.ensureOptimizations !== true) {
// Return early
return;
}

const unOptimizedModes = reduce(
this.canModeBeOptimized,
(cannotBeOptimized, canBeOptimized, modeName) => {
@@ -355,7 +364,7 @@
[] as string[],
);

if (config.ensureOptimizations && !isEmpty(unOptimizedModes)) {
if (!isEmpty(unOptimizedModes)) {
throw Error(
`Lexer Modes: < ${unOptimizedModes.join(
", ",
@@ -438,14 +447,13 @@

let currModePatternsLength = 0;
let patternIdxToConfig: IPatternConfig[] = [];
let unoptimizedPatterns: IPatternConfig[] = [];
let currCharCodeToPatternIdxToConfig: {
[charCode: number]: IPatternConfig[];
} = [];

const modeStack: string[] = [];

const emptyArray: IPatternConfig[] = [];
Object.freeze(emptyArray);
let getPossiblePatterns!: (charCode: number) => IPatternConfig[];

function getPossiblePatternsSlow() {
@@ -457,7 +465,7 @@
const possiblePatterns =
currCharCodeToPatternIdxToConfig[optimizedCharIdx];
if (possiblePatterns === undefined) {
return emptyArray;
return unoptimizedPatterns;
} else {
return possiblePatterns;
}
@@ -492,10 +500,12 @@
currCharCodeToPatternIdxToConfig =
this.charCodeToPatternIdxToConfig[newMode];
currModePatternsLength = patternIdxToConfig.length;
const modeCanBeOptimized =
this.canModeBeOptimized[newMode] && this.config.safeMode === false;
unoptimizedPatterns = this.unoptimizedPatterns[newMode];

if (currCharCodeToPatternIdxToConfig && modeCanBeOptimized) {
if (
currCharCodeToPatternIdxToConfig &&
this.config.safeMode === false
) {
getPossiblePatterns = getPossiblePatternsOptimized;
} else {
getPossiblePatterns = getPossiblePatternsSlow;
@@ -508,14 +518,14 @@
currCharCodeToPatternIdxToConfig =
this.charCodeToPatternIdxToConfig[newMode];

unoptimizedPatterns = this.unoptimizedPatterns[newMode];

patternIdxToConfig = this.patternIdxToConfig[newMode];
currModePatternsLength = patternIdxToConfig.length;

currModePatternsLength = patternIdxToConfig.length;
const modeCanBeOptimized =
this.canModeBeOptimized[newMode] && this.config.safeMode === false;

if (currCharCodeToPatternIdxToConfig && modeCanBeOptimized) {
if (currCharCodeToPatternIdxToConfig && this.config.safeMode === false) {
getPossiblePatterns = getPossiblePatternsOptimized;
} else {
getPossiblePatterns = getPossiblePatternsSlow;
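The dispatch half of the change: in optimized mode, a start character with no bucket used to produce a frozen empty candidate list, so unanalyzable tokens could never match there. It now produces the patterns that could not be analyzed, which lets custom and unicode patterns keep matching without dropping the whole mode to getPossiblePatternsSlow. A reduced standalone sketch (hypothetical shape; the real implementation first maps the char code through charCodeToOptimizedIndex):

interface PatternConfig { name: string }

function makeDispatcher(
  buckets: Record<number, PatternConfig[]>,
  unoptimizedPatterns: PatternConfig[],
): (charCode: number) => PatternConfig[] {
  // Unknown start char: fall back to the unanalyzable patterns
  // instead of returning an empty list.
  return (charCode) => buckets[charCode] ?? unoptimizedPatterns;
}

const alpha = { name: "Alpha" };
const delta = { name: "Delta" }; // e.g. a custom pattern function
const dispatch = makeDispatcher({ 97: [alpha, delta] }, [delta]);

console.log(dispatch("a".charCodeAt(0))); // [Alpha, Delta]
console.log(dispatch("d".charCodeAt(0))); // [Delta], no bucket for 'd'
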
101 changes: 101 additions & 0 deletions packages/chevrotain/test/scan/lexer_spec.ts
@@ -2236,6 +2236,7 @@ describe("debugging and messages and optimizations", () => {
});
expect((<any>alphaLexerSafeMode).charCodeToPatternIdxToConfig.defaultMode)
.to.be.empty;
const safeModeResult = alphaLexerSafeMode.tokenize("a");

// compare to safeMode disabled
const alphaLexerNoSafeMode = new Lexer([Alpha], {
@@ -2245,6 +2246,106 @@
(<any>alphaLexerNoSafeMode).charCodeToPatternIdxToConfig
.defaultMode[97][0].tokenType,
).to.equal(Alpha);
const noSafeModeResult = alphaLexerNoSafeMode.tokenize("a");
expect(safeModeResult).to.deep.equal(noSafeModeResult);
});

it("won't optimize with safe mode enabled - multi mode lexer", () => {
const Alpha = createToken({
name: "A",
pattern: /a/,
push_mode: "b",
});
const Beta = createToken({
name: "B",
pattern: /b/,
pop_mode: true,
});
const tokens = {
modes: {
a: [Alpha],
b: [Beta],
},
defaultMode: "a",
};
const text = "abab";
const lexerSafeMode = new Lexer(tokens, {
positionTracking: "onlyOffset",
safeMode: true,
});
expect((<any>lexerSafeMode).charCodeToPatternIdxToConfig.a).to.be.empty;
const safeModeResult = lexerSafeMode.tokenize(text);

// compare to safeMode disabled
const lexerNoSafeMode = new Lexer(tokens, {
positionTracking: "onlyOffset",
});
expect(
(<any>lexerNoSafeMode).charCodeToPatternIdxToConfig.a[97][0].tokenType,
).to.equal(Alpha);
const noSafeModeResult = lexerNoSafeMode.tokenize(text);
expect(safeModeResult).to.deep.equal(noSafeModeResult);
});

context("lexer optimization", () => {
const dFunction = (text: string, offset: number) => {
if (text.charAt(offset) === "d") {
return ["d"] as [string];
} else {
return null;
}
};

for (const [name, pattern] of [
["function", dFunction],
["unicode regexp", /d/u],
["lookbehind regexp", /(?<!a)d/],
]) {
it(`will optimize ${name} pattern`, () => {
const Alpha = createToken({
name: "A",
pattern: "a",
});
const Beta = createToken({
name: "B",
pattern: "b",
});
const Delta = createToken({
name: "D",
pattern,
});
const optimizedLexer = new Lexer([Alpha, Delta, Beta], {
positionTracking: "onlyOffset",
});
// Assert that the pattern will be added to all character codes
// Also assert that the ordering gets preserved
expect(
(<any>optimizedLexer).charCodeToPatternIdxToConfig.defaultMode[
"a".charCodeAt(0)
].map((e: any) => e.tokenType),
).to.deep.equal([Alpha, Delta]);
expect(
(<any>optimizedLexer).charCodeToPatternIdxToConfig.defaultMode[
"b".charCodeAt(0)
].map((e: any) => e.tokenType),
).to.deep.equal([Delta, Beta]);
// The lexer cannot identify that the pattern is only for the character 'd'
expect(
(<any>optimizedLexer).charCodeToPatternIdxToConfig.defaultMode[
"d".charCodeAt(0)
],
).to.be.undefined;
expect(optimizedLexer.tokenize("a").tokens[0].tokenType).to.deep.equal(
Alpha,
);
expect(optimizedLexer.tokenize("b").tokens[0].tokenType).to.deep.equal(
Beta,
);
expect(optimizedLexer.tokenize("d").tokens[0].tokenType).to.deep.equal(
Delta,
);
});
}
});
});

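For contrast with the dFunction cases in the spec above: giving a custom pattern a start_chars_hint keeps it analyzable, so it lands only in its own bucket rather than in every bucket plus the fallback list. A small sketch reusing the spec's dFunction shape (the bucket layout described in the comment is an assumption, not asserted by these tests):

import { createToken, Lexer } from "chevrotain";

const dFunction = (text: string, offset: number) =>
  text.charAt(offset) === "d" ? (["d"] as [string]) : null;

const DeltaHinted = createToken({
  name: "D",
  pattern: dFunction,
  start_chars_hint: ["d"], // tells the analyzer where matches may begin
});

const lexer = new Lexer([DeltaHinted], { positionTracking: "onlyOffset" });
console.log(lexer.tokenize("d").tokens[0].tokenType === DeltaHinted); // true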
