Mirror of https://github.com/SillyTavern/SillyTavern.git
Commit: Rewrote lexer modes/tokens to capture errors better
@@ -6,6 +6,7 @@ import { createToken, Lexer } from '../../lib/chevrotain.js';
 const modes = {
     plaintext: 'plaintext_mode',
     macro_def: 'macro_def_mode',
+    macro_identifier_end: 'macro_identifier_end',
     macro_args: 'macro_args_mode',
 };
 
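Side note: in Chevrotain, a multi-mode lexer is built from exactly this kind of map, with each mode name pointing at an ordered token list and one mode marked as the default. A minimal sketch of how the `modes` map above is typically wired up (the token lists are placeholders; only the mode names come from this diff):

    import { Lexer } from '../../lib/chevrotain.js';

    const lexerDefinition = {
        modes: {
            [modes.plaintext]: [/* plaintext tokens */],
            [modes.macro_def]: [/* macro identifier and flag tokens */],
            [modes.macro_identifier_end]: [/* end-of-identifier checks */],
            [modes.macro_args]: [/* argument tokens, nested macros */],
        },
        defaultMode: modes.plaintext,
    };

    const lexer = new Lexer(lexerDefinition);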
@@ -19,9 +20,11 @@ const Tokens = {
         Start: createToken({ name: 'MacroStart', pattern: /\{\{/ }),
         // A separate macro identifier is needed that is similar to the global identifier, but captures the actual macro "name".
         // We need this because this token is going to switch lexer mode, while the general identifier does not.
-        Identifier: createToken({ name: 'MacroIdentifier', pattern: /[a-zA-Z][\w-]*/ }),
         Flags: createToken({ name: 'MacroFlag', pattern: /[!?#~/.$]/ }),
-        // CaptureBeforeEnd: createToken({ name: 'MacroCaptureBeforeEnd', pattern: /.*?(?=\}\})/, pop_mode: true/*, group: Lexer.SKIPPED */ }),
+        Identifier: createToken({ name: 'MacroIdentifier', pattern: /[a-zA-Z][\w-]*/ }),
+        // At the end of an identifier there has to be whitespace, or it must be directly followed by a colon/double-colon separator, an output modifier, or the closing braces
+        EndOfIdentifier: createToken({ name: 'MacroEndOfIdentifier', pattern: /(?:\s+|(?=:{1,2})|(?=[|}]))/, group: Lexer.SKIPPED }),
+        BeforeEnd: createToken({ name: 'MacroBeforeEnd', pattern: /(?=\}\})/, group: Lexer.SKIPPED }),
         End: createToken({ name: 'MacroEnd', pattern: /\}\}/ }),
     },
 
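The new `EndOfIdentifier` token deserves a close look: two of its three alternatives are lookaheads, so it can match without consuming any characters, and `group: Lexer.SKIPPED` keeps it out of the emitted token stream. A quick plain-JS illustration of the pattern's behavior (illustrative only, not part of the commit):

    const endOfIdentifier = /(?:\s+|(?=:{1,2})|(?=[|}]))/;

    // Whitespace is matched and consumed:
    endOfIdentifier.exec(' ::args')[0];  // ' '
    // Before a separator, pipe, or closing brace it matches the empty string:
    endOfIdentifier.exec('::args')[0];   // ''
    endOfIdentifier.exec('}}')[0];       // ''
    // An ordinary character is not a valid end of an identifier:
    endOfIdentifier.exec('abc');         // null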
@@ -46,7 +49,7 @@ const Tokens = {
     // DANGER ZONE: Careful with this token. It is used as a way to pop the current mode if no other token matches.
     // Can be used in modes that don't really have a "defined" end, like when capturing a single argument, an argument list, etc.
     // Has to ALWAYS be the last token.
-    ModePopper: createToken({ name: 'EndMode', pattern: () => [''], pop_mode: true/*, group: Lexer.SKIPPED */ }),
+    ModePopper: createToken({ name: 'ModePopper', pattern: () => [''], pop_mode: true, group: Lexer.SKIPPED }),
 };
 
 /** @type {Map<string,string>} Saves all token definitions that are marked as entering modes */
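`ModePopper` changes in two ways: its token name now matches its variable name, and it is skipped instead of emitted. Its pattern is a Chevrotain custom pattern function rather than a regex; such functions are called with `(text, startOffset)` and return a `RegExpExecArray`-like result or `null`, so `() => ['']` reports a zero-length match at any position. Spelled out (an equivalent, hedged rewrite of the shorthand in the diff):

    const ModePopper = createToken({
        name: 'ModePopper',
        // Always "match" the empty string, consuming no input; this lets the
        // token fire (and pop the mode) whenever nothing else matched first.
        pattern: (text, startOffset) => [''],
        pop_mode: true,
        group: Lexer.SKIPPED,
    });

Because it matches anywhere, the ordering warning in the comments is load-bearing: Chevrotain tries token alternatives in definition order, so this must stay last.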
@@ -67,8 +70,15 @@ const Def = {
         using(Tokens.WhiteSpace),
 
         // Inside a macro, we will match the identifier
-        // Enter 'macro_args' mode automatically at the end of the identifier, to match any optional arguments
-        enter(Tokens.Macro.Identifier, modes.macro_args),
+        // Enter 'macro_identifier_end' mode automatically at the end of the identifier, so we don't match more than one identifier
+        enter(Tokens.Macro.Identifier, modes.macro_identifier_end),
+    ],
+    [modes.macro_identifier_end]: [
+        exits(Tokens.Macro.BeforeEnd, modes.macro_identifier_end),
+
+        // After a macro identifier, there are only a few valid options. We check those before we try to find optional macro args.
+        // It must either be followed by whitespace or a colon/double-colon, which get captured, or by the macro-end braces or an output-modifier pipe.
+        enter(Tokens.Macro.EndOfIdentifier, modes.macro_args, { andExits: modes.macro_identifier_end }),
     ],
     [modes.macro_args]: [
         // Macro args allow nested macros
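Taken together, lexing a simple macro like `{{ a }}` now walks the mode stack as follows (a reconstruction from the definitions above; the exact pop sequence is approximate, but the resulting token stream matches the tests added later in this commit):

    // Input: '{{ a }}'
    // plaintext_mode:       '{{' -> MacroStart, pushes macro_def_mode
    // macro_def_mode:       'a'  -> MacroIdentifier, pushes macro_identifier_end
    // macro_identifier_end: ' '  -> MacroEndOfIdentifier (skipped),
    //                              exits macro_identifier_end, enters macro_args_mode
    // macro_args_mode:      no args; ModePopper (skipped) pops the mode
    //                       '}}' -> MacroEnd
    // Emitted tokens: MacroStart, MacroIdentifier, MacroEnd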
@@ -134,16 +144,22 @@ instance = MacroLexer.instance;
  *
  * Marks the token to **enter** the following lexer mode.
  *
+ * Optionally, you can specify the modes to exit when entering this mode.
+ *
  * @param {TokenType} token - The token to modify
  * @param {string} mode - The mode to set
+ * @param {object} [options={}] - Additional options
+ * @param {string?} [options.andExits=null] - The modes to exit when entering this mode
  * @returns {TokenType} The token again
  */
-function enter(token, mode) {
+function enter(token, mode, { andExits = null } = {}) {
     if (!token) throw new Error('Token must not be undefined');
     if (enterModesMap.has(token.name) && enterModesMap.get(token.name) !== mode) {
         throw new Error(`Token ${token.name} already is set to enter mode ${enterModesMap.get(token.name)}. Token definitions are global, so they cannot be used to lead to different modes.`);
     }
 
+    if (andExits) exits(token, andExits);
+
     token.PUSH_MODE = mode;
     enterModesMap.set(token.name, mode);
     return token;
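Usage of the extended helper, following the diff (the error text is quoted from the `throw` above):

    // The first registration is remembered in enterModesMap:
    enter(Tokens.Macro.Identifier, modes.macro_identifier_end);

    // Registering the same token/mode pair again is harmless,
    // but a different target mode throws:
    enter(Tokens.Macro.Identifier, modes.macro_identifier_end); // ok
    // enter(Tokens.Macro.Identifier, modes.macro_args);
    // -> Error: Token MacroIdentifier already is set to enter mode macro_identifier_end. ...

    // The new options bag lets one token leave a mode while entering another:
    enter(Tokens.Macro.EndOfIdentifier, modes.macro_args, { andExits: modes.macro_identifier_end });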
@@ -47,20 +47,6 @@ describe('MacroLexer', () => {
 
         expect(tokens).toEqual(expectedTokens);
     });
-    // {{ some macro }}
-    it('whitespaces between two valid identifiers will only capture the first as macro identifier', async () => {
-        const input = '{{ some macro }}';
-        const tokens = await runLexerGetTokens(input);
-
-        const expectedTokens = [
-            { type: 'MacroStart', text: '{{' },
-            { type: 'MacroIdentifier', text: 'some' },
-            { type: 'Identifier', text: 'macro' },
-            { type: 'MacroEnd', text: '}}' },
-        ];
-
-        expect(tokens).toEqual(expectedTokens);
-    });
     // {{macro1}}{{macro2}}
     it('should handle multiple sequential macros', async () => {
         const input = '{{macro1}}{{macro2}}';
@@ -75,86 +61,12 @@ describe('MacroLexer', () => {
             { type: 'MacroEnd', text: '}}' },
         ];
 
-        expect(tokens).toEqual(expectedTokens);
-    });
-    // {{my2cents}}
-    it('should allow numerics inside the macro identifier', async () => {
-        const input = '{{my2cents}}';
-        const tokens = await runLexerGetTokens(input);
-
-        const expectedTokens = [
-            { type: 'MacroStart', text: '{{' },
-            { type: 'MacroIdentifier', text: 'my2cents' },
-            { type: 'MacroEnd', text: '}}' },
-        ];
-
-        expect(tokens).toEqual(expectedTokens);
-    });
-    // {{SCREAM}}
-    it('should allow capslock macros', async () => {
-        const input = '{{SCREAM}}';
-        const tokens = await runLexerGetTokens(input);
-
-        const expectedTokens = [
-            { type: 'MacroStart', text: '{{' },
-            { type: 'MacroIdentifier', text: 'SCREAM' },
-            { type: 'MacroEnd', text: '}}' },
-        ];
-
-        expect(tokens).toEqual(expectedTokens);
-    });
-    // {{some-longer-macro}}
-    it('allow dashes in macro identifiers', async () => {
-        const input = '{{some-longer-macro}}';
-        const tokens = await runLexerGetTokens(input);
-
-        const expectedTokens = [
-            { type: 'MacroStart', text: '{{' },
-            { type: 'MacroIdentifier', text: 'some-longer-macro' },
-            { type: 'MacroEnd', text: '}}' },
-        ];
-
-        expect(tokens).toEqual(expectedTokens);
-    });
-    // {{macro!@#%}}
-    it('do not lex special characters as part of the macro identifier', async () => {
-        const input = '{{macro!@#%}}';
-        const tokens = await runLexerGetTokens(input);
-
-        const expectedTokens = [
-            { type: 'MacroStart', text: '{{' },
-            { type: 'MacroIdentifier', text: 'macro' },
-            { type: 'Unknown', text: '!' },
-            { type: 'Unknown', text: '@' },
-            { type: 'Unknown', text: '#' },
-            { type: 'Unknown', text: '%' },
-            { type: 'MacroEnd', text: '}}' },
-        ];
-
-        expect(tokens).toEqual(expectedTokens);
-    });
-    // {{ma!@#%ro}}
-    it('invalid chars in macro identifier are not parsed as valid macro identifier', async () => {
-        const input = '{{ma!@#%ro}}';
-        const tokens = await runLexerGetTokens(input);
-
-        const expectedTokens = [
-            { type: 'MacroStart', text: '{{' },
-            { type: 'MacroIdentifier', text: 'ma' },
-            { type: 'Unknown', text: '!' },
-            { type: 'Unknown', text: '@' },
-            { type: 'Unknown', text: '#' },
-            { type: 'Unknown', text: '%' },
-            { type: 'Identifier', text: 'ro' },
-            { type: 'MacroEnd', text: '}}' },
-        ];
-
         expect(tokens).toEqual(expectedTokens);
     });
 });
 
 describe('Macro Nesting', () => {
     // {{outerMacro {{innerMacro}}}}
     it('should handle nested macros', async () => {
         const input = '{{outerMacro {{innerMacro}}}}';
         const tokens = await runLexerGetTokens(input);
@@ -192,6 +104,115 @@ describe('MacroLexer', () => {
         });
     });
 
+    describe('Macro Identifier', () => {
+        // {{ a }}
+        it('allow one-character macro identifiers', async () => {
+            const input = '{{ a }}';
+            const tokens = await runLexerGetTokens(input);
+
+            const expectedTokens = [
+                { type: 'MacroStart', text: '{{' },
+                { type: 'MacroIdentifier', text: 'a' },
+                { type: 'MacroEnd', text: '}}' },
+            ];
+
+            expect(tokens).toEqual(expectedTokens);
+        });
+        // {{ some macro }}
+        it('whitespaces between two valid identifiers will only capture the first as macro identifier', async () => {
+            const input = '{{ some macro }}';
+            const tokens = await runLexerGetTokens(input);
+
+            const expectedTokens = [
+                { type: 'MacroStart', text: '{{' },
+                { type: 'MacroIdentifier', text: 'some' },
+                { type: 'Identifier', text: 'macro' },
+                { type: 'MacroEnd', text: '}}' },
+            ];
+
+            expect(tokens).toEqual(expectedTokens);
+        });
+        // {{my2cents}}
+        it('should allow numerics inside the macro identifier', async () => {
+            const input = '{{my2cents}}';
+            const tokens = await runLexerGetTokens(input);
+
+            const expectedTokens = [
+                { type: 'MacroStart', text: '{{' },
+                { type: 'MacroIdentifier', text: 'my2cents' },
+                { type: 'MacroEnd', text: '}}' },
+            ];
+
+            expect(tokens).toEqual(expectedTokens);
+        });
+        // {{SCREAM}}
+        it('should allow capslock macro', async () => {
+            const input = '{{SCREAM}}';
+            const tokens = await runLexerGetTokens(input);
+
+            const expectedTokens = [
+                { type: 'MacroStart', text: '{{' },
+                { type: 'MacroIdentifier', text: 'SCREAM' },
+                { type: 'MacroEnd', text: '}}' },
+            ];
+
+            expect(tokens).toEqual(expectedTokens);
+        });
+        // {{some-longer-macro}}
+        it('allow dashes in macro identifiers', async () => {
+            const input = '{{some-longer-macro}}';
+            const tokens = await runLexerGetTokens(input);
+
+            const expectedTokens = [
+                { type: 'MacroStart', text: '{{' },
+                { type: 'MacroIdentifier', text: 'some-longer-macro' },
+                { type: 'MacroEnd', text: '}}' },
+            ];
+
+            expect(tokens).toEqual(expectedTokens);
+        });
+        // {{macro!@#%}}
+        it('do not lex special characters as part of the macro identifier', async () => {
+            const input = '{{macro!@#%}}';
+            const { tokens, errors } = await runLexerGetTokensAndErrors(input);
+
+            const expectedErrors = [
+                { message: 'unexpected character: ->!<- at offset: 7, skipped 4 characters.' },
+            ];
+
+            expect(errors).toMatchObject(expectedErrors);
+
+            const expectedTokens = [
+                { type: 'MacroStart', text: '{{' },
+                { type: 'MacroIdentifier', text: 'macro' },
+                // Do not lex the wrong characters
+                { type: 'MacroEnd', text: '}}' },
+            ];
+
+            expect(tokens).toEqual(expectedTokens);
+        });
+        // {{ma!@#%ro}}
+        it('[Error] invalid chars in macro identifier are not parsed as valid macro identifier', async () => {
+            const input = '{{ma!@#%ro}}';
+            const { tokens, errors } = await runLexerGetTokensAndErrors(input);
+
+            const expectedErrors = [
+                { message: 'unexpected character: ->!<- at offset: 4, skipped 6 characters.' },
+            ];
+
+            expect(errors).toMatchObject(expectedErrors);
+
+            const expectedTokens = [
+                { type: 'MacroStart', text: '{{' },
+                { type: 'MacroIdentifier', text: 'ma' },
+                // Do not lex the wrong characters
+                { type: 'MacroEnd', text: '}}' },
+            ];
+
+            expect(tokens).toEqual(expectedTokens);
+        });
+    });
+
     describe('Macro Arguments', () => {
         // {{setvar::myVar::This is Sparta!}}
         it('should tokenize macros with double colons arguments correctly', async () => {
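The error-asserting tests above use a `runLexerGetTokensAndErrors` helper that this diff does not show. Chevrotain's `tokenize` returns tokens and errors together in its `ILexingResult` (`{ tokens, groups, errors }`), so a plausible sketch might be (the body is an assumption, not the committed code):

    async function runLexerGetTokensAndErrors(input) {
        // MacroLexer.instance appears earlier in this diff; assumed to expose tokenize()
        const result = MacroLexer.instance.tokenize(input);
        // getTestableTokens (final hunk below) reduces this to { tokens, errors }
        return getTestableTokens(result);
    }

The `errors` entries are Chevrotain `ILexingError` objects, which is why the tests can match on `message` strings such as 'unexpected character: ->!<- at offset: 7, skipped 4 characters.'.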
@@ -710,6 +731,30 @@ describe('MacroLexer', () => {
             { type: 'Plaintext', text: '{ { not a macro } }' },
         ];
 
+        expect(tokens).toEqual(expectedTokens);
+    });
+    // invalid {{ 000 }} followed by correct {{ macro }}
+    it('valid macro still works after an invalid macro', async () => {
+        const input = 'invalid {{ 000 }} followed by correct {{ macro }}';
+        const { tokens, errors } = await runLexerGetTokensAndErrors(input);
+
+        const expectedErrors = [
+            { message: 'unexpected character: ->0<- at offset: 11, skipped 3 characters.' },
+        ];
+
+        expect(errors).toMatchObject(expectedErrors);
+
+        const expectedTokens = [
+            { type: 'Plaintext', text: 'invalid ' },
+            { type: 'MacroStart', text: '{{' },
+            // Do not capture '000' as anything, as it's a lexer error
+            { type: 'MacroEnd', text: '}}' },
+            { type: 'Plaintext', text: ' followed by correct ' },
+            { type: 'MacroStart', text: '{{' },
+            { type: 'MacroIdentifier', text: 'macro' },
+            { type: 'MacroEnd', text: '}}' },
+        ];
+
         expect(tokens).toEqual(expectedTokens);
     });
 });
@@ -765,7 +810,7 @@ function getTestableTokens(result) {
     const errors = result.errors;
     const tokens = result.tokens
         // Filter out the mode popper. We don't care about that for testing
-        .filter(token => token.tokenType.name !== 'EndMode')
+        //.filter(token => !['ModePopper', 'BeforeEnd'].includes(token.tokenType.name))
         // Extract relevant properties from tokens for comparison
         .map(token => ({
             type: token.tokenType.name,
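The hunk above cuts off mid-function; for reference, a plausible completion of `getTestableTokens` consistent with what the tests compare (type/text pairs plus raw errors — the return shape is an assumption):

    function getTestableTokens(result) {
        const errors = result.errors;
        const tokens = result.tokens
            // Skipped tokens (ModePopper, BeforeEnd) never reach result.tokens,
            // so the old filter is no longer needed:
            //.filter(token => !['ModePopper', 'BeforeEnd'].includes(token.tokenType.name))
            // Extract relevant properties from tokens for comparison
            .map(token => ({
                type: token.tokenType.name,
                text: token.image, // token.image is the matched source text
            }));
        return { tokens, errors };
    }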