node/lib/internal/test_runner/tap_lexer.js
Mert Can Altın 3c6547fcc8
test_runner: parse non-ascii character correctly
PR-URL: https://github.com/nodejs/node/pull/45736
Reviewed-By: Yagiz Nizipli <yagiz@nizipli.com>
Reviewed-By: Moshe Atlow <moshe@atlow.co.il>
2023-02-18 18:10:16 +00:00

541 lines
12 KiB
JavaScript

'use strict';
const {
ArrayPrototypePop,
ArrayPrototypePush,
MathMax,
SafeSet,
StringPrototypeCodePointAt,
StringPrototypeTrim,
} = primordials;
const {
codes: { ERR_TAP_LEXER_ERROR },
} = require('internal/errors');
const { isZeroWidthCodePoint } = require('internal/util/inspect');
// Values carried by the synthetic end-of-line and end-of-file tokens.
// Both are the empty string, so such tokens can only be told apart (from each
// other and from real input) by their `kind`, never by their `value`.
const kEOL = '';
const kEOF = '';
// Table of token kinds. Each value is the string stored in `Token#kind`.
//
// Emitted by the lexer in this file: EOF, EOL, NEWLINE, NUMERIC, LITERAL,
// WHITESPACE, COMMENT, DASH, PLUS, HASH, ESCAPE, TAP, TAP_VERSION, TAP_PLAN,
// TAP_TEST_OK, TAP_TEST_NOTOK, TAP_YAML_START, TAP_YAML_END and TAP_PRAGMA.
//
// NOTE(review): KEYWORD, UNKNOWN, TAP_TEST_POINT, TAP_SUBTEST_POINT,
// TAP_YAML_BLOCK and TAP_BAIL_OUT are never produced by the lexer below;
// presumably they are used by the parser that consumes this table - confirm.
const TokenKind = {
// Generic tokens
EOF: 'EOF',
EOL: 'EOL',
NEWLINE: 'NewLine',
NUMERIC: 'Numeric',
LITERAL: 'Literal',
KEYWORD: 'Keyword',
WHITESPACE: 'Whitespace',
COMMENT: 'Comment',
DASH: 'Dash',
PLUS: 'Plus',
HASH: 'Hash',
ESCAPE: 'Escape',
UNKNOWN: 'Unknown',
// TAP tokens
TAP: 'TAPKeyword',
TAP_VERSION: 'VersionKeyword',
TAP_PLAN: 'PlanKeyword',
TAP_TEST_POINT: 'TestPointKeyword',
TAP_SUBTEST_POINT: 'SubTestPointKeyword',
TAP_TEST_OK: 'TestOkKeyword',
TAP_TEST_NOTOK: 'TestNotOkKeyword',
TAP_YAML_START: 'YamlStartKeyword',
TAP_YAML_END: 'YamlEndKeyword',
TAP_YAML_BLOCK: 'YamlKeyword',
TAP_PRAGMA: 'PragmaKeyword',
TAP_BAIL_OUT: 'BailOutKeyword',
};
/**
 * A lexical token together with its location in the input.
 *
 * The token must be constructed right after its last character has been
 * consumed: the location is derived from the stream's current position and
 * the length of `value`.
 */
class Token {
  /**
   * @param {object} options
   * @param {string} options.kind One of the `TokenKind` values.
   * @param {string} options.value Text of the token (empty for EOL and EOF).
   * @param {{ input: string, pos: number, column: number, line: number }} options.stream
   *   Input stream, positioned just past the token.
   */
  constructor({ kind, value, stream }) {
    const valueLength = `${value}`.length;

    this.kind = kind;
    this.value = value;
    this.location = {
      line: stream.line,
      // Column of the first character of the token, never below 1.
      column: MathMax(stream.column - valueLength + 1, 1), // 1 based
      // Offset of the first character of the token, never below 0.
      start: MathMax(stream.pos - valueLength, 0), // zero based
      // Offset of the last character; an empty value occupies no character,
      // so its end is the current position itself.
      end: stream.pos - (value === '' ? 0 : 1), // zero based
    };

    // EOF is a special case. It has to be recognised by its kind: the value of
    // an EOF token is the empty string, and a regular literal may legitimately
    // have the text "EOF".
    if (kind === TokenKind.EOF) {
      // We consider EOF to be outside the stream
      const eofPosition = stream.input.length + 1;
      this.location.start = eofPosition;
      this.location.end = eofPosition;
      this.location.column = stream.column + 1; // 1 based
    }
  }
}
/**
 * Cursor over the lexer input that keeps track of the absolute offset as well
 * as the current line and column.
 */
class InputStream {
  /**
   * @param {string} input Text to read from.
   */
  constructor(input) {
    this.input = input;
    this.pos = 0; // Offset of the next unread character
    this.column = 0; // Characters consumed on the current line
    this.line = 1; // Current line number
  }

  /**
   * @returns {boolean} `true` when there is nothing left to read.
   */
  eof() {
    return typeof this.peek() === 'undefined';
  }

  /**
   * Looks at a character without consuming it.
   * @param {number} [offset] Distance from the current position; may be negative.
   * @returns {string|undefined} The character, or `undefined` when out of range.
   */
  peek(offset = 0) {
    const index = this.pos + offset;
    return this.input[index];
  }

  /**
   * Consumes one character and updates the position counters.
   * @returns {string|undefined} The consumed character, or `undefined` at the end.
   */
  next() {
    const current = this.peek();
    if (current !== undefined) {
      this.pos += 1;
      if (current === '\n') {
        // A line feed starts a new line and resets the column counter.
        this.line += 1;
        this.column = 0;
      } else {
        this.column += 1;
      }
    }
    return current;
  }
}
/**
 * Lexer that turns TAP text into tokens.
 *
 * The input is read one character at a time through an `InputStream`; every
 * produced `Token` records its kind, its text and its location.
 */
class TapLexer {
  // Words that `#scanLiteral` hands over to `#scanTAPKeyword`.
  static Keywords = new SafeSet([
    'TAP',
    'version',
    'ok',
    'not',
    '...',
    '---',
    '..',
    'pragma',
    '-',
    '+',
    // NOTE: "Skip", "Todo" and "Bail out!" literals are deferred to the parser
  ]);

  #source = null;
  // Pending escape symbols of the current line (cleared on every new line).
  #escapeStack = [];
  // Last non-whitespace token; used to recognise context-dependent keywords.
  #lastScannedToken = null;

  /**
   * @param {string} source TAP text to tokenize.
   */
  constructor(source) {
    this.#source = new InputStream(source);
    // Start as if a line had just ended, so that keywords which are only
    // valid at the beginning of a line are recognised on the first line too.
    this.#lastScannedToken = this.#createToken(TokenKind.EOL, kEOL);
  }

  /**
   * Tokenizes the whole input.
   * @returns {Token[][]} One chunk per line. A chunk ends with its NEWLINE
   *   token, or with an EOL token when the input does not end with a new
   *   line. The last chunk always contains a single EOF token.
   */
  scan() {
    const tokens = [];
    let chunk = [];

    while (!this.eof()) {
      const token = this.#scanToken();

      // Remember the last scanned token (except for whitespace)
      if (token.kind !== TokenKind.WHITESPACE) {
        this.#lastScannedToken = token;
      }

      ArrayPrototypePush(chunk, token);
      if (token.kind === TokenKind.NEWLINE) {
        // Store the current chunk + NEWLINE token
        ArrayPrototypePush(tokens, chunk);
        chunk = [];
      }
    }

    if (chunk.length > 0) {
      ArrayPrototypePush(chunk, this.#scanEOL());
      ArrayPrototypePush(tokens, chunk);
    }

    // send EOF as a separate chunk
    ArrayPrototypePush(tokens, [this.#scanEOF()]);

    return tokens;
  }

  /**
   * Consumes one character from the input.
   * @returns {string|undefined} The character, or `undefined` at the end.
   */
  next() {
    return this.#source.next();
  }

  /**
   * @returns {boolean} `true` when the whole input has been consumed.
   */
  eof() {
    return this.#source.eof();
  }

  /**
   * Reports a lexer error.
   * @param {string} message Description of the problem.
   * @param {Token} [token] Offending token; its location is reported when
   *   given, otherwise the current position of the input is used.
   * @param {string} [expected] What was expected instead, if known.
   * @throws {ERR_TAP_LEXER_ERROR} Always.
   */
  error(message, token, expected = '') {
    // The input stream has no error reporting of its own, so throw from here.
    const { line, column } = token?.location ?? this.#source;
    const expectation = expected === '' ? '' : `, expected ${expected}`;
    throw new ERR_TAP_LEXER_ERROR(
      `${message} at line ${line}, column ${column}${expectation}`,
    );
  }

  // Builds a token located at the current position of the input stream.
  #createToken(kind, value) {
    return new Token({ kind, value, stream: this.#source });
  }

  // Consumes the next character and dispatches on it to produce one token.
  #scanToken() {
    const char = this.next();

    if (this.#isEOFSymbol(char)) {
      return this.#scanEOF();
    } else if (this.#isNewLineSymbol(char)) {
      return this.#scanNewLine(char);
    } else if (this.#isNumericSymbol(char)) {
      return this.#scanNumeric(char);
    } else if (this.#isDashSymbol(char)) {
      return this.#scanDash(char);
    } else if (this.#isPlusSymbol(char)) {
      return this.#scanPlus(char);
    } else if (this.#isHashSymbol(char)) {
      return this.#scanHash(char);
    } else if (this.#isEscapeSymbol(char)) {
      return this.#scanEscapeSymbol(char);
    } else if (this.#isWhitespaceSymbol(char)) {
      return this.#scanWhitespace(char);
    } else if (this.#isLiteralSymbol(char)) {
      return this.#scanLiteral(char);
    }

    // The position is tracked by the input stream; the offending character
    // has already been consumed, so `column` is its 1-based column.
    throw new ERR_TAP_LEXER_ERROR(
      `Unexpected character: ${char} at line ${this.#source.line}, column ${
        this.#source.column
      }`,
    );
  }

  #scanNewLine(char) {
    // In case of odd number of ESCAPE symbols, we need to clear the remaining
    // escape chars from the stack and start fresh for the next line.
    this.#escapeStack = [];

    return this.#createToken(TokenKind.NEWLINE, char);
  }

  #scanEOL() {
    return this.#createToken(TokenKind.EOL, kEOL);
  }

  #scanEOF() {
    return this.#createToken(TokenKind.EOF, kEOF);
  }

  #scanEscapeSymbol(char) {
    // If the escape symbol has been escaped (by previous symbol),
    // or if the next symbol is a whitespace symbol,
    // then consume it as a literal.
    // The escape symbol itself has already been consumed, so `peek()` is the
    // symbol that follows it.
    if (
      this.#hasTheCurrentCharacterBeenEscaped() ||
      this.#isWhitespaceSymbol(this.#source.peek())
    ) {
      ArrayPrototypePop(this.#escapeStack);
      return this.#createToken(TokenKind.LITERAL, char);
    }

    // Otherwise, consume the escape symbol as an escape symbol that should be
    // ignored by the parser. We also need to push the escape symbol to the
    // escape stack and consume the next character as a literal (done in the
    // next turn)
    ArrayPrototypePush(this.#escapeStack, char);
    return this.#createToken(TokenKind.ESCAPE, char);
  }

  #scanWhitespace(char) {
    return this.#createToken(TokenKind.WHITESPACE, char);
  }

  #scanDash(char) {
    // Peek the next 2 characters and check if, together with the dash that
    // was just consumed, they form a YAML start marker
    const marker = char + this.#source.peek() + this.#source.peek(1);

    if (this.#isYamlStartSymbol(marker)) {
      this.next(); // consume second -
      this.next(); // consume third -

      return this.#createToken(TokenKind.TAP_YAML_START, marker);
    }

    return this.#createToken(TokenKind.DASH, char);
  }

  #scanPlus(char) {
    return this.#createToken(TokenKind.PLUS, char);
  }

  #scanHash(char) {
    // The hash has already been consumed: offset -1 is the hash itself,
    // offset -2 is the character before it.
    const lastCharacter = this.#source.peek(-2);
    const nextToken = this.#source.peek();

    // If we encounter a hash symbol at the beginning of a line,
    // we consider it as a comment
    if (!lastCharacter || this.#isNewLineSymbol(lastCharacter)) {
      return this.#createToken(TokenKind.COMMENT, char);
    }

    // The only valid case where a hash symbol is considered as a hash token
    // is when it's preceded by a whitespace symbol and followed by a non-hash
    // symbol
    if (
      this.#isWhitespaceSymbol(lastCharacter) &&
      !this.#isHashSymbol(nextToken)
    ) {
      return this.#createToken(TokenKind.HASH, char);
    }

    // In every other case the hash symbol is consumed as a literal. If it was
    // escaped, the escape symbol that introduced it is used up.
    if (this.#hasTheCurrentCharacterBeenEscaped()) {
      ArrayPrototypePop(this.#escapeStack);
    }

    return this.#createToken(TokenKind.LITERAL, char);
  }

  #scanLiteral(char) {
    let word = char;
    while (!this.#source.eof()) {
      if (!this.#isLiteralSymbol(this.#source.peek())) {
        break;
      }
      word += this.#source.next();
    }

    word = StringPrototypeTrim(word);

    if (TapLexer.Keywords.has(word)) {
      // A keyword is only a keyword in the right context; otherwise fall
      // through and treat the word as regular text.
      const token = this.#scanTAPKeyword(word);
      if (token) {
        return token;
      }
    }

    if (this.#isYamlEndSymbol(word)) {
      return this.#createToken(TokenKind.TAP_YAML_END, word);
    }

    return this.#createToken(TokenKind.LITERAL, word);
  }

  // Returns the keyword token for `word` when the previously scanned token
  // allows it, `null` otherwise.
  #scanTAPKeyword(word) {
    const lastKind = this.#lastScannedToken.kind;
    const isLastScannedTokenEOLorNewLine =
      TokenKind.EOL === lastKind || TokenKind.NEWLINE === lastKind;

    if (word === 'TAP' && isLastScannedTokenEOLorNewLine) {
      return this.#createToken(TokenKind.TAP, word);
    }

    if (word === 'version' && lastKind === TokenKind.TAP) {
      return this.#createToken(TokenKind.TAP_VERSION, word);
    }

    if (word === '..' && lastKind === TokenKind.NUMERIC) {
      return this.#createToken(TokenKind.TAP_PLAN, word);
    }

    if (word === 'not' && isLastScannedTokenEOLorNewLine) {
      return this.#createToken(TokenKind.TAP_TEST_NOTOK, word);
    }

    if (
      word === 'ok' &&
      (lastKind === TokenKind.TAP_TEST_NOTOK || isLastScannedTokenEOLorNewLine)
    ) {
      return this.#createToken(TokenKind.TAP_TEST_OK, word);
    }

    if (word === 'pragma' && isLastScannedTokenEOLorNewLine) {
      return this.#createToken(TokenKind.TAP_PRAGMA, word);
    }

    return null;
  }

  #scanNumeric(char) {
    let number = char;
    while (!this.#source.eof()) {
      const nextChar = this.#source.peek();
      if (!this.#isNumericSymbol(nextChar)) {
        break;
      }
      number += nextChar;
      this.#source.next();
    }

    return this.#createToken(TokenKind.NUMERIC, number);
  }

  #hasTheCurrentCharacterBeenEscaped() {
    // Use the escapeStack to keep track of the escape characters
    return this.#escapeStack.length > 0;
  }

  #isNumericSymbol(char) {
    return char >= '0' && char <= '9';
  }

  #isLiteralSymbol(char) {
    if (typeof char !== 'string') return false;
    const charCode = StringPrototypeCodePointAt(char);

    if (isZeroWidthCodePoint(charCode)) return false;
    if (this.#isWhitespaceSymbol(char)) return false;

    const MAX_ASCII_CHAR_CODE = 0b111_1111; // ASCII is 7-bit long
    // Allow all non-latin characters.
    if (charCode > MAX_ASCII_CHAR_CODE) return true;

    const ZERO = 48; // 0
    const NINE = 57; // 9
    // Disallow numeric values
    if (charCode >= ZERO && charCode <= NINE) return false;

    // Disallow characters with special meaning in TAP
    const HASH = 35; // #
    const BACKSLASH = 92; // \
    const PLUS = 43; // +
    const DASH = 45; // -

    return charCode !== HASH && charCode !== BACKSLASH &&
           charCode !== PLUS && charCode !== DASH;
  }

  #isWhitespaceSymbol(char) {
    return char === ' ' || char === '\t';
  }

  #isEOFSymbol(char) {
    return char === undefined;
  }

  #isNewLineSymbol(char) {
    return char === '\n' || char === '\r';
  }

  #isHashSymbol(char) {
    return char === '#';
  }

  #isDashSymbol(char) {
    return char === '-';
  }

  #isPlusSymbol(char) {
    return char === '+';
  }

  #isEscapeSymbol(char) {
    return char === '\\';
  }

  #isYamlStartSymbol(char) {
    return char === '---';
  }

  #isYamlEndSymbol(char) {
    return char === '...';
  }
}
// Exported: the lexer class and the table of token kinds it produces.
module.exports = { TapLexer, TokenKind };