Insert T_BAD_CHARACTER tokens for missing characters

The token stream should cover all characters in the original code,
so insert a dummy token for each missing illegal character. We should
really be doing this in token_get_all() as well.
This commit is contained in:
Nikita Popov 2019-06-30 11:43:48 +02:00
parent a4b43edb03
commit b9b45dd2bc
2 changed files with 52 additions and 39 deletions

View File

@ -6,6 +6,11 @@ use PhpParser\Parser\Tokens;
class Lexer
{
/* Token ID used for illegal characters that are part of the token stream. These are dropped by token_get_all(),
* but we restore them here to make sure that the tokens cover the full original text, and to prevent
* file positions from going out of sync. */
const T_BAD_CHARACTER = -1;
protected $code;
protected $tokens;
protected $pos;
@ -40,7 +45,7 @@ class Lexer
// map of tokens to drop while lexing (the map is only used for isset lookup,
// that's why the value is simply set to 1; the value is never actually used.)
$this->dropTokens = array_fill_keys(
[\T_WHITESPACE, \T_OPEN_TAG, \T_COMMENT, \T_DOC_COMMENT], 1
[\T_WHITESPACE, \T_OPEN_TAG, \T_COMMENT, \T_DOC_COMMENT, self::T_BAD_CHARACTER], 1
);
$defaultAttributes = ['comments', 'startLine', 'endLine'];
@ -92,13 +97,9 @@ class Lexer
}
private function handleInvalidCharacterRange($start, $end, $line, ErrorHandler $errorHandler) {
$tokens = [];
for ($i = $start; $i < $end; $i++) {
$chr = $this->code[$i];
if ($chr === 'b' || $chr === 'B') {
// HHVM does not treat b" tokens correctly, so ignore these
continue;
}
if ($chr === "\0") {
// PHP cuts error message after null byte, so need special case
$errorMsg = 'Unexpected null byte';
@ -108,6 +109,7 @@ class Lexer
);
}
$tokens[] = [self::T_BAD_CHARACTER, $chr, $line];
$errorHandler->handleError(new Error($errorMsg, [
'startLine' => $line,
'endLine' => $line,
@ -115,6 +117,7 @@ class Lexer
'endFilePos' => $i,
]));
}
return $tokens;
}
/**
@ -155,16 +158,22 @@ class Lexer
$filePos = 0;
$line = 1;
foreach ($this->tokens as $token) {
$numTokens = \count($this->tokens);
for ($i = 0; $i < $numTokens; $i++) {
$token = $this->tokens[$i];
$tokenValue = \is_string($token) ? $token : $token[1];
$tokenLen = \strlen($tokenValue);
if (substr($this->code, $filePos, $tokenLen) !== $tokenValue) {
// Something is missing, must be an invalid character
$nextFilePos = strpos($this->code, $tokenValue, $filePos);
$this->handleInvalidCharacterRange(
$badCharTokens = $this->handleInvalidCharacterRange(
$filePos, $nextFilePos, $line, $errorHandler);
$filePos = (int) $nextFilePos;
array_splice($this->tokens, $i, 0, $badCharTokens);
$numTokens += \count($badCharTokens);
$i += \count($badCharTokens);
}
$filePos += $tokenLen;
@ -187,8 +196,9 @@ class Lexer
$this->tokens[] = [$isDocComment ? \T_DOC_COMMENT : \T_COMMENT, $comment, $line];
} else {
// Invalid characters at the end of the input
$this->handleInvalidCharacterRange(
$badCharTokens = $this->handleInvalidCharacterRange(
$filePos, \strlen($this->code), $line, $errorHandler);
$this->tokens = array_merge($this->tokens, $badCharTokens);
}
return;
}

View File

@ -32,24 +32,25 @@ $a = 42;
@@{ "\1" }@@
$b = 24;
-----
!!positions
Unexpected character "" (ASCII 1) from 4:1 to 4:1
array(
0: Stmt_Expression(
expr: Expr_Assign(
var: Expr_Variable(
0: Stmt_Expression[3:1 - 3:8](
expr: Expr_Assign[3:1 - 3:7](
var: Expr_Variable[3:1 - 3:2](
name: a
)
expr: Scalar_LNumber(
expr: Scalar_LNumber[3:6 - 3:7](
value: 42
)
)
)
1: Stmt_Expression(
expr: Expr_Assign(
var: Expr_Variable(
1: Stmt_Expression[5:1 - 5:8](
expr: Expr_Assign[5:1 - 5:7](
var: Expr_Variable[5:1 - 5:2](
name: b
)
expr: Scalar_LNumber(
expr: Scalar_LNumber[5:6 - 5:7](
value: 24
)
)
@ -62,24 +63,25 @@ $a = 42;
@@{ "\0" }@@
$b = 24;
-----
!!positions
Unexpected null byte from 4:1 to 4:1
array(
0: Stmt_Expression(
expr: Expr_Assign(
var: Expr_Variable(
0: Stmt_Expression[3:1 - 3:8](
expr: Expr_Assign[3:1 - 3:7](
var: Expr_Variable[3:1 - 3:2](
name: a
)
expr: Scalar_LNumber(
expr: Scalar_LNumber[3:6 - 3:7](
value: 42
)
)
)
1: Stmt_Expression(
expr: Expr_Assign(
var: Expr_Variable(
1: Stmt_Expression[5:1 - 5:8](
expr: Expr_Assign[5:1 - 5:7](
var: Expr_Variable[5:1 - 5:2](
name: b
)
expr: Scalar_LNumber(
expr: Scalar_LNumber[5:6 - 5:7](
value: 24
)
)
@ -94,35 +96,36 @@ $b = 2;
@@{ "\2" }@@
$c = 3;
-----
Unexpected character "@@{ "\1" }@@" (ASCII 1) from 4:1 to 4:1
Unexpected character "@@{ "\2" }@@" (ASCII 2) from 6:1 to 6:1
!!positions
Unexpected character "" (ASCII 1) from 4:1 to 4:1
Unexpected character "" (ASCII 2) from 6:1 to 6:1
array(
0: Stmt_Expression(
expr: Expr_Assign(
var: Expr_Variable(
0: Stmt_Expression[3:1 - 3:7](
expr: Expr_Assign[3:1 - 3:6](
var: Expr_Variable[3:1 - 3:2](
name: a
)
expr: Scalar_LNumber(
expr: Scalar_LNumber[3:6 - 3:6](
value: 1
)
)
)
1: Stmt_Expression(
expr: Expr_Assign(
var: Expr_Variable(
1: Stmt_Expression[5:1 - 5:7](
expr: Expr_Assign[5:1 - 5:6](
var: Expr_Variable[5:1 - 5:2](
name: b
)
expr: Scalar_LNumber(
expr: Scalar_LNumber[5:6 - 5:6](
value: 2
)
)
)
2: Stmt_Expression(
expr: Expr_Assign(
var: Expr_Variable(
2: Stmt_Expression[7:1 - 7:7](
expr: Expr_Assign[7:1 - 7:6](
var: Expr_Variable[7:1 - 7:2](
name: c
)
expr: Scalar_LNumber(
expr: Scalar_LNumber[7:6 - 7:6](
value: 3
)
)