Insert T_BAD_CHARACTER tokens for missing characters

The token stream should cover all characters in the original code,
so insert a dummy token for each missing illegal character. We should
really be doing this in token_get_all() as well.
This commit is contained in:
Nikita Popov 2019-06-30 11:43:48 +02:00
parent a4b43edb03
commit b9b45dd2bc
2 changed files with 52 additions and 39 deletions

View File

@ -6,6 +6,11 @@ use PhpParser\Parser\Tokens;
class Lexer class Lexer
{ {
/* Token ID used for illegal characters that are part of the token stream. These are dropped by token_get_all(),
* but we restore them here to make sure that the tokens cover the full original text, and to prevent
* file positions from going out of sync. */
const T_BAD_CHARACTER = -1;
protected $code; protected $code;
protected $tokens; protected $tokens;
protected $pos; protected $pos;
@ -40,7 +45,7 @@ class Lexer
// map of tokens to drop while lexing (the map is only used for isset lookup, // map of tokens to drop while lexing (the map is only used for isset lookup,
// that's why the value is simply set to 1; the value is never actually used.) // that's why the value is simply set to 1; the value is never actually used.)
$this->dropTokens = array_fill_keys( $this->dropTokens = array_fill_keys(
[\T_WHITESPACE, \T_OPEN_TAG, \T_COMMENT, \T_DOC_COMMENT], 1 [\T_WHITESPACE, \T_OPEN_TAG, \T_COMMENT, \T_DOC_COMMENT, self::T_BAD_CHARACTER], 1
); );
$defaultAttributes = ['comments', 'startLine', 'endLine']; $defaultAttributes = ['comments', 'startLine', 'endLine'];
@ -92,13 +97,9 @@ class Lexer
} }
private function handleInvalidCharacterRange($start, $end, $line, ErrorHandler $errorHandler) { private function handleInvalidCharacterRange($start, $end, $line, ErrorHandler $errorHandler) {
$tokens = [];
for ($i = $start; $i < $end; $i++) { for ($i = $start; $i < $end; $i++) {
$chr = $this->code[$i]; $chr = $this->code[$i];
if ($chr === 'b' || $chr === 'B') {
// HHVM does not treat b" tokens correctly, so ignore these
continue;
}
if ($chr === "\0") { if ($chr === "\0") {
// PHP cuts error message after null byte, so need special case // PHP cuts error message after null byte, so need special case
$errorMsg = 'Unexpected null byte'; $errorMsg = 'Unexpected null byte';
@ -108,6 +109,7 @@ class Lexer
); );
} }
$tokens[] = [self::T_BAD_CHARACTER, $chr, $line];
$errorHandler->handleError(new Error($errorMsg, [ $errorHandler->handleError(new Error($errorMsg, [
'startLine' => $line, 'startLine' => $line,
'endLine' => $line, 'endLine' => $line,
@ -115,6 +117,7 @@ class Lexer
'endFilePos' => $i, 'endFilePos' => $i,
])); ]));
} }
return $tokens;
} }
/** /**
@ -155,16 +158,22 @@ class Lexer
$filePos = 0; $filePos = 0;
$line = 1; $line = 1;
foreach ($this->tokens as $token) { $numTokens = \count($this->tokens);
for ($i = 0; $i < $numTokens; $i++) {
$token = $this->tokens[$i];
$tokenValue = \is_string($token) ? $token : $token[1]; $tokenValue = \is_string($token) ? $token : $token[1];
$tokenLen = \strlen($tokenValue); $tokenLen = \strlen($tokenValue);
if (substr($this->code, $filePos, $tokenLen) !== $tokenValue) { if (substr($this->code, $filePos, $tokenLen) !== $tokenValue) {
// Something is missing, must be an invalid character // Something is missing, must be an invalid character
$nextFilePos = strpos($this->code, $tokenValue, $filePos); $nextFilePos = strpos($this->code, $tokenValue, $filePos);
$this->handleInvalidCharacterRange( $badCharTokens = $this->handleInvalidCharacterRange(
$filePos, $nextFilePos, $line, $errorHandler); $filePos, $nextFilePos, $line, $errorHandler);
$filePos = (int) $nextFilePos; $filePos = (int) $nextFilePos;
array_splice($this->tokens, $i, 0, $badCharTokens);
$numTokens += \count($badCharTokens);
$i += \count($badCharTokens);
} }
$filePos += $tokenLen; $filePos += $tokenLen;
@ -187,8 +196,9 @@ class Lexer
$this->tokens[] = [$isDocComment ? \T_DOC_COMMENT : \T_COMMENT, $comment, $line]; $this->tokens[] = [$isDocComment ? \T_DOC_COMMENT : \T_COMMENT, $comment, $line];
} else { } else {
// Invalid characters at the end of the input // Invalid characters at the end of the input
$this->handleInvalidCharacterRange( $badCharTokens = $this->handleInvalidCharacterRange(
$filePos, \strlen($this->code), $line, $errorHandler); $filePos, \strlen($this->code), $line, $errorHandler);
$this->tokens = array_merge($this->tokens, $badCharTokens);
} }
return; return;
} }

View File

@ -32,24 +32,25 @@ $a = 42;
@@{ "\1" }@@ @@{ "\1" }@@
$b = 24; $b = 24;
----- -----
!!positions
Unexpected character "" (ASCII 1) from 4:1 to 4:1 Unexpected character "" (ASCII 1) from 4:1 to 4:1
array( array(
0: Stmt_Expression( 0: Stmt_Expression[3:1 - 3:8](
expr: Expr_Assign( expr: Expr_Assign[3:1 - 3:7](
var: Expr_Variable( var: Expr_Variable[3:1 - 3:2](
name: a name: a
) )
expr: Scalar_LNumber( expr: Scalar_LNumber[3:6 - 3:7](
value: 42 value: 42
) )
) )
) )
1: Stmt_Expression( 1: Stmt_Expression[5:1 - 5:8](
expr: Expr_Assign( expr: Expr_Assign[5:1 - 5:7](
var: Expr_Variable( var: Expr_Variable[5:1 - 5:2](
name: b name: b
) )
expr: Scalar_LNumber( expr: Scalar_LNumber[5:6 - 5:7](
value: 24 value: 24
) )
) )
@ -62,24 +63,25 @@ $a = 42;
@@{ "\0" }@@ @@{ "\0" }@@
$b = 24; $b = 24;
----- -----
!!positions
Unexpected null byte from 4:1 to 4:1 Unexpected null byte from 4:1 to 4:1
array( array(
0: Stmt_Expression( 0: Stmt_Expression[3:1 - 3:8](
expr: Expr_Assign( expr: Expr_Assign[3:1 - 3:7](
var: Expr_Variable( var: Expr_Variable[3:1 - 3:2](
name: a name: a
) )
expr: Scalar_LNumber( expr: Scalar_LNumber[3:6 - 3:7](
value: 42 value: 42
) )
) )
) )
1: Stmt_Expression( 1: Stmt_Expression[5:1 - 5:8](
expr: Expr_Assign( expr: Expr_Assign[5:1 - 5:7](
var: Expr_Variable( var: Expr_Variable[5:1 - 5:2](
name: b name: b
) )
expr: Scalar_LNumber( expr: Scalar_LNumber[5:6 - 5:7](
value: 24 value: 24
) )
) )
@ -94,35 +96,36 @@ $b = 2;
@@{ "\2" }@@ @@{ "\2" }@@
$c = 3; $c = 3;
----- -----
Unexpected character "@@{ "\1" }@@" (ASCII 1) from 4:1 to 4:1 !!positions
Unexpected character "@@{ "\2" }@@" (ASCII 2) from 6:1 to 6:1 Unexpected character "" (ASCII 1) from 4:1 to 4:1
Unexpected character "" (ASCII 2) from 6:1 to 6:1
array( array(
0: Stmt_Expression( 0: Stmt_Expression[3:1 - 3:7](
expr: Expr_Assign( expr: Expr_Assign[3:1 - 3:6](
var: Expr_Variable( var: Expr_Variable[3:1 - 3:2](
name: a name: a
) )
expr: Scalar_LNumber( expr: Scalar_LNumber[3:6 - 3:6](
value: 1 value: 1
) )
) )
) )
1: Stmt_Expression( 1: Stmt_Expression[5:1 - 5:7](
expr: Expr_Assign( expr: Expr_Assign[5:1 - 5:6](
var: Expr_Variable( var: Expr_Variable[5:1 - 5:2](
name: b name: b
) )
expr: Scalar_LNumber( expr: Scalar_LNumber[5:6 - 5:6](
value: 2 value: 2
) )
) )
) )
2: Stmt_Expression( 2: Stmt_Expression[7:1 - 7:7](
expr: Expr_Assign( expr: Expr_Assign[7:1 - 7:6](
var: Expr_Variable( var: Expr_Variable[7:1 - 7:2](
name: c name: c
) )
expr: Scalar_LNumber( expr: Scalar_LNumber[7:6 - 7:6](
value: 3 value: 3
) )
) )