From b9b45dd2bcce5dacc94dd853732d9a89a28fe606 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Sun, 30 Jun 2019 11:43:48 +0200 Subject: [PATCH] Insert T_BAD_CHARACTER tokens for missing characters The token stream should cover all characters in the original code, so insert a dummy token for each illegal character that is missing from it. We should really be doing this in token_get_all() as well. --- lib/PhpParser/Lexer.php | 28 ++++++--- .../parser/errorHandling/lexerErrors.test | 63 ++++++++++--------- 2 files changed, 52 insertions(+), 39 deletions(-) diff --git a/lib/PhpParser/Lexer.php b/lib/PhpParser/Lexer.php index dc5e533..51ffd1e 100644 --- a/lib/PhpParser/Lexer.php +++ b/lib/PhpParser/Lexer.php @@ -6,6 +6,11 @@ use PhpParser\Parser\Tokens; class Lexer { + /* Token ID used for illegal characters part of the token stream. These are dropped by token_get_all(), + * but we restore them here to make sure that the tokens cover the full original text, and to prevent + * file positions from going out of sync. */ + const T_BAD_CHARACTER = -1; + protected $code; protected $tokens; protected $pos; @@ -40,7 +45,7 @@ class Lexer // map of tokens to drop while lexing (the map is only used for isset lookup, // that's why the value is simply set to 1; the value is never actually used.) 
$this->dropTokens = array_fill_keys( - [\T_WHITESPACE, \T_OPEN_TAG, \T_COMMENT, \T_DOC_COMMENT], 1 + [\T_WHITESPACE, \T_OPEN_TAG, \T_COMMENT, \T_DOC_COMMENT, self::T_BAD_CHARACTER], 1 ); $defaultAttributes = ['comments', 'startLine', 'endLine']; @@ -92,13 +97,9 @@ class Lexer } private function handleInvalidCharacterRange($start, $end, $line, ErrorHandler $errorHandler) { + $tokens = []; for ($i = $start; $i < $end; $i++) { $chr = $this->code[$i]; - if ($chr === 'b' || $chr === 'B') { - // HHVM does not treat b" tokens correctly, so ignore these - continue; - } - if ($chr === "\0") { // PHP cuts error message after null byte, so need special case $errorMsg = 'Unexpected null byte'; @@ -108,6 +109,7 @@ class Lexer ); } + $tokens[] = [self::T_BAD_CHARACTER, $chr, $line]; $errorHandler->handleError(new Error($errorMsg, [ 'startLine' => $line, 'endLine' => $line, @@ -115,6 +117,7 @@ class Lexer 'endFilePos' => $i, ])); } + return $tokens; } /** @@ -155,16 +158,22 @@ class Lexer $filePos = 0; $line = 1; - foreach ($this->tokens as $token) { + $numTokens = \count($this->tokens); + for ($i = 0; $i < $numTokens; $i++) { + $token = $this->tokens[$i]; $tokenValue = \is_string($token) ? $token : $token[1]; $tokenLen = \strlen($tokenValue); if (substr($this->code, $filePos, $tokenLen) !== $tokenValue) { // Something is missing, must be an invalid character $nextFilePos = strpos($this->code, $tokenValue, $filePos); - $this->handleInvalidCharacterRange( + $badCharTokens = $this->handleInvalidCharacterRange( $filePos, $nextFilePos, $line, $errorHandler); $filePos = (int) $nextFilePos; + + array_splice($this->tokens, $i, 0, $badCharTokens); + $numTokens += \count($badCharTokens); + $i += \count($badCharTokens); } $filePos += $tokenLen; @@ -187,8 +196,9 @@ class Lexer $this->tokens[] = [$isDocComment ? 
\T_DOC_COMMENT : \T_COMMENT, $comment, $line]; } else { // Invalid characters at the end of the input - $this->handleInvalidCharacterRange( + $badCharTokens = $this->handleInvalidCharacterRange( $filePos, \strlen($this->code), $line, $errorHandler); + $this->tokens = array_merge($this->tokens, $badCharTokens); } return; } diff --git a/test/code/parser/errorHandling/lexerErrors.test b/test/code/parser/errorHandling/lexerErrors.test index 163b743..7ff27eb 100644 --- a/test/code/parser/errorHandling/lexerErrors.test +++ b/test/code/parser/errorHandling/lexerErrors.test @@ -32,24 +32,25 @@ $a = 42; @@{ "\1" }@@ $b = 24; ----- +!!positions Unexpected character "" (ASCII 1) from 4:1 to 4:1 array( - 0: Stmt_Expression( - expr: Expr_Assign( - var: Expr_Variable( + 0: Stmt_Expression[3:1 - 3:8]( + expr: Expr_Assign[3:1 - 3:7]( + var: Expr_Variable[3:1 - 3:2]( name: a ) - expr: Scalar_LNumber( + expr: Scalar_LNumber[3:6 - 3:7]( value: 42 ) ) ) - 1: Stmt_Expression( - expr: Expr_Assign( - var: Expr_Variable( + 1: Stmt_Expression[5:1 - 5:8]( + expr: Expr_Assign[5:1 - 5:7]( + var: Expr_Variable[5:1 - 5:2]( name: b ) - expr: Scalar_LNumber( + expr: Scalar_LNumber[5:6 - 5:7]( value: 24 ) ) @@ -62,24 +63,25 @@ $a = 42; @@{ "\0" }@@ $b = 24; ----- +!!positions Unexpected null byte from 4:1 to 4:1 array( - 0: Stmt_Expression( - expr: Expr_Assign( - var: Expr_Variable( + 0: Stmt_Expression[3:1 - 3:8]( + expr: Expr_Assign[3:1 - 3:7]( + var: Expr_Variable[3:1 - 3:2]( name: a ) - expr: Scalar_LNumber( + expr: Scalar_LNumber[3:6 - 3:7]( value: 42 ) ) ) - 1: Stmt_Expression( - expr: Expr_Assign( - var: Expr_Variable( + 1: Stmt_Expression[5:1 - 5:8]( + expr: Expr_Assign[5:1 - 5:7]( + var: Expr_Variable[5:1 - 5:2]( name: b ) - expr: Scalar_LNumber( + expr: Scalar_LNumber[5:6 - 5:7]( value: 24 ) ) @@ -94,35 +96,36 @@ $b = 2; @@{ "\2" }@@ $c = 3; ----- -Unexpected character "@@{ "\1" }@@" (ASCII 1) from 4:1 to 4:1 -Unexpected character "@@{ "\2" }@@" (ASCII 2) from 6:1 to 6:1 +!!positions 
+Unexpected character "" (ASCII 1) from 4:1 to 4:1 +Unexpected character "" (ASCII 2) from 6:1 to 6:1 array( - 0: Stmt_Expression( - expr: Expr_Assign( - var: Expr_Variable( + 0: Stmt_Expression[3:1 - 3:7]( + expr: Expr_Assign[3:1 - 3:6]( + var: Expr_Variable[3:1 - 3:2]( name: a ) - expr: Scalar_LNumber( + expr: Scalar_LNumber[3:6 - 3:6]( value: 1 ) ) ) - 1: Stmt_Expression( - expr: Expr_Assign( - var: Expr_Variable( + 1: Stmt_Expression[5:1 - 5:7]( + expr: Expr_Assign[5:1 - 5:6]( + var: Expr_Variable[5:1 - 5:2]( name: b ) - expr: Scalar_LNumber( + expr: Scalar_LNumber[5:6 - 5:6]( value: 2 ) ) ) - 2: Stmt_Expression( - expr: Expr_Assign( - var: Expr_Variable( + 2: Stmt_Expression[7:1 - 7:7]( + expr: Expr_Assign[7:1 - 7:6]( + var: Expr_Variable[7:1 - 7:2]( name: c ) - expr: Scalar_LNumber( + expr: Scalar_LNumber[7:6 - 7:6]( value: 3 ) )