From 4abc531213ec074b87cccb1a39c06bdfc0dec5e4 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Sat, 27 Jun 2020 18:53:09 +0200 Subject: [PATCH] Canonicalize to PHP 8 comment token format The trailing newline is no longer part of the comment token. --- lib/PhpParser/Lexer.php | 53 ++++++++----------- test/PhpParser/LexerTest.php | 21 ++++---- test/PhpParser/NodeAbstractTest.php | 10 ++-- test/PhpParser/ParserTest.php | 8 +-- .../formatPreservation/classMethodNop.test | 7 +-- 5 files changed, 47 insertions(+), 52 deletions(-) diff --git a/lib/PhpParser/Lexer.php b/lib/PhpParser/Lexer.php index 694a848..da36f2f 100644 --- a/lib/PhpParser/Lexer.php +++ b/lib/PhpParser/Lexer.php @@ -89,7 +89,7 @@ class Lexer error_clear_last(); $this->tokens = @token_get_all($code); - $this->handleErrors($errorHandler); + $this->postprocessTokens($errorHandler); if (false !== $scream) { ini_set('xdebug.scream', $scream); @@ -131,40 +131,14 @@ class Lexer && substr($token[1], -2) !== '*/'; } - /** - * Check whether an error *may* have occurred during tokenization. - * - * @return bool - */ - private function errorMayHaveOccurred() : bool { - if (defined('HHVM_VERSION')) { - // In HHVM token_get_all() does not throw warnings, so we need to conservatively - // assume that an error occurred - return true; - } - - if (PHP_VERSION_ID >= 80000) { - // PHP 8 converts the "bad character" case into a parse error, rather than treating - // it as a lexing warning. To preserve previous behavior, we need to assume that an - // error occurred. - // TODO: We should handle this the same way as PHP 8: Only generate T_BAD_CHARACTER - // token here (for older PHP versions) and leave generationg of the actual parse error - // to the parser. This will also save the full token scan on PHP 8 here. - return true; - } - - return null !== error_get_last(); - } - - protected function handleErrors(ErrorHandler $errorHandler) { - if (!$this->errorMayHaveOccurred()) { - return; - } - + protected function postprocessTokens(ErrorHandler $errorHandler) { // PHP's error handling for token_get_all() is rather bad, so if we want detailed // error information we need to compute it ourselves. Invalid character errors are // detected by finding "gaps" in the token array. Unterminated comments are detected // by checking if a trailing comment has a "*/" at the end. + // + // Additionally, we canonicalize to the PHP 8 comment format here, which does not include + // the trailing whitespace anymore $filePos = 0; $line = 1; @@ -178,6 +152,23 @@ class Lexer $this->handleInvalidCharacterRange($filePos, $filePos + 1, $line, $errorHandler); } + if ($token[0] === \T_COMMENT && preg_match('/(\r\n|\n|\r)$/D', $token[1], $matches)) { + $trailingNewline = $matches[0]; + $token[1] = substr($token[1], 0, -strlen($trailingNewline)); + $this->tokens[$i] = $token; + if (isset($this->tokens[$i + 1]) && $this->tokens[$i + 1][0] === \T_WHITESPACE) { + // Move trailing newline into following T_WHITESPACE token, if it already exists. + $this->tokens[$i + 1][1] = $trailingNewline . $this->tokens[$i + 1][1]; + $this->tokens[$i + 1][2]--; + } else { + // Otherwise, we need to create a new T_WHITESPACE token. + array_splice($this->tokens, $i + 1, 0, [ + [\T_WHITESPACE, $trailingNewline, $line], + ]); + $numTokens++; + } + } + $tokenValue = \is_string($token) ? $token : $token[1]; $tokenLen = \strlen($tokenValue); diff --git a/test/PhpParser/LexerTest.php b/test/PhpParser/LexerTest.php index 9c34960..2e487d5 100644 --- a/test/PhpParser/LexerTest.php +++ b/test/PhpParser/LexerTest.php @@ -124,12 +124,12 @@ class LexerTest extends \PHPUnit\Framework\TestCase 'comments' => [ new Comment('/* comment */', 1, 6, 1, 1, 18, 1), - new Comment('// comment' . "\n", - 1, 20, 3, 2, 30, 3), + new Comment('// comment', + 1, 20, 3, 1, 29, 3), new Comment\Doc('/** docComment 1 */', - 2, 31, 4, 2, 49, 4), + 2, 31, 5, 2, 49, 5), new Comment\Doc('/** docComment 2 */', - 2, 50, 5, 2, 68, 5), + 2, 50, 6, 2, 68, 6), ], ], ['endLine' => 2] @@ -185,11 +185,11 @@ class LexerTest extends \PHPUnit\Framework\TestCase ], [ Tokens::T_CONSTANT_ENCAPSED_STRING, '"b"', - ['startTokenPos' => 5], ['endTokenPos' => 5] + ['startTokenPos' => 6], ['endTokenPos' => 6] ], [ ord(';'), ';', - ['startTokenPos' => 6], ['endTokenPos' => 6] + ['startTokenPos' => 7], ['endTokenPos' => 7] ], ] ], @@ -251,14 +251,17 @@ class LexerTest extends \PHPUnit\Framework\TestCase } public function testGetTokens() { - $code = 'assertInstanceOf(Stmt\Echo_::class, $echo); $this->assertEquals([ 'comments' => [ - new Comment("// Line\n", - 4, 49, 12, 5, 56, 12), - new Comment("// Comments\n", - 5, 61, 14, 6, 72, 14), + new Comment("// Line", + 4, 49, 12, 4, 55, 12), + new Comment("// Comments", + 5, 61, 14, 5, 71, 14), ], 'startLine' => 6, 'endLine' => 6, diff --git a/test/code/formatPreservation/classMethodNop.test b/test/code/formatPreservation/classMethodNop.test index d353e65..e2ad5ae 100644 --- a/test/code/formatPreservation/classMethodNop.test +++ b/test/code/formatPreservation/classMethodNop.test @@ -15,8 +15,8 @@ class Foo { public function __construct() { // I'm just a comment - - $foo; } + $foo; + } } -----