Canonicalize to PHP 8 comment token format

The trailing newline is no longer part of the comment token.
2024-11-30 04:19:30 +01:00 · 2020-06-27 18:53:09 +02:00 · 2020-06-27 18:53:09 +02:00 · 4abc531213
commit 4abc531213
parent b58b19ed1d
5 changed files with 47 additions and 52 deletions
--- a/lib/PhpParser/Lexer.php
+++ b/lib/PhpParser/Lexer.php
@ -89,7 +89,7 @@ class Lexer
        error_clear_last();
        $this->tokens = @token_get_all($code);
-        $this->handleErrors($errorHandler);
+        $this->postprocessTokens($errorHandler);
        if (false !== $scream) {
            ini_set('xdebug.scream', $scream);
@ -131,40 +131,14 @@ class Lexer
            && substr($token[1], -2) !== '*/';
    }
-    /**
+    protected function postprocessTokens(ErrorHandler $errorHandler) {
     * Check whether an error *may* have occurred during tokenization.
     *
     * @return bool
     */
    private function errorMayHaveOccurred() : bool {
        if (defined('HHVM_VERSION')) {
            // In HHVM token_get_all() does not throw warnings, so we need to conservatively
            // assume that an error occurred
            return true;
        }
        if (PHP_VERSION_ID >= 80000) {
            // PHP 8 converts the "bad character" case into a parse error, rather than treating
            // it as a lexing warning. To preserve previous behavior, we need to assume that an
            // error occurred.
            // TODO: We should handle this the same way as PHP 8: Only generate T_BAD_CHARACTER
            // token here (for older PHP versions) and leave generation of the actual parse error
            // to the parser. This will also save the full token scan on PHP 8 here.
            return true;
        }
        return null !== error_get_last();
    }
    protected function handleErrors(ErrorHandler $errorHandler) {
        if (!$this->errorMayHaveOccurred()) {
            return;
        }
        // PHP's error handling for token_get_all() is rather bad, so if we want detailed
        // error information we need to compute it ourselves. Invalid character errors are
        // detected by finding "gaps" in the token array. Unterminated comments are detected
        // by checking if a trailing comment has a "*/" at the end.
        //
        // Additionally, we canonicalize to the PHP 8 comment format here, which does not include
        // the trailing newline anymore.
        $filePos = 0;
        $line = 1;
@ -178,6 +152,23 @@ class Lexer
                $this->handleInvalidCharacterRange($filePos, $filePos + 1, $line, $errorHandler);
            }
            if ($token[0] === \T_COMMENT && preg_match('/(\r\n|\n|\r)$/D', $token[1], $matches)) {
                $trailingNewline = $matches[0];
                $token[1] = substr($token[1], 0, -strlen($trailingNewline));
                $this->tokens[$i] = $token;
                if (isset($this->tokens[$i + 1]) && $this->tokens[$i + 1][0] === \T_WHITESPACE) {
                    // Move trailing newline into following T_WHITESPACE token, if it already exists.
                    $this->tokens[$i + 1][1] = $trailingNewline . $this->tokens[$i + 1][1];
                    $this->tokens[$i + 1][2]--;
                } else {
                    // Otherwise, we need to create a new T_WHITESPACE token.
                    array_splice($this->tokens, $i + 1, 0, [
                        [\T_WHITESPACE, $trailingNewline, $line],
                    ]);
                    $numTokens++;
                }
            }
            $tokenValue = \is_string($token) ? $token : $token[1];
            $tokenLen = \strlen($tokenValue);
--- a/test/PhpParser/LexerTest.php
+++ b/test/PhpParser/LexerTest.php
@ -124,12 +124,12 @@ class LexerTest extends \PHPUnit\Framework\TestCase
                            'comments' => [
                                new Comment('/* comment */',
                                    1, 6, 1, 1, 18, 1),
-                                new Comment('// comment' . "\n",
+                                new Comment('// comment',
-                                    1, 20, 3, 2, 30, 3),
+                                    1, 20, 3, 1, 29, 3),
                                new Comment\Doc('/** docComment 1 */',
-                                    2, 31, 4, 2, 49, 4),
+                                    2, 31, 5, 2, 49, 5),
                                new Comment\Doc('/** docComment 2 */',
-                                    2, 50, 5, 2, 68, 5),
+                                    2, 50, 6, 2, 68, 6),
                            ],
                        ],
                        ['endLine' => 2]
@ -185,11 +185,11 @@ class LexerTest extends \PHPUnit\Framework\TestCase
                    ],
                    [
                        Tokens::T_CONSTANT_ENCAPSED_STRING, '"b"',
-                        ['startTokenPos' => 5], ['endTokenPos' => 5]
+                        ['startTokenPos' => 6], ['endTokenPos' => 6]
                    ],
                    [
                        ord(';'), ';',
-                        ['startTokenPos' => 6], ['endTokenPos' => 6]
+                        ['startTokenPos' => 7], ['endTokenPos' => 7]
                    ],
                ]
            ],
@ -251,14 +251,17 @@ class LexerTest extends \PHPUnit\Framework\TestCase
    }
    public function testGetTokens() {
-        $code = '<?php "a";' . "\n" . '// foo' . "\n" . '"b";';
+        $code = '<?php "a";' . "\n" . '// foo' . "\n" . '// bar' . "\n\n" . '"b";';
        $expectedTokens = [
            [T_OPEN_TAG, '<?php ', 1],
            [T_CONSTANT_ENCAPSED_STRING, '"a"', 1],
            ';',
            [T_WHITESPACE, "\n", 1],
-            [T_COMMENT, '// foo' . "\n", 2],
+            [T_COMMENT, '// foo', 2],
-            [T_CONSTANT_ENCAPSED_STRING, '"b"', 3],
+            [T_WHITESPACE, "\n", 2],
            [T_COMMENT, '// bar', 3],
            [T_WHITESPACE, "\n\n", 3],
            [T_CONSTANT_ENCAPSED_STRING, '"b"', 5],
            ';',
        ];
--- a/test/PhpParser/NodeAbstractTest.php
+++ b/test/PhpParser/NodeAbstractTest.php
@ -307,12 +307,12 @@ PHP;
            "comments": [
                {
                    "nodeType": "Comment",
-                    "text": "\/\/ comment\n",
+                    "text": "\/\/ comment",
                    "line": 2,
                    "filePos": 6,
                    "tokenPos": 1,
-                    "endLine": 3,
+                    "endLine": 2,
-                    "endFilePos": 16,
+                    "endFilePos": 15,
                    "endTokenPos": 1
                },
                {
@ -320,10 +320,10 @@ PHP;
                    "text": "\/** doc comment *\/",
                    "line": 3,
                    "filePos": 17,
-                    "tokenPos": 2,
+                    "tokenPos": 3,
                    "endLine": 3,
                    "endFilePos": 34,
-                    "endTokenPos": 2
+                    "endTokenPos": 3
                }
            ],
            "endLine": 6
--- a/test/PhpParser/ParserTest.php
+++ b/test/PhpParser/ParserTest.php
@ -83,10 +83,10 @@ EOC;
        $this->assertInstanceOf(Stmt\Echo_::class, $echo);
        $this->assertEquals([
            'comments' => [
-                new Comment("// Line\n",
+                new Comment("// Line",
-                    4, 49, 12, 5, 56, 12),
+                    4, 49, 12, 4, 55, 12),
-                new Comment("// Comments\n",
+                new Comment("// Comments",
-                    5, 61, 14, 6, 72, 14),
+                    5, 61, 14, 5, 71, 14),
            ],
            'startLine' => 6,
            'endLine' => 6,
--- a/test/code/formatPreservation/classMethodNop.test
+++ b/test/code/formatPreservation/classMethodNop.test
@ -15,8 +15,8 @@ class Foo {
    public function __construct()
    {
        // I'm just a comment
-
+        $foo;
-        $foo;    }
+    }
 }
 -----
 <?php
@ -72,5 +72,6 @@ class Foo {
    public function __construct()
    {
        // I'm a new comment
-            }
+
    }
 }