From 4abc531213ec074b87cccb1a39c06bdfc0dec5e4 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Sat, 27 Jun 2020 18:53:09 +0200
Subject: [PATCH] Canonicalize to PHP 8 comment token format

The trailing newline is no longer part of the comment token.
---
 lib/PhpParser/Lexer.php                       | 53 ++++++++-----------
 test/PhpParser/LexerTest.php                  | 21 ++++----
 test/PhpParser/NodeAbstractTest.php           | 10 ++--
 test/PhpParser/ParserTest.php                 |  8 +--
 .../formatPreservation/classMethodNop.test    |  7 +--
 5 files changed, 47 insertions(+), 52 deletions(-)

diff --git a/lib/PhpParser/Lexer.php b/lib/PhpParser/Lexer.php
index 694a848..da36f2f 100644
--- a/lib/PhpParser/Lexer.php
+++ b/lib/PhpParser/Lexer.php
@@ -89,7 +89,7 @@ class Lexer
 
         error_clear_last();
         $this->tokens = @token_get_all($code);
-        $this->handleErrors($errorHandler);
+        $this->postprocessTokens($errorHandler);
 
         if (false !== $scream) {
             ini_set('xdebug.scream', $scream);
@@ -131,40 +131,14 @@ class Lexer
             && substr($token[1], -2) !== '*/';
     }
 
-    /**
-     * Check whether an error *may* have occurred during tokenization.
-     *
-     * @return bool
-     */
-    private function errorMayHaveOccurred() : bool {
-        if (defined('HHVM_VERSION')) {
-            // In HHVM token_get_all() does not throw warnings, so we need to conservatively
-            // assume that an error occurred
-            return true;
-        }
-
-        if (PHP_VERSION_ID >= 80000) {
-            // PHP 8 converts the "bad character" case into a parse error, rather than treating
-            // it as a lexing warning. To preserve previous behavior, we need to assume that an
-            // error occurred.
-            // TODO: We should handle this the same way as PHP 8: Only generate T_BAD_CHARACTER
-            // token here (for older PHP versions) and leave generationg of the actual parse error
-            // to the parser. This will also save the full token scan on PHP 8 here.
-            return true;
-        }
-
-        return null !== error_get_last();
-    }
-
-    protected function handleErrors(ErrorHandler $errorHandler) {
-        if (!$this->errorMayHaveOccurred()) {
-            return;
-        }
-
+    protected function postprocessTokens(ErrorHandler $errorHandler) {
         // PHP's error handling for token_get_all() is rather bad, so if we want detailed
         // error information we need to compute it ourselves. Invalid character errors are
         // detected by finding "gaps" in the token array. Unterminated comments are detected
         // by checking if a trailing comment has a "*/" at the end.
+        //
+        // Additionally, we canonicalize to the PHP 8 comment format here, which does not include
+        // the trailing newline anymore.
 
         $filePos = 0;
         $line = 1;
@@ -178,6 +152,23 @@ class Lexer
                 $this->handleInvalidCharacterRange($filePos, $filePos + 1, $line, $errorHandler);
             }
 
+            if ($token[0] === \T_COMMENT && preg_match('/(\r\n|\n|\r)$/D', $token[1], $matches)) {
+                $trailingNewline = $matches[0];
+                $token[1] = substr($token[1], 0, -strlen($trailingNewline));
+                $this->tokens[$i] = $token;
+                if (isset($this->tokens[$i + 1]) && $this->tokens[$i + 1][0] === \T_WHITESPACE) {
+                    // Move trailing newline into following T_WHITESPACE token, if it already exists.
+                    $this->tokens[$i + 1][1] = $trailingNewline . $this->tokens[$i + 1][1];
+                    $this->tokens[$i + 1][2]--;
+                } else {
+                    // Otherwise, we need to create a new T_WHITESPACE token.
+                    array_splice($this->tokens, $i + 1, 0, [
+                        [\T_WHITESPACE, $trailingNewline, $line],
+                    ]);
+                    $numTokens++;
+                }
+            }
+
             $tokenValue = \is_string($token) ? $token : $token[1];
             $tokenLen = \strlen($tokenValue);
 
diff --git a/test/PhpParser/LexerTest.php b/test/PhpParser/LexerTest.php
index 9c34960..2e487d5 100644
--- a/test/PhpParser/LexerTest.php
+++ b/test/PhpParser/LexerTest.php
@@ -124,12 +124,12 @@ class LexerTest extends \PHPUnit\Framework\TestCase
                             'comments' => [
                                 new Comment('/* comment */',
                                     1, 6, 1, 1, 18, 1),
-                                new Comment('// comment' . "\n",
-                                    1, 20, 3, 2, 30, 3),
+                                new Comment('// comment',
+                                    1, 20, 3, 1, 29, 3),
                                 new Comment\Doc('/** docComment 1 */',
-                                    2, 31, 4, 2, 49, 4),
+                                    2, 31, 5, 2, 49, 5),
                                 new Comment\Doc('/** docComment 2 */',
-                                    2, 50, 5, 2, 68, 5),
+                                    2, 50, 6, 2, 68, 6),
                             ],
                         ],
                         ['endLine' => 2]
@@ -185,11 +185,11 @@ class LexerTest extends \PHPUnit\Framework\TestCase
                     ],
                     [
                         Tokens::T_CONSTANT_ENCAPSED_STRING, '"b"',
-                        ['startTokenPos' => 5], ['endTokenPos' => 5]
+                        ['startTokenPos' => 6], ['endTokenPos' => 6]
                     ],
                     [
                         ord(';'), ';',
-                        ['startTokenPos' => 6], ['endTokenPos' => 6]
+                        ['startTokenPos' => 7], ['endTokenPos' => 7]
                     ],
                 ]
             ],
@@ -251,14 +251,17 @@ class LexerTest extends \PHPUnit\Framework\TestCase
     }
 
     public function testGetTokens() {
-        $code = '<?php "a";' . "\n" . '// foo' . "\n" . '"b";';
+        $code = '<?php "a";' . "\n" . '// foo' . "\n" . '// bar' . "\n\n" . '"b";';
         $expectedTokens = [
             [T_OPEN_TAG, '<?php ', 1],
             [T_CONSTANT_ENCAPSED_STRING, '"a"', 1],
             ';',
             [T_WHITESPACE, "\n", 1],
-            [T_COMMENT, '// foo' . "\n", 2],
-            [T_CONSTANT_ENCAPSED_STRING, '"b"', 3],
+            [T_COMMENT, '// foo', 2],
+            [T_WHITESPACE, "\n", 2],
+            [T_COMMENT, '// bar', 3],
+            [T_WHITESPACE, "\n\n", 3],
+            [T_CONSTANT_ENCAPSED_STRING, '"b"', 5],
             ';',
         ];
 
diff --git a/test/PhpParser/NodeAbstractTest.php b/test/PhpParser/NodeAbstractTest.php
index a4627cc..103ec58 100644
--- a/test/PhpParser/NodeAbstractTest.php
+++ b/test/PhpParser/NodeAbstractTest.php
@@ -307,12 +307,12 @@ PHP;
             "comments": [
                 {
                     "nodeType": "Comment",
-                    "text": "\/\/ comment\n",
+                    "text": "\/\/ comment",
                     "line": 2,
                     "filePos": 6,
                     "tokenPos": 1,
-                    "endLine": 3,
-                    "endFilePos": 16,
+                    "endLine": 2,
+                    "endFilePos": 15,
                     "endTokenPos": 1
                 },
                 {
@@ -320,10 +320,10 @@ PHP;
                     "text": "\/** doc comment *\/",
                     "line": 3,
                     "filePos": 17,
-                    "tokenPos": 2,
+                    "tokenPos": 3,
                     "endLine": 3,
                     "endFilePos": 34,
-                    "endTokenPos": 2
+                    "endTokenPos": 3
                 }
             ],
             "endLine": 6
diff --git a/test/PhpParser/ParserTest.php b/test/PhpParser/ParserTest.php
index fc6e53f..5080365 100644
--- a/test/PhpParser/ParserTest.php
+++ b/test/PhpParser/ParserTest.php
@@ -83,10 +83,10 @@ EOC;
         $this->assertInstanceOf(Stmt\Echo_::class, $echo);
         $this->assertEquals([
             'comments' => [
-                new Comment("// Line\n",
-                    4, 49, 12, 5, 56, 12),
-                new Comment("// Comments\n",
-                    5, 61, 14, 6, 72, 14),
+                new Comment("// Line",
+                    4, 49, 12, 4, 55, 12),
+                new Comment("// Comments",
+                    5, 61, 14, 5, 71, 14),
             ],
             'startLine' => 6,
             'endLine' => 6,
diff --git a/test/code/formatPreservation/classMethodNop.test b/test/code/formatPreservation/classMethodNop.test
index d353e65..e2ad5ae 100644
--- a/test/code/formatPreservation/classMethodNop.test
+++ b/test/code/formatPreservation/classMethodNop.test
@@ -15,8 +15,8 @@ class Foo {
     public function __construct()
     {
         // I'm just a comment
-
-        $foo;    }
+        $foo;
+    }
 }
 -----
 <?php
@@ -72,5 +72,6 @@ class Foo {
     public function __construct()
     {
         // I'm a new comment
-            }
+
+    }
 }
\ No newline at end of file