1
0
mirror of https://github.com/danog/PHP-Parser.git synced 2024-11-30 04:19:30 +01:00

Canonicalize to PHP 8 comment token format

The trailing newline is no longer part of the comment token.
This commit is contained in:
Nikita Popov 2020-06-27 18:53:09 +02:00
parent b58b19ed1d
commit 4abc531213
5 changed files with 47 additions and 52 deletions

View File

@ -89,7 +89,7 @@ class Lexer
error_clear_last(); error_clear_last();
$this->tokens = @token_get_all($code); $this->tokens = @token_get_all($code);
$this->handleErrors($errorHandler); $this->postprocessTokens($errorHandler);
if (false !== $scream) { if (false !== $scream) {
ini_set('xdebug.scream', $scream); ini_set('xdebug.scream', $scream);
@ -131,40 +131,14 @@ class Lexer
&& substr($token[1], -2) !== '*/'; && substr($token[1], -2) !== '*/';
} }
/** protected function postprocessTokens(ErrorHandler $errorHandler) {
* Check whether an error *may* have occurred during tokenization.
*
* @return bool
*/
private function errorMayHaveOccurred() : bool {
if (defined('HHVM_VERSION')) {
// In HHVM token_get_all() does not throw warnings, so we need to conservatively
// assume that an error occurred
return true;
}
if (PHP_VERSION_ID >= 80000) {
// PHP 8 converts the "bad character" case into a parse error, rather than treating
// it as a lexing warning. To preserve previous behavior, we need to assume that an
// error occurred.
// TODO: We should handle this the same way as PHP 8: Only generate T_BAD_CHARACTER
// token here (for older PHP versions) and leave generation of the actual parse error
// to the parser. This will also save the full token scan on PHP 8 here.
return true;
}
return null !== error_get_last();
}
protected function handleErrors(ErrorHandler $errorHandler) {
if (!$this->errorMayHaveOccurred()) {
return;
}
// PHP's error handling for token_get_all() is rather bad, so if we want detailed // PHP's error handling for token_get_all() is rather bad, so if we want detailed
// error information we need to compute it ourselves. Invalid character errors are // error information we need to compute it ourselves. Invalid character errors are
// detected by finding "gaps" in the token array. Unterminated comments are detected // detected by finding "gaps" in the token array. Unterminated comments are detected
// by checking if a trailing comment has a "*/" at the end. // by checking if a trailing comment has a "*/" at the end.
//
// Additionally, we canonicalize to the PHP 8 comment format here, which does not include
// the trailing newline anymore
$filePos = 0; $filePos = 0;
$line = 1; $line = 1;
@ -178,6 +152,23 @@ class Lexer
$this->handleInvalidCharacterRange($filePos, $filePos + 1, $line, $errorHandler); $this->handleInvalidCharacterRange($filePos, $filePos + 1, $line, $errorHandler);
} }
if ($token[0] === \T_COMMENT && preg_match('/(\r\n|\n|\r)$/D', $token[1], $matches)) {
$trailingNewline = $matches[0];
$token[1] = substr($token[1], 0, -strlen($trailingNewline));
$this->tokens[$i] = $token;
if (isset($this->tokens[$i + 1]) && $this->tokens[$i + 1][0] === \T_WHITESPACE) {
// Move trailing newline into following T_WHITESPACE token, if it already exists.
$this->tokens[$i + 1][1] = $trailingNewline . $this->tokens[$i + 1][1];
$this->tokens[$i + 1][2]--;
} else {
// Otherwise, we need to create a new T_WHITESPACE token.
array_splice($this->tokens, $i + 1, 0, [
[\T_WHITESPACE, $trailingNewline, $line],
]);
$numTokens++;
}
}
$tokenValue = \is_string($token) ? $token : $token[1]; $tokenValue = \is_string($token) ? $token : $token[1];
$tokenLen = \strlen($tokenValue); $tokenLen = \strlen($tokenValue);

View File

@ -124,12 +124,12 @@ class LexerTest extends \PHPUnit\Framework\TestCase
'comments' => [ 'comments' => [
new Comment('/* comment */', new Comment('/* comment */',
1, 6, 1, 1, 18, 1), 1, 6, 1, 1, 18, 1),
new Comment('// comment' . "\n", new Comment('// comment',
1, 20, 3, 2, 30, 3), 1, 20, 3, 1, 29, 3),
new Comment\Doc('/** docComment 1 */', new Comment\Doc('/** docComment 1 */',
2, 31, 4, 2, 49, 4), 2, 31, 5, 2, 49, 5),
new Comment\Doc('/** docComment 2 */', new Comment\Doc('/** docComment 2 */',
2, 50, 5, 2, 68, 5), 2, 50, 6, 2, 68, 6),
], ],
], ],
['endLine' => 2] ['endLine' => 2]
@ -185,11 +185,11 @@ class LexerTest extends \PHPUnit\Framework\TestCase
], ],
[ [
Tokens::T_CONSTANT_ENCAPSED_STRING, '"b"', Tokens::T_CONSTANT_ENCAPSED_STRING, '"b"',
['startTokenPos' => 5], ['endTokenPos' => 5] ['startTokenPos' => 6], ['endTokenPos' => 6]
], ],
[ [
ord(';'), ';', ord(';'), ';',
['startTokenPos' => 6], ['endTokenPos' => 6] ['startTokenPos' => 7], ['endTokenPos' => 7]
], ],
] ]
], ],
@ -251,14 +251,17 @@ class LexerTest extends \PHPUnit\Framework\TestCase
} }
public function testGetTokens() { public function testGetTokens() {
$code = '<?php "a";' . "\n" . '// foo' . "\n" . '"b";'; $code = '<?php "a";' . "\n" . '// foo' . "\n" . '// bar' . "\n\n" . '"b";';
$expectedTokens = [ $expectedTokens = [
[T_OPEN_TAG, '<?php ', 1], [T_OPEN_TAG, '<?php ', 1],
[T_CONSTANT_ENCAPSED_STRING, '"a"', 1], [T_CONSTANT_ENCAPSED_STRING, '"a"', 1],
';', ';',
[T_WHITESPACE, "\n", 1], [T_WHITESPACE, "\n", 1],
[T_COMMENT, '// foo' . "\n", 2], [T_COMMENT, '// foo', 2],
[T_CONSTANT_ENCAPSED_STRING, '"b"', 3], [T_WHITESPACE, "\n", 2],
[T_COMMENT, '// bar', 3],
[T_WHITESPACE, "\n\n", 3],
[T_CONSTANT_ENCAPSED_STRING, '"b"', 5],
';', ';',
]; ];

View File

@ -307,12 +307,12 @@ PHP;
"comments": [ "comments": [
{ {
"nodeType": "Comment", "nodeType": "Comment",
"text": "\/\/ comment\n", "text": "\/\/ comment",
"line": 2, "line": 2,
"filePos": 6, "filePos": 6,
"tokenPos": 1, "tokenPos": 1,
"endLine": 3, "endLine": 2,
"endFilePos": 16, "endFilePos": 15,
"endTokenPos": 1 "endTokenPos": 1
}, },
{ {
@ -320,10 +320,10 @@ PHP;
"text": "\/** doc comment *\/", "text": "\/** doc comment *\/",
"line": 3, "line": 3,
"filePos": 17, "filePos": 17,
"tokenPos": 2, "tokenPos": 3,
"endLine": 3, "endLine": 3,
"endFilePos": 34, "endFilePos": 34,
"endTokenPos": 2 "endTokenPos": 3
} }
], ],
"endLine": 6 "endLine": 6

View File

@ -83,10 +83,10 @@ EOC;
$this->assertInstanceOf(Stmt\Echo_::class, $echo); $this->assertInstanceOf(Stmt\Echo_::class, $echo);
$this->assertEquals([ $this->assertEquals([
'comments' => [ 'comments' => [
new Comment("// Line\n", new Comment("// Line",
4, 49, 12, 5, 56, 12), 4, 49, 12, 4, 55, 12),
new Comment("// Comments\n", new Comment("// Comments",
5, 61, 14, 6, 72, 14), 5, 61, 14, 5, 71, 14),
], ],
'startLine' => 6, 'startLine' => 6,
'endLine' => 6, 'endLine' => 6,

View File

@ -15,8 +15,8 @@ class Foo {
public function __construct() public function __construct()
{ {
// I'm just a comment // I'm just a comment
$foo;
$foo; } }
} }
----- -----
<?php <?php
@ -72,5 +72,6 @@ class Foo {
public function __construct() public function __construct()
{ {
// I'm a new comment // I'm a new comment
}
}
} }