1
0
mirror of https://github.com/danog/PHP-Parser.git synced 2024-11-26 20:04:48 +01:00

Canonicalize to PHP 8 comment token format

The trailing newline is no longer part of the comment token.
This commit is contained in:
Nikita Popov 2020-06-27 18:53:09 +02:00
parent b58b19ed1d
commit 4abc531213
5 changed files with 47 additions and 52 deletions

View File

@ -89,7 +89,7 @@ class Lexer
error_clear_last();
$this->tokens = @token_get_all($code);
$this->handleErrors($errorHandler);
$this->postprocessTokens($errorHandler);
if (false !== $scream) {
ini_set('xdebug.scream', $scream);
@ -131,40 +131,14 @@ class Lexer
&& substr($token[1], -2) !== '*/';
}
/**
 * Report whether tokenization *may* have produced an error.
 *
 * This is a conservative check: it answers true whenever warnings emitted by
 * token_get_all() cannot be relied upon to signal a problem.
 *
 * @return bool
 */
private function errorMayHaveOccurred() : bool {
    // HHVM: token_get_all() does not throw warnings there, so an error has to be
    // assumed conservatively.
    $isHhvm = defined('HHVM_VERSION');

    // PHP 8: the "bad character" case is a parse error rather than a lexing warning. To
    // preserve previous behavior, an error has to be assumed here as well.
    // TODO: We should handle this the same way as PHP 8: Only generate T_BAD_CHARACTER
    // token here (for older PHP versions) and leave generation of the actual parse error
    // to the parser. This will also save the full token scan on PHP 8 here.
    $isPhp8OrLater = PHP_VERSION_ID >= 80000;

    if ($isHhvm || $isPhp8OrLater) {
        return true;
    }

    // Otherwise an error is indicated by a recorded last error.
    return error_get_last() !== null;
}
protected function handleErrors(ErrorHandler $errorHandler) {
if (!$this->errorMayHaveOccurred()) {
return;
}
protected function postprocessTokens(ErrorHandler $errorHandler) {
// PHP's error handling for token_get_all() is rather bad, so if we want detailed
// error information we need to compute it ourselves. Invalid character errors are
// detected by finding "gaps" in the token array. Unterminated comments are detected
// by checking if a trailing comment has a "*/" at the end.
//
// Additionally, we canonicalize to the PHP 8 comment format here, which no longer includes
// the trailing newline in the comment token.
$filePos = 0;
$line = 1;
@ -178,6 +152,23 @@ class Lexer
$this->handleInvalidCharacterRange($filePos, $filePos + 1, $line, $errorHandler);
}
if ($token[0] === \T_COMMENT && preg_match('/(\r\n|\n|\r)$/D', $token[1], $matches)) {
$trailingNewline = $matches[0];
$token[1] = substr($token[1], 0, -strlen($trailingNewline));
$this->tokens[$i] = $token;
if (isset($this->tokens[$i + 1]) && $this->tokens[$i + 1][0] === \T_WHITESPACE) {
// Move trailing newline into following T_WHITESPACE token, if it already exists.
$this->tokens[$i + 1][1] = $trailingNewline . $this->tokens[$i + 1][1];
$this->tokens[$i + 1][2]--;
} else {
// Otherwise, we need to create a new T_WHITESPACE token.
array_splice($this->tokens, $i + 1, 0, [
[\T_WHITESPACE, $trailingNewline, $line],
]);
$numTokens++;
}
}
$tokenValue = \is_string($token) ? $token : $token[1];
$tokenLen = \strlen($tokenValue);

View File

@ -124,12 +124,12 @@ class LexerTest extends \PHPUnit\Framework\TestCase
'comments' => [
new Comment('/* comment */',
1, 6, 1, 1, 18, 1),
new Comment('// comment' . "\n",
1, 20, 3, 2, 30, 3),
new Comment('// comment',
1, 20, 3, 1, 29, 3),
new Comment\Doc('/** docComment 1 */',
2, 31, 4, 2, 49, 4),
2, 31, 5, 2, 49, 5),
new Comment\Doc('/** docComment 2 */',
2, 50, 5, 2, 68, 5),
2, 50, 6, 2, 68, 6),
],
],
['endLine' => 2]
@ -185,11 +185,11 @@ class LexerTest extends \PHPUnit\Framework\TestCase
],
[
Tokens::T_CONSTANT_ENCAPSED_STRING, '"b"',
['startTokenPos' => 5], ['endTokenPos' => 5]
['startTokenPos' => 6], ['endTokenPos' => 6]
],
[
ord(';'), ';',
['startTokenPos' => 6], ['endTokenPos' => 6]
['startTokenPos' => 7], ['endTokenPos' => 7]
],
]
],
@ -251,14 +251,17 @@ class LexerTest extends \PHPUnit\Framework\TestCase
}
public function testGetTokens() {
$code = '<?php "a";' . "\n" . '// foo' . "\n" . '"b";';
$code = '<?php "a";' . "\n" . '// foo' . "\n" . '// bar' . "\n\n" . '"b";';
$expectedTokens = [
[T_OPEN_TAG, '<?php ', 1],
[T_CONSTANT_ENCAPSED_STRING, '"a"', 1],
';',
[T_WHITESPACE, "\n", 1],
[T_COMMENT, '// foo' . "\n", 2],
[T_CONSTANT_ENCAPSED_STRING, '"b"', 3],
[T_COMMENT, '// foo', 2],
[T_WHITESPACE, "\n", 2],
[T_COMMENT, '// bar', 3],
[T_WHITESPACE, "\n\n", 3],
[T_CONSTANT_ENCAPSED_STRING, '"b"', 5],
';',
];

View File

@ -307,12 +307,12 @@ PHP;
"comments": [
{
"nodeType": "Comment",
"text": "\/\/ comment\n",
"text": "\/\/ comment",
"line": 2,
"filePos": 6,
"tokenPos": 1,
"endLine": 3,
"endFilePos": 16,
"endLine": 2,
"endFilePos": 15,
"endTokenPos": 1
},
{
@ -320,10 +320,10 @@ PHP;
"text": "\/** doc comment *\/",
"line": 3,
"filePos": 17,
"tokenPos": 2,
"tokenPos": 3,
"endLine": 3,
"endFilePos": 34,
"endTokenPos": 2
"endTokenPos": 3
}
],
"endLine": 6

View File

@ -83,10 +83,10 @@ EOC;
$this->assertInstanceOf(Stmt\Echo_::class, $echo);
$this->assertEquals([
'comments' => [
new Comment("// Line\n",
4, 49, 12, 5, 56, 12),
new Comment("// Comments\n",
5, 61, 14, 6, 72, 14),
new Comment("// Line",
4, 49, 12, 4, 55, 12),
new Comment("// Comments",
5, 61, 14, 5, 71, 14),
],
'startLine' => 6,
'endLine' => 6,

View File

@ -15,8 +15,8 @@ class Foo {
public function __construct()
{
// I'm just a comment
$foo; }
$foo;
}
}
-----
<?php
@ -72,5 +72,6 @@ class Foo {
public function __construct()
{
// I'm a new comment
}
}
}