Support recovery from lexer errors

Lexer::startLexing() no longer throws; instead, errors can be fetched
using Lexer::getErrors().

Lexer errors now also contain full line and position information.
This commit is contained in:
Nikita Popov 2016-09-30 18:28:35 +02:00
parent e926efd62e
commit c79ea6d1d3
5 changed files with 229 additions and 57 deletions

View File

@ -9,6 +9,7 @@ class Lexer
{
protected $code;
protected $tokens;
protected $errors;
protected $pos;
protected $line;
protected $filePos;
@ -49,11 +50,22 @@ class Lexer
/**
* Initializes the lexer for lexing the provided source code.
*
* @param string $code The source code to lex
* This function does not throw if lexing errors occur. Instead, errors may be retrieved using
* the getErrors() method.
*
* @throws Error on lexing errors (unterminated comment or unexpected character)
* @param string $code The source code to lex
*/
public function startLexing($code) {
$this->code = $code; // keep the code around for __halt_compiler() handling
$this->pos = -1;
$this->line = 1;
$this->filePos = 0;
$this->errors = [];
// If inline HTML occurs without preceding code, treat it as if it had a leading newline.
// This ensures proper composability, because having a newline is the "safe" assumption.
$this->prevCloseTagHasNewline = true;
$scream = ini_set('xdebug.scream', '0');
$this->resetErrors();
@ -63,15 +75,6 @@ class Lexer
if (false !== $scream) {
ini_set('xdebug.scream', $scream);
}
$this->code = $code; // keep the code around for __halt_compiler() handling
$this->pos = -1;
$this->line = 1;
$this->filePos = 0;
// If inline HTML occurs without preceding code, treat it as if it had a leading newline.
// This ensures proper composability, because having a newline is the "safe" assumption.
$this->prevCloseTagHasNewline = true;
}
protected function resetErrors() {
@ -85,32 +88,85 @@ class Lexer
}
}
protected function handleErrors() {
/**
 * Records one lexer error for every byte of the source code whose offset lies
 * in the half-open range [$start, $end).
 *
 * Each error carries identical start/end line and start/end file position, i.e.
 * it describes exactly one byte.
 *
 * @param int $start Offset of the first byte to report
 * @param int $end   Offset one past the last byte to report
 * @param int $line  Line number attached to every reported byte
 */
private function handleInvalidCharacterRange($start, $end, $line) {
    $offset = $start;
    while ($offset < $end) {
        $byte = $this->code[$offset];

        // PHP cuts error message after null byte, so need special case
        $message = "\0" === $byte
            ? 'Unexpected null byte'
            : sprintf('Unexpected character "%s" (ASCII %d)', $byte, ord($byte));

        // Start and end coincide because the error covers a single byte.
        $attributes = [
            'startLine' => $line,
            'endLine' => $line,
            'startFilePos' => $offset,
            'endFilePos' => $offset,
        ];
        $this->errors[] = new Error($message, $attributes);

        $offset++;
    }
}
/**
 * Checks whether a token is a block comment that is missing its closing delimiter.
 *
 * @param array|string $token Token in token_get_all() format
 *
 * @return bool True only for T_COMMENT / T_DOC_COMMENT tokens that start with
 *              the block-comment opener but are not closed
 */
private function isUnterminatedComment($token) {
    // Single-character tokens are plain strings and can never be comments.
    if (!\is_array($token)) {
        return false;
    }
    if ($token[0] !== T_COMMENT && $token[0] !== T_DOC_COMMENT) {
        return false;
    }

    $text = $token[1];
    if (substr($text, 0, 2) !== '/*') {
        // Line comments (// and #) have no closing delimiter to miss.
        return false;
    }

    // The opener and closer must not share the "*": the shortest terminated
    // block comment is four bytes long. Without the length check the
    // three-byte token consisting of slash, star, slash would be taken for a
    // closed comment.
    return \strlen($text) < 4 || substr($text, -2) !== '*/';
}
/**
 * Decides whether the detailed (and more expensive) error scan needs to run.
 *
 * Under HHVM the answer is always yes. Otherwise the answer is yes when
 * error_get_last() reports an error whose message does not contain
 * "Undefined variable".
 * NOTE(review): the "Undefined variable" message is presumably the placeholder
 * left behind by resetErrors() -- confirm against that method.
 *
 * @return bool Whether a lexing error may have occurred
 */
private function errorMayHaveOccurred() {
    if (defined('HHVM_VERSION')) {
        // In HHVM token_get_all() does not throw warnings, so we need to conservatively
        // assume that an error occurred
        return true;
    }

    $error = error_get_last();
    return null !== $error
        && false === strpos($error['message'], 'Undefined variable');
}
/**
 * Computes detailed lexer errors and appends them to $this->errors.
 *
 * Does nothing when errorMayHaveOccurred() reports that no error can have
 * happened. Otherwise the token list is replayed against the source code:
 * bytes of the code that no token covers are reported through
 * handleInvalidCharacterRange(), and a final unterminated block comment is
 * reported as "Unterminated comment". This method never throws.
 */
protected function handleErrors() {
    if (!$this->errorMayHaveOccurred()) {
        return;
    }

    // PHP's error handling for token_get_all() is rather bad, so if we want detailed
    // error information we need to compute it ourselves. Invalid character errors are
    // detected by finding "gaps" in the token array. Unterminated comments are detected
    // by checking if a trailing comment has a "*/" at the end.
    $filePos = 0;
    $line = 1;
    foreach ($this->tokens as $token) {
        $tokenValue = \is_string($token) ? $token : $token[1];
        $tokenLen = \strlen($tokenValue);

        if (substr($this->code, $filePos, $tokenLen) !== $tokenValue) {
            // Something is missing, must be an invalid character
            // NOTE(review): strpos() yields false when the token text does not occur
            // at or after $filePos; this code assumes that never happens -- confirm.
            $nextFilePos = strpos($this->code, $tokenValue, $filePos);
            $this->handleInvalidCharacterRange($filePos, $nextFilePos, $line);
            $filePos = $nextFilePos;
        }

        $filePos += $tokenLen;
        $line += substr_count($tokenValue, "\n");
    }

    // Invalid characters at the end of the input
    $codeLen = \strlen($this->code);
    if ($filePos !== $codeLen) {
        $this->handleInvalidCharacterRange($filePos, $codeLen, $line);
    }

    // Without any token there is no trailing comment to inspect; bail out
    // instead of reading index -1.
    if (empty($this->tokens)) {
        return;
    }

    // Check for unterminated comment
    $lastToken = $this->tokens[count($this->tokens) - 1];
    if ($this->isUnterminatedComment($lastToken)) {
        // $line and $filePos point just past the comment, so step back by the
        // comment's own newlines / length to find where it started.
        $this->errors[] = new Error('Unterminated comment', [
            'startLine' => $line - substr_count($lastToken[1], "\n"),
            'endLine' => $line,
            'startFilePos' => $filePos - \strlen($lastToken[1]),
            'endFilePos' => $filePos,
        ]);
    }
}
@ -224,6 +280,15 @@ class Lexer
return $this->tokens;
}
/**
* Returns errors that occurred during lexing.
*
* startLexing() resets this list to an empty array before lexing begins.
* NOTE(review): the $errors property is declared without a visible default, so this
* presumably returns null when called before startLexing() -- confirm against the constructor.
*
* @return Error[] Array of lexer errors
*/
public function getErrors() {
return $this->errors;
}
/**
* Handles __halt_compiler() by returning the text after it.
*

View File

@ -132,19 +132,11 @@ abstract class ParserAbstract implements Parser
* unable to recover from an error).
*/
public function parse($code) {
$this->errors = array();
// Initialize the lexer
try {
$this->lexer->startLexing($code);
} catch (Error $e) {
$this->errors[] = $e;
if ($this->throwOnError) {
throw $e;
} else {
// Currently can't recover from lexer errors
return null;
}
// Initialize the lexer and inherit lexing errors
$this->lexer->startLexing($code);
$this->errors = $this->lexer->getErrors();
if ($this->throwOnError && !empty($this->errors)) {
throw $this->errors[0];
}
// We start off with no lookahead-token

View File

@ -14,28 +14,35 @@ class LexerTest extends \PHPUnit_Framework_TestCase
/**
 * Lexes $code and checks that exactly the expected errors are reported, in
 * order, including their line/column information.
 *
 * @dataProvider provideTestError
 *
 * @param string   $code     Source code that triggers lexer errors
 * @param string[] $messages Expected error messages with column info, in order
 */
public function testError($code, $messages) {
    if (defined('HHVM_VERSION')) {
        $this->markTestSkipped('HHVM does not throw warnings from token_get_all()');
    }

    $lexer = $this->getLexer(['usedAttributes' => [
        'comments', 'startLine', 'endLine', 'startFilePos', 'endFilePos'
    ]]);
    $lexer->startLexing($code);
    $errors = $lexer->getErrors();

    // Compare the counts first so that the index-based loop below cannot read
    // past the end of $errors.
    $this->assertCount(count($messages), $errors);
    foreach ($messages as $i => $message) {
        $this->assertSame($message, $errors[$i]->getMessageWithColumnInfo($code));
    }
}
/**
 * Data sets for testError().
 *
 * @return array[] Rows of [source code, list of expected error messages]
 */
public function provideTestError() {
    return array(
        array("<?php /*", array("Unterminated comment from 1:7 to 1:9")),
        array("<?php \1", array("Unexpected character \"\1\" (ASCII 1) from 1:7 to 1:7")),
        array("<?php \0", array("Unexpected null byte from 1:7 to 1:7")),
        // Error with potentially emulated token
        array("<?php ?? \0", array("Unexpected null byte from 1:10 to 1:10")),
        // Several errors in a single input
        array("<?php\n\0\1 foo /* bar", array(
            "Unexpected null byte from 2:1 to 2:1",
            "Unexpected character \"\1\" (ASCII 1) from 2:2 to 2:2",
            "Unterminated comment from 2:8 to 2:14"
        )),
    );
}

View File

@ -30,6 +30,15 @@ abstract class ParserTest extends \PHPUnit_Framework_TestCase
$parser->parse('<?php use foo as self;');
}
/**
* Parsing code with an unterminated comment is expected to raise the lexer's
* error as a \PhpParser\Error (asserted through the annotations below).
*
* @expectedException \PhpParser\Error
* @expectedExceptionMessage Unterminated comment on line 1
*/
public function testParserThrowsLexerError() {
$parser = $this->getParser(new Lexer());
// The comment is opened but never closed.
$parser->parse('<?php /*');
}
public function testAttributeAssignment() {
$lexer = new Lexer(array(
'usedAttributes' => array(

View File

@ -6,7 +6,23 @@ $a = 42;
/*
$b = 24;
-----
Unterminated comment on line 4
Unterminated comment from 4:1 to 5:9
array(
0: Expr_Assign(
var: Expr_Variable(
name: a
)
expr: Scalar_LNumber(
value: 42
)
)
1: Stmt_Nop(
comments: array(
0: /*
$b = 24;
)
)
)
-----
<?php
@ -14,7 +30,25 @@ $a = 42;
@@{ "\1" }@@
$b = 24;
-----
Unexpected character "@@{ "\1" }@@" (ASCII 1) on unknown line
Unexpected character "@@{ "\1" }@@" (ASCII 1) from 4:1 to 4:1
array(
0: Expr_Assign(
var: Expr_Variable(
name: a
)
expr: Scalar_LNumber(
value: 42
)
)
1: Expr_Assign(
var: Expr_Variable(
name: b
)
expr: Scalar_LNumber(
value: 24
)
)
)
-----
<?php
@ -22,4 +56,69 @@ $a = 42;
@@{ "\0" }@@
$b = 24;
-----
Unexpected null byte on unknown line
Unexpected null byte from 4:1 to 4:1
array(
0: Expr_Assign(
var: Expr_Variable(
name: a
)
expr: Scalar_LNumber(
value: 42
)
)
1: Expr_Assign(
var: Expr_Variable(
name: b
)
expr: Scalar_LNumber(
value: 24
)
)
)
-----
<?php
$a = 1;
@@{ "\1" }@@
$b = 2;
@@{ "\2" }@@
$c = 3;
-----
Unexpected character "@@{ "\1" }@@" (ASCII 1) from 4:1 to 4:1
Unexpected character "@@{ "\2" }@@" (ASCII 2) from 6:1 to 6:1
array(
0: Expr_Assign(
var: Expr_Variable(
name: a
)
expr: Scalar_LNumber(
value: 1
)
)
1: Expr_Assign(
var: Expr_Variable(
name: b
)
expr: Scalar_LNumber(
value: 2
)
)
2: Expr_Assign(
var: Expr_Variable(
name: c
)
expr: Scalar_LNumber(
value: 3
)
)
)
-----
<?php
if ($b) {
$a = 1;
/* unterminated
}
-----
Unterminated comment from 5:5 to 6:2
Syntax error, unexpected EOF from 6:2 to 6:2