Support token position attributes in lexer

Also change endFilePos semantics to refer to the last character that
is *included* in the token, rather than one past the last character.
This ensures that all end* attributes have the same semantics.
This commit is contained in:
Nikita Popov 2014-12-18 23:26:17 +01:00
parent e0f3e8a492
commit 46975107a7
2 changed files with 82 additions and 15 deletions

View File

@ -20,8 +20,9 @@ class Lexer
*
* @param array $options Options array. Currently only the 'usedAttributes' option is supported,
* which is an array of attributes to add to the AST nodes. Possible attributes
* are: 'comments', 'startLine', 'endLine', 'startFilePos', 'endFilePos'. The
* option defaults to the first three. For more info see getNextToken() docs.
* are: 'comments', 'startLine', 'endLine', 'startTokenPos', 'endTokenPos',
* 'startFilePos', 'endFilePos'. The option defaults to the first three.
* For more info see getNextToken() docs.
*/
public function __construct(array $options = array()) {
// map from internal tokens to PhpParser tokens
@ -100,13 +101,16 @@ class Lexer
* The available attributes are determined by the 'usedAttributes' option, which can
* be specified in the constructor. The following attributes are supported:
*
* * 'comments' => Array of PhpParser\Comment or PhpParser\Comment\Doc instances,
* representing all comments that occurred between the previous
* non-discarded token and the current one.
* * 'startLine' => Line in which the token starts.
* * 'endLine' => Line in which the token ends.
* * 'startFilePos' => Offset into the code string at which the token starts.
* * 'endFilePos' => EXPERIMENTAL! Offset into the code string one past where the token ends.
* * 'comments' => Array of PhpParser\Comment or PhpParser\Comment\Doc instances,
* representing all comments that occurred between the previous
* non-discarded token and the current one.
* * 'startLine' => Line in which the token starts.
* * 'endLine' => Line in which the token ends.
* * 'startTokenPos' => Position in the token array of the first token in the node.
* * 'endTokenPos' => Position in the token array of the last token in the node.
* * 'startFilePos' => Offset into the code string at which the token starts.
* * 'endFilePos' => Offset into the code string at which the last character that
* is part of the token occurs.
*
* @param mixed $value Variable to store token content in
* @param mixed $startAttributes Variable to store start attributes in
@ -121,6 +125,9 @@ class Lexer
while (isset($this->tokens[++$this->pos])) {
$token = $this->tokens[$this->pos];
if (isset($this->usedAttributes['startTokenPos'])) {
$startAttributes['startTokenPos'] = $this->pos;
}
if (isset($this->usedAttributes['startFilePos'])) {
$startAttributes['startFilePos'] = $this->filePos;
}
@ -143,8 +150,11 @@ class Lexer
if (isset($this->usedAttributes['endLine'])) {
$endAttributes['endLine'] = $this->line;
}
if (isset($this->usedAttributes['endTokenPos'])) {
$endAttributes['endTokenPos'] = $this->pos;
}
if (isset($this->usedAttributes['endFilePos'])) {
$endAttributes['endFilePos'] = $this->filePos;
$endAttributes['endFilePos'] = $this->filePos - 1;
}
return $id;
@ -169,8 +179,11 @@ class Lexer
if (isset($this->usedAttributes['endLine'])) {
$endAttributes['endLine'] = $this->line;
}
if (isset($this->usedAttributes['endTokenPos'])) {
$endAttributes['endTokenPos'] = $this->pos;
}
if (isset($this->usedAttributes['endFilePos'])) {
$endAttributes['endFilePos'] = $this->filePos;
$endAttributes['endFilePos'] = $this->filePos - 1;
}
return $this->tokenMap[$token[0]];
@ -184,6 +197,20 @@ class Lexer
return 0;
}
/**
* Returns the token array for current code.
*
* The token array is in the same format as provided by the
* token_get_all() function and does not discard tokens (i.e.
* whitespace and comments are included). The 'startTokenPos' and
* 'endTokenPos' attributes are indexes into this token array.
*
* @return array Array of tokens in token_get_all() format
*/
public function getTokens() {
    // Hand back the stored token array unchanged.
    $allTokens = $this->tokens;

    return $allTokens;
}
/**
* Handles __halt_compiler() by returning the text after it.
*

View File

@ -131,19 +131,42 @@ class LexerTest extends \PHPUnit_Framework_TestCase
array(
array(
Parser::T_CONSTANT_ENCAPSED_STRING, '"a"',
array('startFilePos' => 6), array('endFilePos' => 9)
array('startFilePos' => 6), array('endFilePos' => 8)
),
array(
ord(';'), ';',
array('startFilePos' => 9), array('endFilePos' => 10)
array('startFilePos' => 9), array('endFilePos' => 9)
),
array(
Parser::T_CONSTANT_ENCAPSED_STRING, '"b"',
array('startFilePos' => 18), array('endFilePos' => 21)
array('startFilePos' => 18), array('endFilePos' => 20)
),
array(
ord(';'), ';',
array('startFilePos' => 21), array('endFilePos' => 22)
array('startFilePos' => 21), array('endFilePos' => 21)
),
)
),
// tests token offsets
array(
'<?php "a";' . "\n" . '// foo' . "\n" . '"b";',
array('usedAttributes' => array('startTokenPos', 'endTokenPos')),
array(
array(
Parser::T_CONSTANT_ENCAPSED_STRING, '"a"',
array('startTokenPos' => 1), array('endTokenPos' => 1)
),
array(
ord(';'), ';',
array('startTokenPos' => 2), array('endTokenPos' => 2)
),
array(
Parser::T_CONSTANT_ENCAPSED_STRING, '"b"',
array('startTokenPos' => 5), array('endTokenPos' => 5)
),
array(
ord(';'), ';',
array('startTokenPos' => 6), array('endTokenPos' => 6)
),
)
),
@ -187,4 +210,21 @@ class LexerTest extends \PHPUnit_Framework_TestCase
//array('<?php ... __halt_compiler /* */ ( ) ;Remaining Text', 'Remaining Text'),
);
}
public function testGetTokens() {
    // Source under test: a string statement, a line comment, then another string statement.
    $source = '<?php "a";' . "\n" . '// foo' . "\n" . '"b";';

    // Expected token list in token_get_all() format: the open tag, whitespace and
    // comment tokens are listed alongside the string literals and ';' characters.
    $rawTokens = array();
    $rawTokens[] = array(T_OPEN_TAG, '<?php ', 1);
    $rawTokens[] = array(T_CONSTANT_ENCAPSED_STRING, '"a"', 1);
    $rawTokens[] = ';';
    $rawTokens[] = array(T_WHITESPACE, "\n", 1);
    $rawTokens[] = array(T_COMMENT, '// foo' . "\n", 2);
    $rawTokens[] = array(T_CONSTANT_ENCAPSED_STRING, '"b"', 3);
    $rawTokens[] = ';';

    $lexerUnderTest = $this->getLexer();
    $lexerUnderTest->startLexing($source);

    $actualTokens = $lexerUnderTest->getTokens();
    $this->assertSame($rawTokens, $actualTokens);
}
}