From 46975107a7ecb2443fb33c638053343eda8b4dd0 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 18 Dec 2014 23:26:17 +0100 Subject: [PATCH] Support token position attributes in lexer Also change endFilePos semantics to refer to the last character that is *included* in the token, rather than one past the last character. This ensures that all end* attributes have the same semantics. --- lib/PhpParser/Lexer.php | 49 ++++++++++++++++++++++++++++-------- test/PhpParser/LexerTest.php | 48 ++++++++++++++++++++++++++++++++--- 2 files changed, 82 insertions(+), 15 deletions(-) diff --git a/lib/PhpParser/Lexer.php b/lib/PhpParser/Lexer.php index f6767e1..566014d 100644 --- a/lib/PhpParser/Lexer.php +++ b/lib/PhpParser/Lexer.php @@ -20,8 +20,9 @@ class Lexer * * @param array $options Options array. Currently only the 'usedAttributes' option is supported, * which is an array of attributes to add to the AST nodes. Possible attributes - * are: 'comments', 'startLine', 'endLine', 'startFilePos', 'endFilePos'. The - * option defaults to the first three. For more info see getNextToken() docs. + * are: 'comments', 'startLine', 'endLine', 'startTokenPos', 'endTokenPos', + * 'startFilePos', 'endFilePos'. The option defaults to the first three. + * For more info see getNextToken() docs. */ public function __construct(array $options = array()) { // map from internal tokens to PhpParser tokens @@ -100,13 +101,16 @@ class Lexer * The available attributes are determined by the 'usedAttributes' option, which can * be specified in the constructor. The following attributes are supported: * - * * 'comments' => Array of PhpParser\Comment or PhpParser\Comment\Doc instances, - * representing all comments that occurred between the previous - * non-discarded token and the current one. - * * 'startLine' => Line in which the token starts. - * * 'endLine' => Line in which the token ends. - * * 'startFilePos' => Offset into the code string at which the token starts. - * * 'endFilePos' => EXPERIMENTAL! Offset into the code string one past where the token ends. + * * 'comments' => Array of PhpParser\Comment or PhpParser\Comment\Doc instances, + * representing all comments that occurred between the previous + * non-discarded token and the current one. + * * 'startLine' => Line in which the token starts. + * * 'endLine' => Line in which the token ends. + * * 'startTokenPos' => Position in the token array of the first token in the node. + * * 'endTokenPos' => Position in the token array of the last token in the node. + * * 'startFilePos' => Offset into the code string at which the token starts. + * * 'endFilePos' => Offset into the code string at which the last character that + * is part of the token occurs. * * @param mixed $value Variable to store token content in * @param mixed $startAttributes Variable to store start attributes in @@ -121,6 +125,9 @@ class Lexer while (isset($this->tokens[++$this->pos])) { $token = $this->tokens[$this->pos]; + if (isset($this->usedAttributes['startTokenPos'])) { + $startAttributes['startTokenPos'] = $this->pos; + } if (isset($this->usedAttributes['startFilePos'])) { $startAttributes['startFilePos'] = $this->filePos; } @@ -143,8 +150,11 @@ class Lexer if (isset($this->usedAttributes['endLine'])) { $endAttributes['endLine'] = $this->line; } + if (isset($this->usedAttributes['endTokenPos'])) { + $endAttributes['endTokenPos'] = $this->pos; + } if (isset($this->usedAttributes['endFilePos'])) { - $endAttributes['endFilePos'] = $this->filePos; + $endAttributes['endFilePos'] = $this->filePos - 1; } return $id; @@ -169,8 +179,11 @@ class Lexer if (isset($this->usedAttributes['endLine'])) { $endAttributes['endLine'] = $this->line; } + if (isset($this->usedAttributes['endTokenPos'])) { + $endAttributes['endTokenPos'] = $this->pos; + } if (isset($this->usedAttributes['endFilePos'])) { - $endAttributes['endFilePos'] = $this->filePos; + $endAttributes['endFilePos'] = $this->filePos - 1; } return $this->tokenMap[$token[0]]; @@ -184,6 +197,20 @@ class Lexer return 0; } + /** + * Returns the token array for current code. + * + * The token array is in the same format as provided by the + * token_get_all() function and does not discard tokens (i.e. + * whitespace and comments are included). The token position + * attributes are against this token array. + * + * @return array Array of tokens in token_get_all() format + */ + public function getTokens() { + return $this->tokens; + } + /** * Handles __halt_compiler() by returning the text after it. * diff --git a/test/PhpParser/LexerTest.php b/test/PhpParser/LexerTest.php index fd82166..ad2dc30 100644 --- a/test/PhpParser/LexerTest.php +++ b/test/PhpParser/LexerTest.php @@ -131,19 +131,42 @@ class LexerTest extends \PHPUnit_Framework_TestCase array( array( Parser::T_CONSTANT_ENCAPSED_STRING, '"a"', - array('startFilePos' => 6), array('endFilePos' => 9) + array('startFilePos' => 6), array('endFilePos' => 8) ), array( ord(';'), ';', - array('startFilePos' => 9), array('endFilePos' => 10) + array('startFilePos' => 9), array('endFilePos' => 9) ), array( Parser::T_CONSTANT_ENCAPSED_STRING, '"b"', - array('startFilePos' => 18), array('endFilePos' => 21) + array('startFilePos' => 18), array('endFilePos' => 20) ), array( ord(';'), ';', - array('startFilePos' => 21), array('endFilePos' => 22) + array('startFilePos' => 21), array('endFilePos' => 21) + ), + ) + ), + // tests token offsets + array( + ' array('startTokenPos', 'endTokenPos')), + array( + array( + Parser::T_CONSTANT_ENCAPSED_STRING, '"a"', + array('startTokenPos' => 1), array('endTokenPos' => 1) + ), + array( + ord(';'), ';', + array('startTokenPos' => 2), array('endTokenPos' => 2) + ), + array( + Parser::T_CONSTANT_ENCAPSED_STRING, '"b"', + array('startTokenPos' => 5), array('endTokenPos' => 5) + ), + array( + ord(';'), ';', + array('startTokenPos' => 6), array('endTokenPos' => 6) ), ) ), @@ -187,4 +210,21 @@ class LexerTest extends \PHPUnit_Framework_TestCase //array('getLexer(); + $lexer->startLexing($code); + $this->assertSame($expectedTokens, $lexer->getTokens()); + } }