Support token position attributes in lexer

Also change endFilePos semantics to refer to the last character that is *included* in the token, rather than one past the last character. This ensures that all end* attributes have the same semantics.
2024-11-26 20:14:46 +01:00 · 2014-12-18 23:26:17 +01:00 · 2014-12-18 23:26:17 +01:00 · 46975107a7
commit 46975107a7
parent e0f3e8a492
2 changed files with 82 additions and 15 deletions
--- a/lib/PhpParser/Lexer.php
+++ b/lib/PhpParser/Lexer.php
@@ -20,8 +20,9 @@ class Lexer
     *
     * @param array $options Options array. Currently only the 'usedAttributes' option is supported,
     *                       which is an array of attributes to add to the AST nodes. Possible attributes
-     *                       are: 'comments', 'startLine', 'endLine', 'startFilePos', 'endFilePos'. The
-     *                       option defaults to the first three. For more info see getNextToken() docs.
+     *                       are: 'comments', 'startLine', 'endLine', 'startTokenPos', 'endTokenPos',
+     *                       'startFilePos', 'endFilePos'. The option defaults to the first three.
+     *                       For more info see getNextToken() docs.
     */
    public function __construct(array $options = array()) {
        // map from internal tokens to PhpParser tokens
@@ -100,13 +101,16 @@ class Lexer
     * The available attributes are determined by the 'usedAttributes' option, which can
     * be specified in the constructor. The following attributes are supported:
     *
-     *  * 'comments'     => Array of PhpParser\Comment or PhpParser\Comment\Doc instances,
-     *                      representing all comments that occurred between the previous
-     *                      non-discarded token and the current one.
-     *  * 'startLine'    => Line in which the token starts.
-     *  * 'endLine'      => Line in which the token ends.
-     *  * 'startFilePos' => Offset into the code string at which the token starts.
-     *  * 'endFilePos'   => EXPERIMENTAL! Offset into the code string one past where the token ends.
+     *  * 'comments'      => Array of PhpParser\Comment or PhpParser\Comment\Doc instances,
+     *                       representing all comments that occurred between the previous
+     *                       non-discarded token and the current one.
+     *  * 'startLine'     => Line in which the token starts.
+     *  * 'endLine'       => Line in which the token ends.
+     *  * 'startTokenPos' => Position in the token array of the first token in the node.
+     *  * 'endTokenPos'   => Position in the token array of the last token in the node.
+     *  * 'startFilePos'  => Offset into the code string at which the token starts.
+     *  * 'endFilePos'    => Offset into the code string of the last character that is
+     *                       part of the token (inclusive, not one past the end).
     *
     * @param mixed $value           Variable to store token content in
     * @param mixed $startAttributes Variable to store start attributes in
@@ -121,6 +125,9 @@ class Lexer
        while (isset($this->tokens[++$this->pos])) {
            $token = $this->tokens[$this->pos];

+            if (isset($this->usedAttributes['startTokenPos'])) {
+                $startAttributes['startTokenPos'] = $this->pos;
+            }
            if (isset($this->usedAttributes['startFilePos'])) {
                $startAttributes['startFilePos'] = $this->filePos;
            }
@@ -143,8 +150,11 @@ class Lexer
                if (isset($this->usedAttributes['endLine'])) {
                    $endAttributes['endLine'] = $this->line;
                }
+                if (isset($this->usedAttributes['endTokenPos'])) {
+                    $endAttributes['endTokenPos'] = $this->pos;
+                }
                if (isset($this->usedAttributes['endFilePos'])) {
-                    $endAttributes['endFilePos'] = $this->filePos;
+                    $endAttributes['endFilePos'] = $this->filePos - 1;
                }

                return $id;
@@ -169,8 +179,11 @@ class Lexer
                    if (isset($this->usedAttributes['endLine'])) {
                        $endAttributes['endLine'] = $this->line;
                    }
+                    if (isset($this->usedAttributes['endTokenPos'])) {
+                        $endAttributes['endTokenPos'] = $this->pos;
+                    }
                    if (isset($this->usedAttributes['endFilePos'])) {
-                        $endAttributes['endFilePos'] = $this->filePos;
+                        $endAttributes['endFilePos'] = $this->filePos - 1;
                    }

                    return $this->tokenMap[$token[0]];
@@ -184,6 +197,20 @@ class Lexer
        return 0;
    }

+    /**
+     * Returns the token array for the current code.
+     *
+     * The token array is in the same format as provided by the
+     * token_get_all() function and does not discard tokens (i.e.
+     * whitespace and comments are included). The 'startTokenPos'
+     * and 'endTokenPos' attributes are indexes into this array.
+     *
+     * @return array Array of tokens in token_get_all() format
+     */
+    public function getTokens() {
+        return $this->tokens;
+    }
+
    /**
     * Handles __halt_compiler() by returning the text after it.
     *
--- a/test/PhpParser/LexerTest.php
+++ b/test/PhpParser/LexerTest.php
@@ -131,19 +131,42 @@ class LexerTest extends \PHPUnit_Framework_TestCase
                array(
                    array(
                        Parser::T_CONSTANT_ENCAPSED_STRING, '"a"',
-                        array('startFilePos' => 6), array('endFilePos' => 9)
+                        array('startFilePos' => 6), array('endFilePos' => 8)
                    ),
                    array(
                        ord(';'), ';',
-                        array('startFilePos' => 9), array('endFilePos' => 10)
+                        array('startFilePos' => 9), array('endFilePos' => 9)
                    ),
                    array(
                        Parser::T_CONSTANT_ENCAPSED_STRING, '"b"',
-                        array('startFilePos' => 18), array('endFilePos' => 21)
+                        array('startFilePos' => 18), array('endFilePos' => 20)
                    ),
                    array(
                        ord(';'), ';',
-                        array('startFilePos' => 21), array('endFilePos' => 22)
+                        array('startFilePos' => 21), array('endFilePos' => 21)
+                    ),
+                )
+            ),
+            // tests token offsets
+            array(
+                '<?php "a";' . "\n" . '// foo' . "\n" . '"b";',
+                array('usedAttributes' => array('startTokenPos', 'endTokenPos')),
+                array(
+                    array(
+                        Parser::T_CONSTANT_ENCAPSED_STRING, '"a"',
+                        array('startTokenPos' => 1), array('endTokenPos' => 1)
+                    ),
+                    array(
+                        ord(';'), ';',
+                        array('startTokenPos' => 2), array('endTokenPos' => 2)
+                    ),
+                    array(
+                        Parser::T_CONSTANT_ENCAPSED_STRING, '"b"',
+                        array('startTokenPos' => 5), array('endTokenPos' => 5)
+                    ),
+                    array(
+                        ord(';'), ';',
+                        array('startTokenPos' => 6), array('endTokenPos' => 6)
                    ),
                )
            ),
@@ -187,4 +210,21 @@ class LexerTest extends \PHPUnit_Framework_TestCase
            //array('<?php ... __halt_compiler /* */ ( ) ;Remaining Text', 'Remaining Text'),
        );
    }
+
+    public function testGetTokens() { // Checks that getTokens() exposes the full token array after lexing.
+        $code = '<?php "a";' . "\n" . '// foo' . "\n" . '"b";'; // Two string statements separated by a newline and a line comment.
+        $expectedTokens = array( // Entries are array(token id, text, line), or a bare string for ';'.
+            array(T_OPEN_TAG, '<?php ', 1),
+            array(T_CONSTANT_ENCAPSED_STRING, '"a"', 1),
+            ';',
+            array(T_WHITESPACE, "\n", 1), // Whitespace is kept in the array.
+            array(T_COMMENT, '// foo' . "\n", 2), // Comments are kept too (text includes the trailing newline).
+            array(T_CONSTANT_ENCAPSED_STRING, '"b"', 3),
+            ';',
+        );
+
+        $lexer = $this->getLexer();
+        $lexer->startLexing($code);
+        $this->assertSame($expectedTokens, $lexer->getTokens()); // Strict comparison of the whole array.
+    }
 }