From 46975107a7ecb2443fb33c638053343eda8b4dd0 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@googlemail.com>
Date: Thu, 18 Dec 2014 23:26:17 +0100
Subject: [PATCH] Support token position attributes in lexer

Also change endFilePos semantics to refer to the last character that
is *included* in the token, rather than one past the last character.
This ensures that all end* attributes have the same semantics.
---
 lib/PhpParser/Lexer.php      | 49 ++++++++++++++++++++++++++++--------
 test/PhpParser/LexerTest.php | 48 ++++++++++++++++++++++++++++++++---
 2 files changed, 82 insertions(+), 15 deletions(-)

diff --git a/lib/PhpParser/Lexer.php b/lib/PhpParser/Lexer.php
index f6767e1..566014d 100644
--- a/lib/PhpParser/Lexer.php
+++ b/lib/PhpParser/Lexer.php
@@ -20,8 +20,9 @@ class Lexer
      *
      * @param array $options Options array. Currently only the 'usedAttributes' option is supported,
      *                       which is an array of attributes to add to the AST nodes. Possible attributes
-     *                       are: 'comments', 'startLine', 'endLine', 'startFilePos', 'endFilePos'. The
-     *                       option defaults to the first three. For more info see getNextToken() docs.
+     *                       are: 'comments', 'startLine', 'endLine', 'startTokenPos', 'endTokenPos',
+     *                       'startFilePos', 'endFilePos'. The option defaults to the first three.
+     *                       For more info see getNextToken() docs.
      */
     public function __construct(array $options = array()) {
         // map from internal tokens to PhpParser tokens
@@ -100,13 +101,16 @@ class Lexer
      * The available attributes are determined by the 'usedAttributes' option, which can
      * be specified in the constructor. The following attributes are supported:
      *
-     *  * 'comments'     => Array of PhpParser\Comment or PhpParser\Comment\Doc instances,
-     *                      representing all comments that occurred between the previous
-     *                      non-discarded token and the current one.
-     *  * 'startLine'    => Line in which the token starts.
-     *  * 'endLine'      => Line in which the token ends.
-     *  * 'startFilePos' => Offset into the code string at which the token starts.
-     *  * 'endFilePos'   => EXPERIMENTAL! Offset into the code string one past where the token ends.
+     *  * 'comments'      => Array of PhpParser\Comment or PhpParser\Comment\Doc instances,
+     *                       representing all comments that occurred between the previous
+     *                       non-discarded token and the current one.
+     *  * 'startLine'     => Line in which the token starts.
+     *  * 'endLine'       => Line in which the token ends.
+     *  * 'startTokenPos' => Position in the token array of the first token in the node.
+     *  * 'endTokenPos'   => Position in the token array of the last token in the node.
+     *  * 'startFilePos'  => Offset into the code string at which the token starts.
+     *  * 'endFilePos'    => Offset into the code string at which the last character that
+     *                       is part of the token occurs.
      *
      * @param mixed $value           Variable to store token content in
      * @param mixed $startAttributes Variable to store start attributes in
@@ -121,6 +125,9 @@ class Lexer
         while (isset($this->tokens[++$this->pos])) {
             $token = $this->tokens[$this->pos];
 
+            if (isset($this->usedAttributes['startTokenPos'])) {
+                $startAttributes['startTokenPos'] = $this->pos;
+            }
             if (isset($this->usedAttributes['startFilePos'])) {
                 $startAttributes['startFilePos'] = $this->filePos;
             }
@@ -143,8 +150,11 @@ class Lexer
                 if (isset($this->usedAttributes['endLine'])) {
                     $endAttributes['endLine'] = $this->line;
                 }
+                if (isset($this->usedAttributes['endTokenPos'])) {
+                    $endAttributes['endTokenPos'] = $this->pos;
+                }
                 if (isset($this->usedAttributes['endFilePos'])) {
-                    $endAttributes['endFilePos'] = $this->filePos;
+                    $endAttributes['endFilePos'] = $this->filePos - 1;
                 }
 
                 return $id;
@@ -169,8 +179,11 @@ class Lexer
                     if (isset($this->usedAttributes['endLine'])) {
                         $endAttributes['endLine'] = $this->line;
                     }
+                    if (isset($this->usedAttributes['endTokenPos'])) {
+                        $endAttributes['endTokenPos'] = $this->pos;
+                    }
                     if (isset($this->usedAttributes['endFilePos'])) {
-                        $endAttributes['endFilePos'] = $this->filePos;
+                        $endAttributes['endFilePos'] = $this->filePos - 1;
                     }
 
                     return $this->tokenMap[$token[0]];
@@ -184,6 +197,20 @@ class Lexer
         return 0;
     }
 
+    /**
+     * Returns the token array for the code currently being lexed.
+     *
+     * The array has the same format as the result of token_get_all()
+     * and no tokens are discarded from it (i.e. whitespace and comments
+     * are included). The 'startTokenPos' and 'endTokenPos' attributes
+     * are indices into this array.
+     *
+     * @return array Array of tokens in token_get_all() format
+     */
+    public function getTokens() {
+        return $this->tokens;
+    }
+
     /**
      * Handles __halt_compiler() by returning the text after it.
      *
diff --git a/test/PhpParser/LexerTest.php b/test/PhpParser/LexerTest.php
index fd82166..ad2dc30 100644
--- a/test/PhpParser/LexerTest.php
+++ b/test/PhpParser/LexerTest.php
@@ -131,19 +131,42 @@ class LexerTest extends \PHPUnit_Framework_TestCase
                 array(
                     array(
                         Parser::T_CONSTANT_ENCAPSED_STRING, '"a"',
-                        array('startFilePos' => 6), array('endFilePos' => 9)
+                        array('startFilePos' => 6), array('endFilePos' => 8)
                     ),
                     array(
                         ord(';'), ';',
-                        array('startFilePos' => 9), array('endFilePos' => 10)
+                        array('startFilePos' => 9), array('endFilePos' => 9)
                     ),
                     array(
                         Parser::T_CONSTANT_ENCAPSED_STRING, '"b"',
-                        array('startFilePos' => 18), array('endFilePos' => 21)
+                        array('startFilePos' => 18), array('endFilePos' => 20)
                     ),
                     array(
                         ord(';'), ';',
-                        array('startFilePos' => 21), array('endFilePos' => 22)
+                        array('startFilePos' => 21), array('endFilePos' => 21)
+                    ),
+                )
+            ),
+            // tests token offsets
+            array(
+                '<?php "a";' . "\n" . '// foo' . "\n" . '"b";',
+                array('usedAttributes' => array('startTokenPos', 'endTokenPos')),
+                array(
+                    array(
+                        Parser::T_CONSTANT_ENCAPSED_STRING, '"a"',
+                        array('startTokenPos' => 1), array('endTokenPos' => 1)
+                    ),
+                    array(
+                        ord(';'), ';',
+                        array('startTokenPos' => 2), array('endTokenPos' => 2)
+                    ),
+                    array(
+                        Parser::T_CONSTANT_ENCAPSED_STRING, '"b"',
+                        array('startTokenPos' => 5), array('endTokenPos' => 5)
+                    ),
+                    array(
+                        ord(';'), ';',
+                        array('startTokenPos' => 6), array('endTokenPos' => 6)
                     ),
                 )
             ),
@@ -187,4 +210,21 @@ class LexerTest extends \PHPUnit_Framework_TestCase
             //array('<?php ... __halt_compiler /* */ ( ) ;Remaining Text', 'Remaining Text'),
         );
     }
+
+    public function testGetTokens() { // getTokens() must expose the raw, unfiltered token array
+        $code = '<?php "a";' . "\n" . '// foo' . "\n" . '"b";';
+        $expectedTokens = array( // entries: array(token id, text, line) or a bare string
+            array(T_OPEN_TAG, '<?php ', 1), // index 0
+            array(T_CONSTANT_ENCAPSED_STRING, '"a"', 1), // index 1
+            ';', // index 2; single-character tokens are plain strings
+            array(T_WHITESPACE, "\n", 1), // index 3; whitespace is expected in the result
+            array(T_COMMENT, '// foo' . "\n", 2), // index 4; comment text includes the trailing newline
+            array(T_CONSTANT_ENCAPSED_STRING, '"b"', 3), // index 5
+            ';', // index 6
+        );
+
+        $lexer = $this->getLexer();
+        $lexer->startLexing($code);
+        $this->assertSame($expectedTokens, $lexer->getTokens()); // strict comparison of the whole array
+    }
 }