From 767f23c3a98bd5e90ac222784b64baee283f9c0c Mon Sep 17 00:00:00 2001
From: nikic <nikita.ppv@googlemail.com>
Date: Sat, 11 Oct 2014 21:45:43 +0200
Subject: [PATCH] Update lexer docs

Remove some very questionable examples for changing startLexing()
to accept a file name.

Add token offset lexer implementation and usage example.
---
 doc/component/Lexer.markdown | 127 +++++++++++++++++++----------------
 1 file changed, 68 insertions(+), 59 deletions(-)

diff --git a/doc/component/Lexer.markdown b/doc/component/Lexer.markdown
index 314cb41..cd1b5ef 100644
--- a/doc/component/Lexer.markdown
+++ b/doc/component/Lexer.markdown
@@ -7,45 +7,22 @@ newer PHP versions and thus allows parsing of new code on older versions.
 
 A lexer has to define the following public interface:
 
-    startLexing($code);
-    getNextToken(&$value = null, &$startAttributes = null, &$endAttributes = null);
-    handleHaltCompiler();
+    void startLexing(string $code);
+    string handleHaltCompiler();
+    int getNextToken(string &$value = null, array &$startAttributes = null, array &$endAttributes = null);
 
-startLexing
------------
+The `startLexing()` method is invoked with the source code that is to be lexed (including the opening tag) whenever the
+`parse()` method of the parser is called. It can be used to reset state or preprocess the source code or tokens.
 
-The `startLexing` method is invoked when the `parse()` method of the parser is called. It's argument will be whatever
-was passed to the `parse()` method.
+The `handleHaltCompiler()` method is called whenever a `T_HALT_COMPILER` token is encountered. It has to return the
+remaining string after the construct (not including `();`).
 
-Even though `startLexing` is meant to accept a source code string, you could for example overwrite it to accept a file:
+The `getNextToken()` method returns the ID of the next token (as defined by the `Parser::T_*` constants). If no more
+tokens are available it must return `0`, which is the ID of the `EOF` token. Furthermore the string content of the
+token should be written into the by-reference `$value` parameter (which will then be available as `$n` in the parser).
 
-```php
-<?php
-
-class FileLexer extends PhpParser\Lexer {
-    public function startLexing($fileName) {
-        if (!file_exists($fileName)) {
-            throw new InvalidArgumentException(sprintf('File "%s" does not exist', $fileName));
-        }
-
-        parent::startLexing(file_get_contents($fileName));
-    }
-}
-
-$parser = new PhpParser\Parser(new FileLexer);
-
-var_dump($parser->parse('someFile.php'));
-var_dump($parser->parse('someOtherFile.php'));
-```
-
-getNextToken
-------------
-
-`getNextToken` returns the ID of the next token and sets some additional information in the three variables which it
-accepts by-ref. If no more tokens are available it must return `0`, which is the ID of the `EOF` token.
-
-The first by-ref variable `$value` should contain the textual content of the token. It is what will be available as `$1`
-etc in the parser.
+Attribute handling
+------------------
 
 The other two by-ref variables `$startAttributes` and `$endAttributes` define which attributes will eventually be
 assigned to the generated nodes: The parser will take the `$startAttributes` from the first token which is part of the
@@ -76,39 +53,71 @@ class LessAttributesLexer extends PhpParser\Lexer {
 }
 ```
 
-You can obviously also add additional attributes. E.g. in conjunction with the above `FileLexer` you might want to add
-a `fileName` attribute to all nodes:
+Token offset lexer
+------------------
+
+A useful application for custom attributes is the token offset lexer, which provides the offsets of the start and end
+tokens of a node as attributes:
 
 ```php
 <?php
 
-class FileLexer extends PhpParser\Lexer {
-    protected $fileName;
-
-    public function startLexing($fileName) {
-        if (!file_exists($fileName)) {
-            throw new InvalidArgumentException(sprintf('File "%s" does not exist', $fileName));
-        }
-
-        $this->fileName = $fileName;
-        parent::startLexing(file_get_contents($fileName));
-    }
-
+class TokenOffsetLexer extends PhpParser\Lexer {
     public function getNextToken(&$value = null, &$startAttributes = null, &$endAttributes = null) {
         $tokenId = parent::getNextToken($value, $startAttributes, $endAttributes);
-
-        // we could use either $startAttributes or $endAttributes here, because the fileName is always the same
-        // (regardless of whether it is the start or end token). We choose $endAttributes, because it is slightly
-        // more efficient (as the parser has to keep a stack for the $startAttributes).
-        $endAttributes['fileName'] = $this->fileName;
-
+        $startAttributes['startOffset'] = $endAttributes['endOffset'] = $this->pos;
         return $tokenId;
     }
+
+    public function getTokens() {
+        return $this->tokens;
+    }
 }
 ```
 
-handleHaltCompiler
-------------------
+This information can now be used to examine the exact formatting used for a node. For example, the AST does not
+distinguish whether a property was declared using `public` or using `var`, but you can retrieve this information based
+on the token offset:
 
-The method is invoked whenever a `T_HALT_COMPILER` token is encountered. It has to return the remaining string after the
-construct (not including `();`).
+```php
+function isDeclaredUsingVar(array $tokens, PhpParser\Node\Stmt\Property $prop) {
+    $i = $prop->getAttribute('startOffset');
+    return $tokens[$i][0] === T_VAR;
+}
+```
+
+In order to make use of this function, you will have to provide the tokens from the lexer to your node visitor using
+code similar to the following:
+
+```php
+class MyNodeVisitor extends PhpParser\NodeVisitorAbstract {
+    private $tokens;
+    public function setTokens(array $tokens) {
+        $this->tokens = $tokens;
+    }
+
+    public function leaveNode(PhpParser\Node $node) {
+        if ($node instanceof PhpParser\Node\Stmt\Property) {
+            var_dump(isDeclaredUsingVar($this->tokens, $node));
+        }
+    }
+}
+
+$lexer = new TokenOffsetLexer();
+$parser = new PhpParser\Parser($lexer);
+
+$visitor = new MyNodeVisitor();
+$traverser = new PhpParser\NodeTraverser();
+$traverser->addVisitor($visitor);
+
+try {
+    $stmts = $parser->parse($code);
+    $visitor->setTokens($lexer->getTokens());
+    $stmts = $traverser->traverse($stmts);
+} catch (PhpParser\Error $e) {
+    echo 'Parse Error: ', $e->getMessage();
+}
+```
+
+The same approach can also be used to perform specific modifications in the code without changing the formatting in
+other places (as would happen when using the pretty printer).