From 767f23c3a98bd5e90ac222784b64baee283f9c0c Mon Sep 17 00:00:00 2001 From: nikic Date: Sat, 11 Oct 2014 21:45:43 +0200 Subject: [PATCH] Update lexer docs Remove some very questionable examples for changing startLexing() to accept a file name. Add token offset lexer implementation and usage example. --- doc/component/Lexer.markdown | 127 +++++++++++++++++++---------------- 1 file changed, 68 insertions(+), 59 deletions(-) diff --git a/doc/component/Lexer.markdown b/doc/component/Lexer.markdown index 314cb41..cd1b5ef 100644 --- a/doc/component/Lexer.markdown +++ b/doc/component/Lexer.markdown @@ -7,45 +7,22 @@ newer PHP versions and thus allows parsing of new code on older versions. A lexer has to define the following public interface: - startLexing($code); - getNextToken(&$value = null, &$startAttributes = null, &$endAttributes = null); - handleHaltCompiler(); + void startLexing(string $code); + string handleHaltCompiler(); + int getNextToken(string &$value = null, array &$startAttributes = null, array &$endAttributes = null); -startLexing ------------ +The `startLexing()` method is invoked with the source code that is to be lexed (including the opening tag) whenever the +`parse()` method of the parser is called. It can be used to reset state or preprocess the source code or tokens. -The `startLexing` method is invoked when the `parse()` method of the parser is called. It's argument will be whatever -was passed to the `parse()` method. +The `handleHaltCompiler()` method is called whenever a `T_HALT_COMPILER` token is encountered. It has to return the +remaining string after the construct (not including `();`). -Even though `startLexing` is meant to accept a source code string, you could for example overwrite it to accept a file: +The `getNextToken()` method returns the ID of the next token (as defined by the `Parser::T_*` constants). If no more +tokens are available it must return `0`, which is the ID of the `EOF` token. 
Furthermore the string content of the +token should be written into the by-reference `$value` parameter (which will then be available as `$n` in the parser). -```php -parse('someFile.php')); -var_dump($parser->parse('someOtherFile.php')); -``` - -getNextToken ------------- - -`getNextToken` returns the ID of the next token and sets some additional information in the three variables which it -accepts by-ref. If no more tokens are available it must return `0`, which is the ID of the `EOF` token. - -The first by-ref variable `$value` should contain the textual content of the token. It is what will be available as `$1` -etc in the parser. +Attribute handling +------------------ The other two by-ref variables `$startAttributes` and `$endAttributes` define which attributes will eventually be assigned to the generated nodes: The parser will take the `$startAttributes` from the first token which is part of the @@ -76,39 +53,71 @@ class LessAttributesLexer extends PhpParser\Lexer { } ``` -You can obviously also add additional attributes. E.g. in conjunction with the above `FileLexer` you might want to add -a `fileName` attribute to all nodes: +Token offset lexer +------------------ + +A useful application for custom attributes is the token offset lexer, which provides the start and end token for a node +as attributes: ```php fileName = $fileName; - parent::startLexing(file_get_contents($fileName)); - } - +class TokenOffsetLexer extends PhpParser\Lexer { public function getNextToken(&$value = null, &$startAttributes = null, &$endAttributes = null) { $tokenId = parent::getNextToken($value, $startAttributes, $endAttributes); - - // we could use either $startAttributes or $endAttributes here, because the fileName is always the same - // (regardless of whether it is the start or end token). We choose $endAttributes, because it is slightly - // more efficient (as the parser has to keep a stack for the $startAttributes). 
- $endAttributes['fileName'] = $this->fileName; - + $startAttributes['startOffset'] = $endAttributes['endOffset'] = $this->pos; return $tokenId; } + + public function getTokens() { + return $this->tokens; + } } ``` -handleHaltCompiler ------------------- +This information can now be used to examine the exact formatting used for a node. For example the AST does not +distinguish whether a property was declared using `public` or using `var`, but you can retrieve this information based +on the token offset: -The method is invoked whenever a `T_HALT_COMPILER` token is encountered. It has to return the remaining string after the -construct (not including `();`). +```php +function isDeclaredUsingVar(array $tokens, PhpParser\Node\Stmt\Property $prop) { + $i = $prop->getAttribute('startOffset'); + return $tokens[$i][0] === T_VAR; +} +``` + +In order to make use of this function, you will have to provide the tokens from the lexer to your node visitor using +code similar to the following: + +```php +class MyNodeVisitor extends PhpParser\NodeVisitorAbstract { + private $tokens; + public function setTokens(array $tokens) { + $this->tokens = $tokens; + } + + public function leaveNode(PhpParser\Node $node) { + if ($node instanceof PhpParser\Node\Stmt\Property) { + var_dump(isDeclaredUsingVar($this->tokens, $node)); + } + } +} + +$lexer = new TokenOffsetLexer(); +$parser = new PhpParser\Parser($lexer); + +$visitor = new MyNodeVisitor(); +$traverser = new PhpParser\NodeTraverser(); +$traverser->addVisitor($visitor); + +try { + $stmts = $parser->parse($code); + $visitor->setTokens($lexer->getTokens()); + $stmts = $traverser->traverse($stmts); +} catch (PhpParser\Error $e) { + echo 'Parse Error: ', $e->getMessage(); +} +``` + +The same approach can also be used to perform specific modifications in the code, without changing the formatting in +other places (which is the case when using the pretty printer).