From a7797918b84deed17d769017faf8f7124ddd3a67 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 19 Dec 2014 00:06:09 +0100 Subject: [PATCH] Update lexer docs for attribute options --- doc/component/Lexer.markdown | 149 ++++++++++++++++++++--------------- lib/PhpParser/Lexer.php | 13 ++- 2 files changed, 90 insertions(+), 72 deletions(-) diff --git a/doc/component/Lexer.markdown b/doc/component/Lexer.markdown index b9b6e8b..c1185f1 100644 --- a/doc/component/Lexer.markdown +++ b/doc/component/Lexer.markdown @@ -5,83 +5,47 @@ The lexer is responsible for providing tokens to the parser. The project comes w `PhpParser\Lexer\Emulative`. The latter is an extension of the former, which adds the ability to emulate tokens of newer PHP versions and thus allows parsing of new code on older versions. -A lexer has to define the following public interface: +This documentation discusses options available for the default lexers and explains how lexers can be extended. - void startLexing(string $code); - string handleHaltCompiler(); - int getNextToken(string &$value = null, array &$startAttributes = null, array &$endAttributes = null); +Lexer options +------------- -The `startLexing()` method is invoked with the source code that is to be lexed (including the opening tag) whenever the -`parse()` method of the parser is called. It can be used to reset state or preprocess the source code or tokens. - -The `handleHaltCompiler()` method is called whenever a `T_HALT_COMPILER` token is encountered. It has to return the -remaining string after the construct (not including `();`). - -The `getNextToken()` method returns the ID of the next token (as defined by the `Parser::T_*` constants). If no more -tokens are available it must return `0`, which is the ID of the `EOF` token. Furthermore the string content of the -token should be written into the by-reference `$value` parameter (which will then be available as `$n` in the parser). 
- -Attribute handling ------------------- - -The other two by-ref variables `$startAttributes` and `$endAttributes` define which attributes will eventually be -assigned to the generated nodes: The parser will take the `$startAttributes` from the first token which is part of the -node and the `$endAttributes` from the last token that is part of the node. - -E.g. if the tokens `T_FUNCTION T_STRING ... '{' ... '}'` constitute a node, then the `$startAttributes` from the -`T_FUNCTION` token will be taken and the `$endAttributes` from the `'}'` token. - -By default the lexer creates the attributes `startLine`, `comments` (both part of `$startAttributes`) and `endLine` -(part of `$endAttributes`). - -If you don't want all these attributes to be added (to reduce memory usage of the AST) you can simply remove them by -overriding the method: +The two default lexers accept an `$options` array in the constructor. Currently only the `'usedAttributes'` option is +supported, which allows you to specify which attributes will be added to the AST nodes. The attributes can then be +accessed using `$node->getAttribute()`, `$node->setAttribute()`, `$node->hasAttribute()` and `$node->getAttributes()` +methods. A sample options array: ```php - array( + 'comments', 'startLine', 'endLine' + ) +)); ``` -Token offset lexer ------------------- +The attributes used in this example match the default behavior of the lexer. The following attributes are supported: -A useful application for custom attributes is the token offset lexer, which provides the start and end token for a node -as attributes: + * `comments`: Array of `PhpParser\Comment` or `PhpParser\Comment\Doc` instances, representing all comments that occurred + between the previous non-discarded token and the current one. Use of this attribute is required for the + `$node->getDocComment()` method to work. The attribute is also needed if you wish the pretty printer to retain + comments present in the original code. 
+ * `startLine`: Line in which the node starts. This attribute is required for the `$node->getLine()` method to work. It is also + required if syntax errors should contain line number information. + * `endLine`: Line in which the node ends. + * `startTokenPos`: Offset into the token array of the first token in the node. + * `endTokenPos`: Offset into the token array of the last token in the node. + * `startFilePos`: Offset into the code string of the first character that is part of the node. + * `endFilePos`: Offset into the code string of the last character that is part of the node. -```php -pos; - return $tokenId; - } - - public function getTokens() { - return $this->tokens; - } -} -``` - -This information can now be used to examine the exact formatting used for a node. For example the AST does not -distinguish whether a property was declared using `public` or using `var`, but you can retrieve this information based -on the token offset: +The token offset information is useful if you wish to examine the exact formatting used for a node. For example, the AST +does not distinguish whether a property was declared using `public` or using `var`, but you can retrieve this +information based on the token position: ```php function isDeclaredUsingVar(array $tokens, PhpParser\Node\Stmt\Property $prop) { - $i = $prop->getAttribute('startOffset'); + $i = $prop->getAttribute('startTokenPos'); return $tokens[$i][0] === T_VAR; } ``` @@ -121,3 +85,58 @@ try { The same approach can also be used to perform specific modifications in the code, without changing the formatting in other places (which is the case when using the pretty printer).
+ +Lexer extension +--------------- + +A lexer has to define the following public interface: + + void startLexing(string $code); + array getTokens(); + string handleHaltCompiler(); + int getNextToken(string &$value = null, array &$startAttributes = null, array &$endAttributes = null); + +The `startLexing()` method is invoked with the source code that is to be lexed (including the opening tag) whenever the +`parse()` method of the parser is called. It can be used to reset state or preprocess the source code or tokens. + +The `getTokens()` method returns the current token array, in the usual `token_get_all()` format. This method is not +used by the parser (which uses `getNextToken()`), but is useful in combination with the token position attributes. + +The `handleHaltCompiler()` method is called whenever a `T_HALT_COMPILER` token is encountered. It has to return the +remaining string after the construct (not including `();`). + +The `getNextToken()` method returns the ID of the next token (as defined by the `Parser::T_*` constants). If no more +tokens are available it must return `0`, which is the ID of the `EOF` token. Furthermore the string content of the +token should be written into the by-reference `$value` parameter (which will then be available as `$n` in the parser). + +### Attribute handling + +The other two by-ref variables `$startAttributes` and `$endAttributes` define which attributes will eventually be +assigned to the generated nodes: The parser will take the `$startAttributes` from the first token which is part of the +node and the `$endAttributes` from the last token that is part of the node. + +E.g. if the tokens `T_FUNCTION T_STRING ... '{' ... '}'` constitute a node, then the `$startAttributes` from the +`T_FUNCTION` token will be taken and the `$endAttributes` from the `'}'` token. 
+ +An application of custom attributes is storing the original formatting of literals: The parser does not retain +information about the formatting of integers (like decimal vs. hexadecimal) or strings (like the quote type or +escape sequences used). This can be remedied by storing the original value in an attribute: + +```php +class KeepOriginalValueLexer extends PhpParser\Lexer // or PhpParser\Lexer\Emulative +{ + public function getNextToken(&$value = null, &$startAttributes = null, &$endAttributes = null) { + $tokenId = parent::getNextToken($value, $startAttributes, $endAttributes); + + if ($tokenId == PhpParser\Parser::T_CONSTANT_ENCAPSED_STRING // non-interpolated string + || $tokenId == PhpParser\Parser::T_LNUMBER // integer + || $tokenId == PhpParser\Parser::T_DNUMBER // floating point number + ) { + // could also use $startAttributes, doesn't really matter here + $endAttributes['originalValue'] = $value; + } + + return $tokenId; + } +} +``` diff --git a/lib/PhpParser/Lexer.php b/lib/PhpParser/Lexer.php index 566014d..a809382 100644 --- a/lib/PhpParser/Lexer.php +++ b/lib/PhpParser/Lexer.php @@ -104,13 +104,12 @@ class Lexer * * 'comments' => Array of PhpParser\Comment or PhpParser\Comment\Doc instances, * representing all comments that occurred between the previous * non-discarded token and the current one. - * * 'startLine' => Line in which the token starts. - * * 'endLine' => Line in which the token ends. - * * 'startTokenPos' => Position in the token array of the first token in the node. - * * 'endTokenPos' => Position in the token array of the last token in the node. - * * 'startFilePos' => Offset into the code string at which the token starts. - * * 'endFilePos' => Offset into the code string at which the last character that - * is part of the token occurs. + * * 'startLine' => Line in which the node starts. + * * 'endLine' => Line in which the node ends. + * * 'startTokenPos' => Offset into the token array of the first token in the node.
+ * * 'endTokenPos' => Offset into the token array of the last token in the node. + * * 'startFilePos' => Offset into the code string of the first character that is part of the node. + * * 'endFilePos' => Offset into the code string of the last character that is part of the node. * * @param mixed $value Variable to store token content in * @param mixed $startAttributes Variable to store start attributes in