From d98a65086b3daff08d5e31982ca87d6187efb22e Mon Sep 17 00:00:00 2001 From: nikic Date: Tue, 21 Feb 2012 17:56:07 +0100 Subject: [PATCH] Minor refactor and comments for emlative lexer The emulative lexer is a single dirty hack so it needs a few more comments :) --- lib/PHPParser/Lexer/Emulative.php | 77 ++++++++++++++++++++++++++----- 1 file changed, 65 insertions(+), 12 deletions(-) diff --git a/lib/PHPParser/Lexer/Emulative.php b/lib/PHPParser/Lexer/Emulative.php index 9aa89a2..bd21e26 100644 --- a/lib/PHPParser/Lexer/Emulative.php +++ b/lib/PHPParser/Lexer/Emulative.php @@ -23,16 +23,35 @@ class PHPParser_Lexer_Emulative extends PHPParser_Lexer public function __construct($code) { $this->inObjectAccess = false; - if (version_compare(PHP_VERSION, '5.4.0RC1', '<')) { - // binary notation - $code = preg_replace('(\b0b[01]+\b)', '~__EMU__BINARY__$0__~', $code); + // on PHP 5.4 don't do anything + if (version_compare(PHP_VERSION, '5.4.0RC1', '>=')) { + parent::__construct($code); + } else { + $code = $this->preprocessCode($code); + parent::__construct($code); + $this->postprocessTokens(); } + } + + /* + * Replaces new features in the code by ~__EMU__{NAME}__{DATA}__~ sequences. + * ~LABEL~ is never valid PHP code, that's why we can (to some degree) safely + * use it here. + * Later when preprocessing the tokens these sequences will either be replaced + * by real tokens or replaced with their original content (e.g. if they occured + * inside a string, i.e. a place where they don't have a special meaning). + */ + protected function preprocessCode($code) { + // binary notation (0b010101101001...) + $code = preg_replace('(\b0b[01]+\b)', '~__EMU__BINARY__$0__~', $code); if (version_compare(PHP_VERSION, '5.3.0', '<')) { - // namespace separator + // namespace separator (backslash not followed by some special characters, + // which are not valid after a NS separator, but would cause problems with + // escape sequence parsing if one would replace the backslash there) $code = preg_replace('(\\\\(?!["\'`$\\\\]))', '~__EMU__NS__~', $code); - // nowdoc + // nowdoc (<<<'ABC'\ncontent\nABC;) $code = preg_replace_callback( '((*BSR_ANYCRLF) # set \R to (?>\r\n|\r|\n) (b?<<<[\t ]*\'([a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]*)\'\R) # opening token @@ -45,9 +64,31 @@ class PHPParser_Lexer_Emulative extends PHPParser_Lexer ); } - parent::__construct($code); + return $code; + } + /* + * As nowdocs can have arbitrary content but LABELs can only contain a certain + * range of characters, the nowdoc content is encoded as hex and separated by + * 'x' tokens. So the result of the encoding will look like this: + * ~__EMU__NOWDOC__{HEX(START_TOKEN)}x{HEX(CONTENT)}x{HEX(END_TOKEN)}~ + */ + public function encodeNowdocCallback(array $matches) { + return '~__EMU__NOWDOC__' + . bin2hex($matches[1]) . 'x' . bin2hex($matches[3]) . 'x' . bin2hex($matches[4]) + . '__~'; + } + + /* + * Replaces the ~__EMU__...~ sequences with real tokens or their original + * value. + */ + protected function postprocessTokens() { + // we need to manually iterate and manage a count because we'll change + // the tokens array on the way for ($i = 0, $c = count($this->tokens); $i < $c; ++$i) { + // first check that the following tokens are form ~LABEL~, + // then match the __EMU__... sequence. if ('~' === $this->tokens[$i] && isset($this->tokens[$i + 2]) && '~' === $this->tokens[$i + 2] @@ -55,12 +96,18 @@ class PHPParser_Lexer_Emulative extends PHPParser_Lexer && preg_match('(^__EMU__([A-Z]++)__(?:([A-Za-z0-9]++)__)?$)', $this->tokens[$i + 1][1], $matches) ) { if ('BINARY' === $matches[1]) { + // the binary number can either be an integer or a double, so return a LNUMBER + // or DNUMBER respectively $replace = array( array(is_int(bindec($matches[2])) ? T_LNUMBER : T_DNUMBER, $matches[2], $this->tokens[$i + 1][2]) ); } elseif ('NS' === $matches[1]) { + // a \ single char token is returned here and replaced by a + // PHPParser_Parser::T_NS_SEPARATOR token in ->lex(). This hacks around the + // limitations arising from T_NS_SEPARATOR not being defined on 5.3 $replace = array('\\'); } elseif ('NOWDOC' === $matches[1]) { + // decode the encoded nowdoc payload; pack('H*' is bin2hex( for 5.3 list($start, $content, $end) = explode('x', $matches[2]); list($start, $content, $end) = array(pack('H*', $start), pack('H*', $content), pack('H*', $end)); @@ -71,11 +118,14 @@ class PHPParser_Lexer_Emulative extends PHPParser_Lexer } $replace[] = array(T_END_HEREDOC, $end, -1); } else { + // just ignore all other __EMU__ sequences continue; } array_splice($this->tokens, $i, 3, $replace); $c -= 3 - count($replace); + // for multichar tokens (e.g. strings) replace any ~__EMU__...~ sequences + // in their content with the original character sequence } elseif (is_array($this->tokens[$i]) && 0 !== strpos($this->tokens[$i][1], '__EMU__') ) { @@ -88,12 +138,10 @@ class PHPParser_Lexer_Emulative extends PHPParser_Lexer } } - public function encodeNowdocCallback(array $matches) { - return '~__EMU__NOWDOC__' - . bin2hex($matches[1]) . 'x' . bin2hex($matches[3]) . 'x' . bin2hex($matches[4]) - . '__~'; - } - + /* + * This method is a callback for restoring EMU sequences in + * multichar tokens (like strings) to their original value. + */ public function restoreContentCallback(array $matches) { if ('BINARY' === $matches[1]) { return $matches[2]; @@ -110,12 +158,17 @@ class PHPParser_Lexer_Emulative extends PHPParser_Lexer public function lex(&$value = null, &$line = null, &$docComment = null) { $token = parent::lex($value, $line, $docComment); + // replace new keywords by their respective tokens. This is not done + // if we currently are in an object access (e.g. in $obj->namespace + // "namespace" stays a T_STRING tokens and isn't converted to T_NAMESPACE) if (PHPParser_Parser::T_STRING === $token && !$this->inObjectAccess) { if (isset(self::$keywords[strtolower($value)])) { return self::$keywords[strtolower($value)]; } + // backslashes are replaced by T_NS_SEPARATOR tokens } elseif (92 === $token) { // ord('\\') return PHPParser_Parser::T_NS_SEPARATOR; + // keep track of whether we currently are in an object access (after ->) } elseif (PHPParser_Parser::T_OBJECT_OPERATOR === $token) { $this->inObjectAccess = true; } else {