php-parser/lib/PHPParser/Lexer/Emulative.php
nikic 417a8bb07e Add support for yield expressions (PHP 5.5)
This adds a new Yield expression type, with subnodes key and value.
2012-09-07 23:41:57 +02:00

200 lines
8.5 KiB
PHP

<?php
/**
* ATTENTION: This code is WRITE-ONLY. Do not try to read it.
*/
class PHPParser_Lexer_Emulative extends PHPParser_Lexer
{
    /**
     * Keywords that the running PHP version does not know yet, mapped from
     * their lowercased spelling to the parser token ID that should be
     * returned for them.
     *
     * @var array
     */
    protected $newKeywords;

    /**
     * True directly after a "->" token was returned by getNextToken(). While
     * set, the next T_STRING is not converted into a keyword token.
     *
     * @var bool
     */
    protected $inObjectAccess;

    /**
     * Collects the keywords that were introduced after the running PHP
     * version, so that getNextToken() can emulate them.
     */
    public function __construct() {
        parent::__construct();

        // ordered from the newest to the oldest version; the order matters
        // for the early exit in the loop below
        $newKeywordsPerVersion = array(
            '5.5.0-dev' => array(
                'finally'       => PHPParser_Parser::T_FINALLY,
                'yield'         => PHPParser_Parser::T_YIELD,
            ),
            '5.4.0-dev' => array(
                'callable'      => PHPParser_Parser::T_CALLABLE,
                'insteadof'     => PHPParser_Parser::T_INSTEADOF,
                'trait'         => PHPParser_Parser::T_TRAIT,
                '__trait__'     => PHPParser_Parser::T_TRAIT_C,
            ),
            '5.3.0-dev' => array(
                '__dir__'       => PHPParser_Parser::T_DIR,
                'goto'          => PHPParser_Parser::T_GOTO,
                'namespace'     => PHPParser_Parser::T_NAMESPACE,
                '__namespace__' => PHPParser_Parser::T_NS_C,
            ),
        );

        $this->newKeywords = array();
        foreach ($newKeywordsPerVersion as $version => $newKeywords) {
            // as soon as the running PHP is at least $version, this entry and
            // all following (older) ones are lexed natively
            if (version_compare(PHP_VERSION, $version, '>=')) {
                break;
            }

            $this->newKeywords += $newKeywords;
        }
    }

    /**
     * Starts lexing $code. On PHP versions older than 5.4.0RC1 the code is
     * rewritten before and the token stream is fixed up after the parent
     * lexer ran.
     *
     * @param string $code The source code to lex
     */
    public function startLexing($code) {
        $this->inObjectAccess = false;

        // no source rewriting is done on PHP >= 5.4.0RC1; newer keywords are
        // handled purely in getNextToken()
        if (version_compare(PHP_VERSION, '5.4.0RC1', '>=')) {
            parent::startLexing($code);
            return;
        }

        parent::startLexing($this->preprocessCode($code));
        $this->postprocessTokens();
    }

    /**
     * Replaces new features in the code by ~__EMU__{NAME}__{DATA}__~ sequences.
     * ~LABEL~ is never valid PHP code, that's why we can (to some degree) safely
     * use it here.
     * Later, when postprocessing the tokens, these sequences will either be
     * replaced by real tokens or by their original content (e.g. if they
     * occurred inside a string, i.e. a place where they don't have a special
     * meaning).
     *
     * @param string $code Original source code
     *
     * @return string Source code with the placeholder sequences inserted
     */
    protected function preprocessCode($code) {
        // binary notation (0b010101101001...)
        $code = preg_replace('(\b0b[01]+\b)', '~__EMU__BINARY__$0__~', $code);

        if (version_compare(PHP_VERSION, '5.3.0', '<')) {
            // namespace separator (backslash not followed by some special characters,
            // which are not valid after a NS separator, but would cause problems with
            // escape sequence parsing if one would replace the backslash there)
            $code = preg_replace('(\\\\(?!["\'`${\\\\]))', '~__EMU__NS__~', $code);

            // nowdoc (<<<'ABC'\ncontent\nABC;)
            $code = preg_replace_callback(
                '((*BSR_ANYCRLF) # set \R to (?>\r\n|\r|\n)
                (b?<<<[\t ]*\'([a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]*)\'\R) # opening token
                ((?:(?!\2;?\R).*\R)*) # content
                (\2) # closing token
                (?=;?\R) # must be followed by newline (with optional semicolon)
                )x',
                array($this, 'encodeNowdocCallback'),
                $code
            );
        }

        return $code;
    }

    /**
     * Callback of preprocessCode() that encodes one nowdoc.
     *
     * As nowdocs can have arbitrary content but LABELs can only contain a certain
     * range of characters, the nowdoc parts are encoded as hex and separated by
     * 'x' characters. So the result of the encoding will look like this:
     * ~__EMU__NOWDOC__{HEX(START_TOKEN)}x{HEX(CONTENT)}x{HEX(END_TOKEN)}__~
     *
     * @param array $matches Regex groups: 1 = opening token, 3 = content, 4 = closing label
     *
     * @return string The placeholder sequence
     */
    public function encodeNowdocCallback(array $matches) {
        return '~__EMU__NOWDOC__'
            . bin2hex($matches[1]) . 'x' . bin2hex($matches[3]) . 'x' . bin2hex($matches[4])
            . '__~';
    }

    /**
     * Decodes the payload produced by encodeNowdocCallback().
     *
     * @param string $payload Three hex strings joined by 'x'
     *
     * @return array array($startToken, $content, $endToken)
     */
    private function decodeNowdocPayload($payload) {
        list($start, $content, $end) = explode('x', $payload);

        // pack('H*', ...) reverses bin2hex(); hex2bin() only exists as of PHP 5.4
        return array(pack('H*', $start), pack('H*', $content), pack('H*', $end));
    }

    /**
     * Replaces the ~__EMU__...~ sequences with real tokens or their original
     * value.
     */
    protected function postprocessTokens() {
        // we need to manually iterate and manage a count because we'll change
        // the tokens array on the way
        for ($i = 0, $c = count($this->tokens); $i < $c; ++$i) {
            // first check that the following tokens are of the form ~LABEL~,
            // then match the __EMU__... sequence.
            if ('~' === $this->tokens[$i]
                && isset($this->tokens[$i + 2])
                && '~' === $this->tokens[$i + 2]
                && T_STRING === $this->tokens[$i + 1][0]
                && preg_match('(^__EMU__([A-Z]++)__(?:([A-Za-z0-9]++)__)?$)', $this->tokens[$i + 1][1], $matches)
            ) {
                if ('BINARY' === $matches[1]) {
                    // the binary number can either be an integer or a double, so return a LNUMBER
                    // or DNUMBER respectively
                    $type    = is_int(bindec($matches[2])) ? T_LNUMBER : T_DNUMBER;
                    $replace = array(
                        array($type, $matches[2], $this->tokens[$i + 1][2])
                    );
                } elseif ('NS' === $matches[1]) {
                    // a \ single char token is returned here and replaced by a
                    // PHPParser_Parser::T_NS_SEPARATOR token in ->getNextToken(). This hacks
                    // around the native T_NS_SEPARATOR constant not being available on the
                    // PHP versions that need this emulation (before 5.3)
                    $replace = array('\\');
                } elseif ('NOWDOC' === $matches[1]) {
                    list($start, $content, $end) = $this->decodeNowdocPayload($matches[2]);

                    $replace   = array();
                    $replace[] = array(T_START_HEREDOC, $start, $this->tokens[$i + 1][2]);
                    if ('' !== $content) {
                        $replace[] = array(T_ENCAPSED_AND_WHITESPACE, $content, -1);
                    }
                    $replace[] = array(T_END_HEREDOC, $end, -1);
                } else {
                    // just ignore all other __EMU__ sequences
                    continue;
                }

                // swap the three tokens (~, LABEL, ~) for the replacement and
                // adjust the tracked count by the size difference. Only the first
                // replacement token is skipped by ++$i, so e.g. decoded nowdoc
                // content is still visited and restored by the branch below.
                array_splice($this->tokens, $i, 3, $replace);
                $c -= 3 - count($replace);
            // for multichar tokens (e.g. strings) replace any ~__EMU__...~ sequences
            // in their content with the original character sequence. strpos() is
            // compared against false (not 0): a placeholder at any offset,
            // including in a token that itself starts with "__EMU__", must be
            // restored, and tokens without a placeholder skip the regex.
            } elseif (is_array($this->tokens[$i])
                && false !== strpos($this->tokens[$i][1], '~__EMU__')
            ) {
                $this->tokens[$i][1] = preg_replace_callback(
                    '(~__EMU__([A-Z]++)__(?:([A-Za-z0-9]++)__)?~)',
                    array($this, 'restoreContentCallback'),
                    $this->tokens[$i][1]
                );
            }
        }
    }

    /**
     * This method is a callback for restoring EMU sequences in
     * multichar tokens (like strings) to their original value.
     *
     * @param array $matches Regex groups: 0 = whole sequence, 1 = name, 2 = data (optional)
     *
     * @return string The original text; unknown sequences are returned unchanged
     */
    public function restoreContentCallback(array $matches) {
        if ('BINARY' === $matches[1]) {
            return $matches[2];
        }

        if ('NS' === $matches[1]) {
            return '\\';
        }

        if ('NOWDOC' === $matches[1]) {
            return implode('', $this->decodeNowdocPayload($matches[2]));
        }

        return $matches[0];
    }

    /**
     * Fetches the next token from the parent lexer and applies the keyword
     * and namespace separator emulation to it.
     *
     * All three arguments are forwarded by reference to the parent lexer.
     *
     * @param mixed $value           Forwarded to parent::getNextToken()
     * @param mixed $startAttributes Forwarded to parent::getNextToken()
     * @param mixed $endAttributes   Forwarded to parent::getNextToken()
     *
     * @return int Token ID
     */
    public function getNextToken(&$value = null, &$startAttributes = null, &$endAttributes = null) {
        $token = parent::getNextToken($value, $startAttributes, $endAttributes);

        // replace new keywords by their respective tokens. This is not done
        // if we currently are in an object access (e.g. in $obj->namespace
        // "namespace" stays a T_STRING token and isn't converted to T_NAMESPACE)
        if (PHPParser_Parser::T_STRING === $token && !$this->inObjectAccess) {
            $keyword = strtolower($value);
            if (isset($this->newKeywords[$keyword])) {
                return $this->newKeywords[$keyword];
            }
        // backslashes are replaced by T_NS_SEPARATOR tokens
        } elseif (92 === $token) { // ord('\\')
            return PHPParser_Parser::T_NS_SEPARATOR;
        // keep track of whether we currently are in an object access (after ->)
        } elseif (PHPParser_Parser::T_OBJECT_OPERATOR === $token) {
            $this->inObjectAccess = true;
        } else {
            $this->inObjectAccess = false;
        }

        return $token;
    }
}