mirror of
https://github.com/danog/PHP-Parser.git
synced 2025-01-22 13:51:12 +01:00
Minor refactor and comments for emlative lexer
The emulative lexer is a single dirty hack so it needs a few more comments :)
This commit is contained in:
parent
608cfbba4e
commit
d98a65086b
@ -23,16 +23,35 @@ class PHPParser_Lexer_Emulative extends PHPParser_Lexer
|
|||||||
public function __construct($code) {
|
public function __construct($code) {
|
||||||
$this->inObjectAccess = false;
|
$this->inObjectAccess = false;
|
||||||
|
|
||||||
if (version_compare(PHP_VERSION, '5.4.0RC1', '<')) {
|
// on PHP 5.4 don't do anything
|
||||||
// binary notation
|
if (version_compare(PHP_VERSION, '5.4.0RC1', '>=')) {
|
||||||
$code = preg_replace('(\b0b[01]+\b)', '~__EMU__BINARY__$0__~', $code);
|
parent::__construct($code);
|
||||||
|
} else {
|
||||||
|
$code = $this->preprocessCode($code);
|
||||||
|
parent::__construct($code);
|
||||||
|
$this->postprocessTokens();
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Replaces new features in the code by ~__EMU__{NAME}__{DATA}__~ sequences.
|
||||||
|
* ~LABEL~ is never valid PHP code, that's why we can (to some degree) safely
|
||||||
|
* use it here.
|
||||||
|
* Later when preprocessing the tokens these sequences will either be replaced
|
||||||
|
* by real tokens or replaced with their original content (e.g. if they occured
|
||||||
|
* inside a string, i.e. a place where they don't have a special meaning).
|
||||||
|
*/
|
||||||
|
protected function preprocessCode($code) {
|
||||||
|
// binary notation (0b010101101001...)
|
||||||
|
$code = preg_replace('(\b0b[01]+\b)', '~__EMU__BINARY__$0__~', $code);
|
||||||
|
|
||||||
if (version_compare(PHP_VERSION, '5.3.0', '<')) {
|
if (version_compare(PHP_VERSION, '5.3.0', '<')) {
|
||||||
// namespace separator
|
// namespace separator (backslash not followed by some special characters,
|
||||||
|
// which are not valid after a NS separator, but would cause problems with
|
||||||
|
// escape sequence parsing if one would replace the backslash there)
|
||||||
$code = preg_replace('(\\\\(?!["\'`$\\\\]))', '~__EMU__NS__~', $code);
|
$code = preg_replace('(\\\\(?!["\'`$\\\\]))', '~__EMU__NS__~', $code);
|
||||||
|
|
||||||
// nowdoc
|
// nowdoc (<<<'ABC'\ncontent\nABC;)
|
||||||
$code = preg_replace_callback(
|
$code = preg_replace_callback(
|
||||||
'((*BSR_ANYCRLF) # set \R to (?>\r\n|\r|\n)
|
'((*BSR_ANYCRLF) # set \R to (?>\r\n|\r|\n)
|
||||||
(b?<<<[\t ]*\'([a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]*)\'\R) # opening token
|
(b?<<<[\t ]*\'([a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]*)\'\R) # opening token
|
||||||
@ -45,9 +64,31 @@ class PHPParser_Lexer_Emulative extends PHPParser_Lexer
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
parent::__construct($code);
|
return $code;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* As nowdocs can have arbitrary content but LABELs can only contain a certain
|
||||||
|
* range of characters, the nowdoc content is encoded as hex and separated by
|
||||||
|
* 'x' tokens. So the result of the encoding will look like this:
|
||||||
|
* ~__EMU__NOWDOC__{HEX(START_TOKEN)}x{HEX(CONTENT)}x{HEX(END_TOKEN)}~
|
||||||
|
*/
|
||||||
|
public function encodeNowdocCallback(array $matches) {
|
||||||
|
return '~__EMU__NOWDOC__'
|
||||||
|
. bin2hex($matches[1]) . 'x' . bin2hex($matches[3]) . 'x' . bin2hex($matches[4])
|
||||||
|
. '__~';
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Replaces the ~__EMU__...~ sequences with real tokens or their original
|
||||||
|
* value.
|
||||||
|
*/
|
||||||
|
protected function postprocessTokens() {
|
||||||
|
// we need to manually iterate and manage a count because we'll change
|
||||||
|
// the tokens array on the way
|
||||||
for ($i = 0, $c = count($this->tokens); $i < $c; ++$i) {
|
for ($i = 0, $c = count($this->tokens); $i < $c; ++$i) {
|
||||||
|
// first check that the following tokens are form ~LABEL~,
|
||||||
|
// then match the __EMU__... sequence.
|
||||||
if ('~' === $this->tokens[$i]
|
if ('~' === $this->tokens[$i]
|
||||||
&& isset($this->tokens[$i + 2])
|
&& isset($this->tokens[$i + 2])
|
||||||
&& '~' === $this->tokens[$i + 2]
|
&& '~' === $this->tokens[$i + 2]
|
||||||
@ -55,12 +96,18 @@ class PHPParser_Lexer_Emulative extends PHPParser_Lexer
|
|||||||
&& preg_match('(^__EMU__([A-Z]++)__(?:([A-Za-z0-9]++)__)?$)', $this->tokens[$i + 1][1], $matches)
|
&& preg_match('(^__EMU__([A-Z]++)__(?:([A-Za-z0-9]++)__)?$)', $this->tokens[$i + 1][1], $matches)
|
||||||
) {
|
) {
|
||||||
if ('BINARY' === $matches[1]) {
|
if ('BINARY' === $matches[1]) {
|
||||||
|
// the binary number can either be an integer or a double, so return a LNUMBER
|
||||||
|
// or DNUMBER respectively
|
||||||
$replace = array(
|
$replace = array(
|
||||||
array(is_int(bindec($matches[2])) ? T_LNUMBER : T_DNUMBER, $matches[2], $this->tokens[$i + 1][2])
|
array(is_int(bindec($matches[2])) ? T_LNUMBER : T_DNUMBER, $matches[2], $this->tokens[$i + 1][2])
|
||||||
);
|
);
|
||||||
} elseif ('NS' === $matches[1]) {
|
} elseif ('NS' === $matches[1]) {
|
||||||
|
// a \ single char token is returned here and replaced by a
|
||||||
|
// PHPParser_Parser::T_NS_SEPARATOR token in ->lex(). This hacks around the
|
||||||
|
// limitations arising from T_NS_SEPARATOR not being defined on 5.3
|
||||||
$replace = array('\\');
|
$replace = array('\\');
|
||||||
} elseif ('NOWDOC' === $matches[1]) {
|
} elseif ('NOWDOC' === $matches[1]) {
|
||||||
|
// decode the encoded nowdoc payload; pack('H*' is bin2hex( for 5.3
|
||||||
list($start, $content, $end) = explode('x', $matches[2]);
|
list($start, $content, $end) = explode('x', $matches[2]);
|
||||||
list($start, $content, $end) = array(pack('H*', $start), pack('H*', $content), pack('H*', $end));
|
list($start, $content, $end) = array(pack('H*', $start), pack('H*', $content), pack('H*', $end));
|
||||||
|
|
||||||
@ -71,11 +118,14 @@ class PHPParser_Lexer_Emulative extends PHPParser_Lexer
|
|||||||
}
|
}
|
||||||
$replace[] = array(T_END_HEREDOC, $end, -1);
|
$replace[] = array(T_END_HEREDOC, $end, -1);
|
||||||
} else {
|
} else {
|
||||||
|
// just ignore all other __EMU__ sequences
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
array_splice($this->tokens, $i, 3, $replace);
|
array_splice($this->tokens, $i, 3, $replace);
|
||||||
$c -= 3 - count($replace);
|
$c -= 3 - count($replace);
|
||||||
|
// for multichar tokens (e.g. strings) replace any ~__EMU__...~ sequences
|
||||||
|
// in their content with the original character sequence
|
||||||
} elseif (is_array($this->tokens[$i])
|
} elseif (is_array($this->tokens[$i])
|
||||||
&& 0 !== strpos($this->tokens[$i][1], '__EMU__')
|
&& 0 !== strpos($this->tokens[$i][1], '__EMU__')
|
||||||
) {
|
) {
|
||||||
@ -88,12 +138,10 @@ class PHPParser_Lexer_Emulative extends PHPParser_Lexer
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public function encodeNowdocCallback(array $matches) {
|
/*
|
||||||
return '~__EMU__NOWDOC__'
|
* This method is a callback for restoring EMU sequences in
|
||||||
. bin2hex($matches[1]) . 'x' . bin2hex($matches[3]) . 'x' . bin2hex($matches[4])
|
* multichar tokens (like strings) to their original value.
|
||||||
. '__~';
|
*/
|
||||||
}
|
|
||||||
|
|
||||||
public function restoreContentCallback(array $matches) {
|
public function restoreContentCallback(array $matches) {
|
||||||
if ('BINARY' === $matches[1]) {
|
if ('BINARY' === $matches[1]) {
|
||||||
return $matches[2];
|
return $matches[2];
|
||||||
@ -110,12 +158,17 @@ class PHPParser_Lexer_Emulative extends PHPParser_Lexer
|
|||||||
public function lex(&$value = null, &$line = null, &$docComment = null) {
|
public function lex(&$value = null, &$line = null, &$docComment = null) {
|
||||||
$token = parent::lex($value, $line, $docComment);
|
$token = parent::lex($value, $line, $docComment);
|
||||||
|
|
||||||
|
// replace new keywords by their respective tokens. This is not done
|
||||||
|
// if we currently are in an object access (e.g. in $obj->namespace
|
||||||
|
// "namespace" stays a T_STRING tokens and isn't converted to T_NAMESPACE)
|
||||||
if (PHPParser_Parser::T_STRING === $token && !$this->inObjectAccess) {
|
if (PHPParser_Parser::T_STRING === $token && !$this->inObjectAccess) {
|
||||||
if (isset(self::$keywords[strtolower($value)])) {
|
if (isset(self::$keywords[strtolower($value)])) {
|
||||||
return self::$keywords[strtolower($value)];
|
return self::$keywords[strtolower($value)];
|
||||||
}
|
}
|
||||||
|
// backslashes are replaced by T_NS_SEPARATOR tokens
|
||||||
} elseif (92 === $token) { // ord('\\')
|
} elseif (92 === $token) { // ord('\\')
|
||||||
return PHPParser_Parser::T_NS_SEPARATOR;
|
return PHPParser_Parser::T_NS_SEPARATOR;
|
||||||
|
// keep track of whether we currently are in an object access (after ->)
|
||||||
} elseif (PHPParser_Parser::T_OBJECT_OPERATOR === $token) {
|
} elseif (PHPParser_Parser::T_OBJECT_OPERATOR === $token) {
|
||||||
$this->inObjectAccess = true;
|
$this->inObjectAccess = true;
|
||||||
} else {
|
} else {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user