Minor refactor and comments for emlative lexer

The emulative lexer is a single dirty hack so it needs a few more comments :)
2025-01-22 05:11:39 +01:00 · 2012-02-21 17:56:07 +01:00 · 2012-02-21 17:56:07 +01:00 · d98a65086b
commit d98a65086b
parent 608cfbba4e
1 changed files with 65 additions and 12 deletions
--- a/lib/PHPParser/Lexer/Emulative.php
+++ b/lib/PHPParser/Lexer/Emulative.php
@ -23,16 +23,35 @@ class PHPParser_Lexer_Emulative extends PHPParser_Lexer
    public function __construct($code) {
        $this->inObjectAccess = false;

-        if (version_compare(PHP_VERSION, '5.4.0RC1', '<')) {
-            // binary notation
-            $code = preg_replace('(\b0b[01]+\b)', '~__EMU__BINARY__$0__~', $code);
+        // on PHP 5.4 don't do anything
+        if (version_compare(PHP_VERSION, '5.4.0RC1', '>=')) {
+            parent::__construct($code);
+        } else {
+            $code = $this->preprocessCode($code);
+            parent::__construct($code);
+            $this->postprocessTokens();
        }
+    }
+
+    /*
+     * Replaces new features in the code by ~__EMU__{NAME}__{DATA}__~ sequences.
+     * ~LABEL~ is never valid PHP code, that's why we can (to some degree) safely
+     * use it here.
+     * Later when preprocessing the tokens these sequences will either be replaced
+     * by real tokens or replaced with their original content (e.g. if they occured
+     * inside a string, i.e. a place where they don't have a special meaning).
+     */
+    protected function preprocessCode($code) {
+        // binary notation (0b010101101001...)
+        $code = preg_replace('(\b0b[01]+\b)', '~__EMU__BINARY__$0__~', $code);

        if (version_compare(PHP_VERSION, '5.3.0', '<')) {
-            // namespace separator
+            // namespace separator (backslash not followed by some special characters,
+            // which are not valid after a NS separator, but would cause problems with
+            // escape sequence parsing if one would replace the backslash there)
            $code = preg_replace('(\\\\(?!["\'`$\\\\]))', '~__EMU__NS__~', $code);

-            // nowdoc
+            // nowdoc (<<<'ABC'\ncontent\nABC;)
            $code = preg_replace_callback(
                '((*BSR_ANYCRLF)        # set \R to (?>\r\n|\r|\n)
                  (b?<<<[\t ]*\'([a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]*)\'\R) # opening token
@ -45,9 +64,31 @@ class PHPParser_Lexer_Emulative extends PHPParser_Lexer
            );
        }

-        parent::__construct($code);
+        return $code;
+    }

+    /*
+     * As nowdocs can have arbitrary content but LABELs can only contain a certain
+     * range of characters, the nowdoc content is encoded as hex and separated by
+     * 'x' tokens. So the result of the encoding will look like this:
+     * ~__EMU__NOWDOC__{HEX(START_TOKEN)}x{HEX(CONTENT)}x{HEX(END_TOKEN)}~
+     */
+    public function encodeNowdocCallback(array $matches) {
+        return '~__EMU__NOWDOC__'
+                . bin2hex($matches[1]) . 'x' . bin2hex($matches[3]) . 'x' . bin2hex($matches[4])
+                . '__~';
+    }
+
+    /*
+     * Replaces the ~__EMU__...~ sequences with real tokens or their original
+     * value.
+     */
+    protected function postprocessTokens() {
+        // we need to manually iterate and manage a count because we'll change
+        // the tokens array on the way
        for ($i = 0, $c = count($this->tokens); $i < $c; ++$i) {
+            // first check that the following tokens are form ~LABEL~,
+            // then match the __EMU__... sequence.
            if ('~' === $this->tokens[$i]
                && isset($this->tokens[$i + 2])
                && '~' === $this->tokens[$i + 2]
@ -55,12 +96,18 @@ class PHPParser_Lexer_Emulative extends PHPParser_Lexer
                && preg_match('(^__EMU__([A-Z]++)__(?:([A-Za-z0-9]++)__)?$)', $this->tokens[$i + 1][1], $matches)
            ) {
                if ('BINARY' === $matches[1]) {
+                    // the binary number can either be an integer or a double, so return a LNUMBER
+                    // or DNUMBER respectively
                    $replace = array(
                        array(is_int(bindec($matches[2])) ? T_LNUMBER : T_DNUMBER, $matches[2], $this->tokens[$i + 1][2])
                    );
                } elseif ('NS' === $matches[1]) {
+                    // a \ single char token is returned here and replaced by a
+                    // PHPParser_Parser::T_NS_SEPARATOR token in ->lex(). This hacks around the
+                    // limitations arising from T_NS_SEPARATOR not being defined on 5.3
                    $replace = array('\\');
                } elseif ('NOWDOC' === $matches[1]) {
+                    // decode the encoded nowdoc payload; pack('H*' is bin2hex( for 5.3
                    list($start, $content, $end) = explode('x', $matches[2]);
                    list($start, $content, $end) = array(pack('H*', $start), pack('H*', $content), pack('H*', $end));

@ -71,11 +118,14 @@ class PHPParser_Lexer_Emulative extends PHPParser_Lexer
                    }
                    $replace[] = array(T_END_HEREDOC, $end, -1);
                } else {
+                    // just ignore all other __EMU__ sequences
                    continue;
                }

                array_splice($this->tokens, $i, 3, $replace);
                $c -= 3 - count($replace);
+            // for multichar tokens (e.g. strings) replace any ~__EMU__...~ sequences
+            // in their content with the original character sequence
            } elseif (is_array($this->tokens[$i])
                      && 0 !== strpos($this->tokens[$i][1], '__EMU__')
            ) {
@ -88,12 +138,10 @@ class PHPParser_Lexer_Emulative extends PHPParser_Lexer
        }
    }

-    public function encodeNowdocCallback(array $matches) {
-        return '~__EMU__NOWDOC__'
-             . bin2hex($matches[1]) . 'x' . bin2hex($matches[3]) . 'x' . bin2hex($matches[4])
-             . '__~';
-    }
-
+    /*
+     * This method is a callback for restoring EMU sequences in
+     * multichar tokens (like strings) to their original value.
+     */
    public function restoreContentCallback(array $matches) {
        if ('BINARY' === $matches[1]) {
            return $matches[2];
@ -110,12 +158,17 @@ class PHPParser_Lexer_Emulative extends PHPParser_Lexer
    public function lex(&$value = null, &$line = null, &$docComment = null) {
        $token = parent::lex($value, $line, $docComment);

+        // replace new keywords by their respective tokens. This is not done
+        // if we currently are in an object access (e.g. in $obj->namespace
+        // "namespace" stays a T_STRING tokens and isn't converted to T_NAMESPACE)
        if (PHPParser_Parser::T_STRING === $token && !$this->inObjectAccess) {
            if (isset(self::$keywords[strtolower($value)])) {
                return self::$keywords[strtolower($value)];
            }
+        // backslashes are replaced by T_NS_SEPARATOR tokens
        } elseif (92 === $token) { // ord('\\')
            return PHPParser_Parser::T_NS_SEPARATOR;
+        // keep track of whether we currently are in an object access (after ->)
        } elseif (PHPParser_Parser::T_OBJECT_OPERATOR === $token) {
            $this->inObjectAccess = true;
        } else {