php-parser/lib/PHPParser/Lexer.php

<?php

/**
 * Lexer wrapping PHP's token_get_all().
 *
 * It tokenizes a source string, drops tokens that are not handed to the
 * consumer (whitespace, comments, open tags), translates PHP's internal
 * token ids into the ids defined as constants on PHPParser_Parser and
 * keeps track of the current line and the last seen doc comment.
 */
class PHPParser_Lexer
{
    /** @var string Source code passed to startLexing(); kept for __halt_compiler() handling */
    protected $code;
    /** @var array Tokens as returned by token_get_all() */
    protected $tokens;
    /** @var int Index into $tokens of the token returned last (-1 before the first call) */
    protected $pos;
    /** @var int Line counter, advanced by the newlines contained in array tokens */
    protected $line;

    /** @var array Map from PHP's internal token ids to PHPParser_Parser token ids */
    protected $tokenMap;
    /** @var array Set (token id => 1) of tokens that are skipped while lexing */
    protected $dropTokens;

    /**
     * Creates a Lexer.
     */
    public function __construct() {
        // map from internal tokens to PHPParser tokens
        $this->tokenMap = $this->createTokenMap();

        // map of tokens to drop while lexing (the map is only used for isset lookup,
        // that's why the value is simply set to 1; the value is never actually used.)
        $this->dropTokens = array_fill_keys(array(T_WHITESPACE, T_COMMENT, T_OPEN_TAG), 1);
    }

    /**
     * Initializes the lexer for lexing the provided source code.
     *
     * @param string $code The source code to lex
     *
     * @throws PHPParser_Error on lexing errors (unterminated comment or unexpected character)
     */
    public function startLexing($code) {
        // token_get_all() reports problems as (suppressed) PHP errors only, so the
        // last-error state is reset before and inspected after the call.
        $this->resetErrors();
        $this->tokens = @token_get_all($code);
        $this->handleErrors();

        $this->code = $code; // keep the code around for __halt_compiler() handling
        $this->pos  = -1;
        $this->line =  1;
    }

    /**
     * Brings error_get_last() into a known state before tokenizing, so that
     * handleErrors() does not pick up an error left over from earlier code.
     */
    protected function resetErrors() {
        if (function_exists('error_clear_last')) {
            // available on newer PHP versions: really clears the last error
            error_clear_last();
        } else {
            // overwrite error_get_last() by forcing an undefined variable error
            @$undefinedVariable;
        }
    }

    /**
     * Converts the error recorded by token_get_all() (if any) into an exception.
     *
     * @throws PHPParser_Error if the last error is one of the known lexer errors
     */
    protected function handleErrors() {
        $error = error_get_last();

        // nothing was recorded since resetErrors(), so there is nothing to report
        if (null === $error) {
            return;
        }

        $message = $error['message'];

        if (preg_match(
            '~^Unterminated comment starting line ([0-9]+)$~',
            $message, $matches
        )) {
            throw new PHPParser_Error('Unterminated comment', $matches[1]);
        }

        if (preg_match(
            '~^Unexpected character in input:  \'(.)\' \(ASCII=([0-9]+)\)~s',
            $message, $matches
        )) {
            throw new PHPParser_Error(sprintf(
                'Unexpected character "%s" (ASCII %d)',
                $matches[1], $matches[2]
            ));
        }

        // PHP cuts error message after null byte, so need special case
        if (preg_match('~^Unexpected character in input:  \'$~', $message)) {
            throw new PHPParser_Error('Unexpected null byte');
        }
    }

    /**
     * Returns the next token id.
     *
     * Dropped tokens are skipped. A doc comment encountered while skipping is
     * not returned as a token but stored in $docComment; $docComment is reset
     * to null at the start of every call.
     *
     * @param mixed $value      Variable to store token content in
     * @param mixed $line       Variable to store line in
     * @param mixed $docComment Variable to store doc comment in
     *
     * @return int Token id
     */
    public function getNextToken(&$value = null, &$line = null, &$docComment = null) {
        $docComment = null;

        while (isset($this->tokens[++$this->pos])) {
            $token = $this->tokens[$this->pos];

            if (is_string($token)) {
                // string tokens carry no line information, use the tracked line
                $line = $this->line;

                // bug in token_get_all
                if ('b"' === $token) {
                    $value = 'b"';
                    return ord('"');
                } else {
                    // the token id is the ASCII value of the token's first character
                    $value = $token;
                    return ord($token);
                }
            } else {
                // keep the line counter in sync, also for tokens that are dropped
                $this->line += substr_count($token[1], "\n");

                if (T_DOC_COMMENT === $token[0]) {
                    $docComment = $token[1];
                } elseif (!isset($this->dropTokens[$token[0]])) {
                    $value = $token[1];
                    $line  = $token[2];
                    return $this->tokenMap[$token[0]];
                }
            }
        }

        // 0 is the EOF token
        return 0;
    }

    /**
     * Handles __halt_compiler() by returning the text after it.
     *
     * Relies on $this->pos being the index of the T_HALT_COMPILER token, i.e.
     * it is meant to be called right after getNextToken() returned that token.
     * Afterwards the lexer does not return any further tokens.
     *
     * @return string Remaining text
     *
     * @throws PHPParser_Error if __halt_compiler is not directly followed by "();" or "() ?>"
     */
    public function handleHaltCompiler() {
        // get the length of the text before the T_HALT_COMPILER token
        $textBefore = '';
        for ($i = 0; $i <= $this->pos; ++$i) {
            if (is_string($this->tokens[$i])) {
                $textBefore .= $this->tokens[$i];
            } else {
                $textBefore .= $this->tokens[$i][1];
            }
        }

        // text after T_HALT_COMPILER, still including ();
        $textAfter = substr($this->code, strlen($textBefore));

        // ensure that it is followed by ();
        // this simplifies the situation, by not allowing any comments
        // in between of the tokens.
        // The pattern has to be anchored: the length of the match is cut off from
        // the start of $textAfter below, which is only correct if the match starts
        // there. After a closing tag a single newline is consumed as well.
        if (!preg_match('~^\s*\(\s*\)\s*(?:;|\?>\r?\n?)~', $textAfter, $matches)) {
            throw new PHPParser_Error('__halt_compiler must be followed by "();"');
        }

        // prevent the lexer from returning any further tokens
        $this->pos = count($this->tokens);

        // return with (); removed
        return (string) substr($textAfter, strlen($matches[0])); // (string) converts false to ''
    }

    /**
     * Creates the token map.
     *
     * The token map maps the PHP internal token identifiers
     * to the identifiers used by the Parser. Additionally it
     * maps T_OPEN_TAG_WITH_ECHO to T_ECHO and T_CLOSE_TAG to ';'.
     *
     * Internal tokens without a same-named constant on PHPParser_Parser get
     * no entry in the map.
     *
     * @return array The token map
     */
    protected function createTokenMap() {
        $tokenMap = array();

        // 256 is the minimum possible token number, as everything below
        // it is an ASCII value
        for ($i = 256; $i < 1000; ++$i) {
            // T_DOUBLE_COLON is equivalent to T_PAAMAYIM_NEKUDOTAYIM
            if (T_DOUBLE_COLON === $i) {
                $tokenMap[$i] = PHPParser_Parser::T_PAAMAYIM_NEKUDOTAYIM;
            // T_OPEN_TAG_WITH_ECHO with dropped T_OPEN_TAG results in T_ECHO
            } elseif(T_OPEN_TAG_WITH_ECHO === $i) {
                $tokenMap[$i] = PHPParser_Parser::T_ECHO;
            // T_CLOSE_TAG is equivalent to ';'
            } elseif(T_CLOSE_TAG === $i) {
                $tokenMap[$i] = ord(';');
            // and the others can be mapped directly
            } elseif ('UNKNOWN' !== ($name = token_name($i))
                      && defined($name = 'PHPParser_Parser::' . $name)
            ) {
                $tokenMap[$i] = constant($name);
            }
        }

        return $tokenMap;
    }
}
Initial commit 2011-04-18 19:02:30 +02:00			`<?php`

Prefix all classes with PHPParser_ 2011-06-05 18:40:04 +02:00			`class PHPParser_Lexer`
Initial commit 2011-04-18 19:02:30 +02:00			`{`
Make halt_compiler() work 2011-06-03 22:02:02 +02:00			`protected $code;`
Initial commit 2011-04-18 19:02:30 +02:00			`protected $tokens;`
			`protected $pos;`
Associate some line information with nodes (currently the line the node ends in, as the starting line is harder to fetch) 2011-06-12 17:12:47 +02:00			`protected $line;`
Initial commit 2011-04-18 19:02:30 +02:00
Use inject-once approach for lexer Now the lexer is injected only once when creating the parser. Instead of $parser = new PHPParser_Parser; $parser->parse(new PHPParser_Lexer($code)); $parser->parse(new PHPParser_Lexer($code2)); you write: $parser = new PHPParser_Parser(new PHPParser_Lexer); $parser->parse($code); $parser->parse($code2); 2012-04-25 20:04:46 +02:00			`protected $tokenMap;`
			`protected $dropTokens;`
Initial commit 2011-04-18 19:02:30 +02:00
Add doccomments and slightly change some APIs 2011-05-31 16:33:11 +02:00			`/**`
			`* Creates a Lexer.`
Use inject-once approach for lexer Now the lexer is injected only once when creating the parser. Instead of $parser = new PHPParser_Parser; $parser->parse(new PHPParser_Lexer($code)); $parser->parse(new PHPParser_Lexer($code2)); you write: $parser = new PHPParser_Parser(new PHPParser_Lexer); $parser->parse($code); $parser->parse($code2); 2012-04-25 20:04:46 +02:00			`*/`
			`public function __construct() {`
			`// map from internal tokens to PHPParser tokens`
			`$this->tokenMap = $this->createTokenMap();`

			`// map of tokens to drop while lexing (the map is only used for isset lookup,`
			`// that's why the value is simply set to 1; the value is never actually used.)`
			`$this->dropTokens = array_fill_keys(array(T_WHITESPACE, T_COMMENT, T_OPEN_TAG), 1);`
			`}`

			`/**`
			`* Initializes the lexer for lexing the provided source code.`
Add doccomments and slightly change some APIs 2011-05-31 16:33:11 +02:00			`*`
Use inject-once approach for lexer Now the lexer is injected only once when creating the parser. Instead of $parser = new PHPParser_Parser; $parser->parse(new PHPParser_Lexer($code)); $parser->parse(new PHPParser_Lexer($code2)); you write: $parser = new PHPParser_Parser(new PHPParser_Lexer); $parser->parse($code); $parser->parse($code2); 2012-04-25 20:04:46 +02:00			`* @param string $code The source code to lex`
Add some more unit tests 2011-07-13 23:07:05 +02:00			`*`
			`* @throws PHPParser_Error on lexing errors (unterminated comment or unexpected character)`
Add doccomments and slightly change some APIs 2011-05-31 16:33:11 +02:00			`*/`
Use inject-once approach for lexer Now the lexer is injected only once when creating the parser. Instead of $parser = new PHPParser_Parser; $parser->parse(new PHPParser_Lexer($code)); $parser->parse(new PHPParser_Lexer($code2)); you write: $parser = new PHPParser_Parser(new PHPParser_Lexer); $parser->parse($code); $parser->parse($code2); 2012-04-25 20:04:46 +02:00			`public function startLexing($code) {`
Factor out error handling out of Lexer construcor Makes the constructor more concise and puts the strange error handling stuff in separate methods 2012-02-21 17:00:49 +01:00			`$this->resetErrors();`
Throw ParseErrorException on error instead of error callback As long as the parser isn't reentrant having an error callback doesn't really make sense and only complicates everything. 2011-06-03 17:44:23 +02:00			`$this->tokens = @token_get_all($code);`
Factor out error handling out of Lexer construcor Makes the constructor more concise and puts the strange error handling stuff in separate methods 2012-02-21 17:00:49 +01:00			`$this->handleErrors();`

			`$this->code = $code; // keep the code around for __halt_compiler() handling`
			`$this->pos = -1;`
			`$this->line = 1;`
			`}`

			`protected function resetErrors() {`
			`// clear error_get_last() by forcing an undefined variable error`
			`@$undefinedVariable;`
			`}`
Throw ParseErrorException on error instead of error callback As long as the parser isn't reentrant having an error callback doesn't really make sense and only complicates everything. 2011-06-03 17:44:23 +02:00
Factor out error handling out of Lexer construcor Makes the constructor more concise and puts the strange error handling stuff in separate methods 2012-02-21 17:00:49 +01:00			`protected function handleErrors() {`
Throw ParseErrorException on error instead of error callback As long as the parser isn't reentrant having an error callback doesn't really make sense and only complicates everything. 2011-06-03 17:44:23 +02:00			`$error = error_get_last();`

			`if (preg_match(`
Factor out error handling out of Lexer construcor Makes the constructor more concise and puts the strange error handling stuff in separate methods 2012-02-21 17:00:49 +01:00			`'~^Unterminated comment starting line ([0-9]+)$~',`
			`$error['message'], $matches`
			`)) {`
Start adding Unit test (PHPUnit) 2011-07-13 12:24:10 +02:00			`throw new PHPParser_Error('Unterminated comment', $matches[1]);`
Throw ParseErrorException on error instead of error callback As long as the parser isn't reentrant having an error callback doesn't really make sense and only complicates everything. 2011-06-03 17:44:23 +02:00			`}`

			`if (preg_match(`
Factor out error handling out of Lexer construcor Makes the constructor more concise and puts the strange error handling stuff in separate methods 2012-02-21 17:00:49 +01:00			`'~^Unexpected character in input: \'(.)\' \(ASCII=([0-9]+)\)~s',`
			`$error['message'], $matches`
			`)) {`
Start adding Unit test (PHPUnit) 2011-07-13 12:24:10 +02:00			`throw new PHPParser_Error(sprintf(`
			`'Unexpected character "%s" (ASCII %d)',`
			`$matches[1], $matches[2]`
			`));`
			`}`

			`// PHP cuts error message after null byte, so need special case`
			`if (preg_match('~^Unexpected character in input: \'$~', $error['message'])) {`
			`throw new PHPParser_Error('Unexpected null byte');`
Throw ParseErrorException on error instead of error callback As long as the parser isn't reentrant having an error callback doesn't really make sense and only complicates everything. 2011-06-03 17:44:23 +02:00			`}`
Initial commit 2011-04-18 19:02:30 +02:00			`}`

Add doccomments and slightly change some APIs 2011-05-31 16:33:11 +02:00			`/**`
			`* Returns the next token id.`
			`*`
Associate last encountered doccomment with next node 2011-07-03 16:35:45 +02:00			`* @param mixed $value Variable to store token content in`
			`* @param mixed $line Variable to store line in`
			`* @param mixed $docComment Variable to store doc comment in`
Add doccomments and slightly change some APIs 2011-05-31 16:33:11 +02:00			`*`
			`* @return int Token id`
			`*/`
Use inject-once approach for lexer Now the lexer is injected only once when creating the parser. Instead of $parser = new PHPParser_Parser; $parser->parse(new PHPParser_Lexer($code)); $parser->parse(new PHPParser_Lexer($code2)); you write: $parser = new PHPParser_Parser(new PHPParser_Lexer); $parser->parse($code); $parser->parse($code2); 2012-04-25 20:04:46 +02:00			`public function getNextToken(&$value = null, &$line = null, &$docComment = null) {`
Associate last encountered doccomment with next node 2011-07-03 16:35:45 +02:00			`$docComment = null;`

Initial commit 2011-04-18 19:02:30 +02:00			`while (isset($this->tokens[++$this->pos])) {`
			`$token = $this->tokens[$this->pos];`
Make halt_compiler() work 2011-06-03 22:02:02 +02:00
Initial commit 2011-04-18 19:02:30 +02:00			`if (is_string($token)) {`
Circumvent token_get_all bug with b"$var" 2011-10-19 18:09:13 +02:00			`$line = $this->line;`

			`// bug in token_get_all`
			`if ('b"' === $token) {`
			`$value = 'b"';`
			`return ord('"');`
			`} else {`
			`$value = $token;`
			`return ord($token);`
			`}`
Fix incorrect line number extraction 2011-07-13 13:27:14 +02:00			`} else {`
			`$this->line += substr_count($token[1], "\n");`

			`if (T_DOC_COMMENT === $token[0]) {`
			`$docComment = $token[1];`
Use inject-once approach for lexer Now the lexer is injected only once when creating the parser. Instead of $parser = new PHPParser_Parser; $parser->parse(new PHPParser_Lexer($code)); $parser->parse(new PHPParser_Lexer($code2)); you write: $parser = new PHPParser_Parser(new PHPParser_Lexer); $parser->parse($code); $parser->parse($code2); 2012-04-25 20:04:46 +02:00			`} elseif (!isset($this->dropTokens[$token[0]])) {`
Fix incorrect line number extraction 2011-07-13 13:27:14 +02:00			`$value = $token[1];`
			`$line = $token[2];`
Use inject-once approach for lexer Now the lexer is injected only once when creating the parser. Instead of $parser = new PHPParser_Parser; $parser->parse(new PHPParser_Lexer($code)); $parser->parse(new PHPParser_Lexer($code2)); you write: $parser = new PHPParser_Parser(new PHPParser_Lexer); $parser->parse($code); $parser->parse($code2); 2012-04-25 20:04:46 +02:00			`return $this->tokenMap[$token[0]];`
Fix incorrect line number extraction 2011-07-13 13:27:14 +02:00			`}`
Initial commit 2011-04-18 19:02:30 +02:00			`}`
			`}`

Start refactoring parser skeleton The yacc parser skeleton with all those odd $yy short names is quite non-obvious. This commits starts to refactor it a bit, to use more obvious names and logic. 2012-04-29 22:57:46 +02:00			`// 0 is the EOF token`
Initial commit 2011-04-18 19:02:30 +02:00			`return 0;`
			`}`

Make halt_compiler() work 2011-06-03 22:02:02 +02:00			`/**`
			`* Handles __halt_compiler() by returning the text after it.`
			`*`
			`* @return string Remaining text`
			`*/`
			`public function handleHaltCompiler() {`
			`// get the length of the text before the T_HALT_COMPILER token`
			`$textBefore = '';`
			`for ($i = 0; $i <= $this->pos; ++$i) {`
			`if (is_string($this->tokens[$i])) {`
			`$textBefore .= $this->tokens[$i];`
			`} else {`
			`$textBefore .= $this->tokens[$i][1];`
			`}`
			`}`

			`// text after T_HALT_COMPILER, still including ();`
			`$textAfter = substr($this->code, strlen($textBefore));`

			`// ensure that it is followed by ();`
			`// this simplifies the situation, by not allowing any comments`
			`// in between of the tokens.`
Fix __halt_compiler with newline after closing tag The newline after the closing tag is now not returned as part of the remaining text (PHP eats one newline after ?>). 2011-11-27 11:21:06 +01:00			`if (!preg_match('~\s\(\s\)\s*(?:;\|\?>\r?\n?)~', $textAfter, $matches)) {`
Rename PHPParser_ParseErrorException to PHPParser_Error 2011-06-05 18:52:41 +02:00			`throw new PHPParser_Error('__halt_compiler must be followed by "();"');`
Make halt_compiler() work 2011-06-03 22:02:02 +02:00			`}`

			`// prevent the lexer from returning any further tokens`
			`$this->pos = count($this->tokens);`

			`// return with (); removed`
Fix __halt_compiler() usage in namespace This fixes the only left bug that was reported by parsing the PHP testsuite :) 2011-12-07 18:36:38 +01:00			`return (string) substr($textAfter, strlen($matches[0])); // (string) converts false to ''`
Make halt_compiler() work 2011-06-03 22:02:02 +02:00			`}`

Add doccomments and slightly change some APIs 2011-05-31 16:33:11 +02:00			`/**`
Use inject-once approach for lexer Now the lexer is injected only once when creating the parser. Instead of $parser = new PHPParser_Parser; $parser->parse(new PHPParser_Lexer($code)); $parser->parse(new PHPParser_Lexer($code2)); you write: $parser = new PHPParser_Parser(new PHPParser_Lexer); $parser->parse($code); $parser->parse($code2); 2012-04-25 20:04:46 +02:00			`* Creates the token map.`
Add doccomments and slightly change some APIs 2011-05-31 16:33:11 +02:00			`*`
			`* The token map maps the PHP internal token identifiers`
			`* to the identifiers used by the Parser. Additionally it`
			`* maps T_OPEN_TAG_WITH_ECHO to T_ECHO and T_CLOSE_TAG to ';'.`
Use inject-once approach for lexer Now the lexer is injected only once when creating the parser. Instead of $parser = new PHPParser_Parser; $parser->parse(new PHPParser_Lexer($code)); $parser->parse(new PHPParser_Lexer($code2)); you write: $parser = new PHPParser_Parser(new PHPParser_Lexer); $parser->parse($code); $parser->parse($code2); 2012-04-25 20:04:46 +02:00			`*`
			`* @return array The token map`
Add doccomments and slightly change some APIs 2011-05-31 16:33:11 +02:00			`*/`
Use inject-once approach for lexer Now the lexer is injected only once when creating the parser. Instead of $parser = new PHPParser_Parser; $parser->parse(new PHPParser_Lexer($code)); $parser->parse(new PHPParser_Lexer($code2)); you write: $parser = new PHPParser_Parser(new PHPParser_Lexer); $parser->parse($code); $parser->parse($code2); 2012-04-25 20:04:46 +02:00			`protected function createTokenMap() {`
			`$tokenMap = array();`

			`// 256 is the minimum possible token number, as everything below`
			`// it is an ASCII value`
			`for ($i = 256; $i < 1000; ++$i) {`
			`// T_DOUBLE_COLON is equivalent to T_PAAMAYIM_NEKUDOTAYIM`
			`if (T_DOUBLE_COLON === $i) {`
			`$tokenMap[$i] = PHPParser_Parser::T_PAAMAYIM_NEKUDOTAYIM;`
			`// T_OPEN_TAG_WITH_ECHO with dropped T_OPEN_TAG results in T_ECHO`
			`} elseif(T_OPEN_TAG_WITH_ECHO === $i) {`
			`$tokenMap[$i] = PHPParser_Parser::T_ECHO;`
			`// T_CLOSE_TAG is equivalent to ';'`
			`} elseif(T_CLOSE_TAG === $i) {`
			`$tokenMap[$i] = ord(';');`
			`// and the others can be mapped directly`
			`} elseif ('UNKNOWN' !== ($name = token_name($i))`
			`&& defined($name = 'PHPParser_Parser::' . $name)`
			`) {`
			`$tokenMap[$i] = constant($name);`
Initial commit 2011-04-18 19:02:30 +02:00			`}`
			`}`
Use inject-once approach for lexer Now the lexer is injected only once when creating the parser. Instead of $parser = new PHPParser_Parser; $parser->parse(new PHPParser_Lexer($code)); $parser->parse(new PHPParser_Lexer($code2)); you write: $parser = new PHPParser_Parser(new PHPParser_Lexer); $parser->parse($code); $parser->parse($code2); 2012-04-25 20:04:46 +02:00
			`return $tokenMap;`
Initial commit 2011-04-18 19:02:30 +02:00			`}`
			`}`