php-parser/lib/PHPParser/Lexer.php

<?php

/**
 * Thin wrapper around token_get_all() that feeds tokens to the parser.
 *
 * Whitespace, ordinary comments and open tags are skipped, doc comments are
 * handed out alongside the next returned token, and PHP's token ids are
 * translated to the ids defined on PHPParser_Parser.
 */
class PHPParser_Lexer
{
    /** @var string Source code as passed to the constructor */
    protected $code;
    /** @var array Token list produced by token_get_all() */
    protected $tokens;
    /** @var int Index of the token consumed last (-1 before the first lex() call) */
    protected $pos;
    /** @var int Starts at 1 and grows by the number of newlines in every array token consumed */
    protected $line;

    /** @var array|null PHP token id => parser token id; filled lazily by initTokenMap() */
    protected static $tokenMap;
    /** @var array Token ids (used as keys) that lex() silently skips */
    protected static $dropTokens = array(
        T_WHITESPACE => 1, T_COMMENT => 1, T_OPEN_TAG => 1
    );

    /**
     * Creates a Lexer.
     *
     * The code is tokenized immediately; warnings emitted by token_get_all()
     * are picked up through error_get_last() and converted into exceptions.
     *
     * @param string $code
     *
     * @throws PHPParser_Error on lexing errors (unterminated comment or unexpected character)
     */
    public function __construct($code) {
        self::initTokenMap();

        // error_get_last() must not report a stale error raised before lexing.
        if (function_exists('error_clear_last')) {
            error_clear_last();
        } else {
            // Fallback for runtimes without error_clear_last(): overwrite the
            // last error with a harmless undefined variable notice.
            @$errorGetLastResetUndefinedVariable;
        }

        $this->code   = $code;
        $this->tokens = @token_get_all($code);
        $this->pos    = -1;
        $this->line   =  1;

        $error = error_get_last();
        if (null === $error) {
            // Nothing was reported while tokenizing.
            return;
        }

        $message = $error['message'];

        if (preg_match(
                '~^Unterminated comment starting line ([0-9]+)$~',
                $message,
                $matches
            )
        ) {
            throw new PHPParser_Error('Unterminated comment', $matches[1]);
        }

        if (preg_match(
                '~^Unexpected character in input:  \'(.)\' \(ASCII=([0-9]+)\)~s',
                $message,
                $matches
            )
        ) {
            throw new PHPParser_Error(sprintf(
                'Unexpected character "%s" (ASCII %d)',
                $matches[1], $matches[2]
            ));
        }

        // PHP cuts error message after null byte, so need special case
        if (preg_match('~^Unexpected character in input:  \'$~', $message)) {
            throw new PHPParser_Error('Unexpected null byte');
        }
    }

    /**
     * Returns the next token id.
     *
     * Single-character tokens are returned as their ASCII value and report
     * the internally tracked line; all other tokens are returned through the
     * token map and report the line stored in the token itself. Tokens listed
     * in $dropTokens are skipped. A doc comment met while skipping is stored
     * in $docComment (the last one wins); it is reset to null on every call.
     *
     * @param mixed $value      Variable to store token content in
     * @param mixed $line       Variable to store line in
     * @param mixed $docComment Variable to store doc comment in
     *
     * @return int Token id (0 once all tokens are consumed)
     */
    public function lex(&$value = null, &$line = null, &$docComment = null) {
        $docComment = null;

        while (isset($this->tokens[++$this->pos])) {
            $token = $this->tokens[$this->pos];

            if (is_string($token)) {
                $value = $token;
                $line  = $this->line;
                return ord($token);
            } else {
                // keep the line counter in sync for the string tokens, which
                // carry no line information of their own
                $this->line += substr_count($token[1], "\n");

                if (T_DOC_COMMENT === $token[0]) {
                    $docComment = $token[1];
                } elseif (!isset(self::$dropTokens[$token[0]])) {
                    $value = $token[1];
                    $line  = $token[2];
                    return self::$tokenMap[$token[0]];
                }
            }
        }

        return 0;
    }

    /**
     * Handles __halt_compiler() by returning the text after it.
     *
     * Must be called right after lex() has returned the T_HALT_COMPILER
     * token. Afterwards lex() only returns 0.
     *
     * @return string Remaining text (everything after the terminating ";")
     *
     * @throws PHPParser_Error if __halt_compiler is not directly followed by "();"
     */
    public function handleHaltCompiler() {
        // byte length of all tokens up to and including T_HALT_COMPILER
        $offset = 0;
        for ($i = 0; $i <= $this->pos; ++$i) {
            $token = $this->tokens[$i];
            $offset += strlen(is_string($token) ? $token : $token[1]);
        }

        // text after T_HALT_COMPILER, still including ();
        $textAfter = substr($this->code, $offset);

        // ensure that it is followed by ();
        // this simplifies the situation, by not allowing any comments
        // in between of the tokens.
        // The pattern is anchored: without "^" a "();" occurring anywhere
        // later in the text would be accepted and the wrong number of bytes
        // would be cut off the start of the remaining text.
        if (!preg_match('~^\s*\(\s*\)\s*;~', $textAfter, $matches)) {
            throw new PHPParser_Error('__halt_compiler must be followed by "();"');
        }

        // prevent the lexer from returning any further tokens
        $this->pos = count($this->tokens);

        // return with (); removed; the cast keeps the return type a string
        // on PHP versions where substr() yields false for an empty remainder
        return (string) substr($textAfter, strlen($matches[0]));
    }

    /**
     * Initializes the token map.
     *
     * The token map maps the PHP internal token identifiers
     * to the identifiers used by the Parser. Additionally it
     * maps T_OPEN_TAG_WITH_ECHO to T_ECHO and T_CLOSE_TAG to ';'.
     *
     * The map is static and only built on the first call.
     */
    protected static function initTokenMap() {
        if (!self::$tokenMap) {
            self::$tokenMap = array();

            // 256 is the minimum possible token number, as everything below
            // it is an ASCII value
            for ($i = 256; $i < 1000; ++$i) {
                // T_DOUBLE_COLON is equivalent to T_PAAMAYIM_NEKUDOTAYIM
                if (T_DOUBLE_COLON === $i) {
                    self::$tokenMap[$i] = PHPParser_Parser::T_PAAMAYIM_NEKUDOTAYIM;
                // T_OPEN_TAG_WITH_ECHO with dropped T_OPEN_TAG results in T_ECHO
                } elseif (T_OPEN_TAG_WITH_ECHO === $i) {
                    self::$tokenMap[$i] = PHPParser_Parser::T_ECHO;
                // T_CLOSE_TAG is equivalent to ';'
                } elseif (T_CLOSE_TAG === $i) {
                    self::$tokenMap[$i] = ord(';');
                // and the others can be mapped directly, provided the parser
                // defines a constant of the same name
                } elseif ('UNKNOWN' !== ($name = token_name($i))
                          && defined($name = 'PHPParser_Parser::' . $name)
                ) {
                    self::$tokenMap[$i] = constant($name);
                }
            }
        }
    }
}
Initial commit 2011-04-18 19:02:30 +02:00			`<?php`

Prefix all classes with PHPParser_ 2011-06-05 18:40:04 +02:00			`class PHPParser_Lexer`
Initial commit 2011-04-18 19:02:30 +02:00			`{`
Make halt_compiler() work 2011-06-03 22:02:02 +02:00			`protected $code;`
Initial commit 2011-04-18 19:02:30 +02:00			`protected $tokens;`
			`protected $pos;`
Associate some line information with nodes (currently the line the node ends in, as the starting line is harder to fetch) 2011-06-12 17:12:47 +02:00			`protected $line;`
Initial commit 2011-04-18 19:02:30 +02:00
Start adding Unit test (PHPUnit) 2011-07-13 12:24:10 +02:00			`protected static $tokenMap;`
			`protected static $dropTokens = array(`
Associate last encountered doccomment with next node 2011-07-03 16:35:45 +02:00			`T_WHITESPACE => 1, T_COMMENT => 1, T_OPEN_TAG => 1`
Initial commit 2011-04-18 19:02:30 +02:00			`);`

Add doccomments and slightly change some APIs 2011-05-31 16:33:11 +02:00			`/**`
			`* Creates a Lexer.`
			`*`
			`* @param string $code`
Add some more unit tests 2011-07-13 23:07:05 +02:00			`*`
			`* @throws PHPParser_Error on lexing errors (unterminated comment or unexpected character)`
Add doccomments and slightly change some APIs 2011-05-31 16:33:11 +02:00			`*/`
Initial commit 2011-04-18 19:02:30 +02:00			`public function __construct($code) {`
			`self::initTokenMap();`

Throw ParseErrorException on error instead of error callback As long as the parser isn't reentrant having an error callback doesn't really make sense and only complicates everything. 2011-06-03 17:44:23 +02:00			`// Reset the error message in error_get_last()`
			`// Still hoping for a better solution to be found.`
			`@$errorGetLastResetUndefinedVariable;`

Make halt_compiler() work 2011-06-03 22:02:02 +02:00			`$this->code = $code;`
Throw ParseErrorException on error instead of error callback As long as the parser isn't reentrant having an error callback doesn't really make sense and only complicates everything. 2011-06-03 17:44:23 +02:00			`$this->tokens = @token_get_all($code);`
Initial commit 2011-04-18 19:02:30 +02:00			`$this->pos = -1;`
Fix incorrect line number extraction 2011-07-13 13:27:14 +02:00			`$this->line = 1;`
Throw ParseErrorException on error instead of error callback As long as the parser isn't reentrant having an error callback doesn't really make sense and only complicates everything. 2011-06-03 17:44:23 +02:00
			`$error = error_get_last();`

			`if (preg_match(`
Start adding Unit test (PHPUnit) 2011-07-13 12:24:10 +02:00			`'~^Unterminated comment starting line ([0-9]+)$~',`
Throw ParseErrorException on error instead of error callback As long as the parser isn't reentrant having an error callback doesn't really make sense and only complicates everything. 2011-06-03 17:44:23 +02:00			`$error['message'],`
			`$matches`
			`)`
			`) {`
Start adding Unit test (PHPUnit) 2011-07-13 12:24:10 +02:00			`throw new PHPParser_Error('Unterminated comment', $matches[1]);`
Throw ParseErrorException on error instead of error callback As long as the parser isn't reentrant having an error callback doesn't really make sense and only complicates everything. 2011-06-03 17:44:23 +02:00			`}`

			`if (preg_match(`
Start adding Unit test (PHPUnit) 2011-07-13 12:24:10 +02:00			`'~^Unexpected character in input: \'(.)\' \(ASCII=([0-9]+)\)~s',`
Throw ParseErrorException on error instead of error callback As long as the parser isn't reentrant having an error callback doesn't really make sense and only complicates everything. 2011-06-03 17:44:23 +02:00			`$error['message'],`
			`$matches`
			`)`
			`) {`
Start adding Unit test (PHPUnit) 2011-07-13 12:24:10 +02:00			`throw new PHPParser_Error(sprintf(`
			`'Unexpected character "%s" (ASCII %d)',`
			`$matches[1], $matches[2]`
			`));`
			`}`

			`// PHP cuts error message after null byte, so need special case`
			`if (preg_match('~^Unexpected character in input: \'$~', $error['message'])) {`
			`throw new PHPParser_Error('Unexpected null byte');`
Throw ParseErrorException on error instead of error callback As long as the parser isn't reentrant having an error callback doesn't really make sense and only complicates everything. 2011-06-03 17:44:23 +02:00			`}`
Initial commit 2011-04-18 19:02:30 +02:00			`}`

Add doccomments and slightly change some APIs 2011-05-31 16:33:11 +02:00			`/**`
			`* Returns the next token id.`
			`*`
Associate last encountered doccomment with next node 2011-07-03 16:35:45 +02:00			`* @param mixed $value Variable to store token content in`
			`* @param mixed $line Variable to store line in`
			`* @param mixed $docComment Variable to store doc comment in`
Add doccomments and slightly change some APIs 2011-05-31 16:33:11 +02:00			`*`
			`* @return int Token id`
			`*/`
Start adding Unit test (PHPUnit) 2011-07-13 12:24:10 +02:00			`public function lex(&$value = null, &$line = null, &$docComment = null) {`
Associate last encountered doccomment with next node 2011-07-03 16:35:45 +02:00			`$docComment = null;`

Initial commit 2011-04-18 19:02:30 +02:00			`while (isset($this->tokens[++$this->pos])) {`
			`$token = $this->tokens[$this->pos];`
Make halt_compiler() work 2011-06-03 22:02:02 +02:00
Initial commit 2011-04-18 19:02:30 +02:00			`if (is_string($token)) {`
Associate some line information with nodes (currently the line the node ends in, as the starting line is harder to fetch) 2011-06-12 17:12:47 +02:00			`$value = $token;`
			`$line = $this->line;`
Initial commit 2011-04-18 19:02:30 +02:00			`return ord($token);`
Fix incorrect line number extraction 2011-07-13 13:27:14 +02:00			`} else {`
			`$this->line += substr_count($token[1], "\n");`

			`if (T_DOC_COMMENT === $token[0]) {`
			`$docComment = $token[1];`
			`} elseif (!isset(self::$dropTokens[$token[0]])) {`
			`$value = $token[1];`
			`$line = $token[2];`
			`return self::$tokenMap[$token[0]];`
			`}`
Initial commit 2011-04-18 19:02:30 +02:00			`}`
			`}`

			`return 0;`
			`}`

Make halt_compiler() work 2011-06-03 22:02:02 +02:00			`/**`
			`* Handles __halt_compiler() by returning the text after it.`
			`*`
			`* @return string Remaining text`
			`*/`
			`public function handleHaltCompiler() {`
			`// get the length of the text before the T_HALT_COMPILER token`
			`$textBefore = '';`
			`for ($i = 0; $i <= $this->pos; ++$i) {`
			`if (is_string($this->tokens[$i])) {`
			`$textBefore .= $this->tokens[$i];`
			`} else {`
			`$textBefore .= $this->tokens[$i][1];`
			`}`
			`}`

			`// text after T_HALT_COMPILER, still including ();`
			`$textAfter = substr($this->code, strlen($textBefore));`

			`// ensure that it is followed by ();`
			`// this simplifies the situation, by not allowing any comments`
			`// in between of the tokens.`
			`if (!preg_match('~\s\(\s\)\s*;~', $textAfter, $matches)) {`
Rename PHPParser_ParseErrorException to PHPParser_Error 2011-06-05 18:52:41 +02:00			`throw new PHPParser_Error('__halt_compiler must be followed by "();"');`
Make halt_compiler() work 2011-06-03 22:02:02 +02:00			`}`

			`// prevent the lexer from returning any further tokens`
			`$this->pos = count($this->tokens);`

			`// return with (); removed`
			`return substr($textAfter, strlen($matches[0]));`
			`}`

Add doccomments and slightly change some APIs 2011-05-31 16:33:11 +02:00			`/**`
			`* Initializes the token map.`
			`*`
			`* The token map maps the PHP internal token identifiers`
			`* to the identifiers used by the Parser. Additionally it`
			`* maps T_OPEN_TAG_WITH_ECHO to T_ECHO and T_CLOSE_TAG to ';'.`
			`*/`
Start adding Unit test (PHPUnit) 2011-07-13 12:24:10 +02:00			`protected static function initTokenMap() {`
Initial commit 2011-04-18 19:02:30 +02:00			`if (!self::$tokenMap) {`
			`self::$tokenMap = array();`

			`// 256 is the minimum possible token number, as everything below`
			`// it is an ASCII value`
			`for ($i = 256; $i < 1000; ++$i) {`
			`// T_DOUBLE_COLON is equivalent to T_PAAMAYIM_NEKUDOTAYIM`
			`if (T_DOUBLE_COLON === $i) {`
Prefix all classes with PHPParser_ 2011-06-05 18:40:04 +02:00			`self::$tokenMap[$i] = PHPParser_Parser::T_PAAMAYIM_NEKUDOTAYIM;`
Initial commit 2011-04-18 19:02:30 +02:00			`// T_OPEN_TAG_WITH_ECHO with dropped T_OPEN_TAG results in T_ECHO`
			`} elseif(T_OPEN_TAG_WITH_ECHO === $i) {`
Prefix all classes with PHPParser_ 2011-06-05 18:40:04 +02:00			`self::$tokenMap[$i] = PHPParser_Parser::T_ECHO;`
Initial commit 2011-04-18 19:02:30 +02:00			`// T_CLOSE_TAG is equivalent to ';'`
			`} elseif(T_CLOSE_TAG === $i) {`
			`self::$tokenMap[$i] = ord(';');`
			`// and the others can be mapped directly`
Make the parser run without errors on 5.4 2011-07-04 21:09:20 +02:00			`} elseif ('UNKNOWN' !== ($name = token_name($i))`
			`&& defined($name = 'PHPParser_Parser::' . $name)`
			`) {`
			`self::$tokenMap[$i] = constant($name);`
Initial commit 2011-04-18 19:02:30 +02:00			`}`
			`}`
			`}`
			`}`
			`}`