From b9b45dd2bcce5dacc94dd853732d9a89a28fe606 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Sun, 30 Jun 2019 11:43:48 +0200
Subject: [PATCH] Insert T_BAD_CHARACTER tokens for missing characters

The token stream should cover all characters in the original code,
so insert a dummy token for each illegal character that is missing
from it. We should really be doing this in token_get_all() as well.
---
 lib/PhpParser/Lexer.php                       | 28 ++++++---
 .../parser/errorHandling/lexerErrors.test     | 63 ++++++++++---------
 2 files changed, 52 insertions(+), 39 deletions(-)

diff --git a/lib/PhpParser/Lexer.php b/lib/PhpParser/Lexer.php
index dc5e533..51ffd1e 100644
--- a/lib/PhpParser/Lexer.php
+++ b/lib/PhpParser/Lexer.php
@@ -6,6 +6,11 @@ use PhpParser\Parser\Tokens;
 
 class Lexer
 {
+    /* Token ID used for illegal characters that are part of the token stream. These are dropped by
+     * token_get_all(), but we restore them here to make sure that the tokens cover the full original text,
+     * and to prevent file positions from going out of sync. */
+    const T_BAD_CHARACTER = -1;
+
     protected $code;
     protected $tokens;
     protected $pos;
@@ -40,7 +45,7 @@ class Lexer
         // map of tokens to drop while lexing (the map is only used for isset lookup,
         // that's why the value is simply set to 1; the value is never actually used.)
         $this->dropTokens = array_fill_keys(
-            [\T_WHITESPACE, \T_OPEN_TAG, \T_COMMENT, \T_DOC_COMMENT], 1
+            [\T_WHITESPACE, \T_OPEN_TAG, \T_COMMENT, \T_DOC_COMMENT, self::T_BAD_CHARACTER], 1
         );
 
         $defaultAttributes = ['comments', 'startLine', 'endLine'];
@@ -92,13 +97,9 @@ class Lexer
     }
 
     private function handleInvalidCharacterRange($start, $end, $line, ErrorHandler $errorHandler) {
+        $tokens = [];
         for ($i = $start; $i < $end; $i++) {
             $chr = $this->code[$i];
-            if ($chr === 'b' || $chr === 'B') {
-                // HHVM does not treat b" tokens correctly, so ignore these
-                continue;
-            }
-
             if ($chr === "\0") {
                 // PHP cuts error message after null byte, so need special case
                 $errorMsg = 'Unexpected null byte';
@@ -108,6 +109,7 @@ class Lexer
                 );
             }
 
+            $tokens[] = [self::T_BAD_CHARACTER, $chr, $line];
             $errorHandler->handleError(new Error($errorMsg, [
                 'startLine' => $line,
                 'endLine' => $line,
@@ -115,6 +117,7 @@ class Lexer
                 'endFilePos' => $i,
             ]));
         }
+        return $tokens;
     }
 
     /**
@@ -155,16 +158,22 @@ class Lexer
 
         $filePos = 0;
         $line = 1;
-        foreach ($this->tokens as $token) {
+        $numTokens = \count($this->tokens);
+        for ($i = 0; $i < $numTokens; $i++) {
+            $token = $this->tokens[$i];
             $tokenValue = \is_string($token) ? $token : $token[1];
             $tokenLen = \strlen($tokenValue);
 
             if (substr($this->code, $filePos, $tokenLen) !== $tokenValue) {
                 // Something is missing, must be an invalid character
                 $nextFilePos = strpos($this->code, $tokenValue, $filePos);
-                $this->handleInvalidCharacterRange(
+                $badCharTokens = $this->handleInvalidCharacterRange(
                     $filePos, $nextFilePos, $line, $errorHandler);
                 $filePos = (int) $nextFilePos;
+
+                array_splice($this->tokens, $i, 0, $badCharTokens);
+                $numTokens += \count($badCharTokens);
+                $i += \count($badCharTokens);
             }
 
             $filePos += $tokenLen;
@@ -187,8 +196,9 @@ class Lexer
                 $this->tokens[] = [$isDocComment ? \T_DOC_COMMENT : \T_COMMENT, $comment, $line];
             } else {
                 // Invalid characters at the end of the input
-                $this->handleInvalidCharacterRange(
+                $badCharTokens = $this->handleInvalidCharacterRange(
                     $filePos, \strlen($this->code), $line, $errorHandler);
+                $this->tokens = array_merge($this->tokens, $badCharTokens);
             }
             return;
         }
diff --git a/test/code/parser/errorHandling/lexerErrors.test b/test/code/parser/errorHandling/lexerErrors.test
index 163b743..7ff27eb 100644
--- a/test/code/parser/errorHandling/lexerErrors.test
+++ b/test/code/parser/errorHandling/lexerErrors.test
@@ -32,24 +32,25 @@ $a = 42;
 @@{ "\1" }@@
 $b = 24;
 -----
+!!positions
 Unexpected character "" (ASCII 1) from 4:1 to 4:1
 array(
-    0: Stmt_Expression(
-        expr: Expr_Assign(
-            var: Expr_Variable(
+    0: Stmt_Expression[3:1 - 3:8](
+        expr: Expr_Assign[3:1 - 3:7](
+            var: Expr_Variable[3:1 - 3:2](
                 name: a
             )
-            expr: Scalar_LNumber(
+            expr: Scalar_LNumber[3:6 - 3:7](
                 value: 42
             )
         )
     )
-    1: Stmt_Expression(
-        expr: Expr_Assign(
-            var: Expr_Variable(
+    1: Stmt_Expression[5:1 - 5:8](
+        expr: Expr_Assign[5:1 - 5:7](
+            var: Expr_Variable[5:1 - 5:2](
                 name: b
             )
-            expr: Scalar_LNumber(
+            expr: Scalar_LNumber[5:6 - 5:7](
                 value: 24
             )
         )
@@ -62,24 +63,25 @@ $a = 42;
 @@{ "\0" }@@
 $b = 24;
 -----
+!!positions
 Unexpected null byte from 4:1 to 4:1
 array(
-    0: Stmt_Expression(
-        expr: Expr_Assign(
-            var: Expr_Variable(
+    0: Stmt_Expression[3:1 - 3:8](
+        expr: Expr_Assign[3:1 - 3:7](
+            var: Expr_Variable[3:1 - 3:2](
                 name: a
             )
-            expr: Scalar_LNumber(
+            expr: Scalar_LNumber[3:6 - 3:7](
                 value: 42
             )
         )
     )
-    1: Stmt_Expression(
-        expr: Expr_Assign(
-            var: Expr_Variable(
+    1: Stmt_Expression[5:1 - 5:8](
+        expr: Expr_Assign[5:1 - 5:7](
+            var: Expr_Variable[5:1 - 5:2](
                 name: b
             )
-            expr: Scalar_LNumber(
+            expr: Scalar_LNumber[5:6 - 5:7](
                 value: 24
             )
         )
@@ -94,35 +96,36 @@ $b = 2;
 @@{ "\2" }@@
 $c = 3;
 -----
-Unexpected character "@@{ "\1" }@@" (ASCII 1) from 4:1 to 4:1
-Unexpected character "@@{ "\2" }@@" (ASCII 2) from 6:1 to 6:1
+!!positions
+Unexpected character "" (ASCII 1) from 4:1 to 4:1
+Unexpected character "" (ASCII 2) from 6:1 to 6:1
 array(
-    0: Stmt_Expression(
-        expr: Expr_Assign(
-            var: Expr_Variable(
+    0: Stmt_Expression[3:1 - 3:7](
+        expr: Expr_Assign[3:1 - 3:6](
+            var: Expr_Variable[3:1 - 3:2](
                 name: a
             )
-            expr: Scalar_LNumber(
+            expr: Scalar_LNumber[3:6 - 3:6](
                 value: 1
             )
         )
     )
-    1: Stmt_Expression(
-        expr: Expr_Assign(
-            var: Expr_Variable(
+    1: Stmt_Expression[5:1 - 5:7](
+        expr: Expr_Assign[5:1 - 5:6](
+            var: Expr_Variable[5:1 - 5:2](
                 name: b
             )
-            expr: Scalar_LNumber(
+            expr: Scalar_LNumber[5:6 - 5:6](
                 value: 2
             )
         )
     )
-    2: Stmt_Expression(
-        expr: Expr_Assign(
-            var: Expr_Variable(
+    2: Stmt_Expression[7:1 - 7:7](
+        expr: Expr_Assign[7:1 - 7:6](
+            var: Expr_Variable[7:1 - 7:2](
                 name: c
             )
-            expr: Scalar_LNumber(
+            expr: Scalar_LNumber[7:6 - 7:6](
                 value: 3
             )
         )