fix: better heredoc/nowdoc implementation (#183)

2024-11-26 20:04:57 +01:00 · 2022-12-09 01:44:20 +00:00 · 2022-12-09 01:44:20 +00:00 · 825577d18d
commit 825577d18d
parent bd2023c3c4
12 changed files with 435 additions and 193 deletions
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@ -68,7 +68,7 @@ impl Lexer {
                // The shell exec state is entered when inside of a execution string (`).
                StackFrame::ShellExec => self.shell_exec(&mut state, &mut tokens)?,
                // The doc string state is entered when tokenizing heredocs and nowdocs.
-                StackFrame::DocString(kind, label) => {
+                StackFrame::DocString(kind, label, ..) => {
                    let kind = *kind;
                    let label = label.clone();

@ -471,7 +471,12 @@ impl Lexer {
                }

                state.source.next();
-                state.replace(StackFrame::DocString(doc_string_kind, label.clone()));
+                state.replace(StackFrame::DocString(
+                    doc_string_kind,
+                    label.clone(),
+                    DocStringIndentationKind::None,
+                    0,
+                ));

                TokenKind::StartDocString(label, doc_string_kind)
            }
@ -877,52 +882,128 @@ impl Lexer {
        kind: DocStringKind,
        label: ByteString,
    ) -> SyntaxResult<()> {
-        let span = state.source.span();
-        let mut buffer = Vec::new();
-        let mut new_line = false;
-
-        let mut indentation_amount: usize = 0;
-
-        // 1. Check if there's any whitespace here. It can either be a space or tab character.
-        let indentation_type = match state.source.read(1) {
-            [b' '] => Some(DocStringIndentationKind::Space),
-            [b'\t'] => Some(DocStringIndentationKind::Tab),
-            _ => None,
+        match kind {
+            DocStringKind::Heredoc => self.heredoc(state, tokens, label)?,
+            DocStringKind::Nowdoc => self.nowdoc(state, tokens, label)?,
        };

-        // 2. Count how much whitespace there is on this line.
-        if let Some(indentation_type) = indentation_type {
-            loop {
-                match (indentation_type, state.source.read(1)) {
-                    (DocStringIndentationKind::Space, [b' ']) => {
-                        indentation_amount += 1;
-                        state.source.next();
-                        buffer.push(b' ');
-                    }
-                    (DocStringIndentationKind::Tab, [b'\t']) => {
-                        indentation_amount += 1;
-                        state.source.next();
-                        buffer.push(b'\t');
-                    }
-                    _ => break,
-                };
-            }
-        }
+        Ok(())
+    }
+
+    fn heredoc(
+        &self,
+        state: &mut State,
+        tokens: &mut Vec<Token>,
+        label: ByteString,
+    ) -> SyntaxResult<()> {
+        let span = state.source.span();
+        let mut buffer: Vec<u8> = Vec::new();

        let kind = loop {
-            match state.source.read(2) {
-                [b'$', b'{'] if kind == DocStringKind::Heredoc => {
+            match state.source.read(3) {
+                [b'$', b'{', ..] => {
                    state.source.skip(2);
                    state.enter(StackFrame::LookingForVarname);
                    break TokenKind::DollarLeftBrace;
                }
-                [b'{', b'$'] if kind == DocStringKind::Heredoc => {
+                [b'{', b'$', ..] => {
                    // Intentionally only consume the left brace.
                    state.source.next();
                    state.enter(StackFrame::Scripting);
                    break TokenKind::LeftBrace;
                }
-                [b'$', ident_start!()] if kind == DocStringKind::Heredoc => {
+                &[b'\\', b @ (b'"' | b'\\' | b'$'), ..] => {
+                    state.source.skip(2);
+                    buffer.push(b);
+                }
+                &[b'\\', b'n', ..] => {
+                    state.source.skip(2);
+                    buffer.push(b'\n');
+                }
+                &[b'\\', b'r', ..] => {
+                    state.source.skip(2);
+                    buffer.push(b'\r');
+                }
+                &[b'\\', b't', ..] => {
+                    state.source.skip(2);
+                    buffer.push(b'\t');
+                }
+                &[b'\\', b'v', ..] => {
+                    state.source.skip(2);
+                    buffer.push(b'\x0b');
+                }
+                &[b'\\', b'e', ..] => {
+                    state.source.skip(2);
+                    buffer.push(b'\x1b');
+                }
+                &[b'\\', b'f', ..] => {
+                    state.source.skip(2);
+                    buffer.push(b'\x0c');
+                }
+                &[b'\\', b'x', b @ (b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')] => {
+                    state.source.skip(3);
+
+                    let mut hex = String::from(b as char);
+                    if let Some(b @ (b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')) =
+                        state.source.current()
+                    {
+                        state.source.next();
+                        hex.push(*b as char);
+                    }
+
+                    let b = u8::from_str_radix(&hex, 16).unwrap();
+                    buffer.push(b);
+                }
+                &[b'\\', b'u', b'{'] => {
+                    state.source.skip(3);
+
+                    let mut code_point = String::new();
+                    while let Some(b @ (b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')) =
+                        state.source.current()
+                    {
+                        state.source.next();
+                        code_point.push(*b as char);
+                    }
+
+                    if code_point.is_empty() || state.source.current() != Some(&b'}') {
+                        return Err(SyntaxError::InvalidUnicodeEscape(state.source.span()));
+                    }
+                    state.source.next();
+
+                    let c = if let Ok(c) = u32::from_str_radix(&code_point, 16) {
+                        c
+                    } else {
+                        return Err(SyntaxError::InvalidUnicodeEscape(state.source.span()));
+                    };
+
+                    if let Some(c) = char::from_u32(c) {
+                        let mut tmp = [0; 4];
+                        let bytes = c.encode_utf8(&mut tmp);
+                        buffer.extend(bytes.as_bytes());
+                    } else {
+                        return Err(SyntaxError::InvalidUnicodeEscape(state.source.span()));
+                    }
+                }
+                &[b'\\', b @ b'0'..=b'7', ..] => {
+                    state.source.skip(2);
+
+                    let mut octal = String::from(b as char);
+                    if let Some(b @ b'0'..=b'7') = state.source.current() {
+                        state.source.next();
+                        octal.push(*b as char);
+                    }
+                    if let Some(b @ b'0'..=b'7') = state.source.current() {
+                        state.source.next();
+                        octal.push(*b as char);
+                    }
+
+                    if let Ok(b) = u8::from_str_radix(&octal, 8) {
+                        buffer.push(b);
+                    } else {
+                        return Err(SyntaxError::InvalidOctalEscape(state.source.span()));
+                    }
+                }
+                [b'$', ident_start!(), ..] => {
                    state.source.next();
                    let ident = self.consume_identifier(state);

@ -936,139 +1017,88 @@ impl Lexer {

                    break TokenKind::Variable(ident.into());
                }
-                &[b'\n', ..] => {
-                    new_line = true;
-                    state.source.next();
+                // If we find a new-line, we can start to check if we can see the EndHeredoc token.
+                [b'\n', ..] => {
                    buffer.push(b'\n');
+                    state.source.next();
+
+                    // Check if we can see the closing label right here.
+                    if state.source.at(&label, label.len()) {
+                        state.source.skip(label.len());
+                        state.replace(StackFrame::Scripting);
+                        break TokenKind::EndDocString(label, DocStringIndentationKind::None, 0);
+                    }
+
+                    // Check if there's any whitespace first.
+                    let (whitespace_kind, whitespace_amount) = match state.source.read(1) {
+                        [b' '] => {
+                            let mut amount = 0;
+                            while state.source.read(1) == [b' '] {
+                                amount += 1;
+                                state.source.next();
+                            }
+                            (DocStringIndentationKind::Space, amount)
+                        }
+                        [b'\t'] => {
+                            let mut amount = 0;
+                            while state.source.read(1) == [b'\t'] {
+                                amount += 1;
+                                state.source.next();
+                            }
+                            (DocStringIndentationKind::Tab, amount)
+                        }
+                        _ => (DocStringIndentationKind::None, 0),
+                    };
+
+                    // We've figured out what type of whitespace was being used
+                    // at the start of the line.
+                    // We should now check for any extra whitespace, of any kind.
+                    let mut extra_whitespace_buffer = Vec::new();
+                    while let [b @ b' ' | b @ b'\t'] = state.source.read(1) {
+                        extra_whitespace_buffer.push(b);
+                        state.source.next();
+                    }
+
+                    // We've consumed all leading whitespace on this line now,
+                    // so let's try to read the label again.
+                    if state.source.at(&label, label.len()) {
+                        // We've found the label, finally! We need to do 1 last
+                        // check to make sure there wasn't a mixture of indentation types.
+                        if whitespace_kind != DocStringIndentationKind::None
+                            && !extra_whitespace_buffer.is_empty()
+                        {
+                            return Err(SyntaxError::InvalidDocIndentation(state.source.span()));
+                        }
+
+                        // If we get here, only 1 type of indentation was found. We can move
+                        // the process along by reading over the label and breaking out
+                        // with the EndHeredoc token, storing the kind and amount of whitespace.
+                        state.source.skip(label.len());
+                        state.replace(StackFrame::Scripting);
+                        break TokenKind::EndDocString(label, whitespace_kind, whitespace_amount);
+                    } else {
+                        // We didn't find the label. The buffer still needs to know about
+                        // the whitespace, so let's extend the buffer with the whitespace
+                        // and let the loop run again to handle the rest of the line.
+                        if whitespace_kind != DocStringIndentationKind::None {
+                            let whitespace_char: u8 = whitespace_kind.into();
+                            for _ in 0..whitespace_amount {
+                                buffer.push(whitespace_char);
+                            }
+                        }
+
+                        buffer.extend(extra_whitespace_buffer);
+                    }
                }
                &[b, ..] => {
-                    // If we're not on a new line, just add to the buffer as usual.
-                    if !new_line {
-                        new_line = false;
-                        state.source.next();
-                        buffer.push(b);
-                        continue;
-                    }
-
-                    // If we can see the label here, we can consume it and exit early.
-                    if state.source.at(&label, label.length) {
-                        state.source.skip(label.length);
-                        state.replace(StackFrame::Scripting);
-                        break TokenKind::EndDocString(label, None, 0);
-                    }
-
-                    // We know the label isn't at the start of the line, so we can
-                    // check if the line starts with any whitespace.
-                    let line_starts_with_whitespace =
-                        matches!(state.source.read(1), [b' '] | [b'\t']);
-                    let mut current_indentation_amount = 0;
-
-                    // If the line does start with whitespace, let's figure out what the current
-                    // indentation type is and how much whitespace there is.
-                    if line_starts_with_whitespace {
-                        let current_indentation_type = match state.source.read(1) {
-                            [b' '] => DocStringIndentationKind::Space,
-                            [b'\t'] => DocStringIndentationKind::Tab,
-                            _ => unreachable!(),
-                        };
-
-                        // If there was indentation on a previous line, we need to check
-                        // if the current indentation type is the same or different.
-                        // If it's different, we need to produce an error.
-                        if let Some(indentation_type) = indentation_type {
-                            if indentation_type != current_indentation_type {
-                                return Err(SyntaxError::InvalidDocIndentation(
-                                    state.source.span(),
-                                ));
-                            }
-                        }
-
-                        let mut leading_whitespace_buffer = Vec::new();
-
-                        // If the type of whitespace is the same, we want to know
-                        // how much whitespace is on this line. We only care about
-                        // the smallest amount of whitespace in this case.
-                        loop {
-                            match (current_indentation_type, state.source.read(1)) {
-                                (DocStringIndentationKind::Space, [b' ']) => {
-                                    leading_whitespace_buffer.push(b' ');
-                                    current_indentation_amount += 1;
-                                    state.source.next();
-                                }
-                                (DocStringIndentationKind::Tab, [b'\t']) => {
-                                    leading_whitespace_buffer.push(b'\t');
-                                    current_indentation_amount += 1;
-                                    state.source.next();
-                                }
-                                _ => break,
-                            };
-                        }
-
-                        // If we can read the label at this point, we then need to check if the amount
-                        // of indentation is the same or less than the smallest amount encountered thus far.
-                        if state.source.at(&label, label.length)
-                            && current_indentation_amount > indentation_amount
-                        {
-                            return Err(SyntaxError::InvalidDocBodyIndentationLevel(
-                                current_indentation_amount,
-                                state.source.span(),
-                            ));
-                        }
-
-                        // If we've found less whitespace here, we should update the minimum.
-                        if current_indentation_amount < indentation_amount {
-                            indentation_amount = current_indentation_amount;
-                        }
-
-                        let mut whitespace_buffer = Vec::new();
-
-                        // We should now try to consume anymore whitespace, since the doc body
-                        // can include spaces or tabs. We should also push it to the buffer,
-                        // in case we don't encounter the label. In theory, the only whitespace
-                        // we'll encounter here is the character not found by the checks above.
-                        while let [b @ b' ' | b @ b'\t'] = state.source.read(1) {
-                            whitespace_buffer.push(*b);
-                            state.source.next();
-                        }
-
-                        // Check if we can read the label again now.
-                        if state.source.at(&label, label.length) {
-                            // If there was extra whitespace after indentation, we need
-                            // to error out about mixed indentation types.
-                            if !whitespace_buffer.is_empty() {
-                                return Err(SyntaxError::InvalidDocIndentation(
-                                    state.source.span(),
-                                ));
-                            }
-
-                            // If no extra whitespace was found, we've reached the end of the heredoc
-                            // and can consume the label, sending the indentation amount along to the parser
-                            // to normalize.
-                            state.source.skip(label.length);
-                            state.replace(StackFrame::Scripting);
-
-                            break TokenKind::EndDocString(
-                                label,
-                                indentation_type,
-                                current_indentation_amount,
-                            );
-                        } else {
-                            buffer.extend(leading_whitespace_buffer);
-                            buffer.extend(whitespace_buffer);
-                            continue;
-                        }
-                    } else {
-                        new_line = false;
-                        state.source.next();
-                        buffer.push(b);
-                    }
+                    state.source.next();
+                    buffer.push(b);
                }
                [] => return Err(SyntaxError::UnexpectedEndOfFile(state.source.span())),
            }
        };

-        // Trailing line breaks in the last segment of a heredoc
-        // shouldn't end up in the final string.
+        // Any trailing line breaks should be removed from the final heredoc.
        if buffer.last() == Some(&b'\n') {
            buffer.pop();
        }
@ -1081,6 +1111,116 @@ impl Lexer {
        }

        tokens.push(Token { kind, span });
+
+        Ok(())
+    }
+
+    fn nowdoc(
+        &self,
+        state: &mut State,
+        tokens: &mut Vec<Token>,
+        label: ByteString,
+    ) -> SyntaxResult<()> {
+        let span = state.source.span();
+        let mut buffer: Vec<u8> = Vec::new();
+
+        let kind = loop {
+            match state.source.read(3) {
+                // A new-line may be followed by the closing label, which ends the nowdoc (EndDocString).
+                [b'\n', ..] => {
+                    buffer.push(b'\n');
+                    state.source.next();
+
+                    // Closing label directly at the start of the line: no indentation to report.
+                    if state.source.at(&label, label.len()) {
+                        state.source.skip(label.len());
+                        state.replace(StackFrame::Scripting);
+                        break TokenKind::EndDocString(label, DocStringIndentationKind::None, 0);
+                    }
+
+                    // Otherwise consume the leading run of one whitespace kind (spaces or tabs), counting it.
+                    let (whitespace_kind, whitespace_amount) = match state.source.read(1) {
+                        [b' '] => {
+                            let mut amount = 0;
+                            while state.source.read(1) == [b' '] {
+                                amount += 1;
+                                state.source.next();
+                            }
+                            (DocStringIndentationKind::Space, amount)
+                        }
+                        [b'\t'] => {
+                            let mut amount = 0;
+                            while state.source.read(1) == [b'\t'] {
+                                amount += 1;
+                                state.source.next();
+                            }
+                            (DocStringIndentationKind::Tab, amount)
+                        }
+                        _ => (DocStringIndentationKind::None, 0),
+                    };
+
+                    // The kind and amount of the leading indentation run are now known.
+                    // Collect any further spaces or tabs that follow that run; they are
+                    // consumed from the source here, so they are remembered for later.
+                    let mut extra_whitespace_buffer = Vec::new();
+                    while let [b @ b' ' | b @ b'\t'] = state.source.read(1) {
+                        extra_whitespace_buffer.push(b);
+                        state.source.next();
+                    }
+
+                    // We've consumed all leading whitespace on this line now,
+                    // so let's try to read the label again.
+                    if state.source.at(&label, label.len()) {
+                        // We've found the label. If the leading run of one kind was followed
+                        // by more whitespace, the indentation mixes kinds and is rejected.
+                        if whitespace_kind != DocStringIndentationKind::None
+                            && !extra_whitespace_buffer.is_empty()
+                        {
+                            return Err(SyntaxError::InvalidDocIndentation(state.source.span()));
+                        }
+
+                        // If we get here, only 1 type of indentation was found. We can move
+                        // the process along by reading over the label and breaking out
+                        // with the EndDocString token, storing the kind and amount of whitespace.
+                        state.source.skip(label.len());
+                        state.replace(StackFrame::Scripting);
+                        break TokenKind::EndDocString(label, whitespace_kind, whitespace_amount);
+                    } else {
+                        // We didn't find the label, so the whitespace consumed above is part
+                        // of the body. Put it back into the buffer in its original order and
+                        // let the loop run again to handle the rest of the line.
+                        if whitespace_kind != DocStringIndentationKind::None {
+                            let whitespace_char: u8 = whitespace_kind.into();
+                            for _ in 0..whitespace_amount {
+                                buffer.push(whitespace_char);
+                            }
+                        }
+
+                        buffer.extend(extra_whitespace_buffer);
+                    }
+                }
+                &[b, ..] => {
+                    state.source.next();
+                    buffer.push(b);
+                }
+                [] => return Err(SyntaxError::UnexpectedEndOfFile(state.source.span())),
+            }
+        };
+
+        // A single trailing line break is dropped from the final nowdoc segment.
+        if buffer.last() == Some(&b'\n') {
+            buffer.pop();
+        }
+
+        if !buffer.is_empty() {
+            tokens.push(Token {
+                kind: TokenKind::StringPart(buffer.into()),
+                span,
+            })
+        }
+
+        tokens.push(Token { kind, span });
+
        Ok(())
    }

--- a/src/lexer/state.rs
+++ b/src/lexer/state.rs
@ -5,6 +5,9 @@ use crate::lexer::error::SyntaxError;
 use crate::lexer::error::SyntaxResult;
 use crate::lexer::source::Source;

+use super::token::DocStringIndentationAmount;
+use super::token::DocStringIndentationKind;
+
 #[derive(Debug, PartialEq, Eq, PartialOrd, Clone, Copy)]
 pub enum DocStringKind {
    Heredoc,
@ -18,7 +21,12 @@ pub enum StackFrame {
    Halted,
    DoubleQuote,
    ShellExec,
-    DocString(DocStringKind, ByteString),
+    DocString(
+        DocStringKind,
+        ByteString,
+        DocStringIndentationKind,
+        DocStringIndentationAmount,
+    ),
    LookingForVarname,
    LookingForProperty,
    VarOffset,
--- a/src/lexer/token.rs
+++ b/src/lexer/token.rs
@ -11,10 +11,14 @@ pub enum OpenTagKind {
    Full,
 }

+pub type DocStringIndentationAmount = usize;
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
 pub enum DocStringIndentationKind {
    Space,
    Tab,
+    None,
+    Both,
 }

 impl From<u8> for DocStringIndentationKind {
@ -32,6 +36,7 @@ impl From<DocStringIndentationKind> for u8 {
        match kind {
            DocStringIndentationKind::Space => b' ',
            DocStringIndentationKind::Tab => b'\t',
+            _ => unreachable!(),
        }
    }
 }
@ -43,7 +48,7 @@ pub enum TokenKind {
    Parent,
    Backtick,
    StartDocString(ByteString, DocStringKind),
-    EndDocString(ByteString, Option<DocStringIndentationKind>, usize),
+    EndDocString(ByteString, DocStringIndentationKind, usize),
    From,
    Print,
    Dollar,
--- a/src/parser/expressions.rs
+++ b/src/parser/expressions.rs
@ -1,5 +1,7 @@
 use crate::expect_token;
 use crate::expected_token_err;
+use crate::lexer::error::SyntaxError;
+use crate::lexer::token::DocStringIndentationKind;
 use crate::lexer::token::TokenKind;
 use crate::lexer::DocStringKind;
 use crate::parser::ast;
@ -971,6 +973,7 @@ fn shell_exec(state: &mut State) -> ParseResult<Expression> {

 #[inline(always)]
 fn doc_string(state: &mut State, kind: DocStringKind) -> ParseResult<Expression> {
+    let span = state.current.span;
    state.next();

    Ok(match kind {
@ -992,19 +995,65 @@ fn doc_string(state: &mut State, kind: DocStringKind) -> ParseResult<Expression>

            state.next();

-            // FIXME: Can we move this logic above into the loop, by peeking ahead in
-            //        the token stream for the EndHeredoc? Might be more performant.
-            if let Some(indentation_type) = indentation_type {
-                let search_char: u8 = indentation_type.into();
+            let mut new_line = true;
+            if indentation_type != DocStringIndentationKind::None {
+                let indentation_char: u8 = indentation_type.into();

                for part in parts.iter_mut() {
+                    // We only need to strip and validate indentation
+                    // for individual lines, so we can skip checks if
+                    // we know we're not on a new line.
+                    if !new_line {
+                        continue;
+                    }
+
                    match part {
                        StringPart::Const(bytes) => {
-                            for _ in 0..indentation_amount {
-                                if bytes.starts_with(&[search_char]) {
-                                    bytes.remove(0);
-                                }
+                            // 1. If this line doesn't start with any whitespace,
+                            //    we can return an error early because we know
+                            //    the label was indented.
+                            if !bytes.starts_with(&[b' ']) && !bytes.starts_with(&[b'\t']) {
+                                return Err(ParseError::SyntaxError(
+                                    SyntaxError::InvalidDocBodyIndentationLevel(
+                                        indentation_amount,
+                                        span,
+                                    ),
+                                ));
                            }
+
+                            // 2. If this line doesn't start with the correct
+                            //    type of whitespace, we can also return an error.
+                            if !bytes.starts_with(&[indentation_char]) {
+                                return Err(ParseError::SyntaxError(
+                                    SyntaxError::InvalidDocIndentation(span),
+                                ));
+                            }
+
+                            // 3. We now know that the whitespace at the start of
+                            //    this line is correct, so we need to check that the
+                            //    amount of whitespace is correct too. In this case,
+                            //    the amount of whitespace just needs to be at least
+                            //    the same, so we can create a vector containing the
+                            //    minimum and check using `starts_with()`.
+                            let expected_whitespace_buffer =
+                                vec![indentation_char; indentation_amount];
+                            if !bytes.starts_with(&expected_whitespace_buffer) {
+                                return Err(ParseError::SyntaxError(
+                                    SyntaxError::InvalidDocBodyIndentationLevel(
+                                        indentation_amount,
+                                        span,
+                                    ),
+                                ));
+                            }
+
+                            // 4. All of the above checks have passed, so we know
+                            //    there are no more possible errors. Let's now
+                            //    strip the leading whitespace accordingly.
+                            *bytes = bytes
+                                .strip_prefix(&expected_whitespace_buffer[..])
+                                .unwrap()
+                                .into();
+                            new_line = bytes.ends_with(&[b'\n']);
                        }
                        _ => continue,
                    }
@ -1014,31 +1063,71 @@ fn doc_string(state: &mut State, kind: DocStringKind) -> ParseResult<Expression>
            Expression::Heredoc { parts }
        }
        DocStringKind::Nowdoc => {
-            // FIXME: This feels hacky. We should probably produce different tokens from the lexer
-            //        but since I already had the logic in place for parsing heredocs, this was
-            //        the fastest way to get nowdocs working too.
-            let mut s = expect_token!([
-                    TokenKind::StringPart(s) => s
-                ], state, "constant string");
+            let mut string_part = expect_token!([
+                TokenKind::StringPart(s) => s,
+            ], state, "constant string");

-            let (indentation_type, indentation_amount) = expect_token!([
-                    TokenKind::EndDocString(_, indentation_type, indentation_amount) => (indentation_type, indentation_amount)
-                ], state, "label");
+            let (indentation_type, indentation_amount) = match state.current.kind {
+                TokenKind::EndDocString(_, indentation_type, indentation_amount) => {
+                    (indentation_type, indentation_amount)
+                }
+                _ => unreachable!(),
+            };

-            // FIXME: Hacky code, but it's late and I want to get this done.
-            if let Some(indentation_type) = indentation_type {
-                let search_char: u8 = indentation_type.into();
-                let mut lines = s
+            state.next();
+
+            if indentation_type != DocStringIndentationKind::None {
+                let indentation_char: u8 = indentation_type.into();
+
+                let mut lines = string_part
                    .split(|b| *b == b'\n')
                    .map(|s| s.to_vec())
                    .collect::<Vec<Vec<u8>>>();
+
                for line in lines.iter_mut() {
-                    for _ in 0..indentation_amount {
-                        if line.starts_with(&[search_char]) {
-                            line.remove(0);
-                        }
+                    if line.is_empty() {
+                        continue;
                    }
+
+                    // 1. If this line doesn't start with any whitespace,
+                    //    we can return an error early because we know
+                    //    the label was indented.
+                    if !line.starts_with(&[b' ']) && !line.starts_with(&[b'\t']) {
+                        return Err(ParseError::SyntaxError(
+                            SyntaxError::InvalidDocBodyIndentationLevel(indentation_amount, span),
+                        ));
+                    }
+
+                    // 2. If this line doesn't start with the correct
+                    //    type of whitespace, we can also return an error.
+                    if !line.starts_with(&[indentation_char]) {
+                        return Err(ParseError::SyntaxError(SyntaxError::InvalidDocIndentation(
+                            span,
+                        )));
+                    }
+
+                    // 3. We now know that the whitespace at the start of
+                    //    this line is correct, so we need to check that the
+                    //    amount of whitespace is correct too. In this case,
+                    //    the amount of whitespace just needs to be at least
+                    //    the same, so we can create a vector containing the
+                    //    minimum and check using `starts_with()`.
+                    let expected_whitespace_buffer = vec![indentation_char; indentation_amount];
+                    if !line.starts_with(&expected_whitespace_buffer) {
+                        return Err(ParseError::SyntaxError(
+                            SyntaxError::InvalidDocBodyIndentationLevel(indentation_amount, span),
+                        ));
+                    }
+
+                    // 4. All of the above checks have passed, so we know
+                    //    there are no more possible errors. Let's now
+                    //    strip the leading whitespace accordingly.
+                    *line = line
+                        .strip_prefix(&expected_whitespace_buffer[..])
+                        .unwrap()
+                        .into();
                }
+
                let mut bytes = Vec::new();
                for (i, line) in lines.iter().enumerate() {
                    bytes.extend(line);
@ -1046,10 +1135,10 @@ fn doc_string(state: &mut State, kind: DocStringKind) -> ParseResult<Expression>
                        bytes.push(b'\n');
                    }
                }
-                s = bytes.into();
+                string_part = bytes.into();
            }

-            Expression::Nowdoc { value: s }
+            Expression::Nowdoc { value: string_part }
        }
    })
 }
--- a/tests/fixtures/0226/lexer-error.txt
+++ b/tests/fixtures/0226/lexer-error.txt
@ -1 +0,0 @@
-InvalidDocBodyIndentationLevel(4, (5, 5)) -> Syntax Error: Invalid body indentation level - expecting an indentation level of at least 4 on line 5
--- a/tests/fixtures/0226/parser-error.txt
+++ b/tests/fixtures/0226/parser-error.txt
@ -0,0 +1 @@
+SyntaxError(InvalidDocBodyIndentationLevel(4, (3, 1))) -> Syntax Error: Invalid body indentation level - expecting an indentation level of at least 4 on line 3
--- a/tests/fixtures/0227/lexer-error.txt
+++ b/tests/fixtures/0227/lexer-error.txt
@ -1 +0,0 @@
-InvalidDocIndentation((5, 1)) -> Syntax Error: Invalid indentation - cannot use tabs and spaces on line 5
--- a/tests/fixtures/0227/parser-error.txt
+++ b/tests/fixtures/0227/parser-error.txt
@ -0,0 +1 @@
+SyntaxError(InvalidDocIndentation((3, 1))) -> Syntax Error: Invalid indentation - cannot use tabs and spaces on line 3
--- a/tests/fixtures/0233/lexer-error.txt
+++ b/tests/fixtures/0233/lexer-error.txt
@ -1 +0,0 @@
-InvalidDocBodyIndentationLevel(4, (5, 5)) -> Syntax Error: Invalid body indentation level - expecting an indentation level of at least 4 on line 5
--- a/tests/fixtures/0233/parser-error.txt
+++ b/tests/fixtures/0233/parser-error.txt
@ -0,0 +1 @@
+SyntaxError(InvalidDocBodyIndentationLevel(4, (3, 1))) -> Syntax Error: Invalid body indentation level - expecting an indentation level of at least 4 on line 3
--- a/tests/fixtures/0234/lexer-error.txt
+++ b/tests/fixtures/0234/lexer-error.txt
@ -1 +0,0 @@
-InvalidDocBodyIndentationLevel(4, (5, 5)) -> Syntax Error: Invalid body indentation level - expecting an indentation level of at least 4 on line 5
--- a/tests/fixtures/0234/parser-error.txt
+++ b/tests/fixtures/0234/parser-error.txt
@ -0,0 +1 @@
+SyntaxError(InvalidDocBodyIndentationLevel(4, (3, 1))) -> Syntax Error: Invalid body indentation level - expecting an indentation level of at least 4 on line 3
				`@ -1 +0,0 @@`
				`InvalidDocBodyIndentationLevel(4, (5, 5)) -> Syntax Error: Invalid body indentation level - expecting an indentation level of at least 4 on line 5`
				`@ -0,0 +1 @@`
				`SyntaxError(InvalidDocBodyIndentationLevel(4, (3, 1))) -> Syntax Error: Invalid body indentation level - expecting an indentation level of at least 4 on line 3`
				`@ -1 +0,0 @@`
				`InvalidDocIndentation((5, 1)) -> Syntax Error: Invalid indentation - cannot use tabs and spaces on line 5`
				`@ -0,0 +1 @@`
				`SyntaxError(InvalidDocIndentation((3, 1))) -> Syntax Error: Invalid indentation - cannot use tabs and spaces on line 3`