fix: better heredoc/nowdoc implementation (#183)

This commit is contained in:
Ryan Chandler 2022-12-09 01:44:20 +00:00 committed by GitHub
parent bd2023c3c4
commit 825577d18d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 435 additions and 193 deletions

View File

@ -68,7 +68,7 @@ impl Lexer {
// The shell exec state is entered when inside of an execution string (`).
StackFrame::ShellExec => self.shell_exec(&mut state, &mut tokens)?,
// The doc string state is entered when tokenizing heredocs and nowdocs.
StackFrame::DocString(kind, label) => {
StackFrame::DocString(kind, label, ..) => {
let kind = *kind;
let label = label.clone();
@ -471,7 +471,12 @@ impl Lexer {
}
state.source.next();
state.replace(StackFrame::DocString(doc_string_kind, label.clone()));
state.replace(StackFrame::DocString(
doc_string_kind,
label.clone(),
DocStringIndentationKind::None,
0,
));
TokenKind::StartDocString(label, doc_string_kind)
}
@ -877,52 +882,128 @@ impl Lexer {
kind: DocStringKind,
label: ByteString,
) -> SyntaxResult<()> {
let span = state.source.span();
let mut buffer = Vec::new();
let mut new_line = false;
let mut indentation_amount: usize = 0;
// 1. Check if there's any whitespace here. It can either be a space or tab character.
let indentation_type = match state.source.read(1) {
[b' '] => Some(DocStringIndentationKind::Space),
[b'\t'] => Some(DocStringIndentationKind::Tab),
_ => None,
match kind {
DocStringKind::Heredoc => self.heredoc(state, tokens, label)?,
DocStringKind::Nowdoc => self.nowdoc(state, tokens, label)?,
};
// 2. Count how much whitespace there is on this line.
if let Some(indentation_type) = indentation_type {
loop {
match (indentation_type, state.source.read(1)) {
(DocStringIndentationKind::Space, [b' ']) => {
indentation_amount += 1;
state.source.next();
buffer.push(b' ');
}
(DocStringIndentationKind::Tab, [b'\t']) => {
indentation_amount += 1;
state.source.next();
buffer.push(b'\t');
}
_ => break,
};
}
}
Ok(())
}
fn heredoc(
&self,
state: &mut State,
tokens: &mut Vec<Token>,
label: ByteString,
) -> SyntaxResult<()> {
let span = state.source.span();
let mut buffer: Vec<u8> = Vec::new();
let kind = loop {
match state.source.read(2) {
[b'$', b'{'] if kind == DocStringKind::Heredoc => {
match state.source.read(3) {
[b'$', b'{', ..] => {
state.source.skip(2);
state.enter(StackFrame::LookingForVarname);
break TokenKind::DollarLeftBrace;
}
[b'{', b'$'] if kind == DocStringKind::Heredoc => {
[b'{', b'$', ..] => {
// Intentionally only consume the left brace.
state.source.next();
state.enter(StackFrame::Scripting);
break TokenKind::LeftBrace;
}
[b'$', ident_start!()] if kind == DocStringKind::Heredoc => {
&[b'\\', b @ (b'"' | b'\\' | b'$'), ..] => {
state.source.skip(2);
buffer.push(b);
}
&[b'\\', b'n', ..] => {
state.source.skip(2);
buffer.push(b'\n');
}
&[b'\\', b'r', ..] => {
state.source.skip(2);
buffer.push(b'\r');
}
&[b'\\', b't', ..] => {
state.source.skip(2);
buffer.push(b'\t');
}
&[b'\\', b'v', ..] => {
state.source.skip(2);
buffer.push(b'\x0b');
}
&[b'\\', b'e', ..] => {
state.source.skip(2);
buffer.push(b'\x1b');
}
&[b'\\', b'f', ..] => {
state.source.skip(2);
buffer.push(b'\x0c');
}
&[b'\\', b'x', b @ (b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')] => {
state.source.skip(3);
let mut hex = String::from(b as char);
if let Some(b @ (b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')) =
state.source.current()
{
state.source.next();
hex.push(*b as char);
}
let b = u8::from_str_radix(&hex, 16).unwrap();
buffer.push(b);
}
&[b'\\', b'u', b'{'] => {
state.source.skip(3);
let mut code_point = String::new();
while let Some(b @ (b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')) =
state.source.current()
{
state.source.next();
code_point.push(*b as char);
}
if code_point.is_empty() || state.source.current() != Some(&b'}') {
return Err(SyntaxError::InvalidUnicodeEscape(state.source.span()));
}
state.source.next();
let c = if let Ok(c) = u32::from_str_radix(&code_point, 16) {
c
} else {
return Err(SyntaxError::InvalidUnicodeEscape(state.source.span()));
};
if let Some(c) = char::from_u32(c) {
let mut tmp = [0; 4];
let bytes = c.encode_utf8(&mut tmp);
buffer.extend(bytes.as_bytes());
} else {
return Err(SyntaxError::InvalidUnicodeEscape(state.source.span()));
}
}
&[b'\\', b @ b'0'..=b'7', ..] => {
state.source.skip(2);
let mut octal = String::from(b as char);
if let Some(b @ b'0'..=b'7') = state.source.current() {
state.source.next();
octal.push(*b as char);
}
if let Some(b @ b'0'..=b'7') = state.source.current() {
state.source.next();
octal.push(*b as char);
}
if let Ok(b) = u8::from_str_radix(&octal, 8) {
buffer.push(b);
} else {
return Err(SyntaxError::InvalidOctalEscape(state.source.span()));
}
}
[b'$', ident_start!(), ..] => {
state.source.next();
let ident = self.consume_identifier(state);
@ -936,139 +1017,88 @@ impl Lexer {
break TokenKind::Variable(ident.into());
}
&[b'\n', ..] => {
new_line = true;
state.source.next();
// If we find a new-line, we can start to check if we can see the EndHeredoc token.
[b'\n', ..] => {
buffer.push(b'\n');
state.source.next();
// Check if we can see the closing label right here.
if state.source.at(&label, label.len()) {
state.source.skip(label.len());
state.replace(StackFrame::Scripting);
break TokenKind::EndDocString(label, DocStringIndentationKind::None, 0);
}
// Check if there's any whitespace first.
let (whitespace_kind, whitespace_amount) = match state.source.read(1) {
[b' '] => {
let mut amount = 0;
while state.source.read(1) == [b' '] {
amount += 1;
state.source.next();
}
(DocStringIndentationKind::Space, amount)
}
[b'\t'] => {
let mut amount = 0;
while state.source.read(1) == [b'\t'] {
amount += 1;
state.source.next();
}
(DocStringIndentationKind::Tab, amount)
}
_ => (DocStringIndentationKind::None, 0),
};
// We've figured out what type of whitespace was being used
// at the start of the line.
// We should now check for any extra whitespace, of any kind.
let mut extra_whitespace_buffer = Vec::new();
while let [b @ b' ' | b @ b'\t'] = state.source.read(1) {
extra_whitespace_buffer.push(b);
state.source.next();
}
// We've consumed all leading whitespace on this line now,
// so let's try to read the label again.
if state.source.at(&label, label.len()) {
// We've found the label, finally! We need to do 1 last
// check to make sure there wasn't a mixture of indentation types.
if whitespace_kind != DocStringIndentationKind::None
&& !extra_whitespace_buffer.is_empty()
{
return Err(SyntaxError::InvalidDocIndentation(state.source.span()));
}
// If we get here, only 1 type of indentation was found. We can move
// the process along by reading over the label and breaking out
// with the EndHeredoc token, storing the kind and amount of whitespace.
state.source.skip(label.len());
state.replace(StackFrame::Scripting);
break TokenKind::EndDocString(label, whitespace_kind, whitespace_amount);
} else {
// We didn't find the label. The buffer still needs to know about
// the whitespace, so let's extend the buffer with the whitespace
// and let the loop run again to handle the rest of the line.
if whitespace_kind != DocStringIndentationKind::None {
let whitespace_char: u8 = whitespace_kind.into();
for _ in 0..whitespace_amount {
buffer.push(whitespace_char);
}
}
buffer.extend(extra_whitespace_buffer);
}
}
&[b, ..] => {
// If we're not on a new line, just add to the buffer as usual.
if !new_line {
new_line = false;
state.source.next();
buffer.push(b);
continue;
}
// If we can see the label here, we can consume it and exit early.
if state.source.at(&label, label.length) {
state.source.skip(label.length);
state.replace(StackFrame::Scripting);
break TokenKind::EndDocString(label, None, 0);
}
// We know the label isn't at the start of the line, so we can
// check if the line starts with any whitespace.
let line_starts_with_whitespace =
matches!(state.source.read(1), [b' '] | [b'\t']);
let mut current_indentation_amount = 0;
// If the line does start with whitespace, let's figure out what the current
// indentation type is and how much whitespace there is.
if line_starts_with_whitespace {
let current_indentation_type = match state.source.read(1) {
[b' '] => DocStringIndentationKind::Space,
[b'\t'] => DocStringIndentationKind::Tab,
_ => unreachable!(),
};
// If there was indentation on a previous line, we need to check
// if the current indentation type is the same or different.
// If it's different, we need to produce an error.
if let Some(indentation_type) = indentation_type {
if indentation_type != current_indentation_type {
return Err(SyntaxError::InvalidDocIndentation(
state.source.span(),
));
}
}
let mut leading_whitespace_buffer = Vec::new();
// If the type of whitespace is the same, we want to know
// how much whitespace is on this line. We only care about
// the smallest amount of whitespace in this case.
loop {
match (current_indentation_type, state.source.read(1)) {
(DocStringIndentationKind::Space, [b' ']) => {
leading_whitespace_buffer.push(b' ');
current_indentation_amount += 1;
state.source.next();
}
(DocStringIndentationKind::Tab, [b'\t']) => {
leading_whitespace_buffer.push(b'\t');
current_indentation_amount += 1;
state.source.next();
}
_ => break,
};
}
// If we can read the label at this point, we then need to check if the amount
// of indentation is the same or less than the smallest amount encountered thus far.
if state.source.at(&label, label.length)
&& current_indentation_amount > indentation_amount
{
return Err(SyntaxError::InvalidDocBodyIndentationLevel(
current_indentation_amount,
state.source.span(),
));
}
// If we've found less whitespace here, we should update the minimum.
if current_indentation_amount < indentation_amount {
indentation_amount = current_indentation_amount;
}
let mut whitespace_buffer = Vec::new();
// We should now try to consume anymore whitespace, since the doc body
// can include spaces or tabs. We should also push it to the buffer,
// in case we don't encounter the label. In theory, the only whitespace
// we'll encounter here is the character not found by the checks above.
while let [b @ b' ' | b @ b'\t'] = state.source.read(1) {
whitespace_buffer.push(*b);
state.source.next();
}
// Check if we can read the label again now.
if state.source.at(&label, label.length) {
// If there was extra whitespace after indentation, we need
// to error out about mixed indentation types.
if !whitespace_buffer.is_empty() {
return Err(SyntaxError::InvalidDocIndentation(
state.source.span(),
));
}
// If no extra whitespace was found, we've reached the end of the heredoc
// and can consume the label, sending the indentation amount along to the parser
// to normalize.
state.source.skip(label.length);
state.replace(StackFrame::Scripting);
break TokenKind::EndDocString(
label,
indentation_type,
current_indentation_amount,
);
} else {
buffer.extend(leading_whitespace_buffer);
buffer.extend(whitespace_buffer);
continue;
}
} else {
new_line = false;
state.source.next();
buffer.push(b);
}
state.source.next();
buffer.push(b);
}
[] => return Err(SyntaxError::UnexpectedEndOfFile(state.source.span())),
}
};
// Trailing line breaks in the last segment of a heredoc
// shouldn't end up in the final string.
// Any trailing line breaks should be removed from the final heredoc.
if buffer.last() == Some(&b'\n') {
buffer.pop();
}
@ -1081,6 +1111,116 @@ impl Lexer {
}
tokens.push(Token { kind, span });
Ok(())
}
/// Lexes the body of a nowdoc, up to and including its closing label.
///
/// Bytes are copied into a buffer exactly as they are read; this function
/// performs no escape-sequence or variable handling. After every new-line the
/// closing `label` is looked for, optionally preceded by spaces or tabs.
/// Once the label is found, the current stack frame is replaced with
/// `StackFrame::Scripting` and two tokens are pushed onto `tokens`: a
/// `StringPart` holding the buffered body (only when it is non-empty) and an
/// `EndDocString` carrying the label plus the kind and amount of whitespace
/// that preceded it. Both tokens use the span captured on entry.
///
/// # Errors
///
/// * `SyntaxError::InvalidDocIndentation` when the whitespace in front of the
///   closing label is a run of one kind followed by further whitespace.
/// * `SyntaxError::UnexpectedEndOfFile` when `state.source.read` returns an
///   empty slice before the closing label has been found.
fn nowdoc(
&self,
state: &mut State,
tokens: &mut Vec<Token>,
label: ByteString,
) -> SyntaxResult<()> {
// Span of the position where the body starts; reused for every token pushed below.
let span = state.source.span();
let mut buffer: Vec<u8> = Vec::new();
let kind = loop {
match state.source.read(3) {
// If we find a new-line, we can start to check if we can see the closing label
// (which produces the `EndDocString` token).
[b'\n', ..] => {
// The new-line is buffered first; if it turns out to directly precede the
// closing label it is popped again after the loop.
buffer.push(b'\n');
state.source.next();
// Check if we can see the closing label right here.
// NOTE(review): `at` appears to compare only the next `label.len()` bytes, so a
// body line that merely starts with the label text may end the nowdoc — confirm.
if state.source.at(&label, label.len()) {
state.source.skip(label.len());
state.replace(StackFrame::Scripting);
break TokenKind::EndDocString(label, DocStringIndentationKind::None, 0);
}
// Check if there's any whitespace first. The first whitespace byte decides the
// kind, and only an unbroken run of that same byte is counted.
let (whitespace_kind, whitespace_amount) = match state.source.read(1) {
[b' '] => {
let mut amount = 0;
while state.source.read(1) == [b' '] {
amount += 1;
state.source.next();
}
(DocStringIndentationKind::Space, amount)
}
[b'\t'] => {
let mut amount = 0;
while state.source.read(1) == [b'\t'] {
amount += 1;
state.source.next();
}
(DocStringIndentationKind::Tab, amount)
}
_ => (DocStringIndentationKind::None, 0),
};
// We've figured out what type of whitespace was being used
// at the start of the line.
// We should now check for any extra whitespace, of any kind.
let mut extra_whitespace_buffer = Vec::new();
while let [b @ b' ' | b @ b'\t'] = state.source.read(1) {
extra_whitespace_buffer.push(b);
state.source.next();
}
// We've consumed all leading whitespace on this line now,
// so let's try to read the label again.
if state.source.at(&label, label.len()) {
// We've found the label, finally! We need to do 1 last
// check to make sure there wasn't a mixture of indentation types.
if whitespace_kind != DocStringIndentationKind::None
&& !extra_whitespace_buffer.is_empty()
{
return Err(SyntaxError::InvalidDocIndentation(state.source.span()));
}
// If we get here, only 1 type of indentation was found. We can move
// the process along by reading over the label and breaking out
// with the `EndDocString` token, storing the kind and amount of whitespace.
state.source.skip(label.len());
state.replace(StackFrame::Scripting);
break TokenKind::EndDocString(label, whitespace_kind, whitespace_amount);
} else {
// We didn't find the label. The buffer still needs to know about
// the whitespace, so let's extend the buffer with the whitespace
// and let the loop run again to handle the rest of the line.
if whitespace_kind != DocStringIndentationKind::None {
let whitespace_char: u8 = whitespace_kind.into();
for _ in 0..whitespace_amount {
buffer.push(whitespace_char);
}
}
buffer.extend(extra_whitespace_buffer);
}
}
// Any other byte belongs to the body and is copied through unchanged.
&[b, ..] => {
state.source.next();
buffer.push(b);
}
// Nothing left to read and the closing label was never seen.
[] => return Err(SyntaxError::UnexpectedEndOfFile(state.source.span())),
}
};
// The new-line that sits directly in front of the closing label should not be part
// of the nowdoc's value. Only a single trailing `\n` is removed here.
if buffer.last() == Some(&b'\n') {
buffer.pop();
}
// An empty body produces no `StringPart` token at all.
if !buffer.is_empty() {
tokens.push(Token {
kind: TokenKind::StringPart(buffer.into()),
span,
})
}
tokens.push(Token { kind, span });
Ok(())
}

View File

@ -5,6 +5,9 @@ use crate::lexer::error::SyntaxError;
use crate::lexer::error::SyntaxResult;
use crate::lexer::source::Source;
use super::token::DocStringIndentationAmount;
use super::token::DocStringIndentationKind;
#[derive(Debug, PartialEq, Eq, PartialOrd, Clone, Copy)]
pub enum DocStringKind {
Heredoc,
@ -18,7 +21,12 @@ pub enum StackFrame {
Halted,
DoubleQuote,
ShellExec,
DocString(DocStringKind, ByteString),
DocString(
DocStringKind,
ByteString,
DocStringIndentationKind,
DocStringIndentationAmount,
),
LookingForVarname,
LookingForProperty,
VarOffset,

View File

@ -11,10 +11,14 @@ pub enum OpenTagKind {
Full,
}
pub type DocStringIndentationAmount = usize;
/// The kind of whitespace used to indent a heredoc/nowdoc, as carried by
/// `StackFrame::DocString` and `TokenKind::EndDocString`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum DocStringIndentationKind {
/// Indentation made of space characters (`b' '`).
Space,
/// Indentation made of tab characters (`b'\t'`).
Tab,
/// No indentation. The conversion into `u8` does not cover this variant
/// (it falls into the `unreachable!()` arm), so check for it before converting.
None,
/// NOTE(review): not constructed anywhere in the visible code — presumably
/// marks a mixture of spaces and tabs; confirm. Like `None`, it is not covered
/// by the conversion into `u8`.
Both,
}
impl From<u8> for DocStringIndentationKind {
@ -32,6 +36,7 @@ impl From<DocStringIndentationKind> for u8 {
match kind {
DocStringIndentationKind::Space => b' ',
DocStringIndentationKind::Tab => b'\t',
_ => unreachable!(),
}
}
}
@ -43,7 +48,7 @@ pub enum TokenKind {
Parent,
Backtick,
StartDocString(ByteString, DocStringKind),
EndDocString(ByteString, Option<DocStringIndentationKind>, usize),
EndDocString(ByteString, DocStringIndentationKind, usize),
From,
Print,
Dollar,

View File

@ -1,5 +1,7 @@
use crate::expect_token;
use crate::expected_token_err;
use crate::lexer::error::SyntaxError;
use crate::lexer::token::DocStringIndentationKind;
use crate::lexer::token::TokenKind;
use crate::lexer::DocStringKind;
use crate::parser::ast;
@ -971,6 +973,7 @@ fn shell_exec(state: &mut State) -> ParseResult<Expression> {
#[inline(always)]
fn doc_string(state: &mut State, kind: DocStringKind) -> ParseResult<Expression> {
let span = state.current.span;
state.next();
Ok(match kind {
@ -992,19 +995,65 @@ fn doc_string(state: &mut State, kind: DocStringKind) -> ParseResult<Expression>
state.next();
// FIXME: Can we move this logic above into the loop, by peeking ahead in
// the token stream for the EndHeredoc? Might be more performant.
if let Some(indentation_type) = indentation_type {
let search_char: u8 = indentation_type.into();
let mut new_line = true;
if indentation_type != DocStringIndentationKind::None {
let indentation_char: u8 = indentation_type.into();
for part in parts.iter_mut() {
// We only need to strip and validate indentation
// for individual lines, so we can skip checks if
// we know we're not on a new line.
if !new_line {
continue;
}
match part {
StringPart::Const(bytes) => {
for _ in 0..indentation_amount {
if bytes.starts_with(&[search_char]) {
bytes.remove(0);
}
// 1. If this line doesn't start with any whitespace,
// we can return an error early because we know
// the label was indented.
if !bytes.starts_with(&[b' ']) && !bytes.starts_with(&[b'\t']) {
return Err(ParseError::SyntaxError(
SyntaxError::InvalidDocBodyIndentationLevel(
indentation_amount,
span,
),
));
}
// 2. If this line doesn't start with the correct
// type of whitespace, we can also return an error.
if !bytes.starts_with(&[indentation_char]) {
return Err(ParseError::SyntaxError(
SyntaxError::InvalidDocIndentation(span),
));
}
// 3. We now know that the whitespace at the start of
// this line is correct, so we need to check that the
// amount of whitespace is correct too. In this case,
// the amount of whitespace just needs to be at least
// the same, so we can create a vector containing the
// minimum and check using `starts_with()`.
let expected_whitespace_buffer =
vec![indentation_char; indentation_amount];
if !bytes.starts_with(&expected_whitespace_buffer) {
return Err(ParseError::SyntaxError(
SyntaxError::InvalidDocBodyIndentationLevel(
indentation_amount,
span,
),
));
}
// 4. All of the above checks have passed, so we know
// there are no more possible errors. Let's now
// strip the leading whitespace accordingly.
*bytes = bytes
.strip_prefix(&expected_whitespace_buffer[..])
.unwrap()
.into();
new_line = bytes.ends_with(&[b'\n']);
}
_ => continue,
}
@ -1014,31 +1063,71 @@ fn doc_string(state: &mut State, kind: DocStringKind) -> ParseResult<Expression>
Expression::Heredoc { parts }
}
DocStringKind::Nowdoc => {
// FIXME: This feels hacky. We should probably produce different tokens from the lexer
// but since I already had the logic in place for parsing heredocs, this was
// the fastest way to get nowdocs working too.
let mut s = expect_token!([
TokenKind::StringPart(s) => s
], state, "constant string");
let mut string_part = expect_token!([
TokenKind::StringPart(s) => s,
], state, "constant string");
let (indentation_type, indentation_amount) = expect_token!([
TokenKind::EndDocString(_, indentation_type, indentation_amount) => (indentation_type, indentation_amount)
], state, "label");
let (indentation_type, indentation_amount) = match state.current.kind {
TokenKind::EndDocString(_, indentation_type, indentation_amount) => {
(indentation_type, indentation_amount)
}
_ => unreachable!(),
};
// FIXME: Hacky code, but it's late and I want to get this done.
if let Some(indentation_type) = indentation_type {
let search_char: u8 = indentation_type.into();
let mut lines = s
state.next();
if indentation_type != DocStringIndentationKind::None {
let indentation_char: u8 = indentation_type.into();
let mut lines = string_part
.split(|b| *b == b'\n')
.map(|s| s.to_vec())
.collect::<Vec<Vec<u8>>>();
for line in lines.iter_mut() {
for _ in 0..indentation_amount {
if line.starts_with(&[search_char]) {
line.remove(0);
}
if line.is_empty() {
continue;
}
// 1. If this line doesn't start with any whitespace,
// we can return an error early because we know
// the label was indented.
if !line.starts_with(&[b' ']) && !line.starts_with(&[b'\t']) {
return Err(ParseError::SyntaxError(
SyntaxError::InvalidDocBodyIndentationLevel(indentation_amount, span),
));
}
// 2. If this line doesn't start with the correct
// type of whitespace, we can also return an error.
if !line.starts_with(&[indentation_char]) {
return Err(ParseError::SyntaxError(SyntaxError::InvalidDocIndentation(
span,
)));
}
// 3. We now know that the whitespace at the start of
// this line is correct, so we need to check that the
// amount of whitespace is correct too. In this case,
// the amount of whitespace just needs to be at least
// the same, so we can create a vector containing the
// minimum and check using `starts_with()`.
let expected_whitespace_buffer = vec![indentation_char; indentation_amount];
if !line.starts_with(&expected_whitespace_buffer) {
return Err(ParseError::SyntaxError(
SyntaxError::InvalidDocBodyIndentationLevel(indentation_amount, span),
));
}
// 4. All of the above checks have passed, so we know
// there are no more possible errors. Let's now
// strip the leading whitespace accordingly.
*line = line
.strip_prefix(&expected_whitespace_buffer[..])
.unwrap()
.into();
}
let mut bytes = Vec::new();
for (i, line) in lines.iter().enumerate() {
bytes.extend(line);
@ -1046,10 +1135,10 @@ fn doc_string(state: &mut State, kind: DocStringKind) -> ParseResult<Expression>
bytes.push(b'\n');
}
}
s = bytes.into();
string_part = bytes.into();
}
Expression::Nowdoc { value: s }
Expression::Nowdoc { value: string_part }
}
})
}

View File

@ -1 +0,0 @@
InvalidDocBodyIndentationLevel(4, (5, 5)) -> Syntax Error: Invalid body indentation level - expecting an indentation level of at least 4 on line 5

1
tests/fixtures/0226/parser-error.txt vendored Normal file
View File

@ -0,0 +1 @@
SyntaxError(InvalidDocBodyIndentationLevel(4, (3, 1))) -> Syntax Error: Invalid body indentation level - expecting an indentation level of at least 4 on line 3

View File

@ -1 +0,0 @@
InvalidDocIndentation((5, 1)) -> Syntax Error: Invalid indentation - cannot use tabs and spaces on line 5

1
tests/fixtures/0227/parser-error.txt vendored Normal file
View File

@ -0,0 +1 @@
SyntaxError(InvalidDocIndentation((3, 1))) -> Syntax Error: Invalid indentation - cannot use tabs and spaces on line 3

View File

@ -1 +0,0 @@
InvalidDocBodyIndentationLevel(4, (5, 5)) -> Syntax Error: Invalid body indentation level - expecting an indentation level of at least 4 on line 5

1
tests/fixtures/0233/parser-error.txt vendored Normal file
View File

@ -0,0 +1 @@
SyntaxError(InvalidDocBodyIndentationLevel(4, (3, 1))) -> Syntax Error: Invalid body indentation level - expecting an indentation level of at least 4 on line 3

View File

@ -1 +0,0 @@
InvalidDocBodyIndentationLevel(4, (5, 5)) -> Syntax Error: Invalid body indentation level - expecting an indentation level of at least 4 on line 5

1
tests/fixtures/0234/parser-error.txt vendored Normal file
View File

@ -0,0 +1 @@
SyntaxError(InvalidDocBodyIndentationLevel(4, (3, 1))) -> Syntax Error: Invalid body indentation level - expecting an indentation level of at least 4 on line 3