lexer: Rework lookahead

Evan Shaw 2022-09-15 07:57:19 +12:00
parent fd118ea0da
commit cbed61165f

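This change removes the lexer's single-byte `peek` field and derives all lookahead from the cursor instead: `peek()` returns the remaining input as a byte slice, so multi-character operators are matched with one slice pattern rather than chained `if let Some(..) = self.peek` checks, and each arm advances the cursor itself with `next()` or `skip(n)`. A minimal sketch of the idea, using illustrative names rather than the project's types:

// Sketch only: a cursor over bytes with slice-based lookahead (names are illustrative).
struct Cursor {
    chars: Vec<u8>,
    cursor: usize,
}

impl Cursor {
    // Everything from the current position onwards; index 0 is the current byte.
    fn peek(&self) -> &[u8] {
        &self.chars[self.cursor..]
    }

    fn skip(&mut self, count: usize) {
        self.cursor += count;
    }

    // Longest-match dispatch on multi-byte operators via slice patterns.
    fn read_question_operator(&mut self) -> &'static str {
        match self.peek() {
            [b'?', b'?', b'=', ..] => { self.skip(3); "??=" }
            [b'?', b'?', ..]       => { self.skip(2); "??"  }
            [b'?', b':', ..]       => { self.skip(2); "?:"  }
            [b'?', ..]             => { self.skip(1); "?"   }
            _ => "",
        }
    }
}

The patterns are tried top to bottom, so longer operators must appear before their prefixes; the reworked `scripting()` below follows that ordering.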
@@ -19,7 +19,6 @@ pub struct Lexer {
     chars: Vec<u8>,
     cursor: usize,
     current: Option<u8>,
-    peek: Option<u8>,
     col: usize,
     line: usize,
 }
@@ -32,7 +31,6 @@ impl Lexer {
             chars: Vec::new(),
             cursor: 0,
             current: None,
-            peek: None,
             line: 1,
             col: 0,
         }
@@ -45,10 +43,9 @@ impl Lexer {
         let mut tokens = Vec::new();
         self.chars = input.as_ref().to_vec();
-        self.next();
-        self.next();
-        while self.peek.is_some() {
+        self.current = self.chars.get(0).copied();
+        while self.current.is_some() {
             match self.state {
                 // The "Initial" state is used to parse inline HTML. It is essentially a catch-all
                 // state that will build up a single token buffer until it encounters an open tag
@@ -59,7 +56,7 @@ impl Lexer {
                 // The scripting state is entered when an open tag is encountered in the source code.
                 // This tells the lexer to start analysing characters at PHP tokens instead of inline HTML.
                 LexerState::Scripting => {
-                    while let Some(c) = self.peek {
+                    while let Some(c) = self.current {
                         if !c.is_ascii_whitespace() && ![b'\n', b'\t', b'\r'].contains(&c) {
                             break;
                         }
@@ -75,7 +72,7 @@ impl Lexer {
                    }

                    // If we have consumed whitespace and then reached the end of the file, we should break.
-                   if self.peek.is_none() {
+                   if self.current.is_none() {
                        break;
                    }
@@ -91,64 +88,31 @@ impl Lexer {
     fn initial(&mut self) -> Result<Vec<Token>, LexerError> {
         let mut buffer = Vec::new();
         while let Some(char) = self.current {
-            match char {
-                b'<' => {
-                    // This is disgusting and can most definitely be tidied up with a multi-peek iterator.
-                    if let Some(b'?') = self.peek {
-                        self.next();
-                        if let Some(b'p') = self.peek {
-                            self.next();
-                            if let Some(b'h') = self.peek {
-                                self.next();
-                                if let Some(b'p') = self.peek {
-                                    self.next();
-                                    self.col += 4;
-                                    self.enter_state(LexerState::Scripting);
-                                    let mut tokens = vec![];
-                                    if !buffer.is_empty() {
-                                        tokens.push(Token {
-                                            kind: TokenKind::InlineHtml(buffer.into()),
-                                            span: (self.line, self.col.saturating_sub(5)),
-                                        });
-                                    }
-                                    tokens.push(Token {
-                                        kind: TokenKind::OpenTag(OpenTagKind::Full),
-                                        span: (self.line, self.col),
-                                    });
-                                    return Ok(tokens);
-                                }
-                            } else {
-                                self.col += 3;
-                                buffer.push(b'h');
-                            }
-                        } else {
-                            self.col += 2;
-                            buffer.push(b'?');
-                        }
-                    } else {
-                        self.next();
-                        self.col += 1;
-                        buffer.push(char);
-                    }
-                }
-                _ => {
-                    self.next();
-                    buffer.push(char);
-                }
-            }
+            if self.try_read(b"<?php") {
+                self.skip(5);
+                self.col += 4;
+                self.enter_state(LexerState::Scripting);
+                let mut tokens = vec![];
+                if !buffer.is_empty() {
+                    tokens.push(Token {
+                        kind: TokenKind::InlineHtml(buffer.into()),
+                        span: (self.line, self.col.saturating_sub(5)),
+                    });
+                }
+                tokens.push(Token {
+                    kind: TokenKind::OpenTag(OpenTagKind::Full),
+                    span: (self.line, self.col),
+                });
+                return Ok(tokens);
+            }
+            self.next();
+            buffer.push(char);
         }
         Ok(vec![Token {
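With slice-based lookahead, the inline HTML scan above no longer needs the nested single-byte peeks: `try_read(b"<?php")`, defined near the end of this diff as `self.peek().starts_with(search)`, answers the five-byte question in one call, and `skip(5)` then jumps past the open tag. A rough illustration of the same check on a plain byte slice, with made-up input values:

// Sketch: starts_with gives multi-byte lookahead without tracking a separate peek byte.
fn at_open_tag(rest: &[u8]) -> bool {
    rest.starts_with(b"<?php")
}

fn main() {
    assert!(at_open_tag(b"<?php echo 1;"));
    assert!(!at_open_tag(b"<p>html</p>"));
}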
@@ -160,180 +124,166 @@ impl Lexer {
     fn scripting(&mut self) -> Result<Token, LexerError> {
         // We should never reach this point since we have the empty checks surrounding
         // the call to this function, but it's better to be safe than sorry.
-        if self.peek.is_none() {
-            return Err(LexerError::UnexpectedEndOfFile);
-        }
-
-        // Since we have the check above, we can safely unwrap the result of `.next()`
-        // to help reduce the amount of indentation.
-        self.next();
-        let char = self.current.unwrap();
-
-        let kind = match char {
-            b'@' => {
-                self.col += 1;
-                TokenKind::At
-            }
-            b'!' => {
-                self.col += 1;
-                if let Some(b'=') = self.peek {
-                    self.col += 1;
-                    self.next();
-                    if let Some(b'=') = self.peek {
-                        self.col += 1;
-                        self.next();
-                        TokenKind::BangDoubleEquals
-                    } else {
-                        TokenKind::BangEquals
-                    }
-                } else {
-                    TokenKind::Bang
-                }
-            }
-            b'&' => {
-                self.col += 1;
-                if let Some(b'&') = self.peek {
-                    self.col += 1;
-                    self.next();
-                    TokenKind::BooleanAnd
-                } else {
-                    TokenKind::Ampersand
-                }
-            }
-            b'?' => {
-                // This is a close tag, we can enter "Initial" mode again.
-                if let Some(b'>') = self.peek {
-                    self.next();
-                    self.next();
-                    self.col += 2;
-                    self.enter_state(LexerState::Initial);
-                    TokenKind::CloseTag
-                } else if let Some(b'?') = self.peek {
-                    self.col += 1;
-                    self.next();
-                    if let Some(b'=') = self.peek {
-                        self.col += 1;
-                        self.next();
-                        TokenKind::CoalesceEqual
-                    } else {
-                        TokenKind::Coalesce
-                    }
-                } else if let Some(b':') = self.peek {
-                    self.col += 1;
-                    self.next();
-                    TokenKind::QuestionColon
-                } else if self.try_read(b"->") {
-                    self.col += 1;
-                    self.skip(2);
-                    TokenKind::NullsafeArrow
-                } else {
-                    TokenKind::Question
-                }
-            }
-            b'=' => {
-                if let Some(b'=') = self.peek {
-                    self.next();
-                    if let Some(b'=') = self.peek {
-                        self.next();
-                        self.col += 3;
-                        TokenKind::TripleEquals
-                    } else {
-                        self.col += 2;
-                        TokenKind::DoubleEquals
-                    }
-                } else if let Some(b'>') = self.peek {
-                    self.next();
-                    self.col += 1;
-                    TokenKind::DoubleArrow
-                } else {
-                    self.col += 1;
-                    TokenKind::Equals
-                }
-            }
-            // Single quoted string.
-            b'\'' => self.tokenize_single_quote_string(),
-            b'"' => self.tokenize_double_quote_string(),
-            b'$' => self.tokenize_variable(),
-            b'.' => {
-                self.col += 1;
-                if let Some(b'0'..=b'9') = self.peek {
-                    self.tokenize_number(String::from("0."), true)?
-                } else if let Some(b'.') = self.peek {
-                    self.next();
-                    self.col += 1;
-                    if let Some(b'.') = self.peek {
-                        self.next();
-                        self.col += 1;
-                        TokenKind::Ellipsis
-                    } else {
-                        todo!("don't know how to handle this case yet, it should just be 2 Dot tokens...")
-                    }
-                } else if let Some(b'=') = self.peek {
-                    self.next();
-                    self.col += 1;
-                    TokenKind::DotEquals
-                } else {
-                    TokenKind::Dot
-                }
-            }
-            b'0'..=b'9' => self.tokenize_number(String::from(char as char), false)?,
-            b'\\' => {
-                self.col += 1;
-                if self
-                    .peek
-                    .map_or(false, |n| n == b'_' || n.is_ascii_alphabetic())
-                {
-                    match self.scripting()? {
-                        Token {
-                            kind:
-                                TokenKind::Identifier(ByteString(mut i))
-                                | TokenKind::QualifiedIdentifier(ByteString(mut i)),
-                            ..
-                        } => {
-                            i.insert(0, b'\\');
-                            TokenKind::FullyQualifiedIdentifier(i.into())
-                        }
-                        s => unreachable!("{:?}", s),
-                    }
-                } else {
-                    TokenKind::NamespaceSeparator
-                }
-            }
+        let char = match self.current {
+            Some(c) => c,
+            None => return Err(LexerError::UnexpectedEndOfFile),
+        };
+
+        let kind = match self.peek() {
+            [b'@', ..] => {
+                self.next();
+                self.col += 1;
+                TokenKind::At
+            }
+            [b'!', b'=', b'=', ..] => {
+                self.skip(3);
+                self.col += 2;
+                TokenKind::BangDoubleEquals
+            }
+            [b'!', b'=', ..] => {
+                self.skip(2);
+                self.col += 2;
+                TokenKind::BangEquals
+            }
+            [b'!', ..] => {
+                self.next();
+                self.col += 1;
+                TokenKind::BangEquals
+            }
+            [b'&', b'&', ..] => {
+                self.skip(2);
+                self.col += 2;
+                TokenKind::BooleanAnd
+            }
+            [b'&', ..] => {
+                self.next();
+                self.col += 1;
+                TokenKind::Ampersand
+            }
+            [b'?', b'>', ..] => {
+                // This is a close tag, we can enter "Initial" mode again.
+                self.skip(2);
+                self.col += 2;
+                self.enter_state(LexerState::Initial);
+                TokenKind::CloseTag
+            }
+            [b'?', b'?', b'=', ..] => {
+                self.skip(3);
+                self.col += 3;
+                TokenKind::CoalesceEqual
+            }
+            [b'?', b'?', ..] => {
+                self.skip(2);
+                self.col += 2;
+                TokenKind::Coalesce
+            }
+            [b'?', b':', ..] => {
+                self.skip(2);
+                self.col += 2;
+                TokenKind::QuestionColon
+            }
+            [b'?', b'-', b'>', ..] => {
+                self.skip(3);
+                self.col += 3;
+                TokenKind::NullsafeArrow
+            }
+            [b'?', ..] => {
+                self.next();
+                self.col += 1;
+                TokenKind::Question
+            }
+            [b'=', b'>', ..] => {
+                self.skip(2);
+                self.col += 2;
+                TokenKind::DoubleArrow
+            }
+            [b'=', b'=', b'=', ..] => {
+                self.skip(3);
+                self.col += 3;
+                TokenKind::TripleEquals
+            }
+            [b'=', b'=', ..] => {
+                self.skip(2);
+                self.col += 2;
+                TokenKind::DoubleEquals
+            }
+            [b'=', ..] => {
+                self.next();
+                self.col += 1;
+                TokenKind::Equals
+            }
+            // Single quoted string.
+            [b'\'', ..] => {
+                self.next();
+                self.col += 1;
+                self.tokenize_single_quote_string()
+            }
+            [b'"', ..] => {
+                self.next();
+                self.col += 1;
+                self.tokenize_double_quote_string()
+            }
+            [b'$', ..] => {
+                self.next();
+                self.col += 1;
+                self.tokenize_variable()
+            }
+            [b'.', b'=', ..] => {
+                self.skip(2);
+                self.col += 2;
+                TokenKind::DotEquals
+            }
+            [b'.', b'0'..=b'9', ..] => {
+                self.next();
+                self.tokenize_number(String::from("0."), true)?
+            }
+            [b'.', b'.', b'.', ..] => {
+                self.skip(3);
+                self.col += 3;
+                TokenKind::Ellipsis
+            }
+            [b'.', ..] => {
+                self.next();
+                self.col += 1;
+                TokenKind::Dot
+            }
+            [b'0'..=b'9', ..] => {
+                self.next();
+                self.tokenize_number(String::from(char as char), false)?
+            }
+            &[b'\\', n, ..] if n == b'_' || n.is_ascii_alphabetic() => {
+                self.next();
+                self.col += 1;
+                match self.scripting()? {
+                    Token {
+                        kind:
+                            TokenKind::Identifier(ByteString(mut i))
+                            | TokenKind::QualifiedIdentifier(ByteString(mut i)),
+                        ..
+                    } => {
+                        i.insert(0, b'\\');
+                        TokenKind::FullyQualifiedIdentifier(i.into())
+                    }
+                    s => unreachable!("{:?}", s),
+                }
+            }
+            [b'\\', ..] => {
+                self.next();
+                self.col += 1;
+                TokenKind::NamespaceSeparator
+            }
             _ if char.is_ascii_alphabetic() || char == b'_' => {
+                self.next();
                 self.col += 1;
                 let mut qualified = false;
                 let mut last_was_slash = false;
                 let mut buffer = vec![char];
-                while let Some(next) = self.peek {
+                while let Some(next) = self.current {
                     if next.is_ascii_alphanumeric() || next == b'_' {
                         buffer.push(next);
                         self.next();
@@ -361,126 +311,119 @@ impl Lexer {
                         .unwrap_or_else(|| TokenKind::Identifier(buffer.into()))
                 }
             }
-            b'/' | b'#' => {
-                self.col += 1;
-                fn read_till_end_of_line(s: &mut Lexer) -> Vec<u8> {
-                    s.col += 1;
-                    let mut buffer = Vec::new();
-                    while let Some(c) = s.peek {
-                        if c == b'\n' {
-                            break;
-                        }
-                        buffer.push(c);
-                        s.next();
-                    }
-                    buffer
-                }
-                if char == b'/' && self.peek == Some(b'*') {
-                    let mut buffer = vec![char];
-                    while self.peek.is_some() {
-                        self.next();
-                        let t = self.current.unwrap();
-                        match t {
-                            b'*' => {
-                                if let Some(b'/') = self.peek {
-                                    self.col += 2;
-                                    buffer.extend_from_slice(b"*/");
-                                    self.next();
-                                    break;
-                                } else {
-                                    self.col += 1;
-                                    buffer.push(t);
-                                }
-                            }
-                            b'\n' => {
-                                self.line += 1;
-                                self.col = 0;
-                                buffer.push(b'\n');
-                            }
-                            _ => {
-                                self.col += 1;
-                                buffer.push(t);
-                            }
-                        }
-                    }
-                    if buffer.starts_with(b"/**") {
-                        TokenKind::DocComment(buffer.into())
-                    } else {
-                        TokenKind::Comment(buffer.into())
-                    }
-                } else if let Some(b'=') = self.peek {
-                    self.col += 1;
-                    self.next();
-                    TokenKind::SlashEquals
-                } else if char == b'/' && self.peek != Some(b'/') {
-                    TokenKind::Slash
-                } else if char == b'#' && self.peek == Some(b'[') {
-                    TokenKind::Attribute
-                } else {
-                    self.next();
-                    let current = self.current.unwrap();
-                    let mut buffer = read_till_end_of_line(self);
-                    buffer.splice(0..0, [char, current]);
-                    TokenKind::Comment(buffer.into())
-                }
-            }
-            b'*' => {
-                self.col += 1;
-                if let Some(b'*') = self.peek {
-                    self.col += 1;
-                    self.next();
-                    if let Some(b'=') = self.peek {
-                        self.col += 1;
-                        self.next();
-                        TokenKind::PowEquals
-                    } else {
-                        TokenKind::Pow
-                    }
-                } else if let Some(b'=') = self.peek {
-                    self.col += 1;
-                    self.next();
-                    TokenKind::AsteriskEqual
-                } else {
-                    TokenKind::Asterisk
-                }
-            }
-            b'|' => {
-                self.col += 1;
-                if let Some(b'|') = self.peek {
-                    self.col += 1;
-                    self.next();
-                    TokenKind::BooleanOr
-                } else {
-                    TokenKind::Pipe
-                }
-            }
-            b'{' => {
-                self.col += 1;
-                TokenKind::LeftBrace
-            }
-            b'}' => {
-                self.col += 1;
-                TokenKind::RightBrace
-            }
-            b'(' => {
-                self.col += 1;
+            [b'/', b'*', ..] => {
+                self.next();
+                self.col += 1;
+                let mut buffer = vec![char];
+                while self.current.is_some() {
+                    match self.peek() {
+                        [b'*', b'/', ..] => {
+                            self.col += 2;
+                            buffer.extend_from_slice(b"*/");
+                            self.next();
+                            break;
+                        }
+                        [b'\n', ..] => {
+                            self.line += 1;
+                            self.col = 0;
+                            buffer.push(b'\n');
+                        }
+                        &[t, ..] => {
+                            self.col += 1;
+                            buffer.push(t);
+                        }
+                        [] => {}
+                    }
+                    self.next();
+                }
+                self.next();
+                self.col += 1;
+                if buffer.starts_with(b"/**") {
+                    TokenKind::DocComment(buffer.into())
+                } else {
+                    TokenKind::Comment(buffer.into())
+                }
+            }
+            [b'#', b'[', ..] => {
+                self.skip(2);
+                self.col += 2;
+                TokenKind::Attribute
+            }
+            &[ch @ b'/', b'/', ..] | &[ch @ b'#', ..] => {
+                let mut buffer = if ch == b'/' {
+                    self.skip(2);
+                    self.col += 2;
+                    b"//".to_vec()
+                } else {
+                    self.next();
+                    self.col += 1;
+                    b"#".to_vec()
+                };
+                while let Some(c) = self.current {
+                    if c == b'\n' {
+                        break;
+                    }
+                    buffer.push(c);
+                    self.next();
+                }
+                self.next();
+                TokenKind::Comment(buffer.into())
+            }
+            [b'/', b'=', ..] => {
+                self.skip(2);
+                self.col += 2;
+                TokenKind::SlashEquals
+            }
+            [b'/', ..] => {
+                self.next();
+                self.col += 1;
+                TokenKind::Slash
+            }
+            [b'*', b'*', ..] => {
+                self.skip(2);
+                self.col += 2;
+                TokenKind::Pow
+            }
+            [b'*', b'=', ..] => {
+                self.skip(2);
+                self.col += 2;
+                TokenKind::AsteriskEqual
+            }
+            [b'*', ..] => {
+                self.next();
+                self.col += 1;
+                TokenKind::Asterisk
+            }
+            [b'|', b'|', ..] => {
+                self.skip(2);
+                self.col += 2;
+                TokenKind::Pipe
+            }
+            [b'|', ..] => {
+                self.next();
+                self.col += 1;
+                TokenKind::Pipe
+            }
+            [b'{', ..] => {
+                self.next();
+                self.col += 1;
+                TokenKind::LeftBrace
+            }
+            [b'}', ..] => {
+                self.next();
+                self.col += 1;
+                TokenKind::RightBrace
+            }
+            [b'(', ..] => {
+                self.next();
+                self.col += 1;
                 if self.try_read(b"string)") {
@@ -509,116 +452,112 @@ impl Lexer {
                     TokenKind::LeftParen
                 }
             }
-            b')' => {
-                self.col += 1;
-                TokenKind::RightParen
-            }
-            b';' => {
-                self.col += 1;
-                TokenKind::SemiColon
-            }
-            b'+' => {
-                self.col += 1;
-                if let Some(b'=') = self.peek {
-                    self.col += 1;
-                    self.next();
-                    TokenKind::PlusEquals
-                } else if let Some(b'+') = self.peek {
-                    self.col += 1;
-                    self.next();
-                    TokenKind::Increment
-                } else {
-                    TokenKind::Plus
-                }
-            }
-            b'-' => {
-                self.col += 1;
-                if let Some(b'>') = self.peek {
-                    self.col += 1;
-                    self.next();
-                    TokenKind::Arrow
-                } else if let Some(b'=') = self.peek {
-                    self.col += 1;
-                    self.next();
-                    TokenKind::MinusEquals
-                } else {
-                    TokenKind::Minus
-                }
-            }
-            b'<' => {
-                self.col += 1;
-                if let Some(b'=') = self.peek {
-                    self.next();
-                    self.col += 1;
-                    TokenKind::LessThanEquals
-                } else if let Some(b'<') = self.peek {
-                    self.next();
-                    if let Some(b'<') = self.peek {
-                        // TODO: Handle both heredocs and nowdocs.
-                        self.next();
-                        todo!("heredocs & nowdocs");
-                    } else {
-                        TokenKind::LeftShift
-                    }
-                } else {
-                    TokenKind::LessThan
-                }
-            }
-            b'>' => {
-                self.col += 1;
-                if let Some(b'=') = self.peek {
-                    self.next();
-                    self.col += 1;
-                    TokenKind::GreaterThanEquals
-                } else if let Some(b'>') = self.peek {
-                    self.next();
-                    self.col += 1;
-                    TokenKind::RightShift
-                } else {
-                    TokenKind::GreaterThan
-                }
-            }
-            b',' => {
-                self.col += 1;
-                TokenKind::Comma
-            }
-            b'[' => {
-                self.col += 1;
-                TokenKind::LeftBracket
-            }
-            b']' => {
-                self.col += 1;
-                TokenKind::RightBracket
-            }
-            b':' => {
-                self.col += 1;
-                if let Some(b':') = self.peek {
-                    self.col += 1;
-                    self.next();
-                    TokenKind::DoubleColon
-                } else {
-                    TokenKind::Colon
-                }
-            }
+            [b')', ..] => {
+                self.next();
+                self.col += 1;
+                TokenKind::RightParen
+            }
+            [b';', ..] => {
+                self.next();
+                self.col += 1;
+                TokenKind::SemiColon
+            }
+            [b'+', b'+', ..] => {
+                self.skip(2);
+                self.col += 2;
+                TokenKind::Increment
+            }
+            [b'+', b'=', ..] => {
+                self.skip(2);
+                self.col += 2;
+                TokenKind::PlusEquals
+            }
+            [b'+', ..] => {
+                self.next();
+                self.col += 1;
+                TokenKind::Plus
+            }
+            [b'-', b'-', ..] => {
+                self.skip(2);
+                self.col += 2;
+                TokenKind::Decrement
+            }
+            [b'-', b'>', ..] => {
+                self.skip(2);
+                self.col += 2;
+                TokenKind::Arrow
+            }
+            [b'-', b'=', ..] => {
+                self.skip(2);
+                self.col += 2;
+                TokenKind::MinusEquals
+            }
+            [b'-', ..] => {
+                self.next();
+                self.col += 1;
+                TokenKind::Minus
+            }
+            [b'<', b'<', b'<', ..] => {
+                // TODO: Handle both heredocs and nowdocs.
+                self.skip(3);
+                self.col += 3;
+                todo!("heredocs & nowdocs");
+            }
+            [b'<', b'<', ..] => {
+                self.skip(2);
+                self.col += 2;
+                TokenKind::LeftShift
+            }
+            [b'<', b'=', ..] => {
+                self.skip(2);
+                self.col += 2;
+                TokenKind::LessThanEquals
+            }
+            [b'<', ..] => {
+                self.next();
+                self.col += 1;
+                TokenKind::LessThan
+            }
+            [b'>', b'>', ..] => {
+                self.skip(2);
+                self.col += 2;
+                TokenKind::RightShift
+            }
+            [b'>', b'=', ..] => {
+                self.skip(2);
+                self.col += 2;
+                TokenKind::GreaterThanEquals
+            }
+            [b'>', ..] => {
+                self.next();
+                self.col += 1;
+                TokenKind::GreaterThan
+            }
+            [b',', ..] => {
+                self.next();
+                self.col += 1;
+                TokenKind::Comma
+            }
+            [b'[', ..] => {
+                self.next();
+                self.col += 1;
+                TokenKind::LeftBracket
+            }
+            [b']', ..] => {
+                self.next();
+                self.col += 1;
+                TokenKind::RightBracket
+            }
+            [b':', b':', ..] => {
+                self.skip(2);
+                self.col += 2;
+                TokenKind::DoubleColon
+            }
+            [b':', ..] => {
+                self.next();
+                self.col += 1;
+                TokenKind::Colon
+            }
             _ => unimplemented!(
                 "<scripting> char: {}, line: {}, col: {}",
@@ -640,7 +579,7 @@ impl Lexer {
         let mut buffer = Vec::new();
         let mut escaping = false;
-        while let Some(n) = self.peek {
+        while let Some(n) = self.current {
             if !escaping && n == b'\'' {
                 self.next();
@@ -682,7 +621,7 @@ impl Lexer {
         let mut buffer = Vec::new();
         let mut escaping = false;
-        while let Some(n) = self.peek {
+        while let Some(n) = self.current {
             if !escaping && n == b'"' {
                 self.next();
@@ -723,7 +662,7 @@ impl Lexer {
         self.col += 1;
-        while let Some(n) = self.peek {
+        while let Some(n) = self.current {
             match n {
                 b'0'..=b'9' if !buffer.is_empty() => {
                     self.col += 1;
@@ -753,7 +692,7 @@ impl Lexer {
         self.col += 1;
-        while let Some(n) = self.peek {
+        while let Some(n) = self.current {
             match n {
                 b'0'..=b'9' => {
                     underscore = false;
@@ -797,13 +736,12 @@ impl Lexer {
         self.state = state;
     }

-    fn try_read(&self, search: &'static [u8]) -> bool {
-        if self.current.is_none() || self.peek.is_none() {
-            return false;
-        }
-
-        let start = self.cursor.saturating_sub(1);
-        self.chars[start..].starts_with(search)
+    fn peek(&self) -> &[u8] {
+        &self.chars[self.cursor..]
+    }
+
+    fn try_read(&self, search: &'static [u8]) -> bool {
+        self.peek().starts_with(search)
     }

     fn skip(&mut self, count: usize) {
@@ -813,9 +751,8 @@ impl Lexer {
     }

     fn next(&mut self) {
-        self.current = self.peek;
-        self.peek = self.chars.get(self.cursor).cloned();
         self.cursor += 1;
+        self.current = self.chars.get(self.cursor).copied();
     }
 }
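After the rework, `next()` keeps `current` in sync with `chars[cursor]`, so `peek()` always starts with the byte currently under the cursor and `current` is simply the first element of `peek()`. A small stand-alone sketch of that invariant, written against a simplified stand-in rather than the real `Lexer`:

// Sketch: the cursor/current invariant the reworked next() maintains (simplified stand-in).
struct Mini {
    chars: Vec<u8>,
    cursor: usize,
    current: Option<u8>,
}

impl Mini {
    fn new(input: &[u8]) -> Self {
        Mini { chars: input.to_vec(), cursor: 0, current: input.get(0).copied() }
    }

    fn next(&mut self) {
        self.cursor += 1;
        self.current = self.chars.get(self.cursor).copied();
    }

    fn peek(&self) -> &[u8] {
        &self.chars[self.cursor..]
    }
}

fn main() {
    let mut m = Mini::new(b"<?php");
    while m.current.is_some() {
        // `current` is always the first byte of the remaining slice.
        assert_eq!(m.peek().first().copied(), m.current);
        m.next();
    }
    assert!(m.peek().is_empty());
}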