use std::{fmt, str::CharIndices}; use super::error::LispError; #[derive(Copy, Clone, Debug, Eq, PartialEq)] pub enum Token<'a> { LeftParen, RightParen, Float(&'a str), Integer(&'a str), // Char(&'a str), String(&'a str), Name(&'a str), // Keyword(&'a str), BackQuote, Comma, CommaAt, Quote, End, } impl<'a> fmt::Display for Token<'a> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Token::LeftParen => write!(f, "("), Token::RightParen => write!(f, ")"), Token::Float(_) => write!(f, "float"), Token::Integer(_) => write!(f, "integer"), // Token::Char(_) => write!(f, "char"), Token::String(_) => write!(f, "string"), Token::Name(_) => write!(f, "name"), // Token::Keyword(_) => write!(f, "keyword"), Token::BackQuote => write!(f, "`"), Token::Comma => write!(f, ","), Token::CommaAt => write!(f, ",@"), Token::Quote => write!(f, "'"), Token::End => write!(f, "EOF"), } } } #[derive(Debug, Copy, Clone, PartialEq)] pub struct Span { low: u32, high: u32, } impl Span { pub fn empty(pos: u32) -> Self { Self { low: pos, high: pos, } } } pub struct Lexer<'input> { input: &'input str, cur_pos: u32, offset: u32, } impl<'a> Lexer<'a> { pub fn new(input: &'a str, offset: u32) -> Self { Self { input, cur_pos: 0, offset, } } pub fn next_token(&mut self) -> Result<(Span, Token<'a>), LispError> { let mut chars = self.input.char_indices(); while let Some((ind, chr)) = chars.next() { let low = self.cur_pos; let res = match chr { '(' => Ok((1, Token::LeftParen)), ')' => Ok((1, Token::RightParen)), '\'' => Ok((1, Token::Quote)), '`' => Ok((1, Token::BackQuote)), ',' => match chars.next() { Some((_, '@')) => Ok((2, Token::CommaAt)), _ => Ok((1, Token::Comma)), }, '#' => parse_name(&self.input[ind..]), '-' | '0'..='9' => parse_number(&self.input[ind..]), '"' => parse_string(&self.input[ind..]), ';' => { self.cur_pos += consume_comment(ind, &mut chars) as u32; continue; } _ if is_ident(chr) => parse_name(&self.input[ind..]), ch if ch.is_whitespace() => { self.cur_pos += ch.len_utf8() as u32; continue; } _ => Err(LispError::ParseError), }; let (size, token) = match res { Ok(v) => v, Err(_) => return Err(LispError::ParseError), }; self.cur_pos += size as u32; self.input = &self.input[ind + size..]; let sp = Span { low, high: low + size as u32, }; return Ok((sp, token)); } self.input = &self.input[..0]; return Ok((Span::empty(self.cur_pos), Token::End)); } } fn parse_number<'a>(mut input: &'a str) -> Result<(usize, Token<'a>), LispError> { let mut dot = false; let mut minus = false; let mut size = 0; let mut chars = input.chars(); if let Some(v) = chars.next() { if v == '-' { minus = true; size += 1; input = &input[1..]; } else if v.is_digit(10) { size += 1; } } while let Some(chr) = chars.next() { if chr.is_digit(10) { size += 1; } else if chr == '.' { if !dot { dot = true; size += 1; } else { return Err(LispError::ParseError); } } else if !is_ident(chr) { break; } else { return Err(LispError::ParseError); } } let tok = if size == 1 && minus { Token::Name("-") } else if dot { Token::Float(&input[..size]) } else { Token::Integer(&input[..size]) }; return Ok((size, tok)); } fn parse_string<'a>(input: &'a str) -> Result<(usize, Token<'a>), LispError> { // count opening quote let mut size = 1; let mut chars = input.char_indices().skip(1); while let Some((ind, chr)) = chars.next() { match chr { '\\' => { let _ = chars.next(); } '"' => { size += ind; break; } _ => (), } } return Ok((size, Token::String(&input[..size]))); } fn is_ident(ch: char) -> bool { match ch { '!' | '$' | '%' | '&' | '*' | '+' | '-' | '.' | '/' | '<' | '=' | '>' | '?' | '^' | '_' | '|' | '#' => true, _ if ch.is_alphanumeric() => true, _ => false, } } fn consume_comment(start: usize, chars: &mut CharIndices) -> usize { let mut last = start; for (ind, ch) in chars { last = ind; if ch == '\n' { break; } } last - start + 1 } fn parse_name<'a>(input: &'a str) -> Result<(usize, Token<'a>), LispError> { for (ind, chr) in input.char_indices() { if !is_ident(chr) { return Ok((ind, Token::Name(&input[..ind]))); } } return Ok((input.len(), Token::Name(input))); } #[cfg(test)] mod tests { use super::*; fn sp(low: u32, high: u32) -> Span { Span { low, high } } fn tokens(input: &str) -> Vec<(Span, Token)> { let mut lexer = Lexer::new(input, 0); let mut tokens = Vec::new(); loop { match lexer.next_token().unwrap() { (_, Token::End) => break, t => tokens.push(t), } } tokens } #[test] fn string_parsing() { let input = r#""hello there""#; let parsed = parse_string(input).unwrap(); assert_eq!(parsed.0, 13); assert_eq!(parsed.1, Token::String(r#""hello there""#)); } #[test] fn integer_parsing() { let input = "12345"; let parsed = parse_number(input).unwrap(); assert_eq!(parsed.0, 5); assert_eq!(parsed.1, Token::Integer("12345")); } #[test] fn float_parsing() { let input = "12.345"; let parsed = parse_number(input).unwrap(); assert_eq!(parsed.0, 6); assert_eq!(parsed.1, Token::Float("12.345")); } #[test] fn lexer() { assert_eq!( tokens("1 2 3"), [ (sp(0, 1), Token::Integer("1")), (sp(2, 3), Token::Integer("2")), (sp(4, 5), Token::Integer("3")) ] ); assert_eq!( tokens("1 foo 3"), [ (sp(0, 1), Token::Integer("1")), (sp(2, 5), Token::Name("foo")), (sp(6, 7), Token::Integer("3")) ] ); assert_eq!(tokens("foo"), [(sp(0, 3), Token::Name("foo")),]); assert_eq!(tokens("#t"), [(sp(0, 2), Token::Name("#t")),]); assert_eq!( tokens("1 \"foo\" 3"), [ (sp(0, 1), Token::Integer("1")), (sp(2, 7), Token::String(r#""foo""#)), (sp(8, 9), Token::Integer("3")) ] ); assert_eq!( tokens("(* 1 (+ 2 3))"), [ (sp(0, 1), Token::LeftParen), (sp(1, 2), Token::Name("*")), (sp(3, 4), Token::Integer("1")), (sp(5, 6), Token::LeftParen), (sp(6, 7), Token::Name("+")), (sp(8, 9), Token::Integer("2")), (sp(10, 11), Token::Integer("3")), (sp(11, 12), Token::RightParen), (sp(12, 13), Token::RightParen), ] ); assert_eq!(tokens("; foo"), []); assert_eq!(tokens("1; foo"), [(sp(0, 1), Token::Integer("1"))]); assert_eq!(tokens(">="), [(sp(0, 2), Token::Name(">="))]); assert_eq!(tokens("&&"), [(sp(0, 2), Token::Name("&&"))]); } }