use std::{fmt, str::CharIndices};

use crate::lisp::error::{ParseError, ParseErrorKind};

#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum Token<'a> {
    LeftParen,
    RightParen,
    Float(&'a str),
    Integer(&'a str),
    Char(&'a str),
    String(&'a str),
    Name(&'a str),
    // Keyword(&'a str),
    BackQuote,
    Comma,
    CommaAt,
    Quote,
    End,
}

impl<'a> Token<'a> {
    /// A short, human-readable label for the token kind, used in diagnostics.
    pub fn name(&self) -> &'static str {
        match self {
            Token::LeftParen => "(",
            Token::RightParen => ")",
            Token::Float(_) => "float",
            Token::Integer(_) => "integer",
            Token::Char(_) => "char",
            Token::String(_) => "string",
            Token::Name(_) => "name",
            // Token::Keyword(_) => "keyword",
            Token::BackQuote => "`",
            Token::Comma => ",",
            Token::CommaAt => ",@",
            Token::Quote => "'",
            Token::End => "EOF",
        }
    }
}

impl<'a> fmt::Display for Token<'a> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Display and `name()` produce the same labels, so reuse `name()`.
        f.write_str(self.name())
    }
}

/// A byte range into the source text.
#[derive(Debug, Copy, Clone, PartialEq)]
pub struct Span {
    low: u32,
    high: u32,
}

impl Span {
    pub fn empty(pos: u32) -> Self {
        Self {
            low: pos,
            high: pos,
        }
    }
}

#[derive(Debug, Clone)]
pub struct SpanDisplay<'src, 'file> {
    pub file_name: &'file str,
    pub source: &'src str,
    pub line: usize,
    pub col: usize,
}

impl<'src, 'file> SpanDisplay<'src, 'file> {
    /// Resolves `span` to a 1-based line and 0-based column within `source`.
    pub fn highlight_span(span: Span, source: &'src str, file_name: &'file str) -> Self {
        let line_start = match source[..span.low as usize].rfind('\n') {
            Some(pos) => pos + 1,
            None => 0,
        };
        let line = source[..line_start].chars().filter(|&c| c == '\n').count() + 1;
        let col = source[line_start..span.low as usize].chars().count();
        Self {
            file_name,
            source,
            line,
            col,
        }
    }
}

pub struct Lexer<'input> {
    input: &'input str,
    cur_pos: u32,
}

impl<'a> Lexer<'a> {
    pub fn new(input: &'a str) -> Self {
        Self { input, cur_pos: 0 }
    }

    /// Returns the next token and its span, or `Token::End` once the input is
    /// exhausted.
    pub fn next_token(&mut self) -> Result<(Span, Token<'a>), ParseError> {
        let mut chars = self.input.char_indices();
        while let Some((ind, chr)) = chars.next() {
            let low = self.cur_pos;
            let res = match chr {
                '(' | '[' => Ok((1, Token::LeftParen)),
                ')' | ']' => Ok((1, Token::RightParen)),
                '\'' => Ok((1, Token::Quote)),
                '`' => Ok((1, Token::BackQuote)),
                ',' => match chars.next() {
                    Some((_, '@')) => Ok((2, Token::CommaAt)),
                    _ => Ok((1, Token::Comma)),
                },
                '#' => match chars.next() {
                    Some((_, '\\')) => parse_char(&self.input[ind..]),
                    _ => parse_name(&self.input[ind..]),
                },
                '-' | '0'..='9' => parse_number(&self.input[ind..]),
                '"' => parse_string(&self.input[ind..]),
                ';' => {
                    self.cur_pos += consume_comment(ind, &mut chars) as u32;
                    continue;
                }
                _ if is_ident(chr) => parse_name(&self.input[ind..]),
                ch if ch.is_whitespace() => {
                    self.cur_pos += ch.len_utf8() as u32;
                    continue;
                }
                ch => Err(ParseErrorKind::InvalidChar(ch)),
            };
            let (size, token) = match res {
                Ok(v) => v,
                Err(kind) => {
                    return Err(ParseError {
                        span: Span {
                            low,
                            high: low + chr.len_utf8() as u32,
                        },
                        kind,
                    })
                }
            };
            self.cur_pos += size as u32;
            self.input = &self.input[ind + size..];
            let sp = Span {
                low,
                high: low + size as u32,
            };
            return Ok((sp, token));
        }
        self.input = &self.input[..0];
        Ok((Span::empty(self.cur_pos), Token::End))
    }
}

fn parse_number(input: &str) -> Result<(usize, Token<'_>), ParseErrorKind> {
    let mut dot = false;
    let mut minus = false;
    let mut size = 0;
    let mut chars = input.chars();
    if let Some(v) = chars.next() {
        if v == '-' {
            minus = true;
            size += 1;
        } else if v.is_digit(10) {
            size += 1;
        }
    }
    for chr in chars {
        if chr.is_digit(10) {
            size += 1;
        } else if chr == '.' {
            if !dot {
                dot = true;
                size += 1;
            } else {
                // A second '.' in the same literal is an error.
                return Err(ParseErrorKind::InvalidChar(chr));
            }
        } else if !is_ident(chr) {
            // Whitespace or a delimiter ends the literal.
            break;
        } else {
            return Err(ParseErrorKind::InvalidChar(chr));
        }
    }
    let tok = if size == 1 && minus {
        // A lone '-' is the subtraction name, not a number.
        Token::Name("-")
    } else if dot {
        Token::Float(&input[..size])
    } else {
        Token::Integer(&input[..size])
    };
    Ok((size, tok))
}

fn parse_string(input: &str) -> Result<(usize, Token<'_>), ParseErrorKind> {
    // Start at 1 to account for the opening quote.
    let mut size = 1;
    let mut closed = false;
    let mut chars = input.char_indices().skip(1);
    while let Some((ind, chr)) = chars.next() {
        match chr {
            '\\' => {
                // Skip the escaped character so an escaped quote does not
                // terminate the string.
                let _ = chars.next();
            }
            '"' => {
                size += ind;
                closed = true;
                break;
            }
            _ => (),
        }
    }
    if !closed {
        Err(ParseErrorKind::UnterminatedString)
    } else {
        Ok((size, Token::String(&input[..size])))
    }
}

fn is_ident(ch: char) -> bool {
    match ch {
        // "!$%&*+-./<=>?^_|#"
        '!' | '$' | '%' | '&' | '*' | '+' | '-' | '.' | '/' | '<' | '=' | '>' | '?' | '^'
        | '_' | '|' | '#' => true,
        _ if ch.is_alphanumeric() => true,
        _ => false,
    }
}

/// Consumes a `;` comment up to and including the terminating newline (or the
/// end of input) and returns the number of bytes consumed, starting at `start`.
fn consume_comment(start: usize, chars: &mut CharIndices<'_>) -> usize {
    let mut len = 1;
    for (ind, ch) in chars {
        len = ind - start + ch.len_utf8();
        if ch == '\n' {
            break;
        }
    }
    len
}

fn parse_name(input: &str) -> Result<(usize, Token<'_>), ParseErrorKind> {
    for (ind, chr) in input.char_indices() {
        if !is_ident(chr) {
            return Ok((ind, Token::Name(&input[..ind])));
        }
    }
    Ok((input.len(), Token::Name(input)))
}

/// Parses a character literal of the form `#\x`; the first two bytes of
/// `input` are known to be `#` and `\`.
fn parse_char(input: &str) -> Result<(usize, Token<'_>), ParseErrorKind> {
    match input[2..].chars().next() {
        Some(ch) => {
            let size = 2 + ch.len_utf8();
            Ok((size, Token::Char(&input[..size])))
        }
        // `#\` at the end of the input names no character.
        None => Err(ParseErrorKind::InvalidChar('\\')),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn sp(low: u32, high: u32) -> Span {
        Span { low, high }
    }

    fn tokens(input: &str) -> Vec<(Span, Token)> {
        let mut lexer = Lexer::new(input);
        let mut tokens = Vec::new();
        loop {
            match lexer.next_token().unwrap() {
                (_, Token::End) => break,
                t => tokens.push(t),
            }
        }
        tokens
    }

    #[test]
    fn string_parsing() {
        let input = r#""hello there""#;
        let parsed = parse_string(input).unwrap();
        assert_eq!(parsed.0, 13);
        assert_eq!(parsed.1, Token::String(r#""hello there""#));
    }

    #[test]
    fn char_parsing() {
        let input = r##"#\a"##;
        let parsed = parse_char(input).unwrap();
        assert_eq!(parsed.0, 3);
        assert_eq!(parsed.1, Token::Char(r##"#\a"##));
    }

    #[test]
    fn integer_parsing() {
        let input = "12345";
        let parsed = parse_number(input).unwrap();
        assert_eq!(parsed.0, 5);
        assert_eq!(parsed.1, Token::Integer("12345"));
    }

    #[test]
    fn float_parsing() {
        let input = "12.345";
        let parsed = parse_number(input).unwrap();
        assert_eq!(parsed.0, 6);
        assert_eq!(parsed.1, Token::Float("12.345"));
    }

    #[test]
    fn lexer() {
        assert_eq!(
            tokens("1 2 3"),
            [
                (sp(0, 1), Token::Integer("1")),
                (sp(2, 3), Token::Integer("2")),
                (sp(4, 5), Token::Integer("3"))
            ]
        );
        assert_eq!(
            tokens("1 foo 3"),
            [
                (sp(0, 1), Token::Integer("1")),
                (sp(2, 5), Token::Name("foo")),
                (sp(6, 7), Token::Integer("3"))
            ]
        );
        assert_eq!(tokens("foo"), [(sp(0, 3), Token::Name("foo"))]);
        assert_eq!(tokens("#t"), [(sp(0, 2), Token::Name("#t"))]);
        assert_eq!(
            tokens("1 \"foo\" 3"),
            [
                (sp(0, 1), Token::Integer("1")),
                (sp(2, 7), Token::String(r#""foo""#)),
                (sp(8, 9), Token::Integer("3"))
            ]
        );
        assert_eq!(
            tokens("(* 1 (+ 2 3))"),
            [
                (sp(0, 1), Token::LeftParen),
                (sp(1, 2), Token::Name("*")),
                (sp(3, 4), Token::Integer("1")),
                (sp(5, 6), Token::LeftParen),
                (sp(6, 7), Token::Name("+")),
                (sp(8, 9), Token::Integer("2")),
                (sp(10, 11), Token::Integer("3")),
                (sp(11, 12), Token::RightParen),
                (sp(12, 13), Token::RightParen),
            ]
        );
        assert_eq!(tokens("; foo"), []);
        assert_eq!(tokens("1; foo"), [(sp(0, 1), Token::Integer("1"))]);
        assert_eq!(tokens(">="), [(sp(0, 2), Token::Name(">="))]);
        assert_eq!(tokens("&&"), [(sp(0, 2), Token::Name("&&"))]);
    }
}