From ae79d5a8b0166d43d3ff48aa593db6038b40410b Mon Sep 17 00:00:00 2001 From: Akshay Date: Fri, 19 Mar 2021 21:57:34 +0530 Subject: begin work on scripting lisp --- src/lisp/lex.rs | 303 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 303 insertions(+) create mode 100644 src/lisp/lex.rs (limited to 'src/lisp/lex.rs') diff --git a/src/lisp/lex.rs b/src/lisp/lex.rs new file mode 100644 index 0000000..a1bea5f --- /dev/null +++ b/src/lisp/lex.rs @@ -0,0 +1,303 @@ +use std::{fmt, str::CharIndices}; + +use super::error::LispError; + +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum Token<'a> { + LeftParen, + RightParen, + Comment(&'a str), + Float(&'a str), + Integer(&'a str), + Char(&'a str), + String(&'a str), + Name(&'a str), + Keyword(&'a str), + BackQuote, + Comma, + CommaAt, + Quote, + End, +} + +impl<'a> fmt::Display for Token<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Token::LeftParen => write!(f, "("), + Token::RightParen => write!(f, ")"), + Token::Comment(_) => write!(f, "comment"), + Token::Float(_) => write!(f, "float"), + Token::Integer(_) => write!(f, "integer"), + Token::Char(_) => write!(f, "char"), + Token::String(_) => write!(f, "string"), + Token::Name(_) => write!(f, "name"), + Token::Keyword(_) => write!(f, "keyword"), + Token::BackQuote => write!(f, "`"), + Token::Comma => write!(f, ","), + Token::CommaAt => write!(f, ",@"), + Token::Quote => write!(f, "'"), + Token::End => write!(f, "EOF"), + } + } +} + +#[derive(Debug, Copy, Clone, PartialEq)] +pub struct Span { + low: u32, + high: u32, +} + +impl Span { + fn empty(pos: u32) -> Self { + Self { + low: pos, + high: pos, + } + } +} + +pub struct Lexer<'input> { + input: &'input str, + cur_pos: u32, + offset: u32, +} + +impl<'a> Lexer<'a> { + pub fn new(input: &'a str, offset: u32) -> Self { + Self { + input, + cur_pos: 0, + offset, + } + } + pub fn next_token(&mut self) -> Result<(Span, Token<'a>), LispError> { + let mut chars = self.input.char_indices(); + + while let Some((ind, chr)) = chars.next() { + let low = self.cur_pos; + let res = match chr { + '(' => Ok((1, Token::LeftParen)), + ')' => Ok((1, Token::RightParen)), + '\'' => Ok((1, Token::Quote)), + '`' => Ok((1, Token::BackQuote)), + ',' => match chars.next() { + Some((_, '@')) => Ok((2, Token::CommaAt)), + _ => Ok((1, Token::Comma)), + }, + '#' => parse_name(&self.input[ind..]), + '-' | '0'..='9' => parse_number(&self.input[ind..]), + '"' => parse_string(&self.input[ind..]), + ';' => { + self.cur_pos += consume_comment(ind, &mut chars) as u32; + continue; + } + _ if is_ident(chr) => parse_name(&self.input[ind..]), + ch if ch.is_whitespace() => { + self.cur_pos += ch.len_utf8() as u32; + continue; + } + ch => { + eprintln!("some unexpected character: {}", ch); + Err(LispError::ParseError) + } + }; + let (size, token) = match res { + Ok(v) => v, + Err(_) => return Err(LispError::ParseError), + }; + self.cur_pos += size as u32; + self.input = &self.input[ind + size..]; + let sp = Span { + low, + high: low + size as u32, + }; + return Ok((sp, token)); + } + self.input = &self.input[..0]; + return Ok((Span::empty(self.cur_pos), Token::End)); + } +} + +fn parse_number<'a>(mut input: &'a str) -> Result<(usize, Token<'a>), LispError> { + let mut dot = false; + let mut size = 0; + let mut chars = input.chars(); + + if let Some(v) = chars.next() { + if v == '-' { + size += 1; + input = &input[1..]; + } else if v.is_digit(10) { + size += 1; + } + } + + while let Some(chr) = chars.next() { + if chr.is_digit(10) { + size += 1; + } else if chr == '.' { + if !dot { + dot = true; + size += 1; + } else { + return Err(LispError::ParseError); + } + } else if !is_ident(chr) { + break; + } else { + return Err(LispError::ParseError); + } + } + let tok = if dot { + Token::Float(&input[..size]) + } else { + Token::Integer(&input[..size]) + }; + return Ok((size, tok)); +} + +fn parse_string<'a>(input: &'a str) -> Result<(usize, Token<'a>), LispError> { + // count opening quote + let mut size = 1; + let mut chars = input.char_indices().skip(1); + while let Some((ind, chr)) = chars.next() { + match chr { + '\\' => { + let _ = chars.next(); + } + '"' => { + size += ind; + break; + } + _ => (), + } + } + return Ok((size, Token::String(&input[..size]))); +} + +fn is_ident(ch: char) -> bool { + match ch { + '!' | '$' | '%' | '&' | '*' | '+' | '-' | '.' | '/' | '<' | '=' | '>' | '?' | '^' | '_' + | '|' | '#' => true, + _ if ch.is_alphanumeric() => true, + _ => false, + } +} + +fn consume_comment(start: usize, chars: &mut CharIndices) -> usize { + let mut last = start; + + for (ind, ch) in chars { + last = ind; + if ch == '\n' { + break; + } + } + + last - start + 1 +} + +fn parse_name<'a>(input: &'a str) -> Result<(usize, Token<'a>), LispError> { + for (ind, chr) in input.char_indices() { + if !is_ident(chr) { + return Ok((ind, Token::Name(&input[..ind]))); + } + } + return Ok((input.len(), Token::Name(input))); +} + +#[cfg(test)] +mod tests { + use super::*; + + fn sp(low: u32, high: u32) -> Span { + Span { low, high } + } + + fn tokens(input: &str) -> Vec<(Span, Token)> { + let mut lexer = Lexer::new(input, 0); + let mut tokens = Vec::new(); + loop { + match lexer.next_token().unwrap() { + (_, Token::End) => break, + t => tokens.push(t), + } + } + tokens + } + + #[test] + fn string_parsing() { + let input = r#""hello there""#; + let parsed = parse_string(input).unwrap(); + assert_eq!(parsed.0, 13); + assert_eq!(parsed.1, Token::String(r#""hello there""#)); + } + + #[test] + fn integer_parsing() { + let input = "12345"; + let parsed = parse_number(input).unwrap(); + assert_eq!(parsed.0, 5); + assert_eq!(parsed.1, Token::Integer("12345")); + } + + #[test] + fn float_parsing() { + let input = "12.345"; + let parsed = parse_number(input).unwrap(); + assert_eq!(parsed.0, 6); + assert_eq!(parsed.1, Token::Float("12.345")); + } + + #[test] + fn lexer() { + assert_eq!( + tokens("1 2 3"), + [ + (sp(0, 1), Token::Integer("1")), + (sp(2, 3), Token::Integer("2")), + (sp(4, 5), Token::Integer("3")) + ] + ); + + assert_eq!( + tokens("1 foo 3"), + [ + (sp(0, 1), Token::Integer("1")), + (sp(2, 5), Token::Name("foo")), + (sp(6, 7), Token::Integer("3")) + ] + ); + + assert_eq!(tokens("foo"), [(sp(0, 3), Token::Name("foo")),]); + assert_eq!(tokens("#t"), [(sp(0, 2), Token::Name("#t")),]); + + assert_eq!( + tokens("1 \"foo\" 3"), + [ + (sp(0, 1), Token::Integer("1")), + (sp(2, 7), Token::String(r#""foo""#)), + (sp(8, 9), Token::Integer("3")) + ] + ); + + assert_eq!( + tokens("(* 1 (+ 2 3))"), + [ + (sp(0, 1), Token::LeftParen), + (sp(1, 2), Token::Name("*")), + (sp(3, 4), Token::Integer("1")), + (sp(5, 6), Token::LeftParen), + (sp(6, 7), Token::Name("+")), + (sp(8, 9), Token::Integer("2")), + (sp(10, 11), Token::Integer("3")), + (sp(11, 12), Token::RightParen), + (sp(12, 13), Token::RightParen), + ] + ); + + assert_eq!(tokens("; foo"), []); + assert_eq!(tokens("1; foo"), [(sp(0, 1), Token::Integer("1"))]); + } +} -- cgit v1.2.3