From ad24976da38482948c586bdbc16004273662ff7e Mon Sep 17 00:00:00 2001 From: Veetaha Date: Fri, 24 Jan 2020 03:39:23 +0200 Subject: ra_syntax: changed added diagnostics information returned from tokenize() (implemented with iterators) --- crates/ra_syntax/src/parsing/lexer.rs | 299 ++++++++++++++++++++++++++-------- 1 file changed, 229 insertions(+), 70 deletions(-) (limited to 'crates/ra_syntax/src/parsing/lexer.rs') diff --git a/crates/ra_syntax/src/parsing/lexer.rs b/crates/ra_syntax/src/parsing/lexer.rs index 6d839208d..9dca7d747 100644 --- a/crates/ra_syntax/src/parsing/lexer.rs +++ b/crates/ra_syntax/src/parsing/lexer.rs @@ -1,4 +1,6 @@ -//! FIXME: write short doc here +//! Lexer analyzes raw input string and produces lexemes (tokens). + +use std::iter::{FromIterator, IntoIterator}; use crate::{ SyntaxKind::{self, *}, @@ -13,85 +15,242 @@ pub struct Token { /// The length of the token. pub len: TextUnit, } +impl Token { + pub const fn new(kind: SyntaxKind, len: TextUnit) -> Self { + Self { kind, len } + } +} -fn match_literal_kind(kind: rustc_lexer::LiteralKind) -> SyntaxKind { - match kind { - rustc_lexer::LiteralKind::Int { .. } => INT_NUMBER, - rustc_lexer::LiteralKind::Float { .. } => FLOAT_NUMBER, - rustc_lexer::LiteralKind::Char { .. } => CHAR, - rustc_lexer::LiteralKind::Byte { .. } => BYTE, - rustc_lexer::LiteralKind::Str { .. } => STRING, - rustc_lexer::LiteralKind::ByteStr { .. } => BYTE_STRING, - rustc_lexer::LiteralKind::RawStr { .. } => RAW_STRING, - rustc_lexer::LiteralKind::RawByteStr { .. } => RAW_BYTE_STRING, +#[derive(Debug)] +/// Represents the result of parsing one token. +pub struct ParsedToken { + /// Parsed token. + pub token: Token, + /// If error is present then parsed token is malformed. + pub error: Option<TokenizeError>, +} +impl ParsedToken { + pub const fn new(token: Token, error: Option<TokenizeError>) -> Self { + Self { token, error } } } +#[derive(Debug, Default)] +/// Represents the result of parsing a sequence of tokens. 
+pub struct ParsedTokens { + /// Parsed tokens. + pub tokens: Vec<Token>, + /// Errors encountered while parsing; each one marks a malformed token. + pub errors: Vec<TokenizeError>, +} + +impl FromIterator<ParsedToken> for ParsedTokens { + fn from_iter<I: IntoIterator<Item = ParsedToken>>(iter: I) -> Self { + let res = Self::default(); + for entry in iter { + res.tokens.push(entry.token); + if let Some(error) = entry.error { + res.errors.push(error); + } + } + res + } +} + +/// Returns the first encountered token from the string. +/// If the string contains zero or two or more tokens returns `None`. +pub fn single_token(text: &str) -> Option<ParsedToken> { + // TODO: test whether this condition indeed checks for a single token + first_token(text).filter(|parsed| parsed.token.len.to_usize() == text.len()) +} + +/* +/// Returns `ParsedTokens` which are basically a pair `(Vec<Token>, Vec<TokenizeError>)` +/// This is just a shorthand for `tokenize(text).collect()` +pub fn tokenize_to_vec_with_errors(text: &str) -> ParsedTokens { + tokenize(text).collect() +} + +/// The simplest version of tokenize, it just returns a ready-made `Vec<Token>`. +/// It discards all tokenization errors while parsing. If you need that information +/// consider using `tokenize()` or `tokenize_to_vec_with_errors()`. +pub fn tokenize_to_vec(text: &str) -> Vec<Token> { + tokenize(text).map(|parsed_token| parsed_token.token).collect() +} +*/ + /// Break a string up into its component tokens -pub fn tokenize(text: &str) -> Vec<Token> { - if text.is_empty() { - return vec![]; +/// This is the core function, all other `tokenize*()` functions are simply +/// handy shortcuts for this one. 
+pub fn tokenize(text: &str) -> impl Iterator<Item = ParsedToken> + '_ { + let shebang = rustc_lexer::strip_shebang(text).map(|shebang_len| { + text = &text[shebang_len..]; + ParsedToken::new(Token::new(SHEBANG, TextUnit::from_usize(shebang_len)), None) + }); + + // Notice that we eagerly evaluate shebang since it may change text slice + // and we cannot simplify this into a single method call chain + shebang.into_iter().chain(tokenize_without_shebang(text)) +} + +pub fn tokenize_without_shebang(text: &str) -> impl Iterator<Item = ParsedToken> + '_ { + rustc_lexer::tokenize(text).map(|rustc_token| { + let token_text = &text[..rustc_token.len]; + text = &text[rustc_token.len..]; + rustc_token_kind_to_parsed_token(&rustc_token.kind, token_text) + }) +} + +#[derive(Debug)] +pub enum TokenizeError { + /// Base prefix was provided, but there were no digits + /// after it, e.g. `0x`. + EmptyInt, + /// Float exponent lacks digits e.g. `e+`, `E+`, `e-`, `E-`, + EmptyExponent, + + /// Block comment lacks trailing delimiter `*/` + UnterminatedBlockComment, + /// Character literal lacks trailing delimiter `'` + UnterminatedChar, + /// Characterish byte literal lacks trailing delimiter `'` + UnterminatedByte, + /// String literal lacks trailing delimiter `"` + UnterminatedString, + /// Byte string literal lacks trailing delimiter `"` + UnterminatedByteString, + /// Raw string literal lacks trailing delimiter e.g. `"##` + UnterminatedRawString, + /// Raw byte string literal lacks trailing delimiter e.g. `"##` + UnterminatedRawByteString, + + /// Raw string lacks a quote after pound characters e.g. `r###` + UnstartedRawString, + /// Raw byte string lacks a quote after pound characters e.g. `br###` + UnstartedRawByteString, + + /// Lifetime starts with a number e.g. 
`'4ever` + LifetimeStartsWithNumber, +} + +fn rustc_token_kind_to_parsed_token( + rustc_token_kind: &rustc_lexer::TokenKind, + token_text: &str, +) -> ParsedToken { + use rustc_lexer::TokenKind as TK; + use TokenizeError as TE; + + // We drop some useful information here (see patterns with double dots `..`) + // Storing that info in `SyntaxKind` is not possible due to its layout requirements of + // being `u16` that come from `rowan::SyntaxKind` type and changes to `rowan::SyntaxKind` + // would mean a hell of a rewrite. + + let (syntax_kind, error) = match *rustc_token_kind { + TK::LineComment => ok(COMMENT), + TK::BlockComment { terminated } => ok_if(terminated, COMMENT, TE::UnterminatedBlockComment), + TK::Whitespace => ok(WHITESPACE), + TK::Ident => ok(if token_text == "_" { + UNDERSCORE + } else { + SyntaxKind::from_keyword(token_text).unwrap_or(IDENT) + }), + TK::RawIdent => ok(IDENT), + TK::Literal { kind, .. } => match_literal_kind(&kind), + TK::Lifetime { starts_with_number } => { + ok_if(!starts_with_number, LIFETIME, TE::LifetimeStartsWithNumber) + } + TK::Semi => ok(SEMI), + TK::Comma => ok(COMMA), + TK::Dot => ok(DOT), + TK::OpenParen => ok(L_PAREN), + TK::CloseParen => ok(R_PAREN), + TK::OpenBrace => ok(L_CURLY), + TK::CloseBrace => ok(R_CURLY), + TK::OpenBracket => ok(L_BRACK), + TK::CloseBracket => ok(R_BRACK), + TK::At => ok(AT), + TK::Pound => ok(POUND), + TK::Tilde => ok(TILDE), + TK::Question => ok(QUESTION), + TK::Colon => ok(COLON), + TK::Dollar => ok(DOLLAR), + TK::Eq => ok(EQ), + TK::Not => ok(EXCL), + TK::Lt => ok(L_ANGLE), + TK::Gt => ok(R_ANGLE), + TK::Minus => ok(MINUS), + TK::And => ok(AMP), + TK::Or => ok(PIPE), + TK::Plus => ok(PLUS), + TK::Star => ok(STAR), + TK::Slash => ok(SLASH), + TK::Caret => ok(CARET), + TK::Percent => ok(PERCENT), + TK::Unknown => ok(ERROR), + }; + + return ParsedToken::new( + Token::new(syntax_kind, TextUnit::from_usize(token_text.len())), + error, + ); + + type ParsedSyntaxKind = (SyntaxKind, Option<TokenizeError>); + + 
const fn ok(syntax_kind: SyntaxKind) -> ParsedSyntaxKind { + (syntax_kind, None) } - let mut text = text; - let mut acc = Vec::new(); - if let Some(len) = rustc_lexer::strip_shebang(text) { - acc.push(Token { kind: SHEBANG, len: TextUnit::from_usize(len) }); - text = &text[len..]; + const fn ok_if(cond: bool, syntax_kind: SyntaxKind, error: TokenizeError) -> ParsedSyntaxKind { + if cond { + ok(syntax_kind) + } else { + err(syntax_kind, error) + } } - while !text.is_empty() { - let rustc_token = rustc_lexer::first_token(text); - let kind = match rustc_token.kind { - rustc_lexer::TokenKind::LineComment => COMMENT, - rustc_lexer::TokenKind::BlockComment { .. } => COMMENT, - rustc_lexer::TokenKind::Whitespace => WHITESPACE, - rustc_lexer::TokenKind::Ident => { - let token_text = &text[..rustc_token.len]; - if token_text == "_" { - UNDERSCORE - } else { - SyntaxKind::from_keyword(&text[..rustc_token.len]).unwrap_or(IDENT) - } + const fn err(syntax_kind: SyntaxKind, error: TokenizeError) -> ParsedSyntaxKind { + (syntax_kind, Some(error)) + } + + const fn match_literal_kind(kind: &rustc_lexer::LiteralKind) -> ParsedSyntaxKind { + use rustc_lexer::LiteralKind as LK; + match *kind { + LK::Int { empty_int, .. } => ok_if(!empty_int, INT_NUMBER, TE::EmptyInt), + LK::Float { empty_exponent, .. } => { + ok_if(!empty_exponent, FLOAT_NUMBER, TE::EmptyExponent) } - rustc_lexer::TokenKind::RawIdent => IDENT, - rustc_lexer::TokenKind::Literal { kind, .. } => match_literal_kind(kind), - rustc_lexer::TokenKind::Lifetime { .. 
} => LIFETIME, - rustc_lexer::TokenKind::Semi => SEMI, - rustc_lexer::TokenKind::Comma => COMMA, - rustc_lexer::TokenKind::Dot => DOT, - rustc_lexer::TokenKind::OpenParen => L_PAREN, - rustc_lexer::TokenKind::CloseParen => R_PAREN, - rustc_lexer::TokenKind::OpenBrace => L_CURLY, - rustc_lexer::TokenKind::CloseBrace => R_CURLY, - rustc_lexer::TokenKind::OpenBracket => L_BRACK, - rustc_lexer::TokenKind::CloseBracket => R_BRACK, - rustc_lexer::TokenKind::At => AT, - rustc_lexer::TokenKind::Pound => POUND, - rustc_lexer::TokenKind::Tilde => TILDE, - rustc_lexer::TokenKind::Question => QUESTION, - rustc_lexer::TokenKind::Colon => COLON, - rustc_lexer::TokenKind::Dollar => DOLLAR, - rustc_lexer::TokenKind::Eq => EQ, - rustc_lexer::TokenKind::Not => EXCL, - rustc_lexer::TokenKind::Lt => L_ANGLE, - rustc_lexer::TokenKind::Gt => R_ANGLE, - rustc_lexer::TokenKind::Minus => MINUS, - rustc_lexer::TokenKind::And => AMP, - rustc_lexer::TokenKind::Or => PIPE, - rustc_lexer::TokenKind::Plus => PLUS, - rustc_lexer::TokenKind::Star => STAR, - rustc_lexer::TokenKind::Slash => SLASH, - rustc_lexer::TokenKind::Caret => CARET, - rustc_lexer::TokenKind::Percent => PERCENT, - rustc_lexer::TokenKind::Unknown => ERROR, - }; - let token = Token { kind, len: TextUnit::from_usize(rustc_token.len) }; - acc.push(token); - text = &text[rustc_token.len..]; + LK::Char { terminated } => ok_if(terminated, CHAR, TE::UnterminatedChar), + LK::Byte { terminated } => ok_if(terminated, BYTE, TE::UnterminatedByte), + LK::Str { terminated } => ok_if(terminated, STRING, TE::UnterminatedString), + LK::ByteStr { terminated } => { + ok_if(terminated, BYTE_STRING, TE::UnterminatedByteString) + } + + LK::RawStr { started: true, terminated, .. } => { + ok_if(terminated, RAW_STRING, TE::UnterminatedRawString) + } + LK::RawStr { started: false, .. } => err(RAW_STRING, TE::UnstartedRawString), + + LK::RawByteStr { started: true, terminated, .. 
} => { + ok_if(terminated, RAW_BYTE_STRING, TE::UnterminatedRawByteString) + } + LK::RawByteStr { started: false, .. } => { + err(RAW_BYTE_STRING, TE::UnstartedRawByteString) + } + } + } +} + +pub fn first_token(text: &str) -> Option<ParsedToken> { + // Checking for emptiness because of `rustc_lexer::first_token()` invariant (see its body) + if text.is_empty() { + None + } else { + let rustc_token = rustc_lexer::first_token(text); + Some(rustc_token_kind_to_parsed_token(&rustc_token.kind, &text[..rustc_token.len])) } - acc } -pub fn classify_literal(text: &str) -> Option<Token> { +// TODO: think what to do with this ad hoc function +pub fn classify_literal(text: &str) -> Option<ParsedToken> { let t = rustc_lexer::first_token(text); if t.len != text.len() { return None; @@ -100,5 +259,5 @@ pub fn classify_literal(text: &str) -> Option<Token> { rustc_lexer::TokenKind::Literal { kind, .. } => match_literal_kind(kind), _ => return None, }; - Some(Token { kind, len: TextUnit::from_usize(t.len) }) + Some(ParsedToken::new(Token::new(kind, TextUnit::from_usize(t.len)))) } -- cgit v1.2.3