From 75761c0e47d8c20a490a3d61ea64d2413d3c3570 Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Mon, 22 Jul 2019 17:47:33 +0300 Subject: add rustc_lexer --- Cargo.lock | 10 ++ crates/ra_syntax/Cargo.toml | 1 + crates/ra_syntax/src/parsing/lexer.rs | 108 ++++++++++++++++++++- crates/ra_syntax/tests/data/lexer/0004_numbers.txt | 12 +-- .../tests/data/lexer/0014_unclosed_char.txt | 2 +- .../data/parser/err/0002_duplicate_shebang.txt | 38 +++++++- .../tests/data/parser/err/0030_string_suffixes.rs | 6 -- .../tests/data/parser/err/0030_string_suffixes.txt | 70 ------------- .../tests/data/parser/ok/0030_string_suffixes.rs | 6 ++ .../tests/data/parser/ok/0030_string_suffixes.txt | 63 ++++++++++++ 10 files changed, 225 insertions(+), 91 deletions(-) delete mode 100644 crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.rs delete mode 100644 crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.txt create mode 100644 crates/ra_syntax/tests/data/parser/ok/0030_string_suffixes.rs create mode 100644 crates/ra_syntax/tests/data/parser/ok/0030_string_suffixes.txt diff --git a/Cargo.lock b/Cargo.lock index 8feaf27ec..d5474d6e2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1308,12 +1308,21 @@ dependencies = [ "serde_json 1.0.40 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "ra_rustc_lexer" +version = "0.1.0-pre.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "ra_syntax" version = "0.1.0" dependencies = [ "itertools 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", "ra_parser 0.1.0", + "ra_rustc_lexer 0.1.0-pre.1 (registry+https://github.com/rust-lang/crates.io-index)", "ra_text_edit 0.1.0", "rowan 0.6.0-pre.1 (registry+https://github.com/rust-lang/crates.io-index)", "smol_str 0.1.12 (registry+https://github.com/rust-lang/crates.io-index)", @@ -2250,6 +2259,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum proptest 0.9.4 (registry+https://github.com/rust-lang/crates.io-index)" = "cf147e022eacf0c8a054ab864914a7602618adba841d800a9a9868a5237a529f" "checksum quick-error 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9274b940887ce9addde99c4eee6b5c44cc494b182b97e73dc8ffdcb3397fd3f0" "checksum quote 0.6.13 (registry+https://github.com/rust-lang/crates.io-index)" = "6ce23b6b870e8f94f81fb0a363d65d86675884b34a09043c81e5562f11c1f8e1" +"checksum ra_rustc_lexer 0.1.0-pre.1 (registry+https://github.com/rust-lang/crates.io-index)" = "e8d92772f822978a6c9c4657aa61af439e4e635180628b3354049b283b749f1e" "checksum ra_vfs 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "fb7cd4e302032c5ab514f1c01c89727cd96fd950dd36f9ebee9252df45d9fb1a" "checksum rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "6d71dacdc3c88c1fde3885a3be3fbab9f35724e6ce99467f7d9c5026132184ca" "checksum rand 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d47eab0e83d9693d40f825f86948aa16eff6750ead4bdffc4ab95b8b3a7f052c" diff --git a/crates/ra_syntax/Cargo.toml b/crates/ra_syntax/Cargo.toml index 97b6b047f..9ef8dee5d 100644 --- a/crates/ra_syntax/Cargo.toml +++ b/crates/ra_syntax/Cargo.toml @@ -11,6 +11,7 @@ repository = "https://github.com/rust-analyzer/rust-analyzer" unicode-xid = "0.1.0" itertools = "0.8.0" rowan = "0.6.0-pre.1" +ra_rustc_lexer = { version = "0.1.0-pre.1", features = [ "unicode-xid" ] } # ideally, `serde` should be enabled by `ra_lsp_server`, but we enable it here # to reduce number of compilations diff --git a/crates/ra_syntax/src/parsing/lexer.rs b/crates/ra_syntax/src/parsing/lexer.rs index 60cf37047..1c818fdf4 100644 --- a/crates/ra_syntax/src/parsing/lexer.rs +++ b/crates/ra_syntax/src/parsing/lexer.rs @@ -30,19 +30,119 @@ pub struct Token { /// Break a string up into its component tokens pub fn tokenize(text: &str) -> Vec { + if text.is_empty() { + return vec![]; + } let mut text = text; let mut acc = Vec::new(); + if let Some(len) = ra_rustc_lexer::strip_shebang(text) { + acc.push(Token { kind: SHEBANG, len: TextUnit::from_usize(len) }); + text = &text[len..]; + } while !text.is_empty() { - let token = next_token(text); + let rustc_token = ra_rustc_lexer::first_token(text); + macro_rules! decompose { + ($t1:expr, $t2:expr) => {{ + acc.push(Token { kind: $t1, len: 1.into() }); + acc.push(Token { kind: $t2, len: 1.into() }); + text = &text[2..]; + continue; + }}; + ($t1:expr, $t2:expr, $t3:expr) => {{ + acc.push(Token { kind: $t1, len: 1.into() }); + acc.push(Token { kind: $t2, len: 1.into() }); + acc.push(Token { kind: $t3, len: 1.into() }); + text = &text[3..]; + continue; + }}; + } + let kind = match rustc_token.kind { + ra_rustc_lexer::TokenKind::LineComment => COMMENT, + ra_rustc_lexer::TokenKind::BlockComment { .. } => COMMENT, + ra_rustc_lexer::TokenKind::Whitespace => WHITESPACE, + ra_rustc_lexer::TokenKind::Ident => { + let token_text = &text[..rustc_token.len]; + if token_text == "_" { + UNDERSCORE + } else { + SyntaxKind::from_keyword(&text[..rustc_token.len]).unwrap_or(IDENT) + } + } + ra_rustc_lexer::TokenKind::RawIdent => IDENT, + ra_rustc_lexer::TokenKind::Literal { kind, .. } => match kind { + ra_rustc_lexer::LiteralKind::Int { .. } => INT_NUMBER, + ra_rustc_lexer::LiteralKind::Float { .. } => FLOAT_NUMBER, + ra_rustc_lexer::LiteralKind::Char { .. } => CHAR, + ra_rustc_lexer::LiteralKind::Byte { .. } => BYTE, + ra_rustc_lexer::LiteralKind::Str { .. } => STRING, + ra_rustc_lexer::LiteralKind::ByteStr { .. } => BYTE_STRING, + ra_rustc_lexer::LiteralKind::RawStr { .. } => RAW_STRING, + ra_rustc_lexer::LiteralKind::RawByteStr { .. } => RAW_BYTE_STRING, + }, + ra_rustc_lexer::TokenKind::Lifetime { .. } => LIFETIME, + ra_rustc_lexer::TokenKind::Semi => SEMI, + ra_rustc_lexer::TokenKind::Comma => COMMA, + ra_rustc_lexer::TokenKind::DotDotDot => decompose!(DOT, DOT, DOT), + ra_rustc_lexer::TokenKind::DotDotEq => decompose!(DOT, DOT, EQ), + ra_rustc_lexer::TokenKind::DotDot => decompose!(DOT, DOT), + ra_rustc_lexer::TokenKind::Dot => DOT, + ra_rustc_lexer::TokenKind::OpenParen => L_PAREN, + ra_rustc_lexer::TokenKind::CloseParen => R_PAREN, + ra_rustc_lexer::TokenKind::OpenBrace => L_CURLY, + ra_rustc_lexer::TokenKind::CloseBrace => R_CURLY, + ra_rustc_lexer::TokenKind::OpenBracket => L_BRACK, + ra_rustc_lexer::TokenKind::CloseBracket => R_BRACK, + ra_rustc_lexer::TokenKind::At => AT, + ra_rustc_lexer::TokenKind::Pound => POUND, + ra_rustc_lexer::TokenKind::Tilde => TILDE, + ra_rustc_lexer::TokenKind::Question => QUESTION, + ra_rustc_lexer::TokenKind::ColonColon => decompose!(COLON, COLON), + ra_rustc_lexer::TokenKind::Colon => COLON, + ra_rustc_lexer::TokenKind::Dollar => DOLLAR, + ra_rustc_lexer::TokenKind::EqEq => decompose!(EQ, EQ), + ra_rustc_lexer::TokenKind::Eq => EQ, + ra_rustc_lexer::TokenKind::FatArrow => decompose!(EQ, R_ANGLE), + ra_rustc_lexer::TokenKind::Ne => decompose!(EXCL, EQ), + ra_rustc_lexer::TokenKind::Not => EXCL, + ra_rustc_lexer::TokenKind::Le => decompose!(L_ANGLE, EQ), + ra_rustc_lexer::TokenKind::LArrow => decompose!(COLON, MINUS), + ra_rustc_lexer::TokenKind::Lt => L_ANGLE, + ra_rustc_lexer::TokenKind::ShlEq => decompose!(L_ANGLE, L_ANGLE, EQ), + ra_rustc_lexer::TokenKind::Shl => decompose!(L_ANGLE, L_ANGLE), + ra_rustc_lexer::TokenKind::Ge => decompose!(R_ANGLE, EQ), + ra_rustc_lexer::TokenKind::Gt => R_ANGLE, + ra_rustc_lexer::TokenKind::ShrEq => decompose!(R_ANGLE, R_ANGLE, EQ), + ra_rustc_lexer::TokenKind::Shr => decompose!(R_ANGLE, R_ANGLE), + ra_rustc_lexer::TokenKind::RArrow => decompose!(MINUS, R_ANGLE), + ra_rustc_lexer::TokenKind::Minus => MINUS, + ra_rustc_lexer::TokenKind::MinusEq => decompose!(MINUS, EQ), + ra_rustc_lexer::TokenKind::And => AMP, + ra_rustc_lexer::TokenKind::AndAnd => decompose!(AMP, AMP), + ra_rustc_lexer::TokenKind::AndEq => decompose!(AMP, EQ), + ra_rustc_lexer::TokenKind::Or => PIPE, + ra_rustc_lexer::TokenKind::OrOr => decompose!(PIPE, PIPE), + ra_rustc_lexer::TokenKind::OrEq => decompose!(PIPE, EQ), + ra_rustc_lexer::TokenKind::PlusEq => decompose!(PLUS, EQ), + ra_rustc_lexer::TokenKind::Plus => PLUS, + ra_rustc_lexer::TokenKind::StarEq => decompose!(STAR, EQ), + ra_rustc_lexer::TokenKind::Star => STAR, + ra_rustc_lexer::TokenKind::SlashEq => decompose!(SLASH, EQ), + ra_rustc_lexer::TokenKind::Slash => SLASH, + ra_rustc_lexer::TokenKind::CaretEq => decompose!(CARET, EQ), + ra_rustc_lexer::TokenKind::Caret => CARET, + ra_rustc_lexer::TokenKind::PercentEq => decompose!(PERCENT, EQ), + ra_rustc_lexer::TokenKind::Percent => PERCENT, + ra_rustc_lexer::TokenKind::Unknown => ERROR, + }; + let token = Token { kind, len: TextUnit::from_usize(rustc_token.len) }; acc.push(token); - let len: u32 = token.len.into(); - text = &text[len as usize..]; + text = &text[rustc_token.len..]; } acc } /// Get the next token from a string -pub fn next_token(text: &str) -> Token { +fn next_token(text: &str) -> Token { assert!(!text.is_empty()); let mut ptr = Ptr::new(text); let c = ptr.bump().unwrap(); diff --git a/crates/ra_syntax/tests/data/lexer/0004_numbers.txt b/crates/ra_syntax/tests/data/lexer/0004_numbers.txt index 39988aedc..7bb89b8ae 100644 --- a/crates/ra_syntax/tests/data/lexer/0004_numbers.txt +++ b/crates/ra_syntax/tests/data/lexer/0004_numbers.txt @@ -12,9 +12,9 @@ INT_NUMBER 2 "0_" WHITESPACE 1 " " FLOAT_NUMBER 2 "0." WHITESPACE 1 " " -INT_NUMBER 2 "0e" +FLOAT_NUMBER 2 "0e" WHITESPACE 1 " " -INT_NUMBER 2 "0E" +FLOAT_NUMBER 2 "0E" WHITESPACE 1 " " INT_NUMBER 2 "0z" WHITESPACE 1 "\n" @@ -32,9 +32,9 @@ INT_NUMBER 6 "0_1279" WHITESPACE 1 " " FLOAT_NUMBER 6 "0.1279" WHITESPACE 1 " " -INT_NUMBER 6 "0e1279" +FLOAT_NUMBER 6 "0e1279" WHITESPACE 1 " " -INT_NUMBER 6 "0E1279" +FLOAT_NUMBER 6 "0E1279" WHITESPACE 1 "\n" INT_NUMBER 1 "0" DOT 1 "." @@ -47,9 +47,7 @@ IDENT 3 "foo" L_PAREN 1 "(" R_PAREN 1 ")" WHITESPACE 1 "\n" -INT_NUMBER 2 "0e" -PLUS 1 "+" -INT_NUMBER 1 "1" +FLOAT_NUMBER 4 "0e+1" WHITESPACE 1 "\n" INT_NUMBER 1 "0" DOT 1 "." diff --git a/crates/ra_syntax/tests/data/lexer/0014_unclosed_char.txt b/crates/ra_syntax/tests/data/lexer/0014_unclosed_char.txt index 812dfbc18..737a300ee 100644 --- a/crates/ra_syntax/tests/data/lexer/0014_unclosed_char.txt +++ b/crates/ra_syntax/tests/data/lexer/0014_unclosed_char.txt @@ -1 +1 @@ -CHAR 2 "\'1" +LIFETIME 2 "\'1" diff --git a/crates/ra_syntax/tests/data/parser/err/0002_duplicate_shebang.txt b/crates/ra_syntax/tests/data/parser/err/0002_duplicate_shebang.txt index 76d186a3c..84867026f 100644 --- a/crates/ra_syntax/tests/data/parser/err/0002_duplicate_shebang.txt +++ b/crates/ra_syntax/tests/data/parser/err/0002_duplicate_shebang.txt @@ -1,7 +1,39 @@ SOURCE_FILE@[0; 42) SHEBANG@[0; 20) "#!/use/bin/env rusti" WHITESPACE@[20; 21) "\n" - ERROR@[21; 41) - SHEBANG@[21; 41) "#!/use/bin/env rusti" + ATTR@[21; 23) + POUND@[21; 22) "#" + EXCL@[22; 23) "!" + ERROR@[23; 24) + SLASH@[23; 24) "/" + USE_ITEM@[24; 28) + USE_KW@[24; 27) "use" + ERROR@[27; 28) + SLASH@[27; 28) "/" + MACRO_CALL@[28; 31) + PATH@[28; 31) + PATH_SEGMENT@[28; 31) + NAME_REF@[28; 31) + IDENT@[28; 31) "bin" + ERROR@[31; 32) + SLASH@[31; 32) "/" + MACRO_CALL@[32; 41) + PATH@[32; 35) + PATH_SEGMENT@[32; 35) + NAME_REF@[32; 35) + IDENT@[32; 35) "env" + WHITESPACE@[35; 36) " " + NAME@[36; 41) + IDENT@[36; 41) "rusti" WHITESPACE@[41; 42) "\n" -error 21: expected an item +error 23: expected `[` +error 23: expected an item +error 27: expected one of `*`, `::`, `{`, `self`, `super` or an indentifier +error 28: expected SEMI +error 31: expected EXCL +error 31: expected `{`, `[`, `(` +error 31: expected SEMI +error 31: expected an item +error 35: expected EXCL +error 41: expected `{`, `[`, `(` +error 41: expected SEMI diff --git a/crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.rs b/crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.rs deleted file mode 100644 index 261aad1fb..000000000 --- a/crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.rs +++ /dev/null @@ -1,6 +0,0 @@ -fn main() { - let _ = 'c'u32; - let _ = "string"invalid; - let _ = b'b'_suff; - let _ = b"bs"invalid; -} diff --git a/crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.txt b/crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.txt deleted file mode 100644 index b0acfa5d2..000000000 --- a/crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.txt +++ /dev/null @@ -1,70 +0,0 @@ -SOURCE_FILE@[0; 112) - FN_DEF@[0; 111) - FN_KW@[0; 2) "fn" - WHITESPACE@[2; 3) " " - NAME@[3; 7) - IDENT@[3; 7) "main" - PARAM_LIST@[7; 9) - L_PAREN@[7; 8) "(" - R_PAREN@[8; 9) ")" - WHITESPACE@[9; 10) " " - BLOCK@[10; 111) - L_CURLY@[10; 11) "{" - WHITESPACE@[11; 16) "\n " - LET_STMT@[16; 27) - LET_KW@[16; 19) "let" - WHITESPACE@[19; 20) " " - PLACEHOLDER_PAT@[20; 21) - UNDERSCORE@[20; 21) "_" - WHITESPACE@[21; 22) " " - EQ@[22; 23) "=" - WHITESPACE@[23; 24) " " - LITERAL@[24; 27) - CHAR@[24; 27) "\'c\'" - EXPR_STMT@[27; 31) - PATH_EXPR@[27; 30) - PATH@[27; 30) - PATH_SEGMENT@[27; 30) - NAME_REF@[27; 30) - IDENT@[27; 30) "u32" - SEMI@[30; 31) ";" - WHITESPACE@[31; 36) "\n " - LET_STMT@[36; 60) - LET_KW@[36; 39) "let" - WHITESPACE@[39; 40) " " - PLACEHOLDER_PAT@[40; 41) - UNDERSCORE@[40; 41) "_" - WHITESPACE@[41; 42) " " - EQ@[42; 43) "=" - WHITESPACE@[43; 44) " " - LITERAL@[44; 59) - STRING@[44; 59) "\"string\"invalid" - SEMI@[59; 60) ";" - WHITESPACE@[60; 65) "\n " - LET_STMT@[65; 83) - LET_KW@[65; 68) "let" - WHITESPACE@[68; 69) " " - PLACEHOLDER_PAT@[69; 70) - UNDERSCORE@[69; 70) "_" - WHITESPACE@[70; 71) " " - EQ@[71; 72) "=" - WHITESPACE@[72; 73) " " - LITERAL@[73; 82) - BYTE@[73; 82) "b\'b\'_suff" - SEMI@[82; 83) ";" - WHITESPACE@[83; 88) "\n " - LET_STMT@[88; 109) - LET_KW@[88; 91) "let" - WHITESPACE@[91; 92) " " - PLACEHOLDER_PAT@[92; 93) - UNDERSCORE@[92; 93) "_" - WHITESPACE@[93; 94) " " - EQ@[94; 95) "=" - WHITESPACE@[95; 96) " " - LITERAL@[96; 108) - BYTE_STRING@[96; 108) "b\"bs\"invalid" - SEMI@[108; 109) ";" - WHITESPACE@[109; 110) "\n" - R_CURLY@[110; 111) "}" - WHITESPACE@[111; 112) "\n" -error 27: expected SEMI diff --git a/crates/ra_syntax/tests/data/parser/ok/0030_string_suffixes.rs b/crates/ra_syntax/tests/data/parser/ok/0030_string_suffixes.rs new file mode 100644 index 000000000..261aad1fb --- /dev/null +++ b/crates/ra_syntax/tests/data/parser/ok/0030_string_suffixes.rs @@ -0,0 +1,6 @@ +fn main() { + let _ = 'c'u32; + let _ = "string"invalid; + let _ = b'b'_suff; + let _ = b"bs"invalid; +} diff --git a/crates/ra_syntax/tests/data/parser/ok/0030_string_suffixes.txt b/crates/ra_syntax/tests/data/parser/ok/0030_string_suffixes.txt new file mode 100644 index 000000000..4f7e809c5 --- /dev/null +++ b/crates/ra_syntax/tests/data/parser/ok/0030_string_suffixes.txt @@ -0,0 +1,63 @@ +SOURCE_FILE@[0; 112) + FN_DEF@[0; 111) + FN_KW@[0; 2) "fn" + WHITESPACE@[2; 3) " " + NAME@[3; 7) + IDENT@[3; 7) "main" + PARAM_LIST@[7; 9) + L_PAREN@[7; 8) "(" + R_PAREN@[8; 9) ")" + WHITESPACE@[9; 10) " " + BLOCK@[10; 111) + L_CURLY@[10; 11) "{" + WHITESPACE@[11; 16) "\n " + LET_STMT@[16; 31) + LET_KW@[16; 19) "let" + WHITESPACE@[19; 20) " " + PLACEHOLDER_PAT@[20; 21) + UNDERSCORE@[20; 21) "_" + WHITESPACE@[21; 22) " " + EQ@[22; 23) "=" + WHITESPACE@[23; 24) " " + LITERAL@[24; 30) + CHAR@[24; 30) "\'c\'u32" + SEMI@[30; 31) ";" + WHITESPACE@[31; 36) "\n " + LET_STMT@[36; 60) + LET_KW@[36; 39) "let" + WHITESPACE@[39; 40) " " + PLACEHOLDER_PAT@[40; 41) + UNDERSCORE@[40; 41) "_" + WHITESPACE@[41; 42) " " + EQ@[42; 43) "=" + WHITESPACE@[43; 44) " " + LITERAL@[44; 59) + STRING@[44; 59) "\"string\"invalid" + SEMI@[59; 60) ";" + WHITESPACE@[60; 65) "\n " + LET_STMT@[65; 83) + LET_KW@[65; 68) "let" + WHITESPACE@[68; 69) " " + PLACEHOLDER_PAT@[69; 70) + UNDERSCORE@[69; 70) "_" + WHITESPACE@[70; 71) " " + EQ@[71; 72) "=" + WHITESPACE@[72; 73) " " + LITERAL@[73; 82) + BYTE@[73; 82) "b\'b\'_suff" + SEMI@[82; 83) ";" + WHITESPACE@[83; 88) "\n " + LET_STMT@[88; 109) + LET_KW@[88; 91) "let" + WHITESPACE@[91; 92) " " + PLACEHOLDER_PAT@[92; 93) + UNDERSCORE@[92; 93) "_" + WHITESPACE@[93; 94) " " + EQ@[94; 95) "=" + WHITESPACE@[95; 96) " " + LITERAL@[96; 108) + BYTE_STRING@[96; 108) "b\"bs\"invalid" + SEMI@[108; 109) ";" + WHITESPACE@[109; 110) "\n" + R_CURLY@[110; 111) "}" + WHITESPACE@[111; 112) "\n" -- cgit v1.2.3 From 700669bbd0ab3ae0c5a56985ce13ca896d342a3a Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Mon, 22 Jul 2019 17:56:19 +0300 Subject: kill old lexer --- crates/ra_syntax/src/parsing/lexer.rs | 165 +++---------------------- crates/ra_syntax/src/parsing/lexer/classes.rs | 26 ---- crates/ra_syntax/src/parsing/lexer/comments.rs | 57 --------- crates/ra_syntax/src/parsing/lexer/numbers.rs | 66 ---------- crates/ra_syntax/src/parsing/lexer/ptr.rs | 162 ------------------------ crates/ra_syntax/src/parsing/lexer/strings.rs | 112 ----------------- 6 files changed, 17 insertions(+), 571 deletions(-) delete mode 100644 crates/ra_syntax/src/parsing/lexer/classes.rs delete mode 100644 crates/ra_syntax/src/parsing/lexer/comments.rs delete mode 100644 crates/ra_syntax/src/parsing/lexer/numbers.rs delete mode 100644 crates/ra_syntax/src/parsing/lexer/ptr.rs delete mode 100644 crates/ra_syntax/src/parsing/lexer/strings.rs diff --git a/crates/ra_syntax/src/parsing/lexer.rs b/crates/ra_syntax/src/parsing/lexer.rs index 1c818fdf4..2a4343b0a 100644 --- a/crates/ra_syntax/src/parsing/lexer.rs +++ b/crates/ra_syntax/src/parsing/lexer.rs @@ -1,22 +1,6 @@ -mod classes; -mod comments; -mod numbers; -mod ptr; -mod strings; - use crate::{ SyntaxKind::{self, *}, - TextUnit, T, -}; - -use self::{ - classes::*, - comments::{scan_comment, scan_shebang}, - numbers::scan_number, - ptr::Ptr, - strings::{ - is_string_literal_start, scan_byte_char_or_string, scan_char, scan_raw_string, scan_string, - }, + TextUnit, }; /// A token of Rust source. @@ -141,138 +125,23 @@ pub fn tokenize(text: &str) -> Vec { acc } -/// Get the next token from a string -fn next_token(text: &str) -> Token { - assert!(!text.is_empty()); - let mut ptr = Ptr::new(text); - let c = ptr.bump().unwrap(); - let kind = next_token_inner(c, &mut ptr); - let len = ptr.into_len(); - Token { kind, len } -} - -fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind { - if is_whitespace(c) { - ptr.bump_while(is_whitespace); - return WHITESPACE; - } - - match c { - '#' => { - if scan_shebang(ptr) { - return SHEBANG; - } - } - '/' => { - if let Some(kind) = scan_comment(ptr) { - return kind; - } - } - _ => (), - } - - let ident_start = is_ident_start(c) && !is_string_literal_start(c, ptr.current(), ptr.nth(1)); - if ident_start { - return scan_ident(c, ptr); - } - - if is_dec_digit(c) { - let kind = scan_number(c, ptr); - scan_literal_suffix(ptr); - return kind; - } - - // One-byte tokens. - if let Some(kind) = SyntaxKind::from_char(c) { - return kind; - } - - match c { - // Possiblily multi-byte tokens, - // but we only produce single byte token now - // T![...], T![..], T![..=], T![.] - '.' => return T![.], - // T![::] T![:] - ':' => return T![:], - // T![==] FATARROW T![=] - '=' => return T![=], - // T![!=] T![!] - '!' => return T![!], - // T![->] T![-] - '-' => return T![-], - - // If the character is an ident start not followed by another single - // quote, then this is a lifetime name: - '\'' => { - return if ptr.at_p(is_ident_start) && !ptr.at_str("''") { - ptr.bump(); - while ptr.at_p(is_ident_continue) { - ptr.bump(); - } - // lifetimes shouldn't end with a single quote - // if we find one, then this is an invalid character literal - if ptr.at('\'') { - ptr.bump(); - return CHAR; - } - LIFETIME - } else { - scan_char(ptr); - scan_literal_suffix(ptr); - CHAR - }; - } - 'b' => { - let kind = scan_byte_char_or_string(ptr); - scan_literal_suffix(ptr); - return kind; - } - '"' => { - scan_string(ptr); - scan_literal_suffix(ptr); - return STRING; - } - 'r' => { - scan_raw_string(ptr); - scan_literal_suffix(ptr); - return RAW_STRING; - } - _ => (), - } - ERROR -} - -fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind { - let is_raw = match (c, ptr.current()) { - ('r', Some('#')) => { - ptr.bump(); - true - } - ('_', None) => return T![_], - ('_', Some(c)) if !is_ident_continue(c) => return T![_], - _ => false, - }; - ptr.bump_while(is_ident_continue); - if !is_raw { - if let Some(kind) = SyntaxKind::from_keyword(ptr.current_token_text()) { - return kind; - } - } - IDENT -} - -fn scan_literal_suffix(ptr: &mut Ptr) { - if ptr.at_p(is_ident_start) { - ptr.bump(); - } - ptr.bump_while(is_ident_continue); -} - pub fn classify_literal(text: &str) -> Option { - let tkn = next_token(text); - if !tkn.kind.is_literal() || tkn.len.to_usize() != text.len() { + let t = ra_rustc_lexer::first_token(text); + if t.len != text.len() { return None; } - - Some(tkn) + let kind = match t.kind { + ra_rustc_lexer::TokenKind::Literal { kind, .. } => match kind { + ra_rustc_lexer::LiteralKind::Int { .. } => INT_NUMBER, + ra_rustc_lexer::LiteralKind::Float { .. } => FLOAT_NUMBER, + ra_rustc_lexer::LiteralKind::Char { .. } => CHAR, + ra_rustc_lexer::LiteralKind::Byte { .. } => BYTE, + ra_rustc_lexer::LiteralKind::Str { .. } => STRING, + ra_rustc_lexer::LiteralKind::ByteStr { .. } => BYTE_STRING, + ra_rustc_lexer::LiteralKind::RawStr { .. } => RAW_STRING, + ra_rustc_lexer::LiteralKind::RawByteStr { .. } => RAW_BYTE_STRING, + }, + _ => return None, + }; + Some(Token { kind, len: TextUnit::from_usize(t.len) }) } diff --git a/crates/ra_syntax/src/parsing/lexer/classes.rs b/crates/ra_syntax/src/parsing/lexer/classes.rs deleted file mode 100644 index 4235d2648..000000000 --- a/crates/ra_syntax/src/parsing/lexer/classes.rs +++ /dev/null @@ -1,26 +0,0 @@ -use unicode_xid::UnicodeXID; - -pub fn is_ident_start(c: char) -> bool { - (c >= 'a' && c <= 'z') - || (c >= 'A' && c <= 'Z') - || c == '_' - || (c > '\x7f' && UnicodeXID::is_xid_start(c)) -} - -pub fn is_ident_continue(c: char) -> bool { - (c >= 'a' && c <= 'z') - || (c >= 'A' && c <= 'Z') - || (c >= '0' && c <= '9') - || c == '_' - || (c > '\x7f' && UnicodeXID::is_xid_continue(c)) -} - -pub fn is_whitespace(c: char) -> bool { - //FIXME: use is_pattern_whitespace - //https://github.com/behnam/rust-unic/issues/192 - c.is_whitespace() -} - -pub fn is_dec_digit(c: char) -> bool { - '0' <= c && c <= '9' -} diff --git a/crates/ra_syntax/src/parsing/lexer/comments.rs b/crates/ra_syntax/src/parsing/lexer/comments.rs deleted file mode 100644 index 8bbbe659b..000000000 --- a/crates/ra_syntax/src/parsing/lexer/comments.rs +++ /dev/null @@ -1,57 +0,0 @@ -use crate::parsing::lexer::ptr::Ptr; - -use crate::SyntaxKind::{self, *}; - -pub(crate) fn scan_shebang(ptr: &mut Ptr) -> bool { - if ptr.at_str("!/") { - ptr.bump(); - ptr.bump(); - bump_until_eol(ptr); - true - } else { - false - } -} - -fn scan_block_comment(ptr: &mut Ptr) -> Option { - if ptr.at('*') { - ptr.bump(); - let mut depth: u32 = 1; - while depth > 0 { - if ptr.at_str("*/") { - depth -= 1; - ptr.bump(); - ptr.bump(); - } else if ptr.at_str("/*") { - depth += 1; - ptr.bump(); - ptr.bump(); - } else if ptr.bump().is_none() { - break; - } - } - Some(COMMENT) - } else { - None - } -} - -pub(crate) fn scan_comment(ptr: &mut Ptr) -> Option { - if ptr.at('/') { - bump_until_eol(ptr); - Some(COMMENT) - } else { - scan_block_comment(ptr) - } -} - -fn bump_until_eol(ptr: &mut Ptr) { - loop { - if ptr.at('\n') || ptr.at_str("\r\n") { - return; - } - if ptr.bump().is_none() { - break; - } - } -} diff --git a/crates/ra_syntax/src/parsing/lexer/numbers.rs b/crates/ra_syntax/src/parsing/lexer/numbers.rs deleted file mode 100644 index e53ae231b..000000000 --- a/crates/ra_syntax/src/parsing/lexer/numbers.rs +++ /dev/null @@ -1,66 +0,0 @@ -use crate::parsing::lexer::{classes::*, ptr::Ptr}; - -use crate::SyntaxKind::{self, *}; - -pub(crate) fn scan_number(c: char, ptr: &mut Ptr) -> SyntaxKind { - if c == '0' { - match ptr.current().unwrap_or('\0') { - 'b' | 'o' => { - ptr.bump(); - scan_digits(ptr, false); - } - 'x' => { - ptr.bump(); - scan_digits(ptr, true); - } - '0'..='9' | '_' | '.' | 'e' | 'E' => { - scan_digits(ptr, true); - } - _ => return INT_NUMBER, - } - } else { - scan_digits(ptr, false); - } - - // might be a float, but don't be greedy if this is actually an - // integer literal followed by field/method access or a range pattern - // (`0..2` and `12.foo()`) - if ptr.at('.') && !(ptr.at_str("..") || ptr.nth_is_p(1, is_ident_start)) { - // might have stuff after the ., and if it does, it needs to start - // with a number - ptr.bump(); - scan_digits(ptr, false); - scan_float_exponent(ptr); - return FLOAT_NUMBER; - } - // it might be a float if it has an exponent - if ptr.at('e') || ptr.at('E') { - scan_float_exponent(ptr); - return FLOAT_NUMBER; - } - INT_NUMBER -} - -fn scan_digits(ptr: &mut Ptr, allow_hex: bool) { - while let Some(c) = ptr.current() { - match c { - '_' | '0'..='9' => { - ptr.bump(); - } - 'a'..='f' | 'A'..='F' if allow_hex => { - ptr.bump(); - } - _ => return, - } - } -} - -fn scan_float_exponent(ptr: &mut Ptr) { - if ptr.at('e') || ptr.at('E') { - ptr.bump(); - if ptr.at('-') || ptr.at('+') { - ptr.bump(); - } - scan_digits(ptr, false); - } -} diff --git a/crates/ra_syntax/src/parsing/lexer/ptr.rs b/crates/ra_syntax/src/parsing/lexer/ptr.rs deleted file mode 100644 index c341c4176..000000000 --- a/crates/ra_syntax/src/parsing/lexer/ptr.rs +++ /dev/null @@ -1,162 +0,0 @@ -use crate::TextUnit; - -use std::str::Chars; - -/// A simple view into the characters of a string. -pub(crate) struct Ptr<'s> { - text: &'s str, - len: TextUnit, -} - -impl<'s> Ptr<'s> { - /// Creates a new `Ptr` from a string. - pub fn new(text: &'s str) -> Ptr<'s> { - Ptr { text, len: 0.into() } - } - - /// Gets the length of the remaining string. - pub fn into_len(self) -> TextUnit { - self.len - } - - /// Gets the current character, if one exists. - pub fn current(&self) -> Option { - self.chars().next() - } - - /// Gets the nth character from the current. - /// For example, 0 will return the current character, 1 will return the next, etc. - pub fn nth(&self, n: u32) -> Option { - self.chars().nth(n as usize) - } - - /// Checks whether the current character is `c`. - pub fn at(&self, c: char) -> bool { - self.current() == Some(c) - } - - /// Checks whether the next characters match `s`. - pub fn at_str(&self, s: &str) -> bool { - let chars = self.chars(); - chars.as_str().starts_with(s) - } - - /// Checks whether the current character satisfies the predicate `p`. - pub fn at_p bool>(&self, p: P) -> bool { - self.current().map(p) == Some(true) - } - - /// Checks whether the nth character satisfies the predicate `p`. - pub fn nth_is_p bool>(&self, n: u32, p: P) -> bool { - self.nth(n).map(p) == Some(true) - } - - /// Moves to the next character. - pub fn bump(&mut self) -> Option { - let ch = self.chars().next()?; - self.len += TextUnit::of_char(ch); - Some(ch) - } - - /// Moves to the next character as long as `pred` is satisfied. - pub fn bump_while bool>(&mut self, pred: F) { - loop { - match self.current() { - Some(c) if pred(c) => { - self.bump(); - } - _ => return, - } - } - } - - /// Returns the text up to the current point. - pub fn current_token_text(&self) -> &str { - let len: u32 = self.len.into(); - &self.text[..len as usize] - } - - /// Returns an iterator over the remaining characters. - fn chars(&self) -> Chars { - let len: u32 = self.len.into(); - self.text[len as usize..].chars() - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_current() { - let ptr = Ptr::new("test"); - assert_eq!(ptr.current(), Some('t')); - } - - #[test] - fn test_nth() { - let ptr = Ptr::new("test"); - assert_eq!(ptr.nth(0), Some('t')); - assert_eq!(ptr.nth(1), Some('e')); - assert_eq!(ptr.nth(2), Some('s')); - assert_eq!(ptr.nth(3), Some('t')); - assert_eq!(ptr.nth(4), None); - } - - #[test] - fn test_at() { - let ptr = Ptr::new("test"); - assert!(ptr.at('t')); - assert!(!ptr.at('a')); - } - - #[test] - fn test_at_str() { - let ptr = Ptr::new("test"); - assert!(ptr.at_str("t")); - assert!(ptr.at_str("te")); - assert!(ptr.at_str("test")); - assert!(!ptr.at_str("tests")); - assert!(!ptr.at_str("rust")); - } - - #[test] - fn test_at_p() { - let ptr = Ptr::new("test"); - assert!(ptr.at_p(|c| c == 't')); - assert!(!ptr.at_p(|c| c == 'e')); - } - - #[test] - fn test_nth_is_p() { - let ptr = Ptr::new("test"); - assert!(ptr.nth_is_p(0, |c| c == 't')); - assert!(!ptr.nth_is_p(1, |c| c == 't')); - assert!(ptr.nth_is_p(3, |c| c == 't')); - assert!(!ptr.nth_is_p(150, |c| c == 't')); - } - - #[test] - fn test_bump() { - let mut ptr = Ptr::new("test"); - assert_eq!(ptr.current(), Some('t')); - ptr.bump(); - assert_eq!(ptr.current(), Some('e')); - ptr.bump(); - assert_eq!(ptr.current(), Some('s')); - ptr.bump(); - assert_eq!(ptr.current(), Some('t')); - ptr.bump(); - assert_eq!(ptr.current(), None); - ptr.bump(); - assert_eq!(ptr.current(), None); - } - - #[test] - fn test_bump_while() { - let mut ptr = Ptr::new("test"); - assert_eq!(ptr.current(), Some('t')); - ptr.bump_while(|c| c != 's'); - assert_eq!(ptr.current(), Some('s')); - } -} diff --git a/crates/ra_syntax/src/parsing/lexer/strings.rs b/crates/ra_syntax/src/parsing/lexer/strings.rs deleted file mode 100644 index f74acff9e..000000000 --- a/crates/ra_syntax/src/parsing/lexer/strings.rs +++ /dev/null @@ -1,112 +0,0 @@ -use crate::{ - parsing::lexer::ptr::Ptr, - SyntaxKind::{self, *}, -}; - -pub(crate) fn is_string_literal_start(c: char, c1: Option, c2: Option) -> bool { - match (c, c1, c2) { - ('r', Some('"'), _) - | ('r', Some('#'), Some('"')) - | ('r', Some('#'), Some('#')) - | ('b', Some('"'), _) - | ('b', Some('\''), _) - | ('b', Some('r'), Some('"')) - | ('b', Some('r'), Some('#')) => true, - _ => false, - } -} - -pub(crate) fn scan_char(ptr: &mut Ptr) { - while let Some(c) = ptr.current() { - match c { - '\\' => { - ptr.bump(); - if ptr.at('\\') || ptr.at('\'') { - ptr.bump(); - } - } - '\'' => { - ptr.bump(); - return; - } - '\n' => return, - _ => { - ptr.bump(); - } - } - } -} - -pub(crate) fn scan_byte_char_or_string(ptr: &mut Ptr) -> SyntaxKind { - // unwrapping and not-exhaustive match are ok - // because of string_literal_start - let c = ptr.bump().unwrap(); - match c { - '\'' => { - scan_byte(ptr); - BYTE - } - '"' => { - scan_byte_string(ptr); - BYTE_STRING - } - 'r' => { - scan_raw_string(ptr); - RAW_BYTE_STRING - } - _ => unreachable!(), - } -} - -pub(crate) fn scan_string(ptr: &mut Ptr) { - while let Some(c) = ptr.current() { - match c { - '\\' => { - ptr.bump(); - if ptr.at('\\') || ptr.at('"') { - ptr.bump(); - } - } - '"' => { - ptr.bump(); - return; - } - _ => { - ptr.bump(); - } - } - } -} - -pub(crate) fn scan_raw_string(ptr: &mut Ptr) { - let mut hashes = 0; - while ptr.at('#') { - hashes += 1; - ptr.bump(); - } - if !ptr.at('"') { - return; - } - ptr.bump(); - - while let Some(c) = ptr.bump() { - if c == '"' { - let mut hashes_left = hashes; - while ptr.at('#') && hashes_left > 0 { - hashes_left -= 1; - ptr.bump(); - } - if hashes_left == 0 { - return; - } - } - } -} - -fn scan_byte(ptr: &mut Ptr) { - scan_char(ptr) -} - -fn scan_byte_string(ptr: &mut Ptr) { - scan_string(ptr) -} -- cgit v1.2.3