From 75761c0e47d8c20a490a3d61ea64d2413d3c3570 Mon Sep 17 00:00:00 2001
From: Aleksey Kladov <aleksey.kladov@gmail.com>
Date: Mon, 22 Jul 2019 17:47:33 +0300
Subject: add rustc_lexer

---
 Cargo.lock                                         |  10 ++
 crates/ra_syntax/Cargo.toml                        |   1 +
 crates/ra_syntax/src/parsing/lexer.rs              | 108 ++++++++++++++++++++-
 crates/ra_syntax/tests/data/lexer/0004_numbers.txt |  12 +--
 .../tests/data/lexer/0014_unclosed_char.txt        |   2 +-
 .../data/parser/err/0002_duplicate_shebang.txt     |  38 +++++++-
 .../tests/data/parser/err/0030_string_suffixes.rs  |   6 --
 .../tests/data/parser/err/0030_string_suffixes.txt |  70 -------------
 .../tests/data/parser/ok/0030_string_suffixes.rs   |   6 ++
 .../tests/data/parser/ok/0030_string_suffixes.txt  |  63 ++++++++++++
 10 files changed, 225 insertions(+), 91 deletions(-)
 delete mode 100644 crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.rs
 delete mode 100644 crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.txt
 create mode 100644 crates/ra_syntax/tests/data/parser/ok/0030_string_suffixes.rs
 create mode 100644 crates/ra_syntax/tests/data/parser/ok/0030_string_suffixes.txt

diff --git a/Cargo.lock b/Cargo.lock
index 8feaf27ec..d5474d6e2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1308,12 +1308,21 @@ dependencies = [
  "serde_json 1.0.40 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
+[[package]]
+name = "ra_rustc_lexer"
+version = "0.1.0-pre.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
 [[package]]
 name = "ra_syntax"
 version = "0.1.0"
 dependencies = [
  "itertools 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "ra_parser 0.1.0",
+ "ra_rustc_lexer 0.1.0-pre.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "ra_text_edit 0.1.0",
  "rowan 0.6.0-pre.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "smol_str 0.1.12 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -2250,6 +2259,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 "checksum proptest 0.9.4 (registry+https://github.com/rust-lang/crates.io-index)" = "cf147e022eacf0c8a054ab864914a7602618adba841d800a9a9868a5237a529f"
 "checksum quick-error 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9274b940887ce9addde99c4eee6b5c44cc494b182b97e73dc8ffdcb3397fd3f0"
 "checksum quote 0.6.13 (registry+https://github.com/rust-lang/crates.io-index)" = "6ce23b6b870e8f94f81fb0a363d65d86675884b34a09043c81e5562f11c1f8e1"
+"checksum ra_rustc_lexer 0.1.0-pre.1 (registry+https://github.com/rust-lang/crates.io-index)" = "e8d92772f822978a6c9c4657aa61af439e4e635180628b3354049b283b749f1e"
 "checksum ra_vfs 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "fb7cd4e302032c5ab514f1c01c89727cd96fd950dd36f9ebee9252df45d9fb1a"
 "checksum rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "6d71dacdc3c88c1fde3885a3be3fbab9f35724e6ce99467f7d9c5026132184ca"
 "checksum rand 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d47eab0e83d9693d40f825f86948aa16eff6750ead4bdffc4ab95b8b3a7f052c"
diff --git a/crates/ra_syntax/Cargo.toml b/crates/ra_syntax/Cargo.toml
index 97b6b047f..9ef8dee5d 100644
--- a/crates/ra_syntax/Cargo.toml
+++ b/crates/ra_syntax/Cargo.toml
@@ -11,6 +11,7 @@ repository = "https://github.com/rust-analyzer/rust-analyzer"
 unicode-xid = "0.1.0"
 itertools = "0.8.0"
 rowan = "0.6.0-pre.1"
+ra_rustc_lexer = { version = "0.1.0-pre.1", features = [ "unicode-xid" ] }
 
 # ideally, `serde` should be enabled by `ra_lsp_server`, but we enable it here
 # to reduce number of compilations
diff --git a/crates/ra_syntax/src/parsing/lexer.rs b/crates/ra_syntax/src/parsing/lexer.rs
index 60cf37047..1c818fdf4 100644
--- a/crates/ra_syntax/src/parsing/lexer.rs
+++ b/crates/ra_syntax/src/parsing/lexer.rs
@@ -30,19 +30,119 @@ pub struct Token {
 
 /// Break a string up into its component tokens
 pub fn tokenize(text: &str) -> Vec<Token> {
+    if text.is_empty() {
+        return vec![];
+    }
     let mut text = text;
     let mut acc = Vec::new();
+    if let Some(len) = ra_rustc_lexer::strip_shebang(text) { // shebang: start of input only
+        acc.push(Token { kind: SHEBANG, len: TextUnit::from_usize(len) });
+        text = &text[len..];
+    }
     while !text.is_empty() {
-        let token = next_token(text);
+        let rustc_token = ra_rustc_lexer::first_token(text);
+        macro_rules! decompose { // push N one-byte tokens, advance N bytes, restart loop
+            ($t1:expr, $t2:expr) => {{
+                acc.push(Token { kind: $t1, len: 1.into() });
+                acc.push(Token { kind: $t2, len: 1.into() });
+                text = &text[2..];
+                continue;
+            }};
+            ($t1:expr, $t2:expr, $t3:expr) => {{
+                acc.push(Token { kind: $t1, len: 1.into() });
+                acc.push(Token { kind: $t2, len: 1.into() });
+                acc.push(Token { kind: $t3, len: 1.into() });
+                text = &text[3..];
+                continue;
+            }};
+        }
+        let kind = match rustc_token.kind {
+            ra_rustc_lexer::TokenKind::LineComment => COMMENT,
+            ra_rustc_lexer::TokenKind::BlockComment { .. } => COMMENT,
+            ra_rustc_lexer::TokenKind::Whitespace => WHITESPACE,
+            ra_rustc_lexer::TokenKind::Ident => {
+                let token_text = &text[..rustc_token.len];
+                if token_text == "_" { // a lone underscore gets its own token kind
+                    UNDERSCORE
+                } else {
+                    SyntaxKind::from_keyword(token_text).unwrap_or(IDENT)
+                }
+            }
+            ra_rustc_lexer::TokenKind::RawIdent => IDENT,
+            ra_rustc_lexer::TokenKind::Literal { kind, .. } => match kind {
+                ra_rustc_lexer::LiteralKind::Int { .. } => INT_NUMBER,
+                ra_rustc_lexer::LiteralKind::Float { .. } => FLOAT_NUMBER,
+                ra_rustc_lexer::LiteralKind::Char { .. } => CHAR,
+                ra_rustc_lexer::LiteralKind::Byte { .. } => BYTE,
+                ra_rustc_lexer::LiteralKind::Str { .. } => STRING,
+                ra_rustc_lexer::LiteralKind::ByteStr { .. } => BYTE_STRING,
+                ra_rustc_lexer::LiteralKind::RawStr { .. } => RAW_STRING,
+                ra_rustc_lexer::LiteralKind::RawByteStr { .. } => RAW_BYTE_STRING,
+            },
+            ra_rustc_lexer::TokenKind::Lifetime { .. } => LIFETIME,
+            ra_rustc_lexer::TokenKind::Semi => SEMI,
+            ra_rustc_lexer::TokenKind::Comma => COMMA,
+            ra_rustc_lexer::TokenKind::DotDotDot => decompose!(DOT, DOT, DOT),
+            ra_rustc_lexer::TokenKind::DotDotEq => decompose!(DOT, DOT, EQ),
+            ra_rustc_lexer::TokenKind::DotDot => decompose!(DOT, DOT),
+            ra_rustc_lexer::TokenKind::Dot => DOT,
+            ra_rustc_lexer::TokenKind::OpenParen => L_PAREN,
+            ra_rustc_lexer::TokenKind::CloseParen => R_PAREN,
+            ra_rustc_lexer::TokenKind::OpenBrace => L_CURLY,
+            ra_rustc_lexer::TokenKind::CloseBrace => R_CURLY,
+            ra_rustc_lexer::TokenKind::OpenBracket => L_BRACK,
+            ra_rustc_lexer::TokenKind::CloseBracket => R_BRACK,
+            ra_rustc_lexer::TokenKind::At => AT,
+            ra_rustc_lexer::TokenKind::Pound => POUND,
+            ra_rustc_lexer::TokenKind::Tilde => TILDE,
+            ra_rustc_lexer::TokenKind::Question => QUESTION,
+            ra_rustc_lexer::TokenKind::ColonColon => decompose!(COLON, COLON),
+            ra_rustc_lexer::TokenKind::Colon => COLON,
+            ra_rustc_lexer::TokenKind::Dollar => DOLLAR,
+            ra_rustc_lexer::TokenKind::EqEq => decompose!(EQ, EQ),
+            ra_rustc_lexer::TokenKind::Eq => EQ,
+            ra_rustc_lexer::TokenKind::FatArrow => decompose!(EQ, R_ANGLE),
+            ra_rustc_lexer::TokenKind::Ne => decompose!(EXCL, EQ),
+            ra_rustc_lexer::TokenKind::Not => EXCL,
+            ra_rustc_lexer::TokenKind::Le => decompose!(L_ANGLE, EQ),
+            ra_rustc_lexer::TokenKind::LArrow => decompose!(L_ANGLE, MINUS), // `<-` is `<` then `-`
+            ra_rustc_lexer::TokenKind::Lt => L_ANGLE,
+            ra_rustc_lexer::TokenKind::ShlEq => decompose!(L_ANGLE, L_ANGLE, EQ),
+            ra_rustc_lexer::TokenKind::Shl => decompose!(L_ANGLE, L_ANGLE),
+            ra_rustc_lexer::TokenKind::Ge => decompose!(R_ANGLE, EQ),
+            ra_rustc_lexer::TokenKind::Gt => R_ANGLE,
+            ra_rustc_lexer::TokenKind::ShrEq => decompose!(R_ANGLE, R_ANGLE, EQ),
+            ra_rustc_lexer::TokenKind::Shr => decompose!(R_ANGLE, R_ANGLE),
+            ra_rustc_lexer::TokenKind::RArrow => decompose!(MINUS, R_ANGLE),
+            ra_rustc_lexer::TokenKind::Minus => MINUS,
+            ra_rustc_lexer::TokenKind::MinusEq => decompose!(MINUS, EQ),
+            ra_rustc_lexer::TokenKind::And => AMP,
+            ra_rustc_lexer::TokenKind::AndAnd => decompose!(AMP, AMP),
+            ra_rustc_lexer::TokenKind::AndEq => decompose!(AMP, EQ),
+            ra_rustc_lexer::TokenKind::Or => PIPE,
+            ra_rustc_lexer::TokenKind::OrOr => decompose!(PIPE, PIPE),
+            ra_rustc_lexer::TokenKind::OrEq => decompose!(PIPE, EQ),
+            ra_rustc_lexer::TokenKind::PlusEq => decompose!(PLUS, EQ),
+            ra_rustc_lexer::TokenKind::Plus => PLUS,
+            ra_rustc_lexer::TokenKind::StarEq => decompose!(STAR, EQ),
+            ra_rustc_lexer::TokenKind::Star => STAR,
+            ra_rustc_lexer::TokenKind::SlashEq => decompose!(SLASH, EQ),
+            ra_rustc_lexer::TokenKind::Slash => SLASH,
+            ra_rustc_lexer::TokenKind::CaretEq => decompose!(CARET, EQ),
+            ra_rustc_lexer::TokenKind::Caret => CARET,
+            ra_rustc_lexer::TokenKind::PercentEq => decompose!(PERCENT, EQ),
+            ra_rustc_lexer::TokenKind::Percent => PERCENT,
+            ra_rustc_lexer::TokenKind::Unknown => ERROR,
+        };
+        let token = Token { kind, len: TextUnit::from_usize(rustc_token.len) };
         acc.push(token);
-        let len: u32 = token.len.into();
-        text = &text[len as usize..];
+        text = &text[rustc_token.len..];
     }
     acc
 }
 
 /// Get the next token from a string
-pub fn next_token(text: &str) -> Token {
+fn next_token(text: &str) -> Token {
     assert!(!text.is_empty());
     let mut ptr = Ptr::new(text);
     let c = ptr.bump().unwrap();
diff --git a/crates/ra_syntax/tests/data/lexer/0004_numbers.txt b/crates/ra_syntax/tests/data/lexer/0004_numbers.txt
index 39988aedc..7bb89b8ae 100644
--- a/crates/ra_syntax/tests/data/lexer/0004_numbers.txt
+++ b/crates/ra_syntax/tests/data/lexer/0004_numbers.txt
@@ -12,9 +12,9 @@ INT_NUMBER 2 "0_"
 WHITESPACE 1 " "
 FLOAT_NUMBER 2 "0."
 WHITESPACE 1 " "
-INT_NUMBER 2 "0e"
+FLOAT_NUMBER 2 "0e"
 WHITESPACE 1 " "
-INT_NUMBER 2 "0E"
+FLOAT_NUMBER 2 "0E"
 WHITESPACE 1 " "
 INT_NUMBER 2 "0z"
 WHITESPACE 1 "\n"
@@ -32,9 +32,9 @@ INT_NUMBER 6 "0_1279"
 WHITESPACE 1 " "
 FLOAT_NUMBER 6 "0.1279"
 WHITESPACE 1 " "
-INT_NUMBER 6 "0e1279"
+FLOAT_NUMBER 6 "0e1279"
 WHITESPACE 1 " "
-INT_NUMBER 6 "0E1279"
+FLOAT_NUMBER 6 "0E1279"
 WHITESPACE 1 "\n"
 INT_NUMBER 1 "0"
 DOT 1 "."
@@ -47,9 +47,7 @@ IDENT 3 "foo"
 L_PAREN 1 "("
 R_PAREN 1 ")"
 WHITESPACE 1 "\n"
-INT_NUMBER 2 "0e"
-PLUS 1 "+"
-INT_NUMBER 1 "1"
+FLOAT_NUMBER 4 "0e+1"
 WHITESPACE 1 "\n"
 INT_NUMBER 1 "0"
 DOT 1 "."
diff --git a/crates/ra_syntax/tests/data/lexer/0014_unclosed_char.txt b/crates/ra_syntax/tests/data/lexer/0014_unclosed_char.txt
index 812dfbc18..737a300ee 100644
--- a/crates/ra_syntax/tests/data/lexer/0014_unclosed_char.txt
+++ b/crates/ra_syntax/tests/data/lexer/0014_unclosed_char.txt
@@ -1 +1 @@
-CHAR 2 "\'1"
+LIFETIME 2 "\'1"
diff --git a/crates/ra_syntax/tests/data/parser/err/0002_duplicate_shebang.txt b/crates/ra_syntax/tests/data/parser/err/0002_duplicate_shebang.txt
index 76d186a3c..84867026f 100644
--- a/crates/ra_syntax/tests/data/parser/err/0002_duplicate_shebang.txt
+++ b/crates/ra_syntax/tests/data/parser/err/0002_duplicate_shebang.txt
@@ -1,7 +1,39 @@
 SOURCE_FILE@[0; 42)
   SHEBANG@[0; 20) "#!/use/bin/env rusti"
   WHITESPACE@[20; 21) "\n"
-  ERROR@[21; 41)
-    SHEBANG@[21; 41) "#!/use/bin/env rusti"
+  ATTR@[21; 23)
+    POUND@[21; 22) "#"
+    EXCL@[22; 23) "!"
+  ERROR@[23; 24)
+    SLASH@[23; 24) "/"
+  USE_ITEM@[24; 28)
+    USE_KW@[24; 27) "use"
+    ERROR@[27; 28)
+      SLASH@[27; 28) "/"
+  MACRO_CALL@[28; 31)
+    PATH@[28; 31)
+      PATH_SEGMENT@[28; 31)
+        NAME_REF@[28; 31)
+          IDENT@[28; 31) "bin"
+  ERROR@[31; 32)
+    SLASH@[31; 32) "/"
+  MACRO_CALL@[32; 41)
+    PATH@[32; 35)
+      PATH_SEGMENT@[32; 35)
+        NAME_REF@[32; 35)
+          IDENT@[32; 35) "env"
+    WHITESPACE@[35; 36) " "
+    NAME@[36; 41)
+      IDENT@[36; 41) "rusti"
   WHITESPACE@[41; 42) "\n"
-error 21: expected an item
+error 23: expected `[`
+error 23: expected an item
+error 27: expected one of `*`, `::`, `{`, `self`, `super` or an indentifier
+error 28: expected SEMI
+error 31: expected EXCL
+error 31: expected `{`, `[`, `(`
+error 31: expected SEMI
+error 31: expected an item
+error 35: expected EXCL
+error 41: expected `{`, `[`, `(`
+error 41: expected SEMI
diff --git a/crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.rs b/crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.rs
deleted file mode 100644
index 261aad1fb..000000000
--- a/crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.rs
+++ /dev/null
@@ -1,6 +0,0 @@
-fn main() {
-    let _ = 'c'u32;
-    let _ = "string"invalid;
-    let _ = b'b'_suff;
-    let _ = b"bs"invalid;
-}
diff --git a/crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.txt b/crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.txt
deleted file mode 100644
index b0acfa5d2..000000000
--- a/crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.txt
+++ /dev/null
@@ -1,70 +0,0 @@
-SOURCE_FILE@[0; 112)
-  FN_DEF@[0; 111)
-    FN_KW@[0; 2) "fn"
-    WHITESPACE@[2; 3) " "
-    NAME@[3; 7)
-      IDENT@[3; 7) "main"
-    PARAM_LIST@[7; 9)
-      L_PAREN@[7; 8) "("
-      R_PAREN@[8; 9) ")"
-    WHITESPACE@[9; 10) " "
-    BLOCK@[10; 111)
-      L_CURLY@[10; 11) "{"
-      WHITESPACE@[11; 16) "\n    "
-      LET_STMT@[16; 27)
-        LET_KW@[16; 19) "let"
-        WHITESPACE@[19; 20) " "
-        PLACEHOLDER_PAT@[20; 21)
-          UNDERSCORE@[20; 21) "_"
-        WHITESPACE@[21; 22) " "
-        EQ@[22; 23) "="
-        WHITESPACE@[23; 24) " "
-        LITERAL@[24; 27)
-          CHAR@[24; 27) "\'c\'"
-      EXPR_STMT@[27; 31)
-        PATH_EXPR@[27; 30)
-          PATH@[27; 30)
-            PATH_SEGMENT@[27; 30)
-              NAME_REF@[27; 30)
-                IDENT@[27; 30) "u32"
-        SEMI@[30; 31) ";"
-      WHITESPACE@[31; 36) "\n    "
-      LET_STMT@[36; 60)
-        LET_KW@[36; 39) "let"
-        WHITESPACE@[39; 40) " "
-        PLACEHOLDER_PAT@[40; 41)
-          UNDERSCORE@[40; 41) "_"
-        WHITESPACE@[41; 42) " "
-        EQ@[42; 43) "="
-        WHITESPACE@[43; 44) " "
-        LITERAL@[44; 59)
-          STRING@[44; 59) "\"string\"invalid"
-        SEMI@[59; 60) ";"
-      WHITESPACE@[60; 65) "\n    "
-      LET_STMT@[65; 83)
-        LET_KW@[65; 68) "let"
-        WHITESPACE@[68; 69) " "
-        PLACEHOLDER_PAT@[69; 70)
-          UNDERSCORE@[69; 70) "_"
-        WHITESPACE@[70; 71) " "
-        EQ@[71; 72) "="
-        WHITESPACE@[72; 73) " "
-        LITERAL@[73; 82)
-          BYTE@[73; 82) "b\'b\'_suff"
-        SEMI@[82; 83) ";"
-      WHITESPACE@[83; 88) "\n    "
-      LET_STMT@[88; 109)
-        LET_KW@[88; 91) "let"
-        WHITESPACE@[91; 92) " "
-        PLACEHOLDER_PAT@[92; 93)
-          UNDERSCORE@[92; 93) "_"
-        WHITESPACE@[93; 94) " "
-        EQ@[94; 95) "="
-        WHITESPACE@[95; 96) " "
-        LITERAL@[96; 108)
-          BYTE_STRING@[96; 108) "b\"bs\"invalid"
-        SEMI@[108; 109) ";"
-      WHITESPACE@[109; 110) "\n"
-      R_CURLY@[110; 111) "}"
-  WHITESPACE@[111; 112) "\n"
-error 27: expected SEMI
diff --git a/crates/ra_syntax/tests/data/parser/ok/0030_string_suffixes.rs b/crates/ra_syntax/tests/data/parser/ok/0030_string_suffixes.rs
new file mode 100644
index 000000000..261aad1fb
--- /dev/null
+++ b/crates/ra_syntax/tests/data/parser/ok/0030_string_suffixes.rs
@@ -0,0 +1,6 @@
+fn main() {
+    let _ = 'c'u32;
+    let _ = "string"invalid;
+    let _ = b'b'_suff;
+    let _ = b"bs"invalid;
+}
diff --git a/crates/ra_syntax/tests/data/parser/ok/0030_string_suffixes.txt b/crates/ra_syntax/tests/data/parser/ok/0030_string_suffixes.txt
new file mode 100644
index 000000000..4f7e809c5
--- /dev/null
+++ b/crates/ra_syntax/tests/data/parser/ok/0030_string_suffixes.txt
@@ -0,0 +1,63 @@
+SOURCE_FILE@[0; 112)
+  FN_DEF@[0; 111)
+    FN_KW@[0; 2) "fn"
+    WHITESPACE@[2; 3) " "
+    NAME@[3; 7)
+      IDENT@[3; 7) "main"
+    PARAM_LIST@[7; 9)
+      L_PAREN@[7; 8) "("
+      R_PAREN@[8; 9) ")"
+    WHITESPACE@[9; 10) " "
+    BLOCK@[10; 111)
+      L_CURLY@[10; 11) "{"
+      WHITESPACE@[11; 16) "\n    "
+      LET_STMT@[16; 31)
+        LET_KW@[16; 19) "let"
+        WHITESPACE@[19; 20) " "
+        PLACEHOLDER_PAT@[20; 21)
+          UNDERSCORE@[20; 21) "_"
+        WHITESPACE@[21; 22) " "
+        EQ@[22; 23) "="
+        WHITESPACE@[23; 24) " "
+        LITERAL@[24; 30)
+          CHAR@[24; 30) "\'c\'u32"
+        SEMI@[30; 31) ";"
+      WHITESPACE@[31; 36) "\n    "
+      LET_STMT@[36; 60)
+        LET_KW@[36; 39) "let"
+        WHITESPACE@[39; 40) " "
+        PLACEHOLDER_PAT@[40; 41)
+          UNDERSCORE@[40; 41) "_"
+        WHITESPACE@[41; 42) " "
+        EQ@[42; 43) "="
+        WHITESPACE@[43; 44) " "
+        LITERAL@[44; 59)
+          STRING@[44; 59) "\"string\"invalid"
+        SEMI@[59; 60) ";"
+      WHITESPACE@[60; 65) "\n    "
+      LET_STMT@[65; 83)
+        LET_KW@[65; 68) "let"
+        WHITESPACE@[68; 69) " "
+        PLACEHOLDER_PAT@[69; 70)
+          UNDERSCORE@[69; 70) "_"
+        WHITESPACE@[70; 71) " "
+        EQ@[71; 72) "="
+        WHITESPACE@[72; 73) " "
+        LITERAL@[73; 82)
+          BYTE@[73; 82) "b\'b\'_suff"
+        SEMI@[82; 83) ";"
+      WHITESPACE@[83; 88) "\n    "
+      LET_STMT@[88; 109)
+        LET_KW@[88; 91) "let"
+        WHITESPACE@[91; 92) " "
+        PLACEHOLDER_PAT@[92; 93)
+          UNDERSCORE@[92; 93) "_"
+        WHITESPACE@[93; 94) " "
+        EQ@[94; 95) "="
+        WHITESPACE@[95; 96) " "
+        LITERAL@[96; 108)
+          BYTE_STRING@[96; 108) "b\"bs\"invalid"
+        SEMI@[108; 109) ";"
+      WHITESPACE@[109; 110) "\n"
+      R_CURLY@[110; 111) "}"
+  WHITESPACE@[111; 112) "\n"
-- 
cgit v1.2.3


From 700669bbd0ab3ae0c5a56985ce13ca896d342a3a Mon Sep 17 00:00:00 2001
From: Aleksey Kladov <aleksey.kladov@gmail.com>
Date: Mon, 22 Jul 2019 17:56:19 +0300
Subject: kill old lexer

---
 crates/ra_syntax/src/parsing/lexer.rs          | 165 +++----------------------
 crates/ra_syntax/src/parsing/lexer/classes.rs  |  26 ----
 crates/ra_syntax/src/parsing/lexer/comments.rs |  57 ---------
 crates/ra_syntax/src/parsing/lexer/numbers.rs  |  66 ----------
 crates/ra_syntax/src/parsing/lexer/ptr.rs      | 162 ------------------------
 crates/ra_syntax/src/parsing/lexer/strings.rs  | 112 -----------------
 6 files changed, 17 insertions(+), 571 deletions(-)
 delete mode 100644 crates/ra_syntax/src/parsing/lexer/classes.rs
 delete mode 100644 crates/ra_syntax/src/parsing/lexer/comments.rs
 delete mode 100644 crates/ra_syntax/src/parsing/lexer/numbers.rs
 delete mode 100644 crates/ra_syntax/src/parsing/lexer/ptr.rs
 delete mode 100644 crates/ra_syntax/src/parsing/lexer/strings.rs

diff --git a/crates/ra_syntax/src/parsing/lexer.rs b/crates/ra_syntax/src/parsing/lexer.rs
index 1c818fdf4..2a4343b0a 100644
--- a/crates/ra_syntax/src/parsing/lexer.rs
+++ b/crates/ra_syntax/src/parsing/lexer.rs
@@ -1,22 +1,6 @@
-mod classes;
-mod comments;
-mod numbers;
-mod ptr;
-mod strings;
-
 use crate::{
     SyntaxKind::{self, *},
-    TextUnit, T,
-};
-
-use self::{
-    classes::*,
-    comments::{scan_comment, scan_shebang},
-    numbers::scan_number,
-    ptr::Ptr,
-    strings::{
-        is_string_literal_start, scan_byte_char_or_string, scan_char, scan_raw_string, scan_string,
-    },
+    TextUnit,
 };
 
 /// A token of Rust source.
@@ -141,138 +125,23 @@ pub fn tokenize(text: &str) -> Vec<Token> {
     acc
 }
 
-/// Get the next token from a string
-fn next_token(text: &str) -> Token {
-    assert!(!text.is_empty());
-    let mut ptr = Ptr::new(text);
-    let c = ptr.bump().unwrap();
-    let kind = next_token_inner(c, &mut ptr);
-    let len = ptr.into_len();
-    Token { kind, len }
-}
-
-fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
-    if is_whitespace(c) {
-        ptr.bump_while(is_whitespace);
-        return WHITESPACE;
-    }
-
-    match c {
-        '#' => {
-            if scan_shebang(ptr) {
-                return SHEBANG;
-            }
-        }
-        '/' => {
-            if let Some(kind) = scan_comment(ptr) {
-                return kind;
-            }
-        }
-        _ => (),
-    }
-
-    let ident_start = is_ident_start(c) && !is_string_literal_start(c, ptr.current(), ptr.nth(1));
-    if ident_start {
-        return scan_ident(c, ptr);
-    }
-
-    if is_dec_digit(c) {
-        let kind = scan_number(c, ptr);
-        scan_literal_suffix(ptr);
-        return kind;
-    }
-
-    // One-byte tokens.
-    if let Some(kind) = SyntaxKind::from_char(c) {
-        return kind;
-    }
-
-    match c {
-        // Possiblily multi-byte tokens,
-        // but we only produce single byte token now
-        // T![...], T![..], T![..=], T![.]
-        '.' => return T![.],
-        // T![::] T![:]
-        ':' => return T![:],
-        // T![==] FATARROW T![=]
-        '=' => return T![=],
-        // T![!=] T![!]
-        '!' => return T![!],
-        // T![->] T![-]
-        '-' => return T![-],
-
-        // If the character is an ident start not followed by another single
-        // quote, then this is a lifetime name:
-        '\'' => {
-            return if ptr.at_p(is_ident_start) && !ptr.at_str("''") {
-                ptr.bump();
-                while ptr.at_p(is_ident_continue) {
-                    ptr.bump();
-                }
-                // lifetimes shouldn't end with a single quote
-                // if we find one, then this is an invalid character literal
-                if ptr.at('\'') {
-                    ptr.bump();
-                    return CHAR;
-                }
-                LIFETIME
-            } else {
-                scan_char(ptr);
-                scan_literal_suffix(ptr);
-                CHAR
-            };
-        }
-        'b' => {
-            let kind = scan_byte_char_or_string(ptr);
-            scan_literal_suffix(ptr);
-            return kind;
-        }
-        '"' => {
-            scan_string(ptr);
-            scan_literal_suffix(ptr);
-            return STRING;
-        }
-        'r' => {
-            scan_raw_string(ptr);
-            scan_literal_suffix(ptr);
-            return RAW_STRING;
-        }
-        _ => (),
-    }
-    ERROR
-}
-
-fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind {
-    let is_raw = match (c, ptr.current()) {
-        ('r', Some('#')) => {
-            ptr.bump();
-            true
-        }
-        ('_', None) => return T![_],
-        ('_', Some(c)) if !is_ident_continue(c) => return T![_],
-        _ => false,
-    };
-    ptr.bump_while(is_ident_continue);
-    if !is_raw {
-        if let Some(kind) = SyntaxKind::from_keyword(ptr.current_token_text()) {
-            return kind;
-        }
-    }
-    IDENT
-}
-
-fn scan_literal_suffix(ptr: &mut Ptr) {
-    if ptr.at_p(is_ident_start) {
-        ptr.bump();
-    }
-    ptr.bump_while(is_ident_continue);
-}
-
 pub fn classify_literal(text: &str) -> Option<Token> {
-    let tkn = next_token(text);
-    if !tkn.kind.is_literal() || tkn.len.to_usize() != text.len() {
+    let t = ra_rustc_lexer::first_token(text); // lex only the leading token of `text`
+    if t.len != text.len() { // that token must span the entire input
         return None;
     }
-
-    Some(tkn)
+    let kind = match t.kind { // map the lexer's literal kind onto a SyntaxKind
+        ra_rustc_lexer::TokenKind::Literal { kind, .. } => match kind {
+            ra_rustc_lexer::LiteralKind::Int { .. } => INT_NUMBER,
+            ra_rustc_lexer::LiteralKind::Float { .. } => FLOAT_NUMBER,
+            ra_rustc_lexer::LiteralKind::Char { .. } => CHAR,
+            ra_rustc_lexer::LiteralKind::Byte { .. } => BYTE,
+            ra_rustc_lexer::LiteralKind::Str { .. } => STRING,
+            ra_rustc_lexer::LiteralKind::ByteStr { .. } => BYTE_STRING,
+            ra_rustc_lexer::LiteralKind::RawStr { .. } => RAW_STRING,
+            ra_rustc_lexer::LiteralKind::RawByteStr { .. } => RAW_BYTE_STRING,
+        },
+        _ => return None, // every non-literal token kind is rejected
+    };
+    Some(Token { kind, len: TextUnit::from_usize(t.len) })
 }
diff --git a/crates/ra_syntax/src/parsing/lexer/classes.rs b/crates/ra_syntax/src/parsing/lexer/classes.rs
deleted file mode 100644
index 4235d2648..000000000
--- a/crates/ra_syntax/src/parsing/lexer/classes.rs
+++ /dev/null
@@ -1,26 +0,0 @@
-use unicode_xid::UnicodeXID;
-
-pub fn is_ident_start(c: char) -> bool {
-    (c >= 'a' && c <= 'z')
-        || (c >= 'A' && c <= 'Z')
-        || c == '_'
-        || (c > '\x7f' && UnicodeXID::is_xid_start(c))
-}
-
-pub fn is_ident_continue(c: char) -> bool {
-    (c >= 'a' && c <= 'z')
-        || (c >= 'A' && c <= 'Z')
-        || (c >= '0' && c <= '9')
-        || c == '_'
-        || (c > '\x7f' && UnicodeXID::is_xid_continue(c))
-}
-
-pub fn is_whitespace(c: char) -> bool {
-    //FIXME: use is_pattern_whitespace
-    //https://github.com/behnam/rust-unic/issues/192
-    c.is_whitespace()
-}
-
-pub fn is_dec_digit(c: char) -> bool {
-    '0' <= c && c <= '9'
-}
diff --git a/crates/ra_syntax/src/parsing/lexer/comments.rs b/crates/ra_syntax/src/parsing/lexer/comments.rs
deleted file mode 100644
index 8bbbe659b..000000000
--- a/crates/ra_syntax/src/parsing/lexer/comments.rs
+++ /dev/null
@@ -1,57 +0,0 @@
-use crate::parsing::lexer::ptr::Ptr;
-
-use crate::SyntaxKind::{self, *};
-
-pub(crate) fn scan_shebang(ptr: &mut Ptr) -> bool {
-    if ptr.at_str("!/") {
-        ptr.bump();
-        ptr.bump();
-        bump_until_eol(ptr);
-        true
-    } else {
-        false
-    }
-}
-
-fn scan_block_comment(ptr: &mut Ptr) -> Option<SyntaxKind> {
-    if ptr.at('*') {
-        ptr.bump();
-        let mut depth: u32 = 1;
-        while depth > 0 {
-            if ptr.at_str("*/") {
-                depth -= 1;
-                ptr.bump();
-                ptr.bump();
-            } else if ptr.at_str("/*") {
-                depth += 1;
-                ptr.bump();
-                ptr.bump();
-            } else if ptr.bump().is_none() {
-                break;
-            }
-        }
-        Some(COMMENT)
-    } else {
-        None
-    }
-}
-
-pub(crate) fn scan_comment(ptr: &mut Ptr) -> Option<SyntaxKind> {
-    if ptr.at('/') {
-        bump_until_eol(ptr);
-        Some(COMMENT)
-    } else {
-        scan_block_comment(ptr)
-    }
-}
-
-fn bump_until_eol(ptr: &mut Ptr) {
-    loop {
-        if ptr.at('\n') || ptr.at_str("\r\n") {
-            return;
-        }
-        if ptr.bump().is_none() {
-            break;
-        }
-    }
-}
diff --git a/crates/ra_syntax/src/parsing/lexer/numbers.rs b/crates/ra_syntax/src/parsing/lexer/numbers.rs
deleted file mode 100644
index e53ae231b..000000000
--- a/crates/ra_syntax/src/parsing/lexer/numbers.rs
+++ /dev/null
@@ -1,66 +0,0 @@
-use crate::parsing::lexer::{classes::*, ptr::Ptr};
-
-use crate::SyntaxKind::{self, *};
-
-pub(crate) fn scan_number(c: char, ptr: &mut Ptr) -> SyntaxKind {
-    if c == '0' {
-        match ptr.current().unwrap_or('\0') {
-            'b' | 'o' => {
-                ptr.bump();
-                scan_digits(ptr, false);
-            }
-            'x' => {
-                ptr.bump();
-                scan_digits(ptr, true);
-            }
-            '0'..='9' | '_' | '.' | 'e' | 'E' => {
-                scan_digits(ptr, true);
-            }
-            _ => return INT_NUMBER,
-        }
-    } else {
-        scan_digits(ptr, false);
-    }
-
-    // might be a float, but don't be greedy if this is actually an
-    // integer literal followed by field/method access or a range pattern
-    // (`0..2` and `12.foo()`)
-    if ptr.at('.') && !(ptr.at_str("..") || ptr.nth_is_p(1, is_ident_start)) {
-        // might have stuff after the ., and if it does, it needs to start
-        // with a number
-        ptr.bump();
-        scan_digits(ptr, false);
-        scan_float_exponent(ptr);
-        return FLOAT_NUMBER;
-    }
-    // it might be a float if it has an exponent
-    if ptr.at('e') || ptr.at('E') {
-        scan_float_exponent(ptr);
-        return FLOAT_NUMBER;
-    }
-    INT_NUMBER
-}
-
-fn scan_digits(ptr: &mut Ptr, allow_hex: bool) {
-    while let Some(c) = ptr.current() {
-        match c {
-            '_' | '0'..='9' => {
-                ptr.bump();
-            }
-            'a'..='f' | 'A'..='F' if allow_hex => {
-                ptr.bump();
-            }
-            _ => return,
-        }
-    }
-}
-
-fn scan_float_exponent(ptr: &mut Ptr) {
-    if ptr.at('e') || ptr.at('E') {
-        ptr.bump();
-        if ptr.at('-') || ptr.at('+') {
-            ptr.bump();
-        }
-        scan_digits(ptr, false);
-    }
-}
diff --git a/crates/ra_syntax/src/parsing/lexer/ptr.rs b/crates/ra_syntax/src/parsing/lexer/ptr.rs
deleted file mode 100644
index c341c4176..000000000
--- a/crates/ra_syntax/src/parsing/lexer/ptr.rs
+++ /dev/null
@@ -1,162 +0,0 @@
-use crate::TextUnit;
-
-use std::str::Chars;
-
-/// A simple view into the characters of a string.
-pub(crate) struct Ptr<'s> {
-    text: &'s str,
-    len: TextUnit,
-}
-
-impl<'s> Ptr<'s> {
-    /// Creates a new `Ptr` from a string.
-    pub fn new(text: &'s str) -> Ptr<'s> {
-        Ptr { text, len: 0.into() }
-    }
-
-    /// Gets the length of the remaining string.
-    pub fn into_len(self) -> TextUnit {
-        self.len
-    }
-
-    /// Gets the current character, if one exists.
-    pub fn current(&self) -> Option<char> {
-        self.chars().next()
-    }
-
-    /// Gets the nth character from the current.
-    /// For example, 0 will return the current character, 1 will return the next, etc.
-    pub fn nth(&self, n: u32) -> Option<char> {
-        self.chars().nth(n as usize)
-    }
-
-    /// Checks whether the current character is `c`.
-    pub fn at(&self, c: char) -> bool {
-        self.current() == Some(c)
-    }
-
-    /// Checks whether the next characters match `s`.
-    pub fn at_str(&self, s: &str) -> bool {
-        let chars = self.chars();
-        chars.as_str().starts_with(s)
-    }
-
-    /// Checks whether the current character satisfies the predicate `p`.
-    pub fn at_p<P: Fn(char) -> bool>(&self, p: P) -> bool {
-        self.current().map(p) == Some(true)
-    }
-
-    /// Checks whether the nth character satisfies the predicate `p`.
-    pub fn nth_is_p<P: Fn(char) -> bool>(&self, n: u32, p: P) -> bool {
-        self.nth(n).map(p) == Some(true)
-    }
-
-    /// Moves to the next character.
-    pub fn bump(&mut self) -> Option<char> {
-        let ch = self.chars().next()?;
-        self.len += TextUnit::of_char(ch);
-        Some(ch)
-    }
-
-    /// Moves to the next character as long as `pred` is satisfied.
-    pub fn bump_while<F: Fn(char) -> bool>(&mut self, pred: F) {
-        loop {
-            match self.current() {
-                Some(c) if pred(c) => {
-                    self.bump();
-                }
-                _ => return,
-            }
-        }
-    }
-
-    /// Returns the text up to the current point.
-    pub fn current_token_text(&self) -> &str {
-        let len: u32 = self.len.into();
-        &self.text[..len as usize]
-    }
-
-    /// Returns an iterator over the remaining characters.
-    fn chars(&self) -> Chars {
-        let len: u32 = self.len.into();
-        self.text[len as usize..].chars()
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_current() {
-        let ptr = Ptr::new("test");
-        assert_eq!(ptr.current(), Some('t'));
-    }
-
-    #[test]
-    fn test_nth() {
-        let ptr = Ptr::new("test");
-        assert_eq!(ptr.nth(0), Some('t'));
-        assert_eq!(ptr.nth(1), Some('e'));
-        assert_eq!(ptr.nth(2), Some('s'));
-        assert_eq!(ptr.nth(3), Some('t'));
-        assert_eq!(ptr.nth(4), None);
-    }
-
-    #[test]
-    fn test_at() {
-        let ptr = Ptr::new("test");
-        assert!(ptr.at('t'));
-        assert!(!ptr.at('a'));
-    }
-
-    #[test]
-    fn test_at_str() {
-        let ptr = Ptr::new("test");
-        assert!(ptr.at_str("t"));
-        assert!(ptr.at_str("te"));
-        assert!(ptr.at_str("test"));
-        assert!(!ptr.at_str("tests"));
-        assert!(!ptr.at_str("rust"));
-    }
-
-    #[test]
-    fn test_at_p() {
-        let ptr = Ptr::new("test");
-        assert!(ptr.at_p(|c| c == 't'));
-        assert!(!ptr.at_p(|c| c == 'e'));
-    }
-
-    #[test]
-    fn test_nth_is_p() {
-        let ptr = Ptr::new("test");
-        assert!(ptr.nth_is_p(0, |c| c == 't'));
-        assert!(!ptr.nth_is_p(1, |c| c == 't'));
-        assert!(ptr.nth_is_p(3, |c| c == 't'));
-        assert!(!ptr.nth_is_p(150, |c| c == 't'));
-    }
-
-    #[test]
-    fn test_bump() {
-        let mut ptr = Ptr::new("test");
-        assert_eq!(ptr.current(), Some('t'));
-        ptr.bump();
-        assert_eq!(ptr.current(), Some('e'));
-        ptr.bump();
-        assert_eq!(ptr.current(), Some('s'));
-        ptr.bump();
-        assert_eq!(ptr.current(), Some('t'));
-        ptr.bump();
-        assert_eq!(ptr.current(), None);
-        ptr.bump();
-        assert_eq!(ptr.current(), None);
-    }
-
-    #[test]
-    fn test_bump_while() {
-        let mut ptr = Ptr::new("test");
-        assert_eq!(ptr.current(), Some('t'));
-        ptr.bump_while(|c| c != 's');
-        assert_eq!(ptr.current(), Some('s'));
-    }
-}
diff --git a/crates/ra_syntax/src/parsing/lexer/strings.rs b/crates/ra_syntax/src/parsing/lexer/strings.rs
deleted file mode 100644
index f74acff9e..000000000
--- a/crates/ra_syntax/src/parsing/lexer/strings.rs
+++ /dev/null
@@ -1,112 +0,0 @@
-use crate::{
-    parsing::lexer::ptr::Ptr,
-    SyntaxKind::{self, *},
-};
-
-pub(crate) fn is_string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool {
-    match (c, c1, c2) {
-        ('r', Some('"'), _)
-        | ('r', Some('#'), Some('"'))
-        | ('r', Some('#'), Some('#'))
-        | ('b', Some('"'), _)
-        | ('b', Some('\''), _)
-        | ('b', Some('r'), Some('"'))
-        | ('b', Some('r'), Some('#')) => true,
-        _ => false,
-    }
-}
-
-pub(crate) fn scan_char(ptr: &mut Ptr) {
-    while let Some(c) = ptr.current() {
-        match c {
-            '\\' => {
-                ptr.bump();
-                if ptr.at('\\') || ptr.at('\'') {
-                    ptr.bump();
-                }
-            }
-            '\'' => {
-                ptr.bump();
-                return;
-            }
-            '\n' => return,
-            _ => {
-                ptr.bump();
-            }
-        }
-    }
-}
-
-pub(crate) fn scan_byte_char_or_string(ptr: &mut Ptr) -> SyntaxKind {
-    // unwrapping and not-exhaustive match are ok
-    // because of string_literal_start
-    let c = ptr.bump().unwrap();
-    match c {
-        '\'' => {
-            scan_byte(ptr);
-            BYTE
-        }
-        '"' => {
-            scan_byte_string(ptr);
-            BYTE_STRING
-        }
-        'r' => {
-            scan_raw_string(ptr);
-            RAW_BYTE_STRING
-        }
-        _ => unreachable!(),
-    }
-}
-
-pub(crate) fn scan_string(ptr: &mut Ptr) {
-    while let Some(c) = ptr.current() {
-        match c {
-            '\\' => {
-                ptr.bump();
-                if ptr.at('\\') || ptr.at('"') {
-                    ptr.bump();
-                }
-            }
-            '"' => {
-                ptr.bump();
-                return;
-            }
-            _ => {
-                ptr.bump();
-            }
-        }
-    }
-}
-
-pub(crate) fn scan_raw_string(ptr: &mut Ptr) {
-    let mut hashes = 0;
-    while ptr.at('#') {
-        hashes += 1;
-        ptr.bump();
-    }
-    if !ptr.at('"') {
-        return;
-    }
-    ptr.bump();
-
-    while let Some(c) = ptr.bump() {
-        if c == '"' {
-            let mut hashes_left = hashes;
-            while ptr.at('#') && hashes_left > 0 {
-                hashes_left -= 1;
-                ptr.bump();
-            }
-            if hashes_left == 0 {
-                return;
-            }
-        }
-    }
-}
-
-fn scan_byte(ptr: &mut Ptr) {
-    scan_char(ptr)
-}
-
-fn scan_byte_string(ptr: &mut Ptr) {
-    scan_string(ptr)
-}
-- 
cgit v1.2.3