1 files changed, 215 insertions, 0 deletions
diff --git a/crates/ra_syntax/src/parsing/lexer.rs b/crates/ra_syntax/src/parsing/lexer.rs
new file mode 100644
index 000000000..f9362120e
--- /dev/null
+++ b/crates/ra_syntax/src/parsing/lexer.rs
@@ -0,0 +1,215 @@
+mod classes;
+mod comments;
+mod numbers;
+mod ptr;
+mod strings;
+use crate::{
+    SyntaxKind::{self, *},
+    TextUnit,
+};
+use self::{
+    classes::*,
+    comments::{scan_comment, scan_shebang},
+    numbers::scan_number,
+    ptr::Ptr,
+    strings::{
+        is_string_literal_start, scan_byte_char_or_string, scan_char, scan_raw_string, scan_string,
+    },
+};
+/// A token of Rust source.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct Token {
+    /// The kind of token.
+    pub kind: SyntaxKind,
+    /// The length of the token.
+    pub len: TextUnit,
+}
+/// Break a string up into its component tokens
+pub fn tokenize(text: &str) -> Vec<Token> {
+    let mut text = text;
+    let mut acc = Vec::new();
+    while !text.is_empty() {
+        let token = next_token(text);
+        acc.push(token);
+        let len: u32 = token.len.into();
+        text = &text[len as usize..];
+    }
+    acc
+}
+/// Get the next token from a string
+pub fn next_token(text: &str) -> Token {
+    assert!(!text.is_empty());
+    let mut ptr = Ptr::new(text);
+    let c = ptr.bump().unwrap();
+    let kind = next_token_inner(c, &mut ptr);
+    let len = ptr.into_len();
+    Token { kind, len }
+}
+fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
+    if is_whitespace(c) {
+        ptr.bump_while(is_whitespace);
+        return WHITESPACE;
+    }
+    match c {
+        '#' => {
+            if scan_shebang(ptr) {
+                return SHEBANG;
+            }
+        }
+        '/' => {
+            if let Some(kind) = scan_comment(ptr) {
+                return kind;
+            }
+        }
+        _ => (),
+    }
+    let ident_start = is_ident_start(c) && !is_string_literal_start(c, ptr.current(), ptr.nth(1));
+    if ident_start {
+        return scan_ident(c, ptr);
+    }
+    if is_dec_digit(c) {
+        let kind = scan_number(c, ptr);
+        scan_literal_suffix(ptr);
+        return kind;
+    }
+    // One-byte tokens.
+    if let Some(kind) = SyntaxKind::from_char(c) {
+        return kind;
+    }
+    match c {
+        // Multi-byte tokens.
+        '.' => {
+            return match (ptr.current(), ptr.nth(1)) {
+                (Some('.'), Some('.')) => {
+                    ptr.bump();
+                    ptr.bump();
+                    DOTDOTDOT
+                }
+                (Some('.'), Some('=')) => {
+                    ptr.bump();
+                    ptr.bump();
+                    DOTDOTEQ
+                }
+                (Some('.'), _) => {
+                    ptr.bump();
+                    DOTDOT
+                }
+                _ => DOT,
+            };
+        }
+        ':' => {
+            return match ptr.current() {
+                Some(':') => {
+                    ptr.bump();
+                    COLONCOLON
+                }
+                _ => COLON,
+            };
+        }
+        '=' => {
+            return match ptr.current() {
+                Some('=') => {
+                    ptr.bump();
+                    EQEQ
+                }
+                Some('>') => {
+                    ptr.bump();
+                    FAT_ARROW
+                }
+                _ => EQ,
+            };
+        }
+        '!' => {
+            return match ptr.current() {
+                Some('=') => {
+                    ptr.bump();
+                    NEQ
+                }
+                _ => EXCL,
+            };
+        }
+        '-' => {
+            return if ptr.at('>') {
+                ptr.bump();
+                THIN_ARROW
+            } else {
+                MINUS
+            };
+        }
+        // If the character is an ident start not followed by another single
+        // quote, then this is a lifetime name:
+        '\'' => {
+            return if ptr.at_p(is_ident_start) && !ptr.at_str("''") {
+                ptr.bump();
+                while ptr.at_p(is_ident_continue) {
+                    ptr.bump();
+                }
+                // lifetimes shouldn't end with a single quote
+                // if we find one, then this is an invalid character literal
+                if ptr.at('\'') {
+                    ptr.bump();
+                    return CHAR;
+                }
+                LIFETIME
+            } else {
+                scan_char(ptr);
+                scan_literal_suffix(ptr);
+                CHAR
+            };
+        }
+        'b' => {
+            let kind = scan_byte_char_or_string(ptr);
+            scan_literal_suffix(ptr);
+            return kind;
+        }
+        '"' => {
+            scan_string(ptr);
+            scan_literal_suffix(ptr);
+            return STRING;
+        }
+        'r' => {
+            scan_raw_string(ptr);
+            scan_literal_suffix(ptr);
+            return RAW_STRING;
+        }
+        _ => (),
+    }
+    ERROR
+}
+fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind {
+    let is_raw = match (c, ptr.current()) {
+        ('r', Some('#')) => {
+            ptr.bump();
+            true
+        }
+        ('_', Some(c)) if !is_ident_continue(c) => return UNDERSCORE,
+        _ => false,
+    };
+    ptr.bump_while(is_ident_continue);
+    if !is_raw {
+        if let Some(kind) = SyntaxKind::from_keyword(ptr.current_token_text()) {
+            return kind;
+        }
+    }
+    IDENT
+}
+fn scan_literal_suffix(ptr: &mut Ptr) {
+    if ptr.at_p(is_ident_start) {
+        ptr.bump();
+    }
+    ptr.bump_while(is_ident_continue);
+}

diff --git a/crates/ra_syntax/src/parsing/lexer.rs b/crates/ra_syntax/src/parsing/lexer.rs new file mode 100644 index 000000000..f9362120e --- /dev/null +++ b/crates/ra_syntax/src/parsing/lexer.rs
@@ -0,0 +1,215 @@
	1	mod classes;
	2	mod comments;
	3	mod numbers;
	4	mod ptr;
	5	mod strings;
	6
	7	use crate::{
	8	SyntaxKind::{self, *},
	9	TextUnit,
	10	};
	11
	12	use self::{
	13	classes::*,
	14	comments::{scan_comment, scan_shebang},
	15	numbers::scan_number,
	16	ptr::Ptr,
	17	strings::{
	18	is_string_literal_start, scan_byte_char_or_string, scan_char, scan_raw_string, scan_string,
	19	},
	20	};
	21
	22	/// A token of Rust source.
	23	#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
	24	pub struct Token {
	25	/// The kind of token.
	26	pub kind: SyntaxKind,
	27	/// The length of the token.
	28	pub len: TextUnit,
	29	}
	30
	31	/// Break a string up into its component tokens
	32	pub fn tokenize(text: &str) -> Vec<Token> {
	33	let mut text = text;
	34	let mut acc = Vec::new();
	35	while !text.is_empty() {
	36	let token = next_token(text);
	37	acc.push(token);
	38	let len: u32 = token.len.into();
	39	text = &text[len as usize..];
	40	}
	41	acc
	42	}
	43
	44	/// Get the next token from a string
	45	pub fn next_token(text: &str) -> Token {
	46	assert!(!text.is_empty());
	47	let mut ptr = Ptr::new(text);
	48	let c = ptr.bump().unwrap();
	49	let kind = next_token_inner(c, &mut ptr);
	50	let len = ptr.into_len();
	51	Token { kind, len }
	52	}
	53
	54	fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
	55	if is_whitespace(c) {
	56	ptr.bump_while(is_whitespace);
	57	return WHITESPACE;
	58	}
	59
	60	match c {
	61	'#' => {
	62	if scan_shebang(ptr) {
	63	return SHEBANG;
	64	}
	65	}
	66	'/' => {
	67	if let Some(kind) = scan_comment(ptr) {
	68	return kind;
	69	}
	70	}
	71	_ => (),
	72	}
	73
	74	let ident_start = is_ident_start(c) && !is_string_literal_start(c, ptr.current(), ptr.nth(1));
	75	if ident_start {
	76	return scan_ident(c, ptr);
	77	}
	78
	79	if is_dec_digit(c) {
	80	let kind = scan_number(c, ptr);
	81	scan_literal_suffix(ptr);
	82	return kind;
	83	}
	84
	85	// One-byte tokens.
	86	if let Some(kind) = SyntaxKind::from_char(c) {
	87	return kind;
	88	}
	89
	90	match c {
	91	// Multi-byte tokens.
	92	'.' => {
	93	return match (ptr.current(), ptr.nth(1)) {
	94	(Some('.'), Some('.')) => {
	95	ptr.bump();
	96	ptr.bump();
	97	DOTDOTDOT
	98	}
	99	(Some('.'), Some('=')) => {
	100	ptr.bump();
	101	ptr.bump();
	102	DOTDOTEQ
	103	}
	104	(Some('.'), _) => {
	105	ptr.bump();
	106	DOTDOT
	107	}
	108	_ => DOT,
	109	};
	110	}
	111	':' => {
	112	return match ptr.current() {
	113	Some(':') => {
	114	ptr.bump();
	115	COLONCOLON
	116	}
	117	_ => COLON,
	118	};
	119	}
	120	'=' => {
	121	return match ptr.current() {
	122	Some('=') => {
	123	ptr.bump();
	124	EQEQ
	125	}
	126	Some('>') => {
	127	ptr.bump();
	128	FAT_ARROW
	129	}
	130	_ => EQ,
	131	};
	132	}
	133	'!' => {
	134	return match ptr.current() {
	135	Some('=') => {
	136	ptr.bump();
	137	NEQ
	138	}
	139	_ => EXCL,
	140	};
	141	}
	142	'-' => {
	143	return if ptr.at('>') {
	144	ptr.bump();
	145	THIN_ARROW
	146	} else {
	147	MINUS
	148	};
	149	}
	150
	151	// If the character is an ident start not followed by another single
	152	// quote, then this is a lifetime name:
	153	'\'' => {
	154	return if ptr.at_p(is_ident_start) && !ptr.at_str("''") {
	155	ptr.bump();
	156	while ptr.at_p(is_ident_continue) {
	157	ptr.bump();
	158	}
	159	// lifetimes shouldn't end with a single quote
	160	// if we find one, then this is an invalid character literal
	161	if ptr.at('\'') {
	162	ptr.bump();
	163	return CHAR;
	164	}
	165	LIFETIME
	166	} else {
	167	scan_char(ptr);
	168	scan_literal_suffix(ptr);
	169	CHAR
	170	};
	171	}
	172	'b' => {
	173	let kind = scan_byte_char_or_string(ptr);
	174	scan_literal_suffix(ptr);
	175	return kind;
	176	}
	177	'"' => {
	178	scan_string(ptr);
	179	scan_literal_suffix(ptr);
	180	return STRING;
	181	}
	182	'r' => {
	183	scan_raw_string(ptr);
	184	scan_literal_suffix(ptr);
	185	return RAW_STRING;
	186	}
	187	_ => (),
	188	}
	189	ERROR
	190	}
	191
	192	fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind {
	193	let is_raw = match (c, ptr.current()) {
	194	('r', Some('#')) => {
	195	ptr.bump();
	196	true
	197	}
	198	('_', Some(c)) if !is_ident_continue(c) => return UNDERSCORE,
	199	_ => false,
	200	};
	201	ptr.bump_while(is_ident_continue);
	202	if !is_raw {
	203	if let Some(kind) = SyntaxKind::from_keyword(ptr.current_token_text()) {
	204	return kind;
	205	}
	206	}
	207	IDENT
	208	}
	209
	210	fn scan_literal_suffix(ptr: &mut Ptr) {
	211	if ptr.at_p(is_ident_start) {
	212	ptr.bump();
	213	}
	214	ptr.bump_while(is_ident_continue);
	215	}