diff options
Diffstat (limited to 'src/lexer')
-rw-r--r-- | src/lexer/classes.rs | 9 | ||||
-rw-r--r-- | src/lexer/comments.rs | 5 | ||||
-rw-r--r-- | src/lexer/mod.rs | 145 | ||||
-rw-r--r-- | src/lexer/numbers.rs | 6 | ||||
-rw-r--r-- | src/lexer/ptr.rs | 11 | ||||
-rw-r--r-- | src/lexer/strings.rs | 33 |
6 files changed, 109 insertions, 100 deletions
diff --git a/src/lexer/classes.rs b/src/lexer/classes.rs index 4235d2648..7fed008af 100644 --- a/src/lexer/classes.rs +++ b/src/lexer/classes.rs | |||
@@ -1,17 +1,12 @@ | |||
1 | use unicode_xid::UnicodeXID; | 1 | use unicode_xid::UnicodeXID; |
2 | 2 | ||
3 | pub fn is_ident_start(c: char) -> bool { | 3 | pub fn is_ident_start(c: char) -> bool { |
4 | (c >= 'a' && c <= 'z') | 4 | (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' |
5 | || (c >= 'A' && c <= 'Z') | ||
6 | || c == '_' | ||
7 | || (c > '\x7f' && UnicodeXID::is_xid_start(c)) | 5 | || (c > '\x7f' && UnicodeXID::is_xid_start(c)) |
8 | } | 6 | } |
9 | 7 | ||
10 | pub fn is_ident_continue(c: char) -> bool { | 8 | pub fn is_ident_continue(c: char) -> bool { |
11 | (c >= 'a' && c <= 'z') | 9 | (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' |
12 | || (c >= 'A' && c <= 'Z') | ||
13 | || (c >= '0' && c <= '9') | ||
14 | || c == '_' | ||
15 | || (c > '\x7f' && UnicodeXID::is_xid_continue(c)) | 10 | || (c > '\x7f' && UnicodeXID::is_xid_continue(c)) |
16 | } | 11 | } |
17 | 12 | ||
diff --git a/src/lexer/comments.rs b/src/lexer/comments.rs index 79782cc5b..b70f2c6c6 100644 --- a/src/lexer/comments.rs +++ b/src/lexer/comments.rs | |||
@@ -1,6 +1,6 @@ | |||
1 | use lexer::ptr::Ptr; | 1 | use lexer::ptr::Ptr; |
2 | 2 | ||
3 | use {SyntaxKind}; | 3 | use SyntaxKind; |
4 | use syntax_kinds::*; | 4 | use syntax_kinds::*; |
5 | 5 | ||
6 | pub(crate) fn scan_shebang(ptr: &mut Ptr) -> bool { | 6 | pub(crate) fn scan_shebang(ptr: &mut Ptr) -> bool { |
@@ -23,7 +23,6 @@ pub(crate) fn scan_comment(ptr: &mut Ptr) -> Option<SyntaxKind> { | |||
23 | } | 23 | } |
24 | } | 24 | } |
25 | 25 | ||
26 | |||
27 | fn bump_until_eol(ptr: &mut Ptr) { | 26 | fn bump_until_eol(ptr: &mut Ptr) { |
28 | loop { | 27 | loop { |
29 | if ptr.next_is('\n') || ptr.next_is('\r') && ptr.nnext_is('\n') { | 28 | if ptr.next_is('\n') || ptr.next_is('\r') && ptr.nnext_is('\n') { |
@@ -33,4 +32,4 @@ fn bump_until_eol(ptr: &mut Ptr) { | |||
33 | break; | 32 | break; |
34 | } | 33 | } |
35 | } | 34 | } |
36 | } \ No newline at end of file | 35 | } |
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index 842059a42..2f8d3a402 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs | |||
@@ -1,4 +1,4 @@ | |||
1 | use {Token, SyntaxKind}; | 1 | use {SyntaxKind, Token}; |
2 | use syntax_kinds::*; | 2 | use syntax_kinds::*; |
3 | 3 | ||
4 | mod ptr; | 4 | mod ptr; |
@@ -11,10 +11,11 @@ mod numbers; | |||
11 | use self::numbers::scan_number; | 11 | use self::numbers::scan_number; |
12 | 12 | ||
13 | mod strings; | 13 | mod strings; |
14 | use self::strings::{is_string_literal_start, scan_char, scan_byte_char_or_string, scan_string, scan_raw_string}; | 14 | use self::strings::{is_string_literal_start, scan_byte_char_or_string, scan_char, scan_raw_string, |
15 | scan_string}; | ||
15 | 16 | ||
16 | mod comments; | 17 | mod comments; |
17 | use self::comments::{scan_shebang, scan_comment}; | 18 | use self::comments::{scan_comment, scan_shebang}; |
18 | 19 | ||
19 | pub fn tokenize(text: &str) -> Vec<Token> { | 20 | pub fn tokenize(text: &str) -> Vec<Token> { |
20 | let mut text = text; | 21 | let mut text = text; |
@@ -45,10 +46,10 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind { | |||
45 | match c { | 46 | match c { |
46 | '#' => if scan_shebang(ptr) { | 47 | '#' => if scan_shebang(ptr) { |
47 | return SHEBANG; | 48 | return SHEBANG; |
48 | } | 49 | }, |
49 | '/' => if let Some(kind) = scan_comment(ptr) { | 50 | '/' => if let Some(kind) = scan_comment(ptr) { |
50 | return kind; | 51 | return kind; |
51 | } | 52 | }, |
52 | _ => (), | 53 | _ => (), |
53 | } | 54 | } |
54 | 55 | ||
@@ -89,79 +90,91 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind { | |||
89 | '%' => return PERCENT, | 90 | '%' => return PERCENT, |
90 | 91 | ||
91 | // Multi-byte tokens. | 92 | // Multi-byte tokens. |
92 | '.' => return match (ptr.next(), ptr.nnext()) { | 93 | '.' => { |
93 | (Some('.'), Some('.')) => { | 94 | return match (ptr.next(), ptr.nnext()) { |
94 | ptr.bump(); | 95 | (Some('.'), Some('.')) => { |
95 | ptr.bump(); | 96 | ptr.bump(); |
96 | DOTDOTDOT | 97 | ptr.bump(); |
97 | }, | 98 | DOTDOTDOT |
98 | (Some('.'), Some('=')) => { | 99 | } |
99 | ptr.bump(); | 100 | (Some('.'), Some('=')) => { |
100 | ptr.bump(); | 101 | ptr.bump(); |
101 | DOTDOTEQ | 102 | ptr.bump(); |
102 | }, | 103 | DOTDOTEQ |
103 | (Some('.'), _) => { | 104 | } |
104 | ptr.bump(); | 105 | (Some('.'), _) => { |
105 | DOTDOT | 106 | ptr.bump(); |
106 | }, | 107 | DOTDOT |
107 | _ => DOT | 108 | } |
108 | }, | 109 | _ => DOT, |
109 | ':' => return match ptr.next() { | ||
110 | Some(':') => { | ||
111 | ptr.bump(); | ||
112 | COLONCOLON | ||
113 | } | 110 | } |
114 | _ => COLON | 111 | } |
115 | }, | 112 | ':' => { |
116 | '=' => return match ptr.next() { | 113 | return match ptr.next() { |
117 | Some('=') => { | 114 | Some(':') => { |
118 | ptr.bump(); | 115 | ptr.bump(); |
119 | EQEQ | 116 | COLONCOLON |
117 | } | ||
118 | _ => COLON, | ||
120 | } | 119 | } |
121 | Some('>') => { | 120 | } |
122 | ptr.bump(); | 121 | '=' => { |
123 | FAT_ARROW | 122 | return match ptr.next() { |
123 | Some('=') => { | ||
124 | ptr.bump(); | ||
125 | EQEQ | ||
126 | } | ||
127 | Some('>') => { | ||
128 | ptr.bump(); | ||
129 | FAT_ARROW | ||
130 | } | ||
131 | _ => EQ, | ||
124 | } | 132 | } |
125 | _ => EQ, | 133 | } |
126 | }, | 134 | '!' => { |
127 | '!' => return match ptr.next() { | 135 | return match ptr.next() { |
128 | Some('=') => { | 136 | Some('=') => { |
137 | ptr.bump(); | ||
138 | NEQ | ||
139 | } | ||
140 | _ => EXCL, | ||
141 | } | ||
142 | } | ||
143 | '-' => { | ||
144 | return if ptr.next_is('>') { | ||
129 | ptr.bump(); | 145 | ptr.bump(); |
130 | NEQ | 146 | THIN_ARROW |
147 | } else { | ||
148 | MINUS | ||
131 | } | 149 | } |
132 | _ => EXCL, | 150 | } |
133 | }, | ||
134 | '-' => return if ptr.next_is('>') { | ||
135 | ptr.bump(); | ||
136 | THIN_ARROW | ||
137 | } else { | ||
138 | MINUS | ||
139 | }, | ||
140 | 151 | ||
141 | // If the character is an ident start not followed by another single | 152 | // If the character is an ident start not followed by another single |
142 | // quote, then this is a lifetime name: | 153 | // quote, then this is a lifetime name: |
143 | '\'' => return if ptr.next_is_p(is_ident_start) && !ptr.nnext_is('\'') { | 154 | '\'' => { |
144 | ptr.bump(); | 155 | return if ptr.next_is_p(is_ident_start) && !ptr.nnext_is('\'') { |
145 | while ptr.next_is_p(is_ident_continue) { | ||
146 | ptr.bump(); | ||
147 | } | ||
148 | // lifetimes shouldn't end with a single quote | ||
149 | // if we find one, then this is an invalid character literal | ||
150 | if ptr.next_is('\'') { | ||
151 | ptr.bump(); | 156 | ptr.bump(); |
152 | return CHAR; // TODO: error reporting | 157 | while ptr.next_is_p(is_ident_continue) { |
153 | } | 158 | ptr.bump(); |
154 | LIFETIME | 159 | } |
155 | } else { | 160 | // lifetimes shouldn't end with a single quote |
156 | scan_char(ptr); | 161 | // if we find one, then this is an invalid character literal |
157 | scan_literal_suffix(ptr); | 162 | if ptr.next_is('\'') { |
158 | CHAR | 163 | ptr.bump(); |
159 | }, | 164 | return CHAR; // TODO: error reporting |
165 | } | ||
166 | LIFETIME | ||
167 | } else { | ||
168 | scan_char(ptr); | ||
169 | scan_literal_suffix(ptr); | ||
170 | CHAR | ||
171 | }; | ||
172 | } | ||
160 | 'b' => { | 173 | 'b' => { |
161 | let kind = scan_byte_char_or_string(ptr); | 174 | let kind = scan_byte_char_or_string(ptr); |
162 | scan_literal_suffix(ptr); | 175 | scan_literal_suffix(ptr); |
163 | return kind | 176 | return kind; |
164 | }, | 177 | } |
165 | '"' => { | 178 | '"' => { |
166 | scan_string(ptr); | 179 | scan_string(ptr); |
167 | scan_literal_suffix(ptr); | 180 | scan_literal_suffix(ptr); |
diff --git a/src/lexer/numbers.rs b/src/lexer/numbers.rs index 4c7edfe1c..95e42246f 100644 --- a/src/lexer/numbers.rs +++ b/src/lexer/numbers.rs | |||
@@ -1,7 +1,7 @@ | |||
1 | use lexer::ptr::Ptr; | 1 | use lexer::ptr::Ptr; |
2 | use lexer::classes::*; | 2 | use lexer::classes::*; |
3 | 3 | ||
4 | use {SyntaxKind}; | 4 | use SyntaxKind; |
5 | use syntax_kinds::*; | 5 | use syntax_kinds::*; |
6 | 6 | ||
7 | pub(crate) fn scan_number(c: char, ptr: &mut Ptr) -> SyntaxKind { | 7 | pub(crate) fn scan_number(c: char, ptr: &mut Ptr) -> SyntaxKind { |
@@ -49,10 +49,10 @@ fn scan_digits(ptr: &mut Ptr, allow_hex: bool) { | |||
49 | '_' | '0'...'9' => { | 49 | '_' | '0'...'9' => { |
50 | ptr.bump(); | 50 | ptr.bump(); |
51 | } | 51 | } |
52 | 'a'...'f' | 'A' ... 'F' if allow_hex => { | 52 | 'a'...'f' | 'A'...'F' if allow_hex => { |
53 | ptr.bump(); | 53 | ptr.bump(); |
54 | } | 54 | } |
55 | _ => return | 55 | _ => return, |
56 | } | 56 | } |
57 | } | 57 | } |
58 | } | 58 | } |
diff --git a/src/lexer/ptr.rs b/src/lexer/ptr.rs index ff6ef11fc..99d55b283 100644 --- a/src/lexer/ptr.rs +++ b/src/lexer/ptr.rs | |||
@@ -1,4 +1,4 @@ | |||
1 | use {TextUnit}; | 1 | use TextUnit; |
2 | 2 | ||
3 | use std::str::Chars; | 3 | use std::str::Chars; |
4 | 4 | ||
@@ -9,7 +9,10 @@ pub(crate) struct Ptr<'s> { | |||
9 | 9 | ||
10 | impl<'s> Ptr<'s> { | 10 | impl<'s> Ptr<'s> { |
11 | pub fn new(text: &'s str) -> Ptr<'s> { | 11 | pub fn new(text: &'s str) -> Ptr<'s> { |
12 | Ptr { text, len: TextUnit::new(0) } | 12 | Ptr { |
13 | text, | ||
14 | len: TextUnit::new(0), | ||
15 | } | ||
13 | } | 16 | } |
14 | 17 | ||
15 | pub fn into_len(self) -> TextUnit { | 18 | pub fn into_len(self) -> TextUnit { |
@@ -53,7 +56,7 @@ impl<'s> Ptr<'s> { | |||
53 | match self.next() { | 56 | match self.next() { |
54 | Some(c) if pred(c) => { | 57 | Some(c) if pred(c) => { |
55 | self.bump(); | 58 | self.bump(); |
56 | }, | 59 | } |
57 | _ => return, | 60 | _ => return, |
58 | } | 61 | } |
59 | } | 62 | } |
@@ -66,6 +69,6 @@ impl<'s> Ptr<'s> { | |||
66 | 69 | ||
67 | fn chars(&self) -> Chars { | 70 | fn chars(&self) -> Chars { |
68 | let len: u32 = self.len.into(); | 71 | let len: u32 = self.len.into(); |
69 | self.text[len as usize ..].chars() | 72 | self.text[len as usize..].chars() |
70 | } | 73 | } |
71 | } | 74 | } |
diff --git a/src/lexer/strings.rs b/src/lexer/strings.rs index 116d31760..00a84ec85 100644 --- a/src/lexer/strings.rs +++ b/src/lexer/strings.rs | |||
@@ -1,17 +1,17 @@ | |||
1 | use {SyntaxKind}; | 1 | use SyntaxKind; |
2 | use syntax_kinds::*; | 2 | use syntax_kinds::*; |
3 | 3 | ||
4 | use lexer::ptr::Ptr; | 4 | use lexer::ptr::Ptr; |
5 | 5 | ||
6 | pub(crate) fn is_string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool { | 6 | pub(crate) fn is_string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool { |
7 | match (c, c1, c2) { | 7 | match (c, c1, c2) { |
8 | ('r', Some('"'), _) | | 8 | ('r', Some('"'), _) |
9 | ('r', Some('#'), _) | | 9 | | ('r', Some('#'), _) |
10 | ('b', Some('"'), _) | | 10 | | ('b', Some('"'), _) |
11 | ('b', Some('\''), _) | | 11 | | ('b', Some('\''), _) |
12 | ('b', Some('r'), Some('"')) | | 12 | | ('b', Some('r'), Some('"')) |
13 | ('b', Some('r'), Some('#')) => true, | 13 | | ('b', Some('r'), Some('#')) => true, |
14 | _ => false | 14 | _ => false, |
15 | } | 15 | } |
16 | } | 16 | } |
17 | 17 | ||
@@ -50,20 +50,20 @@ pub(crate) fn scan_byte_char_or_string(ptr: &mut Ptr) -> SyntaxKind { | |||
50 | pub(crate) fn scan_string(ptr: &mut Ptr) { | 50 | pub(crate) fn scan_string(ptr: &mut Ptr) { |
51 | while let Some(c) = ptr.bump() { | 51 | while let Some(c) = ptr.bump() { |
52 | if c == '"' { | 52 | if c == '"' { |
53 | return | 53 | return; |
54 | } | 54 | } |
55 | } | 55 | } |
56 | } | 56 | } |
57 | 57 | ||
58 | pub(crate) fn scan_raw_string(ptr: &mut Ptr) { | 58 | pub(crate) fn scan_raw_string(ptr: &mut Ptr) { |
59 | if !ptr.next_is('"') { | 59 | if !ptr.next_is('"') { |
60 | return | 60 | return; |
61 | } | 61 | } |
62 | ptr.bump(); | 62 | ptr.bump(); |
63 | 63 | ||
64 | while let Some(c) = ptr.bump() { | 64 | while let Some(c) = ptr.bump() { |
65 | if c == '"' { | 65 | if c == '"' { |
66 | return | 66 | return; |
67 | } | 67 | } |
68 | } | 68 | } |
69 | } | 69 | } |
@@ -71,32 +71,32 @@ pub(crate) fn scan_raw_string(ptr: &mut Ptr) { | |||
71 | fn scan_byte(ptr: &mut Ptr) { | 71 | fn scan_byte(ptr: &mut Ptr) { |
72 | if ptr.next_is('\'') { | 72 | if ptr.next_is('\'') { |
73 | ptr.bump(); | 73 | ptr.bump(); |
74 | return | 74 | return; |
75 | } | 75 | } |
76 | ptr.bump(); | 76 | ptr.bump(); |
77 | if ptr.next_is('\'') { | 77 | if ptr.next_is('\'') { |
78 | ptr.bump(); | 78 | ptr.bump(); |
79 | return | 79 | return; |
80 | } | 80 | } |
81 | } | 81 | } |
82 | 82 | ||
83 | fn scan_byte_string(ptr: &mut Ptr) { | 83 | fn scan_byte_string(ptr: &mut Ptr) { |
84 | while let Some(c) = ptr.bump() { | 84 | while let Some(c) = ptr.bump() { |
85 | if c == '"' { | 85 | if c == '"' { |
86 | return | 86 | return; |
87 | } | 87 | } |
88 | } | 88 | } |
89 | } | 89 | } |
90 | 90 | ||
91 | fn scan_raw_byte_string(ptr: &mut Ptr) { | 91 | fn scan_raw_byte_string(ptr: &mut Ptr) { |
92 | if !ptr.next_is('"') { | 92 | if !ptr.next_is('"') { |
93 | return | 93 | return; |
94 | } | 94 | } |
95 | ptr.bump(); | 95 | ptr.bump(); |
96 | 96 | ||
97 | while let Some(c) = ptr.bump() { | 97 | while let Some(c) = ptr.bump() { |
98 | if c == '"' { | 98 | if c == '"' { |
99 | return | 99 | return; |
100 | } | 100 | } |
101 | } | 101 | } |
102 | } | 102 | } |
@@ -105,4 +105,3 @@ fn scan_char_or_byte(ptr: &mut Ptr) { | |||
105 | //FIXME: deal with escape sequencies | 105 | //FIXME: deal with escape sequencies |
106 | ptr.bump(); | 106 | ptr.bump(); |
107 | } | 107 | } |
108 | |||