aboutsummaryrefslogtreecommitdiff
path: root/src/lexer/mod.rs
blob: afbbee4d0d9f5c6595ea1aed30975a7743194ca1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
use {Token, SyntaxKind};
use syntax_kinds::*;

mod ptr;
use self::ptr::Ptr;

mod classes;
use self::classes::*;

/// Lexes a single token from the beginning of `text`.
///
/// The returned `Token` carries the recognised `SyntaxKind` together with the
/// length reported by `Ptr::into_len` once scanning has finished.
///
/// # Panics
///
/// Panics if `text` is empty.
pub fn next_token(text: &str) -> Token {
    assert!(!text.is_empty());
    let mut cursor = Ptr::new(text);
    // The first character is consumed up front; the scanner dispatches on it.
    let first = cursor.bump().unwrap();
    let kind = next_token_inner(first, &mut cursor);
    Token {
        kind,
        len: cursor.into_len(),
    }
}

/// Classifies the token that starts with the already-consumed character `c`,
/// advancing `ptr` past the rest of that token.
///
/// Returns `ERROR` when `c` does not start an identifier, whitespace, or a
/// number.
fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
    // An `r` opening `r"`/`r#"`, or a `b` opening `b'`, `b"`, `br"`/`br#`, is a
    // literal prefix rather than an identifier, so it must not reach
    // `scan_ident`. No branch below handles those prefixes explicitly.
    if is_ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext()) {
        scan_ident(c, ptr)
    } else if is_whitespace(c) {
        ptr.bump_while(is_whitespace);
        WHITESPACE
    } else if is_dec_digit(c) {
        scan_number(c, ptr)
    } else {
        ERROR
    }
}

/// Scans an identifier whose first character `c` has already been consumed.
///
/// A lone `_` (not followed by an identifier-continue character) yields
/// `UNDERSCORE`; everything else yields `IDENT`.
fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind {
    // Does the identifier extend beyond its first character?
    let continues = match ptr.next() {
        Some(following) => is_ident_continue(following),
        None => false,
    };

    if continues {
        ptr.bump_while(is_ident_continue);
        IDENT
    } else if c == '_' {
        UNDERSCORE
    } else {
        IDENT
    }
}

/// Scans the remainder of a numeric literal whose first character `c` has
/// already been consumed from `ptr`.
///
/// Returns `FLOAT_NUMBER` when a fractional part or an exponent was consumed
/// and `INT_NUMBER` otherwise.
fn scan_number(c: char, ptr: &mut Ptr) -> SyntaxKind {
    if c == '0' {
        match ptr.next() {
            // `0b…` / `0o…`: skip the radix marker, then take the digits.
            Some('b') | Some('o') => {
                ptr.bump();
                scan_digits(ptr, false);
            }
            // `0x…`: hexadecimal, so `a`-`f` / `A`-`F` are digits as well.
            Some('x') => {
                ptr.bump();
                scan_digits(ptr, true);
            }
            // A decimal literal that merely starts with a zero (`007`, `0_1`,
            // `0.5`, `0e3`). It has to be scanned as decimal: accepting hex
            // digits here would swallow the `e` of an exponent (turning `0e3`
            // into an integer) as well as any trailing `a`-`f` letters.
            Some(d) if d == '_' || d == '.' || d == 'e' || d == 'E' || d.is_digit(10) => {
                scan_digits(ptr, false);
            }
            // A bare `0` (end of input or an unrelated character follows).
            _ => return INT_NUMBER,
        }
    } else {
        scan_digits(ptr, false);
    }

    // might be a float, but don't be greedy if this is actually an
    // integer literal followed by field/method access or a range pattern
    // (`0..2` and `12.foo()`)
    if ptr.next_is('.') && !(ptr.nnext_is('.') || ptr.nnext_is_p(is_ident_start)) {
        // might have stuff after the ., and if it does, it needs to start
        // with a number
        ptr.bump();
        scan_digits(ptr, false);
        scan_float_exponent(ptr);
        return FLOAT_NUMBER;
    }
    // it might be a float if it has an exponent
    if ptr.next_is('e') || ptr.next_is('E') {
        scan_float_exponent(ptr);
        return FLOAT_NUMBER;
    }
    INT_NUMBER
}

fn scan_digits(ptr: &mut Ptr, allow_hex: bool) {
    while let Some(c) = ptr.next() {
        match c {
            '_' | '0'...'9' => {
                ptr.bump();
            }
            'a'...'f' | 'A' ... 'F' if allow_hex => {
                ptr.bump();
            }
            _ => return
        }
    }
}

/// Consumes an optional float exponent: `e` or `E`, an optional `+`/`-` sign,
/// then a run of decimal digits and underscores.
///
/// Leaves `ptr` untouched when the next character is not `e`/`E`.
fn scan_float_exponent(ptr: &mut Ptr) {
    let at_exponent = ptr.next_is('e') || ptr.next_is('E');
    if !at_exponent {
        return;
    }
    // Skip the exponent marker itself.
    ptr.bump();

    let has_sign = ptr.next_is('-') || ptr.next_is('+');
    if has_sign {
        ptr.bump();
    }
    scan_digits(ptr, false);
}

/// Reports whether `c`, followed by the lookahead characters `c1` and `c2`,
/// opens a prefixed literal rather than an identifier.
///
/// Recognised openers are `r"`, `r#`, `b"`, `b'`, `br"` and `br#`.
fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool {
    match c {
        // Raw string: `r"` or `r#`.
        'r' => c1 == Some('"') || c1 == Some('#'),
        'b' => match c1 {
            // Byte string or byte literal: `b"` or `b'`.
            Some('"') | Some('\'') => true,
            // Raw byte string: `br"` or `br#`.
            Some('r') => c2 == Some('"') || c2 == Some('#'),
            _ => false,
        },
        _ => false,
    }
}