diff options
-rw-r--r-- | docs/ARCHITECTURE.md | 21 | ||||
-rw-r--r-- | src/parser/event.rs | 2 | ||||
-rw-r--r-- | src/parser/grammar/mod.rs | 25 | ||||
-rw-r--r-- | src/parser/parser/imp.rs | 3 | ||||
-rw-r--r-- | src/parser/parser/mod.rs | 65 |
5 files changed, 91 insertions, 25 deletions
diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index a1fa246c2..6b4434396 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md | |||
@@ -33,19 +33,22 @@ The centerpiece of this whole endeavor is the syntax tree, in the | |||
33 | 33 | ||
34 | The syntax tree is produced using a three-staged process. | 34 | The syntax tree is produced using a three-staged process. |
35 | 35 | ||
36 | First, a raw text is split into tokens with a lexer. Lexer has a | 36 | First, a raw text is split into tokens with a lexer (the `lexer` module). |
37 | peculiar signature: it is an `Fn(&str) -> Token`, where token is a | 37 | Lexer has a peculiar signature: it is an `Fn(&str) -> Token`, where token |
38 | pair of `SyntaxKind` (you should have read the `tree` module and RFC | 38 | is a pair of `SyntaxKind` (you should have read the `tree` module and RFC |
39 | by this time! :)) and a len. That is, lexer chomps only the first | 39 | by this time! :)) and a len. That is, lexer chomps only the first |
40 | token of the input. This forces the lexer to be stateless, and makes | 40 | token of the input. This forces the lexer to be stateless, and makes |
41 | it possible to implement incremental relexing easily. | 41 | it possible to implement incremental relexing easily. |
42 | 42 | ||
43 | Then, the bulk of work, the parser turns a stream of tokens into | 43 | Then, the bulk of work, the parser turns a stream of tokens into |
44 | stream of events. Not that parser **does not** construct a tree right | 44 | stream of events (the `parser` module; of particular interest are |
45 | away. This is done for several reasons: | 45 | the `parser/event` and `parser/parser` modules, which contain parsing |
46 | API, and the `parser/grammar` module, which contains actual parsing code | ||
47 | for various Rust syntactic constructs). Not that parser **does not** | ||
48 | construct a tree right away. This is done for several reasons: | ||
46 | 49 | ||
47 | * to decouple the actual tree data structure from the parser: you can | 50 | * to decouple the actual tree data structure from the parser: you can |
48 | build any datastructre you want from the stream of events | 51 | build any data structure you want from the stream of events |
49 | 52 | ||
50 | * to make parsing fast: you can produce a list of events without | 53 | * to make parsing fast: you can produce a list of events without |
51 | allocations | 54 | allocations |
@@ -77,12 +80,6 @@ And at last, the TreeBuilder converts a flat stream of events into a | |||
77 | tree structure. It also *should* be responsible for attaching comments | 80 | tree structure. It also *should* be responsible for attaching comments |
78 | and rebalancing the tree, but it does not do this yet :) | 81 | and rebalancing the tree, but it does not do this yet :) |
79 | 82 | ||
80 | |||
81 | ## Error reporing | ||
82 | |||
83 | TODO: describe how stuff like `skip_to_first` works | ||
84 | |||
85 | |||
86 | ## Validator | 83 | ## Validator |
87 | 84 | ||
88 | Parser and lexer accept a lot of *invalid* code intentionally. The | 85 | Parser and lexer accept a lot of *invalid* code intentionally. The |
diff --git a/src/parser/event.rs b/src/parser/event.rs index 30fe5c6d7..4af16d783 100644 --- a/src/parser/event.rs +++ b/src/parser/event.rs | |||
@@ -42,7 +42,7 @@ pub(crate) enum Event { | |||
42 | /// | | 42 | /// | |
43 | /// 'foo' | 43 | /// 'foo' |
44 | /// | 44 | /// |
45 | /// See also `CompleteMarker::precede`. | 45 | /// See also `CompletedMarker::precede`. |
46 | Start { | 46 | Start { |
47 | kind: SyntaxKind, | 47 | kind: SyntaxKind, |
48 | forward_parent: Option<u32>, | 48 | forward_parent: Option<u32>, |
diff --git a/src/parser/grammar/mod.rs b/src/parser/grammar/mod.rs index e29cf9b02..ee0263203 100644 --- a/src/parser/grammar/mod.rs +++ b/src/parser/grammar/mod.rs | |||
@@ -1,4 +1,27 @@ | |||
1 | use parser::parser::{Parser}; | 1 | //! This is the actual "grammar" of the Rust language. |
2 | //! | ||
3 | //! Each function in this module and its children corresponds | ||
4 | //! to a production of the format grammar. Submodules roughly | ||
5 | //! correspond to different *areas* of the grammar. By convention, | ||
6 | //! each submodule starts with `use super::*` import and exports | ||
7 | //! "public" productions via `pub(super)`. | ||
8 | //! | ||
9 | //! See docs for `Parser` to learn about API, available to the grammar, | ||
10 | //! and see docs for `Event` to learn how this actually manages to | ||
11 | //! produce parse trees. | ||
12 | //! | ||
13 | //! Code in this module also contains inline tests, which start with | ||
14 | //! `// test name-of-the-test` comment and look like this: | ||
15 | //! | ||
16 | //! ``` | ||
17 | //! // test fn_item_with_zero_parameters | ||
18 | //! // fn foo() {} | ||
19 | //! ``` | ||
20 | //! | ||
21 | //! After adding a new inline-test, run `cargo collect-tests` to extract | ||
22 | //! it as a standalone text-fixture into `tests/data/parser/inline`, and | ||
23 | //! run `cargo test` once to create the "gold" value. | ||
24 | use parser::parser::Parser; | ||
2 | use parser::token_set::TokenSet; | 25 | use parser::token_set::TokenSet; |
3 | use SyntaxKind; | 26 | use SyntaxKind; |
4 | use syntax_kinds::*; | 27 | use syntax_kinds::*; |
diff --git a/src/parser/parser/imp.rs b/src/parser/parser/imp.rs index 2b16e11b9..03c044091 100644 --- a/src/parser/parser/imp.rs +++ b/src/parser/parser/imp.rs | |||
@@ -4,6 +4,9 @@ use parser::event::Event; | |||
4 | use SyntaxKind; | 4 | use SyntaxKind; |
5 | use syntax_kinds::{TOMBSTONE, EOF}; | 5 | use syntax_kinds::{TOMBSTONE, EOF}; |
6 | 6 | ||
7 | /// Implementation details of `Parser`, extracted | ||
8 | /// to a separate struct in order not to pollute | ||
9 | /// the public API of the `Parser`. | ||
7 | pub(crate) struct ParserImpl<'t> { | 10 | pub(crate) struct ParserImpl<'t> { |
8 | inp: &'t ParserInput<'t>, | 11 | inp: &'t ParserInput<'t>, |
9 | 12 | ||
diff --git a/src/parser/parser/mod.rs b/src/parser/parser/mod.rs index c8db20918..618b439be 100644 --- a/src/parser/parser/mod.rs +++ b/src/parser/parser/mod.rs | |||
@@ -4,51 +4,72 @@ use syntax_kinds::ERROR; | |||
4 | pub(super) mod imp; | 4 | pub(super) mod imp; |
5 | use self::imp::ParserImpl; | 5 | use self::imp::ParserImpl; |
6 | 6 | ||
7 | /// `Parser` struct provides the low-level API for | ||
8 | /// navigating through the stream of tokens and | ||
9 | /// constructing the parse tree. The actual parsing | ||
10 | /// happens in the `grammar` module. | ||
11 | /// | ||
12 | /// However, the result of this `Parser` is not a real | ||
13 | /// tree, but rather a flat stream of events of the form | ||
14 | /// "start expression, consume number literal, | ||
15 | /// finish expression". See `Event` docs for more. | ||
7 | pub(crate) struct Parser<'t>(pub(super) ParserImpl<'t>); | 16 | pub(crate) struct Parser<'t>(pub(super) ParserImpl<'t>); |
8 | 17 | ||
9 | |||
10 | impl<'t> Parser<'t> { | 18 | impl<'t> Parser<'t> { |
19 | /// Returns the kind of the current token. | ||
20 | /// If parser has already reached the end of input, | ||
21 | /// the special `EOF` kind is returned. | ||
11 | pub(crate) fn current(&self) -> SyntaxKind { | 22 | pub(crate) fn current(&self) -> SyntaxKind { |
12 | self.nth(0) | 23 | self.nth(0) |
13 | } | 24 | } |
14 | 25 | ||
26 | /// Lookahead operation: returns the kind of the next nth | ||
27 | /// token. | ||
15 | pub(crate) fn nth(&self, n: u32) -> SyntaxKind { | 28 | pub(crate) fn nth(&self, n: u32) -> SyntaxKind { |
16 | self.0.nth(n) | 29 | self.0.nth(n) |
17 | } | 30 | } |
18 | 31 | ||
32 | /// Checks if the current token is `kind`. | ||
19 | pub(crate) fn at(&self, kind: SyntaxKind) -> bool { | 33 | pub(crate) fn at(&self, kind: SyntaxKind) -> bool { |
20 | self.current() == kind | 34 | self.current() == kind |
21 | } | 35 | } |
22 | 36 | ||
23 | pub(crate) fn at_kw(&self, t: &str) -> bool { | 37 | /// Checks if the current token is contextual keyword with text `t`. |
38 | pub(crate) fn at_contextual_kw(&self, t: &str) -> bool { | ||
24 | self.0.at_kw(t) | 39 | self.0.at_kw(t) |
25 | } | 40 | } |
26 | 41 | ||
42 | /// Starts a new node in the syntax tree. All nodes and tokens | ||
43 | /// consumed between the `start` and the corresponding `Marker::complete` | ||
44 | /// belong to the same node. | ||
27 | pub(crate) fn start(&mut self) -> Marker { | 45 | pub(crate) fn start(&mut self) -> Marker { |
28 | Marker(self.0.start()) | 46 | Marker(self.0.start()) |
29 | } | 47 | } |
30 | 48 | ||
49 | /// Advances the parser by one token. | ||
31 | pub(crate) fn bump(&mut self) { | 50 | pub(crate) fn bump(&mut self) { |
32 | self.0.bump(); | 51 | self.0.bump(); |
33 | } | 52 | } |
34 | 53 | ||
54 | /// Advances the parser by one token, remapping its kind. | ||
55 | /// This is useful to create contextual keywords from | ||
56 | /// identifiers. For example, the lexer creates an `union` | ||
57 | /// *identifier* token, but the parser remaps it to the | ||
58 | /// `union` keyword, and keyword is what ends up in the | ||
59 | /// final tree. | ||
35 | pub(crate) fn bump_remap(&mut self, kind: SyntaxKind) { | 60 | pub(crate) fn bump_remap(&mut self, kind: SyntaxKind) { |
36 | self.0.bump_remap(kind); | 61 | self.0.bump_remap(kind); |
37 | } | 62 | } |
38 | 63 | ||
64 | /// Emit error with the `message` | ||
65 | /// TODO: this should be much more fancy and support | ||
66 | /// structured errors with spans and notes, like rustc | ||
67 | /// does. | ||
39 | pub(crate) fn error<T: Into<String>>(&mut self, message: T) { | 68 | pub(crate) fn error<T: Into<String>>(&mut self, message: T) { |
40 | self.0.error(message.into()) | 69 | self.0.error(message.into()) |
41 | } | 70 | } |
42 | 71 | ||
43 | pub(crate) fn expect(&mut self, kind: SyntaxKind) -> bool { | 72 | /// Consume the next token if it is `kind`. |
44 | if self.at(kind) { | ||
45 | self.bump(); | ||
46 | return true; | ||
47 | } | ||
48 | self.error(format!("expected {:?}", kind)); | ||
49 | false | ||
50 | } | ||
51 | |||
52 | pub(crate) fn eat(&mut self, kind: SyntaxKind) -> bool { | 73 | pub(crate) fn eat(&mut self, kind: SyntaxKind) -> bool { |
53 | if !self.at(kind) { | 74 | if !self.at(kind) { |
54 | return false; | 75 | return false; |
@@ -57,6 +78,17 @@ impl<'t> Parser<'t> { | |||
57 | true | 78 | true |
58 | } | 79 | } |
59 | 80 | ||
81 | /// Consume the next token if it is `kind` or emit an error | ||
82 | /// otherwise. | ||
83 | pub(crate) fn expect(&mut self, kind: SyntaxKind) -> bool { | ||
84 | if self.eat(kind) { | ||
85 | return true; | ||
86 | } | ||
87 | self.error(format!("expected {:?}", kind)); | ||
88 | false | ||
89 | } | ||
90 | |||
91 | /// Create an error node and consume the next token. | ||
60 | pub(crate) fn err_and_bump(&mut self, message: &str) { | 92 | pub(crate) fn err_and_bump(&mut self, message: &str) { |
61 | let m = self.start(); | 93 | let m = self.start(); |
62 | self.error(message); | 94 | self.error(message); |
@@ -65,9 +97,11 @@ impl<'t> Parser<'t> { | |||
65 | } | 97 | } |
66 | } | 98 | } |
67 | 99 | ||
100 | /// See `Parser::start`. | ||
68 | pub(crate) struct Marker(u32); | 101 | pub(crate) struct Marker(u32); |
69 | 102 | ||
70 | impl Marker { | 103 | impl Marker { |
104 | /// Finishes the syntax tree node and assigns `kind` to it. | ||
71 | pub(crate) fn complete(self, p: &mut Parser, kind: SyntaxKind) -> CompletedMarker { | 105 | pub(crate) fn complete(self, p: &mut Parser, kind: SyntaxKind) -> CompletedMarker { |
72 | let pos = self.0; | 106 | let pos = self.0; |
73 | ::std::mem::forget(self); | 107 | ::std::mem::forget(self); |
@@ -75,6 +109,8 @@ impl Marker { | |||
75 | CompletedMarker(pos) | 109 | CompletedMarker(pos) |
76 | } | 110 | } |
77 | 111 | ||
112 | /// Abandons the syntax tree node. All its children | ||
113 | /// are attached to its parent instead. | ||
78 | pub(crate) fn abandon(self, p: &mut Parser) { | 114 | pub(crate) fn abandon(self, p: &mut Parser) { |
79 | let pos = self.0; | 115 | let pos = self.0; |
80 | ::std::mem::forget(self); | 116 | ::std::mem::forget(self); |
@@ -94,6 +130,13 @@ impl Drop for Marker { | |||
94 | pub(crate) struct CompletedMarker(u32); | 130 | pub(crate) struct CompletedMarker(u32); |
95 | 131 | ||
96 | impl CompletedMarker { | 132 | impl CompletedMarker { |
133 | /// This one is tricky :-) | ||
134 | /// This method allows to create a new node which starts | ||
135 | /// *before* the current one. That is, parser could start | ||
136 | /// node `A`, then complete it, and then after parsing the | ||
137 | /// whole `A`, decide that it should have started some node | ||
138 | /// `B` before starting `A`. `precede` allows to do exactly | ||
139 | /// that. See also docs about `forward_parent` in `Event::Start`. | ||
97 | pub(crate) fn precede(self, p: &mut Parser) -> Marker { | 140 | pub(crate) fn precede(self, p: &mut Parser) -> Marker { |
98 | Marker(p.0.precede(self.0)) | 141 | Marker(p.0.precede(self.0)) |
99 | } | 142 | } |