aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAleksey Kladov <[email protected]>2018-02-11 14:58:22 +0000
committerAleksey Kladov <[email protected]>2018-02-11 14:58:22 +0000
commit59087840f515c809498f09ec535e59054a893525 (patch)
tree282e1f9606dbeeba6c19a2dcd6ff94da420d155a
parent9e2c0564783aa91f6440e7cadcc1a4dfda785de0 (diff)
Document how the parsing works
-rw-r--r--docs/ARCHITECTURE.md21
-rw-r--r--src/parser/event.rs2
-rw-r--r--src/parser/grammar/mod.rs25
-rw-r--r--src/parser/parser/imp.rs3
-rw-r--r--src/parser/parser/mod.rs65
5 files changed, 91 insertions, 25 deletions
diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
index a1fa246c2..6b4434396 100644
--- a/docs/ARCHITECTURE.md
+++ b/docs/ARCHITECTURE.md
@@ -33,19 +33,22 @@ The centerpiece of this whole endeavor is the syntax tree, in the
33 33
34The syntax tree is produced using a three-staged process. 34The syntax tree is produced using a three-staged process.
35 35
36First, a raw text is split into tokens with a lexer. Lexer has a 36First, a raw text is split into tokens with a lexer (the `lexer` module).
37peculiar signature: it is an `Fn(&str) -> Token`, where token is a 37Lexer has a peculiar signature: it is an `Fn(&str) -> Token`, where token
38pair of `SyntaxKind` (you should have read the `tree` module and RFC 38is a pair of `SyntaxKind` (you should have read the `tree` module and RFC
39by this time! :)) and a len. That is, lexer chomps only the first 39by this time! :)) and a len. That is, lexer chomps only the first
40token of the input. This forces the lexer to be stateless, and makes 40token of the input. This forces the lexer to be stateless, and makes
41it possible to implement incremental relexing easily. 41it possible to implement incremental relexing easily.
42 42
43Then, the bulk of work, the parser turns a stream of tokens into 43Then, the bulk of work, the parser turns a stream of tokens into
44stream of events. Not that parser **does not** construct a tree right 44stream of events (the `parser` module; of particular interest are
45away. This is done for several reasons: 45the `parser/event` and `parser/parser` modules, which contain parsing
46API, and the `parser/grammar` module, which contains actual parsing code
47for various Rust syntactic constructs). Not that parser **does not**
48construct a tree right away. This is done for several reasons:
46 49
47* to decouple the actual tree data structure from the parser: you can 50* to decouple the actual tree data structure from the parser: you can
48 build any datastructre you want from the stream of events 51 build any data structure you want from the stream of events
49 52
50* to make parsing fast: you can produce a list of events without 53* to make parsing fast: you can produce a list of events without
51 allocations 54 allocations
@@ -77,12 +80,6 @@ And at last, the TreeBuilder converts a flat stream of events into a
77tree structure. It also *should* be responsible for attaching comments 80tree structure. It also *should* be responsible for attaching comments
78and rebalancing the tree, but it does not do this yet :) 81and rebalancing the tree, but it does not do this yet :)
79 82
80
81## Error reporing
82
83TODO: describe how stuff like `skip_to_first` works
84
85
86## Validator 83## Validator
87 84
88Parser and lexer accept a lot of *invalid* code intentionally. The 85Parser and lexer accept a lot of *invalid* code intentionally. The
diff --git a/src/parser/event.rs b/src/parser/event.rs
index 30fe5c6d7..4af16d783 100644
--- a/src/parser/event.rs
+++ b/src/parser/event.rs
@@ -42,7 +42,7 @@ pub(crate) enum Event {
42 /// | 42 /// |
43 /// 'foo' 43 /// 'foo'
44 /// 44 ///
45 /// See also `CompleteMarker::precede`. 45 /// See also `CompletedMarker::precede`.
46 Start { 46 Start {
47 kind: SyntaxKind, 47 kind: SyntaxKind,
48 forward_parent: Option<u32>, 48 forward_parent: Option<u32>,
diff --git a/src/parser/grammar/mod.rs b/src/parser/grammar/mod.rs
index e29cf9b02..ee0263203 100644
--- a/src/parser/grammar/mod.rs
+++ b/src/parser/grammar/mod.rs
@@ -1,4 +1,27 @@
1use parser::parser::{Parser}; 1//! This is the actual "grammar" of the Rust language.
2//!
3//! Each function in this module and its children corresponds
4//! to a production of the format grammar. Submodules roughly
5//! correspond to different *areas* of the grammar. By convention,
6//! each submodule starts with `use super::*` import and exports
7//! "public" productions via `pub(super)`.
8//!
9//! See docs for `Parser` to learn about API, available to the grammar,
10//! and see docs for `Event` to learn how this actually manages to
11//! produce parse trees.
12//!
13//! Code in this module also contains inline tests, which start with
14//! `// test name-of-the-test` comment and look like this:
15//!
16//! ```
17//! // test fn_item_with_zero_parameters
18//! // fn foo() {}
19//! ```
20//!
21//! After adding a new inline-test, run `cargo collect-tests` to extract
22//! it as a standalone text-fixture into `tests/data/parser/inline`, and
23//! run `cargo test` once to create the "gold" value.
24use parser::parser::Parser;
2use parser::token_set::TokenSet; 25use parser::token_set::TokenSet;
3use SyntaxKind; 26use SyntaxKind;
4use syntax_kinds::*; 27use syntax_kinds::*;
diff --git a/src/parser/parser/imp.rs b/src/parser/parser/imp.rs
index 2b16e11b9..03c044091 100644
--- a/src/parser/parser/imp.rs
+++ b/src/parser/parser/imp.rs
@@ -4,6 +4,9 @@ use parser::event::Event;
4use SyntaxKind; 4use SyntaxKind;
5use syntax_kinds::{TOMBSTONE, EOF}; 5use syntax_kinds::{TOMBSTONE, EOF};
6 6
7/// Implementation details of `Parser`, extracted
8/// to a separate struct in order not to pollute
9/// the public API of the `Parser`.
7pub(crate) struct ParserImpl<'t> { 10pub(crate) struct ParserImpl<'t> {
8 inp: &'t ParserInput<'t>, 11 inp: &'t ParserInput<'t>,
9 12
diff --git a/src/parser/parser/mod.rs b/src/parser/parser/mod.rs
index c8db20918..618b439be 100644
--- a/src/parser/parser/mod.rs
+++ b/src/parser/parser/mod.rs
@@ -4,51 +4,72 @@ use syntax_kinds::ERROR;
4pub(super) mod imp; 4pub(super) mod imp;
5use self::imp::ParserImpl; 5use self::imp::ParserImpl;
6 6
7/// `Parser` struct provides the low-level API for
8/// navigating through the stream of tokens and
9/// constructing the parse tree. The actual parsing
10/// happens in the `grammar` module.
11///
12/// However, the result of this `Parser` is not a real
13/// tree, but rather a flat stream of events of the form
14/// "start expression, consume number literal,
15/// finish expression". See `Event` docs for more.
7pub(crate) struct Parser<'t>(pub(super) ParserImpl<'t>); 16pub(crate) struct Parser<'t>(pub(super) ParserImpl<'t>);
8 17
9
10impl<'t> Parser<'t> { 18impl<'t> Parser<'t> {
19 /// Returns the kind of the current token.
20 /// If parser has already reached the end of input,
21 /// the special `EOF` kind is returned.
11 pub(crate) fn current(&self) -> SyntaxKind { 22 pub(crate) fn current(&self) -> SyntaxKind {
12 self.nth(0) 23 self.nth(0)
13 } 24 }
14 25
26 /// Lookahead operation: returns the kind of the next nth
27 /// token.
15 pub(crate) fn nth(&self, n: u32) -> SyntaxKind { 28 pub(crate) fn nth(&self, n: u32) -> SyntaxKind {
16 self.0.nth(n) 29 self.0.nth(n)
17 } 30 }
18 31
32 /// Checks if the current token is `kind`.
19 pub(crate) fn at(&self, kind: SyntaxKind) -> bool { 33 pub(crate) fn at(&self, kind: SyntaxKind) -> bool {
20 self.current() == kind 34 self.current() == kind
21 } 35 }
22 36
23 pub(crate) fn at_kw(&self, t: &str) -> bool { 37 /// Checks if the current token is contextual keyword with text `t`.
38 pub(crate) fn at_contextual_kw(&self, t: &str) -> bool {
24 self.0.at_kw(t) 39 self.0.at_kw(t)
25 } 40 }
26 41
42 /// Starts a new node in the syntax tree. All nodes and tokens
43 /// consumed between the `start` and the corresponding `Marker::complete`
44 /// belong to the same node.
27 pub(crate) fn start(&mut self) -> Marker { 45 pub(crate) fn start(&mut self) -> Marker {
28 Marker(self.0.start()) 46 Marker(self.0.start())
29 } 47 }
30 48
49 /// Advances the parser by one token.
31 pub(crate) fn bump(&mut self) { 50 pub(crate) fn bump(&mut self) {
32 self.0.bump(); 51 self.0.bump();
33 } 52 }
34 53
54 /// Advances the parser by one token, remapping its kind.
55 /// This is useful to create contextual keywords from
56 /// identifiers. For example, the lexer creates an `union`
57 /// *identifier* token, but the parser remaps it to the
58 /// `union` keyword, and keyword is what ends up in the
59 /// final tree.
35 pub(crate) fn bump_remap(&mut self, kind: SyntaxKind) { 60 pub(crate) fn bump_remap(&mut self, kind: SyntaxKind) {
36 self.0.bump_remap(kind); 61 self.0.bump_remap(kind);
37 } 62 }
38 63
64 /// Emit error with the `message`
65 /// TODO: this should be much more fancy and support
66 /// structured errors with spans and notes, like rustc
67 /// does.
39 pub(crate) fn error<T: Into<String>>(&mut self, message: T) { 68 pub(crate) fn error<T: Into<String>>(&mut self, message: T) {
40 self.0.error(message.into()) 69 self.0.error(message.into())
41 } 70 }
42 71
43 pub(crate) fn expect(&mut self, kind: SyntaxKind) -> bool { 72 /// Consume the next token if it is `kind`.
44 if self.at(kind) {
45 self.bump();
46 return true;
47 }
48 self.error(format!("expected {:?}", kind));
49 false
50 }
51
52 pub(crate) fn eat(&mut self, kind: SyntaxKind) -> bool { 73 pub(crate) fn eat(&mut self, kind: SyntaxKind) -> bool {
53 if !self.at(kind) { 74 if !self.at(kind) {
54 return false; 75 return false;
@@ -57,6 +78,17 @@ impl<'t> Parser<'t> {
57 true 78 true
58 } 79 }
59 80
81 /// Consume the next token if it is `kind` or emit an error
82 /// otherwise.
83 pub(crate) fn expect(&mut self, kind: SyntaxKind) -> bool {
84 if self.eat(kind) {
85 return true;
86 }
87 self.error(format!("expected {:?}", kind));
88 false
89 }
90
91 /// Create an error node and consume the next token.
60 pub(crate) fn err_and_bump(&mut self, message: &str) { 92 pub(crate) fn err_and_bump(&mut self, message: &str) {
61 let m = self.start(); 93 let m = self.start();
62 self.error(message); 94 self.error(message);
@@ -65,9 +97,11 @@ impl<'t> Parser<'t> {
65 } 97 }
66} 98}
67 99
100/// See `Parser::start`.
68pub(crate) struct Marker(u32); 101pub(crate) struct Marker(u32);
69 102
70impl Marker { 103impl Marker {
104 /// Finishes the syntax tree node and assigns `kind` to it.
71 pub(crate) fn complete(self, p: &mut Parser, kind: SyntaxKind) -> CompletedMarker { 105 pub(crate) fn complete(self, p: &mut Parser, kind: SyntaxKind) -> CompletedMarker {
72 let pos = self.0; 106 let pos = self.0;
73 ::std::mem::forget(self); 107 ::std::mem::forget(self);
@@ -75,6 +109,8 @@ impl Marker {
75 CompletedMarker(pos) 109 CompletedMarker(pos)
76 } 110 }
77 111
112 /// Abandons the syntax tree node. All its children
113 /// are attached to its parent instead.
78 pub(crate) fn abandon(self, p: &mut Parser) { 114 pub(crate) fn abandon(self, p: &mut Parser) {
79 let pos = self.0; 115 let pos = self.0;
80 ::std::mem::forget(self); 116 ::std::mem::forget(self);
@@ -94,6 +130,13 @@ impl Drop for Marker {
94pub(crate) struct CompletedMarker(u32); 130pub(crate) struct CompletedMarker(u32);
95 131
96impl CompletedMarker { 132impl CompletedMarker {
133 /// This one is tricky :-)
134 /// This method allows to create a new node which starts
135 /// *before* the current one. That is, parser could start
136 /// node `A`, then complete it, and then after parsing the
137 /// whole `A`, decide that it should have started some node
138 /// `B` before starting `A`. `precede` allows to do exactly
139 /// that. See also docs about `forward_parent` in `Event::Start`.
97 pub(crate) fn precede(self, p: &mut Parser) -> Marker { 140 pub(crate) fn precede(self, p: &mut Parser) -> Marker {
98 Marker(p.0.precede(self.0)) 141 Marker(p.0.precede(self.0))
99 } 142 }