diff options
-rw-r--r-- | .gitignore | 3 | ||||
-rw-r--r-- | Cargo.toml | 6 | ||||
-rw-r--r-- | minirust.rs | 152 | ||||
-rw-r--r-- | rfc.md | 133 | ||||
-rw-r--r-- | src/lib.rs | 7 |
5 files changed, 112 insertions, 189 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..6aa106405 --- /dev/null +++ b/.gitignore | |||
@@ -0,0 +1,3 @@ | |||
1 | /target/ | ||
2 | **/*.rs.bk | ||
3 | Cargo.lock | ||
diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 000000000..c94b99fad --- /dev/null +++ b/Cargo.toml | |||
@@ -0,0 +1,6 @@ | |||
1 | [package] | ||
2 | name = "libsyntax2" | ||
3 | version = "0.1.0" | ||
4 | authors = ["Aleksey Kladov <[email protected]>"] | ||
5 | |||
6 | [dependencies] | ||
diff --git a/minirust.rs b/minirust.rs deleted file mode 100644 index 9baa96039..000000000 --- a/minirust.rs +++ /dev/null | |||
@@ -1,152 +0,0 @@ | |||
1 | #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] | ||
2 | pub struct NodeKind(u16); | ||
3 | |||
4 | pub struct File { | ||
5 | text: String, | ||
6 | nodes: Vec<NodeData>, | ||
7 | } | ||
8 | |||
9 | struct NodeData { | ||
10 | kind: NodeKind, | ||
11 | range: (u32, u32), | ||
12 | parent: Option<u32>, | ||
13 | first_child: Option<u32>, | ||
14 | next_sibling: Option<u32>, | ||
15 | } | ||
16 | |||
17 | #[derive(Clone, Copy)] | ||
18 | pub struct Node<'f> { | ||
19 | file: &'f File, | ||
20 | idx: u32, | ||
21 | } | ||
22 | |||
23 | pub struct Children<'f> { | ||
24 | next: Option<Node<'f>>, | ||
25 | } | ||
26 | |||
27 | impl File { | ||
28 | pub fn root<'f>(&'f self) -> Node<'f> { | ||
29 | assert!(!self.nodes.is_empty()); | ||
30 | Node { file: self, idx: 0 } | ||
31 | } | ||
32 | } | ||
33 | |||
34 | impl<'f> Node<'f> { | ||
35 | pub fn kind(&self) -> NodeKind { | ||
36 | self.data().kind | ||
37 | } | ||
38 | |||
39 | pub fn text(&self) -> &'f str { | ||
40 | let (start, end) = self.data().range; | ||
41 | &self.file.text[start as usize..end as usize] | ||
42 | } | ||
43 | |||
44 | pub fn parent(&self) -> Option<Node<'f>> { | ||
45 | self.as_node(self.data().parent) | ||
46 | } | ||
47 | |||
48 | pub fn children(&self) -> Children<'f> { | ||
49 | Children { next: self.as_node(self.data().first_child) } | ||
50 | } | ||
51 | |||
52 | fn data(&self) -> &'f NodeData { | ||
53 | &self.file.nodes[self.idx as usize] | ||
54 | } | ||
55 | |||
56 | fn as_node(&self, idx: Option<u32>) -> Option<Node<'f>> { | ||
57 | idx.map(|idx| Node { file: self.file, idx }) | ||
58 | } | ||
59 | } | ||
60 | |||
61 | impl<'f> Iterator for Children<'f> { | ||
62 | type Item = Node<'f>; | ||
63 | |||
64 | fn next(&mut self) -> Option<Node<'f>> { | ||
65 | let next = self.next; | ||
66 | self.next = next.and_then(|node| node.as_node(node.data().next_sibling)); | ||
67 | next | ||
68 | } | ||
69 | } | ||
70 | |||
71 | pub const ERROR: NodeKind = NodeKind(0); | ||
72 | pub const WHITESPACE: NodeKind = NodeKind(1); | ||
73 | pub const STRUCT_KW: NodeKind = NodeKind(2); | ||
74 | pub const IDENT: NodeKind = NodeKind(3); | ||
75 | pub const L_CURLY: NodeKind = NodeKind(4); | ||
76 | pub const R_CURLY: NodeKind = NodeKind(5); | ||
77 | pub const COLON: NodeKind = NodeKind(6); | ||
78 | pub const COMMA: NodeKind = NodeKind(7); | ||
79 | pub const AMP: NodeKind = NodeKind(8); | ||
80 | pub const LINE_COMMENT: NodeKind = NodeKind(9); | ||
81 | pub const FILE: NodeKind = NodeKind(10); | ||
82 | pub const STRUCT_DEF: NodeKind = NodeKind(11); | ||
83 | pub const FIELD_DEF: NodeKind = NodeKind(12); | ||
84 | pub const TYPE_REF: NodeKind = NodeKind(13); | ||
85 | |||
86 | |||
87 | pub trait AstNode<'f>: Copy + 'f { | ||
88 | fn new(node: Node<'f>) -> Option<Self>; | ||
89 | fn node(&self) -> Node<'f>; | ||
90 | } | ||
91 | |||
92 | pub fn child_of_kind<'f>(node: Node<'f>, kind: NodeKind) -> Option<Node<'f>> { | ||
93 | node.children().find(|child| child.kind() == kind) | ||
94 | } | ||
95 | |||
96 | pub fn ast_children<'f, A: AstNode<'f>>(node: Node<'f>) -> Box<Iterator<Item=A> + 'f> { | ||
97 | Box::new(node.children().filter_map(A::new)) | ||
98 | } | ||
99 | |||
100 | #[derive(Clone, Copy)] | ||
101 | pub struct StructDef<'f>(Node<'f>); | ||
102 | |||
103 | #[derive(Clone, Copy)] | ||
104 | pub struct FieldDef<'f>(Node<'f>); | ||
105 | |||
106 | #[derive(Clone, Copy)] | ||
107 | pub struct TypeRef<'f>(Node<'f>); | ||
108 | |||
109 | pub trait NameOwner<'f>: AstNode<'f> { | ||
110 | fn name_ident(&self) -> Node<'f> { | ||
111 | child_of_kind(self.node(), IDENT).unwrap() | ||
112 | } | ||
113 | |||
114 | fn name(&self) -> &'f str { self.name_ident().text() } | ||
115 | } | ||
116 | |||
117 | |||
118 | impl<'f> AstNode<'f> for StructDef<'f> { | ||
119 | fn new(node: Node<'f>) -> Option<Self> { | ||
120 | if node.kind() == STRUCT_DEF { Some(StructDef(node)) } else { None } | ||
121 | } | ||
122 | fn node(&self) -> Node<'f> { self.0 } | ||
123 | } | ||
124 | |||
125 | impl<'f> AstNode<'f> for FieldDef<'f> { | ||
126 | fn new(node: Node<'f>) -> Option<Self> { | ||
127 | if node.kind() == FIELD_DEF { Some(FieldDef(node)) } else { None } | ||
128 | } | ||
129 | fn node(&self) -> Node<'f> { self.0 } | ||
130 | } | ||
131 | |||
132 | impl<'f> AstNode<'f> for TypeRef<'f> { | ||
133 | fn new(node: Node<'f>) -> Option<Self> { | ||
134 | if node.kind() == TYPE_REF { Some(TypeRef(node)) } else { None } | ||
135 | } | ||
136 | fn node(&self) -> Node<'f> { self.0 } | ||
137 | } | ||
138 | |||
139 | impl<'f> NameOwner<'f> for StructDef<'f> {} | ||
140 | impl<'f> NameOwner<'f> for FieldDef<'f> {} | ||
141 | |||
142 | impl<'f> StructDef<'f> { | ||
143 | pub fn fields(&self) -> Box<Iterator<Item=FieldDef<'f>> + 'f> { | ||
144 | ast_children(self.node()) | ||
145 | } | ||
146 | } | ||
147 | |||
148 | impl<'f> FieldDef<'f> { | ||
149 | pub fn type_ref(&self) -> Option<TypeRef<'f>> { | ||
150 | ast_children(self.node()).next() | ||
151 | } | ||
152 | } \ No newline at end of file | ||
@@ -30,12 +30,66 @@ other tools, and eventual libsyntax removal. | |||
30 | 30 | ||
31 | Note that this RFC does not propose to stabilize any API for working | 31 | Note that this RFC does not propose to stabilize any API for working |
32 | with rust syntax: the semver version of the hypothetical library would | 32 | with rust syntax: the semver version of the hypothetical library would |
33 | be `0.1.0`. | 33 | be `0.1.0`. It is intended to be used by tools, which are currently |
34 | closely related to the compiler: `rustc`, `rustfmt`, `clippy`, `rls` | ||
35 | and hypothetical `rustfix`. While it would be possible to create | ||
36 | third-party tools on top of the new libsyntax, the burden of adopting | ||
37 | to breaking changes would be on authors of such tools. | ||
34 | 38 | ||
35 | 39 | ||
36 | # Motivation | 40 | # Motivation |
37 | [motivation]: #motivation | 41 | [motivation]: #motivation |
38 | 42 | ||
43 | There are two main drawbacks with the current version of libsyntax: | ||
44 | |||
45 | * It is tightly integrated with the compiler and hard to use | ||
46 | independently | ||
47 | |||
48 | * The AST representation is not well-suited for use inside IDEs | ||
49 | |||
50 | |||
51 | ## IDE support | ||
52 | |||
53 | There are several differences in how IDEs and compilers typically | ||
54 | treat source code. | ||
55 | |||
56 | In the compiler, it is convenient to transform the source | ||
57 | code into Abstract Syntax Tree form, which is independent of the | ||
58 | surface syntax. For example, it's convenient to discard comments, | ||
59 | whitespaces and desugar some syntactic constructs in terms of the | ||
60 | simpler ones. | ||
61 | |||
62 | In contrast, IDEs work much closer to the source code, so it is | ||
63 | crucial to preserve full information about the original text. For | ||
64 | example, IDE may adjust indentation after typing a `}` which closes a | ||
65 | block, and to do this correctly, IDE must be aware of syntax (that is, | ||
66 | that `}` indeed closes some block, and is not a syntax error) and of | ||
67 | all whitespaces and comments. So, IDE suitable AST should explicitly | ||
68 | account for syntactic elements, not considered important by the | ||
69 | compiler. | ||
70 | |||
71 | Another difference is that IDEs typically work with incomplete and | ||
72 | syntactically invalid code. This boils down to two parser properties. | ||
73 | First, the parser must produce syntax tree even if some required input | ||
74 | is missing. For example, for input `fn foo` the function node should | ||
75 | be present in the parse, despite the fact that there is no parameters | ||
76 | or body. Second, the parser must be able to skip over parts of input | ||
77 | it can't recognize and aggressively recover from errors. That is, the | ||
78 | syntax tree data structure should be able to handle both missing and | ||
79 | extra nodes. | ||
80 | |||
81 | IDEs also need the ability to incrementally reparse and relex source | ||
82 | code after the user types. A smart IDE would use syntax tree structure | ||
83 | to handle editing commands (for example, to add/remove trailing commas | ||
84 | after join/split lines actions), so parsing time can be very | ||
85 | noticeable. | ||
86 | |||
87 | |||
88 | Currently rustc uses the classical AST approach, and preserves some of | ||
89 | the source code information in the form of spans in the AST. It is not | ||
90 | clear if this structure can full fill all IDE requirements. | ||
91 | |||
92 | |||
39 | ## Reusability | 93 | ## Reusability |
40 | 94 | ||
41 | In theory, the parser can be a pure function, which takes a `&str` as | 95 | In theory, the parser can be a pure function, which takes a `&str` as |
@@ -67,43 +121,42 @@ files. As a data point, it turned out to be easier to move `rustfmt` | |||
67 | into the main `rustc` repository than to move libsyntax outside! | 121 | into the main `rustc` repository than to move libsyntax outside! |
68 | 122 | ||
69 | 123 | ||
70 | ## IDE support | 124 | # Guide-level explanation |
125 | [guide-level-explanation]: #guide-level-explanation | ||
71 | 126 | ||
72 | There is one big difference in how IDEs and compilers typically treat | 127 | Not applicable. |
73 | source code. | ||
74 | 128 | ||
75 | In the compiler, it is convenient to transform the source | ||
76 | code into Abstract Syntax Tree form, which is independent of the | ||
77 | surface syntax. For example, it's convenient to discard comments, | ||
78 | whitespaces and desugar some syntactic constructs in terms of the | ||
79 | simpler ones. | ||
80 | 129 | ||
81 | In contrast, for IDEs it is crucial to have a lossless view of the | 130 | # Reference-level explanation |
82 | source code because, for example, it's important to preserve comments | 131 | [reference-level-explanation]: #reference-level-explanation |
83 | during refactorings. Ideally, IDEs should be able to incrementally | ||
84 | relex and reparse the file as the user types, because syntax tree is | ||
85 | necessary to correctly handle certain code-editing actions like | ||
86 | autoindentation or joining lines. IDE also must be able to produce | ||
87 | partial parse trees when some input is missing or invalid. | ||
88 | 132 | ||
89 | Currently rustc uses the AST approach, and preserves some of the | 133 | It is not clear if a single parser can accommodate the needs of the |
90 | source code information in the form of spans in the AST. | 134 | compiler and the IDE, but there is hope that it is possible. The RFC |
135 | proposes to develop libsynax2.0 as an experimental crates.io crate. If | ||
136 | the experiment turns out to be a success, the second RFC will propose | ||
137 | to integrate it with all existing tools and `rustc`. | ||
91 | 138 | ||
139 | Next, a syntax tree data structure is proposed for libsyntax2.0. It | ||
140 | seems to have the following important properties: | ||
92 | 141 | ||
93 | # Guide-level explanation | 142 | * It is lossless and faithfully represents the original source code, |
94 | [guide-level-explanation]: #guide-level-explanation | 143 | including explicit nodes for comments and whitespace. |
95 | 144 | ||
96 | Not applicable. | 145 | * It is flexible and allows to encode arbitrary node structure, |
146 | even for invalid syntax. | ||
97 | 147 | ||
148 | * It is minimal: it stores small amount of data and has no | ||
149 | dependencies. For instance, it does not need compiler's string | ||
150 | interner or literal data representation. | ||
98 | 151 | ||
99 | # Reference-level explanation | 152 | * While the tree itself is minimal, it is extensible in a sense that |
100 | [reference-level-explanation]: #reference-level-explanation | 153 | it possible to associate arbitrary data with certain nodes in a |
154 | type-safe way. | ||
101 | 155 | ||
102 | This section proposes a new syntax tree data structure, which should | ||
103 | be suitable for both compiler and IDE. It is heavily inspired by [PSI] | ||
104 | data structure which used in [IntelliJ] based IDEs and in the [Kotlin] | ||
105 | compiler. | ||
106 | 156 | ||
157 | It is not clear if this representation is the best one. It is heavily | ||
158 | inspired by [PSI] data structure which used in [IntelliJ] based IDEs | ||
159 | and in the [Kotlin] compiler. | ||
107 | 160 | ||
108 | [PSI]: http://www.jetbrains.org/intellij/sdk/docs/reference_guide/custom_language_support/implementing_parser_and_psi.html | 161 | [PSI]: http://www.jetbrains.org/intellij/sdk/docs/reference_guide/custom_language_support/implementing_parser_and_psi.html |
109 | [IntelliJ]: https://github.com/JetBrains/intellij-community/ | 162 | [IntelliJ]: https://github.com/JetBrains/intellij-community/ |
@@ -351,6 +404,11 @@ impl<'f> AstNode<'f> for TypeRef<'f> { | |||
351 | } | 404 | } |
352 | ``` | 405 | ``` |
353 | 406 | ||
407 | Note that although AST wrappers provide a type-safe access to the | ||
408 | tree, they are still represented as indexes, so clients of the syntax | ||
409 | tree can easily associated additional data with AST nodes by storing | ||
410 | it in a side-table. | ||
411 | |||
354 | 412 | ||
355 | ## Missing Source Code | 413 | ## Missing Source Code |
356 | 414 | ||
@@ -374,7 +432,8 @@ This RFC proposes huge changes to the internals of the compiler, so | |||
374 | it's important to proceed carefully and incrementally. The following | 432 | it's important to proceed carefully and incrementally. The following |
375 | plan is suggested: | 433 | plan is suggested: |
376 | 434 | ||
377 | * RFC discussion about the theoretical feasibility of the proposal. | 435 | * RFC discussion about the theoretical feasibility of the proposal, |
436 | and the best representation representation for the syntax tree. | ||
378 | 437 | ||
379 | * Implementation of the proposal as a completely separate crates.io | 438 | * Implementation of the proposal as a completely separate crates.io |
380 | crate, by refactoring existing libsyntax source code to produce a | 439 | crate, by refactoring existing libsyntax source code to produce a |
@@ -393,11 +452,11 @@ plan is suggested: | |||
393 | - No harm will be done as long as the new libsyntax exists as an | 452 | - No harm will be done as long as the new libsyntax exists as an |
394 | experiemt on crates.io. However, actually using it in the compiler | 453 | experiemt on crates.io. However, actually using it in the compiler |
395 | and other tools would require massive refactorings. | 454 | and other tools would require massive refactorings. |
396 | 455 | ||
397 | - Proposed syntax tree requires to keep the original source code | 456 | - It's difficult to know upfront if the proposed syntax tree would |
398 | available, which might increase memory usage of the | 457 | actually work well in both the compiler and IDE. It may be possible |
399 | compiler. However, it should be possible to throw the original tree | 458 | that some drawbacks will be discovered during implementation. |
400 | and source code away after conversion to HIR. | 459 | |
401 | 460 | ||
402 | # Rationale and alternatives | 461 | # Rationale and alternatives |
403 | [alternatives]: #alternatives | 462 | [alternatives]: #alternatives |
@@ -422,14 +481,14 @@ plan is suggested: | |||
422 | the source code? It seems like the answer is yes, because the | 481 | the source code? It seems like the answer is yes, because the |
423 | language and especially macros were cleverly designed with this | 482 | language and especially macros were cleverly designed with this |
424 | use-case in mind. | 483 | use-case in mind. |
425 | 484 | ||
426 | 485 | ||
427 | - Is it possible to implement macro expansion using the proposed | 486 | - Is it possible to implement macro expansion using the proposed |
428 | framework? This is the main question of this RFC. The proposed | 487 | framework? This is the main question of this RFC. The proposed |
429 | solution of synthesizing source code on the fly seems workable: it's | 488 | solution of synthesizing source code on the fly seems workable: it's |
430 | not that different from the current implementation, which | 489 | not that different from the current implementation, which |
431 | synthesizes token trees. | 490 | synthesizes token trees. |
432 | 491 | ||
433 | 492 | ||
434 | - How to actually phase out current libsyntax, if libsyntax2.0 turns | 493 | - How to actually phase out current libsyntax, if libsyntax2.0 turns |
435 | out to be a success? | 494 | out to be a success? |
diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 000000000..31e1bb209 --- /dev/null +++ b/src/lib.rs | |||
@@ -0,0 +1,7 @@ | |||
1 | #[cfg(test)] | ||
2 | mod tests { | ||
3 | #[test] | ||
4 | fn it_works() { | ||
5 | assert_eq!(2 + 2, 4); | ||
6 | } | ||
7 | } | ||