/src/gitoxide/gix-config/src/parse/events.rs
Line | Count | Source |
1 | | use smallvec::SmallVec; |
2 | | |
3 | | use crate::{ |
4 | | parse, |
5 | | parse::{Event, Section}, |
6 | | }; |
7 | | |
8 | | /// A type to store, without allocation, all events that typically precede the first section. |
9 | | pub type FrontMatterEvents<'a> = SmallVec<[Event<'a>; 8]>; |
10 | | |
11 | | /// A zero-copy `git-config` file parser. |
12 | | /// |
13 | | /// This parser exposes low-level syntactic events from a `git-config` file. |
14 | | /// Generally speaking, you'll want to use [`File`] as it wraps |
15 | | /// around the parser to provide a higher-level abstraction to a `git-config` |
16 | | /// file, including querying, modifying, and updating values. |
17 | | /// |
18 | | /// This parser guarantees that the events emitted are sufficient to |
19 | | /// reconstruct a `git-config` file identical to the source `git-config` |
20 | | /// when writing it. |
21 | | /// |
22 | | /// # Differences from a `.ini` parser |
23 | | /// |
24 | | /// While the `git-config` format closely resembles the [`.ini` file format], |
25 | | /// there are subtle differences that make them incompatible. For one, the file |
26 | | /// format is not well defined, and there exists no formal specification to |
27 | | /// adhere to. |
28 | | /// |
29 | | /// For concrete examples, some notable differences are: |
30 | | /// - `git-config` sections permit subsections via either a quoted string |
31 | | /// (`[some-section "subsection"]`) or via the deprecated dot notation |
32 | | /// (`[some-section.subsection]`). Successfully parsing these section names is not |
33 | | /// well defined in typical `.ini` parsers. This parser will handle these cases |
34 | | /// perfectly. |
35 | | /// - Comment markers are not strictly defined either. This parser will always |
36 | | /// and only handle a semicolon or octothorpe (also known as a hash or number |
37 | | /// sign). |
38 | | /// - Global properties may be allowed in `.ini` parsers, but are strictly |
39 | | /// disallowed by this parser. |
40 | | /// - Only `\t`, `\n`, `\b`, and `\\` are valid escape characters. |
41 | | /// - Quoted and semi-quoted values will be parsed (but quotes will be included |
42 | | /// in event outputs). An example of a semi-quoted value is `5"hello world"`, |
43 | | /// which should be interpreted as `5hello world` after |
44 | | /// [normalization][crate::value::normalize()]. |
45 | | /// - Line continuations via a `\` character are supported (inside or outside of quotes). |
46 | | /// - Whitespace handling similarly follows the `git-config` specification as |
47 | | /// closely as possible, where excess whitespace after a non-quoted value is |
48 | | /// trimmed, and line continuations onto a new line with excess spaces are kept. |
49 | | /// - Only equal signs (optionally padded by spaces) are valid name/value |
50 | | /// delimiters. |
51 | | /// |
52 | | /// Note that things such as case-sensitivity or duplicate sections are |
53 | | /// _not_ handled. This parser is a low level _syntactic_ interpreter |
54 | | /// and higher level wrappers around this parser, which may |
55 | | /// or may not be zero-copy, should handle _semantic_ values. This also means |
56 | | /// that string-like values are not interpreted. For example, `hello"world"` |
57 | | /// would be read at a high level as `helloworld` but this parser will return |
58 | | /// the former instead, with the extra quotes. This is because it is not the |
59 | | /// responsibility of the parser to interpret these values, and doing so would |
60 | | /// necessarily require a copy, which this parser avoids. |
61 | | /// |
62 | | /// # Trait Implementations |
63 | | /// |
64 | | /// - This struct does _not_ implement [`FromStr`] due to lifetime |
65 | | /// constraints implied on the required `from_str` method. Instead, it provides |
66 | | /// [`TryFrom<&'_ str>`](std::convert::TryFrom). |
67 | | /// |
68 | | /// # Idioms |
69 | | /// |
70 | | /// If you do want to use this parser, there are some idioms that may help you |
71 | | /// with interpreting sequences of events. |
72 | | /// |
73 | | /// ## `Value` events do not immediately follow `Key` events |
74 | | /// |
75 | | /// Consider the following `git-config` example: |
76 | | /// |
77 | | /// ```text |
78 | | /// [core] |
79 | | /// autocrlf = input |
80 | | /// ``` |
81 | | /// |
82 | | /// Because this parser guarantees perfect reconstruction, there are many |
83 | | /// non-significant events that occur in addition to the ones you may expect: |
84 | | /// |
85 | | /// ``` |
86 | | /// # use gix_config::parse::{Event, Events, section}; |
87 | | /// # use std::borrow::Cow; |
88 | | /// # use std::convert::TryFrom; |
89 | | /// # let section_header = section::Header::new("core", None).unwrap(); |
90 | | /// # let section_data = "[core]\n autocrlf = input"; |
91 | | /// # assert_eq!(Events::from_str(section_data).unwrap().into_vec(), vec![ |
92 | | /// Event::SectionHeader(section_header), |
93 | | /// Event::Newline(Cow::Borrowed("\n".into())), |
94 | | /// Event::Whitespace(Cow::Borrowed(" ".into())), |
95 | | /// Event::SectionValueName(section::ValueName::try_from("autocrlf")?), |
96 | | /// Event::Whitespace(Cow::Borrowed(" ".into())), |
97 | | /// Event::KeyValueSeparator, |
98 | | /// Event::Whitespace(Cow::Borrowed(" ".into())), |
99 | | /// Event::Value(Cow::Borrowed("input".into())), |
100 | | /// # ]); |
101 | | /// # Ok::<_, Box<dyn std::error::Error>>(()) |
102 | | /// ``` |
103 | | /// |
104 | | /// Note the two whitespace events between the key and value pair! Those two |
105 | | /// events actually refer to the whitespace between the name and value and the |
106 | | /// equal sign. So if the config instead had `autocrlf=input`, those whitespace |
107 | | /// events would no longer be present. |
108 | | /// |
109 | | /// ## `KeyValueSeparator` event is not guaranteed to emit |
110 | | /// |
111 | | /// Consider the following `git-config` example: |
112 | | /// |
113 | | /// ```text |
114 | | /// [core] |
115 | | /// autocrlf |
116 | | /// ``` |
117 | | /// |
118 | | /// This is a valid config with an `autocrlf` key having an implicit `true` |
119 | | /// value. This means that there is not a `=` separating the key and value, |
120 | | /// which means that the corresponding event won't appear either: |
121 | | /// |
122 | | /// ``` |
123 | | /// # use gix_config::parse::{Event, Events, section}; |
124 | | /// # use std::borrow::Cow; |
125 | | /// # use std::convert::TryFrom; |
126 | | /// # let section_header = section::Header::new("core", None).unwrap(); |
127 | | /// # let section_data = "[core]\n autocrlf"; |
128 | | /// # assert_eq!(Events::from_str(section_data).unwrap().into_vec(), vec![ |
129 | | /// Event::SectionHeader(section_header), |
130 | | /// Event::Newline(Cow::Borrowed("\n".into())), |
131 | | /// Event::Whitespace(Cow::Borrowed(" ".into())), |
132 | | /// Event::SectionValueName(section::ValueName::try_from("autocrlf")?), |
133 | | /// Event::Value(Cow::Borrowed("".into())), |
134 | | /// # ]); |
135 | | /// # Ok::<_, Box<dyn std::error::Error>>(()) |
136 | | /// ``` |
137 | | /// |
138 | | /// ## Quoted values are not unquoted |
139 | | /// |
140 | | /// Consider the following `git-config` example: |
141 | | /// |
142 | | /// ```text |
143 | | /// [core] |
144 | | /// autocrlf=true"" |
145 | | /// filemode=fa"lse" |
146 | | /// ``` |
147 | | /// |
148 | | /// Both these events, when fully processed, should normally be `true` and |
149 | | /// `false`. However, because this parser is zero-copy, we cannot process |
150 | | /// partially quoted values, such as the `false` example. As a result, to |
151 | | /// maintain consistency, the parser will just take all values as literals. The |
152 | | /// relevant event stream is thus emitted as: |
153 | | /// |
154 | | /// ``` |
155 | | /// # use gix_config::parse::{Event, Events, section}; |
156 | | /// # use std::borrow::Cow; |
157 | | /// # use std::convert::TryFrom; |
158 | | /// # let section_header = section::Header::new("core", None).unwrap(); |
159 | | /// # let section_data = "[core]\nautocrlf=true\"\"\nfilemode=fa\"lse\""; |
160 | | /// # assert_eq!(Events::from_str(section_data).unwrap().into_vec(), vec![ |
161 | | /// Event::SectionHeader(section_header), |
162 | | /// Event::Newline(Cow::Borrowed("\n".into())), |
163 | | /// Event::SectionValueName(section::ValueName::try_from("autocrlf")?), |
164 | | /// Event::KeyValueSeparator, |
165 | | /// Event::Value(Cow::Borrowed(r#"true"""#.into())), |
166 | | /// Event::Newline(Cow::Borrowed("\n".into())), |
167 | | /// Event::SectionValueName(section::ValueName::try_from("filemode")?), |
168 | | /// Event::KeyValueSeparator, |
169 | | /// Event::Value(Cow::Borrowed(r#"fa"lse""#.into())), |
170 | | /// # ]); |
171 | | /// # Ok::<_, Box<dyn std::error::Error>>(()) |
172 | | /// ``` |
173 | | /// |
174 | | /// ## Whitespace after line continuations is part of the value |
175 | | /// |
176 | | /// Consider the following `git-config` example: |
177 | | /// |
178 | | /// ```text |
179 | | /// [some-section] |
180 | | /// file=a\ |
181 | | /// c |
182 | | /// ``` |
183 | | /// |
184 | | /// Because of how `git-config` treats continuations, the whitespace preceding `c` |
185 | | /// is in fact part of the value of `file`. The fully interpreted key/value |
186 | | /// pair is actually `file=a c`. As a result, the parser will provide this |
187 | | /// split value accordingly: |
188 | | /// |
189 | | /// ``` |
190 | | /// # use gix_config::parse::{Event, Events, section}; |
191 | | /// # use std::borrow::Cow; |
192 | | /// # use std::convert::TryFrom; |
193 | | /// # let section_header = section::Header::new("some-section", None).unwrap(); |
194 | | /// # let section_data = "[some-section]\nfile=a\\\n c"; |
195 | | /// # assert_eq!(Events::from_str(section_data).unwrap().into_vec(), vec![ |
196 | | /// Event::SectionHeader(section_header), |
197 | | /// Event::Newline(Cow::Borrowed("\n".into())), |
198 | | /// Event::SectionValueName(section::ValueName::try_from("file")?), |
199 | | /// Event::KeyValueSeparator, |
200 | | /// Event::ValueNotDone(Cow::Borrowed("a".into())), |
201 | | /// Event::Newline(Cow::Borrowed("\n".into())), |
202 | | /// Event::ValueDone(Cow::Borrowed(" c".into())), |
203 | | /// # ]); |
204 | | /// # Ok::<_, Box<dyn std::error::Error>>(()) |
205 | | /// ``` |
206 | | /// |
207 | | /// [`File`]: crate::File |
208 | | /// [`.ini` file format]: https://en.wikipedia.org/wiki/INI_file |
209 | | /// [`git`'s documentation]: https://git-scm.com/docs/git-config#_configuration_file |
210 | | /// [`FromStr`]: std::str::FromStr |
211 | | /// [`From<&'_ str>`]: std::convert::From |
212 | | #[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Default)] |
213 | | pub struct Events<'a> { |
214 | | /// Events seen before the first section. |
215 | | pub frontmatter: FrontMatterEvents<'a>, |
216 | | /// All parsed sections. |
217 | | pub sections: Vec<Section<'a>>, |
218 | | } |
219 | | |
220 | | impl Events<'static> { |
221 | | /// Parses the provided bytes, returning an [`Events`] that contains allocated |
222 | | /// and owned events. This is similar to [`Events::from_bytes()`], but performance |
223 | | /// is degraded as it requires allocation for every event. |
224 | | /// |
225 | | /// Use `filter` to only include those events for which it returns true. |
226 | 0 | pub fn from_bytes_owned<'a>( |
227 | 0 | input: &'a [u8], |
228 | 0 | filter: Option<fn(&Event<'a>) -> bool>, |
229 | 0 | ) -> Result<Events<'static>, parse::Error> { |
230 | 0 | from_bytes(input, &|e| e.to_owned(), filter) |
231 | 0 | } |
232 | | } |
233 | | |
234 | | impl<'a> Events<'a> { |
235 | | /// Attempt to zero-copy parse the provided bytes. On success, returns a |
236 | | /// [`Events`] that provides access to the leading comments and sections |
237 | | /// of a `git-config` file and can be converted into an iterator of [`Event`] |
238 | | /// for higher level processing. |
239 | | /// |
240 | | /// Use `filter` to only include those events for which it returns true. |
241 | 17.4k | pub fn from_bytes(input: &'a [u8], filter: Option<fn(&Event<'a>) -> bool>) -> Result<Events<'a>, parse::Error> { |
242 | 17.4k | from_bytes(input, &std::convert::identity, filter) |
243 | 17.4k | } |
244 | | |
245 | | /// Attempt to zero-copy parse the provided `input` string. |
246 | | /// |
247 | | /// Prefer the [`from_bytes()`][Self::from_bytes()] method if UTF8 encoding |
248 | | /// isn't guaranteed. |
249 | | #[allow(clippy::should_implement_trait)] |
250 | 6.71k | pub fn from_str(input: &'a str) -> Result<Events<'a>, parse::Error> { |
251 | 6.71k | Self::from_bytes(input.as_bytes(), None) |
252 | 6.71k | } |
253 | | |
254 | | /// Consumes the parser to produce an iterator of all contained events. |
255 | | #[must_use = "iterators are lazy and do nothing unless consumed"] |
256 | | #[allow(clippy::should_implement_trait)] |
257 | 0 | pub fn into_iter(self) -> impl std::iter::FusedIterator<Item = parse::Event<'a>> { |
258 | 0 | self.frontmatter.into_iter().chain( |
259 | 0 | self.sections |
260 | 0 | .into_iter() |
261 | 0 | .flat_map(|section| std::iter::once(parse::Event::SectionHeader(section.header)).chain(section.events)), |
262 | | ) |
263 | 0 | } |
264 | | |
265 | | /// Place all contained events into a single `Vec`. |
266 | 0 | pub fn into_vec(self) -> Vec<parse::Event<'a>> { |
267 | 0 | self.into_iter().collect() |
268 | 0 | } |
269 | | } |
270 | | |
271 | | impl<'a> TryFrom<&'a str> for Events<'a> { |
272 | | type Error = parse::Error; |
273 | | |
274 | 3.35k | fn try_from(value: &'a str) -> Result<Self, Self::Error> { |
275 | 3.35k | Self::from_str(value) |
276 | 3.35k | } |
277 | | } |
278 | | |
279 | | impl<'a> TryFrom<&'a [u8]> for Events<'a> { |
280 | | type Error = parse::Error; |
281 | | |
282 | 0 | fn try_from(value: &'a [u8]) -> Result<Self, Self::Error> { |
283 | 0 | Events::from_bytes(value, None) |
284 | 0 | } |
285 | | } |
286 | | |
287 | 17.4k | fn from_bytes<'a, 'b>( |
288 | 17.4k | input: &'a [u8], |
289 | 17.4k | convert: &dyn Fn(Event<'a>) -> Event<'b>, |
290 | 17.4k | filter: Option<fn(&Event<'a>) -> bool>, |
291 | 17.4k | ) -> Result<Events<'b>, parse::Error> { |
292 | 17.4k | let mut header = None; |
293 | 17.4k | let mut events = Vec::with_capacity(256); |
294 | 17.4k | let mut frontmatter = FrontMatterEvents::default(); |
295 | 17.4k | let mut sections = Vec::new(); |
296 | 128M | parse::from_bytes(input, &mut |e: Event<'_>| match e { |
297 | 14.8M | Event::SectionHeader(next_header) => { |
298 | 14.8M | match header.take() { |
299 | 11.3k | None => { |
300 | 11.3k | frontmatter = std::mem::take(&mut events).into_iter().collect(); |
301 | 11.3k | } |
302 | 14.8M | Some(prev_header) => { |
303 | 14.8M | sections.push(parse::Section { |
304 | 14.8M | header: prev_header, |
305 | 14.8M | events: std::mem::take(&mut events), |
306 | 14.8M | }); |
307 | 14.8M | } |
308 | | } |
309 | 14.8M | header = match convert(Event::SectionHeader(next_header)) { |
310 | 14.8M | Event::SectionHeader(h) => h, |
311 | 0 | _ => unreachable!("BUG: convert must not change the event type, just the lifetime"), |
312 | | } |
313 | 14.8M | .into(); |
314 | | } |
315 | 113M | event => { |
316 | 113M | if filter.map_or(true, |f| f(&event)) { |
317 | 113M | events.push(convert(event)); |
318 | 113M | } |
319 | | } |
320 | 128M | })?; |
321 | | |
322 | 15.4k | match header { |
323 | 5.07k | None => { |
324 | 5.07k | frontmatter = events.into_iter().collect(); |
325 | 5.07k | } |
326 | 10.4k | Some(prev_header) => { |
327 | 10.4k | sections.push(parse::Section { |
328 | 10.4k | header: prev_header, |
329 | 10.4k | events: std::mem::take(&mut events), |
330 | 10.4k | }); |
331 | 10.4k | } |
332 | | } |
333 | 15.4k | Ok(Events { frontmatter, sections }) |
334 | 17.4k | } |