From c244975a0ac7059c5dd6237c23b7a8d2a208f362 Mon Sep 17 00:00:00 2001
From: Edward Shen <code@eddie.sh>
Date: Fri, 19 Feb 2021 19:18:02 -0500
Subject: [PATCH] parser is now perfect

---
 src/config.rs                     |   2 +
 src/parser.rs                     | 267 ++++++++++++++++++++----------
 tests/parser_integration_tests.rs | 120 ++++++++++++--
 3 files changed, 289 insertions(+), 100 deletions(-)

diff --git a/src/config.rs b/src/config.rs
index 7ff8ca8..f16268f 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -42,6 +42,7 @@ impl<'a> GitConfig<'a> {
                 Event::Comment(_) => (),
                 Event::SectionHeader(ParsedSectionHeader {
                     name,
+                    separator: _,
                     subsection_name,
                 }) => {
                     current_section_name = name;
@@ -112,6 +113,7 @@ impl<'a> GitConfig<'a> {
                         options.on_duplicate_name,
                     )?;
                 }
+                Event::Whitespace(_) => (),
             }
         }
 
diff --git a/src/parser.rs b/src/parser.rs
index 76408f7..07cfc55 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -8,7 +8,7 @@
 //! additional methods for accessing leading comments or events by section.
 
 use crate::values::{Boolean, TrueVariant, Value};
-use nom::bytes::complete::{escaped, tag, take_till, take_while};
+use nom::bytes::complete::{escaped, tag, take_till, take_until, take_while};
 use nom::character::complete::{char, none_of, one_of};
 use nom::character::{is_newline, is_space};
 use nom::combinator::{map, opt};
@@ -37,6 +37,7 @@ pub enum Event<'a> {
     ValueNotDone(&'a str),
     /// The last line of a value which was continued onto another line.
     ValueDone(&'a str),
+    Whitespace(&'a str),
 }
 
 #[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Default)]
@@ -49,6 +50,13 @@ pub struct ParsedSection<'a> {
 #[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Default)]
 pub struct ParsedSectionHeader<'a> {
     pub name: &'a str,
+    /// The separator used to determine if the section contains a subsection.
+    /// This is either a period `.` or a string of whitespace. Note that
+    /// reconstruction of subsection format is dependent on this value. If this
+    /// is all whitespace, then the subsection name needs to be surrounded by
+    /// quotes to have perfect reconstruction.
+    pub separator: Option<&'a str>,
+    /// The subsection name without quotes if any exist.
     pub subsection_name: Option<&'a str>,
 }
 
@@ -74,11 +82,10 @@ impl<'a> From<nom::Err<NomError<&'a str>>> for ParserError<'a> {
 
 /// A zero-copy `git-config` file parser.
 ///
-/// # Non-perfect parser
-///
-/// This parser should successfully parse all sections and comments. However,
-/// It will not parse whitespace. This attempts to closely follow the
-/// non-normative specification found in [`git`'s documentation].
+/// This parser is considered a perfect parser, where a `git-config` file
+/// can be identically reconstructed from the events emitted from this parser.
+/// Events emitted from this parser are bound to the lifetime of the provided
+/// `str` as this parser performs no copies from the input.
 ///
 /// # Differences between a `.ini` parser
 ///
@@ -222,7 +229,6 @@ pub fn parse_from_str(input: &str) -> Result<Parser<'_>, ParserError> {
 }
 
 fn comment<'a>(i: &'a str) -> IResult<&'a str, ParsedComment<'a>> {
-    let i = i.trim_start();
     let (i, comment_tag) = one_of(";#")(i)?;
     let (i, comment) = take_till(is_char_newline)(i)?;
     Ok((
@@ -235,9 +241,10 @@ fn comment<'a>(i: &'a str) -> IResult<&'a str, ParsedComment<'a>> {
 }
 
 fn section<'a>(i: &'a str) -> IResult<&'a str, ParsedSection<'a>> {
-    let i = i.trim_start();
     let (i, section_header) = section_header(i)?;
-    let (i, items) = many1(alt((
+    let (i, items) = many0(alt((
+        map(take_spaces, |space| vec![Event::Whitespace(space)]),
+        map(take_newline, |newline| vec![Event::Newline(newline)]),
         map(section_body, |(key, values)| {
             let mut vec = vec![Event::Key(key)];
             vec.extend(values);
@@ -265,10 +272,12 @@ fn section_header<'a>(i: &'a str) -> IResult<&'a str, ParsedSectionHeader<'a>> {
         let header = match name.rfind('.') {
             Some(index) => ParsedSectionHeader {
                 name: &name[..index],
-                subsection_name: Some(&name[index + 1..]),
+                separator: name.get(index..index + 1),
+                subsection_name: name.get(index + 1..),
             },
             None => ParsedSectionHeader {
                 name: name,
+                separator: None,
                 subsection_name: None,
             },
         };
@@ -278,7 +287,7 @@ fn section_header<'a>(i: &'a str) -> IResult<&'a str, ParsedSectionHeader<'a>> {
 
     // Section header must be using modern subsection syntax at this point.
 
-    let (i, _) = take_spaces(i)?;
+    let (i, whitespace) = take_spaces(i)?;
 
     let (i, subsection_name) = delimited(
         char('"'),
@@ -290,6 +299,7 @@ fn section_header<'a>(i: &'a str) -> IResult<&'a str, ParsedSectionHeader<'a>> {
         i,
         ParsedSectionHeader {
             name: name,
+            separator: Some(whitespace),
             // We know that there's some section name here, so if we get an
             // empty vec here then we actually parsed an empty section name.
             subsection_name: subsection_name.or(Some("")),
@@ -297,18 +307,18 @@ fn section_header<'a>(i: &'a str) -> IResult<&'a str, ParsedSectionHeader<'a>> {
     ))
 }
 
-fn take_spaces<'a>(i: &'a str) -> IResult<&'a str, &'a str> {
-    take_while(|c: char| c.is_ascii() && is_space(c as u8))(i)
-}
-
 fn section_body<'a>(i: &'a str) -> IResult<&'a str, (&'a str, Vec<Event<'a>>)> {
-    let i = i.trim_start();
     // maybe need to check for [ here
     let (i, name) = config_name(i)?;
-    let (i, _) = take_spaces(i)?;
+    let (i, whitespace) = opt(take_spaces)(i)?;
     let (i, value) = config_value(i)?;
-
-    Ok((i, (name, value)))
+    if let Some(whitespace) = whitespace {
+        let mut events = vec![Event::Whitespace(whitespace)];
+        events.extend(value);
+        Ok((i, (name, events)))
+    } else {
+        Ok((i, (name, value)))
+    }
 }
 
 /// Parses the config name of a config pair. Assumes the input has already been
@@ -333,8 +343,15 @@ fn config_name<'a>(i: &'a str) -> IResult<&'a str, &'a str> {
 
 fn config_value<'a>(i: &'a str) -> IResult<&'a str, Vec<Event<'a>>> {
     if let (i, Some(_)) = opt(char('='))(i)? {
-        let (i, _) = take_spaces(i)?;
-        value_impl(i)
+        let (i, whitespace) = opt(take_spaces)(i)?;
+        let (i, values) = value_impl(i)?;
+        if let Some(whitespace) = whitespace {
+            let mut events = vec![Event::Whitespace(whitespace)];
+            events.extend(values);
+            Ok((i, events))
+        } else {
+            Ok((i, values))
+        }
     } else {
         Ok((
             i,
@@ -418,20 +435,49 @@ fn value_impl<'a>(i: &'a str) -> IResult<&'a str, Vec<Event<'a>>> {
         }));
     }
 
-    let remainder_value = &i[offset..parsed_index].trim_end();
+    let (i, remainder_value) = {
+        // Scan trailing whitespace bytewise, ASCII only: casting a UTF-8
+        // continuation byte such as 0xA0 to `char` reads as whitespace and
+        // would put the split inside a multi-byte character (slice panic).
+        let new_index = i.as_bytes()[offset..parsed_index]
+            .iter()
+            .rposition(|b| !(b.is_ascii() && (*b as char).is_whitespace()))
+            .map_or(parsed_index, |index| offset + index + 1);
+        (&i[new_index..], &i[offset..new_index])
+    };
+
     if partial_value_found {
         events.push(Event::ValueDone(remainder_value));
     } else {
         events.push(Event::Value(Value::from_str(remainder_value)));
     }
 
-    Ok((&i[parsed_index..], events))
+    Ok((i, events))
 }
 
 fn is_char_newline(c: char) -> bool {
     c.is_ascii() && is_newline(c as u8)
 }
 
+/// Consumes a non-empty run of characters accepted by nom's `is_space`.
+fn take_spaces<'a>(i: &'a str) -> IResult<&'a str, &'a str> {
+    take_common(i, |c: char| c.is_ascii() && is_space(c as u8))
+}
+
+/// Consumes a non-empty run of characters accepted by `is_char_newline`.
+fn take_newline<'a>(i: &'a str) -> IResult<&'a str, &'a str> {
+    take_common(i, is_char_newline)
+}
+
+/// `take_while(f)` that fails with `ErrorKind::Eof` when nothing matches.
+fn take_common<'a, F: Fn(char) -> bool>(i: &'a str, f: F) -> IResult<&'a str, &'a str> {
+    let (rest, matched) = take_while(f)(i)?;
+    if !matched.is_empty() {
+        return Ok((rest, matched));
+    }
+    let code = ErrorKind::Eof;
+    Err(nom::Err::Error(NomError { input: rest, code }))
+}
 #[cfg(test)]
 mod parse {
     use super::*;
@@ -440,6 +486,25 @@ mod parse {
         ("", t)
     }
 
+    /// Builds the `ParsedSectionHeader` expected for a section called `name`.
+    ///
+    /// `subsection` is an optional `(separator, subsection_name)` pair; when it
+    /// converts to `None`, both header fields are left as `None`.
+    fn gen_section_header(
+        name: &str,
+        subsection: impl Into<Option<(&'static str, &'static str)>>,
+    ) -> ParsedSectionHeader<'_> {
+        let (separator, subsection_name) = match subsection.into() {
+            Some((separator, subsection_name)) => (Some(separator), Some(subsection_name)),
+            None => (None, None),
+        };
+        ParsedSectionHeader {
+            name,
+            separator,
+            subsection_name,
+        }
+    }
+
     mod comments {
         use super::super::*;
         use super::*;
@@ -486,10 +551,7 @@ mod parse {
         fn no_subsection() {
             assert_eq!(
                 section_header("[hello]").unwrap(),
-                fully_consumed(ParsedSectionHeader {
-                    name: "hello",
-                    subsection_name: None
-                })
+                fully_consumed(gen_section_header("hello", None)),
             );
         }
 
@@ -497,10 +559,7 @@ mod parse {
         fn modern_subsection() {
             assert_eq!(
                 section_header(r#"[hello "world"]"#).unwrap(),
-                fully_consumed(ParsedSectionHeader {
-                    name: "hello",
-                    subsection_name: Some("world")
-                })
+                fully_consumed(gen_section_header("hello", (" ", "world"))),
             );
         }
 
@@ -508,10 +567,7 @@ mod parse {
         fn escaped_subsection() {
             assert_eq!(
                 section_header(r#"[hello "foo\\bar\""]"#).unwrap(),
-                fully_consumed(ParsedSectionHeader {
-                    name: "hello",
-                    subsection_name: Some(r#"foo\\bar\""#)
-                })
+                fully_consumed(gen_section_header("hello", (" ", r#"foo\\bar\""#))),
             );
         }
 
@@ -519,10 +575,7 @@ mod parse {
         fn deprecated_subsection() {
             assert_eq!(
                 section_header(r#"[hello.world]"#).unwrap(),
-                fully_consumed(ParsedSectionHeader {
-                    name: "hello",
-                    subsection_name: Some("world")
-                })
+                fully_consumed(gen_section_header("hello", (".", "world")))
             );
         }
 
@@ -530,10 +583,7 @@ mod parse {
         fn empty_legacy_subsection_name() {
             assert_eq!(
                 section_header(r#"[hello.]"#).unwrap(),
-                fully_consumed(ParsedSectionHeader {
-                    name: "hello",
-                    subsection_name: Some("")
-                })
+                fully_consumed(gen_section_header("hello", (".", "")))
             );
         }
 
@@ -541,10 +591,7 @@ mod parse {
         fn empty_modern_subsection_name() {
             assert_eq!(
                 section_header(r#"[hello ""]"#).unwrap(),
-                fully_consumed(ParsedSectionHeader {
-                    name: "hello",
-                    subsection_name: Some("")
-                })
+                fully_consumed(gen_section_header("hello", (" ", "")))
             );
         }
 
@@ -562,10 +609,7 @@ mod parse {
         fn right_brace_in_subsection_name() {
             assert_eq!(
                 section_header(r#"[hello "]"]"#).unwrap(),
-                fully_consumed(ParsedSectionHeader {
-                    name: "hello",
-                    subsection_name: Some("]")
-                })
+                fully_consumed(gen_section_header("hello", (" ", "]")))
             );
         }
     }
@@ -611,14 +655,6 @@ mod parse {
             )
         }
 
-        #[test]
-        fn no_comment_is_trimmed() {
-            assert_eq!(
-                value_impl("hello").unwrap(),
-                value_impl("hello               ").unwrap()
-            );
-        }
-
         #[test]
         fn semicolon_comment_not_consumed() {
             assert_eq!(
@@ -636,14 +672,31 @@ mod parse {
         }
 
         #[test]
-        fn values_with_comments_are_trimmed() {
+        fn values_with_extraneous_whitespace_without_comment() {
+            assert_eq!(
+                value_impl("hello               ").unwrap(),
+                (
+                    "               ",
+                    vec![Event::Value(Value::from_str("hello"))]
+                )
+            );
+        }
+
+        #[test]
+        fn values_with_extraneous_whitespace_before_comment() {
             assert_eq!(
-                value_impl("hello#world").unwrap(),
                 value_impl("hello             #world").unwrap(),
+                (
+                    "             #world",
+                    vec![Event::Value(Value::from_str("hello")),]
+                )
             );
             assert_eq!(
-                value_impl("hello;world").unwrap(),
                 value_impl("hello             ;world").unwrap(),
+                (
+                    "             ;world",
+                    vec![Event::Value(Value::from_str("hello")),]
+                )
             );
         }
 
@@ -695,7 +748,7 @@ mod parse {
             assert_eq!(
                 value_impl("1    \"\\\"\\\na ; e \"\\\"\\\nd # \"b\t ; c").unwrap(),
                 (
-                    "# \"b\t ; c",
+                    " # \"b\t ; c",
                     vec![
                         Event::ValueNotDone(r#"1    "\""#),
                         Event::Newline("\n"),
@@ -727,6 +780,17 @@ mod parse {
         use super::super::*;
         use super::*;
 
+        #[test]
+        fn empty_section() {
+            assert_eq!(
+                section("[test]").unwrap(),
+                fully_consumed(ParsedSection {
+                    section_header: gen_section_header("test", None),
+                    events: vec![]
+                })
+            );
+        }
+
         #[test]
         fn simple_section() {
             let section_data = r#"[hello]
@@ -736,16 +800,23 @@ mod parse {
             assert_eq!(
                 section(section_data).unwrap(),
                 fully_consumed(ParsedSection {
-                    section_header: ParsedSectionHeader {
-                        name: "hello",
-                        subsection_name: None,
-                    },
+                    section_header: gen_section_header("hello", None),
                     events: vec![
+                        Event::Newline("\n"),
+                        Event::Whitespace("            "),
                         Event::Key("a"),
+                        Event::Whitespace(" "),
+                        Event::Whitespace(" "),
                         Event::Value(Value::from_str("b")),
+                        Event::Newline("\n"),
+                        Event::Whitespace("            "),
                         Event::Key("c"),
                         Event::Value(Value::Boolean(Boolean::True(TrueVariant::Implicit))),
+                        Event::Newline("\n"),
+                        Event::Whitespace("            "),
                         Event::Key("d"),
+                        Event::Whitespace(" "),
+                        Event::Whitespace(" "),
                         Event::Value(Value::from_str("\"lol\""))
                     ]
                 })
@@ -757,11 +828,9 @@ mod parse {
             assert_eq!(
                 section("[hello] c").unwrap(),
                 fully_consumed(ParsedSection {
-                    section_header: ParsedSectionHeader {
-                        name: "hello",
-                        subsection_name: None,
-                    },
+                    section_header: gen_section_header("hello", None),
                     events: vec![
+                        Event::Whitespace(" "),
                         Event::Key("c"),
                         Event::Value(Value::Boolean(Boolean::True(TrueVariant::Implicit)))
                     ]
@@ -779,30 +848,41 @@ mod parse {
             assert_eq!(
                 section(section_data).unwrap(),
                 fully_consumed(ParsedSection {
-                    section_header: ParsedSectionHeader {
-                        name: "hello",
-                        subsection_name: None,
-                    },
+                    section_header: gen_section_header("hello", None),
                     events: vec![
+                        Event::Whitespace(" "),
                         Event::Comment(ParsedComment {
                             comment_tag: ';',
                             comment: " commentA",
                         }),
+                        Event::Newline("\n"),
+                        Event::Whitespace("            "),
                         Event::Key("a"),
+                        Event::Whitespace(" "),
+                        Event::Whitespace(" "),
                         Event::Value(Value::from_str("b")),
+                        Event::Whitespace(" "),
                         Event::Comment(ParsedComment {
                             comment_tag: '#',
                             comment: " commentB",
                         }),
+                        Event::Newline("\n"),
+                        Event::Whitespace("            "),
                         Event::Comment(ParsedComment {
                             comment_tag: ';',
                             comment: " commentC",
                         }),
+                        Event::Newline("\n"),
+                        Event::Whitespace("            "),
                         Event::Comment(ParsedComment {
                             comment_tag: ';',
                             comment: " commentD",
                         }),
+                        Event::Newline("\n"),
+                        Event::Whitespace("            "),
                         Event::Key("c"),
+                        Event::Whitespace(" "),
+                        Event::Whitespace(" "),
                         Event::Value(Value::from_str("d")),
                     ]
                 })
@@ -815,17 +895,18 @@ mod parse {
             assert_eq!(
                 section("[section] a = 1    \"\\\"\\\na ; e \"\\\"\\\nd # \"b\t ; c").unwrap(),
                 fully_consumed(ParsedSection {
-                    section_header: ParsedSectionHeader {
-                        name: "section",
-                        subsection_name: None,
-                    },
+                    section_header: gen_section_header("section", None),
                     events: vec![
+                        Event::Whitespace(" "),
                         Event::Key("a"),
+                        Event::Whitespace(" "),
+                        Event::Whitespace(" "),
                         Event::ValueNotDone(r#"1    "\""#),
                         Event::Newline("\n"),
                         Event::ValueNotDone(r#"a ; e "\""#),
                         Event::Newline("\n"),
                         Event::ValueDone("d"),
+                        Event::Whitespace(" "),
                         Event::Comment(ParsedComment {
                             comment_tag: '#',
                             comment: " \"b\t ; c"
@@ -840,22 +921,40 @@ mod parse {
             assert_eq!(
                 section("[section \"a\"] b =\"\\\n;\";a").unwrap(),
                 fully_consumed(ParsedSection {
-                    section_header: ParsedSectionHeader {
-                        name: "section",
-                        subsection_name: Some("a")
-                    },
+                    section_header: gen_section_header("section", (" ", "a")),
                     events: vec![
+                        Event::Whitespace(" "),
                         Event::Key("b"),
+                        Event::Whitespace(" "),
                         Event::ValueNotDone("\""),
                         Event::Newline("\n"),
                         Event::ValueDone(";\""),
                         Event::Comment(ParsedComment {
+                            comment_tag: ';',
                             comment: "a",
-                            comment_tag: ';'
                         })
                     ]
                 })
             )
         }
+
+        #[test]
+        fn section_handles_extranous_whitespace_before_comment() {
+            assert_eq!(
+                section("[s]hello             #world").unwrap(),
+                fully_consumed(ParsedSection {
+                    section_header: gen_section_header("s", None),
+                    events: vec![
+                        Event::Key("hello"),
+                        Event::Whitespace("             "),
+                        Event::Value(Value::Boolean(Boolean::True(TrueVariant::Implicit))),
+                        Event::Comment(ParsedComment {
+                            comment_tag: '#',
+                            comment: "world",
+                        }),
+                    ]
+                })
+            );
+        }
     }
 }
diff --git a/tests/parser_integration_tests.rs b/tests/parser_integration_tests.rs
index 937ea13..c672c86 100644
--- a/tests/parser_integration_tests.rs
+++ b/tests/parser_integration_tests.rs
@@ -1,17 +1,30 @@
-use serde_git_config::parser::{parse_from_str, Event, ParsedSectionHeader, Parser};
+use serde_git_config::parser::{parse_from_str, Event, ParsedSectionHeader};
 use serde_git_config::values::Value;
 
 fn fully_consumed<T>(t: T) -> (&'static str, T) {
     ("", t)
 }
 
-fn section_header(name: &'static str, subname: impl Into<Option<&'static str>>) -> Event<'static> {
-    Event::SectionHeader(ParsedSectionHeader {
-        name,
-        subsection_name: subname.into(),
-    })
+/// Builds the `Event::SectionHeader` expected for a section called `name`.
+///
+/// `subsection` is an optional `(separator, subsection_name)` pair; when it
+/// converts to `None`, both header fields are left as `None`.
+fn gen_section_header(
+    name: &str,
+    subsection: impl Into<Option<(&'static str, &'static str)>>,
+) -> Event<'_> {
+    let (separator, subsection_name) = match subsection.into() {
+        Some((separator, subsection_name)) => (Some(separator), Some(subsection_name)),
+        None => (None, None),
+    };
+
+    let header = ParsedSectionHeader {
+        name,
+        separator,
+        subsection_name,
+    };
+    Event::SectionHeader(header)
+}
-
 fn name(name: &'static str) -> Event<'static> {
     Event::Key(name)
 }
@@ -20,7 +33,16 @@ fn value(value: &'static str) -> Event<'static> {
     Event::Value(Value::from_str(value))
 }
 
+fn newline() -> Event<'static> {
+    Event::Newline("\n")
+}
+
+fn whitespace(value: &'static str) -> Event<'static> {
+    Event::Whitespace(value)
+}
+
 #[test]
+#[rustfmt::skip]
 fn personal_config() {
     let config = r#"[user]
         email = code@eddie.sh
@@ -48,34 +70,100 @@ fn personal_config() {
             .into_iter()
             .collect::<Vec<_>>(),
         vec![
-            section_header("user", None),
+            gen_section_header("user", None),
+            newline(),
+
+            whitespace("        "),
             name("email"),
+            whitespace(" "),
+            whitespace(" "),
             value("code@eddie.sh"),
+            newline(),
+
+            whitespace("        "),
             name("name"),
+            whitespace(" "),
+            whitespace(" "),
             value("Edward Shen"),
-            section_header("core", None),
+            newline(),
+
+            gen_section_header("core", None),
+            newline(),
+
+            whitespace("        "),
             name("autocrlf"),
+            whitespace(" "),
+            whitespace(" "),
             value("input"),
-            section_header("push", None),
+            newline(),
+
+            gen_section_header("push", None),
+            newline(),
+
+            whitespace("        "),
             name("default"),
+            whitespace(" "),
+            whitespace(" "),
             value("simple"),
-            section_header("commit", None),
+            newline(),
+
+            gen_section_header("commit", None),
+            newline(),
+
+            whitespace("        "),
             name("gpgsign"),
+            whitespace(" "),
+            whitespace(" "),
             value("true"),
-            section_header("gpg", None),
+            newline(),
+
+            gen_section_header("gpg", None),
+            newline(),
+
+            whitespace("        "),
             name("program"),
+            whitespace(" "),
+            whitespace(" "),
             value("gpg"),
-            section_header("url", "ssh://git@github.com/"),
+            newline(),
+
+            gen_section_header("url", (" ", "ssh://git@github.com/")),
+            newline(),
+
+            whitespace("        "),
             name("insteadOf"),
+            whitespace(" "),
+            whitespace(" "),
             value("\"github://\""),
-            section_header("url", "ssh://git@git.eddie.sh/edward/"),
+            newline(),
+
+            gen_section_header("url", (" ", "ssh://git@git.eddie.sh/edward/")),
+            newline(),
+
+            whitespace("        "),
             name("insteadOf"),
+            whitespace(" "),
+            whitespace(" "),
             value("\"gitea://\""),
-            section_header("pull", None),
+            newline(),
+
+            gen_section_header("pull", None),
+            newline(),
+
+            whitespace("        "),
             name("ff"),
+            whitespace(" "),
+            whitespace(" "),
             value("only"),
-            section_header("init", None),
+            newline(),
+
+            gen_section_header("init", None),
+            newline(),
+
+            whitespace("        "),
             name("defaultBranch"),
+            whitespace(" "),
+            whitespace(" "),
             value("master"),
         ]
     );