fix: a single \r will be considered a blank line

2023-11-14 11:46:26 +08:00 · 2023-11-14 11:46:26 +08:00 · a269f2f258
commit a269f2f258
parent b6e86a128a
23 changed files with 101 additions and 305 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -13,9 +13,6 @@ exclude = ["/wasm", "/.github"]
 [package.metadata.docs.rs]
 all-features = true

-[badges]
-travis-ci = { repository = "PoiScript/orgize" }
-
 [features]
 default = []
 indexmap = ["dep:indexmap"]
@ -36,7 +33,6 @@ tracing = "0.1"

 [dev-dependencies]
 criterion = "0.4"
-pretty_assertions = "1.3"
 insta = "1.29"
 serde_json = "1.0"
 slugify = "0.1"
--- a/benches/parse.rs
+++ b/benches/parse.rs
@ -3,9 +3,9 @@ use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
 use orgize::Org;

 const INPUT: &[(&str, &str)] = &[
-    // ("org-syntax.org", include_str!("./org-syntax.org")),
    ("doc.org", include_str!("./doc.org")),
    ("org-faq.org", include_str!("./org-faq.org")),
+    ("org-syntax.org", include_str!("./org-syntax.org")),
 ];

 pub fn bench_parse(c: &mut Criterion) {
--- a/fuzz/fuzz_targets/fuzz_target_1.rs
+++ b/fuzz/fuzz_targets/fuzz_target_1.rs
@ -1,11 +1,7 @@
 #![no_main]

-use libfuzzer_sys::fuzz_target;
-use orgize::syntax::{HtmlHandler, Org};
-use std::str;
-
-fuzz_target!(|data: &[u8]| {
-    if let Ok(utf8) = str::from_utf8(data) {
-        let _ = Org::parse(utf8);
+libfuzzer_sys::fuzz_target!(|data: &[u8]| {
+    if let Ok(utf8) = std::str::from_utf8(data) {
+        let _ = orgize::Org::parse(utf8);
    }
 });
--- a/src/ast/mod.rs
+++ b/src/ast/mod.rs
@ -19,7 +19,7 @@ use rowan::{ast::AstNode, Language, NodeOrToken};

 pub fn blank_lines(parent: &SyntaxNode) -> usize {
    parent
-        .children()
+        .children_with_tokens()
        .filter(|n| n.kind() == SyntaxKind::BLANK_LINE)
        .count()
 }
--- a/src/syntax/block.rs
+++ b/src/syntax/block.rs
@ -168,10 +168,8 @@ r#"#+BEGIN_SRC
        TEXT@8..11 "SRC"
        TEXT@11..11 ""
        NEW_LINE@11..12 "\n"
-      BLANK_LINE@12..13
-        NEW_LINE@12..13 "\n"
-      BLANK_LINE@13..14
-        NEW_LINE@13..14 "\n"
+      BLANK_LINE@12..13 "\n"
+      BLANK_LINE@13..14 "\n"
      BLOCK_CONTENT@14..14
      BLOCK_END@14..27
        WHITESPACE@14..18 "    "
@ -222,10 +220,8 @@ alert('Hello World!');
        TEXT@54..60 "#+END_"
        TEXT@60..63 "SRC"
        NEW_LINE@63..64 "\n"
-      BLANK_LINE@64..65
-        NEW_LINE@64..65 "\n"
-      BLANK_LINE@65..69
-        WHITESPACE@65..69 "    "
+      BLANK_LINE@64..65 "\n"
+      BLANK_LINE@65..69 "    "
    "###
    );

--- a/src/syntax/clock.rs
+++ b/src/syntax/clock.rs
@ -130,8 +130,7 @@ fn parse() {
      COLON@59..60 ":"
      TEXT@60..62 "00"
      NEW_LINE@62..63 "\n"
-      BLANK_LINE@63..64
-        NEW_LINE@63..64 "\n"
+      BLANK_LINE@63..64 "\n"
    "###
    );
 }
--- a/src/syntax/combinator.rs
+++ b/src/syntax/combinator.rs
@ -1,13 +1,9 @@
 use std::iter::once;

-use memchr::{memchr, memchr_iter};
+use memchr::{memchr, memchr2_iter, memchr_iter};
 use nom::{
-    branch::alt,
-    bytes::complete::tag,
-    character::complete::{line_ending, space0},
-    combinator::eof,
-    sequence::tuple,
-    AsBytes, IResult, InputLength, InputTake, Parser,
+    bytes::complete::tag, character::complete::space0, AsBytes, IResult, InputLength, InputTake,
+    Parser,
 };
 use rowan::{GreenNode, GreenToken, Language, NodeOrToken};

@ -97,23 +93,30 @@ where

 /// Takes all blank lines
 pub fn blank_lines(input: Input) -> IResult<Input, Vec<GreenElement>, ()> {
-    let mut lines = vec![];
-    let mut i = input;
+    if input.is_empty() {
+        return Ok((input, vec![]));
+    }

-    while !i.is_empty() {
-        match tuple::<_, _, (), _>((space0, alt((line_ending, eof))))(i) {
-            Ok((input, (ws, nl))) => {
-                let mut b = NodeBuilder::new();
-                b.ws(ws);
-                b.nl(nl);
-                lines.push(b.finish(BLANK_LINE));
-                i = input;
-            }
-            _ => break,
+    let mut lines = vec![];
+    let mut start = 0;
+    let bytes = input.as_bytes();
+
+    for index in memchr2_iter(b'\r', b'\n', bytes)
+        .map(|i| i + 1)
+        .chain(once(bytes.len()))
+    {
+        if bytes.get(index - 1) == Some(&b'\r') && bytes.get(index) == Some(&b'\n') {
+            continue;
+        }
+        if start != index && bytes[start..index].iter().all(|b| b.is_ascii_whitespace()) {
+            lines.push(token(BLANK_LINE, &input.as_str()[start..index]));
+            start = index;
+        } else {
+            break;
        }
    }

-    Ok((i, lines))
+    Ok((input.take_split(start).0, lines))
 }

 #[test]
@ -124,6 +127,11 @@ fn test_blank_lines() {
    assert_eq!(input.as_str(), "");
    assert_eq!(output, vec![]);

+    let (input, output) = blank_lines(("\n", config).into()).unwrap();
+    assert_eq!(input.as_str(), "");
+    assert_eq!(output.len(), 1);
+    assert_eq!(output[0].to_string(), "\n");
+
    let (input, output) = blank_lines(("    t", config).into()).unwrap();
    assert_eq!(input.as_str(), "    t");
    assert_eq!(output, vec![]);
@ -138,13 +146,15 @@ fn test_blank_lines() {
    assert_eq!(output[4].to_string(), "  ");

    let (input, output) =
-        blank_lines(("  \r\n\n\t\t\r\n  \n   t\n  \r\n\n\t\t\r\n  \n", config).into()).unwrap();
-    assert_eq!(input.as_str(), "   t\n  \r\n\n\t\t\r\n  \n");
-    assert_eq!(output.len(), 4);
-    assert_eq!(output[0].to_string(), "  \r\n");
+        blank_lines(("\r\n\n\t\t\r\n  \n\r   \r   t\n  ", config).into()).unwrap();
+    assert_eq!(input.as_str(), "   t\n  ");
+    assert_eq!(output.len(), 6);
+    assert_eq!(output[0].to_string(), "\r\n");
    assert_eq!(output[1].to_string(), "\n");
    assert_eq!(output[2].to_string(), "\t\t\r\n");
    assert_eq!(output[3].to_string(), "  \n");
+    assert_eq!(output[4].to_string(), "\r");
+    assert_eq!(output[5].to_string(), "   \r");
 }

 /// Returns 1. anything before trailing whitespace, 2. whitespace itself, 3. line feeding
--- a/src/syntax/comment.rs
+++ b/src/syntax/comment.rs
@ -70,8 +70,7 @@ fn parse() {
        @r###"
    COMMENT@0..12
      TEXT@0..11 "#\n  # a\n #\n"
-      BLANK_LINE@11..12
-        NEW_LINE@11..12 "\n"
+      BLANK_LINE@11..12 "\n"
    "###
    );

--- a/src/syntax/document.rs
+++ b/src/syntax/document.rs
@ -53,13 +53,9 @@ fn parse() {
        to_document("\n  \n\n").syntax,
        @r###"
    DOCUMENT@0..5
-      BLANK_LINE@0..1
-        NEW_LINE@0..1 "\n"
-      BLANK_LINE@1..4
-        WHITESPACE@1..3 "  "
-        NEW_LINE@3..4 "\n"
-      BLANK_LINE@4..5
-        NEW_LINE@4..5 "\n"
+      BLANK_LINE@0..1 "\n"
+      BLANK_LINE@1..4 "  \n"
+      BLANK_LINE@4..5 "\n"
    "###
    );

@ -77,8 +73,7 @@ fn parse() {
        to_document("\n* section").syntax,
        @r###"
    DOCUMENT@0..10
-      BLANK_LINE@0..1
-        NEW_LINE@0..1 "\n"
+      BLANK_LINE@0..1 "\n"
      HEADLINE@1..10
        HEADLINE_STARS@1..2 "*"
        WHITESPACE@2..3 " "
@ -91,8 +86,7 @@ fn parse() {
        to_document("\n** heading 2\n* heading 1").syntax,
        @r###"
    DOCUMENT@0..25
-      BLANK_LINE@0..1
-        NEW_LINE@0..1 "\n"
+      BLANK_LINE@0..1 "\n"
      HEADLINE@1..14
        HEADLINE_STARS@1..3 "**"
        WHITESPACE@3..4 " "
--- a/src/syntax/drawer.rs
+++ b/src/syntax/drawer.rs
@ -179,8 +179,7 @@ fn parse() {
        TEXT@1..7 "DRAWER"
        COLON@7..8 ":"
        NEW_LINE@8..9 "\n"
-      BLANK_LINE@9..10
-        NEW_LINE@9..10 "\n"
+      BLANK_LINE@9..10 "\n"
      TEXT@10..10 ""
      DRAWER_END@10..18
        WHITESPACE@10..12 "  "
@ -188,8 +187,7 @@ fn parse() {
        TEXT@13..16 "END"
        COLON@16..17 ":"
        NEW_LINE@17..18 "\n"
-      BLANK_LINE@18..19
-        NEW_LINE@18..19 "\n"
+      BLANK_LINE@18..19 "\n"
    "###
    );

--- a/src/syntax/dyn_block.rs
+++ b/src/syntax/dyn_block.rs
@ -99,14 +99,12 @@ CONTENTS
        TEXT@9..19 "clocktable"
        TEXT@19..31 " :scope file"
        NEW_LINE@31..32 "\n"
-      BLANK_LINE@32..33
-        NEW_LINE@32..33 "\n"
+      BLANK_LINE@32..33 "\n"
      TEXT@33..42 "CONTENTS\n"
      DYN_BLOCK_END@42..49
        TEXT@42..48 "#+END:"
        NEW_LINE@48..49 "\n"
-      BLANK_LINE@49..53
-        WHITESPACE@49..53 "    "
+      BLANK_LINE@49..53 "    "
    "###
    );
 }
--- a/src/syntax/element.rs
+++ b/src/syntax/element.rs
@ -208,8 +208,7 @@ b"#),
    SECTION@0..4
      PARAGRAPH@0..3
        TEXT@0..2 "a\n"
-        BLANK_LINE@2..3
-          NEW_LINE@2..3 "\n"
+        BLANK_LINE@2..3 "\n"
      PARAGRAPH@3..4
        TEXT@3..4 "b"
    "###
--- a/src/syntax/fixed_width.rs
+++ b/src/syntax/fixed_width.rs
@ -55,10 +55,8 @@ fn parse() {
        @r###"
    FIXED_WIDTH@0..19
      TEXT@0..14 ": A\n:\n: B\n: C\n"
-      BLANK_LINE@14..15
-        NEW_LINE@14..15 "\n"
-      BLANK_LINE@15..19
-        WHITESPACE@15..19 "    "
+      BLANK_LINE@14..15 "\n"
+      BLANK_LINE@15..19 "    "
    "###
    );
 }
--- a/src/syntax/keyword.rs
+++ b/src/syntax/keyword.rs
@ -84,7 +84,6 @@ pub fn affiliated_keyword_nodes(input: Input) -> IResult<Input, Vec<GreenElement
        // find the first text token in children
        let Some(NodeOrToken::Token(token)) = node
            .children()
-            .into_iter()
            .find(|t| t.kind() == SyntaxKind::TEXT.into())
        else {
            return Err(nom::Err::Error(()));
--- a/src/syntax/list.rs
+++ b/src/syntax/list.rs
@ -328,8 +328,7 @@ fn parse() {
        LIST_ITEM_CONTENT@7..11
          PARAGRAPH@7..10
            TEXT@7..10 "1\n\n"
-          BLANK_LINE@10..11
-            NEW_LINE@10..11 "\n"
+          BLANK_LINE@10..11 "\n"
      LIST_ITEM@11..14
        LIST_ITEM_INDENT@11..11 ""
        LIST_ITEM_BULLET@11..13 "+ "
--- a/src/syntax/mod.rs
+++ b/src/syntax/mod.rs
@ -99,10 +99,10 @@ pub enum SyntaxKind {
    DOUBLE_ARROW, // '=>'
    PIPE,         // '|'
    COMMA,        // ','
-    TEXT,
+    NEW_LINE,     // '\n' or '\r\n' or '\r'
+    WHITESPACE,   // ' ' or '\t'
    BLANK_LINE,
-    WHITESPACE,
-    NEW_LINE,
+    TEXT,

    DOCUMENT,
    SECTION,
--- a/src/syntax/paragraph.rs
+++ b/src/syntax/paragraph.rs
@ -65,8 +65,7 @@ fn parse() {
        @r###"
    PARAGRAPH@0..6
      TEXT@0..2 "a\n"
-      BLANK_LINE@2..6
-        WHITESPACE@2..6 "    "
+      BLANK_LINE@2..6 "    "
    "###
    );

@ -89,8 +88,7 @@ c
        @r###"
    PARAGRAPH@0..3
      TEXT@0..2 "a\n"
-      BLANK_LINE@2..3
-        NEW_LINE@2..3 "\n"
+      BLANK_LINE@2..3 "\n"
    "###
    );
 }
--- a/src/syntax/rule.rs
+++ b/src/syntax/rule.rs
@ -62,10 +62,8 @@ fn parse() {
    RULE@0..8
      TEXT@0..5 "-----"
      NEW_LINE@5..6 "\n"
-      BLANK_LINE@6..7
-        NEW_LINE@6..7 "\n"
-      BLANK_LINE@7..8
-        NEW_LINE@7..8 "\n"
+      BLANK_LINE@6..7 "\n"
+      BLANK_LINE@7..8 "\n"
    "###
    );

--- a/src/syntax/table.rs
+++ b/src/syntax/table.rs
@ -195,10 +195,8 @@ fn parse_table_el() {
        @r###"
    TABLE_EL@0..37
      TEXT@0..32 "  +---+\n      |   |\n  ..."
-      BLANK_LINE@32..33
-        NEW_LINE@32..33 "\n"
-      BLANK_LINE@33..37
-        WHITESPACE@33..37 "    "
+      BLANK_LINE@32..33 "\n"
+      BLANK_LINE@33..37 "    "
    "###
    );

--- a/tests/issue_10.rs
+++ b/tests/issue_10.rs
@ -1,18 +0,0 @@
-use orgize::Org;
-
-#[test]
-fn can_handle_empty_emphasis() {
-    let cases = &[
-        "* / // a",
-        "\"* / // a\"",
-        "* * ** a",
-        "* 2020\n** December\n*** Experiment\nType A is marked with * and type B is marked with **.\n",
-        "* 2020\n:DRAWER:\n* ** a\n:END:",
-        "* * ** :a:",
-        "* * ** "
-    ];
-
-    for case in cases {
-        let _ = Org::parse(case);
-    }
-}
--- a/tests/issue_11.rs
+++ b/tests/issue_11.rs
@ -1,21 +0,0 @@
-use orgize::Org;
-
-#[test]
-fn can_handle_empty_list_item() {
-    let cases = &[
-        "0. ",
-        "* \n0. ",
-        " * ",
-        " 0. ",
-        "\t* ",
-        "- ",
-        "- hello\n- ",
-        "- \n- hello",
-        "- hello\n- \n- world",
-        "* world\n- ",
-    ];
-
-    for case in cases {
-        let _ = Org::parse(case);
-    }
-}
--- a/tests/issue_22.rs
+++ b/tests/issue_22.rs
@ -1,8 +0,0 @@
-use orgize::{ast::Paragraph, rowan::ast::AstNode, Org};
-
-#[test]
-fn whitespaces() {
-    let case = "\u{000b}\u{0085}\u{00a0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200a}\u{2028}\u{2029}\u{202f}\u{205f}\u{3000}";
-    let paragraph = Org::parse(case).first_node::<Paragraph>().unwrap();
-    assert_eq!(&paragraph.syntax().to_string(), case);
-}
--- a/tests/parse.rs
+++ b/tests/parse.rs
@ -1,168 +1,36 @@
-use orgize::Org;
-use pretty_assertions::assert_eq;
+const INPUT: &[&str] = &[
+    // issue 10
+    "* / // a",
+    "\"* / // a\"",
+    "* * ** a",
+    "* 2020\n** December\n*** Experiment\nType A is marked with * and type B is marked with **.\n",
+    "* 2020\n:DRAWER:\n* ** a\n:END:",
+    "* * ** :a:",
+    "* * ** ",
+    // issue 11
+    "0. ",
+    "* \n0. ",
+    " * ",
+    " 0. ",
+    "\t* ",
+    "- ",
+    "- hello\n- ",
+    "- \n- hello",
+    "- hello\n- \n- world",
+    "* world\n- ",
+    // issue 22
+    "\u{000b}\u{0085}\u{00a0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200a}\u{2028}\u{2029}\u{202f}\u{205f}\u{3000}",
+    // fuzz test
+    "___\n",
+    "\n\n\n",
+    "\n\n\n",
+    "\n*",
+    "\r-"
+];

-macro_rules! test_suite {
-    ($name:ident, $content:expr, $expected:expr) => {
-        #[test]
-        fn $name() {
-            assert_eq!(Org::parse($content).to_html(), $expected);
-        }
-    };
+#[test]
+fn parse() {
+    for input in INPUT {
+        let _ = orgize::Org::parse(input);
+    }
 }
-
-test_suite!(
-    emphasis,
-    "*bold*, /italic/,\n_underlined_, =verbatim= and ~code~",
-    "<main><section><p><b>bold</b>, <i>italic</i>,\n<u>underlined</u>, \
-     <code>verbatim</code> and <code>code</code></p></section></main>"
-);
-
-test_suite!(
-    link,
-    "Visit[[http://example.com][link1]]or[[http://example.com][link1]].",
-    r#"<main><section><p>Visit<a href="http://example.com">link1</a>or<a href="http://example.com">link1</a>.</p></section></main>"#
-);
-
-test_suite!(
-    section_and_headline,
-    r#"
-* title 1
-section 1
-** title 2
-section 2
-* title 3
-section 3
-* title 4
-section 4
-"#,
-    "<main><h1>title 1</h1><section><p>section 1</p></section>\
-     <h2>title 2</h2><section><p>section 2</p></section>\
-     <h1>title 3</h1><section><p>section 3</p></section>\
-     <h1>title 4</h1><section><p>section 4</p></section></main>"
-);
-
-test_suite!(
-    list,
-    r#"
-+ 1
-
-+ 2
-
-  - 3
-
-  - 4
-
-+ 5
-"#,
-    "<main><section><ul>\
-     <li><p>1</p></li>\
-     <li><p>2</p><ul><li><p>3</p></li><li><p>4</p></li></ul></li>\
-     <li><p>5</p></li>\
-     </ul></section></main>"
-);
-
-test_suite!(
-    snippet,
-    "@@html:<del>@@delete this@@html:</del>@@",
-    "<main><section><p><del>delete this</del></p></section></main>"
-);
-
-test_suite!(
-    paragraphs,
-    r#"
-* title
-
-paragraph 1
-
-paragraph 2
-
-paragraph 3
-
-paragraph 4
-"#,
-    "<main><h1>title</h1><section>\
-     <p>paragraph 1</p><p>paragraph 2</p>\
-     <p>paragraph 3</p><p>paragraph 4</p>\
-     </section></main>"
-);
-
-test_suite!(
-    table1,
-    r#"
-|-----+-----+-----|
-|   0 |   1 |   2 |
-|-----+-----+-----|
-|   4 |   5 |   6 |
-"#,
-    "<main><section><table>\
-     <thead><tr><th>0</th><th>1</th><th>2</th></tr></thead>\
-     <tbody><tr><td>4</td><td>5</td><td>6</td></tr></tbody>\
-     </table></section></main>"
-);
-
-test_suite!(
-    table2,
-    r#"
-|-----+-----+-----|
-|   0 |   1 |   2 |
-|   4 |   5 |   6 |
-|-----+-----+-----|
-"#,
-    "<main><section><table>\
-     <tbody><tr><td>0</td><td>1</td><td>2</td></tr>\
-     <tr><td>4</td><td>5</td><td>6</td></tr></tbody>\
-     </table></section></main>"
-);
-
-test_suite!(
-    table3,
-    r#"
-|-----+-----+-----|
-|-----+-----+-----|
-|   0 |   1 |   2 |
-|   4 |   5 |   6 |
-"#,
-    "<main><section><table><thead></thead>\
-     <tbody><tr><td>0</td><td>1</td><td>2</td></tr>\
-     <tr><td>4</td><td>5</td><td>6</td></tr></tbody>\
-     </table></section></main>"
-);
-
-test_suite!(
-    table4,
-    r#"
-|   0 |   1 |   2 |
-|   4 |   5 |   6 |
-|-----+-----+-----|
-|-----+-----+-----|
-"#,
-    "<main><section><table>\
-     <thead><tr><th>0</th><th>1</th><th>2</th></tr>\
-     <tr><th>4</th><th>5</th><th>6</th></tr></thead>\
-     <tbody></tbody></table></section></main>"
-);
-
-test_suite!(
-    table5,
-    r#"
-|-----+-----+-----|
-|-----+-----+-----|
-"#,
-    "<main><section><table><tbody></tbody></table></section></main>"
-);
-
-test_suite!(
-    table6,
-    r#"
-|
-|-
-|
-|-
-|
-"#,
-    "<main><section><table>\
-     <thead><tr></tr></thead>\
-     <tbody><tr></tr></tbody>\
-     <tbody><tr></tr></tbody>\
-     </table></section></main>"
-);