fix: a single \r will be considered a blank line

2023-11-14 11:46:26 +08:00 · 2023-11-14 11:46:26 +08:00 · a269f2f258
commit a269f2f258
parent b6e86a128a
23 changed files with 101 additions and 305 deletions
--- a/src/ast/mod.rs
+++ b/src/ast/mod.rs
@ -19,7 +19,7 @@ use rowan::{ast::AstNode, Language, NodeOrToken};

 pub fn blank_lines(parent: &SyntaxNode) -> usize {
    parent
-        .children()
+        .children_with_tokens()
        .filter(|n| n.kind() == SyntaxKind::BLANK_LINE)
        .count()
 }
--- a/src/syntax/block.rs
+++ b/src/syntax/block.rs
@ -168,10 +168,8 @@ r#"#+BEGIN_SRC
        TEXT@8..11 "SRC"
        TEXT@11..11 ""
        NEW_LINE@11..12 "\n"
-      BLANK_LINE@12..13
-        NEW_LINE@12..13 "\n"
-      BLANK_LINE@13..14
-        NEW_LINE@13..14 "\n"
+      BLANK_LINE@12..13 "\n"
+      BLANK_LINE@13..14 "\n"
      BLOCK_CONTENT@14..14
      BLOCK_END@14..27
        WHITESPACE@14..18 "    "
@ -222,10 +220,8 @@ alert('Hello World!');
        TEXT@54..60 "#+END_"
        TEXT@60..63 "SRC"
        NEW_LINE@63..64 "\n"
-      BLANK_LINE@64..65
-        NEW_LINE@64..65 "\n"
-      BLANK_LINE@65..69
-        WHITESPACE@65..69 "    "
+      BLANK_LINE@64..65 "\n"
+      BLANK_LINE@65..69 "    "
    "###
    );

--- a/src/syntax/clock.rs
+++ b/src/syntax/clock.rs
@ -130,8 +130,7 @@ fn parse() {
      COLON@59..60 ":"
      TEXT@60..62 "00"
      NEW_LINE@62..63 "\n"
-      BLANK_LINE@63..64
-        NEW_LINE@63..64 "\n"
+      BLANK_LINE@63..64 "\n"
    "###
    );
 }
--- a/src/syntax/combinator.rs
+++ b/src/syntax/combinator.rs
@ -1,13 +1,9 @@
 use std::iter::once;

-use memchr::{memchr, memchr_iter};
+use memchr::{memchr, memchr2_iter, memchr_iter};
 use nom::{
-    branch::alt,
-    bytes::complete::tag,
-    character::complete::{line_ending, space0},
-    combinator::eof,
-    sequence::tuple,
-    AsBytes, IResult, InputLength, InputTake, Parser,
+    bytes::complete::tag, character::complete::space0, AsBytes, IResult, InputLength, InputTake,
+    Parser,
 };
 use rowan::{GreenNode, GreenToken, Language, NodeOrToken};

@ -97,23 +93,30 @@ where

 /// Takes all blank lines
 pub fn blank_lines(input: Input) -> IResult<Input, Vec<GreenElement>, ()> {
-    let mut lines = vec![];
-    let mut i = input;
+    if input.is_empty() {
+        return Ok((input, vec![]));
+    }

-    while !i.is_empty() {
-        match tuple::<_, _, (), _>((space0, alt((line_ending, eof))))(i) {
-            Ok((input, (ws, nl))) => {
-                let mut b = NodeBuilder::new();
-                b.ws(ws);
-                b.nl(nl);
-                lines.push(b.finish(BLANK_LINE));
-                i = input;
-            }
-            _ => break,
+    let mut lines = vec![];
+    let mut start = 0;
+    let bytes = input.as_bytes();
+
+    for index in memchr2_iter(b'\r', b'\n', bytes)
+        .map(|i| i + 1)
+        .chain(once(bytes.len()))
+    {
+        if bytes.get(index - 1) == Some(&b'\r') && bytes.get(index) == Some(&b'\n') {
+            continue;
+        }
+        if start != index && bytes[start..index].iter().all(|b| b.is_ascii_whitespace()) {
+            lines.push(token(BLANK_LINE, &input.as_str()[start..index]));
+            start = index;
+        } else {
+            break;
        }
    }

-    Ok((i, lines))
+    Ok((input.take_split(start).0, lines))
 }

 #[test]
@ -124,6 +127,11 @@ fn test_blank_lines() {
    assert_eq!(input.as_str(), "");
    assert_eq!(output, vec![]);

+    let (input, output) = blank_lines(("\n", config).into()).unwrap();
+    assert_eq!(input.as_str(), "");
+    assert_eq!(output.len(), 1);
+    assert_eq!(output[0].to_string(), "\n");
+
    let (input, output) = blank_lines(("    t", config).into()).unwrap();
    assert_eq!(input.as_str(), "    t");
    assert_eq!(output, vec![]);
@ -138,13 +146,15 @@ fn test_blank_lines() {
    assert_eq!(output[4].to_string(), "  ");

    let (input, output) =
-        blank_lines(("  \r\n\n\t\t\r\n  \n   t\n  \r\n\n\t\t\r\n  \n", config).into()).unwrap();
-    assert_eq!(input.as_str(), "   t\n  \r\n\n\t\t\r\n  \n");
-    assert_eq!(output.len(), 4);
-    assert_eq!(output[0].to_string(), "  \r\n");
+        blank_lines(("\r\n\n\t\t\r\n  \n\r   \r   t\n  ", config).into()).unwrap();
+    assert_eq!(input.as_str(), "   t\n  ");
+    assert_eq!(output.len(), 6);
+    assert_eq!(output[0].to_string(), "\r\n");
    assert_eq!(output[1].to_string(), "\n");
    assert_eq!(output[2].to_string(), "\t\t\r\n");
    assert_eq!(output[3].to_string(), "  \n");
+    assert_eq!(output[4].to_string(), "\r");
+    assert_eq!(output[5].to_string(), "   \r");
 }

 /// Returns 1. anything before trailing whitespace, 2. whitespace itself, 3. line feeding
--- a/src/syntax/comment.rs
+++ b/src/syntax/comment.rs
@ -70,8 +70,7 @@ fn parse() {
        @r###"
    COMMENT@0..12
      TEXT@0..11 "#\n  # a\n #\n"
-      BLANK_LINE@11..12
-        NEW_LINE@11..12 "\n"
+      BLANK_LINE@11..12 "\n"
    "###
    );

--- a/src/syntax/document.rs
+++ b/src/syntax/document.rs
@ -53,13 +53,9 @@ fn parse() {
        to_document("\n  \n\n").syntax,
        @r###"
    DOCUMENT@0..5
-      BLANK_LINE@0..1
-        NEW_LINE@0..1 "\n"
-      BLANK_LINE@1..4
-        WHITESPACE@1..3 "  "
-        NEW_LINE@3..4 "\n"
-      BLANK_LINE@4..5
-        NEW_LINE@4..5 "\n"
+      BLANK_LINE@0..1 "\n"
+      BLANK_LINE@1..4 "  \n"
+      BLANK_LINE@4..5 "\n"
    "###
    );

@ -77,8 +73,7 @@ fn parse() {
        to_document("\n* section").syntax,
        @r###"
    DOCUMENT@0..10
-      BLANK_LINE@0..1
-        NEW_LINE@0..1 "\n"
+      BLANK_LINE@0..1 "\n"
      HEADLINE@1..10
        HEADLINE_STARS@1..2 "*"
        WHITESPACE@2..3 " "
@ -91,8 +86,7 @@ fn parse() {
        to_document("\n** heading 2\n* heading 1").syntax,
        @r###"
    DOCUMENT@0..25
-      BLANK_LINE@0..1
-        NEW_LINE@0..1 "\n"
+      BLANK_LINE@0..1 "\n"
      HEADLINE@1..14
        HEADLINE_STARS@1..3 "**"
        WHITESPACE@3..4 " "
--- a/src/syntax/drawer.rs
+++ b/src/syntax/drawer.rs
@ -179,8 +179,7 @@ fn parse() {
        TEXT@1..7 "DRAWER"
        COLON@7..8 ":"
        NEW_LINE@8..9 "\n"
-      BLANK_LINE@9..10
-        NEW_LINE@9..10 "\n"
+      BLANK_LINE@9..10 "\n"
      TEXT@10..10 ""
      DRAWER_END@10..18
        WHITESPACE@10..12 "  "
@ -188,8 +187,7 @@ fn parse() {
        TEXT@13..16 "END"
        COLON@16..17 ":"
        NEW_LINE@17..18 "\n"
-      BLANK_LINE@18..19
-        NEW_LINE@18..19 "\n"
+      BLANK_LINE@18..19 "\n"
    "###
    );

--- a/src/syntax/dyn_block.rs
+++ b/src/syntax/dyn_block.rs
@ -99,14 +99,12 @@ CONTENTS
        TEXT@9..19 "clocktable"
        TEXT@19..31 " :scope file"
        NEW_LINE@31..32 "\n"
-      BLANK_LINE@32..33
-        NEW_LINE@32..33 "\n"
+      BLANK_LINE@32..33 "\n"
      TEXT@33..42 "CONTENTS\n"
      DYN_BLOCK_END@42..49
        TEXT@42..48 "#+END:"
        NEW_LINE@48..49 "\n"
-      BLANK_LINE@49..53
-        WHITESPACE@49..53 "    "
+      BLANK_LINE@49..53 "    "
    "###
    );
 }
--- a/src/syntax/element.rs
+++ b/src/syntax/element.rs
@ -208,8 +208,7 @@ b"#),
    SECTION@0..4
      PARAGRAPH@0..3
        TEXT@0..2 "a\n"
-        BLANK_LINE@2..3
-          NEW_LINE@2..3 "\n"
+        BLANK_LINE@2..3 "\n"
      PARAGRAPH@3..4
        TEXT@3..4 "b"
    "###
--- a/src/syntax/fixed_width.rs
+++ b/src/syntax/fixed_width.rs
@ -55,10 +55,8 @@ fn parse() {
        @r###"
    FIXED_WIDTH@0..19
      TEXT@0..14 ": A\n:\n: B\n: C\n"
-      BLANK_LINE@14..15
-        NEW_LINE@14..15 "\n"
-      BLANK_LINE@15..19
-        WHITESPACE@15..19 "    "
+      BLANK_LINE@14..15 "\n"
+      BLANK_LINE@15..19 "    "
    "###
    );
 }
--- a/src/syntax/keyword.rs
+++ b/src/syntax/keyword.rs
@ -84,7 +84,6 @@ pub fn affiliated_keyword_nodes(input: Input) -> IResult<Input, Vec<GreenElement
        // find the first text token in children
        let Some(NodeOrToken::Token(token)) = node
            .children()
-            .into_iter()
            .find(|t| t.kind() == SyntaxKind::TEXT.into())
        else {
            return Err(nom::Err::Error(()));
--- a/src/syntax/list.rs
+++ b/src/syntax/list.rs
@ -328,8 +328,7 @@ fn parse() {
        LIST_ITEM_CONTENT@7..11
          PARAGRAPH@7..10
            TEXT@7..10 "1\n\n"
-          BLANK_LINE@10..11
-            NEW_LINE@10..11 "\n"
+          BLANK_LINE@10..11 "\n"
      LIST_ITEM@11..14
        LIST_ITEM_INDENT@11..11 ""
        LIST_ITEM_BULLET@11..13 "+ "
--- a/src/syntax/mod.rs
+++ b/src/syntax/mod.rs
@ -99,10 +99,10 @@ pub enum SyntaxKind {
    DOUBLE_ARROW, // '=>'
    PIPE,         // '|'
    COMMA,        // ','
-    TEXT,
+    NEW_LINE,     // '\n' or '\r\n' or '\r'
+    WHITESPACE,   // ' ' or '\t'
    BLANK_LINE,
-    WHITESPACE,
-    NEW_LINE,
+    TEXT,

    DOCUMENT,
    SECTION,
--- a/src/syntax/paragraph.rs
+++ b/src/syntax/paragraph.rs
@ -65,8 +65,7 @@ fn parse() {
        @r###"
    PARAGRAPH@0..6
      TEXT@0..2 "a\n"
-      BLANK_LINE@2..6
-        WHITESPACE@2..6 "    "
+      BLANK_LINE@2..6 "    "
    "###
    );

@ -89,8 +88,7 @@ c
        @r###"
    PARAGRAPH@0..3
      TEXT@0..2 "a\n"
-      BLANK_LINE@2..3
-        NEW_LINE@2..3 "\n"
+      BLANK_LINE@2..3 "\n"
    "###
    );
 }
--- a/src/syntax/rule.rs
+++ b/src/syntax/rule.rs
@ -62,10 +62,8 @@ fn parse() {
    RULE@0..8
      TEXT@0..5 "-----"
      NEW_LINE@5..6 "\n"
-      BLANK_LINE@6..7
-        NEW_LINE@6..7 "\n"
-      BLANK_LINE@7..8
-        NEW_LINE@7..8 "\n"
+      BLANK_LINE@6..7 "\n"
+      BLANK_LINE@7..8 "\n"
    "###
    );

--- a/src/syntax/table.rs
+++ b/src/syntax/table.rs
@ -195,10 +195,8 @@ fn parse_table_el() {
        @r###"
    TABLE_EL@0..37
      TEXT@0..32 "  +---+\n      |   |\n  ..."
-      BLANK_LINE@32..33
-        NEW_LINE@32..33 "\n"
-      BLANK_LINE@33..37
-        WHITESPACE@33..37 "    "
+      BLANK_LINE@32..33 "\n"
+      BLANK_LINE@33..37 "    "
    "###
    );