From a269f2f258da5e38c671167310b91cb1f3efc7ae Mon Sep 17 00:00:00 2001 From: PoiScript Date: Tue, 14 Nov 2023 11:46:26 +0800 Subject: [PATCH] fix: single \r will be consider as blank line --- Cargo.toml | 4 - benches/parse.rs | 2 +- fuzz/fuzz_targets/fuzz_target_1.rs | 10 +- src/ast/mod.rs | 2 +- src/syntax/block.rs | 12 +- src/syntax/clock.rs | 3 +- src/syntax/combinator.rs | 58 +++++---- src/syntax/comment.rs | 3 +- src/syntax/document.rs | 16 +-- src/syntax/drawer.rs | 6 +- src/syntax/dyn_block.rs | 6 +- src/syntax/element.rs | 3 +- src/syntax/fixed_width.rs | 6 +- src/syntax/keyword.rs | 1 - src/syntax/list.rs | 3 +- src/syntax/mod.rs | 6 +- src/syntax/paragraph.rs | 6 +- src/syntax/rule.rs | 6 +- src/syntax/table.rs | 6 +- tests/issue_10.rs | 18 --- tests/issue_11.rs | 21 --- tests/issue_22.rs | 8 -- tests/parse.rs | 200 +++++------------------------ 23 files changed, 101 insertions(+), 305 deletions(-) delete mode 100644 tests/issue_10.rs delete mode 100644 tests/issue_11.rs delete mode 100644 tests/issue_22.rs diff --git a/Cargo.toml b/Cargo.toml index e41374c..595db37 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,9 +13,6 @@ exclude = ["/wasm", "/.github"] [package.metadata.docs.rs] all-features = true -[badges] -travis-ci = { repository = "PoiScript/orgize" } - [features] default = [] indexmap = ["dep:indexmap"] @@ -36,7 +33,6 @@ tracing = "0.1" [dev-dependencies] criterion = "0.4" -pretty_assertions = "1.3" insta = "1.29" serde_json = "1.0" slugify = "0.1" diff --git a/benches/parse.rs b/benches/parse.rs index ff9f089..e6f0a55 100644 --- a/benches/parse.rs +++ b/benches/parse.rs @@ -3,9 +3,9 @@ use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; use orgize::Org; const INPUT: &[(&str, &str)] = &[ - // ("org-syntax.org", include_str!("./org-syntax.org")), ("doc.org", include_str!("./doc.org")), ("org-faq.org", include_str!("./org-faq.org")), + ("org-syntax.org", include_str!("./org-syntax.org")), ]; pub fn bench_parse(c: &mut Criterion) { diff --git a/fuzz/fuzz_targets/fuzz_target_1.rs b/fuzz/fuzz_targets/fuzz_target_1.rs index 5e13431..4511ba0 100644 --- a/fuzz/fuzz_targets/fuzz_target_1.rs +++ b/fuzz/fuzz_targets/fuzz_target_1.rs @@ -1,11 +1,7 @@ #![no_main] -use libfuzzer_sys::fuzz_target; -use orgize::syntax::{HtmlHandler, Org}; -use std::str; - -fuzz_target!(|data: &[u8]| { - if let Ok(utf8) = str::from_utf8(data) { - let _ = Org::parse(utf8); +libfuzzer_sys::fuzz_target!(|data: &[u8]| { + if let Ok(utf8) = std::str::from_utf8(data) { + let _ = orgize::Org::parse(utf8); } }); diff --git a/src/ast/mod.rs b/src/ast/mod.rs index 4587a66..9f1f9a0 100644 --- a/src/ast/mod.rs +++ b/src/ast/mod.rs @@ -19,7 +19,7 @@ use rowan::{ast::AstNode, Language, NodeOrToken}; pub fn blank_lines(parent: &SyntaxNode) -> usize { parent - .children() + .children_with_tokens() .filter(|n| n.kind() == SyntaxKind::BLANK_LINE) .count() } diff --git a/src/syntax/block.rs b/src/syntax/block.rs index b6d764f..b47a98b 100644 --- a/src/syntax/block.rs +++ b/src/syntax/block.rs @@ -168,10 +168,8 @@ r#"#+BEGIN_SRC TEXT@8..11 "SRC" TEXT@11..11 "" NEW_LINE@11..12 "\n" - BLANK_LINE@12..13 - NEW_LINE@12..13 "\n" - BLANK_LINE@13..14 - NEW_LINE@13..14 "\n" + BLANK_LINE@12..13 "\n" + BLANK_LINE@13..14 "\n" BLOCK_CONTENT@14..14 BLOCK_END@14..27 WHITESPACE@14..18 " " @@ -222,10 +220,8 @@ alert('Hello World!'); TEXT@54..60 "#+END_" TEXT@60..63 "SRC" NEW_LINE@63..64 "\n" - BLANK_LINE@64..65 - NEW_LINE@64..65 "\n" - BLANK_LINE@65..69 - WHITESPACE@65..69 " " + BLANK_LINE@64..65 "\n" + BLANK_LINE@65..69 " " "### ); diff --git a/src/syntax/clock.rs b/src/syntax/clock.rs index 7505a35..cf3f01b 100644 --- a/src/syntax/clock.rs +++ b/src/syntax/clock.rs @@ -130,8 +130,7 @@ fn parse() { COLON@59..60 ":" TEXT@60..62 "00" NEW_LINE@62..63 "\n" - BLANK_LINE@63..64 - NEW_LINE@63..64 "\n" + BLANK_LINE@63..64 "\n" "### ); } diff --git a/src/syntax/combinator.rs b/src/syntax/combinator.rs index 282052c..a86c33f 100644 --- a/src/syntax/combinator.rs +++ b/src/syntax/combinator.rs @@ -1,13 +1,9 @@ use std::iter::once; -use memchr::{memchr, memchr_iter}; +use memchr::{memchr, memchr2_iter, memchr_iter}; use nom::{ - branch::alt, - bytes::complete::tag, - character::complete::{line_ending, space0}, - combinator::eof, - sequence::tuple, - AsBytes, IResult, InputLength, InputTake, Parser, + bytes::complete::tag, character::complete::space0, AsBytes, IResult, InputLength, InputTake, + Parser, }; use rowan::{GreenNode, GreenToken, Language, NodeOrToken}; @@ -97,23 +93,30 @@ where /// Takes all blank lines pub fn blank_lines(input: Input) -> IResult, ()> { - let mut lines = vec![]; - let mut i = input; + if input.is_empty() { + return Ok((input, vec![])); + } - while !i.is_empty() { - match tuple::<_, _, (), _>((space0, alt((line_ending, eof))))(i) { - Ok((input, (ws, nl))) => { - let mut b = NodeBuilder::new(); - b.ws(ws); - b.nl(nl); - lines.push(b.finish(BLANK_LINE)); - i = input; - } - _ => break, + let mut lines = vec![]; + let mut start = 0; + let bytes = input.as_bytes(); + + for index in memchr2_iter(b'\r', b'\n', bytes) + .map(|i| i + 1) + .chain(once(bytes.len())) + { + if bytes.get(index - 1) == Some(&b'\r') && bytes.get(index) == Some(&b'\n') { + continue; + } + if start != index && bytes[start..index].iter().all(|b| b.is_ascii_whitespace()) { + lines.push(token(BLANK_LINE, &input.as_str()[start..index])); + start = index; + } else { + break; } } - Ok((i, lines)) + Ok((input.take_split(start).0, lines)) } #[test] @@ -124,6 +127,11 @@ fn test_blank_lines() { assert_eq!(input.as_str(), ""); assert_eq!(output, vec![]); + let (input, output) = blank_lines(("\n", config).into()).unwrap(); + assert_eq!(input.as_str(), ""); + assert_eq!(output.len(), 1); + assert_eq!(output[0].to_string(), "\n"); + let (input, output) = blank_lines((" t", config).into()).unwrap(); assert_eq!(input.as_str(), " t"); assert_eq!(output, vec![]); @@ -138,13 +146,15 @@ fn test_blank_lines() { assert_eq!(output[4].to_string(), " "); let (input, output) = - blank_lines((" \r\n\n\t\t\r\n \n t\n \r\n\n\t\t\r\n \n", config).into()).unwrap(); - assert_eq!(input.as_str(), " t\n \r\n\n\t\t\r\n \n"); - assert_eq!(output.len(), 4); - assert_eq!(output[0].to_string(), " \r\n"); + blank_lines(("\r\n\n\t\t\r\n \n\r \r t\n ", config).into()).unwrap(); + assert_eq!(input.as_str(), " t\n "); + assert_eq!(output.len(), 6); + assert_eq!(output[0].to_string(), "\r\n"); assert_eq!(output[1].to_string(), "\n"); assert_eq!(output[2].to_string(), "\t\t\r\n"); assert_eq!(output[3].to_string(), " \n"); + assert_eq!(output[4].to_string(), "\r"); + assert_eq!(output[5].to_string(), " \r"); } /// Returns 1. anything before trailing whitespace, 2. whitespace itself, 3. line feeding diff --git a/src/syntax/comment.rs b/src/syntax/comment.rs index 45d5303..bf67a10 100644 --- a/src/syntax/comment.rs +++ b/src/syntax/comment.rs @@ -70,8 +70,7 @@ fn parse() { @r###" COMMENT@0..12 TEXT@0..11 "#\n # a\n #\n" - BLANK_LINE@11..12 - NEW_LINE@11..12 "\n" + BLANK_LINE@11..12 "\n" "### ); diff --git a/src/syntax/document.rs b/src/syntax/document.rs index d3e8ff3..a317592 100644 --- a/src/syntax/document.rs +++ b/src/syntax/document.rs @@ -53,13 +53,9 @@ fn parse() { to_document("\n \n\n").syntax, @r###" DOCUMENT@0..5 - BLANK_LINE@0..1 - NEW_LINE@0..1 "\n" - BLANK_LINE@1..4 - WHITESPACE@1..3 " " - NEW_LINE@3..4 "\n" - BLANK_LINE@4..5 - NEW_LINE@4..5 "\n" + BLANK_LINE@0..1 "\n" + BLANK_LINE@1..4 " \n" + BLANK_LINE@4..5 "\n" "### ); @@ -77,8 +73,7 @@ fn parse() { to_document("\n* section").syntax, @r###" DOCUMENT@0..10 - BLANK_LINE@0..1 - NEW_LINE@0..1 "\n" + BLANK_LINE@0..1 "\n" HEADLINE@1..10 HEADLINE_STARS@1..2 "*" WHITESPACE@2..3 " " @@ -91,8 +86,7 @@ fn parse() { to_document("\n** heading 2\n* heading 1").syntax, @r###" DOCUMENT@0..25 - BLANK_LINE@0..1 - NEW_LINE@0..1 "\n" + BLANK_LINE@0..1 "\n" HEADLINE@1..14 HEADLINE_STARS@1..3 "**" WHITESPACE@3..4 " " diff --git a/src/syntax/drawer.rs b/src/syntax/drawer.rs index 1c91d64..92833c1 100644 --- a/src/syntax/drawer.rs +++ b/src/syntax/drawer.rs @@ -179,8 +179,7 @@ fn parse() { TEXT@1..7 "DRAWER" COLON@7..8 ":" NEW_LINE@8..9 "\n" - BLANK_LINE@9..10 - NEW_LINE@9..10 "\n" + BLANK_LINE@9..10 "\n" TEXT@10..10 "" DRAWER_END@10..18 WHITESPACE@10..12 " " @@ -188,8 +187,7 @@ fn parse() { TEXT@13..16 "END" COLON@16..17 ":" NEW_LINE@17..18 "\n" - BLANK_LINE@18..19 - NEW_LINE@18..19 "\n" + BLANK_LINE@18..19 "\n" "### ); diff --git a/src/syntax/dyn_block.rs b/src/syntax/dyn_block.rs index 81ed103..ab18df1 100644 --- a/src/syntax/dyn_block.rs +++ b/src/syntax/dyn_block.rs @@ -99,14 +99,12 @@ CONTENTS TEXT@9..19 "clocktable" TEXT@19..31 " :scope file" NEW_LINE@31..32 "\n" - BLANK_LINE@32..33 - NEW_LINE@32..33 "\n" + BLANK_LINE@32..33 "\n" TEXT@33..42 "CONTENTS\n" DYN_BLOCK_END@42..49 TEXT@42..48 "#+END:" NEW_LINE@48..49 "\n" - BLANK_LINE@49..53 - WHITESPACE@49..53 " " + BLANK_LINE@49..53 " " "### ); } diff --git a/src/syntax/element.rs b/src/syntax/element.rs index 244c5e4..95791ae 100644 --- a/src/syntax/element.rs +++ b/src/syntax/element.rs @@ -208,8 +208,7 @@ b"#), SECTION@0..4 PARAGRAPH@0..3 TEXT@0..2 "a\n" - BLANK_LINE@2..3 - NEW_LINE@2..3 "\n" + BLANK_LINE@2..3 "\n" PARAGRAPH@3..4 TEXT@3..4 "b" "### diff --git a/src/syntax/fixed_width.rs b/src/syntax/fixed_width.rs index 5fb8690..44f26d5 100644 --- a/src/syntax/fixed_width.rs +++ b/src/syntax/fixed_width.rs @@ -55,10 +55,8 @@ fn parse() { @r###" FIXED_WIDTH@0..19 TEXT@0..14 ": A\n:\n: B\n: C\n" - BLANK_LINE@14..15 - NEW_LINE@14..15 "\n" - BLANK_LINE@15..19 - WHITESPACE@15..19 " " + BLANK_LINE@14..15 "\n" + BLANK_LINE@15..19 " " "### ); } diff --git a/src/syntax/keyword.rs b/src/syntax/keyword.rs index 9da9aed..9757c6c 100644 --- a/src/syntax/keyword.rs +++ b/src/syntax/keyword.rs @@ -84,7 +84,6 @@ pub fn affiliated_keyword_nodes(input: Input) -> IResult' PIPE, // '|' COMMA, // ',' - TEXT, + NEW_LINE, // '\n' or '\r\n' or '\r' + WHITESPACE, // ' ' or '\t' BLANK_LINE, - WHITESPACE, - NEW_LINE, + TEXT, DOCUMENT, SECTION, diff --git a/src/syntax/paragraph.rs b/src/syntax/paragraph.rs index 1faf528..d723ae3 100644 --- a/src/syntax/paragraph.rs +++ b/src/syntax/paragraph.rs @@ -65,8 +65,7 @@ fn parse() { @r###" PARAGRAPH@0..6 TEXT@0..2 "a\n" - BLANK_LINE@2..6 - WHITESPACE@2..6 " " + BLANK_LINE@2..6 " " "### ); @@ -89,8 +88,7 @@ c @r###" PARAGRAPH@0..3 TEXT@0..2 "a\n" - BLANK_LINE@2..3 - NEW_LINE@2..3 "\n" + BLANK_LINE@2..3 "\n" "### ); } diff --git a/src/syntax/rule.rs b/src/syntax/rule.rs index df28294..2a54ad1 100644 --- a/src/syntax/rule.rs +++ b/src/syntax/rule.rs @@ -62,10 +62,8 @@ fn parse() { RULE@0..8 TEXT@0..5 "-----" NEW_LINE@5..6 "\n" - BLANK_LINE@6..7 - NEW_LINE@6..7 "\n" - BLANK_LINE@7..8 - NEW_LINE@7..8 "\n" + BLANK_LINE@6..7 "\n" + BLANK_LINE@7..8 "\n" "### ); diff --git a/src/syntax/table.rs b/src/syntax/table.rs index 4f53bfe..9097349 100644 --- a/src/syntax/table.rs +++ b/src/syntax/table.rs @@ -195,10 +195,8 @@ fn parse_table_el() { @r###" TABLE_EL@0..37 TEXT@0..32 " +---+\n | |\n ..." - BLANK_LINE@32..33 - NEW_LINE@32..33 "\n" - BLANK_LINE@33..37 - WHITESPACE@33..37 " " + BLANK_LINE@32..33 "\n" + BLANK_LINE@33..37 " " "### ); diff --git a/tests/issue_10.rs b/tests/issue_10.rs deleted file mode 100644 index f0570f8..0000000 --- a/tests/issue_10.rs +++ /dev/null @@ -1,18 +0,0 @@ -use orgize::Org; - -#[test] -fn can_handle_empty_emphasis() { - let cases = &[ - "* / // a", - "\"* / // a\"", - "* * ** a", - "* 2020\n** December\n*** Experiment\nType A is marked with * and type B is marked with **.\n", - "* 2020\n:DRAWER:\n* ** a\n:END:", - "* * ** :a:", - "* * ** " - ]; - - for case in cases { - let _ = Org::parse(case); - } -} diff --git a/tests/issue_11.rs b/tests/issue_11.rs deleted file mode 100644 index d22e72b..0000000 --- a/tests/issue_11.rs +++ /dev/null @@ -1,21 +0,0 @@ -use orgize::Org; - -#[test] -fn can_handle_empty_list_item() { - let cases = &[ - "0. ", - "* \n0. ", - " * ", - " 0. ", - "\t* ", - "- ", - "- hello\n- ", - "- \n- hello", - "- hello\n- \n- world", - "* world\n- ", - ]; - - for case in cases { - let _ = Org::parse(case); - } -} diff --git a/tests/issue_22.rs b/tests/issue_22.rs deleted file mode 100644 index d0fff8b..0000000 --- a/tests/issue_22.rs +++ /dev/null @@ -1,8 +0,0 @@ -use orgize::{ast::Paragraph, rowan::ast::AstNode, Org}; - -#[test] -fn whitespaces() { - let case = "\u{000b}\u{0085}\u{00a0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200a}\u{2028}\u{2029}\u{202f}\u{205f}\u{3000}"; - let paragraph = Org::parse(case).first_node::().unwrap(); - assert_eq!(¶graph.syntax().to_string(), case); -} diff --git a/tests/parse.rs b/tests/parse.rs index e7eda2a..6f78ae8 100644 --- a/tests/parse.rs +++ b/tests/parse.rs @@ -1,168 +1,36 @@ -use orgize::Org; -use pretty_assertions::assert_eq; +const INPUT: &[&str] = &[ + // issue 10 + "* / // a", + "\"* / // a\"", + "* * ** a", + "* 2020\n** December\n*** Experiment\nType A is marked with * and type B is marked with **.\n", + "* 2020\n:DRAWER:\n* ** a\n:END:", + "* * ** :a:", + "* * ** ", + // issue 11 + "0. ", + "* \n0. ", + " * ", + " 0. ", + "\t* ", + "- ", + "- hello\n- ", + "- \n- hello", + "- hello\n- \n- world", + "* world\n- ", + // issue 22 + "\u{000b}\u{0085}\u{00a0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200a}\u{2028}\u{2029}\u{202f}\u{205f}\u{3000}", + // fuzz test + "___\n", + "\n\n\n", + "\n\n\n", + "\n*", + "\r-" +]; -macro_rules! test_suite { - ($name:ident, $content:expr, $expected:expr) => { - #[test] - fn $name() { - assert_eq!(Org::parse($content).to_html(), $expected); - } - }; +#[test] +fn parse() { + for input in INPUT { + let _ = orgize::Org::parse(input); + } } - -test_suite!( - emphasis, - "*bold*, /italic/,\n_underlined_, =verbatim= and ~code~", - "

bold, italic,\nunderlined, \ - verbatim and code

" -); - -test_suite!( - link, - "Visit[[http://example.com][link1]]or[[http://example.com][link1]].", - r#"

Visitlink1orlink1.

"# -); - -test_suite!( - section_and_headline, - r#" -* title 1 -section 1 -** title 2 -section 2 -* title 3 -section 3 -* title 4 -section 4 -"#, - "

title 1

section 1

\ -

title 2

section 2

\ -

title 3

section 3

\ -

title 4

section 4

" -); - -test_suite!( - list, - r#" -+ 1 - -+ 2 - - - 3 - - - 4 - -+ 5 -"#, - "
    \ -
  • 1

  • \ -
  • 2

    • 3

    • 4

  • \ -
  • 5

  • \ -
" -); - -test_suite!( - snippet, - "@@html:@@delete this@@html:@@", - "

delete this

" -); - -test_suite!( - paragraphs, - r#" -* title - -paragraph 1 - -paragraph 2 - -paragraph 3 - -paragraph 4 -"#, - "

title

\ -

paragraph 1

paragraph 2

\ -

paragraph 3

paragraph 4

\ -
" -); - -test_suite!( - table1, - r#" -|-----+-----+-----| -| 0 | 1 | 2 | -|-----+-----+-----| -| 4 | 5 | 6 | -"#, - "
\ - \ - \ -
012
456
" -); - -test_suite!( - table2, - r#" -|-----+-----+-----| -| 0 | 1 | 2 | -| 4 | 5 | 6 | -|-----+-----+-----| -"#, - "
\ - \ - \ -
012
456
" -); - -test_suite!( - table3, - r#" -|-----+-----+-----| -|-----+-----+-----| -| 0 | 1 | 2 | -| 4 | 5 | 6 | -"#, - "
\ - \ - \ -
012
456
" -); - -test_suite!( - table4, - r#" -| 0 | 1 | 2 | -| 4 | 5 | 6 | -|-----+-----+-----| -|-----+-----+-----| -"#, - "
\ - \ - \ -
012
456
" -); - -test_suite!( - table5, - r#" -|-----+-----+-----| -|-----+-----+-----| -"#, - "
" -); - -test_suite!( - table6, - r#" -| -|- -| -|- -| -"#, - "
\ - \ - \ - \ -
" -);