From b7ddc0f0763f83c7d996ea69922be190ffe2fe87 Mon Sep 17 00:00:00 2001 From: PoiScript Date: Thu, 16 Nov 2023 18:50:33 +0800 Subject: [PATCH] feat: update list node parsing --- src/syntax/document.rs | 28 +++-- src/syntax/drawer.rs | 1 + src/syntax/element.rs | 162 +++++++++++++++++++-------- src/syntax/headline.rs | 40 +++++-- src/syntax/keyword.rs | 6 + src/syntax/list.rs | 238 +++++++++++++++++++++------------------- src/syntax/object.rs | 152 ++++++++++++++++--------- src/syntax/paragraph.rs | 21 +++- src/syntax/planning.rs | 1 + tests/html.rs | 4 - 10 files changed, 411 insertions(+), 242 deletions(-) diff --git a/src/syntax/document.rs b/src/syntax/document.rs index fb687cf..4c07d57 100644 --- a/src/syntax/document.rs +++ b/src/syntax/document.rs @@ -1,7 +1,4 @@ -use nom::{ - combinator::{iterator, opt}, - IResult, -}; +use nom::{combinator::opt, IResult, InputLength}; use super::{ combinator::{blank_lines, node, GreenElement}, @@ -22,18 +19,29 @@ fn document_node_base(input: Input) -> IResult { children.extend(pre_blank); + if input.is_empty() { + return Ok((input, node(DOCUMENT, children))); + } + let (input, section) = opt(section_node)(input)?; if let Some(section) = section { children.push(section); } - let mut it = iterator(input, headline_node); - children.extend(&mut it); - let (input, _) = it.finish()?; + let mut i = input; + while !i.is_empty() { + let (input, headline) = headline_node(i)?; + debug_assert!( + i.input_len() > input.input_len(), + "{} > {}", + i.input_len(), + input.input_len(), + ); + i = input; + children.push(headline); + } - debug_assert!(input.is_empty()); - - Ok((input, node(DOCUMENT, children))) + Ok((i, node(DOCUMENT, children))) } #[test] diff --git a/src/syntax/drawer.rs b/src/syntax/drawer.rs index 77967da..55216c9 100644 --- a/src/syntax/drawer.rs +++ b/src/syntax/drawer.rs @@ -128,6 +128,7 @@ fn node_property_node(input: Input) -> IResult { #[tracing::instrument(level = "debug", skip(input), fields(input = input.s))] pub fn property_drawer_node(input: Input) -> IResult { + debug_assert!(!input.is_empty()); crate::lossless_parser!(property_drawer_node_base, input) } diff --git a/src/syntax/element.rs b/src/syntax/element.rs index 736b51e..85d352f 100644 --- a/src/syntax/element.rs +++ b/src/syntax/element.rs @@ -1,4 +1,7 @@ -use nom::IResult; +use std::iter::once; + +use memchr::memchr2_iter; +use nom::{AsBytes, IResult, InputLength, InputTake}; use super::{ block::block_node, @@ -12,27 +15,46 @@ use super::{ input::Input, keyword::{affiliated_keyword_nodes, keyword_node}, list::list_node, - paragraph::paragraph_node, + paragraph::{paragraph_node, paragraph_nodes}, rule::rule_node, table::{org_table_node, table_el_node}, }; -/// Parses input into multiple element +/// Recognizes multiple org-mode elements /// /// input must not contains blank line in the beginning #[tracing::instrument(level = "debug", skip(input), fields(input = input.s))] pub fn element_nodes(input: Input) -> Result, nom::Err<()>> { debug_assert!(!input.is_empty()); + // TODO: + // debug_assert!( + // blank_lines(input).unwrap().1.is_empty(), + // "input must not starts with blank lines: {:?}", + // input.s + // ); let mut i = input; let mut nodes = vec![]; - while !i.is_empty() { - let result = element_node(i); - debug_assert!(result.is_ok(), "element_node() always returns Ok()"); - let (input, node) = result?; - i = input; - nodes.push(node); + 'l: while !i.is_empty() { + for (input, head) in ElementPositions::new(i) { + if let Ok((input, element)) = element_node(input) { + if !head.is_empty() { + nodes.extend(paragraph_nodes(head)?); + } + nodes.push(element); + debug_assert!( + input.input_len() < i.input_len(), + "{} < {}", + input.input_len(), + i.input_len() + ); + i = input; + continue 'l; + } + } + nodes.extend(paragraph_nodes(i)?); + break; } debug_assert_eq!( @@ -44,6 +66,7 @@ pub fn element_nodes(input: Input) -> Result, nom::Err<()>> { Ok(nodes) } +/// Recognizes an org-mode element expect paragraph #[tracing::instrument(level = "debug", skip(input), fields(input = input.s))] pub fn element_node(input: Input) -> IResult { // skip affiliated keyword first @@ -52,11 +75,7 @@ pub fn element_node(input: Input) -> IResult { let has_affiliated_keyword = !nodes.is_empty(); // find first non-whitespace character - let byte = i - .as_str() - .trim_start_matches(|c| c == ' ' || c == '\t') - .bytes() - .next(); + let byte = i.bytes().find(|&b| b != b' ' && b != b'\t'); debug_assert!( !(has_affiliated_keyword && matches!(byte, None | Some(b'\n') | Some(b'\r'))), @@ -80,7 +99,78 @@ pub fn element_node(input: Input) -> IResult { _ => Err(nom::Err::Error(())), }; - result.or_else(|_| paragraph_node(input)) + if has_affiliated_keyword { + result.or_else(|_| paragraph_node(input)) + } else { + result + } +} + +struct ElementPositions<'a> { + input: Input<'a>, + pos: usize, +} + +impl<'a> ElementPositions<'a> { + fn new(input: Input<'a>) -> Self { + ElementPositions { input, pos: 0 } + } +} + +impl<'a> Iterator for ElementPositions<'a> { + type Item = (Input<'a>, Input<'a>); + + fn next(&mut self) -> Option { + if self.pos >= self.input.s.len() { + return None; + } + + let bytes = &self.input.as_bytes()[self.pos..]; + + let mut iter = once(0).chain(memchr2_iter(b'\r', b'\n', bytes).map(|i| i + 1)); + + while let Some(i) = iter.next() { + let b = *bytes[i..].iter().find(|&&b| b != b' ' && b != b'\t')?; + + if matches!( + b, + b'[' | b'0'..=b'9' | b'*' | b'C' | b'-' | b':' | b'|' | b'+' | b'#' + ) { + let previous = self.pos; + self.pos = iter + .next() + .map(|i| i + self.pos) + .unwrap_or_else(|| self.input.s.len()); + + debug_assert!( + previous < self.pos && self.pos <= self.input.s.len(), + "{} < {} < {}", + previous, + self.pos, + self.input.s.len() + ); + + let (input, head) = self.input.take_split(i + previous); + + return Some((input, head)); + } + } + + None + } +} + +#[test] +fn positions() { + let config = crate::ParseConfig::default(); + let s = "+\n\n C\n \r\n-\n\t\t[\n: \r\n"; + let vec = ElementPositions::new((s, &config).into()).collect::>(); + assert_eq!(vec.len(), 5); + assert_eq!(vec[0].0.s, "+\n\n C\n \r\n-\n\t\t[\n: \r\n"); + assert_eq!(vec[1].0.s, " C\n \r\n-\n\t\t[\n: \r\n"); + assert_eq!(vec[2].0.s, "-\n\t\t[\n: \r\n"); + assert_eq!(vec[3].0.s, "\t\t[\n: \r\n"); + assert_eq!(vec[4].0.s, ": \r\n"); } #[test] @@ -94,6 +184,7 @@ fn parse() { SyntaxNode::new_root(node(SyntaxKind::SECTION, children).into_node().unwrap()) }; + // paragraph stops at blank lines insta::assert_debug_snapshot!( t(r#"a @@ -108,39 +199,18 @@ b"#), "### ); + // paragraph followed by special element insta::assert_debug_snapshot!( - t("#+ATTR_HTML: :width 300px\n[[./img/a.jpg]]"), + t("Table:\n|cell"), @r###" - SECTION@0..41 - PARAGRAPH@0..41 - AFFILIATED_KEYWORD@0..26 - HASH_PLUS@0..2 "#+" - TEXT@2..11 "ATTR_HTML" - COLON@11..12 ":" - TEXT@12..25 " :width 300px" - NEW_LINE@25..26 "\n" - LINK@26..41 - L_BRACKET2@26..28 "[[" - LINK_PATH@28..39 "./img/a.jpg" - R_BRACKET2@39..41 "]]" - "### - ); - - insta::assert_debug_snapshot!( - t("#+ATTR_HTML: :width 300px\n[[./img/a.jpg]]"), - @r###" - SECTION@0..41 - PARAGRAPH@0..41 - AFFILIATED_KEYWORD@0..26 - HASH_PLUS@0..2 "#+" - TEXT@2..11 "ATTR_HTML" - COLON@11..12 ":" - TEXT@12..25 " :width 300px" - NEW_LINE@25..26 "\n" - LINK@26..41 - L_BRACKET2@26..28 "[[" - LINK_PATH@28..39 "./img/a.jpg" - R_BRACKET2@39..41 "]]" + SECTION@0..12 + PARAGRAPH@0..7 + TEXT@0..7 "Table:\n" + ORG_TABLE@7..12 + ORG_TABLE_STANDARD_ROW@7..12 + PIPE@7..8 "|" + ORG_TABLE_CELL@8..12 + TEXT@8..12 "cell" "### ); } diff --git a/src/syntax/headline.rs b/src/syntax/headline.rs index 7011b17..15b6424 100644 --- a/src/syntax/headline.rs +++ b/src/syntax/headline.rs @@ -22,6 +22,7 @@ use super::{ #[tracing::instrument(level = "debug", skip(input), fields(input = input.s))] pub fn headline_node(input: Input) -> IResult { + debug_assert!(!input.is_empty()); crate::lossless_parser!(headline_node_base, input) } @@ -59,16 +60,24 @@ fn headline_node_base(input: Input) -> IResult { b.ws(ws_); b.nl(nl); - if nl.is_empty() { + if input.is_empty() { return Ok((input, b.finish(HEADLINE))); } let (input, planning) = opt(planning_node)(input)?; b.push_opt(planning); + if input.is_empty() { + return Ok((input, b.finish(HEADLINE))); + } + let (input, property_drawer) = opt(property_drawer_node)(input)?; b.push_opt(property_drawer); + if input.is_empty() { + return Ok((input, b.finish(HEADLINE))); + } + let (input, section) = opt(section_node)(input)?; b.push_opt(section); @@ -83,6 +92,12 @@ fn headline_node_base(input: Input) -> IResult { let (input, headline) = headline_node(i)?; b.push(headline); + debug_assert!( + i.input_len() > input.input_len(), + "{} > {}", + i.input_len(), + input.input_len() + ); i = input; } @@ -91,15 +106,12 @@ fn headline_node_base(input: Input) -> IResult { #[tracing::instrument(level = "debug", skip(input), fields(input = input.s))] pub fn section_node(input: Input) -> IResult { + debug_assert!(!input.is_empty()); let (input, section) = section_text(input)?; Ok((input, node(SECTION, element_nodes(section)?))) } -pub fn section_text(input: Input) -> IResult { - if input.is_empty() { - return Err(nom::Err::Error(())); - } - +fn section_text(input: Input) -> IResult { for (input, section) in line_starts_iter(input.as_str()).map(|i| input.take_split(i)) { if headline_stars(input).is_ok() { if section.is_empty() { @@ -119,12 +131,13 @@ fn headline_stars(input: Input) -> IResult { let level = bytes.iter().take_while(|&&c| c == b'*').count(); if level == 0 { - Err(nom::Err::Error(())) - } else if input.input_len() == level - || bytes[level] == b'\n' - || bytes[level] == b'\r' - || bytes[level] == b' ' - { + return Err(nom::Err::Error(())); + } + // followed by eof, new line, or whitespace + else if matches!( + bytes.get(level), + None | Some(b'\n') | Some(b'\r') | Some(b' ') + ) { Ok(input.take_split(level)) } else { Err(nom::Err::Error(())) @@ -151,6 +164,7 @@ fn headline_tags_node(input: Input) -> IResult { if item.is_empty() { children.push(token(COLON, ":")); can_not_be_ws = false; + debug_assert!(i > ii, "{} > {}", i, ii); i = ii; } else if item .iter() @@ -159,11 +173,13 @@ fn headline_tags_node(input: Input) -> IResult { children.push(input.slice(ii + 1..i).text_token()); children.push(token(COLON, ":")); can_not_be_ws = false; + debug_assert!(i > ii, "{} > {}", i, ii); i = ii; } else if item.iter().all(|&c| c == b' ' || c == b'\t') && !can_not_be_ws { children.push(input.slice(ii + 1..i).ws_token()); children.push(token(COLON, ":")); can_not_be_ws = true; + debug_assert!(i > ii, "{} > {}", i, ii); i = ii; } else { break; diff --git a/src/syntax/keyword.rs b/src/syntax/keyword.rs index 4a6725d..be4d519 100644 --- a/src/syntax/keyword.rs +++ b/src/syntax/keyword.rs @@ -57,6 +57,12 @@ pub fn affiliated_keyword_nodes(input: Input) -> IResult input_.input_len(), + "{} > {}", + i.input_len(), + input_.input_len() + ); i = input_; children.push(GreenElement::Node(GreenNode::new( SyntaxKind::AFFILIATED_KEYWORD.into(), diff --git a/src/syntax/list.rs b/src/syntax/list.rs index 098d2f0..d4cbf5f 100644 --- a/src/syntax/list.rs +++ b/src/syntax/list.rs @@ -17,6 +17,7 @@ use super::{ input::Input, keyword::affiliated_keyword_nodes, object::object_nodes, + paragraph::paragraph_nodes, SyntaxKind::*, }; @@ -28,24 +29,35 @@ pub fn list_node(input: Input) -> IResult { fn list_node_base(input: Input) -> IResult { let (input, affiliated_keywords) = affiliated_keyword_nodes(input)?; let (input, first_indent) = space0(input)?; - let (input, first_item) = list_item_node(first_indent, input)?; + let (input, (ends_with_empty_blank_lines, first_item)) = list_item_node(first_indent, input)?; let mut children = vec![]; children.extend(affiliated_keywords); children.push(first_item); let mut input = input; - while !input.is_empty() { + while !ends_with_empty_blank_lines && !input.is_empty() { let (input_, indent) = space0(input)?; if indent.input_len() != first_indent.input_len() { break; } - if let Ok((input_, list_item)) = list_item_node(indent, input_) { - children.push(list_item); - input = input_; - } else { + let Ok((input_, (ends_with_empty_blank_lines, list_item))) = list_item_node(indent, input_) + else { + break; + }; + + children.push(list_item); + debug_assert!( + input.input_len() > input_.input_len(), + "{} > {}", + input.input_len(), + input_.input_len(), + ); + input = input_; + + if ends_with_empty_blank_lines { break; } } @@ -57,7 +69,11 @@ fn list_node_base(input: Input) -> IResult { Ok((input, node(LIST, children))) } -fn list_item_node<'a>(indent: Input<'a>, input: Input<'a>) -> IResult, GreenElement, ()> { +#[tracing::instrument(level = "debug", skip(input, indent), fields(input = input.s))] +fn list_item_node<'a>( + indent: Input<'a>, + input: Input<'a>, +) -> IResult, (bool, GreenElement), ()> { let (input, bullet) = recognize(tuple(( alt(( tag("+"), @@ -91,7 +107,9 @@ fn list_item_node<'a>(indent: Input<'a>, input: Input<'a>) -> IResult, let (input, counter) = opt(list_item_counter)(input)?; let (input, checkbox) = opt(list_item_checkbox)(input)?; let (input, tag) = cond(!is_ordered, opt(list_item_tag))(input)?; - let (input, content) = list_item_content_node(input, indent.input_len())?; + let (input, (ends_with_empty_blank_lines, content)) = + list_item_content_node(input, indent.input_len())?; + let (input, post_blank) = cond(!ends_with_empty_blank_lines, blank_lines)(input)?; let mut children = vec![ indent.token(LIST_ITEM_INDENT), @@ -109,10 +127,17 @@ fn list_item_node<'a>(indent: Input<'a>, input: Input<'a>) -> IResult, } children.push(content); + if let Some(post_blank) = post_blank { + children.extend(post_blank); + } - Ok((input, node(LIST_ITEM, children))) + Ok(( + input, + (ends_with_empty_blank_lines, node(LIST_ITEM, children)), + )) } +#[tracing::instrument(level = "debug", skip(input), fields(input = input.s))] fn list_item_counter(input: Input) -> IResult { let (input, node) = map( tuple((l_bracket_token, at_token, alphanumeric1, r_bracket_token)), @@ -129,6 +154,7 @@ fn list_item_counter(input: Input) -> IResult Ok((input, (node, ws))) } +#[tracing::instrument(level = "debug", skip(input), fields(input = input.s))] fn list_item_checkbox(input: Input) -> IResult { let (input, node) = map( tuple(( @@ -151,6 +177,7 @@ fn list_item_checkbox(input: Input) -> IResult Ok((input, (node, ws))) } +#[tracing::instrument(level = "debug", skip(input), fields(input = input.s))] fn list_item_tag(input: Input) -> IResult { let bytes = input.as_bytes(); @@ -167,19 +194,23 @@ fn list_item_tag(input: Input) -> IResult { Ok((input, (node(LIST_ITEM_TAG, children), ws))) } -fn list_item_content_node(input: Input, indent: usize) -> IResult { +#[tracing::instrument(level = "debug", skip(input), fields(input = input.s))] +fn list_item_content_node(input: Input, indent: usize) -> IResult { if memchr(b'\n', input.as_bytes()).is_none() { return Ok(( input.of(""), - node(LIST_ITEM_CONTENT, [node(PARAGRAPH, object_nodes(input))]), + ( + false, + node(LIST_ITEM_CONTENT, [node(PARAGRAPH, object_nodes(input))]), + ), )); }; let mut skip_one = true; let mut i = input; let mut children = vec![]; - let mut previous_line_is_blank = false; - 'l: loop { + let mut previous_blank_line: Option<(Input, Input)> = None; + 'l: while !i.is_empty() { for (input, head) in line_starts_iter(i.as_str()) // the first line in list item content will always be a paragraph // so we need to skip it in the first iteration @@ -188,50 +219,51 @@ fn list_item_content_node(input: Input, indent: usize) -> IResult { - previous_line_is_blank = false; - if next_indent <= indent { + let (input, head) = previous_blank_line.unwrap_or_else(|| (input, head)); if !head.is_empty() { - children.push(node(PARAGRAPH, object_nodes(head))); + children.extend(paragraph_nodes(head)?); } - return Ok((input, node(LIST_ITEM_CONTENT, children))); + return Ok((input, (false, node(LIST_ITEM_CONTENT, children)))); } + previous_blank_line = None; + if let Ok((input, element)) = element_node(input) { if !head.is_empty() { - children.push(node(PARAGRAPH, object_nodes(head))); + children.extend(paragraph_nodes(head)?); } children.push(element); + debug_assert!( + input.input_len() < i.input_len(), + "{} < {}", + input.input_len(), + i.input_len() + ); i = input; skip_one = false; continue 'l; } } - _ if previous_line_is_blank => { - // list item ends at two consecutive empty lines - if !head.is_empty() { - children.push(node(PARAGRAPH, object_nodes(head))); - } - let (input, post_blank) = blank_lines(input)?; - - children.extend(post_blank); - - return Ok((input, node(LIST_ITEM_CONTENT, children))); - } _ => { - previous_line_is_blank = true; + // list item ends at two consecutive empty lines + if let Some((input, head)) = previous_blank_line { + if !head.is_empty() { + children.extend(paragraph_nodes(head)?); + } + + return Ok((input, (true, node(LIST_ITEM_CONTENT, children)))); + } else { + previous_blank_line = Some((input, head)) + } } } } - + children.extend(paragraph_nodes(i)?); break; } - if !i.is_empty() { - children.push(node(PARAGRAPH, object_nodes(i))); - } - - Ok((input.of(""), node(LIST_ITEM_CONTENT, children))) + Ok((input.of(""), (false, node(LIST_ITEM_CONTENT, children)))) } fn get_line_indent(input: &str) -> Option { @@ -247,9 +279,8 @@ fn parse() { let to_list = to_ast::(list_node); - let list = to_list("1)"); insta::assert_debug_snapshot!( - list.syntax, + to_list("1)").syntax, @r###" LIST@0..2 LIST_ITEM@0..2 @@ -260,9 +291,8 @@ fn parse() { "### ); - let list = to_list("+ "); insta::assert_debug_snapshot!( - list.syntax, + to_list("+ ").syntax, @r###" LIST@0..2 LIST_ITEM@0..2 @@ -273,9 +303,8 @@ fn parse() { "### ); - let list = to_list("-\n"); insta::assert_debug_snapshot!( - list.syntax, + to_list("-\n").syntax, @r###" LIST@0..2 LIST_ITEM@0..2 @@ -283,14 +312,12 @@ fn parse() { LIST_ITEM_BULLET@0..1 "-" LIST_ITEM_CONTENT@1..2 PARAGRAPH@1..2 - TEXT@1..2 "\n" + BLANK_LINE@1..2 "\n" "### ); - let list = to_list("+ 1"); - assert!(!list.is_ordered()); insta::assert_debug_snapshot!( - list.syntax, + to_list("+ 1").syntax, @r###" LIST@0..3 LIST_ITEM@0..3 @@ -302,9 +329,8 @@ fn parse() { "### ); - let list = to_list("+ 1\n"); insta::assert_debug_snapshot!( - list.syntax, + to_list("+ 1\n").syntax, @r###" LIST@0..4 LIST_ITEM@0..4 @@ -316,12 +342,13 @@ fn parse() { "### ); - let list = to_list("+ [@A] 1\n\n\n+ 2"); + // list ends with two consecutive blank lines, and these blank lines + // will be the post_blank of list node insta::assert_debug_snapshot!( - list.syntax, + to_list("+ [@A] 1\n\n\n+ 2").syntax, @r###" - LIST@0..14 - LIST_ITEM@0..11 + LIST@0..11 + LIST_ITEM@0..9 LIST_ITEM_INDENT@0..0 "" LIST_ITEM_BULLET@0..2 "+ " LIST_ITEM_COUNTER@2..6 @@ -330,25 +357,21 @@ fn parse() { TEXT@4..5 "A" R_BRACKET@5..6 "]" WHITESPACE@6..7 " " - LIST_ITEM_CONTENT@7..11 - PARAGRAPH@7..10 - TEXT@7..10 "1\n\n" - BLANK_LINE@10..11 "\n" - LIST_ITEM@11..14 - LIST_ITEM_INDENT@11..11 "" - LIST_ITEM_BULLET@11..13 "+ " - LIST_ITEM_CONTENT@13..14 - PARAGRAPH@13..14 - TEXT@13..14 "2" + LIST_ITEM_CONTENT@7..9 + PARAGRAPH@7..9 + TEXT@7..9 "1\n" + BLANK_LINE@9..10 "\n" + BLANK_LINE@10..11 "\n" "### ); - let list = to_list("+ *TAG* :: item1\n+ [X] item2"); + // empty line between list item, the empty line will be + // the post_blank of first item insta::assert_debug_snapshot!( - list.syntax, + to_list("+ *TAG* :: item1\n\n+ [X] item2").syntax, @r###" - LIST@0..28 - LIST_ITEM@0..17 + LIST@0..29 + LIST_ITEM@0..18 LIST_ITEM_INDENT@0..0 "" LIST_ITEM_BULLET@0..2 "+ " LIST_ITEM_TAG@2..10 @@ -362,20 +385,22 @@ fn parse() { LIST_ITEM_CONTENT@10..17 PARAGRAPH@10..17 TEXT@10..17 " item1\n" - LIST_ITEM@17..28 - LIST_ITEM_INDENT@17..17 "" - LIST_ITEM_BULLET@17..19 "+ " - LIST_ITEM_CHECK_BOX@19..22 - L_BRACKET@19..20 "[" - TEXT@20..21 "X" - R_BRACKET@21..22 "]" - WHITESPACE@22..23 " " - LIST_ITEM_CONTENT@23..28 - PARAGRAPH@23..28 - TEXT@23..28 "item2" + BLANK_LINE@17..18 "\n" + LIST_ITEM@18..29 + LIST_ITEM_INDENT@18..18 "" + LIST_ITEM_BULLET@18..20 "+ " + LIST_ITEM_CHECK_BOX@20..23 + L_BRACKET@20..21 "[" + TEXT@21..22 "X" + R_BRACKET@22..23 "]" + WHITESPACE@23..24 " " + LIST_ITEM_CONTENT@24..29 + PARAGRAPH@24..29 + TEXT@24..29 "item2" "### ); + // nested list let list = to_list( r#"+ item1 + item2"#, @@ -400,9 +425,8 @@ fn parse() { "### ); - let list = to_list("* item1\nitem2"); insta::assert_debug_snapshot!( - list.syntax, + to_list("* item1\nitem2").syntax, @r###" LIST@0..8 LIST_ITEM@0..8 @@ -414,13 +438,8 @@ fn parse() { "### ); - let list = to_list( - r#"* item1 - - still item 1"#, - ); insta::assert_debug_snapshot!( - list.syntax, + to_list("* item1\n\n still item 1").syntax, @r###" LIST@0..23 LIST_ITEM@0..23 @@ -428,7 +447,8 @@ fn parse() { LIST_ITEM_BULLET@0..2 "* " LIST_ITEM_CONTENT@2..23 PARAGRAPH@2..9 - TEXT@2..9 "item1\n\n" + TEXT@2..8 "item1\n" + BLANK_LINE@8..9 "\n" PARAGRAPH@9..23 TEXT@9..23 " still item 1" "### @@ -455,7 +475,8 @@ fn parse() { LIST_ITEM_BULLET@14..16 "+ " LIST_ITEM_CONTENT@16..26 PARAGRAPH@16..26 - TEXT@16..26 "item2\n " + TEXT@16..22 "item2\n" + BLANK_LINE@22..26 " " "### ); @@ -476,14 +497,16 @@ fn parse() { LIST_ITEM_BULLET@0..3 "1. " LIST_ITEM_CONTENT@3..23 PARAGRAPH@3..10 - TEXT@3..10 "item1\n\n" + TEXT@3..9 "item1\n" + BLANK_LINE@9..10 "\n" LIST@10..23 LIST_ITEM@10..23 LIST_ITEM_INDENT@10..14 " " LIST_ITEM_BULLET@14..16 "- " - LIST_ITEM_CONTENT@16..23 - PARAGRAPH@16..23 - TEXT@16..23 "item2\n\n" + LIST_ITEM_CONTENT@16..22 + PARAGRAPH@16..22 + TEXT@16..22 "item2\n" + BLANK_LINE@22..23 "\n" LIST_ITEM@23..32 LIST_ITEM_INDENT@23..23 "" LIST_ITEM_BULLET@23..26 "3. " @@ -493,21 +516,18 @@ fn parse() { "### ); - let list = to_list( - r#" + item1 - - + item2"#, - ); + // nested list insta::assert_debug_snapshot!( - list.syntax, + to_list(" + item1\n\n + item2").syntax, @r###" LIST@0..20 LIST_ITEM@0..11 LIST_ITEM_INDENT@0..2 " " LIST_ITEM_BULLET@2..4 "+ " - LIST_ITEM_CONTENT@4..11 - PARAGRAPH@4..11 - TEXT@4..11 "item1\n\n" + LIST_ITEM_CONTENT@4..10 + PARAGRAPH@4..10 + TEXT@4..10 "item1\n" + BLANK_LINE@10..11 "\n" LIST_ITEM@11..20 LIST_ITEM_INDENT@11..13 " " LIST_ITEM_BULLET@13..15 "+ " @@ -517,14 +537,8 @@ fn parse() { "### ); - let list = to_list( - r#" 1. item1 - 2. item2 - 3. item3"#, - ); - assert!(list.is_ordered()); insta::assert_debug_snapshot!( - list.syntax, + to_list(" 1. item1\n 2. item2\n 3. item3").syntax, @r###" LIST@0..42 LIST_ITEM@0..42 @@ -550,15 +564,9 @@ fn parse() { "### ); - let list = to_list( - r#" 1. item1 - #+begin_example -hello -#+end_example -"#, - ); + // Indentation of lines within other greater elements do not count insta::assert_debug_snapshot!( - list.syntax, + to_list(" 1. item1\n #+begin_example\nhello\n#+end_example\n").syntax, @r###" LIST@0..51 LIST_ITEM@0..51 diff --git a/src/syntax/object.rs b/src/syntax/object.rs index c1b0888..6b294ad 100644 --- a/src/syntax/object.rs +++ b/src/syntax/object.rs @@ -16,87 +16,115 @@ use super::{ timestamp::{timestamp_active_node, timestamp_diary_node, timestamp_inactive_node}, }; -pub struct InlinePositions<'a> { - bytes: &'a [u8], +struct ObjectPositions<'a> { + input: Input<'a>, pos: usize, next: Option, + finder: jetscii::BytesConst, } -impl InlinePositions<'_> { - pub fn new(bytes: &[u8]) -> InlinePositions { - InlinePositions { - bytes, +impl ObjectPositions<'_> { + fn new(input: Input) -> ObjectPositions { + ObjectPositions { + input, pos: 0, next: Some(0), + finder: jetscii::bytes!(b'@', b'<', b'[', b' ', b'(', b'{', b'\'', b'"', b'\n'), } } } -impl Iterator for InlinePositions<'_> { - type Item = usize; +impl<'a> Iterator for ObjectPositions<'a> { + type Item = (Input<'a>, Input<'a>); fn next(&mut self) -> Option { - self.next.take().or_else(|| { - jetscii::bytes!(b'@', b'<', b'[', b' ', b'(', b'{', b'\'', b'"', b'\n') - .find(&self.bytes[self.pos..]) - .map(|i| { - self.pos += i + 1; + if self.input.input_len() < 3 { + return None; + } - match self.bytes[self.pos - 1] { - b'{' => { - self.next = Some(self.pos); - self.pos - 1 - } - b' ' | b'(' | b'\'' | b'"' | b'\n' => self.pos, - _ => self.pos - 1, - } - }) - }) + if let Some(p) = self.next.take() { + return Some(self.input.take_split(p)); + } + + if self.pos >= self.input.input_len() { + return None; + } + + let bytes = &self.input.as_bytes()[self.pos..]; + let previous = self.pos; + let i = self.finder.find(bytes)?; + self.pos += i + 1; + + let p = match bytes[i] { + b'{' => { + self.next = Some(self.pos); + self.pos - 1 + } + b' ' | b'(' | b'\'' | b'"' | b'\n' => self.pos, + _ => self.pos - 1, + }; + + debug_assert!( + previous < self.pos && self.pos <= self.input.s.len(), + "{} < {} < {}", + previous, + self.pos, + self.input.s.len() + ); + + // a valid object requires at least three characters + if self.input.s.len() - p < 3 { + return None; + } + + Some(self.input.take_split(p)) } } pub fn object_nodes(input: Input) -> Vec { + // TODO: // debug_assert!(!input.is_empty()); - let nodes = object_nodes_base(input); + + let mut i = input; + let mut nodes = vec![]; + + 'l: while !i.is_empty() { + for (input, head) in ObjectPositions::new(i) { + debug_assert!( + input.s.len() >= 3, + "object must have at least three characters: {:?}", + input.s + ); + if let Ok((input, node)) = object_node(input) { + if !head.is_empty() { + nodes.push(head.text_token()) + } + nodes.push(node); + debug_assert!( + input.input_len() < i.input_len(), + "{} < {}", + input.input_len(), + i.input_len() + ); + i = input; + continue 'l; + } + } + nodes.push(i.text_token()); + break; + } + debug_assert_eq!( input.as_str(), nodes.iter().fold(String::new(), |s, i| s + &i.to_string()), "parser must be lossless" ); + nodes } -fn object_nodes_base(input: Input) -> Vec { - let mut children = vec![]; - - let mut i = input; - 'l: loop { - for (input, head) in InlinePositions::new(i.as_bytes()).map(|idx| i.take_split(idx)) { - if let Ok((input, node)) = object_node(input) { - if !head.is_empty() { - children.push(head.text_token()) - } - children.push(node); - i = input; - continue 'l; - } - } - - break; - } - - if !i.is_empty() { - children.push(i.text_token()); - } - - children -} - +/// Recognizes an org-mode element expect text fn object_node(i: Input) -> IResult { - if i.input_len() < 3 { - return Err(nom::Err::Error(())); - } - match &i.as_bytes()[0] { b'*' => bold_node(i), b'+' => strike_node(i), @@ -120,6 +148,22 @@ fn object_node(i: Input) -> IResult { } } +#[test] +fn positions() { + let config = crate::ParseConfig::default(); + + let vec = ObjectPositions::new(("*{", &config).into()).collect::>(); + assert!(vec.is_empty()); + + let vec = ObjectPositions::new(("*{()}//s\nc<<", &config).into()).collect::>(); + assert_eq!(vec.len(), 5); + assert_eq!(vec[0].0.s, "*{()}//s\nc<<"); + assert_eq!(vec[1].0.s, "{()}//s\nc<<"); + assert_eq!(vec[2].0.s, "()}//s\nc<<"); + assert_eq!(vec[3].0.s, ")}//s\nc<<"); + assert_eq!(vec[4].0.s, "c<<"); +} + #[test] fn parse() { use crate::{ diff --git a/src/syntax/paragraph.rs b/src/syntax/paragraph.rs index 27a59ca..caa8c65 100644 --- a/src/syntax/paragraph.rs +++ b/src/syntax/paragraph.rs @@ -1,4 +1,4 @@ -use nom::{IResult, InputTake}; +use nom::{IResult, InputLength, InputTake}; use super::{ combinator::{blank_lines, line_ends_iter, node, GreenElement}, @@ -8,10 +8,29 @@ use super::{ SyntaxKind, }; +/// Recognizes one paragraph pub fn paragraph_node(input: Input) -> IResult { crate::lossless_parser!(paragraph_node_base, input) } +/// Recognizes multiple paragraphs +pub fn paragraph_nodes(input: Input) -> Result, nom::Err<()>> { + let mut i = input; + let mut children = vec![]; + while !i.is_empty() { + let (input, node) = paragraph_node(i)?; + children.push(node); + debug_assert!( + i.input_len() > input.input_len(), + "{} > {}", + i.input_len(), + input.input_len() + ); + i = input; + } + Ok(children) +} + fn paragraph_node_base(input: Input) -> IResult { debug_assert!(!input.is_empty()); diff --git a/src/syntax/planning.rs b/src/syntax/planning.rs index 284d9e3..8560d3d 100644 --- a/src/syntax/planning.rs +++ b/src/syntax/planning.rs @@ -15,6 +15,7 @@ use super::{ }; pub fn planning_node(input: Input) -> IResult { + debug_assert!(!input.is_empty()); crate::lossless_parser!(planning_node_base, input) } diff --git a/tests/html.rs b/tests/html.rs index f54808d..82181b7 100644 --- a/tests/html.rs +++ b/tests/html.rs @@ -58,13 +58,9 @@ fn list() { "#).to_html(), @r###"
  • 1 -

  • 2 -

    • 3 -

    • 4 -

  • 5

"###