From 471a23c95873a56ab924c1089cecbd348bacf208 Mon Sep 17 00:00:00 2001 From: PoiScript Date: Thu, 23 Nov 2023 17:56:17 +0800 Subject: [PATCH] fix: consider single '\r' as newline --- src/syntax/block.rs | 16 ++--- src/syntax/clock.rs | 10 +-- src/syntax/combinator.rs | 114 +++++++++++++++++++++++--------- src/syntax/comment.rs | 11 +-- src/syntax/drawer.rs | 13 ++-- src/syntax/dyn_block.rs | 16 ++--- src/syntax/fixed_width.rs | 11 +-- src/syntax/latex_environment.rs | 8 +-- src/syntax/line_break.rs | 17 +---- src/syntax/planning.rs | 12 ++-- src/syntax/rule.rs | 10 +-- src/syntax/table.rs | 5 +- tests/parse.rs | 5 +- 13 files changed, 138 insertions(+), 110 deletions(-) diff --git a/src/syntax/block.rs b/src/syntax/block.rs index 6e142ec..deea665 100644 --- a/src/syntax/block.rs +++ b/src/syntax/block.rs @@ -1,15 +1,14 @@ use nom::{ - branch::alt, bytes::complete::{tag, tag_no_case}, - character::complete::{alpha1, line_ending, space0}, - combinator::eof, + character::complete::{alpha1, space0}, sequence::tuple, IResult, InputTake, }; use super::{ combinator::{ - blank_lines, line_starts_iter, node, token, trim_line_end, GreenElement, NodeBuilder, + blank_lines, eol_or_eof, line_starts_iter, node, token, trim_line_end, GreenElement, + NodeBuilder, }, element::element_nodes, input::Input, @@ -67,13 +66,8 @@ fn block_begin_node(input: Input) -> IResult { } fn block_end_node<'a>(input: Input<'a>, name: &str) -> IResult, GreenElement, ()> { - let (input, (ws, end, name, ws_, nl)) = tuple(( - space0, - tag_no_case("#+END_"), - tag(name), - space0, - alt((line_ending, eof)), - ))(input)?; + let (input, (ws, end, name, ws_, nl)) = + tuple((space0, tag_no_case("#+END_"), tag(name), space0, eol_or_eof))(input)?; let mut b = NodeBuilder::new(); b.ws(ws); diff --git a/src/syntax/clock.rs b/src/syntax/clock.rs index 0e98a31..99c179c 100644 --- a/src/syntax/clock.rs +++ b/src/syntax/clock.rs @@ -1,14 +1,16 @@ use nom::{ branch::alt, bytes::complete::tag, - character::complete::{digit1, line_ending, space0}, - combinator::{eof, map, opt, recognize}, + character::complete::{digit1, space0}, + combinator::{map, opt, recognize}, sequence::tuple, IResult, }; use super::{ - combinator::{blank_lines, colon_token, double_arrow_token, GreenElement, NodeBuilder}, + combinator::{ + blank_lines, colon_token, double_arrow_token, eol_or_eof, GreenElement, NodeBuilder, + }, input::Input, timestamp::{timestamp_active_node, timestamp_inactive_node}, SyntaxKind, @@ -29,7 +31,7 @@ pub fn clock_node(input: Input) -> IResult { recognize(tuple((digit1, colon_token, digit1))), ))), space0, - alt((line_ending, eof)), + eol_or_eof, blank_lines, )), |(ws, clock, ws_, timestamp, duration, ws__, nl, post_blank)| { diff --git a/src/syntax/combinator.rs b/src/syntax/combinator.rs index 89552e7..2047e96 100644 --- a/src/syntax/combinator.rs +++ b/src/syntax/combinator.rs @@ -1,10 +1,7 @@ -use std::iter::once; - -use memchr::{memchr, memchr2_iter, memchr_iter}; -use nom::{ - bytes::complete::tag, character::complete::space0, AsBytes, IResult, InputLength, InputTake, -}; +use memchr::{memchr2, memchr2_iter, Memchr2}; +use nom::{bytes::complete::tag, AsBytes, IResult, InputTake, Slice}; use rowan::{GreenNode, GreenToken, Language, NodeOrToken}; +use std::iter::once; use super::{input::Input, OrgLanguage, SyntaxKind, SyntaxKind::*}; @@ -101,13 +98,7 @@ pub fn blank_lines(input: Input) -> IResult, ()> { let mut start = 0; let bytes = input.as_bytes(); - for index in memchr2_iter(b'\r', b'\n', bytes) - .map(|i| i + 1) - .chain(once(bytes.len())) - { - if bytes.get(index - 1) == Some(&b'\r') && bytes.get(index) == Some(&b'\n') { - continue; - } + for index in line_ends_iter(input.as_str()) { if start != index && bytes[start..index].iter().all(|b| b.is_ascii_whitespace()) { lines.push(token(BLANK_LINE, &input.as_str()[start..index])); start = index; @@ -116,7 +107,7 @@ pub fn blank_lines(input: Input) -> IResult, ()> { } } - Ok((input.take_split(start).0, lines)) + Ok((input.slice(start..), lines)) } #[test] @@ -159,21 +150,26 @@ fn test_blank_lines() { /// Returns 1. anything before trailing whitespace, 2. whitespace itself, 3. line feeding pub fn trim_line_end(input: Input) -> IResult { - let (input, line) = input.take_split( - memchr(b'\n', input.as_bytes()) - .map(|i| i + 1) - .unwrap_or(input.input_len()), - ); + let bytes = input.as_bytes(); - let (ws_and_nl, contents) = line.take_split( - line.as_bytes() - .iter() - .rposition(|u| !u.is_ascii_whitespace()) - .map(|i| i + 1) - .unwrap_or(0), - ); + let (input, contents, nl) = match memchr2(b'\r', b'\n', bytes) { + Some(i) if bytes[i] == b'\r' && matches!(bytes.get(i + 1), Some(b'\n')) => ( + input.slice(i + 2..), + input.slice(0..i), + input.slice(i..i + 2), + ), + Some(i) => ( + input.slice(i + 1..), + input.slice(0..i), + input.slice(i..i + 1), + ), + _ => (input.of(""), input, input.of("")), + }; - let (nl, ws) = space0(ws_and_nl)?; + let (contents, ws) = match contents.bytes().rposition(|u| !u.is_ascii_whitespace()) { + Some(i) => (contents.slice(0..i + 1), contents.slice(i + 1..)), + None => (contents.of(""), contents), + }; Ok((input, (contents, ws, nl))) } @@ -200,18 +196,72 @@ fn test_trim_line_end() { assert_eq!(output.0.as_str(), "* hello, world :abc:"); assert_eq!(output.1.as_str(), " "); assert_eq!(output.2.as_str(), "\r\n"); + + let (input, output) = trim_line_end((" \rr", config).into()).unwrap(); + assert_eq!(input.as_str(), "r"); + assert_eq!(output.0.as_str(), ""); + assert_eq!(output.1.as_str(), " "); + assert_eq!(output.2.as_str(), "\r"); +} + +/// Recognizes a line ending \r, \n, \r\n or end of file +pub fn eol_or_eof(input: Input) -> IResult { + let mut bytes = input.bytes(); + + let count = match bytes.next() { + Some(b'\n') => 1, + Some(b'\r') => { + if matches!(bytes.next(), Some(b'\n')) { + 2 + } else { + 1 + } + } + None => 0, + _ => return Err(nom::Err::Error(())), + }; + + Ok(input.take_split(count)) +} + +struct LineStart<'a> { + bytes: &'a [u8], + iter: Memchr2<'a>, +} + +impl<'a> LineStart<'a> { + fn new(input: &'a str) -> Self { + let bytes = input.as_bytes(); + LineStart { + bytes, + iter: memchr2_iter(b'\r', b'\n', bytes), + } + } +} + +impl<'a> Iterator for LineStart<'a> { + type Item = usize; + + fn next(&mut self) -> Option { + let i = self.iter.next()?; + if self.bytes[i] == b'\r' && self.bytes.get(i + 1) == Some(&b'\n') { + let ii = self.iter.next(); + debug_assert_eq!(i + 1, ii.unwrap()); + Some(i + 2) + } else { + Some(i + 1) + } + } } /// Returns an iterator of positions of line start, including zero pub fn line_starts_iter(s: &str) -> impl Iterator + '_ { - once(0).chain(memchr_iter(b'\n', s.as_bytes()).map(|i| i + 1)) + once(0).chain(LineStart::new(s)) } /// Returns an iterator of positions of line end, including eof pub fn line_ends_iter(s: &str) -> impl Iterator + '_ { - memchr_iter(b'\n', s.as_bytes()) - .map(|i| i + 1) - .chain(once(s.len())) + LineStart::new(s).chain(once(s.len())) } pub struct NodeBuilder { @@ -233,7 +283,7 @@ impl NodeBuilder { pub fn nl(&mut self, i: Input) { if !i.is_empty() { debug_assert!( - i.s == "\n" || i.s == "\r\n", + i.s == "\n" || i.s == "\r\n" || i.s == "\r", "{:?} should be a new line", i.s ); diff --git a/src/syntax/comment.rs b/src/syntax/comment.rs index 434a2ea..6ccb55e 100644 --- a/src/syntax/comment.rs +++ b/src/syntax/comment.rs @@ -1,4 +1,4 @@ -use nom::{IResult, InputTake}; +use nom::{AsBytes, IResult, InputTake}; use super::{ combinator::{blank_lines, line_ends_iter, node, GreenElement}, @@ -9,10 +9,13 @@ use super::{ fn comment_node_base(input: Input) -> IResult { let mut start = 0; for i in line_ends_iter(input.as_str()) { - let line = &input.as_str()[start..i]; - let trimmed = line.trim_start(); + let mut iter = input.as_bytes()[start..] + .iter() + .skip_while(|&&b| b == b' ' || b == b'\t'); - if trimmed == "#" || trimmed == "#\n" || trimmed == "#\r\n" || trimmed.starts_with("# ") { + if matches!(iter.next(), Some(b'#')) + && matches!(iter.next(), None | Some(b'\n') | Some(b'\r') | Some(b' ')) + { start = i; } else { break; diff --git a/src/syntax/drawer.rs b/src/syntax/drawer.rs index 55216c9..a0bd5c7 100644 --- a/src/syntax/drawer.rs +++ b/src/syntax/drawer.rs @@ -1,16 +1,15 @@ use nom::{ - branch::alt, bytes::complete::{tag_no_case, take_while1}, - character::complete::{line_ending, space0, space1}, - combinator::{eof, iterator, map, opt}, + character::complete::{space0, space1}, + combinator::{iterator, map, opt}, sequence::tuple, IResult, InputTake, }; use super::{ combinator::{ - blank_lines, colon_token, line_starts_iter, node, plus_token, trim_line_end, GreenElement, - NodeBuilder, + blank_lines, colon_token, eol_or_eof, line_starts_iter, node, plus_token, trim_line_end, + GreenElement, NodeBuilder, }, input::Input, SyntaxKind::*, @@ -25,7 +24,7 @@ fn drawer_begin_node(input: Input) -> IResult { take_while1(|c: char| c.is_ascii_alphabetic() || c == '-' || c == '_'), colon_token, space0, - alt((line_ending, eof)), + eol_or_eof, ))(input)?; b.ws(ws); @@ -45,7 +44,7 @@ fn drawer_end_node(input: Input) -> IResult { tag_no_case("END"), colon_token, space0, - alt((line_ending, eof)), + eol_or_eof, ))(input)?; let mut b = NodeBuilder::new(); diff --git a/src/syntax/dyn_block.rs b/src/syntax/dyn_block.rs index 5e01397..eb9ad12 100644 --- a/src/syntax/dyn_block.rs +++ b/src/syntax/dyn_block.rs @@ -1,14 +1,14 @@ use nom::{ - branch::alt, bytes::complete::tag_no_case, - character::complete::{alpha1, line_ending, space0, space1}, - combinator::eof, + character::complete::{alpha1, space0, space1}, sequence::tuple, IResult, InputTake, }; use super::{ - combinator::{blank_lines, line_starts_iter, node, trim_line_end, GreenElement, NodeBuilder}, + combinator::{ + blank_lines, eol_or_eof, line_starts_iter, node, trim_line_end, GreenElement, NodeBuilder, + }, input::Input, SyntaxKind::*, }; @@ -55,12 +55,8 @@ fn dyn_block_begin_node(input: Input) -> IResult { } fn dyn_block_end_node(input: Input) -> IResult { - let (input, (ws, end, ws_, nl)) = tuple(( - space0, - tag_no_case("#+END:"), - space0, - alt((line_ending, eof)), - ))(input)?; + let (input, (ws, end, ws_, nl)) = + tuple((space0, tag_no_case("#+END:"), space0, eol_or_eof))(input)?; let mut b = NodeBuilder::new(); b.ws(ws); diff --git a/src/syntax/fixed_width.rs b/src/syntax/fixed_width.rs index c16bc4b..32875f9 100644 --- a/src/syntax/fixed_width.rs +++ b/src/syntax/fixed_width.rs @@ -1,4 +1,4 @@ -use nom::{IResult, InputTake}; +use nom::{AsBytes, IResult, InputTake}; use super::{ combinator::{blank_lines, line_ends_iter, node, GreenElement}, @@ -9,10 +9,13 @@ use super::{ fn fixed_width_node_base(input: Input) -> IResult { let mut start = 0; for i in line_ends_iter(input.as_str()) { - let line = &input.s[start..i]; - let trimmed = line.trim_start(); + let mut iter = input.as_bytes()[start..] + .iter() + .skip_while(|&&b| b == b' ' || b == b'\t'); - if trimmed == ":" || trimmed == ":\n" || trimmed == ":\r\n" || trimmed.starts_with(": ") { + if matches!(iter.next(), Some(b':')) + && matches!(iter.next(), None | Some(b'\n') | Some(b'\r') | Some(b' ')) + { start = i; } else { break; diff --git a/src/syntax/latex_environment.rs b/src/syntax/latex_environment.rs index b731e1b..cbcdabf 100644 --- a/src/syntax/latex_environment.rs +++ b/src/syntax/latex_environment.rs @@ -1,8 +1,6 @@ use nom::{ - branch::alt, bytes::complete::{tag, take_while1}, - character::complete::{line_ending, space0}, - combinator::eof, + character::complete::space0, sequence::tuple, IResult, InputTake, }; @@ -10,7 +8,7 @@ use nom::{ use crate::SyntaxKind; use super::{ - combinator::{l_curly_token, line_starts_iter, node, r_curly_token, GreenElement}, + combinator::{eol_or_eof, l_curly_token, line_starts_iter, node, r_curly_token, GreenElement}, input::Input, }; @@ -36,7 +34,7 @@ fn latex_environment_node_base(input: Input) -> IResult tag(name1.s), r_curly_token, space0, - alt((line_ending, eof)), + eol_or_eof, ))(input) { return Ok(( diff --git a/src/syntax/line_break.rs b/src/syntax/line_break.rs index 019fab5..95b5789 100644 --- a/src/syntax/line_break.rs +++ b/src/syntax/line_break.rs @@ -1,13 +1,7 @@ -use nom::{ - branch::alt, - character::complete::{line_ending, space0}, - combinator::{eof, map}, - sequence::tuple, - IResult, -}; +use nom::{character::complete::space0, combinator::map, sequence::tuple, IResult}; use crate::{ - syntax::combinator::{backslash_token, node}, + syntax::combinator::{backslash_token, eol_or_eof, node}, SyntaxKind, }; @@ -16,12 +10,7 @@ use super::{combinator::GreenElement, input::Input}; pub fn line_break_node(input: Input) -> IResult { debug_assert!(input.s.starts_with('\\')); let mut parser = map( - tuple(( - backslash_token, - backslash_token, - space0, - alt((line_ending, eof)), - )), + tuple((backslash_token, backslash_token, space0, eol_or_eof)), |(b1, b2, ws, nl)| { node( SyntaxKind::LINE_BREAK, diff --git a/src/syntax/planning.rs b/src/syntax/planning.rs index 8560d3d..d35c270 100644 --- a/src/syntax/planning.rs +++ b/src/syntax/planning.rs @@ -1,14 +1,10 @@ use nom::{ - branch::alt, - bytes::complete::tag, - character::complete::{line_ending, space0}, - combinator::{eof, iterator}, - sequence::tuple, - IResult, + branch::alt, bytes::complete::tag, character::complete::space0, combinator::iterator, + sequence::tuple, IResult, }; use super::{ - combinator::{GreenElement, NodeBuilder}, + combinator::{eol_or_eof, GreenElement, NodeBuilder}, input::Input, timestamp::{timestamp_active_node, timestamp_inactive_node}, SyntaxKind::*, @@ -54,7 +50,7 @@ fn planning_node_base(input: Input) -> IResult { let (input, _) = it.finish()?; let (input, ws) = space0(input)?; - let (input, nl) = alt((line_ending, eof))(input)?; + let (input, nl) = eol_or_eof(input)?; b.ws(ws); b.nl(nl); diff --git a/src/syntax/rule.rs b/src/syntax/rule.rs index 9dacd19..d498ea1 100644 --- a/src/syntax/rule.rs +++ b/src/syntax/rule.rs @@ -1,14 +1,10 @@ use nom::{ - branch::alt, - bytes::complete::take_while_m_n, - character::complete::{line_ending, space0}, - combinator::{eof, map}, - sequence::tuple, + bytes::complete::take_while_m_n, character::complete::space0, combinator::map, sequence::tuple, IResult, }; use super::{ - combinator::{blank_lines, GreenElement, NodeBuilder}, + combinator::{blank_lines, eol_or_eof, GreenElement, NodeBuilder}, input::Input, SyntaxKind::*, }; @@ -19,7 +15,7 @@ pub fn rule_node(input: Input) -> IResult { space0, take_while_m_n(5, usize::max_value(), |c| c == '-'), space0, - alt((line_ending, eof)), + eol_or_eof, blank_lines, )), |(ws, dashes, ws_, nl, post_blank)| { diff --git a/src/syntax/table.rs b/src/syntax/table.rs index 803782d..93e937d 100644 --- a/src/syntax/table.rs +++ b/src/syntax/table.rs @@ -19,7 +19,7 @@ fn org_table_node_base(input: Input) -> IResult { let mut start = 0; for i in line_ends_iter(input.as_str()) { let line = input.slice(start..i); - let trimmed = line.as_str().trim_start(); + let trimmed = line.as_str().trim_start_matches([' ', '\t']); // Org tables end at the first line not starting with a vertical bar. if !trimmed.starts_with('|') { @@ -81,7 +81,8 @@ fn table_standard_row_node(input: Input) -> Result> { } } }); - it.finish()?; + let (input, _) = it.finish()?; + debug_assert!(input.is_empty()); Ok(b.finish(ORG_TABLE_STANDARD_ROW)) } diff --git a/tests/parse.rs b/tests/parse.rs index 6f78ae8..813316b 100644 --- a/tests/parse.rs +++ b/tests/parse.rs @@ -23,9 +23,10 @@ const INPUT: &[&str] = &[ // fuzz test "___\n", "\n\n\n", - "\n\n\n", "\n*", - "\r-" + "\r-", + "6\r\n", + "|\n\u{b}|" ]; #[test]