diff --git a/src/syntax/block.rs b/src/syntax/block.rs
index 6e142ec..deea665 100644
--- a/src/syntax/block.rs
+++ b/src/syntax/block.rs
@@ -1,15 +1,14 @@
use nom::{
- branch::alt,
bytes::complete::{tag, tag_no_case},
- character::complete::{alpha1, line_ending, space0},
- combinator::eof,
+ character::complete::{alpha1, space0},
sequence::tuple,
IResult, InputTake,
};
use super::{
combinator::{
- blank_lines, line_starts_iter, node, token, trim_line_end, GreenElement, NodeBuilder,
+ blank_lines, eol_or_eof, line_starts_iter, node, token, trim_line_end, GreenElement,
+ NodeBuilder,
},
element::element_nodes,
input::Input,
@@ -67,13 +66,8 @@ fn block_begin_node(input: Input) -> IResult {
}
fn block_end_node<'a>(input: Input<'a>, name: &str) -> IResult, GreenElement, ()> {
- let (input, (ws, end, name, ws_, nl)) = tuple((
- space0,
- tag_no_case("#+END_"),
- tag(name),
- space0,
- alt((line_ending, eof)),
- ))(input)?;
+ let (input, (ws, end, name, ws_, nl)) =
+ tuple((space0, tag_no_case("#+END_"), tag(name), space0, eol_or_eof))(input)?;
let mut b = NodeBuilder::new();
b.ws(ws);
diff --git a/src/syntax/clock.rs b/src/syntax/clock.rs
index 0e98a31..99c179c 100644
--- a/src/syntax/clock.rs
+++ b/src/syntax/clock.rs
@@ -1,14 +1,16 @@
use nom::{
branch::alt,
bytes::complete::tag,
- character::complete::{digit1, line_ending, space0},
- combinator::{eof, map, opt, recognize},
+ character::complete::{digit1, space0},
+ combinator::{map, opt, recognize},
sequence::tuple,
IResult,
};
use super::{
- combinator::{blank_lines, colon_token, double_arrow_token, GreenElement, NodeBuilder},
+ combinator::{
+ blank_lines, colon_token, double_arrow_token, eol_or_eof, GreenElement, NodeBuilder,
+ },
input::Input,
timestamp::{timestamp_active_node, timestamp_inactive_node},
SyntaxKind,
@@ -29,7 +31,7 @@ pub fn clock_node(input: Input) -> IResult {
recognize(tuple((digit1, colon_token, digit1))),
))),
space0,
- alt((line_ending, eof)),
+ eol_or_eof,
blank_lines,
)),
|(ws, clock, ws_, timestamp, duration, ws__, nl, post_blank)| {
diff --git a/src/syntax/combinator.rs b/src/syntax/combinator.rs
index 89552e7..2047e96 100644
--- a/src/syntax/combinator.rs
+++ b/src/syntax/combinator.rs
@@ -1,10 +1,7 @@
-use std::iter::once;
-
-use memchr::{memchr, memchr2_iter, memchr_iter};
-use nom::{
- bytes::complete::tag, character::complete::space0, AsBytes, IResult, InputLength, InputTake,
-};
+use memchr::{memchr2, memchr2_iter, Memchr2};
+use nom::{bytes::complete::tag, AsBytes, IResult, InputTake, Slice};
use rowan::{GreenNode, GreenToken, Language, NodeOrToken};
+use std::iter::once;
use super::{input::Input, OrgLanguage, SyntaxKind, SyntaxKind::*};
@@ -101,13 +98,7 @@ pub fn blank_lines(input: Input) -> IResult, ()> {
let mut start = 0;
let bytes = input.as_bytes();
- for index in memchr2_iter(b'\r', b'\n', bytes)
- .map(|i| i + 1)
- .chain(once(bytes.len()))
- {
- if bytes.get(index - 1) == Some(&b'\r') && bytes.get(index) == Some(&b'\n') {
- continue;
- }
+ for index in line_ends_iter(input.as_str()) {
if start != index && bytes[start..index].iter().all(|b| b.is_ascii_whitespace()) {
lines.push(token(BLANK_LINE, &input.as_str()[start..index]));
start = index;
@@ -116,7 +107,7 @@ pub fn blank_lines(input: Input) -> IResult, ()> {
}
}
- Ok((input.take_split(start).0, lines))
+ Ok((input.slice(start..), lines))
}
#[test]
@@ -159,21 +150,26 @@ fn test_blank_lines() {
/// Returns 1. anything before trailing whitespace, 2. whitespace itself, 3. line feeding
pub fn trim_line_end(input: Input) -> IResult<Input, (Input, Input, Input), ()> {
- let (input, line) = input.take_split(
- memchr(b'\n', input.as_bytes())
- .map(|i| i + 1)
- .unwrap_or(input.input_len()),
- );
+ let bytes = input.as_bytes();
- let (ws_and_nl, contents) = line.take_split(
- line.as_bytes()
- .iter()
- .rposition(|u| !u.is_ascii_whitespace())
- .map(|i| i + 1)
- .unwrap_or(0),
- );
+ // Split at the first line terminator: `\r\n` is kept together, a lone `\r` or `\n`
+ // terminates the line on its own, and without any terminator the whole input is
+ // the line (remaining input and line ending are then both empty).
+ let (input, contents, nl) = match memchr2(b'\r', b'\n', bytes) {
+ Some(i) if bytes[i] == b'\r' && matches!(bytes.get(i + 1), Some(b'\n')) => (
+ input.slice(i + 2..),
+ input.slice(0..i),
+ input.slice(i..i + 2),
+ ),
+ Some(i) => (
+ input.slice(i + 1..),
+ input.slice(0..i),
+ input.slice(i..i + 1),
+ ),
+ _ => (input.of(""), input, input.of("")),
+ };
- let (nl, ws) = space0(ws_and_nl)?;
+ // Peel the trailing ASCII whitespace off the line; a line made only of whitespace
+ // ends up with empty contents and all of its bytes in `ws`.
+ let (contents, ws) = match contents.bytes().rposition(|u| !u.is_ascii_whitespace()) {
+ Some(i) => (contents.slice(0..i + 1), contents.slice(i + 1..)),
+ None => (contents.of(""), contents),
+ };
Ok((input, (contents, ws, nl)))
}
@@ -200,18 +196,72 @@ fn test_trim_line_end() {
assert_eq!(output.0.as_str(), "* hello, world :abc:");
assert_eq!(output.1.as_str(), " ");
assert_eq!(output.2.as_str(), "\r\n");
+
+ let (input, output) = trim_line_end((" \rr", config).into()).unwrap();
+ assert_eq!(input.as_str(), "r");
+ assert_eq!(output.0.as_str(), "");
+ assert_eq!(output.1.as_str(), " ");
+ assert_eq!(output.2.as_str(), "\r");
+}
+
+/// Recognizes a line ending \r, \n, \r\n or end of file
+///
+/// On success the returned pair holds the remaining input first and the matched
+/// line ending second; at end of file the match is empty. Any other leading byte
+/// yields `nom::Err::Error(())`.
+pub fn eol_or_eof(input: Input) -> IResult<Input, Input, ()> {
+ let mut bytes = input.bytes();
+
+ // number of bytes that make up the line ending (0 at end of file)
+ let count = match bytes.next() {
+ Some(b'\n') => 1,
+ Some(b'\r') => {
+ // `\r\n` is consumed as a single two-byte line ending
+ if matches!(bytes.next(), Some(b'\n')) {
+ 2
+ } else {
+ 1
+ }
+ }
+ None => 0,
+ _ => return Err(nom::Err::Error(())),
+ };
+
+ Ok(input.take_split(count))
+}
+
+/// Iterator over the byte offsets that directly follow each line terminator
+/// (`\r`, `\n` or `\r\n`) of a string.
+struct LineStart<'a> {
+ // raw bytes of the scanned string, used to peek at the byte after a `\r`
+ bytes: &'a [u8],
+ // yields the index of every `\r` and `\n` byte in `bytes`
+ iter: Memchr2<'a>,
+}
+
+impl<'a> LineStart<'a> {
+ /// Creates an iterator that scans `input` from its first byte.
+ fn new(input: &'a str) -> Self {
+ let bytes = input.as_bytes();
+ LineStart {
+ bytes,
+ iter: memchr2_iter(b'\r', b'\n', bytes),
+ }
+ }
+}
+
+impl<'a> Iterator for LineStart<'a> {
+ type Item = usize;
+
+ /// Returns the offset just past the next line terminator, or `None` once no
+ /// terminator is left.
+ fn next(&mut self) -> Option<Self::Item> {
+ let i = self.iter.next()?;
+ if self.bytes[i] == b'\r' && self.bytes.get(i + 1) == Some(&b'\n') {
+ // `\r\n` counts as one terminator: drop the hit reported for its `\n`
+ let ii = self.iter.next();
+ debug_assert_eq!(i + 1, ii.unwrap());
+ Some(i + 2)
+ } else {
+ Some(i + 1)
+ }
+ }
}
/// Returns an iterator of positions of line start, including zero
+///
+/// Every `\r`, `\n` and `\r\n` terminates a line; the offset right after it is
+/// reported as a line start.
pub fn line_starts_iter(s: &str) -> impl Iterator<Item = usize> + '_ {
- once(0).chain(memchr_iter(b'\n', s.as_bytes()).map(|i| i + 1))
+ once(0).chain(LineStart::new(s))
}
/// Returns an iterator of positions of line end, including eof
+///
+/// A line ends right after every `\r`, `\n` or `\r\n`; `s.len()` is always
+/// yielded as the final item.
pub fn line_ends_iter(s: &str) -> impl Iterator<Item = usize> + '_ {
- memchr_iter(b'\n', s.as_bytes())
- .map(|i| i + 1)
- .chain(once(s.len()))
+ LineStart::new(s).chain(once(s.len()))
}
pub struct NodeBuilder {
@@ -233,7 +283,7 @@ impl NodeBuilder {
pub fn nl(&mut self, i: Input) {
if !i.is_empty() {
debug_assert!(
- i.s == "\n" || i.s == "\r\n",
+ i.s == "\n" || i.s == "\r\n" || i.s == "\r",
"{:?} should be a new line",
i.s
);
diff --git a/src/syntax/comment.rs b/src/syntax/comment.rs
index 434a2ea..6ccb55e 100644
--- a/src/syntax/comment.rs
+++ b/src/syntax/comment.rs
@@ -1,4 +1,4 @@
-use nom::{IResult, InputTake};
+use nom::{AsBytes, IResult, InputTake};
use super::{
combinator::{blank_lines, line_ends_iter, node, GreenElement},
@@ -9,10 +9,13 @@ use super::{
fn comment_node_base(input: Input) -> IResult {
let mut start = 0;
for i in line_ends_iter(input.as_str()) {
- let line = &input.as_str()[start..i];
- let trimmed = line.trim_start();
+ let mut iter = input.as_bytes()[start..]
+ .iter()
+ .skip_while(|&&b| b == b' ' || b == b'\t');
- if trimmed == "#" || trimmed == "#\n" || trimmed == "#\r\n" || trimmed.starts_with("# ") {
+ if matches!(iter.next(), Some(b'#'))
+ && matches!(iter.next(), None | Some(b'\n') | Some(b'\r') | Some(b' '))
+ {
start = i;
} else {
break;
diff --git a/src/syntax/drawer.rs b/src/syntax/drawer.rs
index 55216c9..a0bd5c7 100644
--- a/src/syntax/drawer.rs
+++ b/src/syntax/drawer.rs
@@ -1,16 +1,15 @@
use nom::{
- branch::alt,
bytes::complete::{tag_no_case, take_while1},
- character::complete::{line_ending, space0, space1},
- combinator::{eof, iterator, map, opt},
+ character::complete::{space0, space1},
+ combinator::{iterator, map, opt},
sequence::tuple,
IResult, InputTake,
};
use super::{
combinator::{
- blank_lines, colon_token, line_starts_iter, node, plus_token, trim_line_end, GreenElement,
- NodeBuilder,
+ blank_lines, colon_token, eol_or_eof, line_starts_iter, node, plus_token, trim_line_end,
+ GreenElement, NodeBuilder,
},
input::Input,
SyntaxKind::*,
@@ -25,7 +24,7 @@ fn drawer_begin_node(input: Input) -> IResult {
take_while1(|c: char| c.is_ascii_alphabetic() || c == '-' || c == '_'),
colon_token,
space0,
- alt((line_ending, eof)),
+ eol_or_eof,
))(input)?;
b.ws(ws);
@@ -45,7 +44,7 @@ fn drawer_end_node(input: Input) -> IResult {
tag_no_case("END"),
colon_token,
space0,
- alt((line_ending, eof)),
+ eol_or_eof,
))(input)?;
let mut b = NodeBuilder::new();
diff --git a/src/syntax/dyn_block.rs b/src/syntax/dyn_block.rs
index 5e01397..eb9ad12 100644
--- a/src/syntax/dyn_block.rs
+++ b/src/syntax/dyn_block.rs
@@ -1,14 +1,14 @@
use nom::{
- branch::alt,
bytes::complete::tag_no_case,
- character::complete::{alpha1, line_ending, space0, space1},
- combinator::eof,
+ character::complete::{alpha1, space0, space1},
sequence::tuple,
IResult, InputTake,
};
use super::{
- combinator::{blank_lines, line_starts_iter, node, trim_line_end, GreenElement, NodeBuilder},
+ combinator::{
+ blank_lines, eol_or_eof, line_starts_iter, node, trim_line_end, GreenElement, NodeBuilder,
+ },
input::Input,
SyntaxKind::*,
};
@@ -55,12 +55,8 @@ fn dyn_block_begin_node(input: Input) -> IResult {
}
fn dyn_block_end_node(input: Input) -> IResult {
- let (input, (ws, end, ws_, nl)) = tuple((
- space0,
- tag_no_case("#+END:"),
- space0,
- alt((line_ending, eof)),
- ))(input)?;
+ let (input, (ws, end, ws_, nl)) =
+ tuple((space0, tag_no_case("#+END:"), space0, eol_or_eof))(input)?;
let mut b = NodeBuilder::new();
b.ws(ws);
diff --git a/src/syntax/fixed_width.rs b/src/syntax/fixed_width.rs
index c16bc4b..32875f9 100644
--- a/src/syntax/fixed_width.rs
+++ b/src/syntax/fixed_width.rs
@@ -1,4 +1,4 @@
-use nom::{IResult, InputTake};
+use nom::{AsBytes, IResult, InputTake};
use super::{
combinator::{blank_lines, line_ends_iter, node, GreenElement},
@@ -9,10 +9,13 @@ use super::{
fn fixed_width_node_base(input: Input) -> IResult {
let mut start = 0;
for i in line_ends_iter(input.as_str()) {
- let line = &input.s[start..i];
- let trimmed = line.trim_start();
+ let mut iter = input.as_bytes()[start..]
+ .iter()
+ .skip_while(|&&b| b == b' ' || b == b'\t');
- if trimmed == ":" || trimmed == ":\n" || trimmed == ":\r\n" || trimmed.starts_with(": ") {
+ if matches!(iter.next(), Some(b':'))
+ && matches!(iter.next(), None | Some(b'\n') | Some(b'\r') | Some(b' '))
+ {
start = i;
} else {
break;
diff --git a/src/syntax/latex_environment.rs b/src/syntax/latex_environment.rs
index b731e1b..cbcdabf 100644
--- a/src/syntax/latex_environment.rs
+++ b/src/syntax/latex_environment.rs
@@ -1,8 +1,6 @@
use nom::{
- branch::alt,
bytes::complete::{tag, take_while1},
- character::complete::{line_ending, space0},
- combinator::eof,
+ character::complete::space0,
sequence::tuple,
IResult, InputTake,
};
@@ -10,7 +8,7 @@ use nom::{
use crate::SyntaxKind;
use super::{
- combinator::{l_curly_token, line_starts_iter, node, r_curly_token, GreenElement},
+ combinator::{eol_or_eof, l_curly_token, line_starts_iter, node, r_curly_token, GreenElement},
input::Input,
};
@@ -36,7 +34,7 @@ fn latex_environment_node_base(input: Input) -> IResult
tag(name1.s),
r_curly_token,
space0,
- alt((line_ending, eof)),
+ eol_or_eof,
))(input)
{
return Ok((
diff --git a/src/syntax/line_break.rs b/src/syntax/line_break.rs
index 019fab5..95b5789 100644
--- a/src/syntax/line_break.rs
+++ b/src/syntax/line_break.rs
@@ -1,13 +1,7 @@
-use nom::{
- branch::alt,
- character::complete::{line_ending, space0},
- combinator::{eof, map},
- sequence::tuple,
- IResult,
-};
+use nom::{character::complete::space0, combinator::map, sequence::tuple, IResult};
use crate::{
- syntax::combinator::{backslash_token, node},
+ syntax::combinator::{backslash_token, eol_or_eof, node},
SyntaxKind,
};
@@ -16,12 +10,7 @@ use super::{combinator::GreenElement, input::Input};
pub fn line_break_node(input: Input) -> IResult {
debug_assert!(input.s.starts_with('\\'));
let mut parser = map(
- tuple((
- backslash_token,
- backslash_token,
- space0,
- alt((line_ending, eof)),
- )),
+ tuple((backslash_token, backslash_token, space0, eol_or_eof)),
|(b1, b2, ws, nl)| {
node(
SyntaxKind::LINE_BREAK,
diff --git a/src/syntax/planning.rs b/src/syntax/planning.rs
index 8560d3d..d35c270 100644
--- a/src/syntax/planning.rs
+++ b/src/syntax/planning.rs
@@ -1,14 +1,10 @@
use nom::{
- branch::alt,
- bytes::complete::tag,
- character::complete::{line_ending, space0},
- combinator::{eof, iterator},
- sequence::tuple,
- IResult,
+ branch::alt, bytes::complete::tag, character::complete::space0, combinator::iterator,
+ sequence::tuple, IResult,
};
use super::{
- combinator::{GreenElement, NodeBuilder},
+ combinator::{eol_or_eof, GreenElement, NodeBuilder},
input::Input,
timestamp::{timestamp_active_node, timestamp_inactive_node},
SyntaxKind::*,
@@ -54,7 +50,7 @@ fn planning_node_base(input: Input) -> IResult {
let (input, _) = it.finish()?;
let (input, ws) = space0(input)?;
- let (input, nl) = alt((line_ending, eof))(input)?;
+ let (input, nl) = eol_or_eof(input)?;
b.ws(ws);
b.nl(nl);
diff --git a/src/syntax/rule.rs b/src/syntax/rule.rs
index 9dacd19..d498ea1 100644
--- a/src/syntax/rule.rs
+++ b/src/syntax/rule.rs
@@ -1,14 +1,10 @@
use nom::{
- branch::alt,
- bytes::complete::take_while_m_n,
- character::complete::{line_ending, space0},
- combinator::{eof, map},
- sequence::tuple,
+ bytes::complete::take_while_m_n, character::complete::space0, combinator::map, sequence::tuple,
IResult,
};
use super::{
- combinator::{blank_lines, GreenElement, NodeBuilder},
+ combinator::{blank_lines, eol_or_eof, GreenElement, NodeBuilder},
input::Input,
SyntaxKind::*,
};
@@ -19,7 +15,7 @@ pub fn rule_node(input: Input) -> IResult {
space0,
take_while_m_n(5, usize::max_value(), |c| c == '-'),
space0,
- alt((line_ending, eof)),
+ eol_or_eof,
blank_lines,
)),
|(ws, dashes, ws_, nl, post_blank)| {
diff --git a/src/syntax/table.rs b/src/syntax/table.rs
index 803782d..93e937d 100644
--- a/src/syntax/table.rs
+++ b/src/syntax/table.rs
@@ -19,7 +19,7 @@ fn org_table_node_base(input: Input) -> IResult {
let mut start = 0;
for i in line_ends_iter(input.as_str()) {
let line = input.slice(start..i);
- let trimmed = line.as_str().trim_start();
+ let trimmed = line.as_str().trim_start_matches([' ', '\t']);
// Org tables end at the first line not starting with a vertical bar.
if !trimmed.starts_with('|') {
@@ -81,7 +81,8 @@ fn table_standard_row_node(input: Input) -> Result> {
}
}
});
- it.finish()?;
+ let (input, _) = it.finish()?;
+ debug_assert!(input.is_empty());
Ok(b.finish(ORG_TABLE_STANDARD_ROW))
}
diff --git a/tests/parse.rs b/tests/parse.rs
index 6f78ae8..813316b 100644
--- a/tests/parse.rs
+++ b/tests/parse.rs
@@ -23,9 +23,10 @@ const INPUT: &[&str] = &[
// fuzz test
"___\n",
"\n\n\n",
- "\n\n\n",
"\n*",
- "\r-"
+ "\r-",
+ "6\r\n",
+ "|\n\u{b}|"
];
#[test]