fix: consider single '\r' as newline

This commit is contained in:
PoiScript 2023-11-23 17:56:17 +08:00
parent 8fcfd60712
commit 471a23c958
No known key found for this signature in database
GPG key ID: 22C2B1249D99985E
13 changed files with 138 additions and 110 deletions

View file

@ -1,15 +1,14 @@
use nom::{
branch::alt,
bytes::complete::{tag, tag_no_case},
character::complete::{alpha1, line_ending, space0},
combinator::eof,
character::complete::{alpha1, space0},
sequence::tuple,
IResult, InputTake,
};
use super::{
combinator::{
blank_lines, line_starts_iter, node, token, trim_line_end, GreenElement, NodeBuilder,
blank_lines, eol_or_eof, line_starts_iter, node, token, trim_line_end, GreenElement,
NodeBuilder,
},
element::element_nodes,
input::Input,
@ -67,13 +66,8 @@ fn block_begin_node(input: Input) -> IResult<Input, (GreenElement, &str), ()> {
}
fn block_end_node<'a>(input: Input<'a>, name: &str) -> IResult<Input<'a>, GreenElement, ()> {
let (input, (ws, end, name, ws_, nl)) = tuple((
space0,
tag_no_case("#+END_"),
tag(name),
space0,
alt((line_ending, eof)),
))(input)?;
let (input, (ws, end, name, ws_, nl)) =
tuple((space0, tag_no_case("#+END_"), tag(name), space0, eol_or_eof))(input)?;
let mut b = NodeBuilder::new();
b.ws(ws);

View file

@ -1,14 +1,16 @@
use nom::{
branch::alt,
bytes::complete::tag,
character::complete::{digit1, line_ending, space0},
combinator::{eof, map, opt, recognize},
character::complete::{digit1, space0},
combinator::{map, opt, recognize},
sequence::tuple,
IResult,
};
use super::{
combinator::{blank_lines, colon_token, double_arrow_token, GreenElement, NodeBuilder},
combinator::{
blank_lines, colon_token, double_arrow_token, eol_or_eof, GreenElement, NodeBuilder,
},
input::Input,
timestamp::{timestamp_active_node, timestamp_inactive_node},
SyntaxKind,
@ -29,7 +31,7 @@ pub fn clock_node(input: Input) -> IResult<Input, GreenElement, ()> {
recognize(tuple((digit1, colon_token, digit1))),
))),
space0,
alt((line_ending, eof)),
eol_or_eof,
blank_lines,
)),
|(ws, clock, ws_, timestamp, duration, ws__, nl, post_blank)| {

View file

@ -1,10 +1,7 @@
use std::iter::once;
use memchr::{memchr, memchr2_iter, memchr_iter};
use nom::{
bytes::complete::tag, character::complete::space0, AsBytes, IResult, InputLength, InputTake,
};
use memchr::{memchr2, memchr2_iter, Memchr2};
use nom::{bytes::complete::tag, AsBytes, IResult, InputTake, Slice};
use rowan::{GreenNode, GreenToken, Language, NodeOrToken};
use std::iter::once;
use super::{input::Input, OrgLanguage, SyntaxKind, SyntaxKind::*};
@ -101,13 +98,7 @@ pub fn blank_lines(input: Input) -> IResult<Input, Vec<GreenElement>, ()> {
let mut start = 0;
let bytes = input.as_bytes();
for index in memchr2_iter(b'\r', b'\n', bytes)
.map(|i| i + 1)
.chain(once(bytes.len()))
{
if bytes.get(index - 1) == Some(&b'\r') && bytes.get(index) == Some(&b'\n') {
continue;
}
for index in line_ends_iter(input.as_str()) {
if start != index && bytes[start..index].iter().all(|b| b.is_ascii_whitespace()) {
lines.push(token(BLANK_LINE, &input.as_str()[start..index]));
start = index;
@ -116,7 +107,7 @@ pub fn blank_lines(input: Input) -> IResult<Input, Vec<GreenElement>, ()> {
}
}
Ok((input.take_split(start).0, lines))
Ok((input.slice(start..), lines))
}
#[test]
@ -159,21 +150,26 @@ fn test_blank_lines() {
/// Splits the first line off `input` and separates its trailing whitespace.
///
/// Returns 1. anything before trailing whitespace, 2. whitespace itself, 3. line feeding
/// (`\n`, `\r\n` or a lone `\r`, or `input.of("")` when no line ending is found).
pub fn trim_line_end(input: Input) -> IResult<Input, (Input, Input, Input), ()> {
let (input, line) = input.take_split(
memchr(b'\n', input.as_bytes())
.map(|i| i + 1)
.unwrap_or(input.input_len()),
);
let bytes = input.as_bytes();
let (ws_and_nl, contents) = line.take_split(
line.as_bytes()
.iter()
.rposition(|u| !u.is_ascii_whitespace())
.map(|i| i + 1)
.unwrap_or(0),
);
// Cut at the first `\r` or `\n`; a `\r` directly followed by `\n` is treated as a single
// two-byte line ending.
let (input, contents, nl) = match memchr2(b'\r', b'\n', bytes) {
Some(i) if bytes[i] == b'\r' && matches!(bytes.get(i + 1), Some(b'\n')) => (
input.slice(i + 2..),
input.slice(0..i),
input.slice(i..i + 2),
),
Some(i) => (
input.slice(i + 1..),
input.slice(0..i),
input.slice(i..i + 1),
),
// No `\r` or `\n` found: the whole input becomes the line contents.
_ => (input.of(""), input, input.of("")),
};
let (nl, ws) = space0(ws_and_nl)?;
// Split `contents` right after its last non-whitespace byte; when every byte is ASCII
// whitespace the whole of `contents` is returned as the whitespace part.
let (contents, ws) = match contents.bytes().rposition(|u| !u.is_ascii_whitespace()) {
Some(i) => (contents.slice(0..i + 1), contents.slice(i + 1..)),
None => (contents.of(""), contents),
};
Ok((input, (contents, ws, nl)))
}
@ -200,18 +196,72 @@ fn test_trim_line_end() {
assert_eq!(output.0.as_str(), "* hello, world :abc:");
assert_eq!(output.1.as_str(), " ");
assert_eq!(output.2.as_str(), "\r\n");
let (input, output) = trim_line_end((" \rr", config).into()).unwrap();
assert_eq!(input.as_str(), "r");
assert_eq!(output.0.as_str(), "");
assert_eq!(output.1.as_str(), " ");
assert_eq!(output.2.as_str(), "\r");
}
/// Matches a single line terminator (`\n`, `\r\n` or a lone `\r`) at the start
/// of `input`, or succeeds with an empty match when `input` is exhausted.
///
/// Fails with a recoverable `nom::Err::Error` when `input` begins with any
/// other byte.
pub fn eol_or_eof(input: Input) -> IResult<Input, Input, ()> {
    let mut bytes = input.bytes();
    // Tuple operands are evaluated left to right, so this looks at the first
    // two bytes of the input in order.
    let len = match (bytes.next(), bytes.next()) {
        (None, _) => 0,
        (Some(b'\r'), Some(b'\n')) => 2,
        (Some(b'\r'), _) | (Some(b'\n'), _) => 1,
        _ => return Err(nom::Err::Error(())),
    };
    Ok(input.take_split(len))
}
/// Iterator over the byte offsets that immediately follow each line ending
/// (`\n`, `\r\n` or a lone `\r`) of a string.
struct LineStart<'a> {
    /// Raw bytes of the string being scanned.
    bytes: &'a [u8],
    /// Yields the offset of every `\r` and `\n` byte in `bytes`.
    iter: Memchr2<'a>,
}

impl<'a> LineStart<'a> {
    /// Builds an iterator that scans `input` from its first byte.
    fn new(input: &'a str) -> Self {
        let bytes = input.as_bytes();
        let iter = memchr2_iter(b'\r', b'\n', bytes);
        LineStart { bytes, iter }
    }
}

impl<'a> Iterator for LineStart<'a> {
    type Item = usize;

    /// Returns the offset just past the next line ending, if any.
    fn next(&mut self) -> Option<Self::Item> {
        let pos = self.iter.next()?;
        let is_crlf = self.bytes[pos] == b'\r' && self.bytes.get(pos + 1) == Some(&b'\n');
        if !is_crlf {
            return Some(pos + 1);
        }
        // The `\n` half of `\r\n` is also reported by the underlying iterator;
        // consume it so the pair counts as one line ending.
        let lf = self.iter.next();
        debug_assert_eq!(pos + 1, lf.unwrap());
        Some(pos + 2)
    }
}
/// Returns an iterator of positions of line start, including zero
///
/// The first item is always `0`. With [`LineStart`], every later item is the offset just
/// past a `\n`, `\r\n` or lone `\r`, so a string ending in a line ending yields `s.len()`
/// as its last item.
pub fn line_starts_iter(s: &str) -> impl Iterator<Item = usize> + '_ {
once(0).chain(memchr_iter(b'\n', s.as_bytes()).map(|i| i + 1))
once(0).chain(LineStart::new(s))
}
/// Returns an iterator of positions of line end, including eof
///
/// Each item is the offset just past a line ending, followed by a final `s.len()`.
/// When `s` itself ends with a line ending, `s.len()` is therefore yielded twice.
pub fn line_ends_iter(s: &str) -> impl Iterator<Item = usize> + '_ {
memchr_iter(b'\n', s.as_bytes())
.map(|i| i + 1)
.chain(once(s.len()))
LineStart::new(s).chain(once(s.len()))
}
pub struct NodeBuilder {
@ -233,7 +283,7 @@ impl NodeBuilder {
pub fn nl(&mut self, i: Input) {
if !i.is_empty() {
debug_assert!(
i.s == "\n" || i.s == "\r\n",
i.s == "\n" || i.s == "\r\n" || i.s == "\r",
"{:?} should be a new line",
i.s
);

View file

@ -1,4 +1,4 @@
use nom::{IResult, InputTake};
use nom::{AsBytes, IResult, InputTake};
use super::{
combinator::{blank_lines, line_ends_iter, node, GreenElement},
@ -9,10 +9,13 @@ use super::{
fn comment_node_base(input: Input) -> IResult<Input, GreenElement, ()> {
let mut start = 0;
for i in line_ends_iter(input.as_str()) {
let line = &input.as_str()[start..i];
let trimmed = line.trim_start();
let mut iter = input.as_bytes()[start..]
.iter()
.skip_while(|&&b| b == b' ' || b == b'\t');
if trimmed == "#" || trimmed == "#\n" || trimmed == "#\r\n" || trimmed.starts_with("# ") {
if matches!(iter.next(), Some(b'#'))
&& matches!(iter.next(), None | Some(b'\n') | Some(b'\r') | Some(b' '))
{
start = i;
} else {
break;

View file

@ -1,16 +1,15 @@
use nom::{
branch::alt,
bytes::complete::{tag_no_case, take_while1},
character::complete::{line_ending, space0, space1},
combinator::{eof, iterator, map, opt},
character::complete::{space0, space1},
combinator::{iterator, map, opt},
sequence::tuple,
IResult, InputTake,
};
use super::{
combinator::{
blank_lines, colon_token, line_starts_iter, node, plus_token, trim_line_end, GreenElement,
NodeBuilder,
blank_lines, colon_token, eol_or_eof, line_starts_iter, node, plus_token, trim_line_end,
GreenElement, NodeBuilder,
},
input::Input,
SyntaxKind::*,
@ -25,7 +24,7 @@ fn drawer_begin_node(input: Input) -> IResult<Input, (GreenElement, &str), ()> {
take_while1(|c: char| c.is_ascii_alphabetic() || c == '-' || c == '_'),
colon_token,
space0,
alt((line_ending, eof)),
eol_or_eof,
))(input)?;
b.ws(ws);
@ -45,7 +44,7 @@ fn drawer_end_node(input: Input) -> IResult<Input, GreenElement, ()> {
tag_no_case("END"),
colon_token,
space0,
alt((line_ending, eof)),
eol_or_eof,
))(input)?;
let mut b = NodeBuilder::new();

View file

@ -1,14 +1,14 @@
use nom::{
branch::alt,
bytes::complete::tag_no_case,
character::complete::{alpha1, line_ending, space0, space1},
combinator::eof,
character::complete::{alpha1, space0, space1},
sequence::tuple,
IResult, InputTake,
};
use super::{
combinator::{blank_lines, line_starts_iter, node, trim_line_end, GreenElement, NodeBuilder},
combinator::{
blank_lines, eol_or_eof, line_starts_iter, node, trim_line_end, GreenElement, NodeBuilder,
},
input::Input,
SyntaxKind::*,
};
@ -55,12 +55,8 @@ fn dyn_block_begin_node(input: Input) -> IResult<Input, GreenElement, ()> {
}
fn dyn_block_end_node(input: Input) -> IResult<Input, GreenElement, ()> {
let (input, (ws, end, ws_, nl)) = tuple((
space0,
tag_no_case("#+END:"),
space0,
alt((line_ending, eof)),
))(input)?;
let (input, (ws, end, ws_, nl)) =
tuple((space0, tag_no_case("#+END:"), space0, eol_or_eof))(input)?;
let mut b = NodeBuilder::new();
b.ws(ws);

View file

@ -1,4 +1,4 @@
use nom::{IResult, InputTake};
use nom::{AsBytes, IResult, InputTake};
use super::{
combinator::{blank_lines, line_ends_iter, node, GreenElement},
@ -9,10 +9,13 @@ use super::{
fn fixed_width_node_base(input: Input) -> IResult<Input, GreenElement, ()> {
let mut start = 0;
for i in line_ends_iter(input.as_str()) {
let line = &input.s[start..i];
let trimmed = line.trim_start();
let mut iter = input.as_bytes()[start..]
.iter()
.skip_while(|&&b| b == b' ' || b == b'\t');
if trimmed == ":" || trimmed == ":\n" || trimmed == ":\r\n" || trimmed.starts_with(": ") {
if matches!(iter.next(), Some(b':'))
&& matches!(iter.next(), None | Some(b'\n') | Some(b'\r') | Some(b' '))
{
start = i;
} else {
break;

View file

@ -1,8 +1,6 @@
use nom::{
branch::alt,
bytes::complete::{tag, take_while1},
character::complete::{line_ending, space0},
combinator::eof,
character::complete::space0,
sequence::tuple,
IResult, InputTake,
};
@ -10,7 +8,7 @@ use nom::{
use crate::SyntaxKind;
use super::{
combinator::{l_curly_token, line_starts_iter, node, r_curly_token, GreenElement},
combinator::{eol_or_eof, l_curly_token, line_starts_iter, node, r_curly_token, GreenElement},
input::Input,
};
@ -36,7 +34,7 @@ fn latex_environment_node_base(input: Input) -> IResult<Input, GreenElement, ()>
tag(name1.s),
r_curly_token,
space0,
alt((line_ending, eof)),
eol_or_eof,
))(input)
{
return Ok((

View file

@ -1,13 +1,7 @@
use nom::{
branch::alt,
character::complete::{line_ending, space0},
combinator::{eof, map},
sequence::tuple,
IResult,
};
use nom::{character::complete::space0, combinator::map, sequence::tuple, IResult};
use crate::{
syntax::combinator::{backslash_token, node},
syntax::combinator::{backslash_token, eol_or_eof, node},
SyntaxKind,
};
@ -16,12 +10,7 @@ use super::{combinator::GreenElement, input::Input};
pub fn line_break_node(input: Input) -> IResult<Input, GreenElement, ()> {
debug_assert!(input.s.starts_with('\\'));
let mut parser = map(
tuple((
backslash_token,
backslash_token,
space0,
alt((line_ending, eof)),
)),
tuple((backslash_token, backslash_token, space0, eol_or_eof)),
|(b1, b2, ws, nl)| {
node(
SyntaxKind::LINE_BREAK,

View file

@ -1,14 +1,10 @@
use nom::{
branch::alt,
bytes::complete::tag,
character::complete::{line_ending, space0},
combinator::{eof, iterator},
sequence::tuple,
IResult,
branch::alt, bytes::complete::tag, character::complete::space0, combinator::iterator,
sequence::tuple, IResult,
};
use super::{
combinator::{GreenElement, NodeBuilder},
combinator::{eol_or_eof, GreenElement, NodeBuilder},
input::Input,
timestamp::{timestamp_active_node, timestamp_inactive_node},
SyntaxKind::*,
@ -54,7 +50,7 @@ fn planning_node_base(input: Input) -> IResult<Input, GreenElement, ()> {
let (input, _) = it.finish()?;
let (input, ws) = space0(input)?;
let (input, nl) = alt((line_ending, eof))(input)?;
let (input, nl) = eol_or_eof(input)?;
b.ws(ws);
b.nl(nl);

View file

@ -1,14 +1,10 @@
use nom::{
branch::alt,
bytes::complete::take_while_m_n,
character::complete::{line_ending, space0},
combinator::{eof, map},
sequence::tuple,
bytes::complete::take_while_m_n, character::complete::space0, combinator::map, sequence::tuple,
IResult,
};
use super::{
combinator::{blank_lines, GreenElement, NodeBuilder},
combinator::{blank_lines, eol_or_eof, GreenElement, NodeBuilder},
input::Input,
SyntaxKind::*,
};
@ -19,7 +15,7 @@ pub fn rule_node(input: Input) -> IResult<Input, GreenElement, ()> {
space0,
take_while_m_n(5, usize::max_value(), |c| c == '-'),
space0,
alt((line_ending, eof)),
eol_or_eof,
blank_lines,
)),
|(ws, dashes, ws_, nl, post_blank)| {

View file

@ -19,7 +19,7 @@ fn org_table_node_base(input: Input) -> IResult<Input, GreenElement, ()> {
let mut start = 0;
for i in line_ends_iter(input.as_str()) {
let line = input.slice(start..i);
let trimmed = line.as_str().trim_start();
let trimmed = line.as_str().trim_start_matches([' ', '\t']);
// Org tables end at the first line not starting with a vertical bar.
if !trimmed.starts_with('|') {
@ -81,7 +81,8 @@ fn table_standard_row_node(input: Input) -> Result<GreenElement, nom::Err<()>> {
}
}
});
it.finish()?;
let (input, _) = it.finish()?;
debug_assert!(input.is_empty());
Ok(b.finish(ORG_TABLE_STANDARD_ROW))
}