From 4a3dd6aacb7b67f3d54d1fc2b64336d87e4938f2 Mon Sep 17 00:00:00 2001
From: PoiScript
Date: Sat, 18 Nov 2023 23:57:45 +0800
Subject: [PATCH] feat: support latex fragment parsing

---
 src/ast/generate.js          |  12 ++-
 src/ast/generated.rs         |  50 +++++++++
 src/syntax/combinator.rs     |   3 +
 src/syntax/latex_fragment.rs | 196 +++++++++++++++++++++++++++++++++++
 src/syntax/mod.rs            |   7 +-
 src/syntax/object.rs         |   7 +-
 6 files changed, 269 insertions(+), 6 deletions(-)
 create mode 100644 src/syntax/latex_fragment.rs

diff --git a/src/ast/generate.js b/src/ast/generate.js
index bd9ff54..5a3c96e 100644
--- a/src/ast/generate.js
+++ b/src/ast/generate.js
@@ -195,10 +195,6 @@ const nodes = [
     struct: "FnRef",
     kind: ["FN_REF"],
   },
-  {
-    struct: "LatexEnvironment",
-    kind: ["LATEX_ENVIRONMENT"],
-  },
   {
     struct: "Macros",
     kind: ["MACROS"],
@@ -259,6 +255,14 @@ const nodes = [
       ["minute_end", "TIMESTAMP_MINUTE"],
     ],
   },
+  {
+    struct: "LatexEnvironment",
+    kind: ["LATEX_ENVIRONMENT"],
+  },
+  {
+    struct: "LatexFragment",
+    kind: ["LATEX_FRAGMENT"],
+  },
 ];
 
 let content = `//! generated file, do not modify it directly
diff --git a/src/ast/generated.rs b/src/ast/generated.rs
index 68c796d..b435c3e 100644
--- a/src/ast/generated.rs
+++ b/src/ast/generated.rs
@@ -1650,3 +1650,53 @@ impl Timestamp {
         super::last_token(&self.syntax, TIMESTAMP_MINUTE)
     }
 }
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct LatexEnvironment {
+    pub(crate) syntax: SyntaxNode,
+}
+impl AstNode for LatexEnvironment {
+    type Language = OrgLanguage;
+    fn can_cast(kind: SyntaxKind) -> bool {
+        kind == LATEX_ENVIRONMENT
+    }
+    fn cast(node: SyntaxNode) -> Option<Self> {
+        Self::can_cast(node.kind()).then(|| LatexEnvironment { syntax: node })
+    }
+    fn syntax(&self) -> &SyntaxNode {
+        &self.syntax
+    }
+}
+impl LatexEnvironment {
+    pub fn begin(&self) -> u32 {
+        self.syntax.text_range().start().into()
+    }
+    pub fn end(&self) -> u32 {
+        self.syntax.text_range().end().into()
+    }
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct LatexFragment {
+    pub(crate) syntax: SyntaxNode,
+}
+impl AstNode for LatexFragment {
+    type Language = OrgLanguage;
+    fn can_cast(kind: SyntaxKind) -> bool {
+        kind == LATEX_FRAGMENT
+    }
+    fn cast(node: SyntaxNode) -> Option<Self> {
+        Self::can_cast(node.kind()).then(|| LatexFragment { syntax: node })
+    }
+    fn syntax(&self) -> &SyntaxNode {
+        &self.syntax
+    }
+}
+impl LatexFragment {
+    pub fn begin(&self) -> u32 {
+        self.syntax.text_range().start().into()
+    }
+    pub fn end(&self) -> u32 {
+        self.syntax.text_range().end().into()
+    }
+}
diff --git a/src/syntax/combinator.rs b/src/syntax/combinator.rs
index a3a9b38..5e834ad 100644
--- a/src/syntax/combinator.rs
+++ b/src/syntax/combinator.rs
@@ -58,6 +58,7 @@ token_parser!(minus2_token, "--", MINUS2);
 // token_parser!(percent_token, "%", PERCENT);
 token_parser!(percent2_token, "%%", PERCENT2);
 // token_parser!(slash_token, "/", SLASH);
+token_parser!(backslash_token, "\\", BACKSLASH);
 // token_parser!(underscore_token, "_", UNDERSCORE);
 // token_parser!(star_token, "*", STAR);
 token_parser!(plus_token, "+", PLUS);
@@ -65,6 +66,8 @@ token_parser!(minus_token, "-", MINUS);
 token_parser!(colon_token, ":", COLON);
 token_parser!(colon2_token, "::", COLON2);
 token_parser!(pipe_token, "|", PIPE);
+token_parser!(dollar_token, "$", DOLLAR);
+token_parser!(dollar2_token, "$$", DOLLAR2);
 // token_parser!(equal_token, "=", EQUAL);
 // token_parser!(tilde_token, "~", TILDE);
 token_parser!(hash_plus_token, "#+", HASH_PLUS);
diff --git a/src/syntax/latex_fragment.rs b/src/syntax/latex_fragment.rs
new file mode 100644
index 0000000..1c32732
--- /dev/null
+++ b/src/syntax/latex_fragment.rs
@@ -0,0 +1,196 @@
+use nom::{
+    branch::alt,
+    bytes::complete::{take_until1, take_while1},
+    character::complete::alpha1,
+    sequence::tuple,
+    AsBytes, IResult, InputTake,
+};
+
+use crate::SyntaxKind;
+
+use super::{
+    combinator::{
+        backslash_token, dollar2_token, dollar_token, l_bracket_token, l_curly_token,
+        l_parens_token, node, r_bracket_token, r_curly_token, r_parens_token, GreenElement,
+    },
+    input::Input,
+};
+
+#[tracing::instrument(level = "debug", skip(input), fields(input = input.s))]
+pub fn latex_fragment_node(input: Input) -> IResult<Input, GreenElement, ()> {
+    debug_assert!(input.s.starts_with(['\\', '$']));
+    let mut parser = alt((template1, template2, template3, template4, template5));
+    crate::lossless_parser!(parser, input)
+}
+
+// \NAME[CONTENTS1] \NAME{CONTENTS1}
+fn template1(input: Input) -> IResult<Input, GreenElement, ()> {
+    let (input, (backslash, name)) = tuple((backslash_token, alpha1))(input)?;
+    let (input, (l, content, r)) = alt((
+        tuple((
+            l_bracket_token,
+            take_while1(|c| c != '{' && c != '}' && c != '[' && c != ']' && c != '\r' && c != '\n'),
+            r_bracket_token,
+        )),
+        tuple((
+            l_curly_token,
+            take_while1(|c| c != '{' && c != '}' && c != '\r' && c != '\n'),
+            r_curly_token,
+        )),
+    ))(input)?;
+    Ok((
+        input,
+        node(
+            SyntaxKind::LATEX_FRAGMENT,
+            [backslash, name.text_token(), l, content.text_token(), r],
+        ),
+    ))
+}
+
+// \(CONTENTS\)
+fn template2(input: Input) -> IResult<Input, GreenElement, ()> {
+    let (input, (backslash1, l)) = tuple((backslash_token, l_parens_token))(input)?;
+    if let Some(i) = jetscii::Substring::new("\\)").find(input.s) {
+        let (input, content) = input.take_split(i);
+        let (input, (backslash2, r)) = tuple((backslash_token, r_parens_token))(input)?;
+        Ok((
+            input,
+            node(
+                SyntaxKind::LATEX_FRAGMENT,
+                [backslash1, l, content.text_token(), backslash2, r],
+            ),
+        ))
+    } else {
+        Err(nom::Err::Error(()))
+    }
+}
+
+// \[CONTENTS\]
+fn template3(input: Input) -> IResult<Input, GreenElement, ()> {
+    let (input, (backslash1, l)) = tuple((backslash_token, l_bracket_token))(input)?;
+    if let Some(i) = jetscii::Substring::new("\\]").find(input.s) {
+        let (input, content) = input.take_split(i);
+        let (input, (backslash2, r)) = tuple((backslash_token, r_bracket_token))(input)?;
+        Ok((
+            input,
+            node(
+                SyntaxKind::LATEX_FRAGMENT,
+                [backslash1, l, content.text_token(), backslash2, r],
+            ),
+        ))
+    } else {
+        Err(nom::Err::Error(()))
+    }
+}
+
+// $$CONTENTS$$
+fn template4(input: Input) -> IResult<Input, GreenElement, ()> {
+    let (input, l) = dollar2_token(input)?;
+    let (input, content) = take_until1("$$")(input)?;
+    let (input, r) = dollar2_token(input)?;
+    Ok((
+        input,
+        node(SyntaxKind::LATEX_FRAGMENT, [l, content.text_token(), r]),
+    ))
+}
+
+// $CONTENTS$
+fn template5(input: Input) -> IResult<Input, GreenElement, ()> {
+    let (input, l) = dollar_token(input)?;
+    let (input, content) = take_until1("$")(input)?;
+    let (input, r) = dollar_token(input)?;
+
+    let b = content.as_bytes()[0];
+    if matches!(b, b'\r' | b'\n' | b' ' | b'\t' | b'.' | b',' | b';' | b'$') {
+        return Err(nom::Err::Error(()));
+    }
+
+    let b = content.as_bytes()[content.s.len() - 1];
+    if matches!(b, b'\r' | b'\n' | b' ' | b'\t' | b'.' | b',' | b'$') {
+        return Err(nom::Err::Error(()));
+    }
+
+    let p = input.bytes().next();
+    if let Some(p) = p {
+        if !matches!(p, b')' | b'}' | b']' | b'\'' | b'"' | b' ' | b'\r' | b'\n') {
+            return Err(nom::Err::Error(()));
+        }
+    }
+
+    Ok((
+        input,
+        node(SyntaxKind::LATEX_FRAGMENT, [l, content.text_token(), r]),
+    ))
+}
+
+#[test]
+fn parse() {
+    use crate::{ast::LatexFragment, tests::to_ast, ParseConfig};
+
+    let to_fragment = to_ast::<LatexFragment>(latex_fragment_node);
+
+    insta::assert_debug_snapshot!(
+        to_fragment("\\enlargethispage{2\\baselineskip}").syntax,
+        @r###"
+    LATEX_FRAGMENT@0..32
+      BACKSLASH@0..1 "\\"
+      TEXT@1..16 "enlargethispage"
+      L_CURLY@16..17 "{"
+      TEXT@17..31 "2\\baselineskip"
+      R_CURLY@31..32 "}"
+    "###
+    );
+
+    insta::assert_debug_snapshot!(
+        to_fragment("\\[a\\]").syntax,
+        @r###"
+    LATEX_FRAGMENT@0..5
+      BACKSLASH@0..1 "\\"
+      L_BRACKET@1..2 "["
+      TEXT@2..3 "a"
+      BACKSLASH@3..4 "\\"
+      R_BRACKET@4..5 "]"
+    "###
+    );
+
+    insta::assert_debug_snapshot!(
+        to_fragment("\\(e^{i \\pi}\\)").syntax,
+        @r###"
+    LATEX_FRAGMENT@0..13
+      BACKSLASH@0..1 "\\"
+      L_PARENS@1..2 "("
+      TEXT@2..11 "e^{i \\pi}"
+      BACKSLASH@11..12 "\\"
+      R_PARENS@12..13 ")"
+    "###
+    );
+
+    insta::assert_debug_snapshot!(
+        to_fragment("$\\frac{1}{3}$").syntax,
+        @r###"
+    LATEX_FRAGMENT@0..13
+      DOLLAR@0..1 "$"
+      TEXT@1..12 "\\frac{1}{3}"
+      DOLLAR@12..13 "$"
+    "###
+    );
+
+    insta::assert_debug_snapshot!(
+        to_fragment("$a\nb$").syntax,
+        @r###"
+    LATEX_FRAGMENT@0..5
+      DOLLAR@0..1 "$"
+      TEXT@1..4 "a\nb"
+      DOLLAR@4..5 "$"
+    "###
+    );
+
+    let c = ParseConfig::default();
+
+    assert!(latex_fragment_node(("$ LaTeXxxx$", &c).into()).is_err());
+    assert!(latex_fragment_node(("$LaTeXxxx $", &c).into()).is_err());
+    assert!(latex_fragment_node(("$a.$", &c).into()).is_err());
+    assert!(latex_fragment_node(("$a$a", &c).into()).is_err());
+    assert!(latex_fragment_node(("$$b\nol\nd*", &c).into()).is_err());
+    assert!(latex_fragment_node(("$b\nol\nd*", &c).into()).is_err());
+}
diff --git a/src/syntax/mod.rs b/src/syntax/mod.rs
index e6625c4..7df9a45 100644
--- a/src/syntax/mod.rs
+++ b/src/syntax/mod.rs
@@ -18,6 +18,7 @@ pub mod inline_call;
 pub mod inline_src;
 pub mod input;
 pub mod keyword;
+pub mod latex_fragment;
 pub mod link;
 pub mod list;
 pub mod macros;
@@ -85,6 +86,9 @@ pub enum SyntaxKind {
     PERCENT, // '%'
     PERCENT2, // '%%'
     SLASH, // '/'
+    BACKSLASH, // '\'
+    DOLLAR, // '$'
+    DOLLAR2, // '$$'
     UNDERSCORE, // '_'
     STAR, // '*'
     PLUS, // '+'
@@ -168,6 +172,7 @@ pub enum SyntaxKind {
     BLOCK_BEGIN,
     BLOCK_END,
     BLOCK_CONTENT,
+    LATEX_ENVIRONMENT,
 
     //
     // objects
@@ -179,7 +184,7 @@ pub enum SyntaxKind {
     COOKIE,
     RADIO_TARGET,
     FN_REF,
-    LATEX_ENVIRONMENT,
+    LATEX_FRAGMENT,
     MACROS,
     MACROS_ARGUMENT,
     SNIPPET,
diff --git a/src/syntax/object.rs b/src/syntax/object.rs
index 50074e1..c21d1b3 100644
--- a/src/syntax/object.rs
+++ b/src/syntax/object.rs
@@ -8,6 +8,7 @@ use super::{
     inline_call::inline_call_node,
     inline_src::inline_src_node,
     input::Input,
+    latex_fragment::latex_fragment_node,
     link::link_node,
     macros::macros_node,
     radio_target::radio_target_node,
@@ -29,7 +30,9 @@ impl ObjectPositions<'_> {
             input,
             pos: 0,
             next: Some(0),
-            finder: jetscii::bytes!(b'@', b'<', b'[', b' ', b'(', b'{', b'\'', b'"', b'\n'),
+            finder: jetscii::bytes!(
+                b'@', b'<', b'[', b' ', b'(', b'{', b'\'', b'"', b'\n', b'\\', b'$'
+            ),
         }
     }
 }
@@ -142,6 +145,8 @@ fn object_node(i: Input) -> IResult<Input, GreenElement, ()> {
             .or_else(|_| timestamp_inactive_node(i)),
         b'c' => inline_call_node(i),
         b's' => inline_src_node(i),
+        b'$' => latex_fragment_node(i),
+        b'\\' => latex_fragment_node(i),
         _ => Err(nom::Err::Error(())),
     }
 }