diff --git a/examples/html-slugify.rs b/examples/html-slugify.rs index ecf8b5c..f930830 100644 --- a/examples/html-slugify.rs +++ b/examples/html-slugify.rs @@ -50,7 +50,7 @@ impl Traverser for MyHtmlHandler { special_block quote_block center_block verse_block comment_block example_block export_block source_block babel_call clock cookie radio_target drawer dyn_block fn_def fn_ref macros snippet timestamp target fixed_width org_table org_table_row org_table_cell latex_fragment - latex_environment entity line_break + latex_environment entity line_break superscript subscript } } diff --git a/src/ast/generate.js b/src/ast/generate.js index b81e51c..d083f8f 100644 --- a/src/ast/generate.js +++ b/src/ast/generate.js @@ -264,6 +264,14 @@ const nodes = [ struct: "LineBreak", kind: ["LINE_BREAK"], }, + { + struct: "Superscript", + kind: ["SUPERSCRIPT"], + }, + { + struct: "Subscript", + kind: ["SUBSCRIPT"], + }, ]; let content = `//! generated file, do not modify it directly diff --git a/src/ast/generated.rs b/src/ast/generated.rs index 344903e..ccdfa77 100644 --- a/src/ast/generated.rs +++ b/src/ast/generated.rs @@ -1716,3 +1716,53 @@ impl LineBreak { self.syntax.text_range().end().into() } } + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct Superscript { + pub(crate) syntax: SyntaxNode, +} +impl AstNode for Superscript { + type Language = OrgLanguage; + fn can_cast(kind: SyntaxKind) -> bool { + kind == SUPERSCRIPT + } + fn cast(node: SyntaxNode) -> Option { + Self::can_cast(node.kind()).then(|| Superscript { syntax: node }) + } + fn syntax(&self) -> &SyntaxNode { + &self.syntax + } +} +impl Superscript { + pub fn begin(&self) -> u32 { + self.syntax.text_range().start().into() + } + pub fn end(&self) -> u32 { + self.syntax.text_range().end().into() + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct Subscript { + pub(crate) syntax: SyntaxNode, +} +impl AstNode for Subscript { + type Language = OrgLanguage; + fn can_cast(kind: SyntaxKind) -> bool { 
+ kind == SUBSCRIPT + } + fn cast(node: SyntaxNode) -> Option<Self> { + Self::can_cast(node.kind()).then(|| Subscript { syntax: node }) + } + fn syntax(&self) -> &SyntaxNode { + &self.syntax + } +} +impl Subscript { + pub fn begin(&self) -> u32 { + self.syntax.text_range().start().into() + } + pub fn end(&self) -> u32 { + self.syntax.text_range().end().into() + } +} diff --git a/src/export/forward.rs b/src/export/forward.rs index 268f962..dd3f7ab 100644 --- a/src/export/forward.rs +++ b/src/export/forward.rs @@ -49,7 +49,7 @@ /// special_block quote_block center_block verse_block comment_block example_block export_block /// source_block babel_call clock cookie radio_target drawer dyn_block fn_def fn_ref macros /// snippet timestamp target fixed_width org_table org_table_row org_table_cell latex_fragment -/// latex_environment entity line_break +/// latex_environment entity line_break superscript subscript /// } /// } /// @@ -203,6 +203,12 @@ macro_rules! forward_handler { (@method $handler:ty, line_break) => { forward_handler!(@method $handler, line_break, WalkEvent<&$crate::ast::LineBreak>); }; + (@method $handler:ty, superscript) => { + forward_handler!(@method $handler, superscript, WalkEvent<&$crate::ast::Superscript>); + }; + (@method $handler:ty, subscript) => { + forward_handler!(@method $handler, subscript, WalkEvent<&$crate::ast::Subscript>); + }; (@method $handler:ty, $x:ident) => { std::compile_error!(std::concat!(std::stringify!($x), " is not a method")); }; diff --git a/src/export/html.rs b/src/export/html.rs index b422b02..3970819 100644 --- a/src/export/html.rs +++ b/src/export/html.rs @@ -518,4 +518,20 @@ impl Traverser for HtmlExport { ctx.skip(); } } + + #[tracing::instrument(skip(self, _ctx))] + fn subscript(&mut self, event: WalkEvent<&Subscript>, _ctx: &mut TraversalContext) { + match event { + WalkEvent::Enter(_) => self.output += "<sub>", + WalkEvent::Leave(_) => self.output += "</sub>", + } + } + + #[tracing::instrument(skip(self, _ctx))] + fn 
superscript(&mut self, event: WalkEvent<&Superscript>, _ctx: &mut TraversalContext) { + match event { + WalkEvent::Enter(_) => self.output += "<sup>", + WalkEvent::Leave(_) => self.output += "</sup>", + } + } } diff --git a/src/export/traverse.rs b/src/export/traverse.rs index be2efd0..2413751 100644 --- a/src/export/traverse.rs +++ b/src/export/traverse.rs @@ -141,6 +141,8 @@ pub trait Traverser { LATEX_ENVIRONMENT => traverse!(LatexEnvironment, latex_environment), ENTITY => traverse!(Entity, entity), LINE_BREAK => traverse!(LineBreak, line_break), + SUPERSCRIPT => traverse!(Superscript, superscript), + SUBSCRIPT => traverse!(Subscript, subscript), BLOCK_CONTENT | LIST_ITEM_CONTENT => traverse_children!(node), @@ -252,4 +254,8 @@ pub trait Traverser { fn entity(&mut self, event: WalkEvent<&Entity>, ctx: &mut TraversalContext); /// Called when entering or leaving `LineBreak` node fn line_break(&mut self, event: WalkEvent<&LineBreak>, ctx: &mut TraversalContext); + /// Called when entering or leaving `Superscript` node + fn superscript(&mut self, event: WalkEvent<&Superscript>, ctx: &mut TraversalContext); + /// Called when entering or leaving `Subscript` node + fn subscript(&mut self, event: WalkEvent<&Subscript>, ctx: &mut TraversalContext); } diff --git a/src/syntax/combinator.rs b/src/syntax/combinator.rs index 5e834ad..89552e7 100644 --- a/src/syntax/combinator.rs +++ b/src/syntax/combinator.rs @@ -59,7 +59,7 @@ token_parser!(minus2_token, "--", MINUS2); token_parser!(percent2_token, "%%", PERCENT2); // token_parser!(slash_token, "/", SLASH); token_parser!(backslash_token, "\\", BACKSLASH); -// token_parser!(underscore_token, "_", UNDERSCORE); +token_parser!(underscore_token, "_", UNDERSCORE); // token_parser!(star_token, "*", STAR); token_parser!(plus_token, "+", PLUS); token_parser!(minus_token, "-", MINUS); @@ -71,6 +71,7 @@ token_parser!(dollar2_token, "$$", DOLLAR2); // token_parser!(equal_token, "=", EQUAL); // token_parser!(tilde_token, "~", TILDE); 
token_parser!(hash_plus_token, "#+", HASH_PLUS); +token_parser!(caret_token, "^", CARET); token_parser!(hash_token, "#", HASH); token_parser!(double_arrow_token, "=>", DOUBLE_ARROW); diff --git a/src/syntax/emphasis.rs b/src/syntax/emphasis.rs index 3c680e3..24c2c92 100644 --- a/src/syntax/emphasis.rs +++ b/src/syntax/emphasis.rs @@ -112,6 +112,16 @@ fn validate_marker(pos: usize, text: Input) -> bool { } } +pub fn verify_pre(input: &str) -> bool { + if input.is_empty() { + return true; + } + matches!( + input.as_bytes()[input.len() - 1], + b'\t' | b' ' | b'-' | b'(' | b'{' | b'\\' | b'"' | b'\r' | b'\n' + ) +} + #[test] fn parse() { use crate::{ast::Bold, tests::to_ast, ParseConfig}; diff --git a/src/syntax/mod.rs b/src/syntax/mod.rs index 5a60467..24dcf84 100644 --- a/src/syntax/mod.rs +++ b/src/syntax/mod.rs @@ -31,6 +31,7 @@ pub mod planning; pub mod radio_target; pub mod rule; pub mod snippet; +pub mod subscript_superscript; pub mod table; pub mod target; pub mod timestamp; @@ -106,6 +107,7 @@ pub enum SyntaxKind { DOUBLE_ARROW, // '=>' PIPE, // '|' COMMA, // ',' + CARET, // '^' NEW_LINE, // '\n' or '\r\n' or '\r' WHITESPACE, // ' ' or '\t' BLANK_LINE, @@ -200,6 +202,8 @@ pub enum SyntaxKind { VERBATIM, CODE, ENTITY, + SUPERSCRIPT, + SUBSCRIPT, /* timestamp */ TIMESTAMP_ACTIVE, @@ -241,6 +245,8 @@ impl SyntaxKind { | SyntaxKind::MACROS | SyntaxKind::RADIO_TARGET | SyntaxKind::COOKIE + | SyntaxKind::SUPERSCRIPT + | SyntaxKind::SUBSCRIPT | SyntaxKind::ORG_TABLE_CELL | SyntaxKind::TIMESTAMP_ACTIVE | SyntaxKind::TIMESTAMP_INACTIVE diff --git a/src/syntax/object.rs b/src/syntax/object.rs index 071b42f..219ffbf 100644 --- a/src/syntax/object.rs +++ b/src/syntax/object.rs @@ -3,7 +3,9 @@ use nom::{AsBytes, IResult, InputLength, InputTake}; use super::{ combinator::GreenElement, cookie::cookie_node, - emphasis::{bold_node, code_node, italic_node, strike_node, underline_node, verbatim_node}, + emphasis::{ + self, bold_node, code_node, italic_node, strike_node, 
underline_node, verbatim_node, + }, entity::entity_node, fn_ref::fn_ref_node, inline_call::inline_call_node, @@ -15,6 +17,7 @@ use super::{ macros::macros_node, radio_target::radio_target_node, snippet::snippet_node, + subscript_superscript::{self, subscript_node, superscript_node}, target::target_node, timestamp::{timestamp_active_node, timestamp_diary_node, timestamp_inactive_node}, }; @@ -22,7 +25,6 @@ use super::{ struct ObjectPositions<'a> { input: Input<'a>, pos: usize, - next: Option, finder: jetscii::BytesConst, } @@ -31,10 +33,17 @@ impl ObjectPositions<'_> { ObjectPositions { input, pos: 0, - next: Some(0), finder: jetscii::bytes!( - b' ', b'(', b'{', b'\'', b'"', b'\n', /* */ - b'\\', b'$', b'@', b'<', b'[' + b'*', b'+', b'/', b'_', b'=', b'~', /* text markup */ + b'@', /* snippet */ + b'<', /* timestamp, target, radio target */ + b'[', /* link, cookie, fn_ref, timestamp */ + b'c', /* inline call */ + b's', /* inline source */ + b'\\', b'$', /* latex & entity */ + b'{', /* macros */ + b'^', /* superscript */ + b'_' /* subscript */ ), } } @@ -43,10 +52,11 @@ impl ObjectPositions<'_> { ObjectPositions { input, pos: 0, - next: Some(0), finder: jetscii::bytes!( - b' ', b'(', b'{', b'\'', b'"', b'\n', /* */ - b'\\', b'$' + b'*', b'+', b'/', b'_', b'=', b'~', /* text markup */ + b'\\', b'$', /* latex & entity */ + b'^', /* superscript */ + b'_' /* subscript */ ), } } @@ -60,25 +70,12 @@ impl<'a> Iterator for ObjectPositions<'a> { return None; } - if let Some(p) = self.next.take() { - return Some(self.input.take_split(p)); - } - let bytes = &self.input.as_bytes()[self.pos..]; let previous = self.pos; let i = self.finder.find(bytes)?; self.pos += i + 1; - let p = match bytes[i] { - b'{' => { - if self.input.s.len() - self.pos > 2 { - self.next = Some(self.pos); - } - self.pos - 1 - } - b' ' | b'(' | b'\'' | b'"' | b'\n' => self.pos, - _ => self.pos - 1, - }; + let p = self.pos - 1; debug_assert!( previous < self.pos && self.pos <= self.input.s.len(), @@ -112,10 
+109,10 @@ impl<'a> Iterator for ObjectPositions<'a> { /// - Timestamps /// - Text Markup (bold code strike verbatim underline italic) /// - Line Breaks +/// - Subscript and Superscript /// /// // todo: /// - Citations -/// - Subscript and Superscript pub fn object_nodes(input: Input) -> Vec { // TODO: // debug_assert!(!input.is_empty()); @@ -125,11 +122,11 @@ pub fn object_nodes(input: Input) -> Vec { 'l: while !i.is_empty() { for (input, head) in ObjectPositions::standard(i) { - if let Ok((input, node)) = standard_object_node(input) { + if let Ok((input, pre)) = standard_object_node(input, head) { if !head.is_empty() { nodes.push(head.text_token()) } - nodes.push(node); + nodes.push(pre); debug_assert!( input.input_len() < i.input_len(), "{} < {}", @@ -157,8 +154,6 @@ pub fn object_nodes(input: Input) -> Vec { /// - LaTeX fragments ('\\') /// - Text markup (bold code strike verbatim underline italic) ('*', '~', '+', '=', '_', '/') /// - Entities ('\\') -/// -/// // todo: /// - Superscripts and Subscripts pub fn minimal_object_nodes(input: Input) -> Vec { let mut i = input; @@ -166,11 +161,11 @@ pub fn minimal_object_nodes(input: Input) -> Vec { 'l: while !i.is_empty() { for (input, head) in ObjectPositions::minimal(i) { - if let Ok((input, node)) = minimal_object_node(input) { + if let Ok((input, pre)) = minimal_object_node(input, head) { if !head.is_empty() { nodes.push(head.text_token()) } - nodes.push(node); + nodes.push(pre); debug_assert!( input.input_len() < i.input_len(), "{} < {}", @@ -195,7 +190,7 @@ pub fn minimal_object_nodes(input: Input) -> Vec { } /// parse an object from standard sets -fn standard_object_node(i: Input) -> IResult { +fn standard_object_node<'a>(i: Input<'a>, pre: Input<'a>) -> IResult, GreenElement, ()> { debug_assert!( i.s.len() >= 2, "object must have at least two characters: {:?}", @@ -203,12 +198,12 @@ fn standard_object_node(i: Input) -> IResult { ); match &i.as_bytes()[0] { - b'*' => bold_node(i), - b'+' => strike_node(i), - 
b'/' => italic_node(i), - b'_' => underline_node(i), - b'=' => verbatim_node(i), - b'~' => code_node(i), + b'*' if emphasis::verify_pre(pre.s) => bold_node(i), + b'+' if emphasis::verify_pre(pre.s) => strike_node(i), + b'/' if emphasis::verify_pre(pre.s) => italic_node(i), + b'_' if emphasis::verify_pre(pre.s) => underline_node(i), + b'=' if emphasis::verify_pre(pre.s) => verbatim_node(i), + b'~' if emphasis::verify_pre(pre.s) => code_node(i), b'@' => snippet_node(i), b'{' => macros_node(i), b'<' => radio_target_node(i) @@ -219,31 +214,38 @@ fn standard_object_node(i: Input) -> IResult { .or_else(|_| link_node(i)) .or_else(|_| fn_ref_node(i)) .or_else(|_| timestamp_inactive_node(i)), - b'c' => inline_call_node(i), - b's' => inline_src_node(i), + // NOTE: although not specified in document, inline call and inline src follows the + // same pre tokens rule as text markup + b'c' if emphasis::verify_pre(pre.s) => inline_call_node(i), + b's' if emphasis::verify_pre(pre.s) => inline_src_node(i), b'$' => latex_fragment_node(i), - b'\\' => { - if i.as_bytes()[1] == b'\\' { - line_break_node(i) - } else { - entity_node(i).or_else(|_| latex_fragment_node(i)) - } - } + b'\\' if !pre.s.ends_with('\\') && i.as_bytes()[1] == b'\\' => line_break_node(i), + b'\\' => entity_node(i).or_else(|_| latex_fragment_node(i)), + b'^' if subscript_superscript::verify_pre(pre.s) => superscript_node(i), + b'_' if subscript_superscript::verify_pre(pre.s) => subscript_node(i), _ => Err(nom::Err::Error(())), } } /// parse an object from minimal sets -fn minimal_object_node(i: Input) -> IResult { +fn minimal_object_node<'a>(i: Input<'a>, pre: Input<'a>) -> IResult, GreenElement, ()> { + debug_assert!( + i.s.len() >= 2, + "object must have at least two characters: {:?}", + i.s + ); + match &i.as_bytes()[0] { - b'*' => bold_node(i), - b'+' => strike_node(i), - b'/' => italic_node(i), - b'_' => underline_node(i), - b'=' => verbatim_node(i), - b'~' => code_node(i), + b'*' if emphasis::verify_pre(pre.s) 
=> bold_node(i), + b'+' if emphasis::verify_pre(pre.s) => strike_node(i), + b'/' if emphasis::verify_pre(pre.s) => italic_node(i), + b'_' if emphasis::verify_pre(pre.s) => underline_node(i), + b'=' if emphasis::verify_pre(pre.s) => verbatim_node(i), + b'~' if emphasis::verify_pre(pre.s) => code_node(i), b'$' => latex_fragment_node(i), b'\\' => entity_node(i).or_else(|_| latex_fragment_node(i)), + b'^' if subscript_superscript::verify_pre(pre.s) => superscript_node(i), + b'_' if subscript_superscript::verify_pre(pre.s) => subscript_node(i), _ => Err(nom::Err::Error(())), } } @@ -261,19 +263,18 @@ fn positions() { // https://github.com/PoiScript/orgize/issues/69 let vec = ObjectPositions::standard(("{3}", &config).into()).collect::>(); - assert_eq!(vec.len(), 2); + assert_eq!(vec.len(), 1); assert_eq!(vec[0].0.s, "{3}"); - // FIXME: - assert_eq!(vec[1].0.s, "{3}"); let vec = ObjectPositions::standard(("*{()}//s\nc<<", &config).into()).collect::>(); - assert_eq!(vec.len(), 6); + assert_eq!(vec.len(), 7); assert_eq!(vec[0].0.s, "*{()}//s\nc<<"); assert_eq!(vec[1].0.s, "{()}//s\nc<<"); - assert_eq!(vec[2].0.s, "()}//s\nc<<"); - assert_eq!(vec[3].0.s, ")}//s\nc<<"); - assert_eq!(vec[4].0.s, "c<<"); - assert_eq!(vec[5].0.s, "<<"); + assert_eq!(vec[2].0.s, "//s\nc<<"); + assert_eq!(vec[3].0.s, "/s\nc<<"); + assert_eq!(vec[4].0.s, "s\nc<<"); + assert_eq!(vec[5].0.s, "c<<"); + assert_eq!(vec[6].0.s, "<<"); } #[test] @@ -347,4 +348,15 @@ functions starting with ~org-element-~."#), TEXT@174..175 "." 
"### ); + + insta::assert_debug_snapshot!( + t("a^abc"), + @r###" + PARAGRAPH@0..5 + TEXT@0..1 "a" + SUPERSCRIPT@1..5 + CARET@1..2 "^" + TEXT@2..5 "abc" + "### + ); } diff --git a/src/syntax/subscript_superscript.rs b/src/syntax/subscript_superscript.rs new file mode 100644 index 0000000..051303a --- /dev/null +++ b/src/syntax/subscript_superscript.rs @@ -0,0 +1,162 @@ +use memchr::memchr2_iter; +use nom::{ + branch::alt, + bytes::complete::{tag, take_while1}, + combinator::opt, + AsBytes, IResult, InputTake, +}; + +use crate::{ + syntax::{ + combinator::{caret_token, underscore_token}, + object::object_nodes, + }, + SyntaxKind, +}; + +use super::{ + combinator::{l_curly_token, node, r_curly_token, GreenElement}, + input::Input, +}; + +pub fn superscript_node(input: Input) -> IResult { + let (input, caret) = caret_token(input)?; + + let mut children = vec![caret]; + + if let Ok((input, star)) = tag::<&str, Input, ()>("*")(input) { + children.push(star.text_token()); + Ok((input, node(SyntaxKind::SUPERSCRIPT, children))) + } else if let Ok((input, (l, contents, r))) = template1(input) { + children.push(l); + children.extend(object_nodes(contents)); + children.push(r); + Ok((input, node(SyntaxKind::SUPERSCRIPT, children))) + } else if let Ok((input, (sign, contents))) = template2(input) { + if let Some(s) = sign { + children.push(s) + } + children.push(contents); + Ok((input, node(SyntaxKind::SUPERSCRIPT, children))) + } else { + Err(nom::Err::Error(())) + } +} + +pub fn subscript_node(input: Input) -> IResult { + let (input, underscore) = underscore_token(input)?; + + let mut children = vec![underscore]; + + if let Ok((input, star)) = tag::<&str, Input, ()>("*")(input) { + children.push(star.text_token()); + Ok((input, node(SyntaxKind::SUBSCRIPT, children))) + } else if let Ok((input, (l, contents, r))) = template1(input) { + children.push(l); + children.extend(object_nodes(contents)); + children.push(r); + Ok((input, node(SyntaxKind::SUBSCRIPT, children))) + } else 
if let Ok((input, (sign, contents))) = template2(input) { + if let Some(s) = sign { + children.push(s) + } + children.push(contents); + Ok((input, node(SyntaxKind::SUBSCRIPT, children))) + } else { + Err(nom::Err::Error(())) + } +} + +fn template1(input: Input) -> IResult<Input, (GreenElement, Input, GreenElement), ()> { + let (input, l) = l_curly_token(input)?; + let (input, contents) = balanced_brackets(input)?; + let (input, r) = r_curly_token(input)?; + Ok((input, (l, contents, r))) +} + +fn template2(input: Input) -> IResult<Input, (Option<GreenElement>, GreenElement), ()> { + let (input, sign) = opt(alt((tag("+"), tag("-"))))(input)?; + + let (input, contents) = + take_while1(|c: char| c.is_alphanumeric() || c == ',' || c == '\\' || c == '.')(input)?; + + if contents.s.ends_with(|c: char| !c.is_alphanumeric()) { + return Err(nom::Err::Error(())); + } + + Ok((input, (sign.map(|x| x.text_token()), contents.text_token()))) +} + +fn balanced_brackets(input: Input) -> IResult<Input, Input, ()> { + let mut pairs = 1; + let bytes = input.as_bytes(); + for i in memchr2_iter(b'{', b'}', bytes) { + if bytes[i] == b'{' { + pairs += 1; + } else if pairs != 1 { + pairs -= 1; + } else { + return Ok(input.take_split(i)); + } + } + Err(nom::Err::Error(())) +} + +pub fn verify_pre(s: &str) -> bool { + // A script attaches to the preceding character: reject an empty or blank-terminated prefix. + if s.is_empty() { + return false; + } + let last = s.as_bytes()[s.len() - 1]; + last != b' ' && last != b'\t' +} + +#[test] +fn parse() { + use crate::ast::Subscript; + use crate::tests::to_ast; + + let to_subscript = to_ast::<Subscript>(subscript_node); + + insta::assert_debug_snapshot!( + to_subscript("_*").syntax, + @r###" + SUBSCRIPT@0..2 + UNDERSCORE@0..1 "_" + TEXT@1..2 "*" + "### + ); + + insta::assert_debug_snapshot!( + to_subscript("_{*bo\nld*}").syntax, + @r###" + SUBSCRIPT@0..10 + UNDERSCORE@0..1 "_" + L_CURLY@1..2 "{" + BOLD@2..9 + STAR@2..3 "*" + TEXT@3..8 "bo\nld" + STAR@8..9 "*" + R_CURLY@9..10 "}" + "### + ); + + insta::assert_debug_snapshot!( + to_subscript("_+123").syntax, + @r###" + SUBSCRIPT@0..5 + UNDERSCORE@0..1 "_" + TEXT@1..2 "+" + TEXT@2..5 "123" 
+ "### + ); + + insta::assert_debug_snapshot!( + to_subscript("_abc").syntax, + @r###" + SUBSCRIPT@0..4 + UNDERSCORE@0..1 "_" + TEXT@1..4 "abc" + "### + ); +} diff --git a/wasm/index.html b/wasm/index.html index 02a515c..8a7d11d 100644 --- a/wasm/index.html +++ b/wasm/index.html @@ -313,6 +313,12 @@ $$ Entity \\alpha\\_ \\rightarrow{}\\_ \\beta + +----- +Subscript & superscript & line break + +E= mc^2 \\\\ +Fe_{_3_}O_4 `); editor.session.on("change", () => render());