feat: support subscript and superscript

2023-11-21 18:41:17 +08:00 · 2023-11-21 18:41:17 +08:00 · 8b5c545d4b
commit 8b5c545d4b
parent 58dfb022c2
12 changed files with 346 additions and 63 deletions
--- a/src/syntax/combinator.rs
+++ b/src/syntax/combinator.rs
@ -59,7 +59,7 @@ token_parser!(minus2_token, "--", MINUS2);
 token_parser!(percent2_token, "%%", PERCENT2);
 // token_parser!(slash_token, "/", SLASH);
 token_parser!(backslash_token, "\\", BACKSLASH);
-// token_parser!(underscore_token, "_", UNDERSCORE);
+token_parser!(underscore_token, "_", UNDERSCORE);
 // token_parser!(star_token, "*", STAR);
 token_parser!(plus_token, "+", PLUS);
 token_parser!(minus_token, "-", MINUS);
@ -71,6 +71,7 @@ token_parser!(dollar2_token, "$$", DOLLAR2);
 // token_parser!(equal_token, "=", EQUAL);
 // token_parser!(tilde_token, "~", TILDE);
 token_parser!(hash_plus_token, "#+", HASH_PLUS);
+token_parser!(caret_token, "^", CARET);
 token_parser!(hash_token, "#", HASH);
 token_parser!(double_arrow_token, "=>", DOUBLE_ARROW);

--- a/src/syntax/emphasis.rs
+++ b/src/syntax/emphasis.rs
@ -112,6 +112,16 @@ fn validate_marker(pos: usize, text: Input) -> bool {
    }
 }

+/// Returns `true` when `input` — the text immediately preceding an emphasis
+/// marker — allows that marker to open a text-markup object.
+///
+/// The marker is accepted at the very start of the text (empty `input`) or
+/// when the last byte of `input` is whitespace (space, tab, CR, LF) or one of
+/// `-`, `(`, `{`, `'`, `"`, `\`.
+pub fn verify_pre(input: &str) -> bool {
+    // Only the final byte matters. Every accepted byte is ASCII, so the last
+    // byte of a multi-byte UTF-8 sequence can never match by accident.
+    input.as_bytes().last().map_or(true, |&last| {
+        matches!(
+            last,
+            // `'` belongs to the PRE set of Org's emphasis rules; `\` is kept
+            // from the original implementation for backward compatibility.
+            b'\t' | b' ' | b'-' | b'(' | b'{' | b'\'' | b'\\' | b'"' | b'\r' | b'\n'
+        )
+    })
+}
+
 #[test]
 fn parse() {
    use crate::{ast::Bold, tests::to_ast, ParseConfig};
--- a/src/syntax/mod.rs
+++ b/src/syntax/mod.rs
@ -31,6 +31,7 @@ pub mod planning;
 pub mod radio_target;
 pub mod rule;
 pub mod snippet;
+pub mod subscript_superscript;
 pub mod table;
 pub mod target;
 pub mod timestamp;
@ -106,6 +107,7 @@ pub enum SyntaxKind {
    DOUBLE_ARROW, // '=>'
    PIPE,         // '|'
    COMMA,        // ','
+    CARET,        // '^'
    NEW_LINE,     // '\n' or '\r\n' or '\r'
    WHITESPACE,   // ' ' or '\t'
    BLANK_LINE,
@ -200,6 +202,8 @@ pub enum SyntaxKind {
    VERBATIM,
    CODE,
    ENTITY,
+    SUPERSCRIPT,
+    SUBSCRIPT,

    /* timestamp */
    TIMESTAMP_ACTIVE,
@ -241,6 +245,8 @@ impl SyntaxKind {
                | SyntaxKind::MACROS
                | SyntaxKind::RADIO_TARGET
                | SyntaxKind::COOKIE
+                | SyntaxKind::SUPERSCRIPT
+                | SyntaxKind::SUBSCRIPT
                | SyntaxKind::ORG_TABLE_CELL
                | SyntaxKind::TIMESTAMP_ACTIVE
                | SyntaxKind::TIMESTAMP_INACTIVE
--- a/src/syntax/object.rs
+++ b/src/syntax/object.rs
@ -3,7 +3,9 @@ use nom::{AsBytes, IResult, InputLength, InputTake};
 use super::{
    combinator::GreenElement,
    cookie::cookie_node,
-    emphasis::{bold_node, code_node, italic_node, strike_node, underline_node, verbatim_node},
+    emphasis::{
+        self, bold_node, code_node, italic_node, strike_node, underline_node, verbatim_node,
+    },
    entity::entity_node,
    fn_ref::fn_ref_node,
    inline_call::inline_call_node,
@ -15,6 +17,7 @@ use super::{
    macros::macros_node,
    radio_target::radio_target_node,
    snippet::snippet_node,
+    subscript_superscript::{self, subscript_node, superscript_node},
    target::target_node,
    timestamp::{timestamp_active_node, timestamp_diary_node, timestamp_inactive_node},
 };
@ -22,7 +25,6 @@ use super::{
 struct ObjectPositions<'a> {
    input: Input<'a>,
    pos: usize,
-    next: Option<usize>,
    finder: jetscii::BytesConst,
 }

@ -31,10 +33,17 @@ impl ObjectPositions<'_> {
        ObjectPositions {
            input,
            pos: 0,
-            next: Some(0),
            finder: jetscii::bytes!(
-                b' ', b'(', b'{', b'\'', b'"', b'\n', /*  */
-                b'\\', b'$', b'@', b'<', b'['
+                b'*', b'+', b'/', b'_', b'=', b'~', /* text markup */
+                b'@', /* snippet */
+                b'<', /* timestamp, target, radio target */
+                b'[', /* link, cookie, fn_ref, timestamp */
+                b'c', /* inline call */
+                b's', /* inline source */
+                b'\\', b'$', /* latex & entity */
+                b'{', /* macros */
+                b'^', /* superscript */
+                b'_'  /* subscript */
            ),
        }
    }
@ -43,10 +52,11 @@ impl ObjectPositions<'_> {
        ObjectPositions {
            input,
            pos: 0,
-            next: Some(0),
            finder: jetscii::bytes!(
-                b' ', b'(', b'{', b'\'', b'"', b'\n', /*  */
-                b'\\', b'$'
+                b'*', b'+', b'/', b'_', b'=', b'~', /* text markup */
+                b'\\', b'$', /* latex & entity */
+                b'^', /* superscript */
+                b'_'  /* subscript */
            ),
        }
    }
@ -60,25 +70,12 @@ impl<'a> Iterator for ObjectPositions<'a> {
            return None;
        }

-        if let Some(p) = self.next.take() {
-            return Some(self.input.take_split(p));
-        }
-
        let bytes = &self.input.as_bytes()[self.pos..];
        let previous = self.pos;
        let i = self.finder.find(bytes)?;
        self.pos += i + 1;

-        let p = match bytes[i] {
-            b'{' => {
-                if self.input.s.len() - self.pos > 2 {
-                    self.next = Some(self.pos);
-                }
-                self.pos - 1
-            }
-            b' ' | b'(' | b'\'' | b'"' | b'\n' => self.pos,
-            _ => self.pos - 1,
-        };
+        let p = self.pos - 1;

        debug_assert!(
            previous < self.pos && self.pos <= self.input.s.len(),
@ -112,10 +109,10 @@ impl<'a> Iterator for ObjectPositions<'a> {
 /// - Timestamps
 /// - Text Markup (bold code strike verbatim underline italic)
 /// - Line Breaks
+/// - Subscript and Superscript
 ///
 /// // todo:
 /// - Citations
-/// - Subscript and Superscript
 pub fn object_nodes(input: Input) -> Vec<GreenElement> {
    // TODO:
    // debug_assert!(!input.is_empty());
@ -125,11 +122,11 @@ pub fn object_nodes(input: Input) -> Vec<GreenElement> {

    'l: while !i.is_empty() {
        for (input, head) in ObjectPositions::standard(i) {
-            if let Ok((input, node)) = standard_object_node(input) {
+            if let Ok((input, pre)) = standard_object_node(input, head) {
                if !head.is_empty() {
                    nodes.push(head.text_token())
                }
-                nodes.push(node);
+                nodes.push(pre);
                debug_assert!(
                    input.input_len() < i.input_len(),
                    "{} < {}",
@ -157,8 +154,6 @@ pub fn object_nodes(input: Input) -> Vec<GreenElement> {
 /// - LaTeX fragments ('\\')
 /// - Text markup (bold code strike verbatim underline italic) ('*', '~', '+', '=', '_', '/')
 /// - Entities ('\\')
-///
-/// // todo:
 /// - Superscripts and Subscripts
 pub fn minimal_object_nodes(input: Input) -> Vec<GreenElement> {
    let mut i = input;
@ -166,11 +161,11 @@ pub fn minimal_object_nodes(input: Input) -> Vec<GreenElement> {

    'l: while !i.is_empty() {
        for (input, head) in ObjectPositions::minimal(i) {
-            if let Ok((input, node)) = minimal_object_node(input) {
+            if let Ok((input, pre)) = minimal_object_node(input, head) {
                if !head.is_empty() {
                    nodes.push(head.text_token())
                }
-                nodes.push(node);
+                nodes.push(pre);
                debug_assert!(
                    input.input_len() < i.input_len(),
                    "{} < {}",
@ -195,7 +190,7 @@ pub fn minimal_object_nodes(input: Input) -> Vec<GreenElement> {
 }

 /// parse an object from standard sets
-fn standard_object_node(i: Input) -> IResult<Input, GreenElement, ()> {
+fn standard_object_node<'a>(i: Input<'a>, pre: Input<'a>) -> IResult<Input<'a>, GreenElement, ()> {
    debug_assert!(
        i.s.len() >= 2,
        "object must have at least two characters: {:?}",
@ -203,12 +198,12 @@ fn standard_object_node(i: Input) -> IResult<Input, GreenElement, ()> {
    );

    match &i.as_bytes()[0] {
-        b'*' => bold_node(i),
-        b'+' => strike_node(i),
-        b'/' => italic_node(i),
-        b'_' => underline_node(i),
-        b'=' => verbatim_node(i),
-        b'~' => code_node(i),
+        b'*' if emphasis::verify_pre(pre.s) => bold_node(i),
+        b'+' if emphasis::verify_pre(pre.s) => strike_node(i),
+        b'/' if emphasis::verify_pre(pre.s) => italic_node(i),
+        b'_' if emphasis::verify_pre(pre.s) => underline_node(i),
+        b'=' if emphasis::verify_pre(pre.s) => verbatim_node(i),
+        b'~' if emphasis::verify_pre(pre.s) => code_node(i),
        b'@' => snippet_node(i),
        b'{' => macros_node(i),
        b'<' => radio_target_node(i)
@ -219,31 +214,38 @@ fn standard_object_node(i: Input) -> IResult<Input, GreenElement, ()> {
            .or_else(|_| link_node(i))
            .or_else(|_| fn_ref_node(i))
            .or_else(|_| timestamp_inactive_node(i)),
-        b'c' => inline_call_node(i),
-        b's' => inline_src_node(i),
+        // NOTE: although not specified in the document, inline call and inline src follow
+        // the same pre-token rule as text markup
+        b'c' if emphasis::verify_pre(pre.s) => inline_call_node(i),
+        b's' if emphasis::verify_pre(pre.s) => inline_src_node(i),
        b'$' => latex_fragment_node(i),
-        b'\\' => {
-            if i.as_bytes()[1] == b'\\' {
-                line_break_node(i)
-            } else {
-                entity_node(i).or_else(|_| latex_fragment_node(i))
-            }
-        }
+        b'\\' if !pre.s.ends_with('\\') && i.as_bytes()[1] == b'\\' => line_break_node(i),
+        b'\\' => entity_node(i).or_else(|_| latex_fragment_node(i)),
+        b'^' if subscript_superscript::verify_pre(pre.s) => superscript_node(i),
+        b'_' if subscript_superscript::verify_pre(pre.s) => subscript_node(i),
        _ => Err(nom::Err::Error(())),
    }
 }

 /// parse an object from minimal sets
-fn minimal_object_node(i: Input) -> IResult<Input, GreenElement, ()> {
+fn minimal_object_node<'a>(i: Input<'a>, pre: Input<'a>) -> IResult<Input<'a>, GreenElement, ()> {
+    debug_assert!(
+        i.s.len() >= 2,
+        "object must have at least two characters: {:?}",
+        i.s
+    );
+
    match &i.as_bytes()[0] {
-        b'*' => bold_node(i),
-        b'+' => strike_node(i),
-        b'/' => italic_node(i),
-        b'_' => underline_node(i),
-        b'=' => verbatim_node(i),
-        b'~' => code_node(i),
+        b'*' if emphasis::verify_pre(pre.s) => bold_node(i),
+        b'+' if emphasis::verify_pre(pre.s) => strike_node(i),
+        b'/' if emphasis::verify_pre(pre.s) => italic_node(i),
+        b'_' if emphasis::verify_pre(pre.s) => underline_node(i),
+        b'=' if emphasis::verify_pre(pre.s) => verbatim_node(i),
+        b'~' if emphasis::verify_pre(pre.s) => code_node(i),
        b'$' => latex_fragment_node(i),
        b'\\' => entity_node(i).or_else(|_| latex_fragment_node(i)),
+        b'^' if subscript_superscript::verify_pre(pre.s) => superscript_node(i),
+        b'_' if subscript_superscript::verify_pre(pre.s) => subscript_node(i),
        _ => Err(nom::Err::Error(())),
    }
 }
@ -261,19 +263,18 @@ fn positions() {

    // https://github.com/PoiScript/orgize/issues/69
    let vec = ObjectPositions::standard(("{3}", &config).into()).collect::<Vec<_>>();
-    assert_eq!(vec.len(), 2);
+    assert_eq!(vec.len(), 1);
    assert_eq!(vec[0].0.s, "{3}");
-    // FIXME:
-    assert_eq!(vec[1].0.s, "{3}");

    let vec = ObjectPositions::standard(("*{()}//s\nc<<", &config).into()).collect::<Vec<_>>();
-    assert_eq!(vec.len(), 6);
+    assert_eq!(vec.len(), 7);
    assert_eq!(vec[0].0.s, "*{()}//s\nc<<");
    assert_eq!(vec[1].0.s, "{()}//s\nc<<");
-    assert_eq!(vec[2].0.s, "()}//s\nc<<");
-    assert_eq!(vec[3].0.s, ")}//s\nc<<");
-    assert_eq!(vec[4].0.s, "c<<");
-    assert_eq!(vec[5].0.s, "<<");
+    assert_eq!(vec[2].0.s, "//s\nc<<");
+    assert_eq!(vec[3].0.s, "/s\nc<<");
+    assert_eq!(vec[4].0.s, "s\nc<<");
+    assert_eq!(vec[5].0.s, "c<<");
+    assert_eq!(vec[6].0.s, "<<");
 }

 #[test]
@ -347,4 +348,15 @@ functions starting with ~org-element-~."#),
      TEXT@174..175 "."
    "###
    );
+
+    insta::assert_debug_snapshot!(
+        t("a^abc"),
+        @r###"
+    PARAGRAPH@0..5
+      TEXT@0..1 "a"
+      SUPERSCRIPT@1..5
+        CARET@1..2 "^"
+        TEXT@2..5 "abc"
+    "###
+    );
 }
--- a/src/syntax/subscript_superscript.rs
+++ b/src/syntax/subscript_superscript.rs
@ -0,0 +1,162 @@
+use memchr::memchr2_iter;
+use nom::{
+    branch::alt,
+    bytes::complete::{tag, take_while1},
+    combinator::opt,
+    AsBytes, IResult, InputTake,
+};
+
+use crate::{
+    syntax::{
+        combinator::{caret_token, underscore_token},
+        object::object_nodes,
+    },
+    SyntaxKind,
+};
+
+use super::{
+    combinator::{l_curly_token, node, r_curly_token, GreenElement},
+    input::Input,
+};
+
+/// Parses a superscript object: a `^` marker followed by a script body.
+///
+/// Returns `nom::Err::Error(())` when `input` does not start with `^` or when
+/// nothing after the marker forms a valid script body.
+pub fn superscript_node(input: Input) -> IResult<Input, GreenElement, ()> {
+    let (input, caret) = caret_token(input)?;
+    script_node(input, caret, SyntaxKind::SUPERSCRIPT)
+}

+/// Parses a subscript object: a `_` marker followed by a script body.
+///
+/// Returns `nom::Err::Error(())` when `input` does not start with `_` or when
+/// nothing after the marker forms a valid script body.
+pub fn subscript_node(input: Input) -> IResult<Input, GreenElement, ()> {
+    let (input, underscore) = underscore_token(input)?;
+    script_node(input, underscore, SyntaxKind::SUBSCRIPT)
+}

+/// Shared tail of `superscript_node` and `subscript_node`.
+///
+/// `marker` is the already-parsed `^` / `_` token and `kind` is the node kind
+/// to build. The body after the marker is tried, in order, as:
+///
+/// 1. a single `*`,
+/// 2. a `{ ... }` group with balanced braces, whose contents are parsed as
+///    objects,
+/// 3. an optional sign followed by a plain character run (see `template2`).
+fn script_node<'a>(
+    input: Input<'a>,
+    marker: GreenElement,
+    kind: SyntaxKind,
+) -> IResult<Input<'a>, GreenElement, ()> {
+    let mut children = vec![marker];

+    let input = if let Ok((input, star)) = tag::<&str, Input, ()>("*")(input) {
+        children.push(star.text_token());
+        input
+    } else if let Ok((input, (l, contents, r))) = template1(input) {
+        children.push(l);
+        children.extend(object_nodes(contents));
+        children.push(r);
+        input
+    } else if let Ok((input, (sign, contents))) = template2(input) {
+        // `sign` is an `Option`; extending pushes it only when present.
+        children.extend(sign);
+        children.push(contents);
+        input
+    } else {
+        return Err(nom::Err::Error(()));
+    };

+    Ok((input, node(kind, children)))
+}
+
+/// Matches the braced script body `{ ... }`.
+///
+/// Yields the opening-brace token, the raw text between the braces (as found
+/// by `balanced_brackets`) and the closing-brace token. Fails if any of the
+/// three parts is missing.
+fn template1(input: Input) -> IResult<Input, (GreenElement, Input, GreenElement), ()> {
+    let (rest, open) = l_curly_token(input)?;
+    let (rest, inner) = balanced_brackets(rest)?;
+    r_curly_token(rest).map(|(rest, close)| (rest, (open, inner, close)))
+}
+
+/// Matches the unbraced script body: an optional `+`/`-` sign followed by a
+/// run of alphanumerics, `,`, `\` and `.` that ends with an alphanumeric.
+///
+/// Yields the sign (if present) and the body, both as text tokens. Trailing
+/// `,`, `\` and `.` are not part of the body and are left in the remaining
+/// input, so `2.` yields the body `2` with `.` left over. Fails when the run
+/// contains no alphanumeric character at all.
+fn template2(input: Input) -> IResult<Input, (Option<GreenElement>, GreenElement), ()> {
+    let (input, sign) = opt(alt((tag("+"), tag("-"))))(input)?;

+    // Longest run of characters that may appear in the body.
+    let (_, run) =
+        take_while1(|c: char| c.is_alphanumeric() || c == ',' || c == '\\' || c == '.')(input)?;

+    // The body has to end with an alphanumeric character: back off over any
+    // trailing punctuation instead of rejecting the whole script.
+    let end = run
+        .s
+        .trim_end_matches(|c: char| !c.is_alphanumeric())
+        .len();
+    if end == 0 {
+        return Err(nom::Err::Error(()));
+    }
+    let (input, contents) = input.take_split(end);

+    Ok((input, (sign.map(|x| x.text_token()), contents.text_token())))
+}
+
+/// Splits `input` at the `}` that closes an already-opened `{`.
+///
+/// `input` starts just after the opening brace. Nested `{ ... }` pairs are
+/// skipped; the split happens at the first `}` that has no matching `{` inside
+/// `input`. Fails when no such brace exists.
+fn balanced_brackets(input: Input) -> IResult<Input, Input, ()> {
+    let bytes = input.as_bytes();
+    // Number of nested `{` still waiting for their `}`.
+    let mut depth = 0usize;

+    let closing = memchr2_iter(b'{', b'}', bytes).find(|&idx| match bytes[idx] {
+        b'{' => {
+            depth += 1;
+            false
+        }
+        _ if depth > 0 => {
+            depth -= 1;
+            false
+        }
+        _ => true,
+    });

+    match closing {
+        Some(idx) => Ok(input.take_split(idx)),
+        None => Err(nom::Err::Error(())),
+    }
+}
+
+/// Returns `true` when `s` — the text immediately preceding a `^` or `_`
+/// marker — allows that marker to start a superscript or subscript.
+///
+/// The marker must follow some character (empty `s` is rejected) and that
+/// character must not be a space or a tab.
+pub fn verify_pre(s: &str) -> bool {
+    match s.as_bytes().last() {
+        Some(&last) => last != b' ' && last != b'\t',
+        None => false,
+    }
+}
+
+// Snapshot tests for `subscript_node`, one per accepted body form.
+#[test]
+fn parse() {
+    use crate::ast::Subscript;
+    use crate::tests::to_ast;

+    let to_subscript = to_ast::<Subscript>(subscript_node);

+    // Body is a single `*`, kept as a plain text token.
+    insta::assert_debug_snapshot!(
+        to_subscript("_*").syntax,
+        @r###"
+    SUBSCRIPT@0..2
+      UNDERSCORE@0..1 "_"
+      TEXT@1..2 "*"
+    "###
+    );

+    // Braced body: the contents are parsed as objects (bold here) and may
+    // contain a newline.
+    insta::assert_debug_snapshot!(
+        to_subscript("_{*bo\nld*}").syntax,
+        @r###"
+    SUBSCRIPT@0..10
+      UNDERSCORE@0..1 "_"
+      L_CURLY@1..2 "{"
+      BOLD@2..9
+        STAR@2..3 "*"
+        TEXT@3..8 "bo\nld"
+        STAR@8..9 "*"
+      R_CURLY@9..10 "}"
+    "###
+    );

+    // Unbraced body with a leading sign: sign and digits become separate
+    // text tokens.
+    insta::assert_debug_snapshot!(
+        to_subscript("_+123").syntax,
+        @r###"
+    SUBSCRIPT@0..5
+      UNDERSCORE@0..1 "_"
+      TEXT@1..2 "+"
+      TEXT@2..5 "123"
+    "###
+    );

+    // Unbraced body without a sign.
+    insta::assert_debug_snapshot!(
+        to_subscript("_abc").syntax,
+        @r###"
+    SUBSCRIPT@0..4
+      UNDERSCORE@0..1 "_"
+      TEXT@1..4 "abc"
+    "###
+    );
+}