// lemmy/crates/utils/src/utils/markdown/mod.rs

use crate::settings::SETTINGS;
use markdown_it::{plugins::cmark::inline::image::Image, MarkdownIt};
use once_cell::sync::Lazy;
use url::Url;
use urlencoding::encode;

mod link_rule;
mod spoiler_rule;

/// Shared markdown parser, constructed lazily on first access.
///
/// Plugins are registered in a fixed order: the `cmark` and `extra` plugin sets from
/// `markdown_it`, followed by this crate's `spoiler_rule` and `link_rule` modules.
static MARKDOWN_PARSER: Lazy<MarkdownIt> = Lazy::new(|| {
  use markdown_it::plugins::{cmark, extra};

  let mut md = MarkdownIt::new();

  // Upstream plugin sets first, then the project-specific rules.
  cmark::add(&mut md);
  extra::add(&mut md);
  spoiler_rule::add(&mut md);
  link_rule::add(&mut md);

  md
});

/// Escapes HTML-significant characters in API parameters to prevent XSS attacks.
///
/// The following characters are replaced by their entity form: `&` → `&amp;`, `<` → `&lt;`,
/// `"` → `&quot;` and `'` → `&#x27;`. Everything else is copied through unchanged.
///
/// Taken from <https://github.com/OWASP/CheatSheetSeries/blob/master/cheatsheets/Cross_Site_Scripting_Prevention_Cheat_Sheet.md#output-encoding-for-html-contexts>
///
/// `>` is left in place because it is interpreted as markdown quote.
pub fn sanitize_html(text: &str) -> String {
  // The output is at least as long as the input, so reserve that much up front.
  let mut escaped = String::with_capacity(text.len());

  for ch in text.chars() {
    match ch {
      '&' => escaped.push_str("&amp;"),
      '<' => escaped.push_str("&lt;"),
      '"' => escaped.push_str("&quot;"),
      '\'' => escaped.push_str("&#x27;"),
      other => escaped.push(other),
    }
  }

  escaped
}

/// Renders markdown `text` to an HTML string.
///
/// The text is parsed with the shared [`MARKDOWN_PARSER`] and the resulting syntax tree is
/// rendered via `xrender` (the tests in this file expect self-closing tags such as `<img ... />`).
pub fn markdown_to_html(text: &str) -> String {
  let ast = MARKDOWN_PARSER.parse(text);
  ast.xrender()
}

/// Rewrites all image links to remote domains in markdown, so they go through
/// `/api/v3/image_proxy`.
///
/// For every image (`![label](url)` or `![label](url "title")`) found in `src`:
/// - if the url parses and its domain differs from `SETTINGS.hostname`, the url is replaced by a
///   proxied url (the original url is percent-encoded into the `url` query parameter) and any
///   title is kept,
/// - if the url parses and points to the local domain, the markdown is left unchanged,
/// - if the url does not parse, it is replaced with empty text,
/// - if the position of the url cannot be determined safely (the computed range is out of bounds
///   or not on a character boundary), the image is left untouched.
///
/// Returns the rewritten markdown and every image url that parsed successfully (local and
/// remote). Links are processed from the end of the text towards the start, so the returned urls
/// are in reverse document order.
///
/// # Panics
///
/// Panics if the parser produced an image node without a source map, which upstream states does
/// not happen (see the link in the function body).
pub fn markdown_rewrite_image_links(mut src: String) -> (String, Vec<Url>) {
  let ast = MARKDOWN_PARSER.parse(&src);
  let mut links_offsets: Vec<(usize, usize)> = vec![];

  // Walk the syntax tree to find positions of image links
  ast.walk(|node, _depth| {
    if let Some(image) = node.cast::<Image>() {
      // srcmap is always present for image
      // https://github.com/markdown-it-rust/markdown-it/issues/36#issuecomment-1777844387
      let node_offsets = node.srcmap.expect("srcmap is none").get_byte_offsets();
      // necessary for custom emojis which look like `![name](url "title")`:
      // the title is preceded by a space and wrapped in two quotes, hence `+ 3`
      let title_len = image
        .title
        .as_ref()
        .map(|t| t.len() + 3)
        .unwrap_or_default();
      // The image node ends with `)`, so the link content ends one byte before the node does.
      let end_offset = node_offsets.1.checked_sub(1);
      // Checked arithmetic: the parsed url/title may not have the same length as the source
      // text, in which case a plain subtraction could underflow.
      let start_offset = end_offset
        .and_then(|end| end.checked_sub(image.url.len()))
        .and_then(|start| start.checked_sub(title_len));

      if let (Some(start), Some(end)) = (start_offset, end_offset) {
        links_offsets.push((start, end));
      }
    }
  });

  // An image nested in another image's label is visited after its parent although its link
  // comes earlier in the text. Sort by position so the back-to-front pass below never rewrites
  // text located before an offset that is still pending.
  links_offsets.sort_unstable();

  let mut links = vec![];
  // Go through the collected links in reverse order, so that a replacement never shifts the
  // offsets of links which are still to be processed
  for (start, end) in links_offsets.into_iter().rev() {
    // Skip ranges which do not address valid text, `replace_range` would panic on them
    let content = match src.get(start..end) {
      Some(content) => content,
      None => continue,
    };
    // necessary for custom emojis which look like `![name](url "title")`
    let (url, extra) = match content.split_once(' ') {
      Some((url, extra)) => (url, Some(extra)),
      None => (content, None),
    };
    match Url::parse(url) {
      Ok(parsed) => {
        let is_remote = parsed.domain() != Some(&SETTINGS.hostname);
        links.push(parsed);
        // If link points to remote domain, replace with proxied link
        if is_remote {
          let mut proxied = format!(
            "{}/api/v3/image_proxy?url={}",
            SETTINGS.get_protocol_and_hostname(),
            encode(url),
          );
          // restore custom emoji format
          if let Some(extra) = extra {
            proxied.push(' ');
            proxied.push_str(extra);
          }
          src.replace_range(start..end, &proxied);
        }
      }
      Err(_) => {
        // If its not a valid url, replace with empty text
        src.replace_range(start..end, "");
      }
    }
  }

  (src, links)
}

#[cfg(test)]
mod tests {
  #![allow(clippy::unwrap_used)]
  #![allow(clippy::indexing_slicing)]

  use super::*;
  use pretty_assertions::assert_eq;

  /// Checks `markdown_to_html` against a table of `(label, markdown, expected html)` cases.
  #[test]
  fn test_basic_markdown() {
    let cases = [
      (
        "headings",
        "# h1\n## h2\n### h3\n#### h4\n##### h5\n###### h6",
        "<h1>h1</h1>\n<h2>h2</h2>\n<h3>h3</h3>\n<h4>h4</h4>\n<h5>h5</h5>\n<h6>h6</h6>\n",
      ),
      (
        "line breaks",
        "First\rSecond",
        "<p>First\nSecond</p>\n",
      ),
      (
        "emphasis",
        "__bold__ **bold** *italic* ***bold+italic***",
        "<p><strong>bold</strong> <strong>bold</strong> <em>italic</em> <em><strong>bold+italic</strong></em></p>\n",
      ),
      (
        "blockquotes",
        "> #### Hello\n > \n > - Hola\n > - 안영 \n>> Goodbye\n",
        "<blockquote>\n<h4>Hello</h4>\n<ul>\n<li>Hola</li>\n<li>안영</li>\n</ul>\n<blockquote>\n<p>Goodbye</p>\n</blockquote>\n</blockquote>\n",
      ),
      (
        "lists (ordered, unordered)",
        "1. pen\n2. apple\n3. apple pen\n- pen\n- pineapple\n- pineapple pen",
        "<ol>\n<li>pen</li>\n<li>apple</li>\n<li>apple pen</li>\n</ol>\n<ul>\n<li>pen</li>\n<li>pineapple</li>\n<li>pineapple pen</li>\n</ul>\n",
      ),
      (
        "code and code blocks",
        "this is my amazing `code snippet` and my amazing ```code block```",
        "<p>this is my amazing <code>code snippet</code> and my amazing <code>code block</code></p>\n",
      ),
      // Links get a nofollow attribute
      (
        "links",
        "[Lemmy](https://join-lemmy.org/ \"Join Lemmy!\")",
        "<p><a href=\"https://join-lemmy.org/\" rel=\"nofollow\" title=\"Join Lemmy!\">Lemmy</a></p>\n",
      ),
      // Image on a remote domain: rendering keeps the url as written
      (
        "images",
        "![My linked image](https://example.com/image.png \"image alt text\")",
        "<p><img src=\"https://example.com/image.png\" alt=\"My linked image\" title=\"image alt text\" /></p>\n",
      ),
      // Image on the local domain: rendering keeps the url as written
      (
        "images",
        "![My linked image](https://lemmy-alpha/image.png \"image alt text\")",
        "<p><img src=\"https://lemmy-alpha/image.png\" alt=\"My linked image\" title=\"image alt text\" /></p>\n",
      ),
      // Ensure spoiler plugin is added
      (
        "basic spoiler",
        "::: spoiler click to see more\nhow spicy!\n:::\n",
        "<details><summary>click to see more</summary><p>how spicy!\n</p></details>\n",
      ),
      (
        "escape html special chars",
        "<script>alert('xss');</script> hello &\"",
        "<p>&lt;script&gt;alert(‘xss’);&lt;/script&gt; hello &amp;&quot;</p>\n",
      ),
    ];

    for (msg, input, expected) in cases {
      let rendered = markdown_to_html(input);

      assert_eq!(
        rendered, expected,
        "Testing {}, with original input '{}'",
        msg, input
      );
    }
  }

  /// Checks the markdown returned by `markdown_rewrite_image_links` against a table of
  /// `(label, input markdown, expected markdown)` cases.
  #[test]
  fn test_markdown_proxy_images() {
    let cases = [
      (
        "remote image proxied",
        "![link](http://example.com/image.jpg)",
        "![link](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg)",
      ),
      (
        "local image unproxied",
        "![link](http://lemmy-alpha/image.jpg)",
        "![link](http://lemmy-alpha/image.jpg)",
      ),
      (
        "multiple image links",
        "![link](http://example.com/image1.jpg) ![link](http://example.com/image2.jpg)",
        "![link](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage1.jpg) ![link](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage2.jpg)",
      ),
      (
        "empty link handled",
        "![image]()",
        "![image]()",
      ),
      (
        "empty label handled",
        "![](http://example.com/image.jpg)",
        "![](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg)",
      ),
      (
        "invalid image link removed",
        "![image](http-not-a-link)",
        "![image]()",
      ),
      (
        "label with nested markdown handled",
        "![a *b* c](http://example.com/image.jpg)",
        "![a *b* c](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg)",
      ),
      (
        "custom emoji support",
        r#"![party-blob](https://www.hexbear.net/pictrs/image/83405746-0620-4728-9358-5f51b040ffee.gif "emoji party-blob")"#,
        r#"![party-blob](https://lemmy-alpha/api/v3/image_proxy?url=https%3A%2F%2Fwww.hexbear.net%2Fpictrs%2Fimage%2F83405746-0620-4728-9358-5f51b040ffee.gif "emoji party-blob")"#,
      ),
    ];

    for (msg, input, expected) in cases {
      // Only the rewritten markdown is checked here, not the collected urls.
      let (rewritten, _links) = markdown_rewrite_image_links(input.to_string());

      assert_eq!(
        rewritten, expected,
        "Testing {}, with original input '{}'",
        msg, input
      );
    }
  }

  /// `sanitize_html` escapes `&`, `<`, `"` and `'` while leaving `>` untouched.
  #[test]
  fn test_sanitize_html() {
    let input = "<script>alert('xss');</script> hello &\"'";
    let expected = "&lt;script>alert(&#x27;xss&#x27;);&lt;/script> hello &amp;&quot;&#x27;";

    assert_eq!(expected, sanitize_html(input));
  }
}