lemmy/crates/utils/src/utils/markdown/mod.rs
Nutomic e8a52d3a5c
Rewrite images to use local proxy (#4035)
* Add markdown rule to add rel=nofollow for all links

* Add markdown image rule to add local image proxy (fixes #1036)

* comments

* rewrite markdown image links working

* add comment

* perform markdown image processing in api/apub receivers

* clippy

* add db table to validate proxied links

* rewrite link fields for avatar, banner etc

* sql fmt

* proxy links received over federation

* add config option

* undo post.url rewriting, move http route definition

* add tests

* proxy images through pictrs

* testing

* cleanup request.rs file

* more cleanup (fixes #2611)

* include url content type when sending post over apub (fixes #2611)

* store post url content type in db

* should be media_type

* get rid of cache_remote_thumbnails setting, instead automatically
take thumbnail from federation data if available.

* fix tests

* add setting disable_external_link_previews

* federate post url as image depending on mime type

* change setting again

* machete

* invert

* support custom emoji

* clippy

* update defaults

* add image proxy test, fix test

* fix test

* clippy

* revert accidental changes

* address review

* clippy

* Markdown link rule-dess (#4356)

* Extracting opengraph_data to its own type.

* A few additions for markdown-link-rule.

---------

Co-authored-by: Nutomic <me@nutomic.com>

* fix setting

* use enum for image proxy setting

* fix test configs

* add config backwards compat

* clippy

* machete

---------

Co-authored-by: Dessalines <dessalines@users.noreply.github.com>
2024-01-25 09:22:11 -05:00

247 lines
8.8 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

use crate::settings::SETTINGS;
use markdown_it::{plugins::cmark::inline::image::Image, MarkdownIt};
use once_cell::sync::Lazy;
use url::Url;
use urlencoding::encode;
mod link_rule;
mod spoiler_rule;
/// Shared markdown parser, constructed once on first access.
///
/// Registers the `cmark` and `extra` plugin sets from `markdown_it`, followed by this
/// module's own `spoiler_rule` and `link_rule`.
static MARKDOWN_PARSER: Lazy<MarkdownIt> = Lazy::new(|| {
  let mut md = MarkdownIt::new();
  // Plugin sets shipped with markdown_it
  markdown_it::plugins::cmark::add(&mut md);
  markdown_it::plugins::extra::add(&mut md);
  // Rules defined in this module's submodules
  spoiler_rule::add(&mut md);
  link_rule::add(&mut md);
  md
});
/// Escapes HTML-significant characters in API parameters to prevent XSS attacks.
///
/// Based on the OWASP output-encoding rules for HTML contexts:
/// <https://github.com/OWASP/CheatSheetSeries/blob/master/cheatsheets/Cross_Site_Scripting_Prevention_Cheat_Sheet.md#output-encoding-for-html-contexts>
///
/// `&`, `<`, `"` and `'` are replaced by their entities. `>` is deliberately left in place
/// because it is interpreted as a markdown quote.
pub fn sanitize_html(text: &str) -> String {
  let mut escaped = String::with_capacity(text.len());
  for ch in text.chars() {
    match ch {
      '&' => escaped.push_str("&amp;"),
      '<' => escaped.push_str("&lt;"),
      '"' => escaped.push_str("&quot;"),
      '\'' => escaped.push_str("&#x27;"),
      other => escaped.push(other),
    }
  }
  escaped
}
/// Parses `text` as markdown with the shared `MARKDOWN_PARSER` and renders the result to an
/// HTML string.
pub fn markdown_to_html(text: &str) -> String {
  let ast = MARKDOWN_PARSER.parse(text);
  ast.xrender()
}
/// Rewrites all image links to remote domains in markdown, so they go through
/// `/api/v3/image_proxy`.
///
/// Returns the rewritten markdown together with every image url that could be parsed (local
/// and remote alike), collected from the last image in the document to the first. An image
/// destination that is not a valid url is replaced with empty text.
///
/// The position of each destination is derived from the end of the image node and the
/// lengths of the parsed `url` and `title`. If that range cannot be located in `src` (the
/// arithmetic would underflow, or the range is not a valid slice of `src`), the image is left
/// untouched instead of panicking.
/// NOTE(review): presumably this affects reference-style images and destinations containing
/// escapes, where the parsed url does not sit right before the node end — confirm.
pub fn markdown_rewrite_image_links(mut src: String) -> (String, Vec<Url>) {
  let ast = MARKDOWN_PARSER.parse(&src);
  let mut links_offsets = vec![];

  // Walk the syntax tree to find positions of image links
  ast.walk(|node, _depth| {
    if let Some(image) = node.cast::<Image>() {
      // srcmap is always present for image
      // https://github.com/markdown-it-rust/markdown-it/issues/36#issuecomment-1777844387
      let node_offsets = node.srcmap.expect("srcmap is none").get_byte_offsets();

      // necessary for custom emojis which look like `![name](url "title")`:
      // the title takes its own length plus the separating space and two quotes
      let title_len = image
        .title
        .as_ref()
        .map(|t| t.len() + 3)
        .unwrap_or_default();

      // The destination ends right before the closing `)` of the image node.
      let end_offset = node_offsets.1.checked_sub(1);
      // Checked arithmetic: the lengths come from the parsed node, which is built from user
      // input and may be longer than the text preceding the node end.
      let start_offset = end_offset
        .and_then(|end| end.checked_sub(image.url.len()))
        .and_then(|start| start.checked_sub(title_len));

      if let (Some(start), Some(end)) = (start_offset, end_offset) {
        links_offsets.push((start, end));
      }
    }
  });

  let mut links = vec![];
  // Go through the collected links in reverse order, so that replacing one range does not
  // shift the offsets of the ranges still to be processed
  for (start, end) in links_offsets.into_iter().rev() {
    // Skip ranges which are out of bounds or split a multi-byte character; passing them to
    // `replace_range` below would panic.
    let content = match src.get(start..end) {
      Some(content) => content,
      None => continue,
    };

    // necessary for custom emojis which look like `![name](url "title")`
    let (url, extra) = match content.split_once(' ') {
      Some((url, extra)) => (url, Some(extra)),
      None => (content, None),
    };

    match Url::parse(url) {
      Ok(parsed) => {
        // If link points to remote domain, replace with proxied link
        let is_remote = parsed.domain() != Some(&SETTINGS.hostname);
        if is_remote {
          let mut proxied = format!(
            "{}/api/v3/image_proxy?url={}",
            SETTINGS.get_protocol_and_hostname(),
            encode(url),
          );
          // restore custom emoji format
          if let Some(extra) = extra {
            proxied.push(' ');
            proxied.push_str(extra);
          }
          src.replace_range(start..end, &proxied);
        }
        links.push(parsed);
      }
      Err(_) => {
        // If its not a valid url, replace with empty text
        src.replace_range(start..end, "");
      }
    }
  }

  (src, links)
}
#[cfg(test)]
mod tests {
  #![allow(clippy::unwrap_used)]
  #![allow(clippy::indexing_slicing)]

  use super::*;
  use pretty_assertions::assert_eq;

  /// Renders a table of markdown snippets and compares each result with the expected HTML.
  #[test]
  fn test_basic_markdown() {
    // Each entry is (case name, markdown input, expected html)
    let tests: Vec<_> = vec![
      (
        "headings",
        "# h1\n## h2\n### h3\n#### h4\n##### h5\n###### h6",
        "<h1>h1</h1>\n<h2>h2</h2>\n<h3>h3</h3>\n<h4>h4</h4>\n<h5>h5</h5>\n<h6>h6</h6>\n"
      ),
      (
        "line breaks",
        "First\rSecond",
        "<p>First\nSecond</p>\n"
      ),
      (
        "emphasis",
        "__bold__ **bold** *italic* ***bold+italic***",
        "<p><strong>bold</strong> <strong>bold</strong> <em>italic</em> <em><strong>bold+italic</strong></em></p>\n"
      ),
      (
        "blockquotes",
        "> #### Hello\n > \n > - Hola\n > - 안영 \n>> Goodbye\n",
        "<blockquote>\n<h4>Hello</h4>\n<ul>\n<li>Hola</li>\n<li>안영</li>\n</ul>\n<blockquote>\n<p>Goodbye</p>\n</blockquote>\n</blockquote>\n"
      ),
      (
        "lists (ordered, unordered)",
        "1. pen\n2. apple\n3. apple pen\n- pen\n- pineapple\n- pineapple pen",
        "<ol>\n<li>pen</li>\n<li>apple</li>\n<li>apple pen</li>\n</ol>\n<ul>\n<li>pen</li>\n<li>pineapple</li>\n<li>pineapple pen</li>\n</ul>\n"
      ),
      (
        "code and code blocks",
        "this is my amazing `code snippet` and my amazing ```code block```",
        "<p>this is my amazing <code>code snippet</code> and my amazing <code>code block</code></p>\n"
      ),
      // Links with added nofollow attribute
      (
        "links",
        "[Lemmy](https://join-lemmy.org/ \"Join Lemmy!\")",
        "<p><a href=\"https://join-lemmy.org/\" rel=\"nofollow\" title=\"Join Lemmy!\">Lemmy</a></p>\n"
      ),
      // Remote image: rendering to html leaves the url as written
      (
        "remote image",
        "![My linked image](https://example.com/image.png \"image alt text\")",
        "<p><img src=\"https://example.com/image.png\" alt=\"My linked image\" title=\"image alt text\" /></p>\n"
      ),
      // Local image: rendering to html leaves the url as written
      (
        "local image",
        "![My linked image](https://lemmy-alpha/image.png \"image alt text\")",
        "<p><img src=\"https://lemmy-alpha/image.png\" alt=\"My linked image\" title=\"image alt text\" /></p>\n"
      ),
      // Ensure spoiler plugin is added
      (
        "basic spoiler",
        "::: spoiler click to see more\nhow spicy!\n:::\n",
        "<details><summary>click to see more</summary><p>how spicy!\n</p></details>\n"
      ),
      // NOTE(review): the expected output assumes the smart-quotes rule of the `extra`
      // plugins turns 'xss' into typographic quotes (U+2018 / U+2019) — confirm. They are
      // written as escapes so the characters cannot get lost or confused with ASCII quotes.
      (
        "escape html special chars",
        "<script>alert('xss');</script> hello &\"",
        "<p>&lt;script&gt;alert(\u{2018}xss\u{2019});&lt;/script&gt; hello &amp;&quot;</p>\n"
      ),
    ];

    for (msg, input, expected) in tests {
      let result = markdown_to_html(input);
      assert_eq!(
        result, expected,
        "Testing {}, with original input '{}'",
        msg, input
      );
    }
  }

  /// Checks the markdown returned by `markdown_rewrite_image_links` for a table of inputs.
  #[test]
  fn test_markdown_proxy_images() {
    // Each entry is (case name, markdown input, expected markdown output)
    let tests: Vec<_> = vec![
      (
        "remote image proxied",
        "![link](http://example.com/image.jpg)",
        "![link](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg)",
      ),
      (
        "local image unproxied",
        "![link](http://lemmy-alpha/image.jpg)",
        "![link](http://lemmy-alpha/image.jpg)",
      ),
      (
        "multiple image links",
        "![link](http://example.com/image1.jpg) ![link](http://example.com/image2.jpg)",
        "![link](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage1.jpg) ![link](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage2.jpg)",
      ),
      (
        "empty link handled",
        "![image]()",
        "![image]()",
      ),
      (
        "empty label handled",
        "![](http://example.com/image.jpg)",
        "![](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg)",
      ),
      (
        "invalid image link removed",
        "![image](http-not-a-link)",
        "![image]()",
      ),
      (
        "label with nested markdown handled",
        "![a *b* c](http://example.com/image.jpg)",
        "![a *b* c](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg)",
      ),
      (
        "custom emoji support",
        r#"![party-blob](https://www.hexbear.net/pictrs/image/83405746-0620-4728-9358-5f51b040ffee.gif "emoji party-blob")"#,
        r#"![party-blob](https://lemmy-alpha/api/v3/image_proxy?url=https%3A%2F%2Fwww.hexbear.net%2Fpictrs%2Fimage%2F83405746-0620-4728-9358-5f51b040ffee.gif "emoji party-blob")"#,
      ),
    ];

    for (msg, input, expected) in tests {
      // Only the rewritten markdown is checked here, not the list of collected urls
      let (rewritten, _links) = markdown_rewrite_image_links(input.to_string());
      assert_eq!(
        rewritten, expected,
        "Testing {}, with original input '{}'",
        msg, input
      );
    }
  }

  /// `sanitize_html` escapes `&`, `<`, `"` and `'` but keeps `>`.
  #[test]
  fn test_sanitize_html() {
    let sanitized = sanitize_html("<script>alert('xss');</script> hello &\"'");
    let expected = "&lt;script>alert(&#x27;xss&#x27;);&lt;/script> hello &amp;&quot;&#x27;";
    assert_eq!(expected, sanitized)
  }
}