mirror of
https://github.com/LemmyNet/lemmy.git
synced 2024-05-15 00:12:38 +00:00
19a1a077c5
* Add a blocklist for URLs. * Fix SQL format * Make clippy happy. * Use regex for URL matching. * Escape regex chars in URLs. * Use post for modification. * Make URL block regex static and remove API routes. * Add date fields to table and use transaction. * Use Cache for blocklist. * Rename check_links + move list to parameters of process_markdown. * SQL format. * Format, again. * Remove println. * Add API test. * Set a shorter lifetime for regex in debug mode. * Add missing macro. * Update lemmy-js-client * Update api_test/pnpm-lock.yaml * Don't break other tests * Use different URL for test --------- Co-authored-by: Dessalines <dessalines@users.noreply.github.com> Co-authored-by: Nutomic <me@nutomic.com>
318 lines
10 KiB
Rust
318 lines
10 KiB
Rust
use crate::{error::LemmyResult, settings::SETTINGS, LemmyErrorType};
|
||
use markdown_it::{plugins::cmark::inline::image::Image, MarkdownIt};
|
||
use once_cell::sync::Lazy;
|
||
use regex::RegexSet;
|
||
use url::Url;
|
||
use urlencoding::encode;
|
||
|
||
mod link_rule;
|
||
mod spoiler_rule;
|
||
|
||
static MARKDOWN_PARSER: Lazy<MarkdownIt> = Lazy::new(|| {
|
||
let mut parser = MarkdownIt::new();
|
||
markdown_it::plugins::cmark::add(&mut parser);
|
||
markdown_it::plugins::extra::add(&mut parser);
|
||
spoiler_rule::add(&mut parser);
|
||
link_rule::add(&mut parser);
|
||
|
||
parser
|
||
});
|
||
|
||
/// Replace special HTML characters in API parameters to prevent XSS attacks.
///
/// Taken from https://github.com/OWASP/CheatSheetSeries/blob/master/cheatsheets/Cross_Site_Scripting_Prevention_Cheat_Sheet.md#output-encoding-for-html-contexts
///
/// The following characters are replaced by their HTML entity:
///
/// | char | entity   |
/// |------|----------|
/// | `&`  | `&amp;`  |
/// | `<`  | `&lt;`   |
/// | `"`  | `&quot;` |
/// | `'`  | `&#x27;` |
///
/// `>` is left in place because it is interpreted as markdown quote.
///
/// Returns a newly allocated string; the input is not modified.
pub fn sanitize_html(text: &str) -> String {
  // Single pass over the input: every source character is inspected exactly once, so the
  // `&` introduced by an entity can never be escaped a second time.
  let mut escaped = String::with_capacity(text.len());
  for c in text.chars() {
    match c {
      '&' => escaped.push_str("&amp;"),
      '<' => escaped.push_str("&lt;"),
      '"' => escaped.push_str("&quot;"),
      '\'' => escaped.push_str("&#x27;"),
      other => escaped.push(other),
    }
  }
  escaped
}
|
||
|
||
pub fn markdown_to_html(text: &str) -> String {
|
||
MARKDOWN_PARSER.parse(text).xrender()
|
||
}
|
||
|
||
/// Rewrites all links to remote domains in markdown, so they go through `/api/v3/image_proxy`.
|
||
pub fn markdown_rewrite_image_links(mut src: String) -> (String, Vec<Url>) {
|
||
let ast = MARKDOWN_PARSER.parse(&src);
|
||
let mut links_offsets = vec![];
|
||
|
||
// Walk the syntax tree to find positions of image links
|
||
ast.walk(|node, _depth| {
|
||
if let Some(image) = node.cast::<Image>() {
|
||
// srcmap is always present for image
|
||
// https://github.com/markdown-it-rust/markdown-it/issues/36#issuecomment-1777844387
|
||
let node_offsets = node.srcmap.expect("srcmap is none").get_byte_offsets();
|
||
// necessary for custom emojis which look like `![name](url "title")`
|
||
let start_offset = node_offsets.1
|
||
- image.url.len()
|
||
- 1
|
||
- image
|
||
.title
|
||
.as_ref()
|
||
.map(|t| t.len() + 3)
|
||
.unwrap_or_default();
|
||
let end_offset = node_offsets.1 - 1;
|
||
|
||
links_offsets.push((start_offset, end_offset));
|
||
}
|
||
});
|
||
|
||
let mut links = vec![];
|
||
// Go through the collected links in reverse order
|
||
while let Some((start, end)) = links_offsets.pop() {
|
||
let content = src.get(start..end).unwrap_or_default();
|
||
// necessary for custom emojis which look like `![name](url "title")`
|
||
let (url, extra) = if content.contains(' ') {
|
||
let split = content.split_once(' ').expect("split is valid");
|
||
(split.0, Some(split.1))
|
||
} else {
|
||
(content, None)
|
||
};
|
||
match Url::parse(url) {
|
||
Ok(parsed) => {
|
||
links.push(parsed.clone());
|
||
// If link points to remote domain, replace with proxied link
|
||
if parsed.domain() != Some(&SETTINGS.hostname) {
|
||
let mut proxied = format!(
|
||
"{}/api/v3/image_proxy?url={}",
|
||
SETTINGS.get_protocol_and_hostname(),
|
||
encode(url),
|
||
);
|
||
// restore custom emoji format
|
||
if let Some(extra) = extra {
|
||
proxied = format!("{proxied} {extra}");
|
||
}
|
||
src.replace_range(start..end, &proxied);
|
||
}
|
||
}
|
||
Err(_) => {
|
||
// If its not a valid url, replace with empty text
|
||
src.replace_range(start..end, "");
|
||
}
|
||
}
|
||
}
|
||
|
||
(src, links)
|
||
}
|
||
|
||
pub fn markdown_check_for_blocked_urls(text: &str, blocklist: &RegexSet) -> LemmyResult<()> {
|
||
if blocklist.is_match(text) {
|
||
Err(LemmyErrorType::BlockedUrl)?
|
||
}
|
||
Ok(())
|
||
}
|
||
|
||
#[cfg(test)]
mod tests {
  // Test code may unwrap freely: a panic simply fails the test.
  #![allow(clippy::unwrap_used)]
  #![allow(clippy::indexing_slicing)]

  use super::*;
  use pretty_assertions::assert_eq;

  /// Renders a table of markdown snippets and compares each with its expected HTML.
  #[test]
  fn test_basic_markdown() {
    // (description, markdown input, expected html)
    let tests: Vec<_> = vec![
      (
        "headings",
        "# h1\n## h2\n### h3\n#### h4\n##### h5\n###### h6",
        "<h1>h1</h1>\n<h2>h2</h2>\n<h3>h3</h3>\n<h4>h4</h4>\n<h5>h5</h5>\n<h6>h6</h6>\n",
      ),
      (
        "line breaks",
        "First\rSecond",
        "<p>First\nSecond</p>\n",
      ),
      (
        "emphasis",
        "__bold__ **bold** *italic* ***bold+italic***",
        "<p><strong>bold</strong> <strong>bold</strong> <em>italic</em> <em><strong>bold+italic</strong></em></p>\n",
      ),
      (
        "blockquotes",
        "> #### Hello\n > \n > - Hola\n > - 안영 \n>> Goodbye\n",
        "<blockquote>\n<h4>Hello</h4>\n<ul>\n<li>Hola</li>\n<li>안영</li>\n</ul>\n<blockquote>\n<p>Goodbye</p>\n</blockquote>\n</blockquote>\n",
      ),
      (
        "lists (ordered, unordered)",
        "1. pen\n2. apple\n3. apple pen\n- pen\n- pineapple\n- pineapple pen",
        "<ol>\n<li>pen</li>\n<li>apple</li>\n<li>apple pen</li>\n</ol>\n<ul>\n<li>pen</li>\n<li>pineapple</li>\n<li>pineapple pen</li>\n</ul>\n",
      ),
      (
        "code and code blocks",
        "this is my amazing `code snippet` and my amazing ```code block```",
        "<p>this is my amazing <code>code snippet</code> and my amazing <code>code block</code></p>\n",
      ),
      // Links with added nofollow attribute
      (
        "links",
        "[Lemmy](https://join-lemmy.org/ \"Join Lemmy!\")",
        "<p><a href=\"https://join-lemmy.org/\" rel=\"nofollow\" title=\"Join Lemmy!\">Lemmy</a></p>\n",
      ),
      // Remote images with proxy
      (
        "images",
        "![My linked image](https://example.com/image.png \"image alt text\")",
        "<p><img src=\"https://example.com/image.png\" alt=\"My linked image\" title=\"image alt text\" /></p>\n",
      ),
      // Local images without proxy
      (
        "images",
        "![My linked image](https://lemmy-alpha/image.png \"image alt text\")",
        "<p><img src=\"https://lemmy-alpha/image.png\" alt=\"My linked image\" title=\"image alt text\" /></p>\n",
      ),
      // Ensure spoiler plugin is added
      (
        "basic spoiler",
        "::: spoiler click to see more\nhow spicy!\n:::\n",
        "<details><summary>click to see more</summary><p>how spicy!\n</p></details>\n",
      ),
      // Raw HTML in the input must come out entity-escaped, never as live markup
      (
        "escape html special chars",
        "<script>alert('xss');</script> hello &\"",
        "<p>&lt;script&gt;alert(‘xss’);&lt;/script&gt; hello &amp;&quot;</p>\n",
      ),
    ];

    for (msg, input, expected) in tests {
      let result = markdown_to_html(input);

      assert_eq!(
        result, expected,
        "Testing {}, with original input '{}'",
        msg, input
      );
    }
  }

  /// Checks which image links are rewritten to go through the image proxy.
  #[test]
  fn test_markdown_proxy_images() {
    // (description, markdown input, expected markdown output)
    let tests: Vec<_> = vec![
      (
        "remote image proxied",
        "![link](http://example.com/image.jpg)",
        "![link](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg)",
      ),
      (
        "local image unproxied",
        "![link](http://lemmy-alpha/image.jpg)",
        "![link](http://lemmy-alpha/image.jpg)",
      ),
      (
        "multiple image links",
        "![link](http://example.com/image1.jpg) ![link](http://example.com/image2.jpg)",
        "![link](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage1.jpg) ![link](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage2.jpg)",
      ),
      (
        "empty link handled",
        "![image]()",
        "![image]()",
      ),
      (
        "empty label handled",
        "![](http://example.com/image.jpg)",
        "![](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg)",
      ),
      (
        "invalid image link removed",
        "![image](http-not-a-link)",
        "![image]()",
      ),
      (
        "label with nested markdown handled",
        "![a *b* c](http://example.com/image.jpg)",
        "![a *b* c](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg)",
      ),
      (
        "custom emoji support",
        r#"![party-blob](https://www.hexbear.net/pictrs/image/83405746-0620-4728-9358-5f51b040ffee.gif "emoji party-blob")"#,
        r#"![party-blob](https://lemmy-alpha/api/v3/image_proxy?url=https%3A%2F%2Fwww.hexbear.net%2Fpictrs%2Fimage%2F83405746-0620-4728-9358-5f51b040ffee.gif "emoji party-blob")"#,
      ),
    ];

    for (msg, input, expected) in tests {
      let result = markdown_rewrite_image_links(input.to_string());

      assert_eq!(
        result.0, expected,
        "Testing {}, with original input '{}'",
        msg, input
      );
    }
  }

  /// Exercises the blocklist check with single patterns, multiple patterns and near misses.
  #[test]
  fn test_url_blocking() {
    let set = RegexSet::new(vec![r"(https://)?example\.com/?"]).unwrap();

    assert!(markdown_check_for_blocked_urls("[](https://example.com)", &set).is_err());

    assert!(
      markdown_check_for_blocked_urls("Go to https://example.com to get free Robux", &set).is_err()
    );

    assert!(markdown_check_for_blocked_urls("[](https://example.blog)", &set).is_ok());

    assert!(markdown_check_for_blocked_urls("example.com", &set).is_err());

    // Blocked URL buried in the middle of a multi-line text
    assert!(markdown_check_for_blocked_urls(
      "Odio exercitationem culpa sed sunt\n\
       et. Sit et similique tempora deserunt doloremque. Cupiditate iusto\n\
       repellat et quis qui. Cum veritatis facere quasi repellendus sunt\n\
       eveniet nemo sint. Cumque sit unde est. https://example.com Alias\n\
       repellendus at quos.",
      &set
    )
    .is_err());

    let set = RegexSet::new(vec![r"(https://)?example\.com/spam\.jpg"]).unwrap();
    assert!(markdown_check_for_blocked_urls("![](https://example.com/spam.jpg)", &set).is_err());

    let set = RegexSet::new(vec![
      r"(https://)?quo\.example\.com/?",
      r"(https://)?foo\.example\.com/?",
      r"(https://)?bar\.example\.com/?",
    ])
    .unwrap();

    assert!(markdown_check_for_blocked_urls("https://baz.example.com", &set).is_ok());

    assert!(markdown_check_for_blocked_urls("https://bar.example.com", &set).is_err());

    let set = RegexSet::new(vec![r"(https://)?example\.com/banned_page"]).unwrap();

    assert!(markdown_check_for_blocked_urls("https://example.com/page", &set).is_ok());

    // An escaped `.` in the pattern must only match a literal dot
    let set = RegexSet::new(vec![r"(https://)?ex\.mple\.com/?"]).unwrap();

    assert!(markdown_check_for_blocked_urls("example.com", &set).is_ok());
  }

  /// `&`, `<`, `"` and `'` are entity-escaped; `>` is left alone.
  #[test]
  fn test_sanitize_html() {
    let sanitized = sanitize_html("<script>alert('xss');</script> hello &\"'");
    let expected = "&lt;script>alert(&#x27;xss&#x27;);&lt;/script> hello &amp;&quot;&#x27;";
    assert_eq!(expected, sanitized)
  }
}
|