2024-03-15 11:03:29 +00:00
use crate ::{ error ::LemmyResult , settings ::SETTINGS , LemmyErrorType } ;
2024-01-25 14:22:11 +00:00
use markdown_it ::{ plugins ::cmark ::inline ::image ::Image , MarkdownIt } ;
use once_cell ::sync ::Lazy ;
2024-03-15 11:03:29 +00:00
use regex ::RegexSet ;
2024-01-25 14:22:11 +00:00
use url ::Url ;
use urlencoding ::encode ;
mod link_rule ;
mod spoiler_rule ;
/// Shared markdown parser, constructed once on first access.
///
/// Registers the `cmark` and `extra` plugin sets from `markdown_it`, followed by this module's
/// own `spoiler_rule` and `link_rule`, in that order.
static MARKDOWN_PARSER: Lazy<MarkdownIt> = Lazy::new(|| {
  let mut md = MarkdownIt::new();

  // Plugin sets shipped with the markdown_it crate.
  markdown_it::plugins::cmark::add(&mut md);
  markdown_it::plugins::extra::add(&mut md);

  // Rules defined in the submodules of this file.
  spoiler_rule::add(&mut md);
  link_rule::add(&mut md);

  md
});
/// Replace special HTML characters in API parameters to prevent XSS attacks.
///
/// Taken from https://github.com/OWASP/CheatSheetSeries/blob/master/cheatsheets/Cross_Site_Scripting_Prevention_Cheat_Sheet.md#output-encoding-for-html-contexts
///
/// `>` is left in place because it is interpreted as markdown quote.
///
/// The mapping applied is `&` -> `&amp;`, `<` -> `&lt;`, `"` -> `&quot;` and `'` -> `&#x27;`.
/// Every input character is visited exactly once, so the `&` of an emitted entity is never
/// escaped a second time.
pub fn sanitize_html(text: &str) -> String {
  // The output is at least as long as the input; reserve that much up front.
  let mut escaped = String::with_capacity(text.len());
  for c in text.chars() {
    match c {
      '&' => escaped.push_str("&amp;"),
      '<' => escaped.push_str("&lt;"),
      '"' => escaped.push_str("&quot;"),
      '\'' => escaped.push_str("&#x27;"),
      _ => escaped.push(c),
    }
  }
  escaped
}
/// Renders markdown `text` to an HTML string.
///
/// The text is parsed with the shared `MARKDOWN_PARSER` and the resulting syntax tree is
/// serialized with `xrender`.
pub fn markdown_to_html(text: &str) -> String {
  let ast = MARKDOWN_PARSER.parse(text);
  ast.xrender()
}
/// Rewrites all links to remote domains in markdown, so they go through `/api/v3/image_proxy`.
pub fn markdown_rewrite_image_links ( mut src : String ) -> ( String , Vec < Url > ) {
let ast = MARKDOWN_PARSER . parse ( & src ) ;
let mut links_offsets = vec! [ ] ;
// Walk the syntax tree to find positions of image links
ast . walk ( | node , _depth | {
if let Some ( image ) = node . cast ::< Image > ( ) {
// srcmap is always present for image
// https://github.com/markdown-it-rust/markdown-it/issues/36#issuecomment-1777844387
let node_offsets = node . srcmap . expect ( " srcmap is none " ) . get_byte_offsets ( ) ;
// necessary for custom emojis which look like `![name](url "title")`
let start_offset = node_offsets . 1
- image . url . len ( )
- 1
- image
. title
. as_ref ( )
. map ( | t | t . len ( ) + 3 )
. unwrap_or_default ( ) ;
let end_offset = node_offsets . 1 - 1 ;
links_offsets . push ( ( start_offset , end_offset ) ) ;
}
} ) ;
let mut links = vec! [ ] ;
// Go through the collected links in reverse order
while let Some ( ( start , end ) ) = links_offsets . pop ( ) {
let content = src . get ( start .. end ) . unwrap_or_default ( ) ;
// necessary for custom emojis which look like `![name](url "title")`
let ( url , extra ) = if content . contains ( ' ' ) {
let split = content . split_once ( ' ' ) . expect ( " split is valid " ) ;
( split . 0 , Some ( split . 1 ) )
} else {
( content , None )
} ;
match Url ::parse ( url ) {
Ok ( parsed ) = > {
links . push ( parsed . clone ( ) ) ;
// If link points to remote domain, replace with proxied link
if parsed . domain ( ) ! = Some ( & SETTINGS . hostname ) {
let mut proxied = format! (
" {}/api/v3/image_proxy?url={} " ,
SETTINGS . get_protocol_and_hostname ( ) ,
encode ( url ) ,
) ;
// restore custom emoji format
if let Some ( extra ) = extra {
proxied = format! ( " {proxied} {extra} " ) ;
}
src . replace_range ( start .. end , & proxied ) ;
}
}
Err ( _ ) = > {
// If its not a valid url, replace with empty text
src . replace_range ( start .. end , " " ) ;
}
}
}
( src , links )
}
2024-03-15 11:03:29 +00:00
/// Checks `text` against the url `blocklist`.
///
/// # Errors
///
/// Returns `LemmyErrorType::BlockedUrl` when at least one pattern of `blocklist` matches
/// `text`; otherwise returns `Ok(())`.
pub fn markdown_check_for_blocked_urls(text: &str, blocklist: &RegexSet) -> LemmyResult<()> {
  let contains_blocked_url = blocklist.is_match(text);
  if contains_blocked_url {
    Err(LemmyErrorType::BlockedUrl.into())
  } else {
    Ok(())
  }
}
2024-01-25 14:22:11 +00:00
#[ cfg(test) ]
2024-03-26 09:17:42 +00:00
#[ allow(clippy::unwrap_used) ]
#[ allow(clippy::indexing_slicing) ]
2024-01-25 14:22:11 +00:00
mod tests {
use super ::* ;
use pretty_assertions ::assert_eq ;
/// Renders a table of markdown snippets with `markdown_to_html` and compares each result with
/// the expected HTML.
#[test]
fn test_basic_markdown() {
  // Each entry is (description, markdown input, expected html).
  let tests: Vec<_> = vec![
    (
      "headings",
      "# h1\n## h2\n### h3\n#### h4\n##### h5\n###### h6",
      "<h1>h1</h1>\n<h2>h2</h2>\n<h3>h3</h3>\n<h4>h4</h4>\n<h5>h5</h5>\n<h6>h6</h6>\n",
    ),
    (
      "line breaks",
      "First\rSecond",
      "<p>First\nSecond</p>\n",
    ),
    (
      "emphasis",
      "__bold__ **bold** *italic* ***bold+italic***",
      "<p><strong>bold</strong> <strong>bold</strong> <em>italic</em> <em><strong>bold+italic</strong></em></p>\n",
    ),
    (
      "blockquotes",
      "> #### Hello\n>\n> - Hola\n> - 안영\n>> Goodbye\n",
      "<blockquote>\n<h4>Hello</h4>\n<ul>\n<li>Hola</li>\n<li>안영</li>\n</ul>\n<blockquote>\n<p>Goodbye</p>\n</blockquote>\n</blockquote>\n",
    ),
    (
      "lists (ordered, unordered)",
      "1. pen\n2. apple\n3. apple pen\n- pen\n- pineapple\n- pineapple pen",
      "<ol>\n<li>pen</li>\n<li>apple</li>\n<li>apple pen</li>\n</ol>\n<ul>\n<li>pen</li>\n<li>pineapple</li>\n<li>pineapple pen</li>\n</ul>\n",
    ),
    (
      "code and code blocks",
      "this is my amazing `code snippet` and my amazing ```code block```",
      "<p>this is my amazing <code>code snippet</code> and my amazing <code>code block</code></p>\n",
    ),
    // Links with added nofollow attribute
    (
      "links",
      "[Lemmy](https://join-lemmy.org/ \"Join Lemmy!\")",
      "<p><a href=\"https://join-lemmy.org/\" rel=\"nofollow\" title=\"Join Lemmy!\">Lemmy</a></p>\n",
    ),
    // Remote images with proxy
    (
      "images",
      "![My linked image](https://example.com/image.png \"image alt text\")",
      "<p><img src=\"https://example.com/image.png\" alt=\"My linked image\" title=\"image alt text\" /></p>\n",
    ),
    // Local images without proxy
    (
      "images",
      "![My linked image](https://lemmy-alpha/image.png \"image alt text\")",
      "<p><img src=\"https://lemmy-alpha/image.png\" alt=\"My linked image\" title=\"image alt text\" /></p>\n",
    ),
    // Ensure spoiler plugin is added
    (
      "basic spoiler",
      "::: spoiler click to see more\nhow spicy!\n:::\n",
      "<details><summary>click to see more</summary><p>how spicy!\n</p></details>\n",
    ),
    // Raw html in the input must come out entity-escaped, never as live markup
    (
      "escape html special chars",
      "<script>alert('xss');</script> hello &\"",
      "<p>&lt;script&gt;alert(‘xss’);&lt;/script&gt; hello &amp;&quot;</p>\n",
    ),
  ];

  for (msg, input, expected) in tests {
    let result = markdown_to_html(input);
    assert_eq!(
      result, expected,
      "Testing {}, with original input '{}'",
      msg, input
    );
  }
}
/// Runs `markdown_rewrite_image_links` over a table of markdown snippets and compares the
/// rewritten markdown (the first element of the returned tuple) with the expected text.
#[test]
fn test_markdown_proxy_images() {
  // Each entry is (description, markdown input, expected rewritten markdown).
  let tests: Vec<_> = vec![
    (
      "remote image proxied",
      "![link](http://example.com/image.jpg)",
      "![link](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg)",
    ),
    (
      "local image unproxied",
      "![link](http://lemmy-alpha/image.jpg)",
      "![link](http://lemmy-alpha/image.jpg)",
    ),
    (
      "multiple image links",
      "![link](http://example.com/image1.jpg) ![link](http://example.com/image2.jpg)",
      "![link](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage1.jpg) ![link](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage2.jpg)",
    ),
    (
      "empty link handled",
      "![image]()",
      "![image]()",
    ),
    (
      "empty label handled",
      "![](http://example.com/image.jpg)",
      "![](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg)",
    ),
    (
      "invalid image link removed",
      "![image](http-not-a-link)",
      "![image]()",
    ),
    (
      "label with nested markdown handled",
      "![a *b* c](http://example.com/image.jpg)",
      "![a *b* c](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg)",
    ),
    // Raw strings so the quoted emoji title needs no escaping
    (
      "custom emoji support",
      r#"![party-blob](https://www.hexbear.net/pictrs/image/83405746-0620-4728-9358-5f51b040ffee.gif "emoji party-blob")"#,
      r#"![party-blob](https://lemmy-alpha/api/v3/image_proxy?url=https%3A%2F%2Fwww.hexbear.net%2Fpictrs%2Fimage%2F83405746-0620-4728-9358-5f51b040ffee.gif "emoji party-blob")"#,
    ),
  ];

  for (msg, input, expected) in tests {
    let (rewritten, _links) = markdown_rewrite_image_links(input.to_string());
    assert_eq!(
      rewritten, expected,
      "Testing {}, with original input '{}'",
      msg, input
    );
  }
}
2024-03-15 11:03:29 +00:00
/// Exercises `markdown_check_for_blocked_urls` with several blocklists: a bare domain, a
/// specific file, multiple subdomains, a specific page, and a pattern with an escaped dot.
#[test]
fn test_url_blocking() {
  // Bare domain, with optional scheme and trailing slash
  let set = RegexSet::new(vec![r"(https://)?example\.com/?"]).unwrap();

  assert!(markdown_check_for_blocked_urls("[](https://example.com)", &set).is_err());

  assert!(markdown_check_for_blocked_urls(
    "Go to https://example.com to get free Robux",
    &set
  )
  .is_err());

  assert!(markdown_check_for_blocked_urls("[](https://example.blog)", &set).is_ok());

  assert!(markdown_check_for_blocked_urls("example.com", &set).is_err());

  // A blocked url buried in a longer, multi-line text is still detected
  assert!(markdown_check_for_blocked_urls(
    "Odio exercitationem culpa sed sunt
      et. Sit et similique tempora deserunt doloremque. Cupiditate iusto
      repellat et quis qui. Cum veritatis facere quasi repellendus sunt
      eveniet nemo sint. Cumque sit unde est. https://example.com Alias
      repellendus at quos.",
    &set
  )
  .is_err());

  // Specific file on a domain
  let set = RegexSet::new(vec![r"(https://)?example\.com/spam\.jpg"]).unwrap();
  assert!(markdown_check_for_blocked_urls("![](https://example.com/spam.jpg)", &set).is_err());

  // Several subdomains; only the listed ones are blocked
  let set = RegexSet::new(vec![
    r"(https://)?quo\.example\.com/?",
    r"(https://)?foo\.example\.com/?",
    r"(https://)?bar\.example\.com/?",
  ])
  .unwrap();

  assert!(markdown_check_for_blocked_urls("https://baz.example.com", &set).is_ok());

  assert!(markdown_check_for_blocked_urls("https://bar.example.com", &set).is_err());

  // Blocking one page must not block other pages of the same domain
  let set = RegexSet::new(vec![r"(https://)?example\.com/banned_page"]).unwrap();

  assert!(markdown_check_for_blocked_urls("https://example.com/page", &set).is_ok());

  // `\.` only matches a literal dot, so `ex\.mple` does not match `example`
  let set = RegexSet::new(vec![r"(https://)?ex\.mple\.com/?"]).unwrap();

  assert!(markdown_check_for_blocked_urls("example.com", &set).is_ok());
}
2024-01-25 14:22:11 +00:00
/// `sanitize_html` must entity-escape `&`, `<`, `"` and `'` while leaving `>` untouched.
#[test]
fn test_sanitize_html() {
  let sanitized = sanitize_html("<script>alert('xss');</script> hello &\"'");
  let expected = "&lt;script>alert(&#x27;xss&#x27;);&lt;/script> hello &amp;&quot;&#x27;";
  assert_eq!(expected, sanitized);
}
}