Introduce a scrubber that strips everything except line breaks

Add more tests for scrub_html_and_truncate/2
This commit is contained in:
Mark Felder 2021-06-11 14:33:13 -05:00
parent 65137044c1
commit baf7fd2142
3 changed files with 34 additions and 1 deletions

View file

@ -6,6 +6,7 @@ defmodule Pleroma.Web.Metadata.Utils do
alias Pleroma.Activity
alias Pleroma.Emoji
alias Pleroma.Formatter
alias Pleroma.HTML
def scrub_html_and_truncate(%{data: %{"content" => content}} = object) do
content
@ -21,8 +22,9 @@ defmodule Pleroma.Web.Metadata.Utils do
# Reduces a binary `content` to truncated text that keeps its line breaks.
#
# Pipeline, in order:
#   1. `Emoji.Formatter.demojify/1` is applied to the raw text.
#   2. `HTML.filter_tags/2` runs with the `Pleroma.HTML.Scrubber.BreaksOnly`
#      policy, so `<br>` tags are still present for step 4.
#   3. `HtmlEntities.decode/1` decodes HTML entities.
#   4. Every `<br>`, `<br/>` or `<br />` becomes the literal text `&#10;&#13;`.
#   5. `Formatter.truncate/2` is called with `max_length` (default 200).
#
# Only binaries are accepted (`is_binary/1` guard).
def scrub_html_and_truncate(content, max_length \\ 200) when is_binary(content) do
  content
  |> Emoji.Formatter.demojify()
  |> HTML.filter_tags(Pleroma.HTML.Scrubber.BreaksOnly)
  |> HtmlEntities.decode()
  # Runs after entity decoding on purpose: the inserted `&#10;&#13;` must reach
  # the caller as-is. A single replace is enough; an earlier `<br>` -> " " pass
  # would leave nothing for this step to match.
  |> String.replace(~r/<br\s?\/?>/, "&#10;&#13;")
  |> Formatter.truncate(max_length)
end

View file

@ -0,0 +1,15 @@
defmodule Pleroma.HTML.Scrubber.BreaksOnly do
@moduledoc """
An HTML scrubbing policy which limits to linebreaks only.
"""
# `require` makes the `Meta` macros available at compile time; the calls below
# run in the module body (presumably each one generates scrubbing rules for
# this policy -- confirm against the FastSanitize documentation).
require FastSanitize.Sanitizer.Meta
alias FastSanitize.Sanitizer.Meta
# Going by the macro name, removes HTML comments from the scrubbed output.
Meta.strip_comments()
# linebreaks only
# `:br` is the single tag allowed by this policy; the empty list means no
# attributes are allowed on it.
Meta.allow_tag_with_these_attributes(:br, [])
# NOTE(review): looks like the catch-all for anything not allowed above, so it
# should stay the last rule in the module -- confirm before reordering.
Meta.strip_everything_not_covered()
end

View file

@ -30,5 +30,21 @@ defmodule Pleroma.Web.Metadata.UtilsTest do
test "it truncates to specified chars (binaries)" do
  # A 10-character limit cuts the sentence down to "Pleroma" plus an ellipsis.
  truncated = Utils.scrub_html_and_truncate("Pleroma's really cool!", 10)

  assert truncated == "Pleroma..."
end
# Newlines have to survive scrubbing so that push notifications and link
# previews can display them.
test "it replaces <br> with compatible HTML entity (binaries)" do
  scrubbed = Utils.scrub_html_and_truncate("First line<br>Second line")

  assert scrubbed == "First line&#10;&#13;Second line"
end
test "it strips emojis (binaries)" do
  # The trailing shortcode (and the space before it) must be gone afterwards.
  text = "Open the door get on the floor everybody walk the dinosaur :dinosaur:"
  scrubbed = Utils.scrub_html_and_truncate(text)

  assert scrubbed == "Open the door get on the floor everybody walk the dinosaur"
end
test "it strips HTML tags and other entities (binaries)" do
  # Tags are dropped while their text is kept; `&#33;` is decoded to "!".
  markup = "<title>my title</title> <p>and a paragraph&#33;</p>"

  assert Utils.scrub_html_and_truncate(markup) == "my title and a paragraph!"
end
end
end