moviewyrm/bookwyrm/sanitize_html.py

72 lines
2.3 KiB
Python
Raw Normal View History

2021-03-08 16:49:10 +00:00
""" html parser to clean up incoming text from unknown sources """
2020-02-15 05:45:13 +00:00
from html import escape
from html.parser import HTMLParser
2021-03-08 16:49:10 +00:00
class InputHtmlParser(HTMLParser):  # pylint: disable=abstract-method
    """Removes any html that isn't explicitly allowed from a block of text.

    Usage: call ``feed()`` with the untrusted text, then ``close()``, then
    read the cleaned markup from ``get_output()``.

    Only tags in ``allowed_tags`` and attributes in ``allowed_attrs`` are
    kept. If the allowed tags are not properly nested/closed, all markup is
    dropped and only the text content is returned. Text and attribute values
    are always re-escaped, because ``HTMLParser`` hands them over with
    character references already decoded.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.allowed_tags = [
            "p",
            "blockquote",
            "br",
            "b",
            "i",
            "strong",
            "em",
            "pre",
            "a",
            "span",
            "ul",
            "ol",
            "li",
        ]
        self.allowed_attrs = ["href", "rel", "src", "alt"]
        # void elements have no end tag, so they are never put on tag_stack
        self.void_tags = ["br"]
        # attributes whose value is a URL and needs a scheme check
        self.url_attrs = ["href", "src"]
        # URL schemes that can run script when a link is followed or loaded
        self.blocked_schemes = ["javascript", "vbscript", "data"]
        # allowed tags that have been opened and not yet closed
        self.tag_stack = []
        # list of ("tag" | "data", text) tuples in document order
        self.output = []
        # if the html appears invalid, we just won't allow any at all
        self.allow_html = True

    def _is_blocked_url(self, value):
        """True if a url attribute value uses one of the blocked schemes"""
        # browsers ignore whitespace and control characters around and
        # inside the scheme, so drop them before comparing
        compact = "".join(ch for ch in value if ch > " ").lower()
        scheme, colon, _ = compact.partition(":")
        return bool(colon) and scheme in self.blocked_schemes

    def _render_attrs(self, attrs):
        """serialize the permitted attributes of a start tag, escaped"""
        rendered = []
        for name, value in attrs:
            if name not in self.allowed_attrs:
                continue
            if value is None:
                # attribute given without a value, e.g. `<a href>`
                rendered.append(name)
                continue
            if name in self.url_attrs and self._is_blocked_url(value):
                continue
            # the parser already decoded entities in the value; escape it
            # again so a quote can't close the attribute and inject new ones
            rendered.append(f'{name}="{escape(value, quote=True)}"')
        return rendered

    def handle_starttag(self, tag, attrs):
        """keep the start tag if it is allowed, with only safe attributes"""
        if not self.allow_html or tag not in self.allowed_tags:
            self.output.append(("data", ""))
            return

        parts = [tag] + self._render_attrs(attrs)
        self.output.append(("tag", "<" + " ".join(parts) + ">"))
        if tag not in self.void_tags:
            self.tag_stack.append(tag)

    def handle_endtag(self, tag):
        """keep the close tag if it matches the most recent open tag"""
        if not self.allow_html or tag not in self.allowed_tags:
            self.output.append(("data", ""))
            return

        if tag in self.void_tags:
            # `<br/>` is reported as a start tag followed by an end tag; the
            # start tag was already written and a void element has no closer
            self.output.append(("data", ""))
            return

        if not self.tag_stack or self.tag_stack[-1] != tag:
            # the end tag doesn't match the most recent start tag
            self.allow_html = False
            self.output.append(("data", ""))
            return

        self.tag_stack.pop()
        self.output.append(("tag", f"</{tag}>"))

    def handle_data(self, data):
        """keep text content, re-escaped so it can't be read as markup"""
        # character references arrive decoded (convert_charrefs), so
        # `&lt;script&gt;` would otherwise be written out as a real tag
        self.output.append(("data", escape(data, quote=False)))

    def get_output(self):
        """convert the output from a list of tuples to a string"""
        if self.tag_stack:
            # something was opened and never closed: treat it all as invalid
            self.allow_html = False
        if not self.allow_html:
            return "".join(v for (k, v) in self.output if k == "data")
        return "".join(v for (k, v) in self.output)