Move to a new HTML parser/stripper

This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place.
2024-06-02 21:39:28 +00:00 · 2023-01-29 17:46:22 -07:00 · 2023-01-29 17:46:22 -07:00 · a6922cb9d6
parent 93c0af992b
commit a6922cb9d6
14 changed files with 503 additions and 562 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -48,13 +48,7 @@ repos:
      - id: mypy
        exclude: "^tests/"
        additional_dependencies:
-          [
-            types-pyopenssl,
-            types-bleach,
-            types-mock,
-            types-cachetools,
-            types-python-dateutil,
-          ]
+          [types-pyopenssl, types-mock, types-cachetools, types-python-dateutil]

  - repo: https://github.com/rtts/djhtml
    rev: v1.5.2
--- a/activities/admin.py
+++ b/activities/admin.py
@ -1,4 +1,3 @@
-from asgiref.sync import async_to_sync
 from django.contrib import admin
 from django.db import models
 from django.utils.safestring import mark_safe
@ -165,7 +164,6 @@ class PostAdmin(admin.ModelAdmin):
    list_filter = ("type", "local", "visibility", "state", "created")
    raw_id_fields = ["emojis"]
    autocomplete_fields = ["to", "mentions", "author"]
-    actions = ["reparse_hashtags"]
    search_fields = ["content", "search_handle", "search_service_handle"]
    inlines = [PostAttachmentInline]
    readonly_fields = ["created", "updated", "state_changed", "object_json"]
@ -183,13 +181,6 @@ class PostAdmin(admin.ModelAdmin):
        )
        return super().get_search_results(request, queryset, search_term)

-    @admin.action(description="Reprocess content for hashtags")
-    def reparse_hashtags(self, request, queryset):
-        for instance in queryset:
-            instance.hashtags = Hashtag.hashtags_from_content(instance.content) or None
-            instance.save()
-            async_to_sync(instance.ensure_hashtags)()
-
    @admin.display(description="ActivityPub JSON")
    def object_json(self, instance):
        return instance.to_ap()
--- a/activities/models/emoji.py
+++ b/activities/models/emoji.py
@ -1,5 +1,4 @@
 import mimetypes
-import re
 from functools import partial
 from typing import ClassVar

@ -14,7 +13,7 @@ from django.db import models
 from django.utils.safestring import mark_safe

 from core.files import get_remote_file
-from core.html import strip_html
+from core.html import FediverseHtmlParser
 from core.ld import format_ld_date
 from core.models import Config
 from core.uploads import upload_emoji_namer
@ -134,8 +133,6 @@ class Emoji(StatorModel):
        admin_disable = "{admin}{self.pk}/disable/"
        admin_copy = "{admin}{self.pk}/copy/"

-    emoji_regex = re.compile(r"\B:([a-zA-Z0-9(_)-]+):\B")
-
    def delete(self, using=None, keep_parents=False):
        if self.file:
            self.file.delete()
@ -242,7 +239,9 @@ class Emoji(StatorModel):
        Return a parsed and sanitized of emoji found in content without
        the surrounding ':'.
        """
-        emoji_hits = cls.emoji_regex.findall(strip_html(content))
+        emoji_hits = FediverseHtmlParser(
+            content, find_emojis=True, emoji_domain=domain
+        ).emojis
        emojis = sorted({emoji.lower() for emoji in emoji_hits})
        return list(
            cls.objects.filter(local=(domain is None) or domain.local)
--- a/activities/models/hashtag.py
+++ b/activities/models/hashtag.py
@ -6,7 +6,6 @@ from asgiref.sync import sync_to_async
 from django.db import models
 from django.utils import timezone

-from core.html import strip_html
 from core.models import Config
 from stator.models import State, StateField, StateGraph, StatorModel

@ -167,16 +166,6 @@ class Hashtag(StatorModel):
                results[date(year, month, day)] = val
        return dict(sorted(results.items(), reverse=True)[:num])

-    @classmethod
-    def hashtags_from_content(cls, content) -> list[str]:
-        """
-        Return a parsed and sanitized of hashtags found in content without
-        leading '#'.
-        """
-        hashtag_hits = cls.hashtag_regex.findall(strip_html(content))
-        hashtags = sorted({tag.lower() for tag in hashtag_hits})
-        return list(hashtags)
-
    def to_mastodon_json(self):
        return {
            "name": self.hashtag,
--- a/activities/models/post.py
+++ b/activities/models/post.py
@ -2,7 +2,6 @@ import datetime
 import hashlib
 import json
 import mimetypes
-import re
 import ssl
 from collections.abc import Iterable
 from typing import Optional
@ -26,7 +25,7 @@ from activities.models.post_types import (
    PostTypeDataEncoder,
 )
 from core.exceptions import capture_message
-from core.html import ContentRenderer, strip_html
+from core.html import ContentRenderer, FediverseHtmlParser
 from core.ld import (
    canonicalise,
    format_ld_date,
@ -374,10 +373,6 @@ class Post(StatorModel):
    def clean_type_data(self, value):
        PostTypeData.parse_obj(value)

-    mention_regex = re.compile(
-        r"(^|[^\w\d\-_/])@([\w\d\-_]+(?:@[\w\d\-_\.]+[\w\d\-_]+)?)"
-    )
-
    def _safe_content_note(self, *, local: bool = True):
        return ContentRenderer(local=local).render_post(self.content, self)

@ -474,12 +469,12 @@ class Post(StatorModel):
                # Maintain local-only for replies
                if reply_to.visibility == reply_to.Visibilities.local_only:
                    visibility = reply_to.Visibilities.local_only
-            # Find hashtags in this post
-            hashtags = Hashtag.hashtags_from_content(content) or None
            # Find emoji in this post
            emojis = Emoji.emojis_from_content(content, None)
-            # Strip all HTML and apply linebreaks filter
-            content = linebreaks_filter(strip_html(content))
+            # Strip all unwanted HTML and apply linebreaks filter, grabbing hashtags on the way
+            parser = FediverseHtmlParser(linebreaks_filter(content), find_hashtags=True)
+            content = parser.html
+            hashtags = sorted(parser.hashtags) or None
            # Make the Post object
            post = cls.objects.create(
                author=author,
@ -512,12 +507,13 @@ class Post(StatorModel):
    ):
        with transaction.atomic():
            # Strip all HTML and apply linebreaks filter
-            self.content = linebreaks_filter(strip_html(content))
+            parser = FediverseHtmlParser(linebreaks_filter(content))
+            self.content = parser.html
+            self.hashtags = sorted(parser.hashtags) or None
            self.summary = summary or None
            self.sensitive = bool(summary)
            self.visibility = visibility
            self.edited = timezone.now()
-            self.hashtags = Hashtag.hashtags_from_content(content) or None
            self.mentions.set(self.mentions_from_content(content, self.author))
            self.emojis.set(Emoji.emojis_from_content(content, None))
            self.attachments.set(attachments or [])
@ -525,9 +521,9 @@ class Post(StatorModel):

    @classmethod
    def mentions_from_content(cls, content, author) -> set[Identity]:
-        mention_hits = cls.mention_regex.findall(content)
+        mention_hits = FediverseHtmlParser(content, find_mentions=True).mentions
        mentions = set()
-        for precursor, handle in mention_hits:
+        for handle in mention_hits:
            handle = handle.lower()
            if "@" in handle:
                username, domain = handle.split("@", 1)
--- a/activities/views/compose.py
+++ b/activities/views/compose.py
@ -14,7 +14,7 @@ from activities.models import (
    TimelineEvent,
 )
 from core.files import blurhash_image, resize_image
-from core.html import html_to_plaintext
+from core.html import FediverseHtmlParser
 from core.models import Config
 from users.decorators import identity_required

@ -112,7 +112,7 @@ class Compose(FormView):
                {
                    "reply_to": self.reply_to.pk if self.reply_to else "",
                    "visibility": self.post_obj.visibility,
-                    "text": html_to_plaintext(self.post_obj.content),
+                    "text": FediverseHtmlParser(self.post_obj.content).plain_text,
                    "content_warning": self.post_obj.summary,
                }
            )
--- a/core/html.py
+++ b/core/html.py
@ -1,199 +1,309 @@
+import html
 import re
-from functools import partial
+from html.parser import HTMLParser

-import bleach
-import bleach.callbacks
-from bleach.html5lib_shim import Filter
-from bleach.linkifier import LinkifyFilter
 from django.utils.safestring import mark_safe

-url_regex = re.compile(
-    r"""\(*  # Match any opening parentheses.
-    \b(?<![@.])(?:https?://(?:(?:\w+:)?\w+@)?)  # http://
-    ([\w-]+\.)+(?:[\w-]+)(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
-    (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
+
+class FediverseHtmlParser(HTMLParser):
+    """
+    A custom HTML parser that only allows a certain tag subset and behaviour:
+    - br, p tags are passed through
+    - a tags are passed through if they're not hashtags or mentions
+    - Other block tags (headings, blockquote, pre, ul, ol) are rewritten to p, and li to br
+
+    It also linkifies URLs, mentions, hashtags, and imagifies emoji.
+    """
+
+    REWRITE_TO_P = [
+        "p",
+        "h1",
+        "h2",
+        "h3",
+        "h4",
+        "h5",
+        "h6",
+        "blockquote",
+        "pre",
+        "ul",
+        "ol",
+    ]
+
+    REWRITE_TO_BR = [
+        "br",
+        "li",
+    ]
+
+    MENTION_REGEX = re.compile(
+        r"(^|[^\w\d\-_/])@([\w\d\-_]+(?:@[\w\d\-_\.]+[\w\d\-_]+)?)"
+    )
+
+    HASHTAG_REGEX = re.compile(r"\B#([a-zA-Z0-9(_)]+\b)(?!;)")
+
+    EMOJI_REGEX = re.compile(r"\B:([a-zA-Z0-9(_)-]+):\B")
+
+    URL_REGEX = re.compile(
+        r"""(\(*  # Match any opening parentheses.
+        \b(?<![@.])(?:https?://(?:(?:\w+:)?\w+@)?)  # http://
+        (?:[\w-]+\.)+(?:[\w-]+)(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
+        (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?)
        # /path/zz (excluding "unsafe" chars from RFC 1738,
        # except for # and ~, which happen in practice)
-    """,
-    re.IGNORECASE | re.VERBOSE | re.UNICODE,
-)
-
-ALLOWED_TAGS = ["br", "p", "a"]
-REWRITTEN_TAGS = [
-    "h1",
-    "h2",
-    "h3",
-    "h4",
-    "h5",
-    "h6",
-    "blockquote",
-    "pre",
-    "ul",
-    "ol",
-    "li",
-]
-
-
-class MastodonStrictTagFilter(Filter):
-    """
-    Implements Python equivalent of Mastodon tag rewriter
-
-    Clone of https://github.com/mastodon/mastodon/blob/main/lib/sanitize_ext/sanitize_config.rb#L55
-
-    Broadly this replaces all REWRITTEN_TAGS with `p` except for lists where it formats it into `<br>` lists
-    """
-
-    def __iter__(self):
-        li_pending_break = False
-        break_token = {
-            "name": "br",
-            "data": {},
-            "type": "StartTag",
-        }
-
-        for token in Filter.__iter__(self):
-            if token.get("name") not in REWRITTEN_TAGS or token["type"] not in [
-                "StartTag",
-                "EndTag",
-            ]:
-                yield token
-                continue
-
-            if token["type"] == "StartTag":
-                if token["name"] == "li":
-                    if li_pending_break:
-                        # Another `li` appeared, so break after the last one
-                        yield break_token
-                    continue
-                token["name"] = "p"
-            elif token["type"] == "EndTag":
-                if token["name"] == "li":
-                    # Track that an `li` closed so we know a break should be considered
-                    li_pending_break = True
-                    continue
-                if token["name"] == "ul":
-                    # If the last `li` happened, then don't add a break because Mastodon doesn't
-                    li_pending_break = False
-                token["name"] = "p"
-
-            yield token
-
-
-class UnlinkifyFilter(Filter):
-    """
-    Forcibly replaces link text with the href.
-
-    This is intented to be used when stripping <a> tags to preserve the link
-    location at the expense of the link text.
-    """
-
-    def __iter__(self):
-        discarding_a_text = False
-        for token in Filter.__iter__(self):
-            if token.get("name") == "a":
-                if token["type"] == "EndTag":
-                    discarding_a_text = False
-                    continue
-                href = token["data"].get((None, "href"))
-
-                # If <a> has an href, we use it and throw away all content
-                # within the <a>...</a>. If href missing or empty, try to find
-                # text within the <a>...</a>
-                if href:
-                    yield {"data": href, "type": "Characters"}
-                    discarding_a_text = True
-                    continue
-            elif not discarding_a_text:
-                yield token
-            # else: throw away tokens until we're out of the <a>
-
-
-def allow_a(tag: str, name: str, value: str):
-    if name in ["href", "title", "class"]:
-        return True
-    elif name == "rel":
-        # Only allow rel attributes with a small subset of values
-        # (we're defending against, for example, rel=me)
-        rel_values = value.split()
-        if all(v in ["nofollow", "noopener", "noreferrer", "tag"] for v in rel_values):
-            return True
-    return False
-
-
-def shorten_link_text(attrs, new=False):
-    """
-    Applies Mastodon's link shortening behavior where URL text links are
-    shortened by removing the scheme and only showing the first 30 chars.
-
-    Orig:
-        <a>https://social.example.com/a-long/path/2023/01/16/that-should-be-shortened</a>
-
-    Becomes:
-        <a>social.example.com/a-long/path</a>
-
-    """
-    text = attrs.get("_text")
-    if not text:
-        text = attrs.get((None, "href"))
-    if text and "://" in text and len(text) > 30:
-        text = text.split("://", 1)[-1]
-        attrs["_text"] = text[:30]
-        if len(text) > 30:
-            attrs[(None, "class")] = " ".join(
-                filter(None, [attrs.pop((None, "class"), ""), "ellipsis"])
-            )
-        # Add the full URL in to title for easier user inspection
-        attrs[(None, "title")] = attrs.get((None, "href"))
-
-    return attrs
-
-
-linkify_callbacks = [bleach.callbacks.nofollow, shorten_link_text]
-
-
-def sanitize_html(post_html: str) -> str:
-    """
-    Only allows a, br, p and span tags, and class attributes.
-    """
-    cleaner = bleach.Cleaner(
-        tags=ALLOWED_TAGS + REWRITTEN_TAGS,
-        attributes={  # type:ignore
-            "a": allow_a,
-            "p": ["class"],
-        },
-        filters=[
-            partial(LinkifyFilter, url_re=url_regex, callbacks=linkify_callbacks),
-            MastodonStrictTagFilter,
-        ],
-        strip=True,
+        """,
+        re.IGNORECASE | re.VERBOSE | re.UNICODE,
    )
-    return mark_safe(cleaner.clean(post_html))

+    def __init__(
+        self,
+        html: str,
+        uri_domain: str | None = None,
+        mentions: list | None = None,
+        find_mentions: bool = False,
+        find_hashtags: bool = False,
+        find_emojis: bool = False,
+        emoji_domain=None,
+    ):
+        """
+        Parses ``html`` straight away. Afterwards the results can be read
+        from the ``html`` and ``plain_text`` properties and from the
+        ``mentions``, ``hashtags`` and ``emojis`` sets.
+
+        - ``uri_domain``: when set, hashtag links are made absolute against
+          this domain and mention links use each identity's absolute
+          profile URI.
+        - ``mentions``: identities that are allowed to become mention links
+          (see ``calculate_mentions``).
+        - ``find_mentions`` / ``find_hashtags`` / ``find_emojis``: scan the
+          plain-text parts of the content for these as well.
+        - ``emoji_domain``: handed to the emoji lookup in ``create_emoji``.
+        """
+        super().__init__()
+        self.uri_domain = uri_domain
+        self.emoji_domain = emoji_domain
+        self.find_mentions = find_mentions
+        self.find_hashtags = find_hashtags
+        self.find_emojis = find_emojis
+        self.calculate_mentions(mentions)
+        self._data_buffer = ""
+        self.html_output = ""
+        self.text_output = ""
+        self.emojis: set[str] = set()
+        self.mentions: set[str] = set()
+        self.hashtags: set[str] = set()
+        self._pending_a: dict | None = None
+        self._fresh_p = False
+        # Note that "html" here is the argument, not the stdlib module
+        self.feed(html.replace("\n", ""))
+        # feed() holds back trailing text it thinks may be an unfinished
+        # entity (e.g. "R&D") waiting for more input; close() tells the
+        # parser there is none so that text is not silently dropped.
+        self.close()
+        self.flush_data()

-def strip_html(post_html: str, *, linkify: bool = True) -> str:
-    """
-    Strips all tags from the text, then linkifies it.
-    """
-    cleaner = bleach.Cleaner(
-        tags=[],
-        strip=True,
-        filters=[partial(LinkifyFilter, url_re=url_regex, callbacks=linkify_callbacks)]
-        if linkify
-        else [UnlinkifyFilter],
-    )
-    return mark_safe(cleaner.clean(post_html))
+    def calculate_mentions(self, mentions: list | None):
+        """
+        Builds the lookup of handles that may be rendered as mention links.
+
+        Every supplied identity that has a username is registered under its
+        lowercased ``username`` and ``username@domain`` forms. Both point at
+        the identity's absolute profile URI when ``uri_domain`` is set, and
+        at its view URL otherwise. ``mention_aliases`` is reset to empty.
+        """
+        self.mention_matches: dict[str, str] = {}
+        self.mention_aliases: dict[str, str] = {}
+        for identity in mentions or ():
+            if self.uri_domain:
+                target = identity.absolute_profile_uri()
+            else:
+                target = str(identity.urls.view)
+            # Identities without a username contribute nothing
+            if not identity.username:
+                continue
+            short = identity.username.lower()
+            full = f"{short}@{identity.domain_id.lower()}"
+            self.mention_matches[short] = target
+            self.mention_matches[full] = target

+    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+        """
+        Reacts to an opening tag.
+
+        Paragraph-like tags emit ``<p>``, break-like tags emit ``<br>``
+        (unless a paragraph has only just been opened) and ``a`` starts
+        collecting link content. Any other tag is dropped, leaving only its
+        text content behind.
+        """
+        opens_paragraph = tag in self.REWRITE_TO_P
+        is_break = not opens_paragraph and tag in self.REWRITE_TO_BR
+        is_anchor = not opens_paragraph and not is_break and tag == "a"
+        # Tags we act on terminate the current run of buffered text
+        if opens_paragraph or is_break or is_anchor:
+            self.flush_data()
+        if opens_paragraph:
+            self.html_output += "<p>"
+        elif is_break:
+            # No leading break directly after a paragraph was opened
+            if not self._fresh_p:
+                self.html_output += "<br>"
+                self.text_output += "\n"
+        elif is_anchor:
+            self._pending_a = {"attrs": dict(attrs), "content": ""}
+        self._fresh_p = opens_paragraph

-def html_to_plaintext(post_html: str) -> str:
-    """
-    Tries to do the inverse of the linebreaks filter.
-    """
-    # TODO: Handle HTML entities
-    # Remove all newlines, then replace br with a newline and /p with two (one comes from bleach)
-    post_html = post_html.replace("\n", "").replace("<br>", "\n").replace("</p>", "\n")
-    # Remove all other HTML and return
-    cleaner = bleach.Cleaner(tags=["a"], strip=True, filters=[UnlinkifyFilter])
-    return cleaner.clean(post_html).strip()
+    def handle_endtag(self, tag: str) -> None:
+        """
+        Reacts to a closing tag.
+
+        Paragraph-like tags emit ``</p>``. A closing ``a`` resolves the link
+        collected since its opening tag into a mention, a hashtag, a normal
+        link, or - when it has no usable href - plain escaped text.
+        """
+        self._fresh_p = False
+        if tag in self.REWRITE_TO_P:
+            self.flush_data()
+            self.html_output += "</p>"
+            self.text_output += "\n\n"
+        elif tag == "a":
+            if self._pending_a:
+                href = self._pending_a["attrs"].get("href")
+                content = self._pending_a["content"].strip()
+                # Is it a mention?
+                if content.lower().lstrip("@") in self.mention_matches:
+                    self.html_output += self.create_mention(content)
+                    self.text_output += content
+                # Is it a hashtag? The whole link text has to be the hashtag;
+                # a prefix-only match would let arbitrary trailing text ride
+                # along into the generated tag link.
+                elif self.HASHTAG_REGEX.fullmatch(content):
+                    self.html_output += self.create_hashtag(content)
+                    self.text_output += content
+                elif content and href:
+                    # Shorten the link if we need to
+                    self.html_output += self.create_link(href, content)
+                    self.text_output += href
+                elif content:
+                    # No href (missing, empty or valueless): there is nothing
+                    # to link to, so keep just the text rather than crashing.
+                    self.html_output += html.escape(content)
+                    self.text_output += content
+                self._pending_a = None
+
+    def handle_data(self, data: str) -> None:
+        """
+        Collects text: into the open link if there is one, otherwise into
+        the buffer that ``flush_data`` later linkifies.
+        """
+        self._fresh_p = False
+        anchor = self._pending_a
+        if not anchor:
+            self._data_buffer += data
+            return
+        anchor["content"] += data
+
+    def flush_data(self) -> None:
+        """
+        Emits the text buffered so far and empties the buffer.
+
+        Text is buffered across tags we do not care about so that, for
+        example, <span>#</span>hashtag is seen as #hashtag. The raw text
+        goes to the plain-text output and its linkified form to the HTML
+        output.
+        """
+        buffered = self._data_buffer
+        self.text_output = self.text_output + buffered
+        self.html_output = self.html_output + self.linkify(buffered)
+        self._data_buffer = ""
+
+    def create_link(self, href, content):
+        """
+        Generates a link, doing optional shortening.
+
+        When the link text itself looks like a URL, its scheme is removed
+        and anything beyond 30 characters is cut off, with the untruncated
+        (scheme-less) text kept in the title attribute. If there is no href
+        to link to, the escaped text is returned without a link.
+
+        All return values from this function should be HTML-safe.
+        """
+        if not href:
+            # html.escape() cannot take None, and an empty href links nowhere
+            return html.escape(content)
+        looks_like_link = bool(self.URL_REGEX.match(content))
+        if looks_like_link:
+            content = content.split("://", 1)[1]
+        if looks_like_link and len(content) > 30:
+            return f'<a href="{html.escape(href)}" rel="nofollow" class="ellipsis" title="{html.escape(content)}">{html.escape(content[:30])}</a>'
+        else:
+            return f'<a href="{html.escape(href)}" rel="nofollow">{html.escape(content)}</a>'
+
+    def create_mention(self, handle) -> str:
+        """
+        Renders a mention and records it in ``self.mentions``.
+
+        The handle may carry leading @ signs. Handles present in
+        ``mention_matches`` become links shown by username only, unless that
+        username was already used by a different full handle, in which case
+        the whole handle is shown. Unknown handles come back as escaped text.
+
+        All return values from this function should be HTML-safe
+        """
+        handle = handle.lstrip("@")
+        # Everything before the first "@" (the whole handle if there is none)
+        short_handle = handle.partition("@")[0]
+        handle_hash = handle.lower()
+        short_hash = short_handle.lower()
+        self.mentions.add(handle_hash)
+        url = self.mention_matches.get(handle_hash)
+        if not url:
+            return "@" + html.escape(handle)
+        # The first handle to use a short name claims it
+        if self.mention_aliases.setdefault(short_hash, handle_hash) != handle_hash:
+            short_handle = handle
+        return f'<a href="{html.escape(url)}">@{html.escape(short_handle)}</a>'
+
+    def create_hashtag(self, hashtag) -> str:
+        """
+        Generates a hashtag link and records the lowercased hashtag in
+        ``self.hashtags``. Hashtag does not need to start with #
+
+        The link is absolute (``https://<uri_domain>/tags/...``) when
+        ``uri_domain`` is set and site-relative otherwise.
+
+        All return values from this function should be HTML-safe
+        """
+        hashtag = hashtag.lstrip("#")
+        slug = hashtag.lower()
+        self.hashtags.add(slug)
+        # The hashtag can come from remote link text, so it must be escaped
+        # before going into either the attribute or the visible text.
+        safe_slug = html.escape(slug)
+        safe_text = html.escape(hashtag)
+        if self.uri_domain:
+            return f'<a href="https://{self.uri_domain}/tags/{safe_slug}/">#{safe_text}</a>'
+        else:
+            return f'<a href="/tags/{safe_slug}/">#{safe_text}</a>'
+
+    def create_emoji(self, shortcode) -> str:
+        """
+        Renders a shortcode as whatever the matching emoji's ``as_html()``
+        returns, recording the shortcode in ``self.emojis``. If no usable
+        emoji is found for ``emoji_domain`` the ``:shortcode:`` text is
+        returned instead.
+
+        All return values from this function should be HTML-safe
+        """
+        from activities.models import Emoji
+
+        found = Emoji.get_by_domain(shortcode, self.emoji_domain)
+        if not (found and found.is_usable):
+            return f":{shortcode}:"
+        self.emojis.add(shortcode)
+        return found.as_html()
+
+    def linkify(self, data):
+        """
+        Linkifies some content that is plaintext.
+
+        Handles URLs first, then hands the text between them on to mention,
+        hashtag or emoji handling (whichever applies first; each of those
+        passes its own leftovers further down the chain). Note that this
+        takes great care to keep track of what is HTML and what needs to be
+        escaped.
+        """
+        # Split the string by the URL regex so we know what to escape and what
+        # not to escape.
+        bits = self.URL_REGEX.split(data)
+        result = ""
+        # Even indices are data we should pass through, odd indices are links
+        for i, bit in enumerate(bits):
+            # A link!
+            if i % 2 == 1:
+                # The regex also swallows opening parentheses in front of the
+                # URL; those are surrounding text, not part of the address.
+                url = bit.lstrip("(")
+                opening = len(bit) - len(url)
+                # Give back one unbalanced trailing ")" per "(" we removed,
+                # so "(https://example.com/a)" links to ".../a" while URLs
+                # with their own balanced parentheses stay intact.
+                closing = 0
+                while (
+                    closing < opening
+                    and url.endswith(")")
+                    and url.count(")") > url.count("(")
+                ):
+                    url = url[:-1]
+                    closing += 1
+                result += "(" * opening
+                result += self.create_link(url, url)
+                result += ")" * closing
+            # Not a link
+            elif self.mention_matches or self.find_mentions:
+                result += self.linkify_mentions(bit)
+            elif self.find_hashtags:
+                result += self.linkify_hashtags(bit)
+            elif self.find_emojis:
+                result += self.linkify_emoji(bit)
+            else:
+                result += html.escape(bit)
+        return result
+
+    def linkify_mentions(self, data):
+        """
+        Renders every @mention found in plain text through
+        ``create_mention``; all remaining text goes on to hashtag handling,
+        emoji handling or plain escaping, whichever applies first.
+        """
+        if self.find_hashtags:
+            passthrough = self.linkify_hashtags
+        elif self.find_emojis:
+            passthrough = self.linkify_emoji
+        else:
+            passthrough = html.escape
+        pieces = []
+        # The regex has two groups, so split() yields repeating triples of
+        # (text, character before the @, handle)
+        for position, chunk in enumerate(self.MENTION_REGEX.split(data)):
+            if position % 3 == 2:
+                pieces.append(self.create_mention(chunk))
+            else:
+                pieces.append(passthrough(chunk))
+        return "".join(pieces)
+
+    def linkify_hashtags(self, data):
+        """
+        Renders every #hashtag found in plain text through
+        ``create_hashtag``; the text in between goes to emoji handling when
+        that is enabled and is escaped otherwise.
+        """
+        plain = self.linkify_emoji if self.find_emojis else html.escape
+        # split() alternates: even positions are text, odd ones are hashtags
+        rendered = [
+            self.create_hashtag(chunk) if index % 2 else plain(chunk)
+            for index, chunk in enumerate(self.HASHTAG_REGEX.split(data))
+        ]
+        return "".join(rendered)
+
+    def linkify_emoji(self, data):
+        """
+        Renders every :shortcode: found in plain text through
+        ``create_emoji`` and escapes the text in between.
+        """
+        pieces = []
+        # split() alternates: even positions are text, odd ones are shortcodes
+        for index, piece in enumerate(self.EMOJI_REGEX.split(data)):
+            if index % 2:
+                pieces.append(self.create_emoji(piece))
+            else:
+                pieces.append(html.escape(piece))
+        return "".join(pieces)
+
+    @property
+    def html(self):
+        """
+        The generated HTML with surrounding whitespace removed.
+        """
+        rendered = self.html_output
+        return rendered.strip()
+
+    @property
+    def plain_text(self):
+        """
+        The generated plain text with surrounding whitespace removed.
+        """
+        text = self.text_output
+        return text.strip()


 class ContentRenderer:
@ -212,33 +322,30 @@ class ContentRenderer:
        """
        if not html:
            return ""
-        html = sanitize_html(html)
-        html = self.linkify_mentions(html, post=post)
-        html = self.linkify_hashtags(html, identity=post.author)
-        if self.local:
-            html = self.imageify_emojis(
-                html,
-                identity=post.author,
-                emojis=post.emojis.all(),
-            )
-        html = self.remove_extra_newlines(html)
-        return mark_safe(html)
+        parser = FediverseHtmlParser(
+            html,
+            mentions=post.mentions.all(),
+            uri_domain=(None if self.local else post.author.domain.uri_domain),
+            find_hashtags=True,
+            find_emojis=True,
+            emoji_domain=post.author.domain,
+        )
+        return mark_safe(parser.html)

-    def render_identity_summary(self, html: str, identity, strip: bool = False) -> str:
+    def render_identity_summary(self, html: str, identity) -> str:
        """
        Given identity summary HTML, normalises it and renders it for presentation.
        """
        if not html:
            return ""
-        if strip:
-            html = strip_html(html)
-        else:
-            html = sanitize_html(html)
-        html = self.linkify_hashtags(html, identity=identity)
-        if self.local:
-            html = self.imageify_emojis(html, identity=identity)
-        html = self.remove_extra_newlines(html)
-        return mark_safe(html)
+        parser = FediverseHtmlParser(
+            html,
+            uri_domain=(None if self.local else identity.domain.uri_domain),
+            find_hashtags=True,
+            find_emojis=True,
+            emoji_domain=identity.domain,
+        )
+        return mark_safe(parser.html)

    def render_identity_data(self, html: str, identity, strip: bool = False) -> str:
        """
@ -246,117 +353,14 @@ class ContentRenderer:
        """
        if not html:
            return ""
-        if strip:
-            html = strip_html(html)
-        else:
-            html = sanitize_html(html)
-        if self.local:
-            html = self.imageify_emojis(html, identity=identity)
-        html = self.remove_extra_newlines(html)
-        return mark_safe(html)
-
-    def linkify_mentions(self, html: str, post) -> str:
-        """
-        Links mentions _in the context of the post_ - as in, using the mentions
-        property as the only source (as we might be doing this without other
-        DB access allowed)
-        """
-        from activities.models import Post
-
-        possible_matches = {}
-        for mention in post.mentions.all():
-            if self.local:
-                url = str(mention.urls.view)
-            else:
-                url = mention.absolute_profile_uri()
-            # Might not have fetched it (yet)
-            if mention.username:
-                username = mention.username.lower()
-                possible_matches[username] = url
-                possible_matches[f"{username}@{mention.domain_id}"] = url
-
-        collapse_name: dict[str, str] = {}
-
-        def replacer(match):
-            precursor = match.group(1)
-            handle = match.group(2)
-            if "@" in handle:
-                short_handle = handle.split("@", 1)[0]
-            else:
-                short_handle = handle
-            handle_hash = handle.lower()
-            short_hash = short_handle.lower()
-            if handle_hash in possible_matches:
-                if short_hash not in collapse_name:
-                    collapse_name[short_hash] = handle_hash
-                elif collapse_name.get(short_hash) != handle_hash:
-                    short_handle = handle
-                return f'{precursor}<a href="{possible_matches[handle_hash]}">@{short_handle}</a>'
-            else:
-                return match.group()
-
-        return Post.mention_regex.sub(replacer, html)
-
-    def linkify_hashtags(self, html, identity) -> str:
-        from activities.models import Hashtag
-
-        def replacer(attrs, new=False):
-            # See if the text in this link looks like a hashtag
-            if not Hashtag.hashtag_regex.match(attrs.get("_text", "")):
-                return attrs
-            hashtag = attrs["_text"].strip().lstrip("#")
-            attrs[None, "class"] = "hashtag"
-            if (None, "rel") in attrs:
-                del attrs[None, "rel"]
-            if self.local:
-                attrs[None, "href"] = f"/tags/{hashtag.lower()}/"
-            else:
-                attrs[
-                    None, "href"
-                ] = f"https://{identity.domain.uri_domain}/tags/{hashtag.lower()}/"
-            return attrs
-
-        linker = bleach.linkifier.Linker(
-            url_re=Hashtag.hashtag_regex, callbacks=[replacer]
+        parser = FediverseHtmlParser(
+            html,
+            uri_domain=(None if self.local else identity.domain.uri_domain),
+            find_hashtags=False,
+            find_emojis=True,
+            emoji_domain=identity.domain,
        )
-        return linker.linkify(html)
-
-    def imageify_emojis(
-        self, html: str, identity, include_local: bool = True, emojis=None
-    ):
-        """
-        Find :emoji: in content and convert to <img>. If include_local is True,
-        the local emoji will be used as a fallback for any shortcodes not defined
-        by emojis.
-        """
-        from activities.models import Emoji
-
-        # If precached emojis were passed, prep them
-        cached_emojis = {}
-        if emojis:
-            for emoji in emojis:
-                cached_emojis[emoji.shortcode] = emoji
-
-        def replacer(match):
-            shortcode = match.group(1).lower()
-            if shortcode in cached_emojis:
-                return cached_emojis[shortcode].as_html()
-
-            emoji = Emoji.get_by_domain(shortcode, identity.domain)
-            if emoji and emoji.is_usable:
-                return emoji.as_html()
-            elif not emoji and include_local:
-                emoji = Emoji.get_by_domain(shortcode, None)
-                if emoji:
-                    return emoji.as_html()
-
-            return match.group()
-
-        return Emoji.emoji_regex.sub(replacer, html)
-
-    def remove_extra_newlines(self, html: str) -> str:
-        """
-        Some clients are sensitive to extra newlines even though it's HTML
-        """
-        # TODO: More intelligent way to strip these?
-        return html.replace("\n", "")
+        if strip:
+            return mark_safe(parser.html)
+        else:
+            return mark_safe(parser.html)
--- a/requirements.txt
+++ b/requirements.txt
@ -1,4 +1,3 @@
-bleach~=5.0.1
 blurhash-python~=1.1.3
 cachetools~=5.2.0
 cryptography~=39.0
--- a/tests/activities/models/test_hashtag.py
+++ b/tests/activities/models/test_hashtag.py
@ -1,44 +0,0 @@
-from activities.models import Hashtag
-from core.html import ContentRenderer
-
-
-def test_hashtag_from_content():
-    assert Hashtag.hashtags_from_content("#hashtag") == ["hashtag"]
-    assert Hashtag.hashtags_from_content("a#hashtag") == []
-    assert Hashtag.hashtags_from_content("Text #with #hashtag in it") == [
-        "hashtag",
-        "with",
-    ]
-    assert Hashtag.hashtags_from_content("#hashtag.") == ["hashtag"]
-    assert Hashtag.hashtags_from_content("More text\n#one # two ##three #hashtag!") == [
-        "hashtag",
-        "one",
-        "three",
-    ]
-    assert Hashtag.hashtags_from_content("my #html loves &#32; entities") == ["html"]
-    assert Hashtag.hashtags_from_content("<span class='hash'>#</span>tag") == ["tag"]
-
-
-def test_linkify_hashtag():
-    linkify = lambda html: ContentRenderer(local=True).linkify_hashtags(html, None)
-
-    assert linkify("# hashtag") == "# hashtag"
-    assert (
-        linkify('<a href="/url/with#anchor">Text</a>')
-        == '<a href="/url/with#anchor">Text</a>'
-    )
-    assert (
-        linkify("#HashTag") == '<a href="/tags/hashtag/" class="hashtag">#HashTag</a>'
-    )
-    assert (
-        linkify(
-            """A longer text #bigContent
-with #tags, linebreaks, and
-maybe a few <a href="https://awesome.sauce/about#spicy">links</a>
-#allTheTags #AllTheTags #ALLTHETAGS"""
-        )
-        == """A longer text <a href="/tags/bigcontent/" class="hashtag">#bigContent</a>
-with <a href="/tags/tags/" class="hashtag">#tags</a>, linebreaks, and
-maybe a few <a href="https://awesome.sauce/about#spicy">links</a>
-<a href="/tags/allthetags/" class="hashtag">#allTheTags</a> <a href="/tags/allthetags/" class="hashtag">#AllTheTags</a> <a href="/tags/allthetags/" class="hashtag">#ALLTHETAGS</a>"""
-    )
--- a/tests/api/test_statuses.py
+++ b/tests/api/test_statuses.py
@ -1,5 +1,7 @@
 import pytest

+from activities.models import Post
+

@pytest.mark.django_db
 def test_post_status(api_token, identity, client):
@ -15,3 +17,44 @@ def test_post_status(api_token, identity, client):
    ).json()
    assert response["content"] == "<p>Hello, world!</p>"
    assert response["visibility"] == "unlisted"
+
+
+@pytest.mark.django_db
+def test_mention_format(api_token, identity, remote_identity, client):
+    """
+    Ensures mentions work, and only have one link around them.
+    """
+    # Create a local post through the API and check @test is wrapped in exactly one link
+    response = client.post(
+        "/api/v1/statuses",
+        HTTP_AUTHORIZATION=f"Bearer {api_token.token}",
+        HTTP_ACCEPT="application/json",
+        content_type="application/json",
+        data={
+            "status": "Hello, @test!",
+            "visibility": "unlisted",
+        },
+    ).json()
+    assert (
+        response["content"]
+        == '<p>Hello, <a href="https://example.com/@test/">@test</a>!</p>'
+    )
+    assert response["visibility"] == "unlisted"
+
+    # Store a remote post whose mention link has class/rel attributes and check the API returns one plain link
+    post = Post.objects.create(
+        local=False,
+        author=remote_identity,
+        content='<p>Hey <a href="https://example.com/@test/" class="u-url mention" rel="nofollow">@test</a></p>',
+        object_uri="https://remote.test/status/12345",
+    )
+    post.mentions.add(identity)
+    response = client.get(
+        f"/api/v1/statuses/{post.id}",
+        HTTP_AUTHORIZATION=f"Bearer {api_token.token}",
+        HTTP_ACCEPT="application/json",
+        content_type="application/json",
+    ).json()
+    assert (
+        response["text"] == '<p>Hey <a href="https://example.com/@test/">@test</a></p>'
+    )
--- a/tests/core/test_html.py
+++ b/tests/core/test_html.py
@ -1,155 +1,117 @@
-from unittest.mock import Mock
-
 import pytest

-from core.html import ContentRenderer, html_to_plaintext, sanitize_html
-
-
-def test_html_to_plaintext():
-
-    assert html_to_plaintext("<p>Hi!</p>") == "Hi!"
-    assert html_to_plaintext("<p>Hi!<br>There</p>") == "Hi!\nThere"
-    assert (
-        html_to_plaintext("<p>Hi!</p>\n\n<p>How are you?</p>") == "Hi!\n\nHow are you?"
-    )
-
-    assert (
-        html_to_plaintext("<p>Hi!</p>\n\n<p>How are<br> you?</p><p>today</p>")
-        == "Hi!\n\nHow are\n you?\n\ntoday"
-    )
-
-    assert (
-        html_to_plaintext(
-            '<p><a href="https://fedi.takahe.social/with/a/long/path">'
-            '<b>The</b> <img src="takahe.png"> Link</a> '
-            '<a href="">Empty href</a> '
-            "<a>Empty A</a></p>"
-        )
-        == "https://fedi.takahe.social/with/a/long/path Empty href Empty A"
-    )
-
-
-def test_sanitize_post():
-
-    assert sanitize_html("<p>Hello!</p>") == "<p>Hello!</p>"
-    assert sanitize_html("<p>It&#39;s great</p>") == "<p>It&#39;s great</p>"
-
-    # Note that we only want to linkify things with protocol prefixes to prevent
-    # too many false positives.
-    assert sanitize_html("<p>test.com</p>") == "<p>test.com</p>"
-    assert (
-        sanitize_html("<p>https://test.com</p>")
-        == '<p><a href="https://test.com" rel="nofollow">https://test.com</a></p>'
-    )
-    assert (
-        sanitize_html("<p>@someone@subdomain.some-domain.com</p>")
-        == "<p>@someone@subdomain.some-domain.com</p>"
-    )
-
-
-def test_shorten_url():
-    full_url = (
-        "https://social.example.com/a-long/path/2023/01/16/that-should-be-shortened"
-    )
-    assert (
-        sanitize_html(f"<p>{full_url}</p>")
-        == f'<p><a href="{full_url}" rel="nofollow" class="ellipsis" title="{full_url}">social.example.com/a-long/path</a></p>'
-    )
-
-    assert (
-        sanitize_html(
-            f'<p><a href="{full_url}">This is a long link text, but cannot be shortened as a URL</a></p>'
-        )
-        == f'<p><a href="{full_url}" rel="nofollow">This is a long link text, but cannot be shortened as a URL</a></p>'
-    )
+from core.html import FediverseHtmlParser


@pytest.mark.django_db
-def test_link_preservation():
+def test_parser(identity):
    """
-    We want to:
-     - Preserve incoming links from other servers
-     - Linkify mentions and hashtags
-     - Not have these all step on each other!
+    Validates the HtmlParser in its various output modes
    """
-    renderer = ContentRenderer(local=True)
-    fake_mention = Mock()
-    fake_mention.username = "andrew"
-    fake_mention.domain_id = "aeracode.org"
-    fake_mention.urls.view = "/@andrew@aeracode.org/"
-    fake_post = Mock()
-    fake_post.mentions.all.return_value = [fake_mention]
-    fake_post.author.domain.uri_domain = "example.com"
-    fake_post.emojis.all.return_value = []

+    # Allowed tags (<p>) are kept in the HTML output; disallowed ones (<script>) are dropped
+    parser = FediverseHtmlParser("<p>Hello!</p><script></script>")
+    assert parser.html == "<p>Hello!</p>"
+    assert parser.plain_text == "Hello!"
+
+    # Newlines between tags are dropped from the HTML; plain text separates paragraphs with a blank line
+    parser = FediverseHtmlParser("<p>Hi!</p>\n\n<p>How are you?</p>")
+    assert parser.html == "<p>Hi!</p><p>How are you?</p>"
+    assert parser.plain_text == "Hi!\n\nHow are you?"
+
+    # Split or malformed tags must never yield output containing "<scr"
+    parser = FediverseHtmlParser("<scri<span></span>pt>")
+    assert "<scr" not in parser.html
+    parser = FediverseHtmlParser("<scri #hashtag pt>")
+    assert "<scr" not in parser.html
+
+    # Entities are re-escaped in the HTML (&#39; becomes &#x27;) and decoded in the plain text
+    parser = FediverseHtmlParser("<p>It&#39;s great</p>", find_hashtags=True)
+    assert parser.html == "<p>It&#x27;s great</p>"
+    assert parser.plain_text == "It's great"
+    assert parser.hashtags == set()
+
+    # Bare URLs are linkified only with a protocol prefix; the link text omits the protocol
+    parser = FediverseHtmlParser("<p>test.com</p>")
+    assert parser.html == "<p>test.com</p>"
+    assert parser.plain_text == "test.com"
+    parser = FediverseHtmlParser("<p>https://test.com</p>")
    assert (
-        renderer.render_post(
-            'Hello @andrew, I want to link to this <span>#</span>hashtag: <a href="http://example.com/@andrew/#notahashtag">here</a> and rewrite <a href="https://example.com/tags/thishashtag/">#thishashtag</a>',
-            fake_post,
-        )
-        == 'Hello <a href="/@andrew@aeracode.org/">@andrew</a>, I want to link to this <a href="/tags/hashtag/" class="hashtag">#hashtag</a>: <a href="http://example.com/@andrew/#notahashtag" rel="nofollow">here</a> and rewrite <a href="/tags/thishashtag/" class="hashtag">#thishashtag</a>'
+        parser.html == '<p><a href="https://test.com" rel="nofollow">test.com</a></p>'
    )
+    assert parser.plain_text == "https://test.com"

-
-@pytest.mark.django_db
-def test_list_rendering():
-    """
-    We want to:
-     - Preserve incoming links from other servers
-     - Linkify mentions and hashtags
-     - Not have these all step on each other!
-    """
-    renderer = ContentRenderer(local=True)
-    fake_mention = Mock()
-    fake_mention.username = "andrew"
-    fake_mention.domain_id = "aeracode.org"
-    fake_mention.urls.view = "/@andrew@aeracode.org/"
-    fake_post = Mock()
-    fake_post.mentions.all.return_value = [fake_mention]
-    fake_post.author.domain.uri_domain = "example.com"
-    fake_post.emojis.all.return_value = []
-
+    # Existing links are preserved (gaining rel="nofollow"); plain text shows the href, not the link text
+    parser = FediverseHtmlParser("<a href='https://takahe.social'>takahe social</a>")
    assert (
-        renderer.render_post(
-            "<p>Ok. The roster so far is:</p><ul><li>Infosec.exchange (mastodon)</li><li>pixel.Infosec.exchange (pixelfed)</li><li>video.Infosec.exchange (peertube)</li><li>relay.Infosec.exchange (activitypub relay)</li><li>risky.af (alt mastodon)</li></ul><p>What’s next?  I think I promised some people here bookwyrm</p>",
-            fake_post,
-        )
-        == "<p>Ok. The roster so far is:</p><p>Infosec.exchange (mastodon)<br>pixel.Infosec.exchange (pixelfed)<br>video.Infosec.exchange (peertube)<br>relay.Infosec.exchange (activitypub relay)<br>risky.af (alt mastodon)</p><p>What’s next?  I think I promised some people here bookwyrm</p>"
+        parser.html
+        == '<a href="https://takahe.social" rel="nofollow">takahe social</a>'
+    )
+    assert parser.plain_text == "https://takahe.social"
+
+    # Very long links get shortened display text, with the protocol-less URL kept in the title attribute
+    full_url = "https://social.example.com/a-long/path/that-should-be-shortened"
+    parser = FediverseHtmlParser(f"<p>{full_url}</p>")
+    assert (
+        parser.html
+        == f'<p><a href="{full_url}" rel="nofollow" class="ellipsis" title="{full_url.removeprefix("https://")}">social.example.com/a-long/path</a></p>'
+    )
+    assert (
+        parser.plain_text
+        == "https://social.example.com/a-long/path/that-should-be-shortened"
    )

-
-@pytest.mark.django_db
-def test_link_mixcase_mentions():
-    renderer = ContentRenderer(local=True)
-    fake_mention = Mock()
-    fake_mention.username = "Manfre"
-    fake_mention.domain_id = "manfre.net"
-    fake_mention.urls.view = "/@Manfre@manfre.net/"
-    fake_mention2 = Mock()
-    fake_mention2.username = "manfre"
-    fake_mention2.domain_id = "takahe.social"
-    fake_mention2.urls.view = "https://takahe.social/@manfre@takahe.social/"
-
-    unfetched_mention = Mock()
-    unfetched_mention.username = None
-    unfetched_mention.domain_id = None
-    unfetched_mention.urls.view = "/None@None/"
-
-    fake_post = Mock()
-    fake_post.mentions.all.return_value = [
-        fake_mention,
-        fake_mention2,
-        unfetched_mention,
-    ]
-    fake_post.author.domain.uri_domain = "example.com"
-    fake_post.emojis.all.return_value = []
-
-    assert renderer.render_post(
-        "@Manfre@manfre.net @mAnFrE@takahe.social @manfre@manfre.net @unfetched@manfre.net",
-        fake_post,
-    ) == (
-        '<a href="/@Manfre@manfre.net/">@Manfre</a> '
-        '<a href="https://takahe.social/@manfre@takahe.social/">@mAnFrE@takahe.social</a> '
-        '<a href="/@Manfre@manfre.net/">@manfre</a> '
-        "@unfetched@manfre.net"
+    # Mention-like text is not linked when no mentions are supplied, but is still reported in parser.mentions
+    parser = FediverseHtmlParser(
+        "<p>@test@example.com</p>",
+        find_mentions=True,
+        find_hashtags=True,
+        find_emojis=True,
    )
+    assert parser.html == "<p>@test@example.com</p>"
+    assert parser.plain_text == "@test@example.com"
+    assert parser.mentions == {"test@example.com"}
+
+    # With a matching identity supplied, the mention is linked and displayed as the short @username
+    parser = FediverseHtmlParser(
+        "<p>@test@example.com</p>",
+        mentions=[identity],
+        find_hashtags=True,
+        find_emojis=True,
+    )
+    assert parser.html == '<p><a href="/@test@example.com/">@test</a></p>'
+    assert parser.plain_text == "@test@example.com"
+    assert parser.mentions == {"test@example.com"}
+
+    # Mention matching is case insensitive: link text keeps the original casing, parser.mentions is lowercased
+    parser = FediverseHtmlParser(
+        "<p>@TeSt@ExamPle.com</p>",
+        mentions=[identity],
+        find_hashtags=True,
+        find_emojis=True,
+    )
+    assert parser.html == '<p><a href="/@test@example.com/">@TeSt</a></p>'
+    assert parser.plain_text == "@TeSt@ExamPle.com"
+    assert parser.mentions == {"test@example.com"}
+
+    # Hashtags are linked even when split by spans, never inside hrefs; existing tag links are rewritten to /tags/
+    parser = FediverseHtmlParser(
+        '<a href="http://example.com#notahashtag">something</a> <span>#</span>hashtag <a href="https://example.com/tags/hashtagtwo/">#hashtagtwo</a>',
+        find_hashtags=True,
+        find_emojis=True,
+    )
+    assert (
+        parser.html
+        == '<a href="http://example.com#notahashtag" rel="nofollow">something</a> <a href="/tags/hashtag/">#hashtag</a> <a href="/tags/hashtagtwo/">#hashtagtwo</a>'
+    )
+    assert parser.plain_text == "http://example.com#notahashtag #hashtag #hashtagtwo"
+    assert parser.hashtags == {"hashtag", "hashtagtwo"}
+
+    # Lists are flattened: the <ul> becomes a paragraph with <br> between items (newlines in plain text)
+    parser = FediverseHtmlParser(
+        "<p>List:</p><ul><li>One</li><li>Two</li><li>Three</li></ul><p>End!</p>",
+        find_hashtags=True,
+        find_emojis=True,
+    )
+    assert parser.html == "<p>List:</p><p>One<br>Two<br>Three</p><p>End!</p>"
+    assert parser.plain_text == "List:\n\nOne\nTwo\nThree\n\nEnd!"
--- a/users/models/identity.py
+++ b/users/models/identity.py
@ -13,7 +13,7 @@ from django.utils.functional import lazy
 from lxml import etree

 from core.exceptions import ActorMismatchError, capture_message
-from core.html import ContentRenderer, html_to_plaintext, strip_html
+from core.html import ContentRenderer, FediverseHtmlParser
 from core.ld import (
    canonicalise,
    format_ld_date,
@ -530,8 +530,8 @@ class Identity(StatorModel):
            response["attachment"] = [
                {
                    "type": "http://schema.org#PropertyValue",
-                    "name": strip_html(item["name"], linkify=False),
-                    "value": strip_html(item["value"]),
+                    "name": FediverseHtmlParser(item["name"]).plain_text,
+                    "value": FediverseHtmlParser(item["value"]).html,
                }
                for item in self.metadata
            ]
@ -781,7 +781,9 @@ class Identity(StatorModel):
                self.metadata.append(
                    {
                        "name": attachment.get("name"),
-                        "value": strip_html(attachment.get("http://schema.org#value")),
+                        "value": FediverseHtmlParser(
+                            attachment.get("http://schema.org#value")
+                        ).html,
                    }
                )
        # Now go do webfinger with that info to see if we can get a canonical domain
@ -903,12 +905,14 @@ class Identity(StatorModel):
                Post.Visibilities.mentioned: "direct",
            }
            result["source"] = {
-                "note": html_to_plaintext(self.summary) if self.summary else "",
+                "note": FediverseHtmlParser(self.summary).plain_text
+                if self.summary
+                else "",
                "fields": (
                    [
                        {
                            "name": m["name"],
-                            "value": strip_html(m["value"], linkify=False),
+                            "value": FediverseHtmlParser(m["value"]).plain_text,
                            "verified_at": None,
                        }
                        for m in self.metadata
--- a/users/services/identity.py
+++ b/users/services/identity.py
@ -3,7 +3,7 @@ from django.template.defaultfilters import linebreaks_filter

 from activities.models import FanOut
 from core.files import resize_image
-from core.html import strip_html
+from core.html import FediverseHtmlParser
 from users.models import (
    Block,
    BlockStates,
@ -211,7 +211,7 @@ class IdentityService:
        Safely sets a summary and turns linebreaks into HTML
        """
        if summary:
-            self.identity.summary = linebreaks_filter(strip_html(summary))
+            self.identity.summary = FediverseHtmlParser(linebreaks_filter(summary)).html
        else:
            self.identity.summary = None
        self.identity.save()
--- a/users/views/settings/profile.py
+++ b/users/views/settings/profile.py
@ -4,7 +4,7 @@ from django.shortcuts import redirect
 from django.utils.decorators import method_decorator
 from django.views.generic import FormView

-from core.html import html_to_plaintext
+from core.html import FediverseHtmlParser
 from core.models.config import Config
 from users.decorators import identity_required
 from users.models import IdentityStates
@ -65,7 +65,11 @@ class ProfilePage(FormView):
        identity = self.request.identity
        return {
            "name": identity.name,
-            "summary": html_to_plaintext(identity.summary) if identity.summary else "",
+            "summary": (
+                FediverseHtmlParser(identity.summary).plain_text
+                if identity.summary
+                else ""
+            ),
            "icon": identity.icon and identity.icon.url,
            "image": identity.image and identity.image.url,
            "discoverable": identity.discoverable,