Move to a new HTML parser/stripper

This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place.
2023-01-29 17:46:22 -07:00 · 2023-01-29 17:46:22 -07:00 · a6922cb9d6
parent 93c0af992b
commit a6922cb9d6
14 changed files with 503 additions and 562 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -48,13 +48,7 @@ repos:
      - id: mypy
        exclude: "^tests/"
        additional_dependencies:
-          [
+          [types-pyopenssl, types-mock, types-cachetools, types-python-dateutil]
            types-pyopenssl,
            types-bleach,
            types-mock,
            types-cachetools,
            types-python-dateutil,
          ]
  - repo: https://github.com/rtts/djhtml
    rev: v1.5.2
--- a/activities/admin.py
+++ b/activities/admin.py
@ -1,4 +1,3 @@
 from asgiref.sync import async_to_sync
 from django.contrib import admin
 from django.db import models
 from django.utils.safestring import mark_safe
@ -165,7 +164,6 @@ class PostAdmin(admin.ModelAdmin):
    list_filter = ("type", "local", "visibility", "state", "created")
    raw_id_fields = ["emojis"]
    autocomplete_fields = ["to", "mentions", "author"]
    actions = ["reparse_hashtags"]
    search_fields = ["content", "search_handle", "search_service_handle"]
    inlines = [PostAttachmentInline]
    readonly_fields = ["created", "updated", "state_changed", "object_json"]
@ -183,13 +181,6 @@ class PostAdmin(admin.ModelAdmin):
        )
        return super().get_search_results(request, queryset, search_term)
    @admin.action(description="Reprocess content for hashtags")
    def reparse_hashtags(self, request, queryset):
        """
        Admin action: recompute the hashtags of every selected object from
        its stored content, save it, then run its ensure_hashtags coroutine
        synchronously.
        """
        for post in queryset:
            found = Hashtag.hashtags_from_content(post.content)
            # An empty result is stored as None rather than an empty list
            post.hashtags = found if found else None
            post.save()
            ensure_sync = async_to_sync(post.ensure_hashtags)
            ensure_sync()
    @admin.display(description="ActivityPub JSON")
    def object_json(self, instance):
        """
        Admin display field showing whatever the instance's to_ap() returns.
        """
        ap_representation = instance.to_ap()
        return ap_representation
--- a/activities/models/emoji.py
+++ b/activities/models/emoji.py
@ -1,5 +1,4 @@
 import mimetypes
 import re
 from functools import partial
 from typing import ClassVar
@ -14,7 +13,7 @@ from django.db import models
 from django.utils.safestring import mark_safe
 from core.files import get_remote_file
-from core.html import strip_html
+from core.html import FediverseHtmlParser
 from core.ld import format_ld_date
 from core.models import Config
 from core.uploads import upload_emoji_namer
@ -134,8 +133,6 @@ class Emoji(StatorModel):
        admin_disable = "{admin}{self.pk}/disable/"
        admin_copy = "{admin}{self.pk}/copy/"
    emoji_regex = re.compile(r"\B:([a-zA-Z0-9(_)-]+):\B")
    def delete(self, using=None, keep_parents=False):
        if self.file:
            self.file.delete()
@ -242,7 +239,9 @@ class Emoji(StatorModel):
        Return a parsed and sanitized list of emoji found in content without
        the surrounding ':'.
        """
-        emoji_hits = cls.emoji_regex.findall(strip_html(content))
+        emoji_hits = FediverseHtmlParser(
            content, find_emojis=True, emoji_domain=domain
        ).emojis
        emojis = sorted({emoji.lower() for emoji in emoji_hits})
        return list(
            cls.objects.filter(local=(domain is None) or domain.local)
--- a/activities/models/hashtag.py
+++ b/activities/models/hashtag.py
@ -6,7 +6,6 @@ from asgiref.sync import sync_to_async
 from django.db import models
 from django.utils import timezone
 from core.html import strip_html
 from core.models import Config
 from stator.models import State, StateField, StateGraph, StatorModel
@ -167,16 +166,6 @@ class Hashtag(StatorModel):
                results[date(year, month, day)] = val
        return dict(sorted(results.items(), reverse=True)[:num])
    @classmethod
    def hashtags_from_content(cls, content) -> list[str]:
        """
        Return a sorted, de-duplicated list of the lowercased hashtags found
        in content (after HTML stripping), without the leading '#'.
        """
        plain = strip_html(content)
        unique_tags = set()
        for hit in cls.hashtag_regex.findall(plain):
            unique_tags.add(hit.lower())
        return sorted(unique_tags)
    def to_mastodon_json(self):
        return {
            "name": self.hashtag,
--- a/activities/models/post.py
+++ b/activities/models/post.py
@ -2,7 +2,6 @@ import datetime
 import hashlib
 import json
 import mimetypes
 import re
 import ssl
 from collections.abc import Iterable
 from typing import Optional
@ -26,7 +25,7 @@ from activities.models.post_types import (
    PostTypeDataEncoder,
 )
 from core.exceptions import capture_message
-from core.html import ContentRenderer, strip_html
+from core.html import ContentRenderer, FediverseHtmlParser
 from core.ld import (
    canonicalise,
    format_ld_date,
@ -374,10 +373,6 @@ class Post(StatorModel):
    def clean_type_data(self, value):
        PostTypeData.parse_obj(value)
    mention_regex = re.compile(
        r"(^|[^\w\d\-_/])@([\w\d\-_]+(?:@[\w\d\-_\.]+[\w\d\-_]+)?)"
    )
    def _safe_content_note(self, *, local: bool = True):
        return ContentRenderer(local=local).render_post(self.content, self)
@ -474,12 +469,12 @@ class Post(StatorModel):
                # Maintain local-only for replies
                if reply_to.visibility == reply_to.Visibilities.local_only:
                    visibility = reply_to.Visibilities.local_only
            # Find hashtags in this post
            hashtags = Hashtag.hashtags_from_content(content) or None
            # Find emoji in this post
            emojis = Emoji.emojis_from_content(content, None)
-            # Strip all HTML and apply linebreaks filter
+            # Strip all unwanted HTML and apply linebreaks filter, grabbing hashtags on the way
-            content = linebreaks_filter(strip_html(content))
+            parser = FediverseHtmlParser(linebreaks_filter(content), find_hashtags=True)
            content = parser.html
            hashtags = sorted(parser.hashtags) or None
            # Make the Post object
            post = cls.objects.create(
                author=author,
@ -512,12 +507,13 @@ class Post(StatorModel):
    ):
        with transaction.atomic():
            # Strip all HTML and apply linebreaks filter
-            self.content = linebreaks_filter(strip_html(content))
+            parser = FediverseHtmlParser(linebreaks_filter(content))
            self.content = parser.html
            self.hashtags = sorted(parser.hashtags) or None
            self.summary = summary or None
            self.sensitive = bool(summary)
            self.visibility = visibility
            self.edited = timezone.now()
            self.hashtags = Hashtag.hashtags_from_content(content) or None
            self.mentions.set(self.mentions_from_content(content, self.author))
            self.emojis.set(Emoji.emojis_from_content(content, None))
            self.attachments.set(attachments or [])
@ -525,9 +521,9 @@ class Post(StatorModel):
    @classmethod
    def mentions_from_content(cls, content, author) -> set[Identity]:
-        mention_hits = cls.mention_regex.findall(content)
+        mention_hits = FediverseHtmlParser(content, find_mentions=True).mentions
        mentions = set()
-        for precursor, handle in mention_hits:
+        for handle in mention_hits:
            handle = handle.lower()
            if "@" in handle:
                username, domain = handle.split("@", 1)
--- a/activities/views/compose.py
+++ b/activities/views/compose.py
@ -14,7 +14,7 @@ from activities.models import (
    TimelineEvent,
 )
 from core.files import blurhash_image, resize_image
-from core.html import html_to_plaintext
+from core.html import FediverseHtmlParser
 from core.models import Config
 from users.decorators import identity_required
@ -112,7 +112,7 @@ class Compose(FormView):
                {
                    "reply_to": self.reply_to.pk if self.reply_to else "",
                    "visibility": self.post_obj.visibility,
-                    "text": html_to_plaintext(self.post_obj.content),
+                    "text": FediverseHtmlParser(self.post_obj.content).plain_text,
                    "content_warning": self.post_obj.summary,
                }
            )
--- a/core/html.py
+++ b/core/html.py
@ -1,199 +1,309 @@
 import html
 import re
-from functools import partial
+from html.parser import HTMLParser
 import bleach
 import bleach.callbacks
 from bleach.html5lib_shim import Filter
 from bleach.linkifier import LinkifyFilter
 from django.utils.safestring import mark_safe
-url_regex = re.compile(
+
-    r"""\(*  # Match any opening parentheses.
+class FediverseHtmlParser(HTMLParser):
-    \b(?<![@.])(?:https?://(?:(?:\w+:)?\w+@)?)  # http://
+    """
-    ([\w-]+\.)+(?:[\w-]+)(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
+    A custom HTML parser that only allows a certain tag subset and behaviour:
-    (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
+    - br, p tags are passed through
    - a tags are passed through if they're not hashtags or mentions
    - Another set of tags are converted to p
    It also linkifies URLs, mentions, hashtags, and imagifies emoji.
    """
    REWRITE_TO_P = [
        "p",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "blockquote",
        "pre",
        "ul",
        "ol",
    ]
    REWRITE_TO_BR = [
        "br",
        "li",
    ]
    MENTION_REGEX = re.compile(
        r"(^|[^\w\d\-_/])@([\w\d\-_]+(?:@[\w\d\-_\.]+[\w\d\-_]+)?)"
    )
    HASHTAG_REGEX = re.compile(r"\B#([a-zA-Z0-9(_)]+\b)(?!;)")
    EMOJI_REGEX = re.compile(r"\B:([a-zA-Z0-9(_)-]+):\B")
    URL_REGEX = re.compile(
        r"""(\(*  # Match any opening parentheses.
        \b(?<![@.])(?:https?://(?:(?:\w+:)?\w+@)?)  # http://
        (?:[\w-]+\.)+(?:[\w-]+)(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
        (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?)
        # /path/zz (excluding "unsafe" chars from RFC 1738,
        # except for # and ~, which happen in practice)
-    """,
+        """,
-    re.IGNORECASE | re.VERBOSE | re.UNICODE,
+        re.IGNORECASE | re.VERBOSE | re.UNICODE,
 )
 ALLOWED_TAGS = ["br", "p", "a"]
 REWRITTEN_TAGS = [
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "blockquote",
    "pre",
    "ul",
    "ol",
    "li",
 ]
 class MastodonStrictTagFilter(Filter):
    """
    Python equivalent of the Mastodon tag rewriter.
    Clone of https://github.com/mastodon/mastodon/blob/main/lib/sanitize_ext/sanitize_config.rb#L55
    Start/end tokens for any tag in REWRITTEN_TAGS are renamed to `p`, except
    `li`, whose tokens are dropped and replaced by `<br>` separators.
    """
    def __iter__(self):
        # Set once an `li` has closed; a later `li` opening then emits a break
        li_pending_break = False
        break_token = {
            "name": "br",
            "data": {},
            "type": "StartTag",
        }
        for token in Filter.__iter__(self):
            tag_name = token.get("name")
            if tag_name not in REWRITTEN_TAGS:
                yield token
                continue
            token_kind = token["type"]
            if token_kind not in ("StartTag", "EndTag"):
                yield token
                continue
            is_opening = token_kind == "StartTag"
            if tag_name == "li":
                # `li` tokens themselves are never emitted
                if not is_opening:
                    li_pending_break = True
                elif li_pending_break:
                    # Another `li` appeared, so break after the last one
                    yield break_token
                continue
            if not is_opening and tag_name == "ul":
                # The list is over; Mastodon does not break after the last `li`
                li_pending_break = False
            token["name"] = "p"
            yield token
 class UnlinkifyFilter(Filter):
    """
    Forcibly replaces link text with the href.
    This is intended to be used when stripping <a> tags to preserve the link
    location at the expense of the link text.
    """
    def __iter__(self):
        # True while we are inside an <a href=...> whose text is being dropped
        skipping_link_text = False
        for token in Filter.__iter__(self):
            if token.get("name") != "a":
                # Ordinary token: pass it on unless it is discarded link text
                if not skipping_link_text:
                    yield token
                continue
            if token["type"] == "EndTag":
                skipping_link_text = False
                continue
            # An <a> with an href is replaced by the href itself and its
            # contents are thrown away. With a missing or empty href the tag
            # is dropped but the text inside still flows through.
            href = token["data"].get((None, "href"))
            if href:
                yield {"data": href, "type": "Characters"}
                skipping_link_text = True
 def allow_a(tag: str, name: str, value: str):
    """
    Attribute filter for <a> tags: href, title and class are always allowed;
    rel is allowed only when every whitespace-separated value is one of
    nofollow, noopener, noreferrer or tag (defends against e.g. rel=me).
    Every other attribute is rejected.
    """
    if name == "rel":
        permitted = {"nofollow", "noopener", "noreferrer", "tag"}
        return set(value.split()) <= permitted
    return name in ("href", "title", "class")
 def shorten_link_text(attrs, new=False):
    """
    Linkify callback applying Mastodon-style link shortening: when the link
    text (or, failing that, the href) contains a scheme and is longer than
    30 characters, the scheme is removed and only the first 30 characters
    are shown.
    Orig:
        <a>https://social.example.com/a-long/path/2023/01/16/that-should-be-shortened</a>
    Becomes:
        <a>social.example.com/a-long/path</a>
    Mutates and returns attrs.
    """
    href_key = (None, "href")
    class_key = (None, "class")
    label = attrs.get("_text") or attrs.get(href_key)
    if not label or "://" not in label or len(label) <= 30:
        return attrs
    without_scheme = label.split("://", 1)[-1]
    attrs["_text"] = without_scheme[:30]
    if len(without_scheme) > 30:
        # Text was actually truncated, so flag it for styling
        existing_class = attrs.pop(class_key, "")
        attrs[class_key] = " ".join(
            part for part in (existing_class, "ellipsis") if part
        )
    # Add the full URL in to title for easier user inspection
    attrs[(None, "title")] = attrs.get(href_key)
    return attrs
 linkify_callbacks = [bleach.callbacks.nofollow, shorten_link_text]
 def sanitize_html(post_html: str) -> str:
    """
    Cleans HTML with bleach, keeping only ALLOWED_TAGS and REWRITTEN_TAGS
    (other tags are stripped). Attributes on `a` are vetted by allow_a and
    `p` may keep `class`. URLs are linkified and the rewritten tags are then
    converted by MastodonStrictTagFilter. The result is marked safe.
    """
    permitted_tags = ALLOWED_TAGS + REWRITTEN_TAGS
    permitted_attributes = {
        "a": allow_a,
        "p": ["class"],
    }
    linkifier = partial(LinkifyFilter, url_re=url_regex, callbacks=linkify_callbacks)
    cleaner = bleach.Cleaner(
        tags=permitted_tags,
        attributes=permitted_attributes,  # type:ignore
        filters=[linkifier, MastodonStrictTagFilter],
        strip=True,
    )
    cleaned = cleaner.clean(post_html)
    return mark_safe(cleaned)
    def __init__(
        self,
        html: str,
        uri_domain: str | None = None,
        mentions: list | None = None,
        find_mentions: bool = False,
        find_hashtags: bool = False,
        find_emojis: bool = False,
        emoji_domain=None,
    ):
        super().__init__()
        self.uri_domain = uri_domain
        self.emoji_domain = emoji_domain
        self.find_mentions = find_mentions
        self.find_hashtags = find_hashtags
        self.find_emojis = find_emojis
        self.calculate_mentions(mentions)
        self._data_buffer = ""
        self.html_output = ""
        self.text_output = ""
        self.emojis: set[str] = set()
        self.mentions: set[str] = set()
        self.hashtags: set[str] = set()
        self._pending_a: dict | None = None
        self._fresh_p = False
        self.feed(html.replace("\n", ""))
        self.flush_data()
-def strip_html(post_html: str, *, linkify: bool = True) -> str:
+    def calculate_mentions(self, mentions: list | None):
-    """
+        """
-    Strips all tags from the text, then linkifies it.
+        Prepares a set of content that we expect to see mentions look like
-    """
+        (this imp)
-    cleaner = bleach.Cleaner(
+        """
-        tags=[],
+        self.mention_matches: dict[str, str] = {}
-        strip=True,
+        self.mention_aliases: dict[str, str] = {}
-        filters=[partial(LinkifyFilter, url_re=url_regex, callbacks=linkify_callbacks)]
+        for mention in mentions or []:
-        if linkify
+            if self.uri_domain:
-        else [UnlinkifyFilter],
+                url = mention.absolute_profile_uri()
-    )
+            else:
-    return mark_safe(cleaner.clean(post_html))
+                url = str(mention.urls.view)
            if mention.username:
                username = mention.username.lower()
                domain = mention.domain_id.lower()
                self.mention_matches[f"{username}"] = url
                self.mention_matches[f"{username}@{domain}"] = url
    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        if tag in self.REWRITE_TO_P:
            self.flush_data()
            self.html_output += "<p>"
        elif tag in self.REWRITE_TO_BR:
            self.flush_data()
            if not self._fresh_p:
                self.html_output += "<br>"
                self.text_output += "\n"
        elif tag == "a":
            self.flush_data()
            self._pending_a = {"attrs": dict(attrs), "content": ""}
        self._fresh_p = tag in self.REWRITE_TO_P
-def html_to_plaintext(post_html: str) -> str:
+    def handle_endtag(self, tag: str) -> None:
-    """
+        self._fresh_p = False
-    Tries to do the inverse of the linebreaks filter.
+        if tag in self.REWRITE_TO_P:
-    """
+            self.flush_data()
-    # TODO: Handle HTML entities
+            self.html_output += "</p>"
-    # Remove all newlines, then replace br with a newline and /p with two (one comes from bleach)
+            self.text_output += "\n\n"
-    post_html = post_html.replace("\n", "").replace("<br>", "\n").replace("</p>", "\n")
+        elif tag == "a":
-    # Remove all other HTML and return
+            if self._pending_a:
-    cleaner = bleach.Cleaner(tags=["a"], strip=True, filters=[UnlinkifyFilter])
+                href = self._pending_a["attrs"].get("href")
-    return cleaner.clean(post_html).strip()
+                content = self._pending_a["content"].strip()
                # Is it a mention?
                if content.lower().lstrip("@") in self.mention_matches:
                    self.html_output += self.create_mention(content)
                    self.text_output += content
                # Is it a hashtag?
                elif self.HASHTAG_REGEX.match(content):
                    self.html_output += self.create_hashtag(content)
                    self.text_output += content
                elif content:
                    # Shorten the link if we need to
                    self.html_output += self.create_link(href, content)
                    self.text_output += href
                self._pending_a = None
    def handle_data(self, data: str) -> None:
        self._fresh_p = False
        if self._pending_a:
            self._pending_a["content"] += data
        else:
            self._data_buffer += data
    def flush_data(self) -> None:
        """
        We collect data segments until we encounter a tag we care about,
        so we can treat <span>#</span>hashtag as #hashtag
        """
        self.text_output += self._data_buffer
        self.html_output += self.linkify(self._data_buffer)
        self._data_buffer = ""
    def create_link(self, href, content):
        """
        Generates a link, doing optional shortening.
        All return values from this function should be HTML-safe.
        """
        looks_like_link = bool(self.URL_REGEX.match(content))
        if looks_like_link:
            content = content.split("://", 1)[1]
        if looks_like_link and len(content) > 30:
            return f'<a href="{html.escape(href)}" rel="nofollow" class="ellipsis" title="{html.escape(content)}">{html.escape(content[:30])}</a>'
        else:
            return f'<a href="{html.escape(href)}" rel="nofollow">{html.escape(content)}</a>'
    def create_mention(self, handle) -> str:
        """
        Generates a mention link. Handle should have a leading @.
        All return values from this function should be HTML-safe
        """
        handle = handle.lstrip("@")
        if "@" in handle:
            short_handle = handle.split("@", 1)[0]
        else:
            short_handle = handle
        handle_hash = handle.lower()
        short_hash = short_handle.lower()
        self.mentions.add(handle_hash)
        url = self.mention_matches.get(handle_hash)
        if url:
            if short_hash not in self.mention_aliases:
                self.mention_aliases[short_hash] = handle_hash
            elif self.mention_aliases.get(short_hash) != handle_hash:
                short_handle = handle
            return f'<a href="{html.escape(url)}">@{html.escape(short_handle)}</a>'
        else:
            return "@" + html.escape(handle)
    def create_hashtag(self, hashtag) -> str:
        """
        Generates a hashtag link. Hashtag does not need to start with #
        All return values from this function should be HTML-safe
        """
        hashtag = hashtag.lstrip("#")
        self.hashtags.add(hashtag.lower())
        if self.uri_domain:
            return f'<a href="https://{self.uri_domain}/tags/{hashtag.lower()}/">#{hashtag}</a>'
        else:
            return f'<a href="/tags/{hashtag.lower()}/">#{hashtag}</a>'
    def create_emoji(self, shortcode) -> str:
        """
        Generates an emoji <img> tag
        Looks the shortcode up for emoji_domain; a usable match is recorded
        in self.emojis and rendered through its as_html(), anything else is
        returned as the original :shortcode: text.
        All return values from this function should be HTML-safe
        """
        from activities.models import Emoji
        match = Emoji.get_by_domain(shortcode, self.emoji_domain)
        if not match or not match.is_usable:
            return f":{shortcode}:"
        self.emojis.add(shortcode)
        return match.as_html()
    def linkify(self, data):
        """
        Linkifies some content that is plaintext.
        Handles URLs first, then mentions. Note that this takes great care to
        keep track of what is HTML and what needs to be escaped.
        """
        # Split the string by the URL regex so we know what to escape and what
        # not to escape.
        bits = self.URL_REGEX.split(data)
        result = ""
        # Even indices are data we should pass though, odd indices are links
        for i, bit in enumerate(bits):
            # A link!
            if i % 2 == 1:
                result += self.create_link(bit, bit)
            # Not a link
            elif self.mention_matches or self.find_mentions:
                result += self.linkify_mentions(bit)
            elif self.find_hashtags:
                result += self.linkify_hashtags(bit)
            elif self.find_emojis:
                result += self.linkify_emoji(bit)
            else:
                result += html.escape(bit)
        return result
    def linkify_mentions(self, data):
        """
        Linkifies mentions
        """
        bits = self.MENTION_REGEX.split(data)
        result = ""
        for i, bit in enumerate(bits):
            # Mention content
            if i % 3 == 2:
                result += self.create_mention(bit)
            # Not part of a mention (0) or mention preamble (1)
            elif self.find_hashtags:
                result += self.linkify_hashtags(bit)
            elif self.find_emojis:
                result += self.linkify_emoji(bit)
            else:
                result += html.escape(bit)
        return result
    def linkify_hashtags(self, data):
        """
        Linkifies hashtags
        """
        bits = self.HASHTAG_REGEX.split(data)
        result = ""
        for i, bit in enumerate(bits):
            # Not part of a hashtag
            if i % 2 == 0:
                if self.find_emojis:
                    result += self.linkify_emoji(bit)
                else:
                    result += html.escape(bit)
            # Hashtag content
            else:
                result += self.create_hashtag(bit)
        return result
    def linkify_emoji(self, data):
        """
        Linkifies emoji
        """
        bits = self.EMOJI_REGEX.split(data)
        result = ""
        for i, bit in enumerate(bits):
            # Not part of an emoji
            if i % 2 == 0:
                result += html.escape(bit)
            # Emoji content
            else:
                result += self.create_emoji(bit)
        return result
    @property
    def html(self):
        return self.html_output.strip()
    @property
    def plain_text(self):
        return self.text_output.strip()
 class ContentRenderer:
@ -212,33 +322,30 @@ class ContentRenderer:
        """
        if not html:
            return ""
-        html = sanitize_html(html)
+        parser = FediverseHtmlParser(
-        html = self.linkify_mentions(html, post=post)
+            html,
-        html = self.linkify_hashtags(html, identity=post.author)
+            mentions=post.mentions.all(),
-        if self.local:
+            uri_domain=(None if self.local else post.author.domain.uri_domain),
-            html = self.imageify_emojis(
+            find_hashtags=True,
-                html,
+            find_emojis=True,
-                identity=post.author,
+            emoji_domain=post.author.domain,
-                emojis=post.emojis.all(),
+        )
-            )
+        return mark_safe(parser.html)
        html = self.remove_extra_newlines(html)
        return mark_safe(html)
-    def render_identity_summary(self, html: str, identity, strip: bool = False) -> str:
+    def render_identity_summary(self, html: str, identity) -> str:
        """
        Given identity summary HTML, normalises it and renders it for presentation.
        """
        if not html:
            return ""
-        if strip:
+        parser = FediverseHtmlParser(
-            html = strip_html(html)
+            html,
-        else:
+            uri_domain=(None if self.local else identity.domain.uri_domain),
-            html = sanitize_html(html)
+            find_hashtags=True,
-        html = self.linkify_hashtags(html, identity=identity)
+            find_emojis=True,
-        if self.local:
+            emoji_domain=identity.domain,
-            html = self.imageify_emojis(html, identity=identity)
+        )
-        html = self.remove_extra_newlines(html)
+        return mark_safe(parser.html)
        return mark_safe(html)
    def render_identity_data(self, html: str, identity, strip: bool = False) -> str:
        """
@ -246,117 +353,14 @@ class ContentRenderer:
        """
        if not html:
            return ""
-        if strip:
+        parser = FediverseHtmlParser(
-            html = strip_html(html)
+            html,
-        else:
+            uri_domain=(None if self.local else identity.domain.uri_domain),
-            html = sanitize_html(html)
+            find_hashtags=False,
-        if self.local:
+            find_emojis=True,
-            html = self.imageify_emojis(html, identity=identity)
+            emoji_domain=identity.domain,
        html = self.remove_extra_newlines(html)
        return mark_safe(html)
    def linkify_mentions(self, html: str, post) -> str:
        """
        Links mentions _in the context of the post_ - as in, using the mentions
        property as the only source (as we might be doing this without other
        DB access allowed).
        Handles that are not among the post's mentions are left untouched.
        """
        from activities.models import Post
        profile_urls = {}
        for identity in post.mentions.all():
            if self.local:
                profile_url = str(identity.urls.view)
            else:
                profile_url = identity.absolute_profile_uri()
            # Might not have fetched it (yet)
            if not identity.username:
                continue
            local_part = identity.username.lower()
            profile_urls[local_part] = profile_url
            profile_urls[f"{local_part}@{identity.domain_id}"] = profile_url
        # Which full handle first claimed each short username
        first_owner: dict[str, str] = {}
        def replacer(match):
            precursor, handle = match.group(1), match.group(2)
            full_key = handle.lower()
            if full_key not in profile_urls:
                return match.group()
            # Everything before the first "@" (the whole handle if none)
            shown = handle.partition("@")[0]
            if first_owner.setdefault(shown.lower(), full_key) != full_key:
                # Short name already used for someone else: show it in full
                shown = handle
            return f'{precursor}<a href="{profile_urls[full_key]}">@{shown}</a>'
        return Post.mention_regex.sub(replacer, html)
    def linkify_hashtags(self, html, identity) -> str:
        from activities.models import Hashtag
        def replacer(attrs, new=False):
            # See if the text in this link looks like a hashtag
            if not Hashtag.hashtag_regex.match(attrs.get("_text", "")):
                return attrs
            hashtag = attrs["_text"].strip().lstrip("#")
            attrs[None, "class"] = "hashtag"
            if (None, "rel") in attrs:
                del attrs[None, "rel"]
            if self.local:
                attrs[None, "href"] = f"/tags/{hashtag.lower()}/"
            else:
                attrs[
                    None, "href"
                ] = f"https://{identity.domain.uri_domain}/tags/{hashtag.lower()}/"
            return attrs
        linker = bleach.linkifier.Linker(
            url_re=Hashtag.hashtag_regex, callbacks=[replacer]
        )
-        return linker.linkify(html)
+        if strip:
-
+            return mark_safe(parser.html)
-    def imageify_emojis(
+        else:
-        self, html: str, identity, include_local: bool = True, emojis=None
+            return mark_safe(parser.html)
    ):
        """
        Find :emoji: in content and convert to <img>. If include_local is True,
        the local emoji will be used as a fallback for any shortcodes not defined
        by emojis.
        """
        from activities.models import Emoji
        # If precached emojis were passed, prep them
        cached_emojis = {}
        if emojis:
            for emoji in emojis:
                cached_emojis[emoji.shortcode] = emoji
        def replacer(match):
            shortcode = match.group(1).lower()
            if shortcode in cached_emojis:
                return cached_emojis[shortcode].as_html()
            emoji = Emoji.get_by_domain(shortcode, identity.domain)
            if emoji and emoji.is_usable:
                return emoji.as_html()
            elif not emoji and include_local:
                emoji = Emoji.get_by_domain(shortcode, None)
                if emoji:
                    return emoji.as_html()
            return match.group()
        return Emoji.emoji_regex.sub(replacer, html)
    def remove_extra_newlines(self, html: str) -> str:
        """
        Returns html with every newline character removed, since some
        clients are sensitive to extra newlines even though it's HTML.
        """
        # TODO: More intelligent way to strip these?
        segments = html.split("\n")
        return "".join(segments)
--- a/requirements.txt
+++ b/requirements.txt
@ -1,4 +1,3 @@
 bleach~=5.0.1
 blurhash-python~=1.1.3
 cachetools~=5.2.0
 cryptography~=39.0
--- a/tests/activities/models/test_hashtag.py
+++ b/tests/activities/models/test_hashtag.py
@ -1,44 +0,0 @@
 from activities.models import Hashtag
 from core.html import ContentRenderer
 def test_hashtag_from_content():
    """
    Checks which hashtags Hashtag.hashtags_from_content extracts from text
    and HTML, and the order they are returned in.
    """
    # A bare hashtag is found; a "#" glued to a preceding letter is not.
    assert Hashtag.hashtags_from_content("#hashtag") == ["hashtag"]
    assert Hashtag.hashtags_from_content("a#hashtag") == []
    # The expected list is alphabetical, not in order of appearance.
    assert Hashtag.hashtags_from_content("Text #with #hashtag in it") == [
        "hashtag",
        "with",
    ]
    # Trailing punctuation is not part of the tag.
    assert Hashtag.hashtags_from_content("#hashtag.") == ["hashtag"]
    # "# two" (space after the hash) is not a tag, while "##three" yields "three".
    assert Hashtag.hashtags_from_content("More text\n#one # two ##three #hashtag!") == [
        "hashtag",
        "one",
        "three",
    ]
    # The "#32" inside a numeric HTML entity must not be reported as a tag.
    assert Hashtag.hashtags_from_content("my #html loves &#32; entities") == ["html"]
    # A "#" wrapped in its own element still combines with the text after it.
    assert Hashtag.hashtags_from_content("<span class='hash'>#</span>tag") == ["tag"]
 def test_linkify_hashtag():
    """
    Checks that ContentRenderer.linkify_hashtags wraps hashtags in links to
    /tags/<lowercased tag>/ while leaving existing links and their URL
    fragments untouched.
    """
    # Shorthand for a local-mode renderer; None is passed as the second
    # argument (presumably the identity - confirm against linkify_hashtags).
    linkify = lambda html: ContentRenderer(local=True).linkify_hashtags(html, None)
    # A "#" followed by a space is not a hashtag.
    assert linkify("# hashtag") == "# hashtag"
    # A "#anchor" fragment inside an existing href must not be linkified.
    assert (
        linkify('<a href="/url/with#anchor">Text</a>')
        == '<a href="/url/with#anchor">Text</a>'
    )
    # The link target is lowercased but the visible text keeps its case.
    assert (
        linkify("#HashTag") == '<a href="/tags/hashtag/" class="hashtag">#HashTag</a>'
    )
    # Multi-line content: hashtags, punctuation, a pre-existing link and
    # several case variants of the same tag all in one input.
    assert (
        linkify(
            """A longer text #bigContent
 with #tags, linebreaks, and
 maybe a few <a href="https://awesome.sauce/about#spicy">links</a>
 #allTheTags #AllTheTags #ALLTHETAGS"""
        )
        == """A longer text <a href="/tags/bigcontent/" class="hashtag">#bigContent</a>
 with <a href="/tags/tags/" class="hashtag">#tags</a>, linebreaks, and
 maybe a few <a href="https://awesome.sauce/about#spicy">links</a>
 <a href="/tags/allthetags/" class="hashtag">#allTheTags</a> <a href="/tags/allthetags/" class="hashtag">#AllTheTags</a> <a href="/tags/allthetags/" class="hashtag">#ALLTHETAGS</a>"""
    )
--- a/tests/api/test_statuses.py
+++ b/tests/api/test_statuses.py
@ -1,5 +1,7 @@
 import pytest
 from activities.models import Post
@pytest.mark.django_db
 def test_post_status(api_token, identity, client):
@ -15,3 +17,44 @@ def test_post_status(api_token, identity, client):
    ).json()
    assert response["content"] == "<p>Hello, world!</p>"
    assert response["visibility"] == "unlisted"
@pytest.mark.django_db
 def test_mention_format(api_token, identity, remote_identity, client):
    """
    Ensures mentions work, and only have one link around them.

    Covers both a post created locally through the API and a remote post
    whose stored content already contains a mention link.
    """
    # Make a local post and check it
    response = client.post(
        "/api/v1/statuses",
        HTTP_AUTHORIZATION=f"Bearer {api_token.token}",
        HTTP_ACCEPT="application/json",
        content_type="application/json",
        data={
            "status": "Hello, @test!",
            "visibility": "unlisted",
        },
    ).json()
    # The plain "@test" must come back wrapped in exactly one link, with the
    # trailing "!" left outside of it.
    assert (
        response["content"]
        == '<p>Hello, <a href="https://example.com/@test/">@test</a>!</p>'
    )
    assert response["visibility"] == "unlisted"
    # Make a remote post and check it
    # The stored content carries class and rel attributes on the mention link.
    post = Post.objects.create(
        local=False,
        author=remote_identity,
        content='<p>Hey <a href="https://example.com/@test/" class="u-url mention" rel="nofollow">@test</a></p>',
        object_uri="https://remote.test/status/12345",
    )
    # Attach the mentioned identity to the post before fetching it.
    post.mentions.add(identity)
    response = client.get(
        f"/api/v1/statuses/{post.id}",
        HTTP_AUTHORIZATION=f"Bearer {api_token.token}",
        HTTP_ACCEPT="application/json",
        content_type="application/json",
    ).json()
    # The expected link has only an href: no extra attributes, no nested link.
    assert (
        response["text"] == '<p>Hey <a href="https://example.com/@test/">@test</a></p>'
    )
--- a/tests/core/test_html.py
+++ b/tests/core/test_html.py
@ -1,155 +1,117 @@
 from unittest.mock import Mock
 import pytest
-from core.html import ContentRenderer, html_to_plaintext, sanitize_html
+from core.html import FediverseHtmlParser
 def test_html_to_plaintext():
    """
    Checks how html_to_plaintext maps paragraphs, line breaks and links
    onto plain text.
    """
    # Paragraph tags are dropped; <br> becomes a single newline.
    assert html_to_plaintext("<p>Hi!</p>") == "Hi!"
    assert html_to_plaintext("<p>Hi!<br>There</p>") == "Hi!\nThere"
    # Consecutive paragraphs are separated by one blank line.
    assert (
        html_to_plaintext("<p>Hi!</p>\n\n<p>How are you?</p>") == "Hi!\n\nHow are you?"
    )
    # The space after <br> is kept, and adjacent paragraphs with no
    # whitespace between them still get a blank line.
    assert (
        html_to_plaintext("<p>Hi!</p>\n\n<p>How are<br> you?</p><p>today</p>")
        == "Hi!\n\nHow are\n you?\n\ntoday"
    )
    # A link with an href is replaced by that href (its inner markup is
    # discarded); a link with an empty or missing href keeps its text.
    assert (
        html_to_plaintext(
            '<p><a href="https://fedi.takahe.social/with/a/long/path">'
            '<b>The</b> <img src="takahe.png"> Link</a> '
            '<a href="">Empty href</a> '
            "<a>Empty A</a></p>"
        )
        == "https://fedi.takahe.social/with/a/long/path Empty href Empty A"
    )
 def test_sanitize_post():
    """
    Checks that sanitize_html passes simple markup through unchanged and
    only linkifies URLs that carry a protocol prefix.
    """
    # Plain paragraphs and already-escaped entities are left untouched.
    assert sanitize_html("<p>Hello!</p>") == "<p>Hello!</p>"
    assert sanitize_html("<p>It&#39;s great</p>") == "<p>It&#39;s great</p>"
    # Note that we only want to linkify things with protocol prefixes to prevent
    # too many false positives.
    assert sanitize_html("<p>test.com</p>") == "<p>test.com</p>"
    # A URL with a protocol becomes a rel="nofollow" link.
    assert (
        sanitize_html("<p>https://test.com</p>")
        == '<p><a href="https://test.com" rel="nofollow">https://test.com</a></p>'
    )
    # A full handle is not treated as a link even though it contains a domain.
    assert (
        sanitize_html("<p>@someone@subdomain.some-domain.com</p>")
        == "<p>@someone@subdomain.some-domain.com</p>"
    )
 def test_shorten_url():
    """
    Checks that sanitize_html shortens the visible text of long bare URLs
    but leaves author-written link text alone.
    """
    full_url = (
        "https://social.example.com/a-long/path/2023/01/16/that-should-be-shortened"
    )
    # A bare long URL is linkified with truncated visible text, an "ellipsis"
    # class, and the full URL kept in both href and title.
    assert (
        sanitize_html(f"<p>{full_url}</p>")
        == f'<p><a href="{full_url}" rel="nofollow" class="ellipsis" title="{full_url}">social.example.com/a-long/path</a></p>'
    )
    # An existing link whose text is not a URL only gains rel="nofollow".
    assert (
        sanitize_html(
            f'<p><a href="{full_url}">This is a long link text, but cannot be shortened as a URL</a></p>'
        )
        == f'<p><a href="{full_url}" rel="nofollow">This is a long link text, but cannot be shortened as a URL</a></p>'
    )
@pytest.mark.django_db
-def test_link_preservation():
+def test_parser(identity):
    """
-    We want to:
+    Validates the HtmlParser in its various output modes
     - Preserve incoming links from other servers
     - Linkify mentions and hashtags
     - Not have these all step on each other!
    """
    renderer = ContentRenderer(local=True)
    fake_mention = Mock()
    fake_mention.username = "andrew"
    fake_mention.domain_id = "aeracode.org"
    fake_mention.urls.view = "/@andrew@aeracode.org/"
    fake_post = Mock()
    fake_post.mentions.all.return_value = [fake_mention]
    fake_post.author.domain.uri_domain = "example.com"
    fake_post.emojis.all.return_value = []
    # Basic tag allowance
    parser = FediverseHtmlParser("<p>Hello!</p><script></script>")
    assert parser.html == "<p>Hello!</p>"
    assert parser.plain_text == "Hello!"
    # Newline erasure
    parser = FediverseHtmlParser("<p>Hi!</p>\n\n<p>How are you?</p>")
    assert parser.html == "<p>Hi!</p><p>How are you?</p>"
    assert parser.plain_text == "Hi!\n\nHow are you?"
    # Trying to be evil
    parser = FediverseHtmlParser("<scri<span></span>pt>")
    assert "<scr" not in parser.html
    parser = FediverseHtmlParser("<scri #hashtag pt>")
    assert "<scr" not in parser.html
    # Entities are escaped
    parser = FediverseHtmlParser("<p>It&#39;s great</p>", find_hashtags=True)
    assert parser.html == "<p>It&#x27;s great</p>"
    assert parser.plain_text == "It's great"
    assert parser.hashtags == set()
    # Linkify works, but only with protocol prefixes
    parser = FediverseHtmlParser("<p>test.com</p>")
    assert parser.html == "<p>test.com</p>"
    assert parser.plain_text == "test.com"
    parser = FediverseHtmlParser("<p>https://test.com</p>")
    assert (
-        renderer.render_post(
+        parser.html == '<p><a href="https://test.com" rel="nofollow">test.com</a></p>'
            'Hello @andrew, I want to link to this <span>#</span>hashtag: <a href="http://example.com/@andrew/#notahashtag">here</a> and rewrite <a href="https://example.com/tags/thishashtag/">#thishashtag</a>',
            fake_post,
        )
        == 'Hello <a href="/@andrew@aeracode.org/">@andrew</a>, I want to link to this <a href="/tags/hashtag/" class="hashtag">#hashtag</a>: <a href="http://example.com/@andrew/#notahashtag" rel="nofollow">here</a> and rewrite <a href="/tags/thishashtag/" class="hashtag">#thishashtag</a>'
    )
    assert parser.plain_text == "https://test.com"
-
+    # Links are preserved
-@pytest.mark.django_db
+    parser = FediverseHtmlParser("<a href='https://takahe.social'>takahe social</a>")
 def test_list_rendering():
    """
    We want to:
     - Preserve incoming links from other servers
     - Linkify mentions and hashtags
     - Not have these all step on each other!
    """
    renderer = ContentRenderer(local=True)
    fake_mention = Mock()
    fake_mention.username = "andrew"
    fake_mention.domain_id = "aeracode.org"
    fake_mention.urls.view = "/@andrew@aeracode.org/"
    fake_post = Mock()
    fake_post.mentions.all.return_value = [fake_mention]
    fake_post.author.domain.uri_domain = "example.com"
    fake_post.emojis.all.return_value = []
    assert (
-        renderer.render_post(
+        parser.html
-            "<p>Ok. The roster so far is:</p><ul><li>Infosec.exchange (mastodon)</li><li>pixel.Infosec.exchange (pixelfed)</li><li>video.Infosec.exchange (peertube)</li><li>relay.Infosec.exchange (activitypub relay)</li><li>risky.af (alt mastodon)</li></ul><p>What’s next?  I think I promised some people here bookwyrm</p>",
+        == '<a href="https://takahe.social" rel="nofollow">takahe social</a>'
-            fake_post,
+    )
-        )
+    assert parser.plain_text == "https://takahe.social"
-        == "<p>Ok. The roster so far is:</p><p>Infosec.exchange (mastodon)<br>pixel.Infosec.exchange (pixelfed)<br>video.Infosec.exchange (peertube)<br>relay.Infosec.exchange (activitypub relay)<br>risky.af (alt mastodon)</p><p>What’s next?  I think I promised some people here bookwyrm</p>"
+
    # Very long links are shortened
    full_url = "https://social.example.com/a-long/path/that-should-be-shortened"
    parser = FediverseHtmlParser(f"<p>{full_url}</p>")
    assert (
        parser.html
        == f'<p><a href="{full_url}" rel="nofollow" class="ellipsis" title="{full_url.removeprefix("https://")}">social.example.com/a-long/path</a></p>'
    )
    assert (
        parser.plain_text
        == "https://social.example.com/a-long/path/that-should-be-shortened"
    )
-
+    # Make sure things that look like mentions are left alone with no mentions supplied.
-@pytest.mark.django_db
+    parser = FediverseHtmlParser(
-def test_link_mixcase_mentions():
+        "<p>@test@example.com</p>",
-    renderer = ContentRenderer(local=True)
+        find_mentions=True,
-    fake_mention = Mock()
+        find_hashtags=True,
-    fake_mention.username = "Manfre"
+        find_emojis=True,
    fake_mention.domain_id = "manfre.net"
    fake_mention.urls.view = "/@Manfre@manfre.net/"
    fake_mention2 = Mock()
    fake_mention2.username = "manfre"
    fake_mention2.domain_id = "takahe.social"
    fake_mention2.urls.view = "https://takahe.social/@manfre@takahe.social/"
    unfetched_mention = Mock()
    unfetched_mention.username = None
    unfetched_mention.domain_id = None
    unfetched_mention.urls.view = "/None@None/"
    fake_post = Mock()
    fake_post.mentions.all.return_value = [
        fake_mention,
        fake_mention2,
        unfetched_mention,
    ]
    fake_post.author.domain.uri_domain = "example.com"
    fake_post.emojis.all.return_value = []
    assert renderer.render_post(
        "@Manfre@manfre.net @mAnFrE@takahe.social @manfre@manfre.net @unfetched@manfre.net",
        fake_post,
    ) == (
        '<a href="/@Manfre@manfre.net/">@Manfre</a> '
        '<a href="https://takahe.social/@manfre@takahe.social/">@mAnFrE@takahe.social</a> '
        '<a href="/@Manfre@manfre.net/">@manfre</a> '
        "@unfetched@manfre.net"
    )
    assert parser.html == "<p>@test@example.com</p>"
    assert parser.plain_text == "@test@example.com"
    assert parser.mentions == {"test@example.com"}
    # Make sure mentions work when there is a mention supplied
    parser = FediverseHtmlParser(
        "<p>@test@example.com</p>",
        mentions=[identity],
        find_hashtags=True,
        find_emojis=True,
    )
    assert parser.html == '<p><a href="/@test@example.com/">@test</a></p>'
    assert parser.plain_text == "@test@example.com"
    assert parser.mentions == {"test@example.com"}
    # Ensure mentions are case insensitive
    parser = FediverseHtmlParser(
        "<p>@TeSt@ExamPle.com</p>",
        mentions=[identity],
        find_hashtags=True,
        find_emojis=True,
    )
    assert parser.html == '<p><a href="/@test@example.com/">@TeSt</a></p>'
    assert parser.plain_text == "@TeSt@ExamPle.com"
    assert parser.mentions == {"test@example.com"}
    # Ensure hashtags are linked, even through spans, but not within hrefs
    parser = FediverseHtmlParser(
        '<a href="http://example.com#notahashtag">something</a> <span>#</span>hashtag <a href="https://example.com/tags/hashtagtwo/">#hashtagtwo</a>',
        find_hashtags=True,
        find_emojis=True,
    )
    assert (
        parser.html
        == '<a href="http://example.com#notahashtag" rel="nofollow">something</a> <a href="/tags/hashtag/">#hashtag</a> <a href="/tags/hashtagtwo/">#hashtagtwo</a>'
    )
    assert parser.plain_text == "http://example.com#notahashtag #hashtag #hashtagtwo"
    assert parser.hashtags == {"hashtag", "hashtagtwo"}
    # Ensure lists are rendered reasonably
    parser = FediverseHtmlParser(
        "<p>List:</p><ul><li>One</li><li>Two</li><li>Three</li></ul><p>End!</p>",
        find_hashtags=True,
        find_emojis=True,
    )
    assert parser.html == "<p>List:</p><p>One<br>Two<br>Three</p><p>End!</p>"
    assert parser.plain_text == "List:\n\nOne\nTwo\nThree\n\nEnd!"
--- a/users/models/identity.py
+++ b/users/models/identity.py
@ -13,7 +13,7 @@ from django.utils.functional import lazy
 from lxml import etree
 from core.exceptions import ActorMismatchError, capture_message
-from core.html import ContentRenderer, html_to_plaintext, strip_html
+from core.html import ContentRenderer, FediverseHtmlParser
 from core.ld import (
    canonicalise,
    format_ld_date,
@ -530,8 +530,8 @@ class Identity(StatorModel):
            response["attachment"] = [
                {
                    "type": "http://schema.org#PropertyValue",
-                    "name": strip_html(item["name"], linkify=False),
+                    "name": FediverseHtmlParser(item["name"]).plain_text,
-                    "value": strip_html(item["value"]),
+                    "value": FediverseHtmlParser(item["value"]).html,
                }
                for item in self.metadata
            ]
@ -781,7 +781,9 @@ class Identity(StatorModel):
                self.metadata.append(
                    {
                        "name": attachment.get("name"),
-                        "value": strip_html(attachment.get("http://schema.org#value")),
+                        "value": FediverseHtmlParser(
                            attachment.get("http://schema.org#value")
                        ).html,
                    }
                )
        # Now go do webfinger with that info to see if we can get a canonical domain
@ -903,12 +905,14 @@ class Identity(StatorModel):
                Post.Visibilities.mentioned: "direct",
            }
            result["source"] = {
-                "note": html_to_plaintext(self.summary) if self.summary else "",
+                "note": FediverseHtmlParser(self.summary).plain_text
                if self.summary
                else "",
                "fields": (
                    [
                        {
                            "name": m["name"],
-                            "value": strip_html(m["value"], linkify=False),
+                            "value": FediverseHtmlParser(m["value"]).plain_text,
                            "verified_at": None,
                        }
                        for m in self.metadata
--- a/users/services/identity.py
+++ b/users/services/identity.py
@ -3,7 +3,7 @@ from django.template.defaultfilters import linebreaks_filter
 from activities.models import FanOut
 from core.files import resize_image
-from core.html import strip_html
+from core.html import FediverseHtmlParser
 from users.models import (
    Block,
    BlockStates,
@ -211,7 +211,7 @@ class IdentityService:
        Safely sets a summary and turns linebreaks into HTML
        """
        if summary:
-            self.identity.summary = linebreaks_filter(strip_html(summary))
+            self.identity.summary = FediverseHtmlParser(linebreaks_filter(summary)).html
        else:
            self.identity.summary = None
        self.identity.save()
--- a/users/views/settings/profile.py
+++ b/users/views/settings/profile.py
@ -4,7 +4,7 @@ from django.shortcuts import redirect
 from django.utils.decorators import method_decorator
 from django.views.generic import FormView
-from core.html import html_to_plaintext
+from core.html import FediverseHtmlParser
 from core.models.config import Config
 from users.decorators import identity_required
 from users.models import IdentityStates
@ -65,7 +65,11 @@ class ProfilePage(FormView):
        identity = self.request.identity
        return {
            "name": identity.name,
-            "summary": html_to_plaintext(identity.summary) if identity.summary else "",
+            "summary": (
                FediverseHtmlParser(identity.summary).plain_text
                if identity.summary
                else ""
            ),
            "icon": identity.icon and identity.icon.url,
            "image": identity.image and identity.image.url,
            "discoverable": identity.discoverable,