Match sanitizing for posts to Mastodon (#422)

Creates a filter for REWRITTEN_TAGS that converts them to `p` rather than ripping them out entirely, and formats `ul` as a break-separated list. Both changes align sanitization with Mastodon's "strict" sanitizer (https://github.com/mastodon/mastodon/blob/main/lib/sanitize_ext/sanitize_config.rb#L73). I don't love the complexity of the Filter, but Bleach doesn't give us great options to work with. The code operates within an iterator without the useful "sibling" methods that Ruby's equivalent has. Also, Bleach runs filters _after_ sanitizing (unlike Ruby's, which runs them before), so we have to pass all the elements through the sanitizer, then rewrite them after the fact.
2023-01-15 21:32:04 -08:00 · 2023-01-15 21:32:04 -08:00 · cfe18932b8
parent b721833b4f
commit cfe18932b8
2 changed files with 90 additions and 2 deletions
--- a/core/html.py
+++ b/core/html.py
@ -2,6 +2,7 @@ import re
 from functools import partial

 import bleach
+from bleach.html5lib_shim import Filter
 from bleach.linkifier import LinkifyFilter
 from django.utils.safestring import mark_safe

@ -16,6 +17,66 @@ url_regex = re.compile(
    re.IGNORECASE | re.VERBOSE | re.UNICODE,
 )

+# Tags permitted in the sanitized output
+ALLOWED_TAGS = ["br", "p", "a"]
+# Tags that are let through the sanitizer only so that MastodonStrictTagFilter
+# can rewrite them afterwards (to `p`, or to break-separated text for `li`)
+REWRITTEN_TAGS = [
+    "h1",
+    "h2",
+    "h3",
+    "h4",
+    "h5",
+    "h6",
+    "blockquote",
+    "pre",
+    "ul",
+    "ol",
+    "li",
+]
+
+
+class MastodonStrictTagFilter(Filter):
+    """
+    Implements Python equivalent of Mastodon tag rewriter
+
+    Clone of https://github.com/mastodon/mastodon/blob/main/lib/sanitize_ext/sanitize_config.rb#L55
+
+    Broadly this replaces all REWRITTEN_TAGS with `p`, except for list items:
+    `li` tags themselves are dropped and a `<br>` is emitted between
+    consecutive items, so a `ul`/`ol` becomes one `p` holding a
+    break-separated list.
+    """
+
+    def __iter__(self):
+        # Set once an `li` has closed inside the current list. The break is only
+        # emitted when another `li` opens, so the last item never gets one.
+        li_pending_break = False
+        break_token = {
+            "name": "br",
+            "data": {},
+            "type": "StartTag",
+        }
+
+        for token in Filter.__iter__(self):
+            # Anything that is not a start/end tag for one of REWRITTEN_TAGS
+            # passes through untouched
+            if token.get("name") not in REWRITTEN_TAGS or token["type"] not in [
+                "StartTag",
+                "EndTag",
+            ]:
+                yield token
+                continue
+
+            if token["type"] == "StartTag":
+                if token["name"] == "li":
+                    if li_pending_break:
+                        # Another `li` appeared, so break after the last one
+                        yield break_token
+                    continue
+                token["name"] = "p"
+            elif token["type"] == "EndTag":
+                if token["name"] == "li":
+                    # Track that an `li` closed so we know a break should be considered
+                    li_pending_break = True
+                    continue
+                if token["name"] in ("ul", "ol"):
+                    # The list is over: drop the pending break so that neither the
+                    # last `li` of this list nor the first `li` of a later list
+                    # gets a stray `<br>`. Both list kinds need this, since `ol`
+                    # items go through the same `li` handling as `ul` items.
+                    li_pending_break = False
+                token["name"] = "p"
+
+            yield token
+

 def allow_a(tag: str, name: str, value: str):
    if name in ["href", "title", "class"]:
@ -34,12 +95,12 @@ def sanitize_html(post_html: str) -> str:
    Only allows a, br, p and span tags, and class attributes.
    """
    cleaner = bleach.Cleaner(
-        tags=["br", "p", "a"],
+        tags=ALLOWED_TAGS + REWRITTEN_TAGS,
        attributes={  # type:ignore
            "a": allow_a,
            "p": ["class"],
        },
-        filters=[partial(LinkifyFilter, url_re=url_regex)],
+        filters=[partial(LinkifyFilter, url_re=url_regex), MastodonStrictTagFilter],
        strip=True,
    )
    return mark_safe(cleaner.clean(post_html))
--- a/tests/core/test_html.py
+++ b/tests/core/test_html.py
@ -64,6 +64,33 @@ def test_link_preservation():
    )


+@pytest.mark.django_db
+def test_list_rendering():
+    """
+    Checks how a `ul` list in post HTML is rendered: the expected output
+    has the list as a single `p` with its items separated by `br`, and the
+    paragraphs around the list left unchanged.
+    """
+    renderer = ContentRenderer(local=True)
+    fake_mention = Mock()
+    fake_mention.username = "andrew"
+    fake_mention.domain_id = "aeracode.org"
+    fake_mention.urls.view = "/@andrew@aeracode.org/"
+    fake_post = Mock()
+    fake_post.mentions.all.return_value = [fake_mention]
+    fake_post.author.domain.uri_domain = "example.com"
+    fake_post.emojis.all.return_value = []
+
+    assert (
+        renderer.render_post(
+            "<p>Ok. The roster so far is:</p><ul><li>Infosec.exchange (mastodon)</li><li>pixel.Infosec.exchange (pixelfed)</li><li>video.Infosec.exchange (peertube)</li><li>relay.Infosec.exchange (activitypub relay)</li><li>risky.af (alt mastodon)</li></ul><p>What’s next?  I think I promised some people here bookwyrm</p>",
+            fake_post,
+        )
+        == "<p>Ok. The roster so far is:</p><p>Infosec.exchange (mastodon)<br>pixel.Infosec.exchange (pixelfed)<br>video.Infosec.exchange (peertube)<br>relay.Infosec.exchange (activitypub relay)<br>risky.af (alt mastodon)</p><p>What’s next?  I think I promised some people here bookwyrm</p>"
+    )
+
+
@pytest.mark.django_db
 def test_link_mixcase_mentions():
    renderer = ContentRenderer(local=True)