From a6922cb9d6b98f958a230636c489fd9f4c96bd5b Mon Sep 17 00:00:00 2001
From: Andrew Godwin
Date: Sun, 29 Jan 2023 17:46:22 -0700
Subject: [PATCH] Move to a new HTML parser/stripper
This removes the use of the EOL'd Bleach, and also integrates hashtag,
mention and emoji searching into one single place.
---
.pre-commit-config.yaml | 8 +-
activities/admin.py | 9 -
activities/models/emoji.py | 9 +-
activities/models/hashtag.py | 11 -
activities/models/post.py | 24 +-
activities/views/compose.py | 4 +-
core/html.py | 642 ++++++++++++------------
requirements.txt | 1 -
tests/activities/models/test_hashtag.py | 44 --
tests/api/test_statuses.py | 43 ++
tests/core/test_html.py | 242 ++++-----
users/models/identity.py | 16 +-
users/services/identity.py | 4 +-
users/views/settings/profile.py | 8 +-
14 files changed, 503 insertions(+), 562 deletions(-)
delete mode 100644 tests/activities/models/test_hashtag.py
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b2392b9..9fc237b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -48,13 +48,7 @@ repos:
- id: mypy
exclude: "^tests/"
additional_dependencies:
- [
- types-pyopenssl,
- types-bleach,
- types-mock,
- types-cachetools,
- types-python-dateutil,
- ]
+ [types-pyopenssl, types-mock, types-cachetools, types-python-dateutil]
- repo: https://github.com/rtts/djhtml
rev: v1.5.2
diff --git a/activities/admin.py b/activities/admin.py
index f3444ed..24ef6e5 100644
--- a/activities/admin.py
+++ b/activities/admin.py
@@ -1,4 +1,3 @@
-from asgiref.sync import async_to_sync
from django.contrib import admin
from django.db import models
from django.utils.safestring import mark_safe
@@ -165,7 +164,6 @@ class PostAdmin(admin.ModelAdmin):
list_filter = ("type", "local", "visibility", "state", "created")
raw_id_fields = ["emojis"]
autocomplete_fields = ["to", "mentions", "author"]
- actions = ["reparse_hashtags"]
search_fields = ["content", "search_handle", "search_service_handle"]
inlines = [PostAttachmentInline]
readonly_fields = ["created", "updated", "state_changed", "object_json"]
@@ -183,13 +181,6 @@ class PostAdmin(admin.ModelAdmin):
)
return super().get_search_results(request, queryset, search_term)
- @admin.action(description="Reprocess content for hashtags")
- def reparse_hashtags(self, request, queryset):
- for instance in queryset:
- instance.hashtags = Hashtag.hashtags_from_content(instance.content) or None
- instance.save()
- async_to_sync(instance.ensure_hashtags)()
-
@admin.display(description="ActivityPub JSON")
def object_json(self, instance):
return instance.to_ap()
diff --git a/activities/models/emoji.py b/activities/models/emoji.py
index 0e29a47..2946a94 100644
--- a/activities/models/emoji.py
+++ b/activities/models/emoji.py
@@ -1,5 +1,4 @@
import mimetypes
-import re
from functools import partial
from typing import ClassVar
@@ -14,7 +13,7 @@ from django.db import models
from django.utils.safestring import mark_safe
from core.files import get_remote_file
-from core.html import strip_html
+from core.html import FediverseHtmlParser
from core.ld import format_ld_date
from core.models import Config
from core.uploads import upload_emoji_namer
@@ -134,8 +133,6 @@ class Emoji(StatorModel):
admin_disable = "{admin}{self.pk}/disable/"
admin_copy = "{admin}{self.pk}/copy/"
- emoji_regex = re.compile(r"\B:([a-zA-Z0-9(_)-]+):\B")
-
def delete(self, using=None, keep_parents=False):
if self.file:
self.file.delete()
@@ -242,7 +239,9 @@ class Emoji(StatorModel):
Return a parsed and sanitized of emoji found in content without
the surrounding ':'.
"""
- emoji_hits = cls.emoji_regex.findall(strip_html(content))
+ emoji_hits = FediverseHtmlParser(
+ content, find_emojis=True, emoji_domain=domain
+ ).emojis
emojis = sorted({emoji.lower() for emoji in emoji_hits})
return list(
cls.objects.filter(local=(domain is None) or domain.local)
diff --git a/activities/models/hashtag.py b/activities/models/hashtag.py
index 8430fd4..176bdc1 100644
--- a/activities/models/hashtag.py
+++ b/activities/models/hashtag.py
@@ -6,7 +6,6 @@ from asgiref.sync import sync_to_async
from django.db import models
from django.utils import timezone
-from core.html import strip_html
from core.models import Config
from stator.models import State, StateField, StateGraph, StatorModel
@@ -167,16 +166,6 @@ class Hashtag(StatorModel):
results[date(year, month, day)] = val
return dict(sorted(results.items(), reverse=True)[:num])
- @classmethod
- def hashtags_from_content(cls, content) -> list[str]:
- """
- Return a parsed and sanitized of hashtags found in content without
- leading '#'.
- """
- hashtag_hits = cls.hashtag_regex.findall(strip_html(content))
- hashtags = sorted({tag.lower() for tag in hashtag_hits})
- return list(hashtags)
-
def to_mastodon_json(self):
return {
"name": self.hashtag,
diff --git a/activities/models/post.py b/activities/models/post.py
index 8d00f46..88e53af 100644
--- a/activities/models/post.py
+++ b/activities/models/post.py
@@ -2,7 +2,6 @@ import datetime
import hashlib
import json
import mimetypes
-import re
import ssl
from collections.abc import Iterable
from typing import Optional
@@ -26,7 +25,7 @@ from activities.models.post_types import (
PostTypeDataEncoder,
)
from core.exceptions import capture_message
-from core.html import ContentRenderer, strip_html
+from core.html import ContentRenderer, FediverseHtmlParser
from core.ld import (
canonicalise,
format_ld_date,
@@ -374,10 +373,6 @@ class Post(StatorModel):
def clean_type_data(self, value):
PostTypeData.parse_obj(value)
- mention_regex = re.compile(
- r"(^|[^\w\d\-_/])@([\w\d\-_]+(?:@[\w\d\-_\.]+[\w\d\-_]+)?)"
- )
-
def _safe_content_note(self, *, local: bool = True):
return ContentRenderer(local=local).render_post(self.content, self)
@@ -474,12 +469,12 @@ class Post(StatorModel):
# Maintain local-only for replies
if reply_to.visibility == reply_to.Visibilities.local_only:
visibility = reply_to.Visibilities.local_only
- # Find hashtags in this post
- hashtags = Hashtag.hashtags_from_content(content) or None
# Find emoji in this post
emojis = Emoji.emojis_from_content(content, None)
- # Strip all HTML and apply linebreaks filter
- content = linebreaks_filter(strip_html(content))
+ # Strip all unwanted HTML and apply linebreaks filter, grabbing hashtags on the way
+ parser = FediverseHtmlParser(linebreaks_filter(content), find_hashtags=True)
+ content = parser.html
+ hashtags = sorted(parser.hashtags) or None
# Make the Post object
post = cls.objects.create(
author=author,
@@ -512,12 +507,13 @@ class Post(StatorModel):
):
with transaction.atomic():
# Strip all HTML and apply linebreaks filter
- self.content = linebreaks_filter(strip_html(content))
+ parser = FediverseHtmlParser(linebreaks_filter(content))
+ self.content = parser.html
+ self.hashtags = sorted(parser.hashtags) or None
self.summary = summary or None
self.sensitive = bool(summary)
self.visibility = visibility
self.edited = timezone.now()
- self.hashtags = Hashtag.hashtags_from_content(content) or None
self.mentions.set(self.mentions_from_content(content, self.author))
self.emojis.set(Emoji.emojis_from_content(content, None))
self.attachments.set(attachments or [])
@@ -525,9 +521,9 @@ class Post(StatorModel):
@classmethod
def mentions_from_content(cls, content, author) -> set[Identity]:
- mention_hits = cls.mention_regex.findall(content)
+ mention_hits = FediverseHtmlParser(content, find_mentions=True).mentions
mentions = set()
- for precursor, handle in mention_hits:
+ for handle in mention_hits:
handle = handle.lower()
if "@" in handle:
username, domain = handle.split("@", 1)
diff --git a/activities/views/compose.py b/activities/views/compose.py
index c2e3618..96c9e11 100644
--- a/activities/views/compose.py
+++ b/activities/views/compose.py
@@ -14,7 +14,7 @@ from activities.models import (
TimelineEvent,
)
from core.files import blurhash_image, resize_image
-from core.html import html_to_plaintext
+from core.html import FediverseHtmlParser
from core.models import Config
from users.decorators import identity_required
@@ -112,7 +112,7 @@ class Compose(FormView):
{
"reply_to": self.reply_to.pk if self.reply_to else "",
"visibility": self.post_obj.visibility,
- "text": html_to_plaintext(self.post_obj.content),
+ "text": FediverseHtmlParser(self.post_obj.content).plain_text,
"content_warning": self.post_obj.summary,
}
)
diff --git a/core/html.py b/core/html.py
index 5728899..fc790c8 100644
--- a/core/html.py
+++ b/core/html.py
@@ -1,199 +1,309 @@
+import html
import re
-from functools import partial
+from html.parser import HTMLParser
-import bleach
-import bleach.callbacks
-from bleach.html5lib_shim import Filter
-from bleach.linkifier import LinkifyFilter
from django.utils.safestring import mark_safe
-url_regex = re.compile(
- r"""\(* # Match any opening parentheses.
- \b(?"]*)?
+
+class FediverseHtmlParser(HTMLParser):
+ """
+ A custom HTML parser that only allows a certain tag subset and behaviour:
+ - br, p tags are passed through
+ - a tags are passed through if they're not hashtags or mentions
+ - Another set of tags are converted to p
+
+ It also linkifies URLs, mentions, hashtags, and imagifies emoji.
+ """
+
+ REWRITE_TO_P = [
+ "p",
+ "h1",
+ "h2",
+ "h3",
+ "h4",
+ "h5",
+ "h6",
+ "blockquote",
+ "pre",
+ "ul",
+ "ol",
+ ]
+
+ REWRITE_TO_BR = [
+ "br",
+ "li",
+ ]
+
+ MENTION_REGEX = re.compile(
+ r"(^|[^\w\d\-_/])@([\w\d\-_]+(?:@[\w\d\-_\.]+[\w\d\-_]+)?)"
+ )
+
+ HASHTAG_REGEX = re.compile(r"\B#([a-zA-Z0-9(_)]+\b)(?!;)")
+
+ EMOJI_REGEX = re.compile(r"\B:([a-zA-Z0-9(_)-]+):\B")
+
+ URL_REGEX = re.compile(
+ r"""(\(* # Match any opening parentheses.
+ \b(?"]*)?)
# /path/zz (excluding "unsafe" chars from RFC 1738,
# except for # and ~, which happen in practice)
- """,
- re.IGNORECASE | re.VERBOSE | re.UNICODE,
-)
-
-ALLOWED_TAGS = ["br", "p", "a"]
-REWRITTEN_TAGS = [
- "h1",
- "h2",
- "h3",
- "h4",
- "h5",
- "h6",
- "blockquote",
- "pre",
- "ul",
- "ol",
- "li",
-]
-
-
-class MastodonStrictTagFilter(Filter):
- """
- Implements Python equivalent of Mastodon tag rewriter
-
- Clone of https://github.com/mastodon/mastodon/blob/main/lib/sanitize_ext/sanitize_config.rb#L55
-
- Broadly this replaces all REWRITTEN_TAGS with `p` except for lists where it formats it into `
` lists
- """
-
- def __iter__(self):
- li_pending_break = False
- break_token = {
- "name": "br",
- "data": {},
- "type": "StartTag",
- }
-
- for token in Filter.__iter__(self):
- if token.get("name") not in REWRITTEN_TAGS or token["type"] not in [
- "StartTag",
- "EndTag",
- ]:
- yield token
- continue
-
- if token["type"] == "StartTag":
- if token["name"] == "li":
- if li_pending_break:
- # Another `li` appeared, so break after the last one
- yield break_token
- continue
- token["name"] = "p"
- elif token["type"] == "EndTag":
- if token["name"] == "li":
- # Track that an `li` closed so we know a break should be considered
- li_pending_break = True
- continue
- if token["name"] == "ul":
- # If the last `li` happened, then don't add a break because Mastodon doesn't
- li_pending_break = False
- token["name"] = "p"
-
- yield token
-
-
-class UnlinkifyFilter(Filter):
- """
- Forcibly replaces link text with the href.
-
- This is intented to be used when stripping tags to preserve the link
- location at the expense of the link text.
- """
-
- def __iter__(self):
- discarding_a_text = False
- for token in Filter.__iter__(self):
- if token.get("name") == "a":
- if token["type"] == "EndTag":
- discarding_a_text = False
- continue
- href = token["data"].get((None, "href"))
-
- # If has an href, we use it and throw away all content
- # within the .... If href missing or empty, try to find
- # text within the ...
- if href:
- yield {"data": href, "type": "Characters"}
- discarding_a_text = True
- continue
- elif not discarding_a_text:
- yield token
- # else: throw away tokens until we're out of the
-
-
-def allow_a(tag: str, name: str, value: str):
- if name in ["href", "title", "class"]:
- return True
- elif name == "rel":
- # Only allow rel attributes with a small subset of values
- # (we're defending against, for example, rel=me)
- rel_values = value.split()
- if all(v in ["nofollow", "noopener", "noreferrer", "tag"] for v in rel_values):
- return True
- return False
-
-
-def shorten_link_text(attrs, new=False):
- """
- Applies Mastodon's link shortening behavior where URL text links are
- shortened by removing the scheme and only showing the first 30 chars.
-
- Orig:
- https://social.example.com/a-long/path/2023/01/16/that-should-be-shortened
-
- Becomes:
- social.example.com/a-long/path
-
- """
- text = attrs.get("_text")
- if not text:
- text = attrs.get((None, "href"))
- if text and "://" in text and len(text) > 30:
- text = text.split("://", 1)[-1]
- attrs["_text"] = text[:30]
- if len(text) > 30:
- attrs[(None, "class")] = " ".join(
- filter(None, [attrs.pop((None, "class"), ""), "ellipsis"])
- )
- # Add the full URL in to title for easier user inspection
- attrs[(None, "title")] = attrs.get((None, "href"))
-
- return attrs
-
-
-linkify_callbacks = [bleach.callbacks.nofollow, shorten_link_text]
-
-
-def sanitize_html(post_html: str) -> str:
- """
- Only allows a, br, p and span tags, and class attributes.
- """
- cleaner = bleach.Cleaner(
- tags=ALLOWED_TAGS + REWRITTEN_TAGS,
- attributes={ # type:ignore
- "a": allow_a,
- "p": ["class"],
- },
- filters=[
- partial(LinkifyFilter, url_re=url_regex, callbacks=linkify_callbacks),
- MastodonStrictTagFilter,
- ],
- strip=True,
+ """,
+ re.IGNORECASE | re.VERBOSE | re.UNICODE,
)
- return mark_safe(cleaner.clean(post_html))
+ def __init__(
+ self,
+ html: str,
+ uri_domain: str | None = None,
+ mentions: list | None = None,
+ find_mentions: bool = False,
+ find_hashtags: bool = False,
+ find_emojis: bool = False,
+ emoji_domain=None,
+ ):
+ super().__init__()
+ self.uri_domain = uri_domain
+ self.emoji_domain = emoji_domain
+ self.find_mentions = find_mentions
+ self.find_hashtags = find_hashtags
+ self.find_emojis = find_emojis
+ self.calculate_mentions(mentions)
+ self._data_buffer = ""
+ self.html_output = ""
+ self.text_output = ""
+ self.emojis: set[str] = set()
+ self.mentions: set[str] = set()
+ self.hashtags: set[str] = set()
+ self._pending_a: dict | None = None
+ self._fresh_p = False
+ self.feed(html.replace("\n", ""))
+ self.flush_data()
-def strip_html(post_html: str, *, linkify: bool = True) -> str:
- """
- Strips all tags from the text, then linkifies it.
- """
- cleaner = bleach.Cleaner(
- tags=[],
- strip=True,
- filters=[partial(LinkifyFilter, url_re=url_regex, callbacks=linkify_callbacks)]
- if linkify
- else [UnlinkifyFilter],
- )
- return mark_safe(cleaner.clean(post_html))
+ def calculate_mentions(self, mentions: list | None):
+ """
+ Prepares a mapping of the handle forms we expect mentions to take
+ (both short "user" and full "user@domain"), based on the supplied mentions
+ """
+ self.mention_matches: dict[str, str] = {}
+ self.mention_aliases: dict[str, str] = {}
+ for mention in mentions or []:
+ if self.uri_domain:
+ url = mention.absolute_profile_uri()
+ else:
+ url = str(mention.urls.view)
+ if mention.username:
+ username = mention.username.lower()
+ domain = mention.domain_id.lower()
+ self.mention_matches[f"{username}"] = url
+ self.mention_matches[f"{username}@{domain}"] = url
+ def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+ if tag in self.REWRITE_TO_P:
+ self.flush_data()
+ self.html_output += ""
+ elif tag in self.REWRITE_TO_BR:
+ self.flush_data()
+ if not self._fresh_p:
+ self.html_output += "
"
+ self.text_output += "\n"
+ elif tag == "a":
+ self.flush_data()
+ self._pending_a = {"attrs": dict(attrs), "content": ""}
+ self._fresh_p = tag in self.REWRITE_TO_P
-def html_to_plaintext(post_html: str) -> str:
- """
- Tries to do the inverse of the linebreaks filter.
- """
- # TODO: Handle HTML entities
- # Remove all newlines, then replace br with a newline and /p with two (one comes from bleach)
- post_html = post_html.replace("\n", "").replace("
", "\n").replace("
", "\n")
- # Remove all other HTML and return
- cleaner = bleach.Cleaner(tags=["a"], strip=True, filters=[UnlinkifyFilter])
- return cleaner.clean(post_html).strip()
+ def handle_endtag(self, tag: str) -> None:
+ self._fresh_p = False
+ if tag in self.REWRITE_TO_P:
+ self.flush_data()
+ self.html_output += "
"
+ self.text_output += "\n\n"
+ elif tag == "a":
+ if self._pending_a:
+ href = self._pending_a["attrs"].get("href")
+ content = self._pending_a["content"].strip()
+ # Is it a mention?
+ if content.lower().lstrip("@") in self.mention_matches:
+ self.html_output += self.create_mention(content)
+ self.text_output += content
+ # Is it a hashtag?
+ elif self.HASHTAG_REGEX.match(content):
+ self.html_output += self.create_hashtag(content)
+ self.text_output += content
+ elif content:
+ # Shorten the link if we need to
+ self.html_output += self.create_link(href, content)
+ self.text_output += href
+ self._pending_a = None
+
+ def handle_data(self, data: str) -> None:
+ self._fresh_p = False
+ if self._pending_a:
+ self._pending_a["content"] += data
+ else:
+ self._data_buffer += data
+
+ def flush_data(self) -> None:
+ """
+ We collect data segments until we encounter a tag we care about,
+ so we can treat <span>#</span>hashtag the same as a plain #hashtag
+ """
+ self.text_output += self._data_buffer
+ self.html_output += self.linkify(self._data_buffer)
+ self._data_buffer = ""
+
+ def create_link(self, href, content):
+ """
+ Generates a link, doing optional shortening.
+
+ All return values from this function should be HTML-safe.
+ """
+ looks_like_link = bool(self.URL_REGEX.match(content))
+ if looks_like_link:
+ content = content.split("://", 1)[1]
+ if looks_like_link and len(content) > 30:
+ return f'{html.escape(content[:30])}'
+ else:
+ return f'{html.escape(content)}'
+
+ def create_mention(self, handle) -> str:
+ """
+ Generates a mention link. Handle should have a leading @.
+
+ All return values from this function should be HTML-safe
+ """
+ handle = handle.lstrip("@")
+ if "@" in handle:
+ short_handle = handle.split("@", 1)[0]
+ else:
+ short_handle = handle
+ handle_hash = handle.lower()
+ short_hash = short_handle.lower()
+ self.mentions.add(handle_hash)
+ url = self.mention_matches.get(handle_hash)
+ if url:
+ if short_hash not in self.mention_aliases:
+ self.mention_aliases[short_hash] = handle_hash
+ elif self.mention_aliases.get(short_hash) != handle_hash:
+ short_handle = handle
+ return f'@{html.escape(short_handle)}'
+ else:
+ return "@" + html.escape(handle)
+
+ def create_hashtag(self, hashtag) -> str:
+ """
+ Generates a hashtag link. Hashtag does not need to start with #
+
+ All return values from this function should be HTML-safe
+ """
+ hashtag = hashtag.lstrip("#")
+ self.hashtags.add(hashtag.lower())
+ if self.uri_domain:
+ return f'#{hashtag}'
+ else:
+ return f'#{hashtag}'
+
+ def create_emoji(self, shortcode) -> str:
+ """
+ Generates an emoji tag
+
+ All return values from this function should be HTML-safe
+ """
+ from activities.models import Emoji
+
+ emoji = Emoji.get_by_domain(shortcode, self.emoji_domain)
+ if emoji and emoji.is_usable:
+ self.emojis.add(shortcode)
+ return emoji.as_html()
+ return f":{shortcode}:"
+
+ def linkify(self, data):
+ """
+ Linkifies some content that is plaintext.
+
+ Handles URLs first, then mentions. Note that this takes great care to
+ keep track of what is HTML and what needs to be escaped.
+ """
+ # Split the string by the URL regex so we know what to escape and what
+ # not to escape.
+ bits = self.URL_REGEX.split(data)
+ result = ""
+ # Even indices are data we should pass through, odd indices are links
+ for i, bit in enumerate(bits):
+ # A link!
+ if i % 2 == 1:
+ result += self.create_link(bit, bit)
+ # Not a link
+ elif self.mention_matches or self.find_mentions:
+ result += self.linkify_mentions(bit)
+ elif self.find_hashtags:
+ result += self.linkify_hashtags(bit)
+ elif self.find_emojis:
+ result += self.linkify_emoji(bit)
+ else:
+ result += html.escape(bit)
+ return result
+
+ def linkify_mentions(self, data):
+ """
+ Linkifies mentions
+ """
+ bits = self.MENTION_REGEX.split(data)
+ result = ""
+ for i, bit in enumerate(bits):
+ # Mention content
+ if i % 3 == 2:
+ result += self.create_mention(bit)
+ # Not part of a mention (0) or mention preamble (1)
+ elif self.find_hashtags:
+ result += self.linkify_hashtags(bit)
+ elif self.find_emojis:
+ result += self.linkify_emoji(bit)
+ else:
+ result += html.escape(bit)
+ return result
+
+ def linkify_hashtags(self, data):
+ """
+ Linkifies hashtags
+ """
+ bits = self.HASHTAG_REGEX.split(data)
+ result = ""
+ for i, bit in enumerate(bits):
+ # Not part of a hashtag
+ if i % 2 == 0:
+ if self.find_emojis:
+ result += self.linkify_emoji(bit)
+ else:
+ result += html.escape(bit)
+ # Hashtag content
+ else:
+ result += self.create_hashtag(bit)
+ return result
+
+ def linkify_emoji(self, data):
+ """
+ Linkifies emoji
+ """
+ bits = self.EMOJI_REGEX.split(data)
+ result = ""
+ for i, bit in enumerate(bits):
+ # Not part of an emoji
+ if i % 2 == 0:
+ result += html.escape(bit)
+ # Emoji content
+ else:
+ result += self.create_emoji(bit)
+ return result
+
+ @property
+ def html(self):
+ return self.html_output.strip()
+
+ @property
+ def plain_text(self):
+ return self.text_output.strip()
class ContentRenderer:
@@ -212,33 +322,30 @@ class ContentRenderer:
"""
if not html:
return ""
- html = sanitize_html(html)
- html = self.linkify_mentions(html, post=post)
- html = self.linkify_hashtags(html, identity=post.author)
- if self.local:
- html = self.imageify_emojis(
- html,
- identity=post.author,
- emojis=post.emojis.all(),
- )
- html = self.remove_extra_newlines(html)
- return mark_safe(html)
+ parser = FediverseHtmlParser(
+ html,
+ mentions=post.mentions.all(),
+ uri_domain=(None if self.local else post.author.domain.uri_domain),
+ find_hashtags=True,
+ find_emojis=True,
+ emoji_domain=post.author.domain,
+ )
+ return mark_safe(parser.html)
- def render_identity_summary(self, html: str, identity, strip: bool = False) -> str:
+ def render_identity_summary(self, html: str, identity) -> str:
"""
Given identity summary HTML, normalises it and renders it for presentation.
"""
if not html:
return ""
- if strip:
- html = strip_html(html)
- else:
- html = sanitize_html(html)
- html = self.linkify_hashtags(html, identity=identity)
- if self.local:
- html = self.imageify_emojis(html, identity=identity)
- html = self.remove_extra_newlines(html)
- return mark_safe(html)
+ parser = FediverseHtmlParser(
+ html,
+ uri_domain=(None if self.local else identity.domain.uri_domain),
+ find_hashtags=True,
+ find_emojis=True,
+ emoji_domain=identity.domain,
+ )
+ return mark_safe(parser.html)
def render_identity_data(self, html: str, identity, strip: bool = False) -> str:
"""
@@ -246,117 +353,14 @@ class ContentRenderer:
"""
if not html:
return ""
- if strip:
- html = strip_html(html)
- else:
- html = sanitize_html(html)
- if self.local:
- html = self.imageify_emojis(html, identity=identity)
- html = self.remove_extra_newlines(html)
- return mark_safe(html)
-
- def linkify_mentions(self, html: str, post) -> str:
- """
- Links mentions _in the context of the post_ - as in, using the mentions
- property as the only source (as we might be doing this without other
- DB access allowed)
- """
- from activities.models import Post
-
- possible_matches = {}
- for mention in post.mentions.all():
- if self.local:
- url = str(mention.urls.view)
- else:
- url = mention.absolute_profile_uri()
- # Might not have fetched it (yet)
- if mention.username:
- username = mention.username.lower()
- possible_matches[username] = url
- possible_matches[f"{username}@{mention.domain_id}"] = url
-
- collapse_name: dict[str, str] = {}
-
- def replacer(match):
- precursor = match.group(1)
- handle = match.group(2)
- if "@" in handle:
- short_handle = handle.split("@", 1)[0]
- else:
- short_handle = handle
- handle_hash = handle.lower()
- short_hash = short_handle.lower()
- if handle_hash in possible_matches:
- if short_hash not in collapse_name:
- collapse_name[short_hash] = handle_hash
- elif collapse_name.get(short_hash) != handle_hash:
- short_handle = handle
- return f'{precursor}@{short_handle}'
- else:
- return match.group()
-
- return Post.mention_regex.sub(replacer, html)
-
- def linkify_hashtags(self, html, identity) -> str:
- from activities.models import Hashtag
-
- def replacer(attrs, new=False):
- # See if the text in this link looks like a hashtag
- if not Hashtag.hashtag_regex.match(attrs.get("_text", "")):
- return attrs
- hashtag = attrs["_text"].strip().lstrip("#")
- attrs[None, "class"] = "hashtag"
- if (None, "rel") in attrs:
- del attrs[None, "rel"]
- if self.local:
- attrs[None, "href"] = f"/tags/{hashtag.lower()}/"
- else:
- attrs[
- None, "href"
- ] = f"https://{identity.domain.uri_domain}/tags/{hashtag.lower()}/"
- return attrs
-
- linker = bleach.linkifier.Linker(
- url_re=Hashtag.hashtag_regex, callbacks=[replacer]
+ parser = FediverseHtmlParser(
+ html,
+ uri_domain=(None if self.local else identity.domain.uri_domain),
+ find_hashtags=False,
+ find_emojis=True,
+ emoji_domain=identity.domain,
)
- return linker.linkify(html)
-
- def imageify_emojis(
- self, html: str, identity, include_local: bool = True, emojis=None
- ):
- """
- Find :emoji: in content and convert to . If include_local is True,
- the local emoji will be used as a fallback for any shortcodes not defined
- by emojis.
- """
- from activities.models import Emoji
-
- # If precached emojis were passed, prep them
- cached_emojis = {}
- if emojis:
- for emoji in emojis:
- cached_emojis[emoji.shortcode] = emoji
-
- def replacer(match):
- shortcode = match.group(1).lower()
- if shortcode in cached_emojis:
- return cached_emojis[shortcode].as_html()
-
- emoji = Emoji.get_by_domain(shortcode, identity.domain)
- if emoji and emoji.is_usable:
- return emoji.as_html()
- elif not emoji and include_local:
- emoji = Emoji.get_by_domain(shortcode, None)
- if emoji:
- return emoji.as_html()
-
- return match.group()
-
- return Emoji.emoji_regex.sub(replacer, html)
-
- def remove_extra_newlines(self, html: str) -> str:
- """
- Some clients are sensitive to extra newlines even though it's HTML
- """
- # TODO: More intelligent way to strip these?
- return html.replace("\n", "")
+ if strip:
+ return mark_safe(parser.html)
+ else:
+ return mark_safe(parser.html)
diff --git a/requirements.txt b/requirements.txt
index 546a13f..911830f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,3 @@
-bleach~=5.0.1
blurhash-python~=1.1.3
cachetools~=5.2.0
cryptography~=39.0
diff --git a/tests/activities/models/test_hashtag.py b/tests/activities/models/test_hashtag.py
deleted file mode 100644
index 872d72e..0000000
--- a/tests/activities/models/test_hashtag.py
+++ /dev/null
@@ -1,44 +0,0 @@
-from activities.models import Hashtag
-from core.html import ContentRenderer
-
-
-def test_hashtag_from_content():
- assert Hashtag.hashtags_from_content("#hashtag") == ["hashtag"]
- assert Hashtag.hashtags_from_content("a#hashtag") == []
- assert Hashtag.hashtags_from_content("Text #with #hashtag in it") == [
- "hashtag",
- "with",
- ]
- assert Hashtag.hashtags_from_content("#hashtag.") == ["hashtag"]
- assert Hashtag.hashtags_from_content("More text\n#one # two ##three #hashtag!") == [
- "hashtag",
- "one",
- "three",
- ]
- assert Hashtag.hashtags_from_content("my #html loves entities") == ["html"]
- assert Hashtag.hashtags_from_content("#tag") == ["tag"]
-
-
-def test_linkify_hashtag():
- linkify = lambda html: ContentRenderer(local=True).linkify_hashtags(html, None)
-
- assert linkify("# hashtag") == "# hashtag"
- assert (
- linkify('Text')
- == 'Text'
- )
- assert (
- linkify("#HashTag") == '#HashTag'
- )
- assert (
- linkify(
- """A longer text #bigContent
-with #tags, linebreaks, and
-maybe a few links
-#allTheTags #AllTheTags #ALLTHETAGS"""
- )
- == """A longer text #bigContent
-with #tags, linebreaks, and
-maybe a few links
-#allTheTags #AllTheTags #ALLTHETAGS"""
- )
diff --git a/tests/api/test_statuses.py b/tests/api/test_statuses.py
index 1b00642..df576e9 100644
--- a/tests/api/test_statuses.py
+++ b/tests/api/test_statuses.py
@@ -1,5 +1,7 @@
import pytest
+from activities.models import Post
+
@pytest.mark.django_db
def test_post_status(api_token, identity, client):
@@ -15,3 +17,44 @@ def test_post_status(api_token, identity, client):
).json()
assert response["content"] == "Hello, world!
"
assert response["visibility"] == "unlisted"
+
+
+@pytest.mark.django_db
+def test_mention_format(api_token, identity, remote_identity, client):
+ """
+ Ensures mentions work, and only have one link around them.
+ """
+ # Make a local post and check it
+ response = client.post(
+ "/api/v1/statuses",
+ HTTP_AUTHORIZATION=f"Bearer {api_token.token}",
+ HTTP_ACCEPT="application/json",
+ content_type="application/json",
+ data={
+ "status": "Hello, @test!",
+ "visibility": "unlisted",
+ },
+ ).json()
+ assert (
+ response["content"]
+ == 'Hello, @test!
'
+ )
+ assert response["visibility"] == "unlisted"
+
+ # Make a remote post and check it
+ post = Post.objects.create(
+ local=False,
+ author=remote_identity,
+ content='Hey @test
',
+ object_uri="https://remote.test/status/12345",
+ )
+ post.mentions.add(identity)
+ response = client.get(
+ f"/api/v1/statuses/{post.id}",
+ HTTP_AUTHORIZATION=f"Bearer {api_token.token}",
+ HTTP_ACCEPT="application/json",
+ content_type="application/json",
+ ).json()
+ assert (
+ response["text"] == 'Hey @test
'
+ )
diff --git a/tests/core/test_html.py b/tests/core/test_html.py
index 527991a..c076592 100644
--- a/tests/core/test_html.py
+++ b/tests/core/test_html.py
@@ -1,155 +1,117 @@
-from unittest.mock import Mock
-
import pytest
-from core.html import ContentRenderer, html_to_plaintext, sanitize_html
-
-
-def test_html_to_plaintext():
-
- assert html_to_plaintext("Hi!
") == "Hi!"
- assert html_to_plaintext("Hi!
There
") == "Hi!\nThere"
- assert (
- html_to_plaintext("Hi!
\n\nHow are you?
") == "Hi!\n\nHow are you?"
- )
-
- assert (
- html_to_plaintext("Hi!
\n\nHow are
you?
today
")
- == "Hi!\n\nHow are\n you?\n\ntoday"
- )
-
- assert (
- html_to_plaintext(
- ''
- 'The Link '
- 'Empty href '
- "Empty A
"
- )
- == "https://fedi.takahe.social/with/a/long/path Empty href Empty A"
- )
-
-
-def test_sanitize_post():
-
- assert sanitize_html("Hello!
") == "Hello!
"
- assert sanitize_html("It's great
") == "It's great
"
-
- # Note that we only want to linkify things with protocol prefixes to prevent
- # too many false positives.
- assert sanitize_html("test.com
") == "test.com
"
- assert (
- sanitize_html("https://test.com
")
- == 'https://test.com
'
- )
- assert (
- sanitize_html("@someone@subdomain.some-domain.com
")
- == "@someone@subdomain.some-domain.com
"
- )
-
-
-def test_shorten_url():
- full_url = (
- "https://social.example.com/a-long/path/2023/01/16/that-should-be-shortened"
- )
- assert (
- sanitize_html(f"{full_url}
")
- == f'social.example.com/a-long/path
'
- )
-
- assert (
- sanitize_html(
- f'This is a long link text, but cannot be shortened as a URL
'
- )
- == f'This is a long link text, but cannot be shortened as a URL
'
- )
+from core.html import FediverseHtmlParser
@pytest.mark.django_db
-def test_link_preservation():
+def test_parser(identity):
"""
- We want to:
- - Preserve incoming links from other servers
- - Linkify mentions and hashtags
- - Not have these all step on each other!
+ Validates the HtmlParser in its various output modes
"""
- renderer = ContentRenderer(local=True)
- fake_mention = Mock()
- fake_mention.username = "andrew"
- fake_mention.domain_id = "aeracode.org"
- fake_mention.urls.view = "/@andrew@aeracode.org/"
- fake_post = Mock()
- fake_post.mentions.all.return_value = [fake_mention]
- fake_post.author.domain.uri_domain = "example.com"
- fake_post.emojis.all.return_value = []
+ # Basic tag allowance
+    parser = FediverseHtmlParser("<p>Hello!</p>")
+    assert parser.html == "<p>Hello!</p>"
+    assert parser.plain_text == "Hello!"
+
+ # Newline erasure
+    parser = FediverseHtmlParser("<p>Hi!</p>\n\n<p>How are you?</p>")
+    assert parser.html == "<p>Hi!</p><p>How are you?</p>"
+    assert parser.plain_text == "Hi!\n\nHow are you?"
+
+ # Trying to be evil
+ parser = FediverseHtmlParser("pt>")
+ assert "")
+    assert "<script" not in parser.html
+
+    # Entities are escaped
+    parser = FediverseHtmlParser("<p>It&#39;s great</p>", find_hashtags=True)
+    assert parser.html == "<p>It&#x27;s great</p>"
+ assert parser.plain_text == "It's great"
+ assert parser.hashtags == set()
+
+ # Linkify works, but only with protocol prefixes
+    parser = FediverseHtmlParser("<p>test.com</p>")
+    assert parser.html == "<p>test.com</p>"
+    assert parser.plain_text == "test.com"
+ parser = FediverseHtmlParser("https://test.com
")
assert (
- renderer.render_post(
- 'Hello @andrew, I want to link to this #hashtag: here and rewrite #thishashtag',
- fake_post,
- )
- == 'Hello @andrew, I want to link to this #hashtag: here and rewrite #thishashtag'
+ parser.html == 'test.com
'
)
+ assert parser.plain_text == "https://test.com"
-
-@pytest.mark.django_db
-def test_list_rendering():
- """
- We want to:
- - Preserve incoming links from other servers
- - Linkify mentions and hashtags
- - Not have these all step on each other!
- """
- renderer = ContentRenderer(local=True)
- fake_mention = Mock()
- fake_mention.username = "andrew"
- fake_mention.domain_id = "aeracode.org"
- fake_mention.urls.view = "/@andrew@aeracode.org/"
- fake_post = Mock()
- fake_post.mentions.all.return_value = [fake_mention]
- fake_post.author.domain.uri_domain = "example.com"
- fake_post.emojis.all.return_value = []
-
+ # Links are preserved
+ parser = FediverseHtmlParser("takahe social")
assert (
- renderer.render_post(
- "Ok. The roster so far is:
- Infosec.exchange (mastodon)
- pixel.Infosec.exchange (pixelfed)
- video.Infosec.exchange (peertube)
- relay.Infosec.exchange (activitypub relay)
- risky.af (alt mastodon)
What’s next? I think I promised some people here bookwyrm
",
- fake_post,
- )
- == "Ok. The roster so far is:
Infosec.exchange (mastodon)
pixel.Infosec.exchange (pixelfed)
video.Infosec.exchange (peertube)
relay.Infosec.exchange (activitypub relay)
risky.af (alt mastodon)
What’s next? I think I promised some people here bookwyrm
"
+ parser.html
+ == 'takahe social'
+ )
+ assert parser.plain_text == "https://takahe.social"
+
+ # Very long links are shortened
+ full_url = "https://social.example.com/a-long/path/that-should-be-shortened"
+ parser = FediverseHtmlParser(f"{full_url}
")
+ assert (
+ parser.html
+ == f'social.example.com/a-long/path
'
+ )
+ assert (
+ parser.plain_text
+ == "https://social.example.com/a-long/path/that-should-be-shortened"
)
-
-@pytest.mark.django_db
-def test_link_mixcase_mentions():
- renderer = ContentRenderer(local=True)
- fake_mention = Mock()
- fake_mention.username = "Manfre"
- fake_mention.domain_id = "manfre.net"
- fake_mention.urls.view = "/@Manfre@manfre.net/"
- fake_mention2 = Mock()
- fake_mention2.username = "manfre"
- fake_mention2.domain_id = "takahe.social"
- fake_mention2.urls.view = "https://takahe.social/@manfre@takahe.social/"
-
- unfetched_mention = Mock()
- unfetched_mention.username = None
- unfetched_mention.domain_id = None
- unfetched_mention.urls.view = "/None@None/"
-
- fake_post = Mock()
- fake_post.mentions.all.return_value = [
- fake_mention,
- fake_mention2,
- unfetched_mention,
- ]
- fake_post.author.domain.uri_domain = "example.com"
- fake_post.emojis.all.return_value = []
-
- assert renderer.render_post(
- "@Manfre@manfre.net @mAnFrE@takahe.social @manfre@manfre.net @unfetched@manfre.net",
- fake_post,
- ) == (
- '@Manfre '
- '@mAnFrE@takahe.social '
- '@manfre '
- "@unfetched@manfre.net"
+ # Make sure things that look like mentions are left alone with no mentions supplied.
+    parser = FediverseHtmlParser(
+        "<p>@test@example.com</p>",
+        find_mentions=True,
+        find_hashtags=True,
+        find_emojis=True,
+    )
+    assert parser.html == "<p>@test@example.com</p>"
+    assert parser.plain_text == "@test@example.com"
+    assert parser.mentions == {"test@example.com"}
+
+ # Make sure mentions work when there is a mention supplied
+ parser = FediverseHtmlParser(
+ "@test@example.com
",
+ mentions=[identity],
+ find_hashtags=True,
+ find_emojis=True,
+ )
+ assert parser.html == '@test
'
+ assert parser.plain_text == "@test@example.com"
+ assert parser.mentions == {"test@example.com"}
+
+ # Ensure mentions are case insensitive
+ parser = FediverseHtmlParser(
+ "@TeSt@ExamPle.com
",
+ mentions=[identity],
+ find_hashtags=True,
+ find_emojis=True,
+ )
+ assert parser.html == '@TeSt
'
+ assert parser.plain_text == "@TeSt@ExamPle.com"
+ assert parser.mentions == {"test@example.com"}
+
+ # Ensure hashtags are linked, even through spans, but not within hrefs
+ parser = FediverseHtmlParser(
+ 'something #hashtag #hashtagtwo',
+ find_hashtags=True,
+ find_emojis=True,
+ )
+ assert (
+ parser.html
+ == 'something #hashtag #hashtagtwo'
+ )
+ assert parser.plain_text == "http://example.com#notahashtag #hashtag #hashtagtwo"
+ assert parser.hashtags == {"hashtag", "hashtagtwo"}
+
+ # Ensure lists are rendered reasonably
+ parser = FediverseHtmlParser(
+ "List:
End!
",
+ find_hashtags=True,
+ find_emojis=True,
+ )
+ assert parser.html == "List:
One
Two
Three
End!
"
+ assert parser.plain_text == "List:\n\nOne\nTwo\nThree\n\nEnd!"
diff --git a/users/models/identity.py b/users/models/identity.py
index 754fe77..3050b06 100644
--- a/users/models/identity.py
+++ b/users/models/identity.py
@@ -13,7 +13,7 @@ from django.utils.functional import lazy
from lxml import etree
from core.exceptions import ActorMismatchError, capture_message
-from core.html import ContentRenderer, html_to_plaintext, strip_html
+from core.html import ContentRenderer, FediverseHtmlParser
from core.ld import (
canonicalise,
format_ld_date,
@@ -530,8 +530,8 @@ class Identity(StatorModel):
response["attachment"] = [
{
"type": "http://schema.org#PropertyValue",
- "name": strip_html(item["name"], linkify=False),
- "value": strip_html(item["value"]),
+ "name": FediverseHtmlParser(item["name"]).plain_text,
+ "value": FediverseHtmlParser(item["value"]).html,
}
for item in self.metadata
]
@@ -781,7 +781,9 @@ class Identity(StatorModel):
self.metadata.append(
{
"name": attachment.get("name"),
- "value": strip_html(attachment.get("http://schema.org#value")),
+ "value": FediverseHtmlParser(
+ attachment.get("http://schema.org#value")
+ ).html,
}
)
# Now go do webfinger with that info to see if we can get a canonical domain
@@ -903,12 +905,14 @@ class Identity(StatorModel):
Post.Visibilities.mentioned: "direct",
}
result["source"] = {
- "note": html_to_plaintext(self.summary) if self.summary else "",
+ "note": FediverseHtmlParser(self.summary).plain_text
+ if self.summary
+ else "",
"fields": (
[
{
"name": m["name"],
- "value": strip_html(m["value"], linkify=False),
+ "value": FediverseHtmlParser(m["value"]).plain_text,
"verified_at": None,
}
for m in self.metadata
diff --git a/users/services/identity.py b/users/services/identity.py
index a653ebe..9560982 100644
--- a/users/services/identity.py
+++ b/users/services/identity.py
@@ -3,7 +3,7 @@ from django.template.defaultfilters import linebreaks_filter
from activities.models import FanOut
from core.files import resize_image
-from core.html import strip_html
+from core.html import FediverseHtmlParser
from users.models import (
Block,
BlockStates,
@@ -211,7 +211,7 @@ class IdentityService:
Safely sets a summary and turns linebreaks into HTML
"""
if summary:
- self.identity.summary = linebreaks_filter(strip_html(summary))
+ self.identity.summary = FediverseHtmlParser(linebreaks_filter(summary)).html
else:
self.identity.summary = None
self.identity.save()
diff --git a/users/views/settings/profile.py b/users/views/settings/profile.py
index 7a2e957..bcbea8a 100644
--- a/users/views/settings/profile.py
+++ b/users/views/settings/profile.py
@@ -4,7 +4,7 @@ from django.shortcuts import redirect
from django.utils.decorators import method_decorator
from django.views.generic import FormView
-from core.html import html_to_plaintext
+from core.html import FediverseHtmlParser
from core.models.config import Config
from users.decorators import identity_required
from users.models import IdentityStates
@@ -65,7 +65,11 @@ class ProfilePage(FormView):
identity = self.request.identity
return {
"name": identity.name,
- "summary": html_to_plaintext(identity.summary) if identity.summary else "",
+ "summary": (
+ FediverseHtmlParser(identity.summary).plain_text
+ if identity.summary
+ else ""
+ ),
"icon": identity.icon and identity.icon.url,
"image": identity.image and identity.image.url,
"discoverable": identity.discoverable,