From a6922cb9d6b98f958a230636c489fd9f4c96bd5b Mon Sep 17 00:00:00 2001 From: Andrew Godwin Date: Sun, 29 Jan 2023 17:46:22 -0700 Subject: [PATCH] Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. --- .pre-commit-config.yaml | 8 +- activities/admin.py | 9 - activities/models/emoji.py | 9 +- activities/models/hashtag.py | 11 - activities/models/post.py | 24 +- activities/views/compose.py | 4 +- core/html.py | 642 ++++++++++++------------ requirements.txt | 1 - tests/activities/models/test_hashtag.py | 44 -- tests/api/test_statuses.py | 43 ++ tests/core/test_html.py | 242 ++++----- users/models/identity.py | 16 +- users/services/identity.py | 4 +- users/views/settings/profile.py | 8 +- 14 files changed, 503 insertions(+), 562 deletions(-) delete mode 100644 tests/activities/models/test_hashtag.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b2392b9..9fc237b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -48,13 +48,7 @@ repos: - id: mypy exclude: "^tests/" additional_dependencies: - [ - types-pyopenssl, - types-bleach, - types-mock, - types-cachetools, - types-python-dateutil, - ] + [types-pyopenssl, types-mock, types-cachetools, types-python-dateutil] - repo: https://github.com/rtts/djhtml rev: v1.5.2 diff --git a/activities/admin.py b/activities/admin.py index f3444ed..24ef6e5 100644 --- a/activities/admin.py +++ b/activities/admin.py @@ -1,4 +1,3 @@ -from asgiref.sync import async_to_sync from django.contrib import admin from django.db import models from django.utils.safestring import mark_safe @@ -165,7 +164,6 @@ class PostAdmin(admin.ModelAdmin): list_filter = ("type", "local", "visibility", "state", "created") raw_id_fields = ["emojis"] autocomplete_fields = ["to", "mentions", "author"] - actions = ["reparse_hashtags"] search_fields = ["content", "search_handle", "search_service_handle"] inlines = [PostAttachmentInline] readonly_fields = ["created", "updated", "state_changed", "object_json"] @@ -183,13 +181,6 @@ class PostAdmin(admin.ModelAdmin): ) return super().get_search_results(request, queryset, search_term) - @admin.action(description="Reprocess content for hashtags") - def reparse_hashtags(self, request, queryset): - for instance in queryset: - instance.hashtags = Hashtag.hashtags_from_content(instance.content) or None - instance.save() - async_to_sync(instance.ensure_hashtags)() - @admin.display(description="ActivityPub JSON") def object_json(self, instance): return instance.to_ap() diff --git a/activities/models/emoji.py b/activities/models/emoji.py index 0e29a47..2946a94 100644 --- a/activities/models/emoji.py +++ b/activities/models/emoji.py @@ -1,5 +1,4 @@ import mimetypes -import re from functools import partial from typing import ClassVar @@ -14,7 +13,7 @@ from django.db import models from django.utils.safestring import mark_safe from core.files import get_remote_file -from core.html import strip_html +from core.html import FediverseHtmlParser from core.ld import format_ld_date from core.models import Config from core.uploads import upload_emoji_namer @@ -134,8 +133,6 @@ class Emoji(StatorModel): admin_disable = "{admin}{self.pk}/disable/" admin_copy = "{admin}{self.pk}/copy/" - emoji_regex = re.compile(r"\B:([a-zA-Z0-9(_)-]+):\B") - def delete(self, using=None, keep_parents=False): if self.file: self.file.delete() @@ -242,7 +239,9 @@ class Emoji(StatorModel): Return a parsed and sanitized of emoji found in content without the surrounding ':'. """ - emoji_hits = cls.emoji_regex.findall(strip_html(content)) + emoji_hits = FediverseHtmlParser( + content, find_emojis=True, emoji_domain=domain + ).emojis emojis = sorted({emoji.lower() for emoji in emoji_hits}) return list( cls.objects.filter(local=(domain is None) or domain.local) diff --git a/activities/models/hashtag.py b/activities/models/hashtag.py index 8430fd4..176bdc1 100644 --- a/activities/models/hashtag.py +++ b/activities/models/hashtag.py @@ -6,7 +6,6 @@ from asgiref.sync import sync_to_async from django.db import models from django.utils import timezone -from core.html import strip_html from core.models import Config from stator.models import State, StateField, StateGraph, StatorModel @@ -167,16 +166,6 @@ class Hashtag(StatorModel): results[date(year, month, day)] = val return dict(sorted(results.items(), reverse=True)[:num]) - @classmethod - def hashtags_from_content(cls, content) -> list[str]: - """ - Return a parsed and sanitized of hashtags found in content without - leading '#'. - """ - hashtag_hits = cls.hashtag_regex.findall(strip_html(content)) - hashtags = sorted({tag.lower() for tag in hashtag_hits}) - return list(hashtags) - def to_mastodon_json(self): return { "name": self.hashtag, diff --git a/activities/models/post.py b/activities/models/post.py index 8d00f46..88e53af 100644 --- a/activities/models/post.py +++ b/activities/models/post.py @@ -2,7 +2,6 @@ import datetime import hashlib import json import mimetypes -import re import ssl from collections.abc import Iterable from typing import Optional @@ -26,7 +25,7 @@ from activities.models.post_types import ( PostTypeDataEncoder, ) from core.exceptions import capture_message -from core.html import ContentRenderer, strip_html +from core.html import ContentRenderer, FediverseHtmlParser from core.ld import ( canonicalise, format_ld_date, @@ -374,10 +373,6 @@ class Post(StatorModel): def clean_type_data(self, value): PostTypeData.parse_obj(value) - mention_regex = re.compile( - r"(^|[^\w\d\-_/])@([\w\d\-_]+(?:@[\w\d\-_\.]+[\w\d\-_]+)?)" - ) - def _safe_content_note(self, *, local: bool = True): return ContentRenderer(local=local).render_post(self.content, self) @@ -474,12 +469,12 @@ class Post(StatorModel): # Maintain local-only for replies if reply_to.visibility == reply_to.Visibilities.local_only: visibility = reply_to.Visibilities.local_only - # Find hashtags in this post - hashtags = Hashtag.hashtags_from_content(content) or None # Find emoji in this post emojis = Emoji.emojis_from_content(content, None) - # Strip all HTML and apply linebreaks filter - content = linebreaks_filter(strip_html(content)) + # Strip all unwanted HTML and apply linebreaks filter, grabbing hashtags on the way + parser = FediverseHtmlParser(linebreaks_filter(content), find_hashtags=True) + content = parser.html + hashtags = sorted(parser.hashtags) or None # Make the Post object post = cls.objects.create( author=author, @@ -512,12 +507,13 @@ class Post(StatorModel): ): with transaction.atomic(): # Strip all HTML and apply linebreaks filter - self.content = linebreaks_filter(strip_html(content)) + parser = FediverseHtmlParser(linebreaks_filter(content)) + self.content = parser.html + self.hashtags = sorted(parser.hashtags) or None self.summary = summary or None self.sensitive = bool(summary) self.visibility = visibility self.edited = timezone.now() - self.hashtags = Hashtag.hashtags_from_content(content) or None self.mentions.set(self.mentions_from_content(content, self.author)) self.emojis.set(Emoji.emojis_from_content(content, None)) self.attachments.set(attachments or []) @@ -525,9 +521,9 @@ class Post(StatorModel): @classmethod def mentions_from_content(cls, content, author) -> set[Identity]: - mention_hits = cls.mention_regex.findall(content) + mention_hits = FediverseHtmlParser(content, find_mentions=True).mentions mentions = set() - for precursor, handle in mention_hits: + for handle in mention_hits: handle = handle.lower() if "@" in handle: username, domain = handle.split("@", 1) diff --git a/activities/views/compose.py b/activities/views/compose.py index c2e3618..96c9e11 100644 --- a/activities/views/compose.py +++ b/activities/views/compose.py @@ -14,7 +14,7 @@ from activities.models import ( TimelineEvent, ) from core.files import blurhash_image, resize_image -from core.html import html_to_plaintext +from core.html import FediverseHtmlParser from core.models import Config from users.decorators import identity_required @@ -112,7 +112,7 @@ class Compose(FormView): { "reply_to": self.reply_to.pk if self.reply_to else "", "visibility": self.post_obj.visibility, - "text": html_to_plaintext(self.post_obj.content), + "text": FediverseHtmlParser(self.post_obj.content).plain_text, "content_warning": self.post_obj.summary, } ) diff --git a/core/html.py b/core/html.py index 5728899..fc790c8 100644 --- a/core/html.py +++ b/core/html.py @@ -1,199 +1,309 @@ +import html import re -from functools import partial +from html.parser import HTMLParser -import bleach -import bleach.callbacks -from bleach.html5lib_shim import Filter -from bleach.linkifier import LinkifyFilter from django.utils.safestring import mark_safe -url_regex = re.compile( - r"""\(* # Match any opening parentheses. - \b(?"]*)? + +class FediverseHtmlParser(HTMLParser): + """ + A custom HTML parser that only allows a certain tag subset and behaviour: + - br, p tags are passed through + - a tags are passed through if they're not hashtags or mentions + - Another set of tags are converted to p + + It also linkifies URLs, mentions, hashtags, and imagifies emoji. + """ + + REWRITE_TO_P = [ + "p", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "blockquote", + "pre", + "ul", + "ol", + ] + + REWRITE_TO_BR = [ + "br", + "li", + ] + + MENTION_REGEX = re.compile( + r"(^|[^\w\d\-_/])@([\w\d\-_]+(?:@[\w\d\-_\.]+[\w\d\-_]+)?)" + ) + + HASHTAG_REGEX = re.compile(r"\B#([a-zA-Z0-9(_)]+\b)(?!;)") + + EMOJI_REGEX = re.compile(r"\B:([a-zA-Z0-9(_)-]+):\B") + + URL_REGEX = re.compile( + r"""(\(* # Match any opening parentheses. + \b(?"]*)?) # /path/zz (excluding "unsafe" chars from RFC 1738, # except for # and ~, which happen in practice) - """, - re.IGNORECASE | re.VERBOSE | re.UNICODE, -) - -ALLOWED_TAGS = ["br", "p", "a"] -REWRITTEN_TAGS = [ - "h1", - "h2", - "h3", - "h4", - "h5", - "h6", - "blockquote", - "pre", - "ul", - "ol", - "li", -] - - -class MastodonStrictTagFilter(Filter): - """ - Implements Python equivalent of Mastodon tag rewriter - - Clone of https://github.com/mastodon/mastodon/blob/main/lib/sanitize_ext/sanitize_config.rb#L55 - - Broadly this replaces all REWRITTEN_TAGS with `p` except for lists where it formats it into `
` lists - """ - - def __iter__(self): - li_pending_break = False - break_token = { - "name": "br", - "data": {}, - "type": "StartTag", - } - - for token in Filter.__iter__(self): - if token.get("name") not in REWRITTEN_TAGS or token["type"] not in [ - "StartTag", - "EndTag", - ]: - yield token - continue - - if token["type"] == "StartTag": - if token["name"] == "li": - if li_pending_break: - # Another `li` appeared, so break after the last one - yield break_token - continue - token["name"] = "p" - elif token["type"] == "EndTag": - if token["name"] == "li": - # Track that an `li` closed so we know a break should be considered - li_pending_break = True - continue - if token["name"] == "ul": - # If the last `li` happened, then don't add a break because Mastodon doesn't - li_pending_break = False - token["name"] = "p" - - yield token - - -class UnlinkifyFilter(Filter): - """ - Forcibly replaces link text with the href. - - This is intented to be used when stripping tags to preserve the link - location at the expense of the link text. - """ - - def __iter__(self): - discarding_a_text = False - for token in Filter.__iter__(self): - if token.get("name") == "a": - if token["type"] == "EndTag": - discarding_a_text = False - continue - href = token["data"].get((None, "href")) - - # If has an href, we use it and throw away all content - # within the .... If href missing or empty, try to find - # text within the ... - if href: - yield {"data": href, "type": "Characters"} - discarding_a_text = True - continue - elif not discarding_a_text: - yield token - # else: throw away tokens until we're out of the - - -def allow_a(tag: str, name: str, value: str): - if name in ["href", "title", "class"]: - return True - elif name == "rel": - # Only allow rel attributes with a small subset of values - # (we're defending against, for example, rel=me) - rel_values = value.split() - if all(v in ["nofollow", "noopener", "noreferrer", "tag"] for v in rel_values): - return True - return False - - -def shorten_link_text(attrs, new=False): - """ - Applies Mastodon's link shortening behavior where URL text links are - shortened by removing the scheme and only showing the first 30 chars. - - Orig: - https://social.example.com/a-long/path/2023/01/16/that-should-be-shortened - - Becomes: - social.example.com/a-long/path - - """ - text = attrs.get("_text") - if not text: - text = attrs.get((None, "href")) - if text and "://" in text and len(text) > 30: - text = text.split("://", 1)[-1] - attrs["_text"] = text[:30] - if len(text) > 30: - attrs[(None, "class")] = " ".join( - filter(None, [attrs.pop((None, "class"), ""), "ellipsis"]) - ) - # Add the full URL in to title for easier user inspection - attrs[(None, "title")] = attrs.get((None, "href")) - - return attrs - - -linkify_callbacks = [bleach.callbacks.nofollow, shorten_link_text] - - -def sanitize_html(post_html: str) -> str: - """ - Only allows a, br, p and span tags, and class attributes. - """ - cleaner = bleach.Cleaner( - tags=ALLOWED_TAGS + REWRITTEN_TAGS, - attributes={ # type:ignore - "a": allow_a, - "p": ["class"], - }, - filters=[ - partial(LinkifyFilter, url_re=url_regex, callbacks=linkify_callbacks), - MastodonStrictTagFilter, - ], - strip=True, + """, + re.IGNORECASE | re.VERBOSE | re.UNICODE, ) - return mark_safe(cleaner.clean(post_html)) + def __init__( + self, + html: str, + uri_domain: str | None = None, + mentions: list | None = None, + find_mentions: bool = False, + find_hashtags: bool = False, + find_emojis: bool = False, + emoji_domain=None, + ): + super().__init__() + self.uri_domain = uri_domain + self.emoji_domain = emoji_domain + self.find_mentions = find_mentions + self.find_hashtags = find_hashtags + self.find_emojis = find_emojis + self.calculate_mentions(mentions) + self._data_buffer = "" + self.html_output = "" + self.text_output = "" + self.emojis: set[str] = set() + self.mentions: set[str] = set() + self.hashtags: set[str] = set() + self._pending_a: dict | None = None + self._fresh_p = False + self.feed(html.replace("\n", "")) + self.flush_data() -def strip_html(post_html: str, *, linkify: bool = True) -> str: - """ - Strips all tags from the text, then linkifies it. - """ - cleaner = bleach.Cleaner( - tags=[], - strip=True, - filters=[partial(LinkifyFilter, url_re=url_regex, callbacks=linkify_callbacks)] - if linkify - else [UnlinkifyFilter], - ) - return mark_safe(cleaner.clean(post_html)) + def calculate_mentions(self, mentions: list | None): + """ + Prepares a set of content that we expect to see mentions look like + (this imp) + """ + self.mention_matches: dict[str, str] = {} + self.mention_aliases: dict[str, str] = {} + for mention in mentions or []: + if self.uri_domain: + url = mention.absolute_profile_uri() + else: + url = str(mention.urls.view) + if mention.username: + username = mention.username.lower() + domain = mention.domain_id.lower() + self.mention_matches[f"{username}"] = url + self.mention_matches[f"{username}@{domain}"] = url + def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: + if tag in self.REWRITE_TO_P: + self.flush_data() + self.html_output += "

" + elif tag in self.REWRITE_TO_BR: + self.flush_data() + if not self._fresh_p: + self.html_output += "
" + self.text_output += "\n" + elif tag == "a": + self.flush_data() + self._pending_a = {"attrs": dict(attrs), "content": ""} + self._fresh_p = tag in self.REWRITE_TO_P -def html_to_plaintext(post_html: str) -> str: - """ - Tries to do the inverse of the linebreaks filter. - """ - # TODO: Handle HTML entities - # Remove all newlines, then replace br with a newline and /p with two (one comes from bleach) - post_html = post_html.replace("\n", "").replace("
", "\n").replace("

", "\n") - # Remove all other HTML and return - cleaner = bleach.Cleaner(tags=["a"], strip=True, filters=[UnlinkifyFilter]) - return cleaner.clean(post_html).strip() + def handle_endtag(self, tag: str) -> None: + self._fresh_p = False + if tag in self.REWRITE_TO_P: + self.flush_data() + self.html_output += "

" + self.text_output += "\n\n" + elif tag == "a": + if self._pending_a: + href = self._pending_a["attrs"].get("href") + content = self._pending_a["content"].strip() + # Is it a mention? + if content.lower().lstrip("@") in self.mention_matches: + self.html_output += self.create_mention(content) + self.text_output += content + # Is it a hashtag? + elif self.HASHTAG_REGEX.match(content): + self.html_output += self.create_hashtag(content) + self.text_output += content + elif content: + # Shorten the link if we need to + self.html_output += self.create_link(href, content) + self.text_output += href + self._pending_a = None + + def handle_data(self, data: str) -> None: + self._fresh_p = False + if self._pending_a: + self._pending_a["content"] += data + else: + self._data_buffer += data + + def flush_data(self) -> None: + """ + We collect data segments until we encounter a tag we care about, + so we can treat #hashtag as #hashtag + """ + self.text_output += self._data_buffer + self.html_output += self.linkify(self._data_buffer) + self._data_buffer = "" + + def create_link(self, href, content): + """ + Generates a link, doing optional shortening. + + All return values from this function should be HTML-safe. + """ + looks_like_link = bool(self.URL_REGEX.match(content)) + if looks_like_link: + content = content.split("://", 1)[1] + if looks_like_link and len(content) > 30: + return f'{html.escape(content[:30])}' + else: + return f'{html.escape(content)}' + + def create_mention(self, handle) -> str: + """ + Generates a mention link. Handle should have a leading @. + + All return values from this function should be HTML-safe + """ + handle = handle.lstrip("@") + if "@" in handle: + short_handle = handle.split("@", 1)[0] + else: + short_handle = handle + handle_hash = handle.lower() + short_hash = short_handle.lower() + self.mentions.add(handle_hash) + url = self.mention_matches.get(handle_hash) + if url: + if short_hash not in self.mention_aliases: + self.mention_aliases[short_hash] = handle_hash + elif self.mention_aliases.get(short_hash) != handle_hash: + short_handle = handle + return f'@{html.escape(short_handle)}' + else: + return "@" + html.escape(handle) + + def create_hashtag(self, hashtag) -> str: + """ + Generates a hashtag link. Hashtag does not need to start with # + + All return values from this function should be HTML-safe + """ + hashtag = hashtag.lstrip("#") + self.hashtags.add(hashtag.lower()) + if self.uri_domain: + return f'#{hashtag}' + else: + return f'#{hashtag}' + + def create_emoji(self, shortcode) -> str: + """ + Generates an emoji tag + + All return values from this function should be HTML-safe + """ + from activities.models import Emoji + + emoji = Emoji.get_by_domain(shortcode, self.emoji_domain) + if emoji and emoji.is_usable: + self.emojis.add(shortcode) + return emoji.as_html() + return f":{shortcode}:" + + def linkify(self, data): + """ + Linkifies some content that is plaintext. + + Handles URLs first, then mentions. Note that this takes great care to + keep track of what is HTML and what needs to be escaped. + """ + # Split the string by the URL regex so we know what to escape and what + # not to escape. + bits = self.URL_REGEX.split(data) + result = "" + # Even indices are data we should pass though, odd indices are links + for i, bit in enumerate(bits): + # A link! + if i % 2 == 1: + result += self.create_link(bit, bit) + # Not a link + elif self.mention_matches or self.find_mentions: + result += self.linkify_mentions(bit) + elif self.find_hashtags: + result += self.linkify_hashtags(bit) + elif self.find_emojis: + result += self.linkify_emoji(bit) + else: + result += html.escape(bit) + return result + + def linkify_mentions(self, data): + """ + Linkifies mentions + """ + bits = self.MENTION_REGEX.split(data) + result = "" + for i, bit in enumerate(bits): + # Mention content + if i % 3 == 2: + result += self.create_mention(bit) + # Not part of a mention (0) or mention preamble (1) + elif self.find_hashtags: + result += self.linkify_hashtags(bit) + elif self.find_emojis: + result += self.linkify_emoji(bit) + else: + result += html.escape(bit) + return result + + def linkify_hashtags(self, data): + """ + Linkifies hashtags + """ + bits = self.HASHTAG_REGEX.split(data) + result = "" + for i, bit in enumerate(bits): + # Not part of a hashtag + if i % 2 == 0: + if self.find_emojis: + result += self.linkify_emoji(bit) + else: + result += html.escape(bit) + # Hashtag content + else: + result += self.create_hashtag(bit) + return result + + def linkify_emoji(self, data): + """ + Linkifies emoji + """ + bits = self.EMOJI_REGEX.split(data) + result = "" + for i, bit in enumerate(bits): + # Not part of an emoji + if i % 2 == 0: + result += html.escape(bit) + # Emoji content + else: + result += self.create_emoji(bit) + return result + + @property + def html(self): + return self.html_output.strip() + + @property + def plain_text(self): + return self.text_output.strip() class ContentRenderer: @@ -212,33 +322,30 @@ class ContentRenderer: """ if not html: return "" - html = sanitize_html(html) - html = self.linkify_mentions(html, post=post) - html = self.linkify_hashtags(html, identity=post.author) - if self.local: - html = self.imageify_emojis( - html, - identity=post.author, - emojis=post.emojis.all(), - ) - html = self.remove_extra_newlines(html) - return mark_safe(html) + parser = FediverseHtmlParser( + html, + mentions=post.mentions.all(), + uri_domain=(None if self.local else post.author.domain.uri_domain), + find_hashtags=True, + find_emojis=True, + emoji_domain=post.author.domain, + ) + return mark_safe(parser.html) - def render_identity_summary(self, html: str, identity, strip: bool = False) -> str: + def render_identity_summary(self, html: str, identity) -> str: """ Given identity summary HTML, normalises it and renders it for presentation. """ if not html: return "" - if strip: - html = strip_html(html) - else: - html = sanitize_html(html) - html = self.linkify_hashtags(html, identity=identity) - if self.local: - html = self.imageify_emojis(html, identity=identity) - html = self.remove_extra_newlines(html) - return mark_safe(html) + parser = FediverseHtmlParser( + html, + uri_domain=(None if self.local else identity.domain.uri_domain), + find_hashtags=True, + find_emojis=True, + emoji_domain=identity.domain, + ) + return mark_safe(parser.html) def render_identity_data(self, html: str, identity, strip: bool = False) -> str: """ @@ -246,117 +353,14 @@ class ContentRenderer: """ if not html: return "" - if strip: - html = strip_html(html) - else: - html = sanitize_html(html) - if self.local: - html = self.imageify_emojis(html, identity=identity) - html = self.remove_extra_newlines(html) - return mark_safe(html) - - def linkify_mentions(self, html: str, post) -> str: - """ - Links mentions _in the context of the post_ - as in, using the mentions - property as the only source (as we might be doing this without other - DB access allowed) - """ - from activities.models import Post - - possible_matches = {} - for mention in post.mentions.all(): - if self.local: - url = str(mention.urls.view) - else: - url = mention.absolute_profile_uri() - # Might not have fetched it (yet) - if mention.username: - username = mention.username.lower() - possible_matches[username] = url - possible_matches[f"{username}@{mention.domain_id}"] = url - - collapse_name: dict[str, str] = {} - - def replacer(match): - precursor = match.group(1) - handle = match.group(2) - if "@" in handle: - short_handle = handle.split("@", 1)[0] - else: - short_handle = handle - handle_hash = handle.lower() - short_hash = short_handle.lower() - if handle_hash in possible_matches: - if short_hash not in collapse_name: - collapse_name[short_hash] = handle_hash - elif collapse_name.get(short_hash) != handle_hash: - short_handle = handle - return f'{precursor}@{short_handle}' - else: - return match.group() - - return Post.mention_regex.sub(replacer, html) - - def linkify_hashtags(self, html, identity) -> str: - from activities.models import Hashtag - - def replacer(attrs, new=False): - # See if the text in this link looks like a hashtag - if not Hashtag.hashtag_regex.match(attrs.get("_text", "")): - return attrs - hashtag = attrs["_text"].strip().lstrip("#") - attrs[None, "class"] = "hashtag" - if (None, "rel") in attrs: - del attrs[None, "rel"] - if self.local: - attrs[None, "href"] = f"/tags/{hashtag.lower()}/" - else: - attrs[ - None, "href" - ] = f"https://{identity.domain.uri_domain}/tags/{hashtag.lower()}/" - return attrs - - linker = bleach.linkifier.Linker( - url_re=Hashtag.hashtag_regex, callbacks=[replacer] + parser = FediverseHtmlParser( + html, + uri_domain=(None if self.local else identity.domain.uri_domain), + find_hashtags=False, + find_emojis=True, + emoji_domain=identity.domain, ) - return linker.linkify(html) - - def imageify_emojis( - self, html: str, identity, include_local: bool = True, emojis=None - ): - """ - Find :emoji: in content and convert to . If include_local is True, - the local emoji will be used as a fallback for any shortcodes not defined - by emojis. - """ - from activities.models import Emoji - - # If precached emojis were passed, prep them - cached_emojis = {} - if emojis: - for emoji in emojis: - cached_emojis[emoji.shortcode] = emoji - - def replacer(match): - shortcode = match.group(1).lower() - if shortcode in cached_emojis: - return cached_emojis[shortcode].as_html() - - emoji = Emoji.get_by_domain(shortcode, identity.domain) - if emoji and emoji.is_usable: - return emoji.as_html() - elif not emoji and include_local: - emoji = Emoji.get_by_domain(shortcode, None) - if emoji: - return emoji.as_html() - - return match.group() - - return Emoji.emoji_regex.sub(replacer, html) - - def remove_extra_newlines(self, html: str) -> str: - """ - Some clients are sensitive to extra newlines even though it's HTML - """ - # TODO: More intelligent way to strip these? - return html.replace("\n", "") + if strip: + return mark_safe(parser.html) + else: + return mark_safe(parser.html) diff --git a/requirements.txt b/requirements.txt index 546a13f..911830f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -bleach~=5.0.1 blurhash-python~=1.1.3 cachetools~=5.2.0 cryptography~=39.0 diff --git a/tests/activities/models/test_hashtag.py b/tests/activities/models/test_hashtag.py deleted file mode 100644 index 872d72e..0000000 --- a/tests/activities/models/test_hashtag.py +++ /dev/null @@ -1,44 +0,0 @@ -from activities.models import Hashtag -from core.html import ContentRenderer - - -def test_hashtag_from_content(): - assert Hashtag.hashtags_from_content("#hashtag") == ["hashtag"] - assert Hashtag.hashtags_from_content("a#hashtag") == [] - assert Hashtag.hashtags_from_content("Text #with #hashtag in it") == [ - "hashtag", - "with", - ] - assert Hashtag.hashtags_from_content("#hashtag.") == ["hashtag"] - assert Hashtag.hashtags_from_content("More text\n#one # two ##three #hashtag!") == [ - "hashtag", - "one", - "three", - ] - assert Hashtag.hashtags_from_content("my #html loves entities") == ["html"] - assert Hashtag.hashtags_from_content("#tag") == ["tag"] - - -def test_linkify_hashtag(): - linkify = lambda html: ContentRenderer(local=True).linkify_hashtags(html, None) - - assert linkify("# hashtag") == "# hashtag" - assert ( - linkify('Text') - == 'Text' - ) - assert ( - linkify("#HashTag") == '#HashTag' - ) - assert ( - linkify( - """A longer text #bigContent -with #tags, linebreaks, and -maybe a few links -#allTheTags #AllTheTags #ALLTHETAGS""" - ) - == """A longer text #bigContent -with #tags, linebreaks, and -maybe a few links -#allTheTags #AllTheTags #ALLTHETAGS""" - ) diff --git a/tests/api/test_statuses.py b/tests/api/test_statuses.py index 1b00642..df576e9 100644 --- a/tests/api/test_statuses.py +++ b/tests/api/test_statuses.py @@ -1,5 +1,7 @@ import pytest +from activities.models import Post + @pytest.mark.django_db def test_post_status(api_token, identity, client): @@ -15,3 +17,44 @@ def test_post_status(api_token, identity, client): ).json() assert response["content"] == "

Hello, world!

" assert response["visibility"] == "unlisted" + + +@pytest.mark.django_db +def test_mention_format(api_token, identity, remote_identity, client): + """ + Ensures mentions work, and only have one link around them. + """ + # Make a local post and check it + response = client.post( + "/api/v1/statuses", + HTTP_AUTHORIZATION=f"Bearer {api_token.token}", + HTTP_ACCEPT="application/json", + content_type="application/json", + data={ + "status": "Hello, @test!", + "visibility": "unlisted", + }, + ).json() + assert ( + response["content"] + == '

Hello, @test!

' + ) + assert response["visibility"] == "unlisted" + + # Make a remote post and check it + post = Post.objects.create( + local=False, + author=remote_identity, + content='

Hey @test

', + object_uri="https://remote.test/status/12345", + ) + post.mentions.add(identity) + response = client.get( + f"/api/v1/statuses/{post.id}", + HTTP_AUTHORIZATION=f"Bearer {api_token.token}", + HTTP_ACCEPT="application/json", + content_type="application/json", + ).json() + assert ( + response["text"] == '

Hey @test

' + ) diff --git a/tests/core/test_html.py b/tests/core/test_html.py index 527991a..c076592 100644 --- a/tests/core/test_html.py +++ b/tests/core/test_html.py @@ -1,155 +1,117 @@ -from unittest.mock import Mock - import pytest -from core.html import ContentRenderer, html_to_plaintext, sanitize_html - - -def test_html_to_plaintext(): - - assert html_to_plaintext("

Hi!

") == "Hi!" - assert html_to_plaintext("

Hi!
There

") == "Hi!\nThere" - assert ( - html_to_plaintext("

Hi!

\n\n

How are you?

") == "Hi!\n\nHow are you?" - ) - - assert ( - html_to_plaintext("

Hi!

\n\n

How are
you?

today

") - == "Hi!\n\nHow are\n you?\n\ntoday" - ) - - assert ( - html_to_plaintext( - '

' - 'The Link ' - 'Empty href ' - "Empty A

" - ) - == "https://fedi.takahe.social/with/a/long/path Empty href Empty A" - ) - - -def test_sanitize_post(): - - assert sanitize_html("

Hello!

") == "

Hello!

" - assert sanitize_html("

It's great

") == "

It's great

" - - # Note that we only want to linkify things with protocol prefixes to prevent - # too many false positives. - assert sanitize_html("

test.com

") == "

test.com

" - assert ( - sanitize_html("

https://test.com

") - == '

https://test.com

' - ) - assert ( - sanitize_html("

@someone@subdomain.some-domain.com

") - == "

@someone@subdomain.some-domain.com

" - ) - - -def test_shorten_url(): - full_url = ( - "https://social.example.com/a-long/path/2023/01/16/that-should-be-shortened" - ) - assert ( - sanitize_html(f"

{full_url}

") - == f'

social.example.com/a-long/path

' - ) - - assert ( - sanitize_html( - f'

This is a long link text, but cannot be shortened as a URL

' - ) - == f'

This is a long link text, but cannot be shortened as a URL

' - ) +from core.html import FediverseHtmlParser @pytest.mark.django_db -def test_link_preservation(): +def test_parser(identity): """ - We want to: - - Preserve incoming links from other servers - - Linkify mentions and hashtags - - Not have these all step on each other! + Validates the HtmlParser in its various output modes """ - renderer = ContentRenderer(local=True) - fake_mention = Mock() - fake_mention.username = "andrew" - fake_mention.domain_id = "aeracode.org" - fake_mention.urls.view = "/@andrew@aeracode.org/" - fake_post = Mock() - fake_post.mentions.all.return_value = [fake_mention] - fake_post.author.domain.uri_domain = "example.com" - fake_post.emojis.all.return_value = [] + # Basic tag allowance + parser = FediverseHtmlParser("

Hello!

") + assert parser.html == "

Hello!

" + assert parser.plain_text == "Hello!" + + # Newline erasure + parser = FediverseHtmlParser("

Hi!

\n\n

How are you?

") + assert parser.html == "

Hi!

How are you?

" + assert parser.plain_text == "Hi!\n\nHow are you?" + + # Trying to be evil + parser = FediverseHtmlParser("pt>") + assert "") + assert "It's great

", find_hashtags=True) + assert parser.html == "

It's great

" + assert parser.plain_text == "It's great" + assert parser.hashtags == set() + + # Linkify works, but only with protocol prefixes + parser = FediverseHtmlParser("

test.com

") + assert parser.html == "

test.com

" + assert parser.plain_text == "test.com" + parser = FediverseHtmlParser("

https://test.com

") assert ( - renderer.render_post( - 'Hello @andrew, I want to link to this #hashtag: here and rewrite #thishashtag', - fake_post, - ) - == 'Hello @andrew, I want to link to this #hashtag: here and rewrite #thishashtag' + parser.html == '

test.com

' ) + assert parser.plain_text == "https://test.com" - -@pytest.mark.django_db -def test_list_rendering(): - """ - We want to: - - Preserve incoming links from other servers - - Linkify mentions and hashtags - - Not have these all step on each other! - """ - renderer = ContentRenderer(local=True) - fake_mention = Mock() - fake_mention.username = "andrew" - fake_mention.domain_id = "aeracode.org" - fake_mention.urls.view = "/@andrew@aeracode.org/" - fake_post = Mock() - fake_post.mentions.all.return_value = [fake_mention] - fake_post.author.domain.uri_domain = "example.com" - fake_post.emojis.all.return_value = [] - + # Links are preserved + parser = FediverseHtmlParser("takahe social") assert ( - renderer.render_post( - "

Ok. The roster so far is:

  • Infosec.exchange (mastodon)
  • pixel.Infosec.exchange (pixelfed)
  • video.Infosec.exchange (peertube)
  • relay.Infosec.exchange (activitypub relay)
  • risky.af (alt mastodon)

What’s next? I think I promised some people here bookwyrm

", - fake_post, - ) - == "

Ok. The roster so far is:

Infosec.exchange (mastodon)
pixel.Infosec.exchange (pixelfed)
video.Infosec.exchange (peertube)
relay.Infosec.exchange (activitypub relay)
risky.af (alt mastodon)

What’s next? I think I promised some people here bookwyrm

" + parser.html + == 'takahe social' + ) + assert parser.plain_text == "https://takahe.social" + + # Very long links are shortened + full_url = "https://social.example.com/a-long/path/that-should-be-shortened" + parser = FediverseHtmlParser(f"

{full_url}

") + assert ( + parser.html + == f'

social.example.com/a-long/path

' + ) + assert ( + parser.plain_text + == "https://social.example.com/a-long/path/that-should-be-shortened" ) - -@pytest.mark.django_db -def test_link_mixcase_mentions(): - renderer = ContentRenderer(local=True) - fake_mention = Mock() - fake_mention.username = "Manfre" - fake_mention.domain_id = "manfre.net" - fake_mention.urls.view = "/@Manfre@manfre.net/" - fake_mention2 = Mock() - fake_mention2.username = "manfre" - fake_mention2.domain_id = "takahe.social" - fake_mention2.urls.view = "https://takahe.social/@manfre@takahe.social/" - - unfetched_mention = Mock() - unfetched_mention.username = None - unfetched_mention.domain_id = None - unfetched_mention.urls.view = "/None@None/" - - fake_post = Mock() - fake_post.mentions.all.return_value = [ - fake_mention, - fake_mention2, - unfetched_mention, - ] - fake_post.author.domain.uri_domain = "example.com" - fake_post.emojis.all.return_value = [] - - assert renderer.render_post( - "@Manfre@manfre.net @mAnFrE@takahe.social @manfre@manfre.net @unfetched@manfre.net", - fake_post, - ) == ( - '@Manfre ' - '@mAnFrE@takahe.social ' - '@manfre ' - "@unfetched@manfre.net" + # Make sure things that look like mentions are left alone with no mentions supplied. + parser = FediverseHtmlParser( + "

@test@example.com

", + find_mentions=True, + find_hashtags=True, + find_emojis=True, ) + assert parser.html == "

@test@example.com

" + assert parser.plain_text == "@test@example.com" + assert parser.mentions == {"test@example.com"} + + # Make sure mentions work when there is a mention supplied + parser = FediverseHtmlParser( + "

@test@example.com

", + mentions=[identity], + find_hashtags=True, + find_emojis=True, + ) + assert parser.html == '

@test

' + assert parser.plain_text == "@test@example.com" + assert parser.mentions == {"test@example.com"} + + # Ensure mentions are case insensitive + parser = FediverseHtmlParser( + "

@TeSt@ExamPle.com

", + mentions=[identity], + find_hashtags=True, + find_emojis=True, + ) + assert parser.html == '

@TeSt

' + assert parser.plain_text == "@TeSt@ExamPle.com" + assert parser.mentions == {"test@example.com"} + + # Ensure hashtags are linked, even through spans, but not within hrefs + parser = FediverseHtmlParser( + 'something #hashtag #hashtagtwo', + find_hashtags=True, + find_emojis=True, + ) + assert ( + parser.html + == 'something #hashtag #hashtagtwo' + ) + assert parser.plain_text == "http://example.com#notahashtag #hashtag #hashtagtwo" + assert parser.hashtags == {"hashtag", "hashtagtwo"} + + # Ensure lists are rendered reasonably + parser = FediverseHtmlParser( + "

List:

  • One
  • Two
  • Three

End!

", + find_hashtags=True, + find_emojis=True, + ) + assert parser.html == "

List:

One
Two
Three

End!

" + assert parser.plain_text == "List:\n\nOne\nTwo\nThree\n\nEnd!" diff --git a/users/models/identity.py b/users/models/identity.py index 754fe77..3050b06 100644 --- a/users/models/identity.py +++ b/users/models/identity.py @@ -13,7 +13,7 @@ from django.utils.functional import lazy from lxml import etree from core.exceptions import ActorMismatchError, capture_message -from core.html import ContentRenderer, html_to_plaintext, strip_html +from core.html import ContentRenderer, FediverseHtmlParser from core.ld import ( canonicalise, format_ld_date, @@ -530,8 +530,8 @@ class Identity(StatorModel): response["attachment"] = [ { "type": "http://schema.org#PropertyValue", - "name": strip_html(item["name"], linkify=False), - "value": strip_html(item["value"]), + "name": FediverseHtmlParser(item["name"]).plain_text, + "value": FediverseHtmlParser(item["value"]).html, } for item in self.metadata ] @@ -781,7 +781,9 @@ class Identity(StatorModel): self.metadata.append( { "name": attachment.get("name"), - "value": strip_html(attachment.get("http://schema.org#value")), + "value": FediverseHtmlParser( + attachment.get("http://schema.org#value") + ).html, } ) # Now go do webfinger with that info to see if we can get a canonical domain @@ -903,12 +905,14 @@ class Identity(StatorModel): Post.Visibilities.mentioned: "direct", } result["source"] = { - "note": html_to_plaintext(self.summary) if self.summary else "", + "note": FediverseHtmlParser(self.summary).plain_text + if self.summary + else "", "fields": ( [ { "name": m["name"], - "value": strip_html(m["value"], linkify=False), + "value": FediverseHtmlParser(m["value"]).plain_text, "verified_at": None, } for m in self.metadata diff --git a/users/services/identity.py b/users/services/identity.py index a653ebe..9560982 100644 --- a/users/services/identity.py +++ b/users/services/identity.py @@ -3,7 +3,7 @@ from django.template.defaultfilters import linebreaks_filter from activities.models import FanOut from core.files import resize_image -from core.html import strip_html +from core.html import FediverseHtmlParser from users.models import ( Block, BlockStates, @@ -211,7 +211,7 @@ class IdentityService: Safely sets a summary and turns linebreaks into HTML """ if summary: - self.identity.summary = linebreaks_filter(strip_html(summary)) + self.identity.summary = FediverseHtmlParser(linebreaks_filter(summary)).html else: self.identity.summary = None self.identity.save() diff --git a/users/views/settings/profile.py b/users/views/settings/profile.py index 7a2e957..bcbea8a 100644 --- a/users/views/settings/profile.py +++ b/users/views/settings/profile.py @@ -4,7 +4,7 @@ from django.shortcuts import redirect from django.utils.decorators import method_decorator from django.views.generic import FormView -from core.html import html_to_plaintext +from core.html import FediverseHtmlParser from core.models.config import Config from users.decorators import identity_required from users.models import IdentityStates @@ -65,7 +65,11 @@ class ProfilePage(FormView): identity = self.request.identity return { "name": identity.name, - "summary": html_to_plaintext(identity.summary) if identity.summary else "", + "summary": ( + FediverseHtmlParser(identity.summary).plain_text + if identity.summary + else "" + ), "icon": identity.icon and identity.icon.url, "image": identity.image and identity.image.url, "discoverable": identity.discoverable,