diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b2392b9..9fc237b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -48,13 +48,7 @@ repos:
- id: mypy
exclude: "^tests/"
additional_dependencies:
- [
- types-pyopenssl,
- types-bleach,
- types-mock,
- types-cachetools,
- types-python-dateutil,
- ]
+ [types-pyopenssl, types-mock, types-cachetools, types-python-dateutil]
- repo: https://github.com/rtts/djhtml
rev: v1.5.2
diff --git a/activities/admin.py b/activities/admin.py
index f3444ed..24ef6e5 100644
--- a/activities/admin.py
+++ b/activities/admin.py
@@ -1,4 +1,3 @@
-from asgiref.sync import async_to_sync
from django.contrib import admin
from django.db import models
from django.utils.safestring import mark_safe
@@ -165,7 +164,6 @@ class PostAdmin(admin.ModelAdmin):
list_filter = ("type", "local", "visibility", "state", "created")
raw_id_fields = ["emojis"]
autocomplete_fields = ["to", "mentions", "author"]
- actions = ["reparse_hashtags"]
search_fields = ["content", "search_handle", "search_service_handle"]
inlines = [PostAttachmentInline]
readonly_fields = ["created", "updated", "state_changed", "object_json"]
@@ -183,13 +181,6 @@ class PostAdmin(admin.ModelAdmin):
)
return super().get_search_results(request, queryset, search_term)
- @admin.action(description="Reprocess content for hashtags")
- def reparse_hashtags(self, request, queryset):
- for instance in queryset:
- instance.hashtags = Hashtag.hashtags_from_content(instance.content) or None
- instance.save()
- async_to_sync(instance.ensure_hashtags)()
-
@admin.display(description="ActivityPub JSON")
def object_json(self, instance):
return instance.to_ap()
diff --git a/activities/models/emoji.py b/activities/models/emoji.py
index 0e29a47..2946a94 100644
--- a/activities/models/emoji.py
+++ b/activities/models/emoji.py
@@ -1,5 +1,4 @@
import mimetypes
-import re
from functools import partial
from typing import ClassVar
@@ -14,7 +13,7 @@ from django.db import models
from django.utils.safestring import mark_safe
from core.files import get_remote_file
-from core.html import strip_html
+from core.html import FediverseHtmlParser
from core.ld import format_ld_date
from core.models import Config
from core.uploads import upload_emoji_namer
@@ -134,8 +133,6 @@ class Emoji(StatorModel):
admin_disable = "{admin}{self.pk}/disable/"
admin_copy = "{admin}{self.pk}/copy/"
- emoji_regex = re.compile(r"\B:([a-zA-Z0-9(_)-]+):\B")
-
def delete(self, using=None, keep_parents=False):
if self.file:
self.file.delete()
@@ -242,7 +239,9 @@ class Emoji(StatorModel):
Return a parsed and sanitized of emoji found in content without
the surrounding ':'.
"""
- emoji_hits = cls.emoji_regex.findall(strip_html(content))
+ emoji_hits = FediverseHtmlParser(
+ content, find_emojis=True, emoji_domain=domain
+ ).emojis
emojis = sorted({emoji.lower() for emoji in emoji_hits})
return list(
cls.objects.filter(local=(domain is None) or domain.local)
diff --git a/activities/models/hashtag.py b/activities/models/hashtag.py
index 8430fd4..176bdc1 100644
--- a/activities/models/hashtag.py
+++ b/activities/models/hashtag.py
@@ -6,7 +6,6 @@ from asgiref.sync import sync_to_async
from django.db import models
from django.utils import timezone
-from core.html import strip_html
from core.models import Config
from stator.models import State, StateField, StateGraph, StatorModel
@@ -167,16 +166,6 @@ class Hashtag(StatorModel):
results[date(year, month, day)] = val
return dict(sorted(results.items(), reverse=True)[:num])
- @classmethod
- def hashtags_from_content(cls, content) -> list[str]:
- """
- Return a parsed and sanitized of hashtags found in content without
- leading '#'.
- """
- hashtag_hits = cls.hashtag_regex.findall(strip_html(content))
- hashtags = sorted({tag.lower() for tag in hashtag_hits})
- return list(hashtags)
-
def to_mastodon_json(self):
return {
"name": self.hashtag,
diff --git a/activities/models/post.py b/activities/models/post.py
index 8d00f46..88e53af 100644
--- a/activities/models/post.py
+++ b/activities/models/post.py
@@ -2,7 +2,6 @@ import datetime
import hashlib
import json
import mimetypes
-import re
import ssl
from collections.abc import Iterable
from typing import Optional
@@ -26,7 +25,7 @@ from activities.models.post_types import (
PostTypeDataEncoder,
)
from core.exceptions import capture_message
-from core.html import ContentRenderer, strip_html
+from core.html import ContentRenderer, FediverseHtmlParser
from core.ld import (
canonicalise,
format_ld_date,
@@ -374,10 +373,6 @@ class Post(StatorModel):
def clean_type_data(self, value):
PostTypeData.parse_obj(value)
- mention_regex = re.compile(
- r"(^|[^\w\d\-_/])@([\w\d\-_]+(?:@[\w\d\-_\.]+[\w\d\-_]+)?)"
- )
-
def _safe_content_note(self, *, local: bool = True):
return ContentRenderer(local=local).render_post(self.content, self)
@@ -474,12 +469,12 @@ class Post(StatorModel):
# Maintain local-only for replies
if reply_to.visibility == reply_to.Visibilities.local_only:
visibility = reply_to.Visibilities.local_only
- # Find hashtags in this post
- hashtags = Hashtag.hashtags_from_content(content) or None
# Find emoji in this post
emojis = Emoji.emojis_from_content(content, None)
- # Strip all HTML and apply linebreaks filter
- content = linebreaks_filter(strip_html(content))
+ # Strip all unwanted HTML and apply linebreaks filter, grabbing hashtags on the way
+ parser = FediverseHtmlParser(linebreaks_filter(content), find_hashtags=True)
+ content = parser.html
+ hashtags = sorted(parser.hashtags) or None
# Make the Post object
post = cls.objects.create(
author=author,
@@ -512,12 +507,13 @@ class Post(StatorModel):
):
with transaction.atomic():
# Strip all HTML and apply linebreaks filter
- self.content = linebreaks_filter(strip_html(content))
+            # Hashtag collection is opt-in on the parser; without the flag an edit would clear self.hashtags
+            parser = FediverseHtmlParser(linebreaks_filter(content), find_hashtags=True)
+            self.content = parser.html
+            self.hashtags = sorted(parser.hashtags) or None
self.summary = summary or None
self.sensitive = bool(summary)
self.visibility = visibility
self.edited = timezone.now()
- self.hashtags = Hashtag.hashtags_from_content(content) or None
self.mentions.set(self.mentions_from_content(content, self.author))
self.emojis.set(Emoji.emojis_from_content(content, None))
self.attachments.set(attachments or [])
@@ -525,9 +521,9 @@ class Post(StatorModel):
@classmethod
def mentions_from_content(cls, content, author) -> set[Identity]:
- mention_hits = cls.mention_regex.findall(content)
+ mention_hits = FediverseHtmlParser(content, find_mentions=True).mentions
mentions = set()
- for precursor, handle in mention_hits:
+ for handle in mention_hits:
handle = handle.lower()
if "@" in handle:
username, domain = handle.split("@", 1)
diff --git a/activities/views/compose.py b/activities/views/compose.py
index c2e3618..96c9e11 100644
--- a/activities/views/compose.py
+++ b/activities/views/compose.py
@@ -14,7 +14,7 @@ from activities.models import (
TimelineEvent,
)
from core.files import blurhash_image, resize_image
-from core.html import html_to_plaintext
+from core.html import FediverseHtmlParser
from core.models import Config
from users.decorators import identity_required
@@ -112,7 +112,7 @@ class Compose(FormView):
{
"reply_to": self.reply_to.pk if self.reply_to else "",
"visibility": self.post_obj.visibility,
- "text": html_to_plaintext(self.post_obj.content),
+ "text": FediverseHtmlParser(self.post_obj.content).plain_text,
"content_warning": self.post_obj.summary,
}
)
diff --git a/core/html.py b/core/html.py
index 5728899..fc790c8 100644
--- a/core/html.py
+++ b/core/html.py
@@ -1,199 +1,309 @@
+import html
import re
-from functools import partial
+from html.parser import HTMLParser
-import bleach
-import bleach.callbacks
-from bleach.html5lib_shim import Filter
-from bleach.linkifier import LinkifyFilter
from django.utils.safestring import mark_safe
-url_regex = re.compile(
- r"""\(* # Match any opening parentheses.
- \b(?"]*)?
+
+class FediverseHtmlParser(HTMLParser):
+ """
+ A custom HTML parser that only allows a certain tag subset and behaviour:
+ - br, p tags are passed through
+ - a tags are passed through if they're not hashtags or mentions
+ - Another set of tags are converted to p
+
+ It also linkifies URLs, mentions, hashtags, and imagifies emoji.
+ """
+
+ REWRITE_TO_P = [
+ "p",
+ "h1",
+ "h2",
+ "h3",
+ "h4",
+ "h5",
+ "h6",
+ "blockquote",
+ "pre",
+ "ul",
+ "ol",
+ ]
+
+ REWRITE_TO_BR = [
+ "br",
+ "li",
+ ]
+
+ MENTION_REGEX = re.compile(
+ r"(^|[^\w\d\-_/])@([\w\d\-_]+(?:@[\w\d\-_\.]+[\w\d\-_]+)?)"
+ )
+
+ HASHTAG_REGEX = re.compile(r"\B#([a-zA-Z0-9(_)]+\b)(?!;)")
+
+ EMOJI_REGEX = re.compile(r"\B:([a-zA-Z0-9(_)-]+):\B")
+
+ URL_REGEX = re.compile(
+ r"""(\(* # Match any opening parentheses.
+ \b(?"]*)?)
# /path/zz (excluding "unsafe" chars from RFC 1738,
# except for # and ~, which happen in practice)
- """,
- re.IGNORECASE | re.VERBOSE | re.UNICODE,
-)
-
-ALLOWED_TAGS = ["br", "p", "a"]
-REWRITTEN_TAGS = [
- "h1",
- "h2",
- "h3",
- "h4",
- "h5",
- "h6",
- "blockquote",
- "pre",
- "ul",
- "ol",
- "li",
-]
-
-
-class MastodonStrictTagFilter(Filter):
- """
- Implements Python equivalent of Mastodon tag rewriter
-
- Clone of https://github.com/mastodon/mastodon/blob/main/lib/sanitize_ext/sanitize_config.rb#L55
-
- Broadly this replaces all REWRITTEN_TAGS with `p` except for lists where it formats it into `
` lists
- """
-
- def __iter__(self):
- li_pending_break = False
- break_token = {
- "name": "br",
- "data": {},
- "type": "StartTag",
- }
-
- for token in Filter.__iter__(self):
- if token.get("name") not in REWRITTEN_TAGS or token["type"] not in [
- "StartTag",
- "EndTag",
- ]:
- yield token
- continue
-
- if token["type"] == "StartTag":
- if token["name"] == "li":
- if li_pending_break:
- # Another `li` appeared, so break after the last one
- yield break_token
- continue
- token["name"] = "p"
- elif token["type"] == "EndTag":
- if token["name"] == "li":
- # Track that an `li` closed so we know a break should be considered
- li_pending_break = True
- continue
- if token["name"] == "ul":
- # If the last `li` happened, then don't add a break because Mastodon doesn't
- li_pending_break = False
- token["name"] = "p"
-
- yield token
-
-
-class UnlinkifyFilter(Filter):
- """
- Forcibly replaces link text with the href.
-
- This is intented to be used when stripping tags to preserve the link
- location at the expense of the link text.
- """
-
- def __iter__(self):
- discarding_a_text = False
- for token in Filter.__iter__(self):
- if token.get("name") == "a":
- if token["type"] == "EndTag":
- discarding_a_text = False
- continue
- href = token["data"].get((None, "href"))
-
- # If has an href, we use it and throw away all content
- # within the .... If href missing or empty, try to find
- # text within the ...
- if href:
- yield {"data": href, "type": "Characters"}
- discarding_a_text = True
- continue
- elif not discarding_a_text:
- yield token
- # else: throw away tokens until we're out of the
-
-
-def allow_a(tag: str, name: str, value: str):
- if name in ["href", "title", "class"]:
- return True
- elif name == "rel":
- # Only allow rel attributes with a small subset of values
- # (we're defending against, for example, rel=me)
- rel_values = value.split()
- if all(v in ["nofollow", "noopener", "noreferrer", "tag"] for v in rel_values):
- return True
- return False
-
-
-def shorten_link_text(attrs, new=False):
- """
- Applies Mastodon's link shortening behavior where URL text links are
- shortened by removing the scheme and only showing the first 30 chars.
-
- Orig:
- https://social.example.com/a-long/path/2023/01/16/that-should-be-shortened
-
- Becomes:
- social.example.com/a-long/path
-
- """
- text = attrs.get("_text")
- if not text:
- text = attrs.get((None, "href"))
- if text and "://" in text and len(text) > 30:
- text = text.split("://", 1)[-1]
- attrs["_text"] = text[:30]
- if len(text) > 30:
- attrs[(None, "class")] = " ".join(
- filter(None, [attrs.pop((None, "class"), ""), "ellipsis"])
- )
- # Add the full URL in to title for easier user inspection
- attrs[(None, "title")] = attrs.get((None, "href"))
-
- return attrs
-
-
-linkify_callbacks = [bleach.callbacks.nofollow, shorten_link_text]
-
-
-def sanitize_html(post_html: str) -> str:
- """
- Only allows a, br, p and span tags, and class attributes.
- """
- cleaner = bleach.Cleaner(
- tags=ALLOWED_TAGS + REWRITTEN_TAGS,
- attributes={ # type:ignore
- "a": allow_a,
- "p": ["class"],
- },
- filters=[
- partial(LinkifyFilter, url_re=url_regex, callbacks=linkify_callbacks),
- MastodonStrictTagFilter,
- ],
- strip=True,
+ """,
+ re.IGNORECASE | re.VERBOSE | re.UNICODE,
)
- return mark_safe(cleaner.clean(post_html))
+ def __init__(
+ self,
+ html: str,
+ uri_domain: str | None = None,
+ mentions: list | None = None,
+ find_mentions: bool = False,
+ find_hashtags: bool = False,
+ find_emojis: bool = False,
+ emoji_domain=None,
+ ):
+ """
+ Stores the parsing options, resets all output state and then parses
+ ``html`` immediately - results are available as soon as the
+ constructor returns.
+
+ html: the markup to parse; newlines are removed before parsing.
+ uri_domain: when set, mention URLs are taken from each mention's
+ absolute_profile_uri() rather than its urls.view.
+ mentions: objects to build the mention lookup from (they are read
+ for username, domain_id and a profile URL).
+ find_mentions / find_hashtags / find_emojis: stored as flags here;
+ presumably they switch on collection into the matching result
+ sets - the code that reads them is outside this hunk, TODO confirm.
+ emoji_domain: stored as-is for later use.
+ """
+ super().__init__()
+ self.uri_domain = uri_domain
+ self.emoji_domain = emoji_domain
+ self.find_mentions = find_mentions
+ self.find_hashtags = find_hashtags
+ self.find_emojis = find_emojis
+ # Build the handle -> URL lookup (reads self.uri_domain, set above)
+ self.calculate_mentions(mentions)
+ # Character data waiting to be flushed
+ self._data_buffer = ""
+ # Accumulated HTML and plain text renderings
+ self.html_output = ""
+ self.text_output = ""
+ # Result sets, all starting out empty
+ self.emojis: set[str] = set()
+ self.mentions: set[str] = set()
+ self.hashtags: set[str] = set()
+ # State for an <a> tag that has been opened (attrs plus collected content)
+ self._pending_a: dict | None = None
+ # True when the most recent start tag was one of REWRITE_TO_P
+ self._fresh_p = False
+ # All state must exist before this point: feed() invokes the handlers
+ self.feed(html.replace("\n", ""))
+ # Emit whatever is still buffered after the last tag
+ self.flush_data()
-def strip_html(post_html: str, *, linkify: bool = True) -> str:
- """
- Strips all tags from the text, then linkifies it.
- """
- cleaner = bleach.Cleaner(
- tags=[],
- strip=True,
- filters=[partial(LinkifyFilter, url_re=url_regex, callbacks=linkify_callbacks)]
- if linkify
- else [UnlinkifyFilter],
- )
- return mark_safe(cleaner.clean(post_html))
+    def calculate_mentions(self, mentions: list | None):
+        """
+        Builds the lookup table of handles we expect mentions in the content
+        to look like, mapped to the profile URL they should link to.
+
+        For every supplied mention that has a username, both the bare
+        ``username`` and the full ``username@domain`` form (lowercased) are
+        registered in ``self.mention_matches``. The URL is the mention's
+        absolute profile URI when ``self.uri_domain`` is set, and its local
+        view URL otherwise.
+
+        ``self.mention_aliases`` is only initialised (empty) here.
+        """
+        self.mention_matches: dict[str, str] = {}
+        self.mention_aliases: dict[str, str] = {}
+        for mention in mentions or []:
+            if self.uri_domain:
+                url = mention.absolute_profile_uri()
+            else:
+                url = str(mention.urls.view)
+            # Mentions without a username contribute nothing to the lookup
+            if mention.username:
+                username = mention.username.lower()
+                domain = mention.domain_id.lower()
+                self.mention_matches[username] = url
+                self.mention_matches[f"{username}@{domain}"] = url
+    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+        """
+        Handles an opening tag, emitting only the normalised subset of markup.
+
+        - Tags in REWRITE_TO_P open a ``<p>`` in the HTML output.
+        - Tags in REWRITE_TO_BR emit a ``<br>`` (and a newline in the text
+          output) unless the previous start tag just opened a paragraph.
+        - ``a`` tags are not written out yet; their attributes are stashed in
+          ``_pending_a`` together with an empty content buffer.
+        - Any other tag produces no output here.
+
+        Buffered character data is flushed before any of the handled tags so
+        that output stays in document order.
+        """
+        if tag in self.REWRITE_TO_P:
+            self.flush_data()
+            self.html_output += "<p>"
+        elif tag in self.REWRITE_TO_BR:
+            self.flush_data()
+            # Skip the break right after a paragraph was opened - presumably
+            # so the paragraph does not start with an empty line
+            if not self._fresh_p:
+                self.html_output += "<br>"
+                self.text_output += "\n"
+        elif tag == "a":
+            self.flush_data()
+            self._pending_a = {"attrs": dict(attrs), "content": ""}
+        # Remember whether this tag opened a paragraph, for the next start tag
+        self._fresh_p = tag in self.REWRITE_TO_P
-def html_to_plaintext(post_html: str) -> str:
- """
- Tries to do the inverse of the linebreaks filter.
- """
- # TODO: Handle HTML entities
- # Remove all newlines, then replace br with a newline and /p with two (one comes from bleach)
- post_html = post_html.replace("\n", "").replace("
", "\n").replace("
Hello, world!
" assert response["visibility"] == "unlisted" + + +@pytest.mark.django_db +def test_mention_format(api_token, identity, remote_identity, client): + """ + Ensures mentions work, and only have one link around them. + """ + # Make a local post and check it + response = client.post( + "/api/v1/statuses", + HTTP_AUTHORIZATION=f"Bearer {api_token.token}", + HTTP_ACCEPT="application/json", + content_type="application/json", + data={ + "status": "Hello, @test!", + "visibility": "unlisted", + }, + ).json() + assert ( + response["content"] + == 'Hello, @test!
' + ) + assert response["visibility"] == "unlisted" + + # Make a remote post and check it + post = Post.objects.create( + local=False, + author=remote_identity, + content='Hey @test
', + object_uri="https://remote.test/status/12345", + ) + post.mentions.add(identity) + response = client.get( + f"/api/v1/statuses/{post.id}", + HTTP_AUTHORIZATION=f"Bearer {api_token.token}", + HTTP_ACCEPT="application/json", + content_type="application/json", + ).json() + assert ( + response["text"] == 'Hey @test
' + ) diff --git a/tests/core/test_html.py b/tests/core/test_html.py index 527991a..c076592 100644 --- a/tests/core/test_html.py +++ b/tests/core/test_html.py @@ -1,155 +1,117 @@ -from unittest.mock import Mock - import pytest -from core.html import ContentRenderer, html_to_plaintext, sanitize_html - - -def test_html_to_plaintext(): - - assert html_to_plaintext("Hi!
") == "Hi!" - assert html_to_plaintext("Hi!
There
Hi!
\n\nHow are you?
") == "Hi!\n\nHow are you?" - ) - - assert ( - html_to_plaintext("Hi!
\n\nHow are
you?
today
") - == "Hi!\n\nHow are\n you?\n\ntoday" - ) - - assert ( - html_to_plaintext( - '' - 'The Link ' - 'Empty href ' - "Empty A
" - ) - == "https://fedi.takahe.social/with/a/long/path Empty href Empty A" - ) - - -def test_sanitize_post(): - - assert sanitize_html("Hello!
") == "Hello!
" - assert sanitize_html("It's great
") == "It's great
" - - # Note that we only want to linkify things with protocol prefixes to prevent - # too many false positives. - assert sanitize_html("test.com
") == "test.com
" - assert ( - sanitize_html("https://test.com
") - == '' - ) - assert ( - sanitize_html("@someone@subdomain.some-domain.com
") - == "@someone@subdomain.some-domain.com
" - ) - - -def test_shorten_url(): - full_url = ( - "https://social.example.com/a-long/path/2023/01/16/that-should-be-shortened" - ) - assert ( - sanitize_html(f"{full_url}
") - == f'social.example.com/a-long/path
' - ) - - assert ( - sanitize_html( - f'This is a long link text, but cannot be shortened as a URL
' - ) - == f'This is a long link text, but cannot be shortened as a URL
' - ) +from core.html import FediverseHtmlParser @pytest.mark.django_db -def test_link_preservation(): +def test_parser(identity): """ - We want to: - - Preserve incoming links from other servers - - Linkify mentions and hashtags - - Not have these all step on each other! + Validates the HtmlParser in its various output modes """ - renderer = ContentRenderer(local=True) - fake_mention = Mock() - fake_mention.username = "andrew" - fake_mention.domain_id = "aeracode.org" - fake_mention.urls.view = "/@andrew@aeracode.org/" - fake_post = Mock() - fake_post.mentions.all.return_value = [fake_mention] - fake_post.author.domain.uri_domain = "example.com" - fake_post.emojis.all.return_value = [] + # Basic tag allowance + parser = FediverseHtmlParser("Hello!
") + assert parser.html == "Hello!
" + assert parser.plain_text == "Hello!" + + # Newline erasure + parser = FediverseHtmlParser("Hi!
\n\nHow are you?
") + assert parser.html == "Hi!
How are you?
" + assert parser.plain_text == "Hi!\n\nHow are you?" + + # Trying to be evil + parser = FediverseHtmlParser("It's great
" + assert parser.plain_text == "It's great" + assert parser.hashtags == set() + + # Linkify works, but only with protocol prefixes + parser = FediverseHtmlParser("test.com
") + assert parser.html == "test.com
" + assert parser.plain_text == "test.com" + parser = FediverseHtmlParser("https://test.com
") assert ( - renderer.render_post( - 'Hello @andrew, I want to link to this #hashtag: here and rewrite #thishashtag', - fake_post, - ) - == 'Hello @andrew, I want to link to this #hashtag: here and rewrite #thishashtag' + parser.html == '' ) + assert parser.plain_text == "https://test.com" - -@pytest.mark.django_db -def test_list_rendering(): - """ - We want to: - - Preserve incoming links from other servers - - Linkify mentions and hashtags - - Not have these all step on each other! - """ - renderer = ContentRenderer(local=True) - fake_mention = Mock() - fake_mention.username = "andrew" - fake_mention.domain_id = "aeracode.org" - fake_mention.urls.view = "/@andrew@aeracode.org/" - fake_post = Mock() - fake_post.mentions.all.return_value = [fake_mention] - fake_post.author.domain.uri_domain = "example.com" - fake_post.emojis.all.return_value = [] - + # Links are preserved + parser = FediverseHtmlParser("takahe social") assert ( - renderer.render_post( - "Ok. The roster so far is:
What’s next? I think I promised some people here bookwyrm
", - fake_post, - ) - == "Ok. The roster so far is:
Infosec.exchange (mastodon)
pixel.Infosec.exchange (pixelfed)
video.Infosec.exchange (peertube)
relay.Infosec.exchange (activitypub relay)
risky.af (alt mastodon)
What’s next? I think I promised some people here bookwyrm
" + parser.html + == 'takahe social' + ) + assert parser.plain_text == "https://takahe.social" + + # Very long links are shortened + full_url = "https://social.example.com/a-long/path/that-should-be-shortened" + parser = FediverseHtmlParser(f"{full_url}
") + assert ( + parser.html + == f'social.example.com/a-long/path
' + ) + assert ( + parser.plain_text + == "https://social.example.com/a-long/path/that-should-be-shortened" ) - -@pytest.mark.django_db -def test_link_mixcase_mentions(): - renderer = ContentRenderer(local=True) - fake_mention = Mock() - fake_mention.username = "Manfre" - fake_mention.domain_id = "manfre.net" - fake_mention.urls.view = "/@Manfre@manfre.net/" - fake_mention2 = Mock() - fake_mention2.username = "manfre" - fake_mention2.domain_id = "takahe.social" - fake_mention2.urls.view = "https://takahe.social/@manfre@takahe.social/" - - unfetched_mention = Mock() - unfetched_mention.username = None - unfetched_mention.domain_id = None - unfetched_mention.urls.view = "/None@None/" - - fake_post = Mock() - fake_post.mentions.all.return_value = [ - fake_mention, - fake_mention2, - unfetched_mention, - ] - fake_post.author.domain.uri_domain = "example.com" - fake_post.emojis.all.return_value = [] - - assert renderer.render_post( - "@Manfre@manfre.net @mAnFrE@takahe.social @manfre@manfre.net @unfetched@manfre.net", - fake_post, - ) == ( - '@Manfre ' - '@mAnFrE@takahe.social ' - '@manfre ' - "@unfetched@manfre.net" + # Make sure things that look like mentions are left alone with no mentions supplied. + parser = FediverseHtmlParser( + "@test@example.com
", + find_mentions=True, + find_hashtags=True, + find_emojis=True, ) + assert parser.html == "@test@example.com
" + assert parser.plain_text == "@test@example.com" + assert parser.mentions == {"test@example.com"} + + # Make sure mentions work when there is a mention supplied + parser = FediverseHtmlParser( + "@test@example.com
", + mentions=[identity], + find_hashtags=True, + find_emojis=True, + ) + assert parser.html == '' + assert parser.plain_text == "@test@example.com" + assert parser.mentions == {"test@example.com"} + + # Ensure mentions are case insensitive + parser = FediverseHtmlParser( + "@TeSt@ExamPle.com
", + mentions=[identity], + find_hashtags=True, + find_emojis=True, + ) + assert parser.html == '' + assert parser.plain_text == "@TeSt@ExamPle.com" + assert parser.mentions == {"test@example.com"} + + # Ensure hashtags are linked, even through spans, but not within hrefs + parser = FediverseHtmlParser( + 'something #hashtag #hashtagtwo', + find_hashtags=True, + find_emojis=True, + ) + assert ( + parser.html + == 'something #hashtag #hashtagtwo' + ) + assert parser.plain_text == "http://example.com#notahashtag #hashtag #hashtagtwo" + assert parser.hashtags == {"hashtag", "hashtagtwo"} + + # Ensure lists are rendered reasonably + parser = FediverseHtmlParser( + "List:
End!
", + find_hashtags=True, + find_emojis=True, + ) + assert parser.html == "List:
One
Two
Three
End!
" + assert parser.plain_text == "List:\n\nOne\nTwo\nThree\n\nEnd!" diff --git a/users/models/identity.py b/users/models/identity.py index 754fe77..3050b06 100644 --- a/users/models/identity.py +++ b/users/models/identity.py @@ -13,7 +13,7 @@ from django.utils.functional import lazy from lxml import etree from core.exceptions import ActorMismatchError, capture_message -from core.html import ContentRenderer, html_to_plaintext, strip_html +from core.html import ContentRenderer, FediverseHtmlParser from core.ld import ( canonicalise, format_ld_date, @@ -530,8 +530,8 @@ class Identity(StatorModel): response["attachment"] = [ { "type": "http://schema.org#PropertyValue", - "name": strip_html(item["name"], linkify=False), - "value": strip_html(item["value"]), + "name": FediverseHtmlParser(item["name"]).plain_text, + "value": FediverseHtmlParser(item["value"]).html, } for item in self.metadata ] @@ -781,7 +781,9 @@ class Identity(StatorModel): self.metadata.append( { "name": attachment.get("name"), - "value": strip_html(attachment.get("http://schema.org#value")), + "value": FediverseHtmlParser( + attachment.get("http://schema.org#value") + ).html, } ) # Now go do webfinger with that info to see if we can get a canonical domain @@ -903,12 +905,14 @@ class Identity(StatorModel): Post.Visibilities.mentioned: "direct", } result["source"] = { - "note": html_to_plaintext(self.summary) if self.summary else "", + "note": FediverseHtmlParser(self.summary).plain_text + if self.summary + else "", "fields": ( [ { "name": m["name"], - "value": strip_html(m["value"], linkify=False), + "value": FediverseHtmlParser(m["value"]).plain_text, "verified_at": None, } for m in self.metadata diff --git a/users/services/identity.py b/users/services/identity.py index a653ebe..9560982 100644 --- a/users/services/identity.py +++ b/users/services/identity.py @@ -3,7 +3,7 @@ from django.template.defaultfilters import linebreaks_filter from activities.models import FanOut from core.files import 
resize_image -from core.html import strip_html +from core.html import FediverseHtmlParser from users.models import ( Block, BlockStates, @@ -211,7 +211,7 @@ class IdentityService: Safely sets a summary and turns linebreaks into HTML """ if summary: - self.identity.summary = linebreaks_filter(strip_html(summary)) + self.identity.summary = FediverseHtmlParser(linebreaks_filter(summary)).html else: self.identity.summary = None self.identity.save() diff --git a/users/views/settings/profile.py b/users/views/settings/profile.py index 7a2e957..bcbea8a 100644 --- a/users/views/settings/profile.py +++ b/users/views/settings/profile.py @@ -4,7 +4,7 @@ from django.shortcuts import redirect from django.utils.decorators import method_decorator from django.views.generic import FormView -from core.html import html_to_plaintext +from core.html import FediverseHtmlParser from core.models.config import Config from users.decorators import identity_required from users.models import IdentityStates @@ -65,7 +65,11 @@ class ProfilePage(FormView): identity = self.request.identity return { "name": identity.name, - "summary": html_to_plaintext(identity.summary) if identity.summary else "", + "summary": ( + FediverseHtmlParser(identity.summary).plain_text + if identity.summary + else "" + ), "icon": identity.icon and identity.icon.url, "image": identity.image and identity.image.url, "discoverable": identity.discoverable,