takahe/core/html.py

import re
from functools import partial

import bleach
from bleach.linkifier import LinkifyFilter
from django.utils.safestring import mark_safe

# Pattern handed to bleach's LinkifyFilter (see sanitize_html / strip_html) to
# decide which runs of text get turned into links.
#
# The scheme group is not optional, so only text that starts with http:// or
# https:// (optionally followed by a `user:pass@`-style prefix) is matched.
#
# Compiled with re.VERBOSE: the whitespace and the "# ..." notes inside the
# pattern are ignored by the regex engine and serve only as documentation.
#
# NOTE(review): the doubled braces in the path character class look like
# leftovers from a str.format() template; inside a character class they are
# redundant but harmless - confirm before simplifying.
url_regex = re.compile(
    r"""\(*  # Match any opening parentheses.
    \b(?<![@.])(?:https?://(?:(?:\w+:)?\w+@)?)  # http://
    ([\w-]+\.)+(?:[\w-]+)(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
    (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
        # /path/zz (excluding "unsafe" chars from RFC 1738,
        # except for # and ~, which happen in practice)
    """,
    re.IGNORECASE | re.VERBOSE | re.UNICODE,
)


def allow_a(tag: str, name: str, value: str):
    """
    Attribute filter used for <a> tags when sanitizing HTML.

    Returns True when the attribute `name` (with the given `value`) may be
    kept, and False when it must be dropped. `tag` is accepted because the
    filter is called with it, but it is not consulted.
    """
    if name == "rel":
        # Only a small subset of rel values is let through
        # (this defends against, for example, rel=me)
        permitted = ("nofollow", "noopener", "noreferrer", "tag")
        for token in value.split():
            if token not in permitted:
                return False
        return True
    # Every other attribute is judged by name alone
    return name in ("href", "title", "class")


def sanitize_html(post_html: str) -> str:
    """
    Cleans the given HTML so that only a, br and p tags remain.

    Attributes on <a> are decided by allow_a, and <p> may keep its class
    attribute. Text matching url_regex is linkified as part of the same
    pass, and the cleaned result is returned marked as safe.
    """
    allowed_attributes = {
        "a": allow_a,
        "p": ["class"],
    }
    linkify_filter = partial(LinkifyFilter, url_re=url_regex)
    cleaner = bleach.Cleaner(
        tags=["br", "p", "a"],
        attributes=allowed_attributes,  # type:ignore
        filters=[linkify_filter],
        strip=True,
    )
    cleaned = cleaner.clean(post_html)
    return mark_safe(cleaned)


def strip_html(post_html: str, *, linkify: bool = True) -> str:
    """
    Removes every tag from the text and returns the result marked as safe.

    Unless `linkify` is False, text matching url_regex is turned into links
    during the same cleaning pass.
    """
    filters = []
    if linkify:
        filters.append(partial(LinkifyFilter, url_re=url_regex))
    cleaner = bleach.Cleaner(tags=[], strip=True, filters=filters)
    cleaned = cleaner.clean(post_html)
    return mark_safe(cleaned)


def html_to_plaintext(post_html: str) -> str:
    """
    Tries to do the inverse of the linebreaks filter.

    Existing newlines are discarded, each line-break tag becomes a newline,
    each closing paragraph tag becomes a newline, and all remaining markup
    is stripped. Leading and trailing whitespace is removed from the result.

    Line-break tags are recognised in any of the forms <br>, <br/> and
    <br /> (case-insensitively), so content that uses the self-closing
    spelling no longer has its lines glued together.
    """
    # TODO: Handle HTML entities
    # Source newlines carry no meaning in HTML, so drop them first
    flattened = post_html.replace("\n", "")
    # Any spelling of the br tag becomes a single newline
    flattened = re.sub(r"<br\s*/?>", "\n", flattened, flags=re.IGNORECASE)
    # End of paragraph becomes a newline (the original note here says a
    # second one comes from bleach - not verified in this file)
    flattened = flattened.replace("</p>", "\n")
    # Remove all other HTML and return
    cleaner = bleach.Cleaner(tags=[], strip=True, filters=[])
    return cleaner.clean(flattened).strip()


class ContentRenderer:
    """
    Produces presentation-ready HTML for posts and identity fields.

    The `local` flag chosen at construction decides whether generated links
    are relative (True) or absolute (False); emoji shortcodes are only
    converted to images when it is True.
    """

    def __init__(self, local: bool):
        self.local = local

    def render_post(self, html: str, post) -> str:
        """
        Sanitizes post HTML, links its mentions and hashtags, optionally
        swaps in emoji images, and strips newlines. Empty input gives "".
        """
        if not html:
            return ""
        rendered = sanitize_html(html)
        rendered = self.linkify_mentions(rendered, post=post)
        rendered = self.linkify_hashtags(rendered, identity=post.author)
        if self.local:
            rendered = self.imageify_emojis(
                rendered,
                identity=post.author,
                emojis=post.emojis.all(),
            )
        return mark_safe(self.remove_extra_newlines(rendered))

    def render_identity_summary(self, html: str, identity, strip: bool = False) -> str:
        """
        Renders an identity summary: the HTML is stripped (strip=True) or
        sanitized, hashtags are linked, emojis are optionally swapped in,
        and newlines are removed. Empty input gives "".
        """
        if not html:
            return ""
        clean = strip_html if strip else sanitize_html
        rendered = clean(html)
        rendered = self.linkify_hashtags(rendered, identity=identity)
        if self.local:
            rendered = self.imageify_emojis(rendered, identity=identity)
        return mark_safe(self.remove_extra_newlines(rendered))

    def render_identity_data(self, html: str, identity, strip: bool = False) -> str:
        """
        Renders a name/basic identity value: same steps as the summary
        renderer, except that hashtags are not linked. Empty input gives "".
        """
        if not html:
            return ""
        clean = strip_html if strip else sanitize_html
        rendered = clean(html)
        if self.local:
            rendered = self.imageify_emojis(rendered, identity=identity)
        return mark_safe(self.remove_extra_newlines(rendered))

    def linkify_mentions(self, html: str, post) -> str:
        """
        Turns @mentions into links using only the identities attached to
        the post via its mentions relation, so no lookups beyond that
        relation are made here.
        """
        from activities.models import Post

        # Lower-cased "user" and "user@domain" keys, each pointing at the profile URL
        mention_urls = {}
        for identity in post.mentions.all():
            url = (
                str(identity.urls.view)
                if self.local
                else identity.absolute_profile_uri()
            )
            # Might not have fetched it (yet)
            if not identity.username:
                continue
            lowered = identity.username.lower()
            mention_urls[lowered] = url
            mention_urls[f"{lowered}@{identity.domain_id}"] = url

        # Remembers which full handle first claimed each short name
        first_seen: dict[str, str] = {}

        def replacer(match):
            precursor, handle = match.group(1), match.group(2)
            short_handle = handle.split("@", 1)[0] if "@" in handle else handle
            full_key = handle.lower()
            short_key = short_handle.lower()
            if full_key not in mention_urls:
                # Not a mention known to this post; leave the text untouched
                return match.group()
            # A short name already claimed by a different handle is shown in full
            if first_seen.setdefault(short_key, full_key) != full_key:
                short_handle = handle
            return f'{precursor}<a href="{mention_urls[full_key]}">@{short_handle}</a>'

        return Post.mention_regex.sub(replacer, html)

    def linkify_hashtags(self, html, identity) -> str:
        """
        Links hashtags found in the HTML to their tag page, relative when
        rendering locally and on the identity's domain otherwise.
        """
        from activities.models import Hashtag

        def replacer(attrs, new=False):
            # Links whose text does not look like a hashtag are left alone
            if not Hashtag.hashtag_regex.match(attrs.get("_text", "")):
                return attrs
            hashtag = attrs["_text"].strip().lstrip("#")
            attrs[None, "class"] = "hashtag"
            attrs.pop((None, "rel"), None)
            tag_path = f"/tags/{hashtag.lower()}/"
            if self.local:
                attrs[None, "href"] = tag_path
            else:
                attrs[None, "href"] = f"https://{identity.domain.uri_domain}{tag_path}"
            return attrs

        return bleach.linkifier.Linker(
            url_re=Hashtag.hashtag_regex, callbacks=[replacer]
        ).linkify(html)

    def imageify_emojis(
        self, html: str, identity, include_local: bool = True, emojis=None
    ):
        """
        Replaces :emoji: shortcodes in the content with their HTML form.

        Emojis passed in via `emojis` are tried first, then the identity's
        domain. If the domain has no such emoji and include_local is True,
        the local emoji is used as a fallback. Shortcodes that resolve to
        nothing are left as written.
        """
        from activities.models import Emoji

        # If precached emojis were passed, index them by shortcode
        prefetched = {item.shortcode: item for item in emojis} if emojis else {}

        def replacer(match):
            shortcode = match.group(1).lower()
            if shortcode in prefetched:
                return prefetched[shortcode].as_html()

            found = Emoji.get_by_domain(shortcode, identity.domain)
            if found:
                if found.is_usable:
                    return found.as_html()
            elif include_local:
                found = Emoji.get_by_domain(shortcode, None)
                if found:
                    return found.as_html()

            return match.group()

        return Emoji.emoji_regex.sub(replacer, html)

    def remove_extra_newlines(self, html: str) -> str:
        """
        Drops every newline character, since some clients are sensitive to
        them even though the content is HTML.
        """
        # TODO: More intelligent way to strip these?
        return "".join(html.split("\n"))
Move linkifying to all http-prefixed links 2022-12-20 05:10:35 -08:00			`import re`
			`from functools import partial`

Got up to incoming posts working 2022-11-11 21:02:43 -08:00			`import bleach`
Permit Mastodon's weird HTML through 2022-11-13 18:03:43 -08:00			`from bleach.linkifier import LinkifyFilter`
Got up to incoming posts working 2022-11-11 21:02:43 -08:00			`from django.utils.safestring import mark_safe`

Move linkifying to all http-prefixed links 2022-12-20 05:10:35 -08:00			`url_regex = re.compile(`
			`r"""\(* # Match any opening parentheses.`
			`\b(?<![@.])(?:https?://(?:(?:\w+:)?\w+@)?) # http://`
			`([\w-]+\.)+(?:[\w-]+)(?:\:[0-9]+)?(?!\.\w)\b # xx.yy.tld(:##)?`
			(?:[/?][^\s\{{\}}\\|\\\^\[\]`<>"]*)?
			`# /path/zz (excluding "unsafe" chars from RFC 1738,`
			`# except for # and ~, which happen in practice)`
			`""",`
			`re.IGNORECASE \| re.VERBOSE \| re.UNICODE,`
			`)`

Got up to incoming posts working 2022-11-11 21:02:43 -08:00
Permit Mastodon's weird HTML through 2022-11-13 18:03:43 -08:00			`def allow_a(tag: str, name: str, value: str):`
			`if name in ["href", "title", "class"]:`
			`return True`
			`elif name == "rel":`
			`# Only allow rel attributes with a small subset of values`
			`# (we're defending against, for example, rel=me)`
			`rel_values = value.split()`
			`if all(v in ["nofollow", "noopener", "noreferrer", "tag"] for v in rel_values):`
			`return True`
			`return False`


Refactor HTML rendering into one place Also suppress using external <a> tags for now, until we can separate them from hashtags properly. 2022-12-20 03:39:45 -08:00			`def sanitize_html(post_html: str) -> str:`
Got up to incoming posts working 2022-11-11 21:02:43 -08:00			`"""`
			`Only allows a, br, p and span tags, and class attributes.`
			`"""`
Permit Mastodon's weird HTML through 2022-11-13 18:03:43 -08:00			`cleaner = bleach.Cleaner(`
Significantly better hashtag link parsing Fixes #203 2022-12-20 05:55:14 -08:00			`tags=["br", "p", "a"],`
Permit Mastodon's weird HTML through 2022-11-13 18:03:43 -08:00			`attributes={ # type:ignore`
			`"a": allow_a,`
			`"p": ["class"],`
			`},`
Move linkifying to all http-prefixed links 2022-12-20 05:10:35 -08:00			`filters=[partial(LinkifyFilter, url_re=url_regex)],`
Return images and summary in actor 2022-11-17 18:31:00 -08:00			`strip=True,`
Got up to incoming posts working 2022-11-11 21:02:43 -08:00			`)`
Permit Mastodon's weird HTML through 2022-11-13 18:03:43 -08:00			`return mark_safe(cleaner.clean(post_html))`
Outgoing mentions mostly work (exc. cc followers) 2022-11-21 20:18:13 -08:00

Include Identity metadata fields in ActivityPub messages (#295) 2022-12-27 16:42:30 -08:00			`def strip_html(post_html: str, *, linkify: bool = True) -> str:`
Outgoing mentions mostly work (exc. cc followers) 2022-11-21 20:18:13 -08:00			`"""`
			`Strips all tags from the text, then linkifies it.`
			`"""`
Move linkifying to all http-prefixed links 2022-12-20 05:10:35 -08:00			`cleaner = bleach.Cleaner(`
			`tags=[],`
			`strip=True,`
Include Identity metadata fields in ActivityPub messages (#295) 2022-12-27 16:42:30 -08:00			`filters=[partial(LinkifyFilter, url_re=url_regex)] if linkify else [],`
Move linkifying to all http-prefixed links 2022-12-20 05:10:35 -08:00			`)`
Outgoing mentions mostly work (exc. cc followers) 2022-11-21 20:18:13 -08:00			`return mark_safe(cleaner.clean(post_html))`
Some cleanup around editing 2022-11-27 11:09:08 -08:00

			`def html_to_plaintext(post_html: str) -> str:`
			`"""`
			`Tries to do the inverse of the linebreaks filter.`
			`"""`
			`# TODO: Handle HTML entities`
			`# Remove all newlines, then replace br with a newline and /p with two (one comes from bleach)`
			`post_html = post_html.replace("\n", "").replace("<br>", "\n").replace("</p>", "\n")`
			`# Remove all other HTML and return`
			`cleaner = bleach.Cleaner(tags=[], strip=True, filters=[])`
			`return cleaner.clean(post_html).strip()`
Refactor HTML rendering into one place Also suppress using external <a> tags for now, until we can separate them from hashtags properly. 2022-12-20 03:39:45 -08:00

			`class ContentRenderer:`
			`"""`
			`Renders HTML for posts, identity fields, and more.`

			The `local` parameter affects whether links are absolute (False) or relative (True)
			`"""`

			`def __init__(self, local: bool):`
			`self.local = local`

			`def render_post(self, html: str, post) -> str:`
			`"""`
			`Given post HTML, normalises it and renders it for presentation.`
			`"""`
			`if not html:`
			`return ""`
			`html = sanitize_html(html)`
			`html = self.linkify_mentions(html, post=post)`
			`html = self.linkify_hashtags(html, identity=post.author)`
			`if self.local:`
Emoji refactor Emojis are now prefetched from the post, and if not, looked up individually by shortcode, to prevent loading hundreds. 2022-12-22 08:55:31 -08:00			`html = self.imageify_emojis(`
			`html,`
			`identity=post.author,`
			`emojis=post.emojis.all(),`
			`)`
Remove extra newlines in post content Fixes #282 2022-12-27 10:38:18 -08:00			`html = self.remove_extra_newlines(html)`
Refactor HTML rendering into one place Also suppress using external <a> tags for now, until we can separate them from hashtags properly. 2022-12-20 03:39:45 -08:00			`return mark_safe(html)`

Don't linkify hashtags in usernames and metadata Fixes #228 2022-12-21 17:10:25 -08:00			`def render_identity_summary(self, html: str, identity, strip: bool = False) -> str:`
Refactor HTML rendering into one place Also suppress using external <a> tags for now, until we can separate them from hashtags properly. 2022-12-20 03:39:45 -08:00			`"""`
Don't linkify hashtags in usernames and metadata Fixes #228 2022-12-21 17:10:25 -08:00			`Given identity summary HTML, normalises it and renders it for presentation.`
Refactor HTML rendering into one place Also suppress using external <a> tags for now, until we can separate them from hashtags properly. 2022-12-20 03:39:45 -08:00			`"""`
			`if not html:`
			`return ""`
			`if strip:`
			`html = strip_html(html)`
			`else:`
			`html = sanitize_html(html)`
			`html = self.linkify_hashtags(html, identity=identity)`
			`if self.local:`
			`html = self.imageify_emojis(html, identity=identity)`
Remove extra newlines in post content Fixes #282 2022-12-27 10:38:18 -08:00			`html = self.remove_extra_newlines(html)`
Refactor HTML rendering into one place Also suppress using external <a> tags for now, until we can separate them from hashtags properly. 2022-12-20 03:39:45 -08:00			`return mark_safe(html)`

Don't linkify hashtags in usernames and metadata Fixes #228 2022-12-21 17:10:25 -08:00			`def render_identity_data(self, html: str, identity, strip: bool = False) -> str:`
			`"""`
			`Given name/basic value HTML, normalises it and renders it for presentation.`
			`"""`
			`if not html:`
			`return ""`
			`if strip:`
			`html = strip_html(html)`
			`else:`
			`html = sanitize_html(html)`
			`if self.local:`
			`html = self.imageify_emojis(html, identity=identity)`
Remove extra newlines in post content Fixes #282 2022-12-27 10:38:18 -08:00			`html = self.remove_extra_newlines(html)`
Don't linkify hashtags in usernames and metadata Fixes #228 2022-12-21 17:10:25 -08:00			`return mark_safe(html)`

Refactor HTML rendering into one place Also suppress using external <a> tags for now, until we can separate them from hashtags properly. 2022-12-20 03:39:45 -08:00			`def linkify_mentions(self, html: str, post) -> str:`
			`"""`
			`Links mentions _in the context of the post_ - as in, using the mentions`
			`property as the only source (as we might be doing this without other`
			`DB access allowed)`
			`"""`
			`from activities.models import Post`

			`possible_matches = {}`
			`for mention in post.mentions.all():`
			`if self.local:`
			`url = str(mention.urls.view)`
			`else:`
			`url = mention.absolute_profile_uri()`
Guard Post mentions processing from unfetched Identities (#272) 2022-12-25 13:37:31 -08:00			`# Might not have fetched it (yet)`
			`if mention.username:`
			`username = mention.username.lower()`
			`possible_matches[username] = url`
			`possible_matches[f"{username}@{mention.domain_id}"] = url`
Refactor HTML rendering into one place Also suppress using external <a> tags for now, until we can separate them from hashtags properly. 2022-12-20 03:39:45 -08:00
			`collapse_name: dict[str, str] = {}`

			`def replacer(match):`
			`precursor = match.group(1)`
Fixed mention linking with mixed case usernames (#265) 2022-12-24 20:04:25 -08:00			`handle = match.group(2)`
Refactor HTML rendering into one place Also suppress using external <a> tags for now, until we can separate them from hashtags properly. 2022-12-20 03:39:45 -08:00			`if "@" in handle:`
			`short_handle = handle.split("@", 1)[0]`
			`else:`
			`short_handle = handle`
Fixed mention linking with mixed case usernames (#265) 2022-12-24 20:04:25 -08:00			`handle_hash = handle.lower()`
			`short_hash = short_handle.lower()`
			`if handle_hash in possible_matches:`
			`if short_hash not in collapse_name:`
			`collapse_name[short_hash] = handle_hash`
			`elif collapse_name.get(short_hash) != handle_hash:`
Refactor HTML rendering into one place Also suppress using external <a> tags for now, until we can separate them from hashtags properly. 2022-12-20 03:39:45 -08:00			`short_handle = handle`
Fixed mention linking with mixed case usernames (#265) 2022-12-24 20:04:25 -08:00			`return f'{precursor}<a href="{possible_matches[handle_hash]}">@{short_handle}</a>'`
Refactor HTML rendering into one place Also suppress using external <a> tags for now, until we can separate them from hashtags properly. 2022-12-20 03:39:45 -08:00			`else:`
			`return match.group()`

			`return Post.mention_regex.sub(replacer, html)`

			`def linkify_hashtags(self, html, identity) -> str:`
			`from activities.models import Hashtag`

Significantly better hashtag link parsing Fixes #203 2022-12-20 05:55:14 -08:00			`def replacer(attrs, new=False):`
			`# See if the text in this link looks like a hashtag`
			`if not Hashtag.hashtag_regex.match(attrs.get("_text", "")):`
			`return attrs`
			`hashtag = attrs["_text"].strip().lstrip("#")`
			`attrs[None, "class"] = "hashtag"`
			`if (None, "rel") in attrs:`
			`del attrs[None, "rel"]`
Refactor HTML rendering into one place Also suppress using external <a> tags for now, until we can separate them from hashtags properly. 2022-12-20 03:39:45 -08:00			`if self.local:`
Significantly better hashtag link parsing Fixes #203 2022-12-20 05:55:14 -08:00			`attrs[None, "href"] = f"/tags/{hashtag.lower()}/"`
Refactor HTML rendering into one place Also suppress using external <a> tags for now, until we can separate them from hashtags properly. 2022-12-20 03:39:45 -08:00			`else:`
Significantly better hashtag link parsing Fixes #203 2022-12-20 05:55:14 -08:00			`attrs[`
			`None, "href"`
			`] = f"https://{identity.domain.uri_domain}/tags/{hashtag.lower()}/"`
			`return attrs`

			`linker = bleach.linkifier.Linker(`
			`url_re=Hashtag.hashtag_regex, callbacks=[replacer]`
			`)`
			`return linker.linkify(html)`
Refactor HTML rendering into one place Also suppress using external <a> tags for now, until we can separate them from hashtags properly. 2022-12-20 03:39:45 -08:00
Emoji refactor Emojis are now prefetched from the post, and if not, looked up individually by shortcode, to prevent loading hundreds. 2022-12-22 08:55:31 -08:00			`def imageify_emojis(`
			`self, html: str, identity, include_local: bool = True, emojis=None`
			`):`
Refactor HTML rendering into one place Also suppress using external <a> tags for now, until we can separate them from hashtags properly. 2022-12-20 03:39:45 -08:00			`"""`
			`Find :emoji: in content and convert to <img>. If include_local is True,`
			`the local emoji will be used as a fallback for any shortcodes not defined`
			`by emojis.`
			`"""`
			`from activities.models import Emoji`

Emoji refactor Emojis are now prefetched from the post, and if not, looked up individually by shortcode, to prevent loading hundreds. 2022-12-22 08:55:31 -08:00			`# If precached emojis were passed, prep them`
			`cached_emojis = {}`
			`if emojis:`
			`for emoji in emojis:`
			`cached_emojis[emoji.shortcode] = emoji`
Refactor HTML rendering into one place Also suppress using external <a> tags for now, until we can separate them from hashtags properly. 2022-12-20 03:39:45 -08:00
			`def replacer(match):`
Emoji refactor Emojis are now prefetched from the post, and if not, looked up individually by shortcode, to prevent loading hundreds. 2022-12-22 08:55:31 -08:00			`shortcode = match.group(1).lower()`
			`if shortcode in cached_emojis:`
			`return cached_emojis[shortcode].as_html()`
Modify emoji loader for cache-optimized return value (#371) Also fixes an apparent bug in `imageify_emojis.replacer` where `include_local` was not being used correctly (previous code path never returned anything. 2023-01-07 14:19:47 -08:00
			`emoji = Emoji.get_by_domain(shortcode, identity.domain)`
			`if emoji and emoji.is_usable:`
			`return emoji.as_html()`
			`elif not emoji and include_local:`
			`emoji = Emoji.get_by_domain(shortcode, None)`
			`if emoji:`
Emoji refactor Emojis are now prefetched from the post, and if not, looked up individually by shortcode, to prevent loading hundreds. 2022-12-22 08:55:31 -08:00			`return emoji.as_html()`
Modify emoji loader for cache-optimized return value (#371) Also fixes an apparent bug in `imageify_emojis.replacer` where `include_local` was not being used correctly (previous code path never returned anything. 2023-01-07 14:19:47 -08:00
Refactor HTML rendering into one place Also suppress using external <a> tags for now, until we can separate them from hashtags properly. 2022-12-20 03:39:45 -08:00			`return match.group()`

			`return Emoji.emoji_regex.sub(replacer, html)`
Remove extra newlines in post content Fixes #282 2022-12-27 10:38:18 -08:00
			`def remove_extra_newlines(self, html: str) -> str:`
			`"""`
			`Some clients are sensitive to extra newlines even though it's HTML`
			`"""`
			`# TODO: More intelligent way to strip these?`
			`return html.replace("\n", "")`