takahe/core/html.py

import html
import re
from html.parser import HTMLParser

from django.utils.safestring import mark_safe


class FediverseHtmlParser(HTMLParser):
    """
    A custom HTML parser that only allows a certain tag subset and behaviour:
    - br, p tags are passed through
    - a tags are passed through if they're not hashtags or mentions
    - Another set of tags are converted to p

    It also linkifies URLs, mentions, hashtags, and imagifies emoji.
    """

    REWRITE_TO_P = [
        "p",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "blockquote",
        "pre",
        "ul",
        "ol",
    ]

    REWRITE_TO_BR = [
        "br",
        "li",
    ]

    MENTION_REGEX = re.compile(
        r"(^|[^\w\d\-_/])@([\w\d\-_]+(?:@[\w\d\-_\.]+[\w\d\-_]+)?)"
    )

    HASHTAG_REGEX = re.compile(r"\B#([a-zA-Z0-9(_)]+\b)(?!;)")

    EMOJI_REGEX = re.compile(r"\B:([a-zA-Z0-9(_)-]+):\B")

    URL_REGEX = re.compile(
        r"""(\(*  # Match any opening parentheses.
        \b(?<![@.])(?:https?://(?:(?:\w+:)?\w+@)?)  # http://
        (?:[\w-]+\.)+(?:[\w-]+)(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
        (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?)
        # /path/zz (excluding "unsafe" chars from RFC 1738,
        # except for # and ~, which happen in practice)
        """,
        re.IGNORECASE | re.VERBOSE | re.UNICODE,
    )

    def __init__(
        self,
        html: str,
        uri_domain: str | None = None,
        mentions: list | None = None,
        find_mentions: bool = False,
        find_hashtags: bool = False,
        find_emojis: bool = False,
        emoji_domain=None,
    ):
        super().__init__()
        self.uri_domain = uri_domain
        self.emoji_domain = emoji_domain
        self.find_mentions = find_mentions
        self.find_hashtags = find_hashtags
        self.find_emojis = find_emojis
        self.calculate_mentions(mentions)
        self._data_buffer = ""
        self.html_output = ""
        self.text_output = ""
        self.emojis: set[str] = set()
        self.mentions: set[str] = set()
        self.hashtags: set[str] = set()
        self._pending_a: dict | None = None
        self._fresh_p = False
        self.feed(html.replace("\n", ""))
        self.flush_data()

    def calculate_mentions(self, mentions: list | None):
        """
        Prepares a set of content that we expect to see mentions look like
        (this imp)
        """
        self.mention_matches: dict[str, str] = {}
        self.mention_aliases: dict[str, str] = {}
        for mention in mentions or []:
            if self.uri_domain:
                url = mention.absolute_profile_uri()
            else:
                url = str(mention.urls.view)
            if mention.username:
                username = mention.username.lower()
                domain = mention.domain_id.lower()
                self.mention_matches[f"{username}"] = url
                self.mention_matches[f"{username}@{domain}"] = url

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        if tag in self.REWRITE_TO_P:
            self.flush_data()
            self.html_output += "<p>"
        elif tag in self.REWRITE_TO_BR:
            self.flush_data()
            if not self._fresh_p:
                self.html_output += "<br>"
                self.text_output += "\n"
        elif tag == "a":
            self.flush_data()
            self._pending_a = {"attrs": dict(attrs), "content": ""}
        self._fresh_p = tag in self.REWRITE_TO_P

    def handle_endtag(self, tag: str) -> None:
        self._fresh_p = False
        if tag in self.REWRITE_TO_P:
            self.flush_data()
            self.html_output += "</p>"
            self.text_output += "\n\n"
        elif tag == "a":
            if self._pending_a:
                href = self._pending_a["attrs"].get("href")
                content = self._pending_a["content"].strip()
                # Is it a mention?
                if content.lower().lstrip("@") in self.mention_matches:
                    self.html_output += self.create_mention(content)
                    self.text_output += content
                # Is it a hashtag?
                elif self.HASHTAG_REGEX.match(content):
                    self.html_output += self.create_hashtag(content)
                    self.text_output += content
                elif content:
                    # Shorten the link if we need to
                    self.html_output += self.create_link(href, content)
                    self.text_output += href
                self._pending_a = None

    def handle_data(self, data: str) -> None:
        self._fresh_p = False
        if self._pending_a:
            self._pending_a["content"] += data
        else:
            self._data_buffer += data

    def flush_data(self) -> None:
        """
        We collect data segments until we encounter a tag we care about,
        so we can treat <span>#</span>hashtag as #hashtag
        """
        self.text_output += self._data_buffer
        self.html_output += self.linkify(self._data_buffer)
        self._data_buffer = ""

    def create_link(self, href, content):
        """
        Generates a link, doing optional shortening.

        All return values from this function should be HTML-safe.
        """
        looks_like_link = bool(self.URL_REGEX.match(content))
        if looks_like_link:
            content = content.split("://", 1)[1]
        if looks_like_link and len(content) > 30:
            return f'<a href="{html.escape(href)}" rel="nofollow" class="ellipsis" title="{html.escape(content)}">{html.escape(content[:30])}</a>'
        else:
            return f'<a href="{html.escape(href)}" rel="nofollow">{html.escape(content)}</a>'

    def create_mention(self, handle) -> str:
        """
        Generates a mention link. Handle should have a leading @.

        All return values from this function should be HTML-safe
        """
        handle = handle.lstrip("@")
        if "@" in handle:
            short_handle = handle.split("@", 1)[0]
        else:
            short_handle = handle
        handle_hash = handle.lower()
        short_hash = short_handle.lower()
        self.mentions.add(handle_hash)
        url = self.mention_matches.get(handle_hash)
        if url:
            if short_hash not in self.mention_aliases:
                self.mention_aliases[short_hash] = handle_hash
            elif self.mention_aliases.get(short_hash) != handle_hash:
                short_handle = handle
            return f'<a href="{html.escape(url)}">@{html.escape(short_handle)}</a>'
        else:
            return "@" + html.escape(handle)

    def create_hashtag(self, hashtag) -> str:
        """
        Generates a hashtag link. Hashtag does not need to start with #

        All return values from this function should be HTML-safe
        """
        hashtag = hashtag.lstrip("#")
        self.hashtags.add(hashtag.lower())
        if self.uri_domain:
            return f'<a href="https://{self.uri_domain}/tags/{hashtag.lower()}/">#{hashtag}</a>'
        else:
            return f'<a href="/tags/{hashtag.lower()}/">#{hashtag}</a>'

    def create_emoji(self, shortcode) -> str:
        """
        Generates an emoji <img> tag

        All return values from this function should be HTML-safe
        """
        from activities.models import Emoji

        emoji = Emoji.get_by_domain(shortcode, self.emoji_domain)
        if emoji and emoji.is_usable:
            self.emojis.add(shortcode)
            return emoji.as_html()
        return f":{shortcode}:"

    def linkify(self, data):
        """
        Linkifies some content that is plaintext.

        Handles URLs first, then mentions. Note that this takes great care to
        keep track of what is HTML and what needs to be escaped.
        """
        # Split the string by the URL regex so we know what to escape and what
        # not to escape.
        bits = self.URL_REGEX.split(data)
        result = ""
        # Even indices are data we should pass though, odd indices are links
        for i, bit in enumerate(bits):
            # A link!
            if i % 2 == 1:
                result += self.create_link(bit, bit)
            # Not a link
            elif self.mention_matches or self.find_mentions:
                result += self.linkify_mentions(bit)
            elif self.find_hashtags:
                result += self.linkify_hashtags(bit)
            elif self.find_emojis:
                result += self.linkify_emoji(bit)
            else:
                result += html.escape(bit)
        return result

    def linkify_mentions(self, data):
        """
        Linkifies mentions
        """
        bits = self.MENTION_REGEX.split(data)
        result = ""
        for i, bit in enumerate(bits):
            # Mention content
            if i % 3 == 2:
                result += self.create_mention(bit)
            # Not part of a mention (0) or mention preamble (1)
            elif self.find_hashtags:
                result += self.linkify_hashtags(bit)
            elif self.find_emojis:
                result += self.linkify_emoji(bit)
            else:
                result += html.escape(bit)
        return result

    def linkify_hashtags(self, data):
        """
        Linkifies hashtags
        """
        bits = self.HASHTAG_REGEX.split(data)
        result = ""
        for i, bit in enumerate(bits):
            # Not part of a hashtag
            if i % 2 == 0:
                if self.find_emojis:
                    result += self.linkify_emoji(bit)
                else:
                    result += html.escape(bit)
            # Hashtag content
            else:
                result += self.create_hashtag(bit)
        return result

    def linkify_emoji(self, data):
        """
        Linkifies emoji
        """
        bits = self.EMOJI_REGEX.split(data)
        result = ""
        for i, bit in enumerate(bits):
            # Not part of an emoji
            if i % 2 == 0:
                result += html.escape(bit)
            # Emoji content
            else:
                result += self.create_emoji(bit)
        return result

    @property
    def html(self):
        return self.html_output.strip()

    @property
    def plain_text(self):
        return self.text_output.strip()


class ContentRenderer:
    """
    Renders HTML for posts, identity fields, and more.

    The `local` parameter affects whether links are absolute (False) or relative (True)
    """

    def __init__(self, local: bool):
        self.local = local

    def render_post(self, html: str, post) -> str:
        """
        Given post HTML, normalises it and renders it for presentation.
        """
        if not html:
            return ""
        parser = FediverseHtmlParser(
            html,
            mentions=post.mentions.all(),
            uri_domain=(None if self.local else post.author.domain.uri_domain),
            find_hashtags=True,
            find_emojis=self.local,
            emoji_domain=post.author.domain,
        )
        return mark_safe(parser.html)

    def render_identity_summary(self, html: str, identity) -> str:
        """
        Given identity summary HTML, normalises it and renders it for presentation.
        """
        if not html:
            return ""
        parser = FediverseHtmlParser(
            html,
            uri_domain=(None if self.local else identity.domain.uri_domain),
            find_hashtags=True,
            find_emojis=self.local,
            emoji_domain=identity.domain,
        )
        return mark_safe(parser.html)

    def render_identity_data(self, html: str, identity, strip: bool = False) -> str:
        """
        Given name/basic value HTML, normalises it and renders it for presentation.
        """
        if not html:
            return ""
        parser = FediverseHtmlParser(
            html,
            uri_domain=(None if self.local else identity.domain.uri_domain),
            find_hashtags=False,
            find_emojis=self.local,
            emoji_domain=identity.domain,
        )
        if strip:
            return mark_safe(parser.html)
        else:
            return mark_safe(parser.html)
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`import html`
Move linkifying to all http-prefixed links 2022-12-20 05:10:35 -08:00			`import re`
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`from html.parser import HTMLParser`
Move linkifying to all http-prefixed links 2022-12-20 05:10:35 -08:00
Got up to incoming posts working 2022-11-11 21:02:43 -08:00			`from django.utils.safestring import mark_safe`

Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00
			`class FediverseHtmlParser(HTMLParser):`
			`"""`
			`A custom HTML parser that only allows a certain tag subset and behaviour:`
			`- br, p tags are passed through`
			`- a tags are passed through if they're not hashtags or mentions`
			`- Another set of tags are converted to p`

			`It also linkifies URLs, mentions, hashtags, and imagifies emoji.`
			`"""`

			`REWRITE_TO_P = [`
			`"p",`
			`"h1",`
			`"h2",`
			`"h3",`
			`"h4",`
			`"h5",`
			`"h6",`
			`"blockquote",`
			`"pre",`
			`"ul",`
			`"ol",`
			`]`

			`REWRITE_TO_BR = [`
			`"br",`
			`"li",`
			`]`

			`MENTION_REGEX = re.compile(`
			`r"(^\|[^\w\d\-_/])@([\w\d\-_]+(?:@[\w\d\-_\.]+[\w\d\-_]+)?)"`
			`)`

			`HASHTAG_REGEX = re.compile(r"\B#([a-zA-Z0-9(_)]+\b)(?!;)")`

			`EMOJI_REGEX = re.compile(r"\B:([a-zA-Z0-9(_)-]+):\B")`

			`URL_REGEX = re.compile(`
			`r"""(\(* # Match any opening parentheses.`
			`\b(?<![@.])(?:https?://(?:(?:\w+:)?\w+@)?) # http://`
			`(?:[\w-]+\.)+(?:[\w-]+)(?:\:[0-9]+)?(?!\.\w)\b # xx.yy.tld(:##)?`
			(?:[/?][^\s\{{\}}\\|\\\^\[\]`<>"]*)?)
Move linkifying to all http-prefixed links 2022-12-20 05:10:35 -08:00			`# /path/zz (excluding "unsafe" chars from RFC 1738,`
			`# except for # and ~, which happen in practice)`
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`""",`
			`re.IGNORECASE \| re.VERBOSE \| re.UNICODE,`
			`)`
Match sanitizing for posts to Mastodon (#422) Creates filter for REWRITTEN_TAGS that converts them to `p` rather than ripping them out entirely, and formats `ul` as break-separated list Both changes align sanitization to Mastodon's "strict" sanitizer at https://github.com/mastodon/mastodon/blob/main/lib/sanitize_ext/sanitize_config.rb#L73 I don't love the complexity of the Filter, but Bleach doesn't give us great options to work with. The code operates within an iterator without the useful "sibling" methods that Ruby's equivalent has. Also, Bleach runs filters _after_ sanitizing (unlike Ruby's which runs before) so we have to pass all the elements through the sanitizer, then rewrite them after the fact. 2023-01-15 21:32:04 -08:00
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`def __init__(`
			`self,`
			`html: str,`
			`uri_domain: str \| None = None,`
			`mentions: list \| None = None,`
			`find_mentions: bool = False,`
			`find_hashtags: bool = False,`
			`find_emojis: bool = False,`
			`emoji_domain=None,`
			`):`
			`super().__init__()`
			`self.uri_domain = uri_domain`
			`self.emoji_domain = emoji_domain`
			`self.find_mentions = find_mentions`
			`self.find_hashtags = find_hashtags`
			`self.find_emojis = find_emojis`
			`self.calculate_mentions(mentions)`
			`self._data_buffer = ""`
			`self.html_output = ""`
			`self.text_output = ""`
			`self.emojis: set[str] = set()`
			`self.mentions: set[str] = set()`
			`self.hashtags: set[str] = set()`
			`self._pending_a: dict \| None = None`
			`self._fresh_p = False`
			`self.feed(html.replace("\n", ""))`
			`self.flush_data()`

			`def calculate_mentions(self, mentions: list \| None):`
			`"""`
			`Prepares a set of content that we expect to see mentions look like`
			`(this imp)`
			`"""`
			`self.mention_matches: dict[str, str] = {}`
			`self.mention_aliases: dict[str, str] = {}`
			`for mention in mentions or []:`
			`if self.uri_domain:`
			`url = mention.absolute_profile_uri()`
			`else:`
			`url = str(mention.urls.view)`
			`if mention.username:`
			`username = mention.username.lower()`
			`domain = mention.domain_id.lower()`
			`self.mention_matches[f"{username}"] = url`
			`self.mention_matches[f"{username}@{domain}"] = url`

			`def handle_starttag(self, tag: str, attrs: list[tuple[str, str \| None]]) -> None:`
			`if tag in self.REWRITE_TO_P:`
			`self.flush_data()`
			`self.html_output += "<p>"`
			`elif tag in self.REWRITE_TO_BR:`
			`self.flush_data()`
			`if not self._fresh_p:`
			`self.html_output += "<br>"`
			`self.text_output += "\n"`
			`elif tag == "a":`
			`self.flush_data()`
			`self._pending_a = {"attrs": dict(attrs), "content": ""}`
			`self._fresh_p = tag in self.REWRITE_TO_P`

			`def handle_endtag(self, tag: str) -> None:`
			`self._fresh_p = False`
			`if tag in self.REWRITE_TO_P:`
			`self.flush_data()`
			`self.html_output += "</p>"`
			`self.text_output += "\n\n"`
			`elif tag == "a":`
			`if self._pending_a:`
			`href = self._pending_a["attrs"].get("href")`
			`content = self._pending_a["content"].strip()`
			`# Is it a mention?`
			`if content.lower().lstrip("@") in self.mention_matches:`
			`self.html_output += self.create_mention(content)`
			`self.text_output += content`
			`# Is it a hashtag?`
			`elif self.HASHTAG_REGEX.match(content):`
			`self.html_output += self.create_hashtag(content)`
			`self.text_output += content`
			`elif content:`
			`# Shorten the link if we need to`
			`self.html_output += self.create_link(href, content)`
			`self.text_output += href`
			`self._pending_a = None`

			`def handle_data(self, data: str) -> None:`
			`self._fresh_p = False`
			`if self._pending_a:`
			`self._pending_a["content"] += data`
			`else:`
			`self._data_buffer += data`
Match sanitizing for posts to Mastodon (#422) Creates filter for REWRITTEN_TAGS that converts them to `p` rather than ripping them out entirely, and formats `ul` as break-separated list Both changes align sanitization to Mastodon's "strict" sanitizer at https://github.com/mastodon/mastodon/blob/main/lib/sanitize_ext/sanitize_config.rb#L73 I don't love the complexity of the Filter, but Bleach doesn't give us great options to work with. The code operates within an iterator without the useful "sibling" methods that Ruby's equivalent has. Also, Bleach runs filters _after_ sanitizing (unlike Ruby's which runs before) so we have to pass all the elements through the sanitizer, then rewrite them after the fact. 2023-01-15 21:32:04 -08:00
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`def flush_data(self) -> None:`
			`"""`
			`We collect data segments until we encounter a tag we care about,`
			`so we can treat <span>#</span>hashtag as #hashtag`
			`"""`
			`self.text_output += self._data_buffer`
			`self.html_output += self.linkify(self._data_buffer)`
			`self._data_buffer = ""`
Match sanitizing for posts to Mastodon (#422) Creates filter for REWRITTEN_TAGS that converts them to `p` rather than ripping them out entirely, and formats `ul` as break-separated list Both changes align sanitization to Mastodon's "strict" sanitizer at https://github.com/mastodon/mastodon/blob/main/lib/sanitize_ext/sanitize_config.rb#L73 I don't love the complexity of the Filter, but Bleach doesn't give us great options to work with. The code operates within an iterator without the useful "sibling" methods that Ruby's equivalent has. Also, Bleach runs filters _after_ sanitizing (unlike Ruby's which runs before) so we have to pass all the elements through the sanitizer, then rewrite them after the fact. 2023-01-15 21:32:04 -08:00
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`def create_link(self, href, content):`
			`"""`
			`Generates a link, doing optional shortening.`
Fixes #431 - Preserve href when stripping <a> tags (#436) 2023-01-17 22:41:33 -08:00
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`All return values from this function should be HTML-safe.`
			`"""`
			`looks_like_link = bool(self.URL_REGEX.match(content))`
			`if looks_like_link:`
			`content = content.split("://", 1)[1]`
			`if looks_like_link and len(content) > 30:`
			`return f'<a href="{html.escape(href)}" rel="nofollow" class="ellipsis" title="{html.escape(content)}">{html.escape(content[:30])}</a>'`
			`else:`
			`return f'<a href="{html.escape(href)}" rel="nofollow">{html.escape(content)}</a>'`
Fixes #431 - Preserve href when stripping <a> tags (#436) 2023-01-17 22:41:33 -08:00
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`def create_mention(self, handle) -> str:`
			`"""`
			`Generates a mention link. Handle should have a leading @.`
Apply Mastodon style link text shortening (#426) 2023-01-16 10:59:46 -08:00
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`All return values from this function should be HTML-safe`
			`"""`
			`handle = handle.lstrip("@")`
			`if "@" in handle:`
			`short_handle = handle.split("@", 1)[0]`
			`else:`
			`short_handle = handle`
			`handle_hash = handle.lower()`
			`short_hash = short_handle.lower()`
			`self.mentions.add(handle_hash)`
			`url = self.mention_matches.get(handle_hash)`
			`if url:`
			`if short_hash not in self.mention_aliases:`
			`self.mention_aliases[short_hash] = handle_hash`
			`elif self.mention_aliases.get(short_hash) != handle_hash:`
			`short_handle = handle`
			`return f'<a href="{html.escape(url)}">@{html.escape(short_handle)}</a>'`
			`else:`
			`return "@" + html.escape(handle)`
Apply Mastodon style link text shortening (#426) 2023-01-16 10:59:46 -08:00
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`def create_hashtag(self, hashtag) -> str:`
			`"""`
			`Generates a hashtag link. Hashtag does not need to start with #`
Apply Mastodon style link text shortening (#426) 2023-01-16 10:59:46 -08:00
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`All return values from this function should be HTML-safe`
			`"""`
			`hashtag = hashtag.lstrip("#")`
			`self.hashtags.add(hashtag.lower())`
			`if self.uri_domain:`
			`return f'<a href="https://{self.uri_domain}/tags/{hashtag.lower()}/">#{hashtag}</a>'`
			`else:`
			`return f'<a href="/tags/{hashtag.lower()}/">#{hashtag}</a>'`
Apply Mastodon style link text shortening (#426) 2023-01-16 10:59:46 -08:00
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`def create_emoji(self, shortcode) -> str:`
			`"""`
			`Generates an emoji <img> tag`
Apply Mastodon style link text shortening (#426) 2023-01-16 10:59:46 -08:00
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`All return values from this function should be HTML-safe`
			`"""`
			`from activities.models import Emoji`
Apply Mastodon style link text shortening (#426) 2023-01-16 10:59:46 -08:00
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`emoji = Emoji.get_by_domain(shortcode, self.emoji_domain)`
			`if emoji and emoji.is_usable:`
			`self.emojis.add(shortcode)`
			`return emoji.as_html()`
			`return f":{shortcode}:"`
Apply Mastodon style link text shortening (#426) 2023-01-16 10:59:46 -08:00
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`def linkify(self, data):`
			`"""`
			`Linkifies some content that is plaintext.`
Apply Mastodon style link text shortening (#426) 2023-01-16 10:59:46 -08:00
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`Handles URLs first, then mentions. Note that this takes great care to`
			`keep track of what is HTML and what needs to be escaped.`
			`"""`
			`# Split the string by the URL regex so we know what to escape and what`
			`# not to escape.`
			`bits = self.URL_REGEX.split(data)`
			`result = ""`
			`# Even indices are data we should pass though, odd indices are links`
			`for i, bit in enumerate(bits):`
			`# A link!`
			`if i % 2 == 1:`
			`result += self.create_link(bit, bit)`
			`# Not a link`
			`elif self.mention_matches or self.find_mentions:`
			`result += self.linkify_mentions(bit)`
			`elif self.find_hashtags:`
			`result += self.linkify_hashtags(bit)`
			`elif self.find_emojis:`
			`result += self.linkify_emoji(bit)`
			`else:`
			`result += html.escape(bit)`
			`return result`

			`def linkify_mentions(self, data):`
			`"""`
			`Linkifies mentions`
			`"""`
			`bits = self.MENTION_REGEX.split(data)`
			`result = ""`
			`for i, bit in enumerate(bits):`
			`# Mention content`
			`if i % 3 == 2:`
			`result += self.create_mention(bit)`
			`# Not part of a mention (0) or mention preamble (1)`
			`elif self.find_hashtags:`
			`result += self.linkify_hashtags(bit)`
			`elif self.find_emojis:`
			`result += self.linkify_emoji(bit)`
			`else:`
			`result += html.escape(bit)`
			`return result`
Outgoing mentions mostly work (exc. cc followers) 2022-11-21 20:18:13 -08:00
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`def linkify_hashtags(self, data):`
			`"""`
			`Linkifies hashtags`
			`"""`
			`bits = self.HASHTAG_REGEX.split(data)`
			`result = ""`
			`for i, bit in enumerate(bits):`
			`# Not part of a hashtag`
			`if i % 2 == 0:`
			`if self.find_emojis:`
			`result += self.linkify_emoji(bit)`
			`else:`
			`result += html.escape(bit)`
			`# Hashtag content`
			`else:`
			`result += self.create_hashtag(bit)`
			`return result`
Outgoing mentions mostly work (exc. cc followers) 2022-11-21 20:18:13 -08:00
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`def linkify_emoji(self, data):`
			`"""`
			`Linkifies emoji`
			`"""`
			`bits = self.EMOJI_REGEX.split(data)`
			`result = ""`
			`for i, bit in enumerate(bits):`
			`# Not part of an emoji`
			`if i % 2 == 0:`
			`result += html.escape(bit)`
			`# Emoji content`
			`else:`
			`result += self.create_emoji(bit)`
			`return result`
Some cleanup around editing 2022-11-27 11:09:08 -08:00
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`@property`
			`def html(self):`
			`return self.html_output.strip()`
Some cleanup around editing 2022-11-27 11:09:08 -08:00
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`@property`
			`def plain_text(self):`
			`return self.text_output.strip()`
Refactor HTML rendering into one place Also suppress using external <a> tags for now, until we can separate them from hashtags properly. 2022-12-20 03:39:45 -08:00

			`class ContentRenderer:`
			`"""`
			`Renders HTML for posts, identity fields, and more.`

			The `local` parameter affects whether links are absolute (False) or relative (True)
			`"""`

			`def __init__(self, local: bool):`
			`self.local = local`

			`def render_post(self, html: str, post) -> str:`
			`"""`
			`Given post HTML, normalises it and renders it for presentation.`
			`"""`
			`if not html:`
			`return ""`
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`parser = FediverseHtmlParser(`
			`html,`
			`mentions=post.mentions.all(),`
			`uri_domain=(None if self.local else post.author.domain.uri_domain),`
			`find_hashtags=True,`
Don't try to render emojis for remote sends 2023-01-31 09:04:57 -08:00			`find_emojis=self.local,`
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`emoji_domain=post.author.domain,`
			`)`
			`return mark_safe(parser.html)`

			`def render_identity_summary(self, html: str, identity) -> str:`
Refactor HTML rendering into one place Also suppress using external <a> tags for now, until we can separate them from hashtags properly. 2022-12-20 03:39:45 -08:00			`"""`
Don't linkify hashtags in usernames and metadata Fixes #228 2022-12-21 17:10:25 -08:00			`Given identity summary HTML, normalises it and renders it for presentation.`
Refactor HTML rendering into one place Also suppress using external <a> tags for now, until we can separate them from hashtags properly. 2022-12-20 03:39:45 -08:00			`"""`
			`if not html:`
			`return ""`
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`parser = FediverseHtmlParser(`
			`html,`
			`uri_domain=(None if self.local else identity.domain.uri_domain),`
			`find_hashtags=True,`
Don't try to render emojis for remote sends 2023-01-31 09:04:57 -08:00			`find_emojis=self.local,`
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`emoji_domain=identity.domain,`
			`)`
			`return mark_safe(parser.html)`
Refactor HTML rendering into one place Also suppress using external <a> tags for now, until we can separate them from hashtags properly. 2022-12-20 03:39:45 -08:00
Don't linkify hashtags in usernames and metadata Fixes #228 2022-12-21 17:10:25 -08:00			`def render_identity_data(self, html: str, identity, strip: bool = False) -> str:`
			`"""`
			`Given name/basic value HTML, normalises it and renders it for presentation.`
			`"""`
			`if not html:`
			`return ""`
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`parser = FediverseHtmlParser(`
			`html,`
			`uri_domain=(None if self.local else identity.domain.uri_domain),`
			`find_hashtags=False,`
Don't try to render emojis for remote sends 2023-01-31 09:04:57 -08:00			`find_emojis=self.local,`
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`emoji_domain=identity.domain,`
			`)`
Don't linkify hashtags in usernames and metadata Fixes #228 2022-12-21 17:10:25 -08:00			`if strip:`
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`return mark_safe(parser.html)`
Don't linkify hashtags in usernames and metadata Fixes #228 2022-12-21 17:10:25 -08:00			`else:`
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`return mark_safe(parser.html)`