2022-12-20 05:10:35 -08:00
|
|
|
import re
|
|
|
|
from functools import partial
|
|
|
|
|
2022-11-11 21:02:43 -08:00
|
|
|
import bleach
|
2022-11-13 18:03:43 -08:00
|
|
|
from bleach.linkifier import LinkifyFilter
|
2022-11-11 21:02:43 -08:00
|
|
|
from django.utils.safestring import mark_safe
|
|
|
|
|
2022-12-20 05:10:35 -08:00
|
|
|
# Pattern handed to bleach's linkifier to find URLs in text. A match needs an
# explicit http:// or https:// scheme (optionally followed by user[:password]@),
# then a dotted host name with an optional :port, then an optional path or
# query string. The lookbehind stops a match from starting right after "@" or ".".
url_regex = re.compile(
    r"""\(* # Match any opening parentheses.
    \b(?<![@.])(?:https?://(?:(?:\w+:)?\w+@)?) # http://
    ([\w-]+\.)+(?:[\w-]+)(?:\:[0-9]+)?(?!\.\w)\b # xx.yy.tld(:##)?
    (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
    # /path/zz (excluding "unsafe" chars from RFC 1738,
    # except for # and ~, which happen in practice)
    """,
    flags=re.UNICODE | re.VERBOSE | re.IGNORECASE,
)
|
|
|
|
|
2022-11-11 21:02:43 -08:00
|
|
|
|
2022-11-13 18:03:43 -08:00
|
|
|
def allow_a(tag: str, name: str, value: str):
    """
    Attribute filter for ``a`` tags (see its use in ``sanitize_html``).

    Returns True when the attribute called ``name`` with the given ``value``
    may be kept, False otherwise. ``tag`` is accepted but not inspected.
    """
    if name == "rel":
        # Only allow rel attributes with a small subset of values
        # (we're defending against, for example, rel=me)
        permitted = ("nofollow", "noopener", "noreferrer", "tag")
        for token in value.split():
            if token not in permitted:
                return False
        # Note: an empty rel value has no tokens to reject, so it is allowed
        return True
    return name in ("href", "title", "class")
|
|
|
|
|
|
|
|
|
2022-12-20 03:39:45 -08:00
|
|
|
def sanitize_html(post_html: str) -> str:
    """
    Cleans the given HTML so that only br, p and a tags remain.

    Attributes on a tags are decided by allow_a; p tags may keep their class
    attribute. Disallowed tags are stripped rather than escaped, and URLs
    matching url_regex are linkified. The result is marked safe.
    """
    linkify_filter = partial(LinkifyFilter, url_re=url_regex)
    allowed_attributes = {
        "a": allow_a,
        "p": ["class"],
    }
    cleaner = bleach.Cleaner(
        tags=["br", "p", "a"],
        attributes=allowed_attributes,  # type:ignore
        filters=[linkify_filter],
        strip=True,
    )
    cleaned = cleaner.clean(post_html)
    return mark_safe(cleaned)
|
2022-11-21 20:18:13 -08:00
|
|
|
|
|
|
|
|
|
|
|
def strip_html(post_html: str) -> str:
    """
    Removes every tag from the given HTML and linkifies URLs that match
    url_regex. The result is marked safe.
    """
    linkify_filter = partial(LinkifyFilter, url_re=url_regex)
    # An empty allow-list combined with strip=True drops all markup
    cleaner = bleach.Cleaner(tags=[], strip=True, filters=[linkify_filter])
    stripped = cleaner.clean(post_html)
    return mark_safe(stripped)
|
2022-11-27 11:09:08 -08:00
|
|
|
|
|
|
|
|
|
|
|
def html_to_plaintext(post_html: str) -> str:
    """
    Tries to do the inverse of the linebreaks filter.

    Literal newlines are dropped, every spelling of the br tag (``<br>``,
    ``<br/>``, ``<br />``, any letter case) and every closing p tag becomes a
    newline, all remaining markup is removed, and the result is stripped of
    leading/trailing whitespace.
    """
    # TODO: Handle HTML entities
    # Remove all newlines first so only the tags below decide line breaks
    post_html = post_html.replace("\n", "")
    # Self-closing and upper-case forms must also produce a newline; otherwise
    # the cleaner below would strip them and join the surrounding lines
    post_html = re.sub(r"<br\s*/?>", "\n", post_html, flags=re.IGNORECASE)
    post_html = re.sub(r"</p\s*>", "\n", post_html, flags=re.IGNORECASE)
    # Remove all other HTML and return
    cleaner = bleach.Cleaner(tags=[], strip=True, filters=[])
    return cleaner.clean(post_html).strip()
|
2022-12-20 03:39:45 -08:00
|
|
|
|
|
|
|
|
|
|
|
class ContentRenderer:
    """
    Renders HTML for posts, identity fields, and more.

    The `local` parameter affects whether links are absolute (False) or relative (True)
    """

    def __init__(self, local: bool):
        self.local = local

    def render_post(self, html: str, post) -> str:
        """
        Given post HTML, normalises it and renders it for presentation.

        Empty input yields an empty string. Otherwise the HTML is sanitised,
        mentions and hashtags are linked and, for local rendering only, emoji
        shortcodes are replaced; the result is marked safe.
        """
        if not html:
            return ""
        html = sanitize_html(html)
        html = self.linkify_mentions(html, post=post)
        html = self.linkify_hashtags(html, identity=post.author)
        if self.local:
            html = self.imageify_emojis(html, identity=post.author)
        return mark_safe(html)

    def render_identity_summary(self, html: str, identity, strip: bool = False) -> str:
        """
        Given identity summary HTML, normalises it and renders it for presentation.

        With `strip` set, all tags are removed instead of being sanitised.
        Hashtags are linked and, for local rendering only, emoji shortcodes
        are replaced.
        """
        if not html:
            return ""
        if strip:
            html = strip_html(html)
        else:
            html = sanitize_html(html)
        html = self.linkify_hashtags(html, identity=identity)
        if self.local:
            html = self.imageify_emojis(html, identity=identity)
        return mark_safe(html)

    def render_identity_data(self, html: str, identity, strip: bool = False) -> str:
        """
        Given name/basic value HTML, normalises it and renders it for presentation.

        With `strip` set, all tags are removed instead of being sanitised.
        Unlike the summary renderer, hashtags are not linked here.
        """
        if not html:
            return ""
        if strip:
            html = strip_html(html)
        else:
            html = sanitize_html(html)
        if self.local:
            html = self.imageify_emojis(html, identity=identity)
        return mark_safe(html)

    def linkify_mentions(self, html: str, post) -> str:
        """
        Links mentions _in the context of the post_ - as in, using the mentions
        property as the only source (as we might be doing this without other
        DB access allowed)
        """
        from activities.models import Post

        possible_matches: dict[str, str] = {}
        for mention in post.mentions.all():
            # A mention without a username cannot be matched against the text
            if not mention.username:
                continue
            if self.local:
                url = str(mention.urls.view)
            else:
                url = mention.absolute_profile_uri()
            # The replacer lower-cases the handle it finds, so the lookup keys
            # must be lower-cased too or mixed-case usernames never link
            username = mention.username.lower()
            possible_matches[username] = url
            possible_matches[f"{username}@{mention.domain_id}".lower()] = url

        # Maps a short handle to the first full handle it was displayed for,
        # so a different account sharing that short name is shown in full
        collapse_name: dict[str, str] = {}

        def replacer(match):
            precursor = match.group(1)
            handle = match.group(2).lower()
            if "@" in handle:
                short_handle = handle.split("@", 1)[0]
            else:
                short_handle = handle
            if handle in possible_matches:
                if short_handle not in collapse_name:
                    collapse_name[short_handle] = handle
                elif collapse_name.get(short_handle) != handle:
                    short_handle = handle
                return f'{precursor}<a href="{possible_matches[handle]}">@{short_handle}</a>'
            else:
                # Not a known mention of this post - leave the text untouched
                return match.group()

        return Post.mention_regex.sub(replacer, html)

    def linkify_hashtags(self, html: str, identity) -> str:
        """
        Turns text matching Hashtag.hashtag_regex into links to the tag page:
        a relative /tags/ URL when rendering locally, otherwise an absolute
        URL on the identity's domain.
        """
        from activities.models import Hashtag

        def replacer(attrs, new=False):
            # See if the text in this link looks like a hashtag
            if not Hashtag.hashtag_regex.match(attrs.get("_text", "")):
                return attrs
            hashtag = attrs["_text"].strip().lstrip("#")
            attrs[None, "class"] = "hashtag"
            if (None, "rel") in attrs:
                del attrs[None, "rel"]
            if self.local:
                attrs[None, "href"] = f"/tags/{hashtag.lower()}/"
            else:
                attrs[
                    None, "href"
                ] = f"https://{identity.domain.uri_domain}/tags/{hashtag.lower()}/"
            return attrs

        linker = bleach.linkifier.Linker(
            url_re=Hashtag.hashtag_regex, callbacks=[replacer]
        )
        return linker.linkify(html)

    def imageify_emojis(self, html: str, identity, include_local: bool = True):
        """
        Find :emoji: in content and convert to <img>. If include_local is True,
        the local emoji will be used as a fallback for any shortcodes not defined
        by emojis.
        """
        from activities.models import Emoji

        # Copy before extending so the list handed back by for_domain is never
        # mutated (it may be shared with other callers - TODO confirm)
        emoji_set = list(Emoji.for_domain(identity.domain))
        if include_local:
            emoji_set.extend(Emoji.for_domain(None))

        # The domain's emoji come first and the first entry for a shortcode
        # wins, so local emoji only fill in shortcodes the domain lacks
        possible_matches: dict[str, str] = {}
        for emoji in emoji_set:
            if emoji.is_usable and emoji.shortcode not in possible_matches:
                possible_matches[emoji.shortcode] = emoji.as_html()

        def replacer(match):
            fullcode = match.group(1).lower()
            if fullcode in possible_matches:
                return possible_matches[fullcode]
            # Unknown shortcode - leave the original text in place
            return match.group()

        return Emoji.emoji_regex.sub(replacer, html)
|