Move linkifying to all http-prefixed links

This commit is contained in:
Andrew Godwin 2022-12-20 13:10:35 +00:00
parent 202046247c
commit 78d2283458
2 changed files with 32 additions and 2 deletions

View File

@ -1,7 +1,21 @@
import re
from functools import partial
import bleach
from bleach.linkifier import LinkifyFilter
from django.utils.safestring import mark_safe
# URL pattern handed to bleach's LinkifyFilter as ``url_re``.
#
# The http:// or https:// scheme is mandatory here, so bare domains such as
# "test.com" and handles such as "@someone@sub.example.com" never match and
# are therefore never turned into links.
#
# NOTE(review): the character class in the path part previously listed the
# braces twice (``\{{\}}``), which looks like a leftover from a str.format
# template; a single ``\{\}`` excludes exactly the same characters.
url_regex = re.compile(
    r"""\(*  # Match any opening parentheses.
    \b(?<![@.])(?:https?://(?:(?:\w+:)?\w+@)?)  # http(s)://, optional user:pass@
    ([\w-]+\.)+(?:[\w-]+)(?:\:[0-9]+)?(?!\.\w)\b  # xx.yy.tld(:##)?
    (?:[/?][^\s\{\}\|\\\^\[\]`<>"]*)?
        # /path/zz (excluding "unsafe" chars from RFC 1738,
        # except for # and ~, which happen in practice)
    """,
    re.IGNORECASE | re.VERBOSE | re.UNICODE,
)
def allow_a(tag: str, name: str, value: str):
if name in ["href", "title", "class"]:
@ -26,7 +40,7 @@ def sanitize_html(post_html: str) -> str:
"p": ["class"],
"span": ["class"],
},
filters=[LinkifyFilter],
filters=[partial(LinkifyFilter, url_re=url_regex)],
strip=True,
)
return mark_safe(cleaner.clean(post_html))
@ -36,7 +50,11 @@ def strip_html(post_html: str) -> str:
"""
Strips all tags from the text, then linkifies it.
"""
cleaner = bleach.Cleaner(tags=[], strip=True, filters=[LinkifyFilter])
cleaner = bleach.Cleaner(
tags=[],
strip=True,
filters=[partial(LinkifyFilter, url_re=url_regex)],
)
return mark_safe(cleaner.clean(post_html))

View File

@ -19,3 +19,15 @@ def test_sanitize_post():
assert sanitize_html("<p>Hello!</p>") == "<p>Hello!</p>"
assert sanitize_html("<p>It&#39;s great</p>") == "<p>It&#39;s great</p>"
# Note that we only want to linkify things with protocol prefixes to prevent
# too many false positives.
assert sanitize_html("<p>test.com</p>") == "<p>test.com</p>"
assert (
sanitize_html("<p>https://test.com</p>")
== '<p><a href="https://test.com" rel="nofollow">https://test.com</a></p>'
)
assert (
sanitize_html("<p>@someone@subdomain.some-domain.com</p>")
== "<p>@someone@subdomain.some-domain.com</p>"
)