takahe/core/html.py

53 lines
1.6 KiB
Python
Raw Normal View History

2022-11-11 21:02:43 -08:00
import bleach
2022-11-13 18:03:43 -08:00
from bleach.linkifier import LinkifyFilter
2022-11-11 21:02:43 -08:00
from django.utils.safestring import mark_safe
2022-11-13 18:03:43 -08:00
def allow_a(tag: str, name: str, value: str) -> bool:
    """
    Decides whether a single attribute may stay on an a tag.

    href, title and class are always kept. rel is kept only when every
    whitespace-separated value in it is one of nofollow, noopener,
    noreferrer or tag. Every other attribute is rejected.

    The tag argument is accepted but not consulted.
    """
    if name == "rel":
        # Only a small subset of rel values is let through
        # (this defends against, for example, rel=me)
        permitted_rel = {"nofollow", "noopener", "noreferrer", "tag"}
        return set(value.split()) <= permitted_rel
    return name in ("href", "title", "class")
2022-11-11 21:02:43 -08:00
def sanitize_post(post_html: str) -> str:
    """
    Cleans post HTML so that only br, p and a tags are kept.

    Attributes on a tags are vetted by allow_a, and p tags may keep their
    class attribute. The cleaner runs with strip=True and with the
    LinkifyFilter applied, and the cleaned output is passed through
    mark_safe before being returned.
    """
    permitted_tags = ["br", "p", "a"]
    permitted_attributes = {
        "a": allow_a,
        "p": ["class"],
        # NOTE(review): span is not in permitted_tags, so this entry looks
        # like it has no effect - confirm whether span is meant to be allowed
        "span": ["class"],
    }
    cleaner = bleach.Cleaner(
        tags=permitted_tags,
        attributes=permitted_attributes,  # type:ignore
        filters=[LinkifyFilter],
        strip=True,
    )
    sanitized = cleaner.clean(post_html)
    return mark_safe(sanitized)
def strip_html(post_html: str) -> str:
    """
    Removes every tag from the given HTML and linkifies what remains.

    The cleaner allows no tags at all and runs with strip=True and the
    LinkifyFilter; its output is passed through mark_safe before being
    returned.
    """
    tagless_cleaner = bleach.Cleaner(
        tags=[],
        strip=True,
        filters=[LinkifyFilter],
    )
    linkified = tagless_cleaner.clean(post_html)
    return mark_safe(linkified)
2022-11-27 11:09:08 -08:00
def html_to_plaintext(post_html: str) -> str:
    """
    Tries to do the inverse of the linebreaks filter.

    Existing newlines are dropped, br tags (written as <br>, <br/> or
    <br />) and closing p tags are turned into newlines, all remaining
    markup is removed, and leading/trailing whitespace is trimmed from the
    result.
    """
    # TODO: Handle HTML entities
    # Remove all newlines so that only the markup decides where lines break
    post_html = post_html.replace("\n", "")
    # Replace br with a newline. The self-closing spellings are covered too,
    # as otherwise they would be removed below like any other tag and the
    # line break they stand for would be lost.
    for line_break in ("<br>", "<br/>", "<br />"):
        post_html = post_html.replace(line_break, "\n")
    # Replace /p with a newline (a second one is expected to come from bleach)
    post_html = post_html.replace("</p>", "\n")
    # Remove all other HTML and return
    cleaner = bleach.Cleaner(tags=[], strip=True, filters=[])
    return cleaner.clean(post_html).strip()