From cc75863b8f9f34c01aaa8a13159fd5386ff16f52 Mon Sep 17 00:00:00 2001 From: Michael Manfre Date: Wed, 18 Jan 2023 01:41:33 -0500 Subject: [PATCH] Fixes #431 - Preserve href when stripping tags (#436) --- core/html.py | 33 +++++++++++++++++++++++++++++++-- tests/core/test_html.py | 10 ++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/core/html.py b/core/html.py index 1ae5ee2..5728899 100644 --- a/core/html.py +++ b/core/html.py @@ -79,6 +79,35 @@ class MastodonStrictTagFilter(Filter): yield token +class UnlinkifyFilter(Filter): + """ + Forcibly replaces link text with the href. + + This is intended to be used when stripping tags to preserve the link + location at the expense of the link text. + """ + + def __iter__(self): + discarding_a_text = False + for token in Filter.__iter__(self): + if token.get("name") == "a": + if token["type"] == "EndTag": + discarding_a_text = False + continue + href = token["data"].get((None, "href")) + + # If has an href, we use it and throw away all content + # within the <a>...</a>. If href missing or empty, try to find + # text within the <a>...</a> + if href: + yield {"data": href, "type": "Characters"} + discarding_a_text = True + continue + elif not discarding_a_text: + yield token + # else: throw away tokens until we're out of the <a> + + def allow_a(tag: str, name: str, value: str): if name in ["href", "title", "class"]: return True @@ -150,7 +179,7 @@ def strip_html(post_html: str, *, linkify: bool = True) -> str: strip=True, filters=[partial(LinkifyFilter, url_re=url_regex, callbacks=linkify_callbacks)] if linkify - else [], + else [UnlinkifyFilter], ) return mark_safe(cleaner.clean(post_html)) @@ -163,7 +192,7 @@ def html_to_plaintext(post_html: str) -> str: # Remove all newlines, then replace br with a newline and /p with two (one comes from bleach) post_html = post_html.replace("\n", "").replace("<br>", "\n").replace("</p>", "\n") # Remove all other HTML and return - cleaner = bleach.Cleaner(tags=[], strip=True, filters=[]) + cleaner = bleach.Cleaner(tags=["a"], strip=True, filters=[UnlinkifyFilter]) return cleaner.clean(post_html).strip() diff --git a/tests/core/test_html.py b/tests/core/test_html.py index bc63920..527991a 100644 --- a/tests/core/test_html.py +++ b/tests/core/test_html.py @@ -18,6 +18,16 @@ def test_html_to_plaintext(): == "Hi!\n\nHow are\n you?\n\ntoday" ) + assert ( + html_to_plaintext( + '<p><a href="https://fedi.takahe.social/with/a/long/path">' + 'The Link</a> ' + '<a href="">Empty href</a> ' + "<a>Empty A</a></p>" + ) + == "https://fedi.takahe.social/with/a/long/path Empty href Empty A" + ) + def test_sanitize_post():