From cc75863b8f9f34c01aaa8a13159fd5386ff16f52 Mon Sep 17 00:00:00 2001
From: Michael Manfre <mike@manfre.net>
Date: Wed, 18 Jan 2023 01:41:33 -0500
Subject: [PATCH] Fixes #431 - Preserve href when stripping <a> tags (#436)

---
 core/html.py            | 33 +++++++++++++++++++++++++++++++--
 tests/core/test_html.py | 10 ++++++++++
 2 files changed, 41 insertions(+), 2 deletions(-)
diff --git a/core/html.py b/core/html.py
index 1ae5ee2..5728899 100644
--- a/core/html.py
+++ b/core/html.py
@@ -79,6 +79,35 @@ class MastodonStrictTagFilter(Filter):
             yield token
 
 
+class UnlinkifyFilter(Filter):
+    """
+    Forcibly replaces link text with the href.
+
+    This is intended to be used when stripping <a> tags to preserve the link
+    location at the expense of the link text.
+    """
+
+    def __iter__(self):
+        discarding_a_text = False
+        for token in Filter.__iter__(self):
+            if token.get("name") == "a":
+                if token["type"] == "EndTag":
+                    discarding_a_text = False
+                    continue
+                href = token["data"].get((None, "href"))
+
+                # If <a> has an href, we use it and throw away all content
+                # within the <a>...</a>. If the href is missing or empty, drop
+                # only the <a> tags and keep the content within them.
+                if href:
+                    yield {"data": href, "type": "Characters"}
+                    discarding_a_text = True
+                    continue
+            elif not discarding_a_text:
+                yield token
+            # else: throw away tokens until we're out of the <a>
+
+
 def allow_a(tag: str, name: str, value: str):
     if name in ["href", "title", "class"]:
         return True
@@ -150,7 +179,7 @@ def strip_html(post_html: str, *, linkify: bool = True) -> str:
         strip=True,
         filters=[partial(LinkifyFilter, url_re=url_regex, callbacks=linkify_callbacks)]
         if linkify
-        else [],
+        else [UnlinkifyFilter],
     )
     return mark_safe(cleaner.clean(post_html))
 
@@ -163,7 +192,7 @@ def html_to_plaintext(post_html: str) -> str:
     # Remove all newlines, then replace br with a newline and /p with two (one comes from bleach)
     post_html = post_html.replace("\n", "").replace("<br>", "\n").replace("</p>", "\n")
     # Remove all other HTML and return
-    cleaner = bleach.Cleaner(tags=[], strip=True, filters=[])
+    cleaner = bleach.Cleaner(tags=["a"], strip=True, filters=[UnlinkifyFilter])
     return cleaner.clean(post_html).strip()
 
 
diff --git a/tests/core/test_html.py b/tests/core/test_html.py
index bc63920..527991a 100644
--- a/tests/core/test_html.py
+++ b/tests/core/test_html.py
@@ -18,6 +18,16 @@ def test_html_to_plaintext():
         == "Hi!\n\nHow are\n you?\n\ntoday"
     )
 
+    assert (
+        html_to_plaintext(
+            '<p><a href="https://fedi.takahe.social/with/a/long/path">'
+            '<b>The</b> <img src="takahe.png"> Link</a> '
+            '<a href="">Empty href</a> '
+            "<a>Empty A</a></p>"
+        )
+        == "https://fedi.takahe.social/with/a/long/path Empty href Empty A"
+    )
+
 
 def test_sanitize_post():