# takahe/tests/core/test_html.py

import pytest

from core.html import FediverseHtmlParser


@pytest.mark.django_db
def test_parser(identity):
    """
    Walks FediverseHtmlParser through a series of inputs and checks the
    ``html`` and ``plain_text`` renderings, plus the collected hashtags and
    mentions where the scenario asks for them.
    """

    # Keyword flags shared by every scenario that searches for hashtags and emojis
    search_flags = {"find_hashtags": True, "find_emojis": True}

    # A permitted tag is kept; the script element is expected to vanish
    allowed = FediverseHtmlParser("<p>Hello!</p><script></script>")
    assert allowed.html == "<p>Hello!</p>"
    assert allowed.plain_text == "Hello!"

    # Newlines between paragraphs are not expected in the HTML output
    paragraphs = FediverseHtmlParser("<p>Hi!</p>\n\n<p>How are you?</p>")
    assert paragraphs.html == "<p>Hi!</p><p>How are you?</p>"
    assert paragraphs.plain_text == "Hi!\n\nHow are you?"

    # Malformed input must never let a script tag prefix through
    for evil_markup in ("<scri<span></span>pt>", "<scri #hashtag pt>"):
        assert "<scr" not in FediverseHtmlParser(evil_markup).html

    # Entities come back escaped in HTML and decoded in plain text
    entities = FediverseHtmlParser("<p>It&#39;s great</p>", find_hashtags=True)
    assert entities.html == "<p>It&#x27;s great</p>"
    assert entities.plain_text == "It's great"
    assert entities.hashtags == set()

    # A bare domain is not linkified...
    bare_domain = FediverseHtmlParser("<p>test.com</p>")
    assert bare_domain.html == "<p>test.com</p>"
    assert bare_domain.plain_text == "test.com"

    # ...but a URL with a protocol prefix is
    prefixed = FediverseHtmlParser("<p>https://test.com</p>")
    expected_link = '<p><a href="https://test.com" rel="nofollow">test.com</a></p>'
    assert prefixed.html == expected_link
    assert prefixed.plain_text == "https://test.com"

    # Existing anchors are kept
    anchor = FediverseHtmlParser("<a href='https://takahe.social'>takahe social</a>")
    expected_anchor = '<a href="https://takahe.social" rel="nofollow">takahe social</a>'
    assert anchor.html == expected_anchor
    assert anchor.plain_text == "https://takahe.social"

    # Long URLs get shortened link text, with the protocol-less URL as title
    full_url = "https://social.example.com/a-long/path/that-should-be-shortened"
    bare_url = full_url.removeprefix("https://")
    long_link = FediverseHtmlParser(f"<p>{full_url}</p>")
    expected_long_link = (
        f'<p><a href="{full_url}" rel="nofollow" class="ellipsis" '
        f'title="{bare_url}">social.example.com/a-long/path</a></p>'
    )
    assert long_link.html == expected_long_link
    assert long_link.plain_text == full_url

    # Mention-shaped text stays unlinked when no mentions are supplied
    unlinked = FediverseHtmlParser(
        "<p>@test@example.com</p>",
        find_mentions=True,
        **search_flags,
    )
    assert unlinked.html == "<p>@test@example.com</p>"
    assert unlinked.plain_text == "@test@example.com"
    assert unlinked.mentions == {"test@example.com"}

    # With a mention supplied, the text is turned into a link
    linked = FediverseHtmlParser(
        "<p>@test@example.com</p>",
        mentions=[identity],
        **search_flags,
    )
    assert linked.html == '<p><a href="/@test@example.com/">@test</a></p>'
    assert linked.plain_text == "@test@example.com"
    assert linked.mentions == {"test@example.com"}

    # Mention matching ignores letter case
    mixed_case = FediverseHtmlParser(
        "<p>@TeSt@ExamPle.com</p>",
        mentions=[identity],
        **search_flags,
    )
    assert mixed_case.html == '<p><a href="/@test@example.com/">@TeSt</a></p>'
    assert mixed_case.plain_text == "@TeSt@ExamPle.com"
    assert mixed_case.mentions == {"test@example.com"}

    # Hashtags are linked, even when split by a span, but never inside an href
    hashtag_markup = (
        '<a href="http://example.com#notahashtag">something</a> '
        "<span>#</span>hashtag "
        '<a href="https://example.com/tags/hashtagtwo/">#hashtagtwo</a>'
    )
    expected_hashtag_html = (
        '<a href="http://example.com#notahashtag" rel="nofollow">something</a> '
        '<a href="/tags/hashtag/" rel="tag">#hashtag</a> '
        '<a href="/tags/hashtagtwo/" rel="tag">#hashtagtwo</a>'
    )
    tagged = FediverseHtmlParser(hashtag_markup, **search_flags)
    assert tagged.html == expected_hashtag_html
    assert tagged.plain_text == "http://example.com#notahashtag #hashtag #hashtagtwo"
    assert tagged.hashtags == {"hashtag", "hashtagtwo"}

    # Lists are flattened into a paragraph with line breaks
    listing = FediverseHtmlParser(
        "<p>List:</p><ul><li>One</li><li>Two</li><li>Three</li></ul><p>End!</p>",
        **search_flags,
    )
    assert listing.html == "<p>List:</p><p>One<br>Two<br>Three</p><p>End!</p>"
    assert listing.plain_text == "List:\n\nOne\nTwo\nThree\n\nEnd!"
Significantly better hashtag link parsing Fixes #203 2022-12-20 05:55:14 -08:00			`import pytest`

Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`from core.html import FediverseHtmlParser`
Some cleanup around editing 2022-11-27 11:09:08 -08:00

Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`@pytest.mark.django_db`
			`def test_parser(identity):`
			`"""`
			`Validates the HtmlParser in its various output modes`
			`"""`
Some cleanup around editing 2022-11-27 11:09:08 -08:00
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`# Basic tag allowance`
			`parser = FediverseHtmlParser("<p>Hello!</p><script></script>")`
			`assert parser.html == "<p>Hello!</p>"`
			`assert parser.plain_text == "Hello!"`

			`# Newline erasure`
			`parser = FediverseHtmlParser("<p>Hi!</p>\n\n<p>How are you?</p>")`
			`assert parser.html == "<p>Hi!</p><p>How are you?</p>"`
			`assert parser.plain_text == "Hi!\n\nHow are you?"`

			`# Trying to be evil`
			`parser = FediverseHtmlParser("<scri<span></span>pt>")`
			`assert "<scr" not in parser.html`
			`parser = FediverseHtmlParser("<scri #hashtag pt>")`
			`assert "<scr" not in parser.html`

			`# Entities are escaped`
			`parser = FediverseHtmlParser("<p>It&#39;s great</p>", find_hashtags=True)`
			`assert parser.html == "<p>It&#x27;s great</p>"`
			`assert parser.plain_text == "It's great"`
			`assert parser.hashtags == set()`

			`# Linkify works, but only with protocol prefixes`
			`parser = FediverseHtmlParser("<p>test.com</p>")`
			`assert parser.html == "<p>test.com</p>"`
			`assert parser.plain_text == "test.com"`
			`parser = FediverseHtmlParser("<p>https://test.com</p>")`
Some cleanup around editing 2022-11-27 11:09:08 -08:00			`assert (`
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`parser.html == '<p><a href="https://test.com" rel="nofollow">test.com</a></p>'`
Some cleanup around editing 2022-11-27 11:09:08 -08:00			`)`
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`assert parser.plain_text == "https://test.com"`
Some cleanup around editing 2022-11-27 11:09:08 -08:00
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`# Links are preserved`
			`parser = FediverseHtmlParser("<a href='https://takahe.social'>takahe social</a>")`
Some cleanup around editing 2022-11-27 11:09:08 -08:00			`assert (`
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`parser.html`
			`== '<a href="https://takahe.social" rel="nofollow">takahe social</a>'`
Some cleanup around editing 2022-11-27 11:09:08 -08:00			`)`
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`assert parser.plain_text == "https://takahe.social"`
Fix hashtagging of HTML entities 2022-11-28 21:34:14 -08:00
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`# Very long links are shortened`
			`full_url = "https://social.example.com/a-long/path/that-should-be-shortened"`
			`parser = FediverseHtmlParser(f"<p>{full_url}</p>")`
Fixes #431 - Preserve href when stripping <a> tags (#436) 2023-01-17 22:41:33 -08:00			`assert (`
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`parser.html`
			`== f'<p><a href="{full_url}" rel="nofollow" class="ellipsis" title="{full_url.removeprefix("https://")}">social.example.com/a-long/path</a></p>'`
Fixes #431 - Preserve href when stripping <a> tags (#436) 2023-01-17 22:41:33 -08:00			`)`
Move linkifying to all http-prefixed links 2022-12-20 05:10:35 -08:00			`assert (`
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`parser.plain_text`
			`== "https://social.example.com/a-long/path/that-should-be-shortened"`
Move linkifying to all http-prefixed links 2022-12-20 05:10:35 -08:00			`)`
Significantly better hashtag link parsing Fixes #203 2022-12-20 05:55:14 -08:00
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`# Make sure things that look like mentions are left alone with no mentions supplied.`
			`parser = FediverseHtmlParser(`
			`"<p>@test@example.com</p>",`
			`find_mentions=True,`
			`find_hashtags=True,`
			`find_emojis=True,`
Apply Mastodon style link text shortening (#426) 2023-01-16 10:59:46 -08:00			`)`
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`assert parser.html == "<p>@test@example.com</p>"`
			`assert parser.plain_text == "@test@example.com"`
			`assert parser.mentions == {"test@example.com"}`

			`# Make sure mentions work when there is a mention supplied`
			`parser = FediverseHtmlParser(`
			`"<p>@test@example.com</p>",`
			`mentions=[identity],`
			`find_hashtags=True,`
			`find_emojis=True,`
Apply Mastodon style link text shortening (#426) 2023-01-16 10:59:46 -08:00			`)`
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`assert parser.html == '<p><a href="/@test@example.com/">@test</a></p>'`
			`assert parser.plain_text == "@test@example.com"`
			`assert parser.mentions == {"test@example.com"}`

			`# Ensure mentions are case insensitive`
			`parser = FediverseHtmlParser(`
			`"<p>@TeSt@ExamPle.com</p>",`
			`mentions=[identity],`
			`find_hashtags=True,`
			`find_emojis=True,`
Apply Mastodon style link text shortening (#426) 2023-01-16 10:59:46 -08:00			`)`
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`assert parser.html == '<p><a href="/@test@example.com/">@TeSt</a></p>'`
			`assert parser.plain_text == "@TeSt@ExamPle.com"`
			`assert parser.mentions == {"test@example.com"}`

			`# Ensure hashtags are linked, even through spans, but not within hrefs`
			`parser = FediverseHtmlParser(`
			`'<a href="http://example.com#notahashtag">something</a> <span>#</span>hashtag <a href="https://example.com/tags/hashtagtwo/">#hashtagtwo</a>',`
			`find_hashtags=True,`
			`find_emojis=True,`
Significantly better hashtag link parsing Fixes #203 2022-12-20 05:55:14 -08:00			`)`
Match sanitizing for posts to Mastodon (#422) Creates filter for REWRITTEN_TAGS that converts them to `p` rather than ripping them out entirely, and formats `ul` as break-separated list Both changes align sanitization to Mastodon's "strict" sanitizer at https://github.com/mastodon/mastodon/blob/main/lib/sanitize_ext/sanitize_config.rb#L73 I don't love the complexity of the Filter, but Bleach doesn't give us great options to work with. The code operates within an iterator without the useful "sibling" methods that Ruby's equivalent has. Also, Bleach runs filters _after_ sanitizing (unlike Ruby's which runs before) so we have to pass all the elements through the sanitizer, then rewrite them after the fact. 2023-01-15 21:32:04 -08:00			`assert (`
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`parser.html`
Fix hashtag being handled as common link in the preview card (#515) 2023-02-19 19:35:54 -08:00			`== '<a href="http://example.com#notahashtag" rel="nofollow">something</a> <a href="/tags/hashtag/" rel="tag">#hashtag</a> <a href="/tags/hashtagtwo/" rel="tag">#hashtagtwo</a>'`
Match sanitizing for posts to Mastodon (#422) Creates filter for REWRITTEN_TAGS that converts them to `p` rather than ripping them out entirely, and formats `ul` as break-separated list Both changes align sanitization to Mastodon's "strict" sanitizer at https://github.com/mastodon/mastodon/blob/main/lib/sanitize_ext/sanitize_config.rb#L73 I don't love the complexity of the Filter, but Bleach doesn't give us great options to work with. The code operates within an iterator without the useful "sibling" methods that Ruby's equivalent has. Also, Bleach runs filters _after_ sanitizing (unlike Ruby's which runs before) so we have to pass all the elements through the sanitizer, then rewrite them after the fact. 2023-01-15 21:32:04 -08:00			`)`
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`assert parser.plain_text == "http://example.com#notahashtag #hashtag #hashtagtwo"`
			`assert parser.hashtags == {"hashtag", "hashtagtwo"}`

			`# Ensure lists are rendered reasonably`
			`parser = FediverseHtmlParser(`
			`"<p>List:</p><ul><li>One</li><li>Two</li><li>Three</li></ul><p>End!</p>",`
			`find_hashtags=True,`
			`find_emojis=True,`
Fixed mention linking with mixed case usernames (#265) 2022-12-24 20:04:25 -08:00			`)`
Move to a new HTML parser/stripper This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place. 2023-01-29 16:46:22 -08:00			`assert parser.html == "<p>List:</p><p>One<br>Two<br>Three</p><p>End!</p>"`
			`assert parser.plain_text == "List:\n\nOne\nTwo\nThree\n\nEnd!"`