Better hashtag normalization when processing a post (#26614)

This commit is contained in:
Renaud Chaput 2023-08-23 08:18:07 +02:00 committed by GitHub
parent bd023a2637
commit 58acaa9ae6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 38 additions and 11 deletions

View File

@ -105,6 +105,21 @@ describe('computeHashtagBarForStatus', () => {
); );
}); });
it('handles server-side normalized tags with accentuated characters', () => {
const status = createStatus(
'<p>Text</p><p><a href="test">#éaa</a> <a href="test">#Éaa</a></p>',
['eaa'], // The server may normalize the hashtags in the `tags` attribute
);
const { hashtagsInBar, statusContentProps } =
computeHashtagBarForStatus(status);
expect(hashtagsInBar).toEqual(['Éaa']);
expect(statusContentProps.statusContent).toMatchInlineSnapshot(
`"<p>Text</p>"`,
);
});
it('does not display in bar a hashtag in content with a case difference', () => { it('does not display in bar a hashtag in content with a case difference', () => {
const status = createStatus( const status = createStatus(
'<p>Text <a href="test">#Éaa</a></p><p><a href="test">#éaa</a></p>', '<p>Text <a href="test">#Éaa</a></p><p><a href="test">#éaa</a></p>',

View File

@ -23,8 +23,9 @@ export type StatusLike = Record<{
}>; }>;
function normalizeHashtag(hashtag: string) { function normalizeHashtag(hashtag: string) {
if (hashtag && hashtag.startsWith('#')) return hashtag.slice(1); return (
else return hashtag; hashtag && hashtag.startsWith('#') ? hashtag.slice(1) : hashtag
).normalize('NFKC');
} }
function isNodeLinkHashtag(element: Node): element is HTMLLinkElement { function isNodeLinkHashtag(element: Node): element is HTMLLinkElement {
@ -70,9 +71,16 @@ function uniqueHashtagsWithCaseHandling(hashtags: string[]) {
} }
// Create the collator once, this is much more efficient // Create the collator once, this is much more efficient
const collator = new Intl.Collator(undefined, { sensitivity: 'accent' }); const collator = new Intl.Collator(undefined, {
sensitivity: 'base', // we use this to emulate the ASCII folding done on the server-side, hopefully more efficiently
});
function localeAwareInclude(collection: string[], value: string) { function localeAwareInclude(collection: string[], value: string) {
return collection.find((item) => collator.compare(item, value) === 0); const normalizedValue = value.normalize('NFKC');
return !!collection.find(
(item) => collator.compare(item.normalize('NFKC'), normalizedValue) === 0,
);
} }
// We use an intermediate function here to make it easier to test // We use an intermediate function here to make it easier to test
@ -121,11 +129,13 @@ export function computeHashtagBarForStatus(status: StatusLike): {
// try to see if the last line is only hashtags // try to see if the last line is only hashtags
let onlyHashtags = true; let onlyHashtags = true;
const normalizedTagNames = tagNames.map((tag) => tag.normalize('NFKC'));
Array.from(lastChild.childNodes).forEach((node) => { Array.from(lastChild.childNodes).forEach((node) => {
if (isNodeLinkHashtag(node) && node.textContent) { if (isNodeLinkHashtag(node) && node.textContent) {
const normalized = normalizeHashtag(node.textContent); const normalized = normalizeHashtag(node.textContent);
if (!localeAwareInclude(tagNames, normalized)) { if (!localeAwareInclude(normalizedTagNames, normalized)) {
// stop here, this is not a real hashtag, so consider it as text // stop here, this is not a real hashtag, so consider it as text
onlyHashtags = false; onlyHashtags = false;
return; return;
@ -140,12 +150,14 @@ export function computeHashtagBarForStatus(status: StatusLike): {
} }
}); });
const hashtagsInBar = tagNames.filter( const hashtagsInBar = tagNames.filter((tag) => {
(tag) => const normalizedTag = tag.normalize('NFKC');
// the tag does not appear at all in the status content, it is an out-of-band tag // the tag does not appear at all in the status content, it is an out-of-band tag
!localeAwareInclude(contentHashtags, tag) && return (
!localeAwareInclude(lastLineHashtags, tag), !localeAwareInclude(contentHashtags, normalizedTag) &&
); !localeAwareInclude(lastLineHashtags, normalizedTag)
);
});
const isOnlyOneLine = contentWithoutLastLine.content.childElementCount === 0; const isOnlyOneLine = contentWithoutLastLine.content.childElementCount === 0;
const hasMedia = status.get('media_attachments').size > 0; const hasMedia = status.get('media_attachments').size > 0;