From 5d09a67b5210b6fb6506d1432ec2ae5418df0266 Mon Sep 17 00:00:00 2001 From: Levi Bard Date: Sat, 17 Sep 2022 19:06:45 +0200 Subject: [PATCH] Fix the tag span generation for tags with nonascii characters (#2700) * Update mention and tag regexes from mastodon * Normalize nonascii tag names the same way that mastodon does --- .../keylesspalace/tusky/util/AsciiFolding.kt | 26 +++++++++++++++++++ .../keylesspalace/tusky/util/LinkHelper.kt | 2 +- .../com/keylesspalace/tusky/util/SpanUtils.kt | 7 +++-- .../com/keylesspalace/tusky/SpanUtilsTest.kt | 1 + .../tusky/util/LinkHelperTest.kt | 11 ++++++++ 5 files changed, 44 insertions(+), 3 deletions(-) create mode 100644 app/src/main/java/com/keylesspalace/tusky/util/AsciiFolding.kt diff --git a/app/src/main/java/com/keylesspalace/tusky/util/AsciiFolding.kt b/app/src/main/java/com/keylesspalace/tusky/util/AsciiFolding.kt new file mode 100644 index 000000000..8f1101d29 --- /dev/null +++ b/app/src/main/java/com/keylesspalace/tusky/util/AsciiFolding.kt @@ -0,0 +1,26 @@ +/* Copyright 2022 Tusky contributors + * + * This file is a part of Tusky. + * + * This program is free software; you can redistribute it and/or modify it under the terms of the + * GNU General Public License as published by the Free Software Foundation; either version 3 of the + * License, or (at your option) any later version. + * + * Tusky is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even + * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General + * Public License for more details. + * + * You should have received a copy of the GNU General Public License along with Tusky; if not, + * see <http://www.gnu.org/licenses>. 
*/ + +package com.keylesspalace.tusky.util + +// Inspired by https://github.com/mastodon/mastodon/blob/main/app/lib/ascii_folding.rb + +val unicodeToASCIIMap = "ÀÁÂÃÄÅàáâãäåĀāĂăĄąÇçĆćĈĉĊċČčÐðĎďĐđÈÉÊËèéêëĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħÌÍÎÏìíîïĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłÑñŃńŅņŇňŉŊŋÒÓÔÕÖØòóôõöøŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšſŢţŤťŦŧÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųŴŵÝýÿŶŷŸŹźŻżŽž".toList().zip( + "AAAAAAaaaaaaAaAaAaCcCcCcCcCcDdDdDdEEEEeeeeEeEeEeEeEeGgGgGgGgHhHhIIIIiiiiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnNnnNnOOOOOOooooooOoOoOoRrRrRrSsSsSsSssTtTtTtUUUUuuuuUuUuUuUuUuUuWwYyyYyYZzZzZz".toList() +).toMap() + +fun normalizeToASCII(text: CharSequence): CharSequence { + return String(text.map { unicodeToASCIIMap[it] ?: it }.toCharArray()) +} diff --git a/app/src/main/java/com/keylesspalace/tusky/util/LinkHelper.kt b/app/src/main/java/com/keylesspalace/tusky/util/LinkHelper.kt index 442a65513..9a25ecaac 100644 --- a/app/src/main/java/com/keylesspalace/tusky/util/LinkHelper.kt +++ b/app/src/main/java/com/keylesspalace/tusky/util/LinkHelper.kt @@ -124,7 +124,7 @@ fun setClickableText( @VisibleForTesting fun getTagName(text: CharSequence, tags: List<HashTag>?): String? { - val scrapedName = text.subSequence(1, text.length).toString() + val scrapedName = normalizeToASCII(text.subSequence(1, text.length)).toString() return when (tags) { null -> scrapedName else -> tags.firstOrNull { it.name.equals(scrapedName, true) }?.name diff --git a/app/src/main/java/com/keylesspalace/tusky/util/SpanUtils.kt b/app/src/main/java/com/keylesspalace/tusky/util/SpanUtils.kt index 7734d9d7f..7087a165f 100644 --- a/app/src/main/java/com/keylesspalace/tusky/util/SpanUtils.kt +++ b/app/src/main/java/com/keylesspalace/tusky/util/SpanUtils.kt @@ -12,13 +12,16 @@ import kotlin.math.max * @see * Tag#HASHTAG_RE. 
*/ -private const val TAG_REGEX = "(?:^|[^/)A-Za-z0-9_])#([\\w_]*[\\p{Alpha}_][\\w_]*)" +private const val HASHTAG_SEPARATORS = "_\\u00B7\\u200c" +private const val UNICODE_WORD = "\\p{L}\\p{Mn}\\p{Nd}\\p{Nl}\\p{Pc}" // Ugh, java ( https://stackoverflow.com/questions/4304928/unicode-equivalents-for-w-and-b-in-java-regular-expressions ) +private const val TAG_REGEX = "(?:^|[^/)\\w])#(([${UNICODE_WORD}_][$UNICODE_WORD$HASHTAG_SEPARATORS]*[\\p{Alpha}$HASHTAG_SEPARATORS][$UNICODE_WORD$HASHTAG_SEPARATORS]*[${UNICODE_WORD}_])|([${UNICODE_WORD}_]*[\\p{Alpha}][${UNICODE_WORD}_]*))" /** * @see * Account#MENTION_RE */ -private const val MENTION_REGEX = "(?:^|[^/[:word:]])@([a-z0-9_-]+(?:@[a-z0-9\\.\\-]+[a-z0-9]+)?)" +private const val USERNAME_REGEX = "[\\w]+([\\w\\.-]+[\\w]+)?" +private const val MENTION_REGEX = "(?<=^|[^\\/$UNICODE_WORD])@(($USERNAME_REGEX)(?:@[$UNICODE_WORD\\.\\-]+[$UNICODE_WORD]+)?)" private const val HTTP_URL_REGEX = "(?:(^|\\b)http://[^\\s]+)" private const val HTTPS_URL_REGEX = "(?:(^|\\b)https://[^\\s]+)" diff --git a/app/src/test/java/com/keylesspalace/tusky/SpanUtilsTest.kt b/app/src/test/java/com/keylesspalace/tusky/SpanUtilsTest.kt index 213405603..268ae5d36 100644 --- a/app/src/test/java/com/keylesspalace/tusky/SpanUtilsTest.kt +++ b/app/src/test/java/com/keylesspalace/tusky/SpanUtilsTest.kt @@ -38,6 +38,7 @@ class SpanUtilsTest { return listOf( "@mention", "#tag", + "#tåg", "https://thr.ee/meh?foo=bar&wat=@at#hmm", "http://thr.ee/meh?foo=bar&wat=@at#hmm" ) diff --git a/app/src/test/java/com/keylesspalace/tusky/util/LinkHelperTest.kt b/app/src/test/java/com/keylesspalace/tusky/util/LinkHelperTest.kt index 3bd2311d8..9e8f899e5 100644 --- a/app/src/test/java/com/keylesspalace/tusky/util/LinkHelperTest.kt +++ b/app/src/test/java/com/keylesspalace/tusky/util/LinkHelperTest.kt @@ -86,6 +86,17 @@ class LinkHelperTest { } } + @Test + fun whenCheckingTags_tagNameIsNormalized() { + val mutator = "aeiou".toList().zip("åÉîøÜ".toList()).toMap() + for (tag 
in tags) { + val mutatedTagName = String(tag.name.map { mutator[it] ?: it }.toCharArray()) + val tagName = getTagName("#$mutatedTagName", tags) + Assert.assertNotNull(tagName) + Assert.assertNotNull(tags.firstOrNull { it.name == tagName }) + } + } + @Test fun hashedUrlSpans_withNoMatchingTag_areNotModified() { for (tag in tags) {