diff --git a/app/src/main/java/app/pachli/util/SpanUtils.kt b/app/src/main/java/app/pachli/util/SpanUtils.kt index 81143d1c9..e182da69a 100644 --- a/app/src/main/java/app/pachli/util/SpanUtils.kt +++ b/app/src/main/java/app/pachli/util/SpanUtils.kt @@ -1,68 +1,33 @@ +/* + * Copyright 2024 Pachli Association + * + * This file is a part of Pachli. + * + * This program is free software; you can redistribute it and/or modify it under the terms of the + * GNU General Public License as published by the Free Software Foundation; either version 3 of the + * License, or (at your option) any later version. + * + * Pachli is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even + * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General + * Public License for more details. + * + * You should have received a copy of the GNU General Public License along with Pachli; if not, + * see <http://www.gnu.org/licenses>. + */ + package app.pachli.util import android.text.Spannable import android.text.Spanned -import android.text.style.CharacterStyle import android.text.style.ForegroundColorSpan import android.text.style.URLSpan import app.pachli.core.ui.MentionSpan import app.pachli.core.ui.NoUnderlineURLSpan -import java.util.regex.Pattern -import kotlin.math.max - -/** - * @see - * Tag#HASHTAG_RE. - */ -private const val HASHTAG_SEPARATORS = "_\\u00B7\\u200c" -private const val UNICODE_WORD = "\\p{L}\\p{Mn}\\p{Nd}\\p{Nl}\\p{Pc}" // Ugh, java ( https://stackoverflow.com/questions/4304928/unicode-equivalents-for-w-and-b-in-java-regular-expressions ) -private const val TAG_REGEX = "(?:^|[^/)\\w])#(([${UNICODE_WORD}_][$UNICODE_WORD$HASHTAG_SEPARATORS]*[\\p{Alpha}$HASHTAG_SEPARATORS][$UNICODE_WORD$HASHTAG_SEPARATORS]*[${UNICODE_WORD}_])|([${UNICODE_WORD}_]*[\\p{Alpha}][${UNICODE_WORD}_]*))" - -/** - * @see - * Account#MENTION_RE - */ -private const val USERNAME_REGEX = "[\\w]+([\\w\\.-]+[\\w]+)?" 
-private const val MENTION_REGEX = "(?<=^|[^\\/$UNICODE_WORD])@(($USERNAME_REGEX)(?:@[$UNICODE_WORD\\.\\-]+[$UNICODE_WORD]+)?)" - -private const val HTTP_URL_REGEX = "(?:(^|\\b)http://[^\\s]+)" -private const val HTTPS_URL_REGEX = "(?:(^|\\b)https://[^\\s]+)" - -/** - * Dump of android.util.Patterns.WEB_URL - */ -private val STRICT_WEB_URL_PATTERN = Pattern.compile("(((?:(?i:http|https|rtsp)://(?:(?:[a-zA-Z0-9\\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?(?:(([a-zA-Z0-9[ -\uD7FF豈-\uFDCFﷰ-\uFFEF\uD800\uDC00-\uD83F\uDFFD\uD840\uDC00-\uD87F\uDFFD\uD880\uDC00-\uD8BF\uDFFD\uD8C0\uDC00-\uD8FF\uDFFD\uD900\uDC00-\uD93F\uDFFD\uD940\uDC00-\uD97F\uDFFD\uD980\uDC00-\uD9BF\uDFFD\uD9C0\uDC00-\uD9FF\uDFFD\uDA00\uDC00-\uDA3F\uDFFD\uDA40\uDC00-\uDA7F\uDFFD\uDA80\uDC00-\uDABF\uDFFD\uDAC0\uDC00-\uDAFF\uDFFD\uDB00\uDC00-\uDB3F\uDFFD\uDB44\uDC00-\uDB7F\uDFFD&&[^ [ - ]\u2028\u2029  ]]](?:[a-zA-Z0-9[ -\uD7FF豈-\uFDCFﷰ-\uFFEF\uD800\uDC00-\uD83F\uDFFD\uD840\uDC00-\uD87F\uDFFD\uD880\uDC00-\uD8BF\uDFFD\uD8C0\uDC00-\uD8FF\uDFFD\uD900\uDC00-\uD93F\uDFFD\uD940\uDC00-\uD97F\uDFFD\uD980\uDC00-\uD9BF\uDFFD\uD9C0\uDC00-\uD9FF\uDFFD\uDA00\uDC00-\uDA3F\uDFFD\uDA40\uDC00-\uDA7F\uDFFD\uDA80\uDC00-\uDABF\uDFFD\uDAC0\uDC00-\uDAFF\uDFFD\uDB00\uDC00-\uDB3F\uDFFD\uDB44\uDC00-\uDB7F\uDFFD&&[^ [ - ]\u2028\u2029  ]]_\\-]{0,61}[a-zA-Z0-9[ -\uD7FF豈-\uFDCFﷰ-\uFFEF\uD800\uDC00-\uD83F\uDFFD\uD840\uDC00-\uD87F\uDFFD\uD880\uDC00-\uD8BF\uDFFD\uD8C0\uDC00-\uD8FF\uDFFD\uD900\uDC00-\uD93F\uDFFD\uD940\uDC00-\uD97F\uDFFD\uD980\uDC00-\uD9BF\uDFFD\uD9C0\uDC00-\uD9FF\uDFFD\uDA00\uDC00-\uDA3F\uDFFD\uDA40\uDC00-\uDA7F\uDFFD\uDA80\uDC00-\uDABF\uDFFD\uDAC0\uDC00-\uDAFF\uDFFD\uDB00\uDC00-\uDB3F\uDFFD\uDB44\uDC00-\uDB7F\uDFFD&&[^ [ - ]\u2028\u2029  ]]]){0,1}\\.)+(xn\\-\\-[\\w\\-]{0,58}\\w|[a-zA-Z[ 
-\uD7FF豈-\uFDCFﷰ-\uFFEF\uD800\uDC00-\uD83F\uDFFD\uD840\uDC00-\uD87F\uDFFD\uD880\uDC00-\uD8BF\uDFFD\uD8C0\uDC00-\uD8FF\uDFFD\uD900\uDC00-\uD93F\uDFFD\uD940\uDC00-\uD97F\uDFFD\uD980\uDC00-\uD9BF\uDFFD\uD9C0\uDC00-\uD9FF\uDFFD\uDA00\uDC00-\uDA3F\uDFFD\uDA40\uDC00-\uDA7F\uDFFD\uDA80\uDC00-\uDABF\uDFFD\uDAC0\uDC00-\uDAFF\uDFFD\uDB00\uDC00-\uDB3F\uDFFD\uDB44\uDC00-\uDB7F\uDFFD&&[^ [ - ]\u2028\u2029  ]]]{2,63})|((25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[0-9]))))(?:\\:\\d{1,5})?)([/\\?](?:(?:[a-zA-Z0-9[ -\uD7FF豈-\uFDCFﷰ-\uFFEF\uD800\uDC00-\uD83F\uDFFD\uD840\uDC00-\uD87F\uDFFD\uD880\uDC00-\uD8BF\uDFFD\uD8C0\uDC00-\uD8FF\uDFFD\uD900\uDC00-\uD93F\uDFFD\uD940\uDC00-\uD97F\uDFFD\uD980\uDC00-\uD9BF\uDFFD\uD9C0\uDC00-\uD9FF\uDFFD\uDA00\uDC00-\uDA3F\uDFFD\uDA40\uDC00-\uDA7F\uDFFD\uDA80\uDC00-\uDABF\uDFFD\uDAC0\uDC00-\uDAFF\uDFFD\uDB00\uDC00-\uDB3F\uDFFD\uDB44\uDC00-\uDB7F\uDFFD&&[^ [ - ]\u2028\u2029  ]];/\\?:@&=#~\\-\\.\\+!\\*'\\(\\),_\\\$])|(?:%[a-fA-F0-9]{2}))*)?(?:\\b|\$|^))") +import com.twitter.twittertext.Extractor private val spanClasses = listOf(ForegroundColorSpan::class.java, URLSpan::class.java) -private val finders = mapOf( - FoundMatchType.HTTP_URL to PatternFinder(':', HTTP_URL_REGEX, 5, Character::isWhitespace), - FoundMatchType.HTTPS_URL to PatternFinder(':', HTTPS_URL_REGEX, 6, Character::isWhitespace), - FoundMatchType.TAG to PatternFinder('#', TAG_REGEX, 1, ::isValidForTagPrefix), - // TODO: We also need a proper validator for mentions - FoundMatchType.MENTION to PatternFinder('@', MENTION_REGEX, 1, Character::isWhitespace), -) -private enum class FoundMatchType { - HTTP_URL, - HTTPS_URL, - TAG, - MENTION, -} - -private class FindCharsResult { - lateinit var matchType: FoundMatchType - var start: Int = -1 - var end: Int = -1 -} - -private class PatternFinder( - val searchCharacter: Char, - 
regex: String, - val searchPrefixWidth: Int, - val prefixValidator: (Int) -> Boolean, -) { - val pattern: Pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE) -} +private val extractor = Extractor().apply { isExtractURLWithoutProtocol = false } /** * Takes text containing mentions and hashtags and urls and makes them the given colour. @@ -75,18 +40,16 @@ fun highlightSpans(text: Spannable, colour: Int) { // Colour the mentions and hashtags. val string = text.toString() - val length = text.length - var start = 0 - var end = 0 - while (end in 0 until length && start >= 0) { - // Search for url first because it can contain the other characters - val found = findPattern(string, end) - start = found.start - end = found.end - if (start in 0 until end) { - text.setSpan(getSpan(found.matchType, string, colour, start, end), start, end, Spanned.SPAN_INCLUSIVE_EXCLUSIVE) - start += finders[found.matchType]!!.searchPrefixWidth + + val entities = extractor.extractEntitiesWithIndices(string) + + for (entity in entities) { + val span = when (entity.type) { + Extractor.Entity.Type.URL -> NoUnderlineURLSpan(string.substring(entity.start, entity.end)) + Extractor.Entity.Type.HASHTAG -> ForegroundColorSpan(colour) + Extractor.Entity.Type.MENTION -> MentionSpan(string.substring(entity.start, entity.end)) } + text.setSpan(span, entity.start, entity.end, Spanned.SPAN_INCLUSIVE_EXCLUSIVE) } } @@ -95,87 +58,3 @@ private fun clearSpans(text: Spannable, spanClass: Class) { text.removeSpan(span) } } - -private fun findPattern(string: String, fromIndex: Int): FindCharsResult { - val result = FindCharsResult() - for (i in fromIndex..string.lastIndex) { - val c = string[i] - for (matchType in FoundMatchType.entries) { - val finder = finders[matchType] - if (finder!!.searchCharacter == c && - ( - (i - fromIndex) < finder.searchPrefixWidth || - finder.prefixValidator(string.codePointAt(i - finder.searchPrefixWidth)) - ) - ) { - result.matchType = matchType - result.start = max(0, i - 
finder.searchPrefixWidth) - findEndOfPattern(string, result, finder.pattern) - if (result.start + finder.searchPrefixWidth <= i + 1 && // The found result is actually triggered by the correct search character - result.end >= result.start - ) { // ...and we actually found a valid result - return result - } - } - } - } - return result -} - -private fun findEndOfPattern(string: String, result: FindCharsResult, pattern: Pattern) { - val matcher = pattern.matcher(string) - if (matcher.find(result.start)) { - // Once we have API level 26+, we can use named captures... - val end = matcher.end() - result.start = matcher.start() - when (result.matchType) { - FoundMatchType.TAG -> { - if (isValidForTagPrefix(string.codePointAt(result.start))) { - if (string[result.start] != '#' || - (string[result.start] == '#' && string[result.start + 1] == '#') - ) { - ++result.start - } - } - } - else -> { - if (Character.isWhitespace(string.codePointAt(result.start))) { - ++result.start - } - } - } - when (result.matchType) { - FoundMatchType.HTTP_URL, FoundMatchType.HTTPS_URL -> { - // Preliminary url patterns are fast/permissive, now we'll do full validation - if (STRICT_WEB_URL_PATTERN.matcher(string.substring(result.start, end)).matches()) { - result.end = end - } - } - else -> result.end = end - } - } -} - -private fun getSpan(matchType: FoundMatchType, string: String, colour: Int, start: Int, end: Int): CharacterStyle { - return when (matchType) { - FoundMatchType.HTTP_URL -> NoUnderlineURLSpan(string.substring(start, end)) - FoundMatchType.HTTPS_URL -> NoUnderlineURLSpan(string.substring(start, end)) - FoundMatchType.MENTION -> MentionSpan(string.substring(start, end)) - else -> ForegroundColorSpan(colour) - } -} - -private fun isWordCharacters(codePoint: Int): Boolean { - return (codePoint in 0x30..0x39) || // [0-9] - (codePoint in 0x41..0x5a) || // [A-Z] - (codePoint == 0x5f) || // _ - (codePoint in 0x61..0x7a) // [a-z] -} - -private fun isValidForTagPrefix(codePoint: Int): 
Boolean { - return !( - isWordCharacters(codePoint) || // \w - (codePoint == 0x2f) || // / - (codePoint == 0x29) - ) // ) -} diff --git a/app/src/test/java/app/pachli/SpanUtilsTest.kt b/app/src/test/java/app/pachli/SpanUtilsTest.kt index 0888e4f80..f06da2201 100644 --- a/app/src/test/java/app/pachli/SpanUtilsTest.kt +++ b/app/src/test/java/app/pachli/SpanUtilsTest.kt @@ -4,9 +4,11 @@ import app.pachli.core.testing.fakes.FakeSpannable import app.pachli.util.highlightSpans import org.junit.Assert import org.junit.Test +import org.junit.experimental.runners.Enclosed import org.junit.runner.RunWith import org.junit.runners.Parameterized +@RunWith(Enclosed::class) class SpanUtilsTest { @Test fun matchesMixedSpans() { @@ -19,8 +21,8 @@ class SpanUtilsTest { @Test fun doesntMergeAdjacentURLs() { - val firstURL = "http://first.thing" - val secondURL = "https://second.thing" + val firstURL = "http://first.bar" + val secondURL = "https://second.bar" val inputSpannable = FakeSpannable("$firstURL $secondURL") highlightSpans(inputSpannable, 0xffffff) val spans = inputSpannable.spans @@ -71,14 +73,6 @@ class SpanUtilsTest { Assert.assertTrue(spans.isEmpty()) } - @Test - fun doesNotMatchSpanEmbeddedInAnotherSpan() { - val inputSpannable = FakeSpannable("@aa${thingToHighlight}aa") - highlightSpans(inputSpannable, 0xffffff) - val spans = inputSpannable.spans - Assert.assertEquals(1, spans.size) - } - @Test fun spansDoNotOverlap() { val begin = "@begin" diff --git a/app/src/test/java/app/pachli/components/compose/ComposeActivityTest.kt b/app/src/test/java/app/pachli/components/compose/ComposeActivityTest.kt index 6c9f2606b..a122b68b5 100644 --- a/app/src/test/java/app/pachli/components/compose/ComposeActivityTest.kt +++ b/app/src/test/java/app/pachli/components/compose/ComposeActivityTest.kt @@ -288,7 +288,7 @@ class ComposeActivityTest { @Test fun whenTextContainsUrl_onlyEllipsizedURLIsCounted() { - val url = 
"https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM:" + val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM%3A" val additionalContent = "Check out this @image #search result: " rule.launch() rule.getScenario().onActivity { @@ -303,7 +303,7 @@ class ComposeActivityTest { @Test fun whenTextContainsShortUrls_allUrlsGetEllipsized() { val shortUrl = "https://pachli.app" - val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM:" + val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM%3A" val additionalContent = " Check out this @image #search result: " rule.launch() rule.getScenario().onActivity { @@ -317,7 +317,7 @@ class ComposeActivityTest { @Test fun whenTextContainsMultipleURLs_allURLsGetEllipsized() { - val url = 
"https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM:" + val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM%3A" val additionalContent = " Check out this @image #search result: " rule.launch() rule.getScenario().onActivity { @@ -331,7 +331,7 @@ class ComposeActivityTest { @Test fun whenTextContainsUrl_onlyEllipsizedURLIsCounted_withCustomConfiguration() { - val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM:" + val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM%3A" val additionalContent = "Check out this @image #search result: " val customUrlLength = 16 getInstanceCallback = { getInstanceWithCustomConfiguration(configuration = getCustomInstanceConfiguration(charactersReservedPerUrl = customUrlLength)) } @@ -348,7 +348,7 @@ class ComposeActivityTest { @Test fun whenTextContainsShortUrls_allUrlsGetEllipsized_withCustomConfiguration() { val shortUrl = "https://pachli.app" - val 
url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM:" + val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM%3A" val additionalContent = " Check out this @image #search result: " val customUrlLength = 18 // The intention is that this is longer than shortUrl.length getInstanceCallback = { getInstanceWithCustomConfiguration(configuration = getCustomInstanceConfiguration(charactersReservedPerUrl = customUrlLength)) } @@ -364,7 +364,7 @@ class ComposeActivityTest { @Test fun whenTextContainsMultipleURLs_allURLsGetEllipsized_withCustomConfiguration() { - val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM:" + val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM%3A" val additionalContent = " Check out this @image #search result: " val customUrlLength = 16 getInstanceCallback = { getInstanceWithCustomConfiguration(configuration = 
getCustomInstanceConfiguration(charactersReservedPerUrl = customUrlLength)) } diff --git a/app/src/test/java/app/pachli/components/compose/StatusLengthTest.kt b/app/src/test/java/app/pachli/components/compose/StatusLengthTest.kt index 4f39f8bbf..bf047fcff 100644 --- a/app/src/test/java/app/pachli/components/compose/StatusLengthTest.kt +++ b/app/src/test/java/app/pachli/components/compose/StatusLengthTest.kt @@ -40,9 +40,12 @@ class StatusLengthTest( // "@user@server" should be treated as "@user" arrayOf("123 @example@example.org", 12), // URLs under 23 chars are treated as 23 chars - arrayOf("123 http://example.url", 27), + arrayOf("123 http://example.org", 27), // URLs over 23 chars are treated as 23 chars arrayOf("123 http://urlthatislongerthan23characters.example.org", 27), + // URLs end when they should (the ")." should be part of the status + // length, not considered to be part of the URL) + arrayOf("test (https://example.com). test", 36), // Short hashtags are treated as is arrayOf("123 #basictag", 13), // Long hashtags are *also* treated as is (not treated as 23, like URLs) diff --git a/core/ui/src/main/kotlin/com/twitter/twittertext/Extractor.kt b/core/ui/src/main/kotlin/com/twitter/twittertext/Extractor.kt new file mode 100644 index 000000000..ef46a7056 --- /dev/null +++ b/core/ui/src/main/kotlin/com/twitter/twittertext/Extractor.kt @@ -0,0 +1,325 @@ +// Copyright 2018 Twitter, Inc. +// Licensed under the Apache License, Version 2.0 +// http://www.apache.org/licenses/LICENSE-2.0 + +package com.twitter.twittertext + +import java.net.IDN +import java.util.regex.Matcher + +/** + * A class to extract usernames, hashtags and URLs from Mastodon text. 
+ */
+open class Extractor {
+    data class Entity(
+        var start: Int,
+        var end: Int,
+        val value: String,
+        val type: Type,
+    ) {
+        enum class Type {
+            URL,
+            HASHTAG,
+            MENTION,
+        }
+
+        @JvmOverloads
+        constructor(matcher: Matcher, type: Type, groupNumber: Int, startOffset: Int = -1) : this(
+            matcher.start(groupNumber) + startOffset, // default -1: presumably pulls in the sigil before the group
+            matcher.end(groupNumber),
+            matcher.group(groupNumber)!!,
+            type,
+        )
+    }
+
+    var isExtractURLWithoutProtocol = true // when false, URL matches lacking a protocol are skipped
+
+    private fun removeOverlappingEntities(entities: MutableList<Entity>) {
+        // Stable sort by start index (no subtraction-based comparator).
+        entities.sortBy { it.start }
+
+        // Remove overlapping entities.
+        // Two entities overlap only when one is URL and the other is hashtag/mention
+        // which is a part of the URL. When it happens, we choose URL over hashtag/mention
+        // by selecting the one with smaller start index.
+        if (entities.isNotEmpty()) {
+            val it = entities.iterator()
+            var prev = it.next()
+            while (it.hasNext()) {
+                val cur = it.next()
+                if (prev.end > cur.start) {
+                    it.remove()
+                } else {
+                    prev = cur
+                }
+            }
+        }
+    }
+
+    /**
+     * Extract URLs, #hashtags and @mentions (including @username/list references) from [text].
+     *
+     * @param text the text to scan
+     * @return the extracted entities, sorted by start index with overlapping entities removed
+     */
+    fun extractEntitiesWithIndices(text: String): List<Entity> = buildList {
+        addAll(extractURLsWithIndices(text))
+        addAll(extractHashtagsWithIndices(text, false))
+        addAll(extractMentionsOrListsWithIndices(text))
+        removeOverlappingEntities(this)
+    }
+
+    /**
+     * Extract @username and an optional list reference from Tweet text. A mention is an occurrence
+     * of @username anywhere in a Tweet. A mention with a list is a @username/list.
+     *
+     * @param text of the tweet from which to extract usernames
+     * @return List of usernames (without the leading @ sign) and an optional lists referenced
+     */
+    private fun extractMentionsOrListsWithIndices(text: String): List<Entity> {
+        if (text.isEmpty()) return emptyList()
+
+        // Performance optimization.
+        // If text contains neither '@' nor its fullwidth form U+FF20, the text doesn't
+        // contain @mention. So we can simply return an empty list.
+        var found = false
+        for (c in text.toCharArray()) {
+            if (c == '@' || c == '\uFF20') {
+                found = true
+                break
+            }
+        }
+        if (!found) {
+            return emptyList()
+        }
+        val extracted: MutableList<Entity> = ArrayList()
+        val matcher: Matcher = Regex.VALID_MENTION_OR_LIST.matcher(text)
+        while (matcher.find()) {
+            val after = text.substring(matcher.end())
+            if (!Regex.INVALID_MENTION_MATCH_END.matcher(after).find()) {
+                if (matcher.group(Regex.VALID_MENTION_OR_LIST_GROUP_LIST) == null) {
+                    extracted.add(
+                        Entity(
+                            matcher,
+                            Entity.Type.MENTION,
+                            Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME,
+                        ),
+                    )
+                } else {
+                    extracted.add(
+                        Entity(
+                            matcher.start(Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME) - 1,
+                            matcher.end(Regex.VALID_MENTION_OR_LIST_GROUP_LIST),
+                            matcher.group(Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME),
+                            Entity.Type.MENTION,
+                        ),
+                    )
+                }
+            }
+        }
+        return extracted
+    }
+
+    /**
+     * Extract URL references from Tweet text.
+     *
+     * @param text of the tweet from which to extract URLs
+     * @return List of URLs referenced.
+     */
+    private fun extractURLsWithIndices(text: String?): List<Entity> {
+        if (text.isNullOrEmpty() ||
+            (if (isExtractURLWithoutProtocol) text.indexOf('.') else text.indexOf(':')) == -1
+        ) {
+            // Performance optimization.
+            // If text doesn't contain '.' or ':' at all, text doesn't contain URL,
+            // so we can simply return an empty list.
+            return emptyList()
+        }
+        val urls: MutableList<Entity> = ArrayList()
+        val matcher: Matcher = Regex.VALID_URL.matcher(text)
+        while (matcher.find()) {
+            val protocol = matcher.group(Regex.VALID_URL_GROUP_PROTOCOL)
+            if (protocol.isNullOrEmpty()) {
+                // skip if protocol is not present and 'extractURLWithoutProtocol' is false
+                // or URL is preceded by invalid character.
+                if (!isExtractURLWithoutProtocol ||
+                    Regex.INVALID_URL_WITHOUT_PROTOCOL_MATCH_BEGIN
+                        .matcher(matcher.group(Regex.VALID_URL_GROUP_BEFORE))
+                        .matches()
+                ) {
+                    continue
+                }
+            }
+            val url = matcher.group(Regex.VALID_URL_GROUP_URL)
+            val start = matcher.start(Regex.VALID_URL_GROUP_URL)
+            val end = matcher.end(Regex.VALID_URL_GROUP_URL)
+            val host = matcher.group(Regex.VALID_URL_GROUP_DOMAIN)
+            if (isValidHostAndLength(url.length, protocol, host)) {
+                urls.add(Entity(start, end, url, Entity.Type.URL))
+            }
+        }
+        return urls
+    }
+
+    /**
+     * Extract #hashtag references from Tweet text.
+     *
+     * @param text of the tweet from which to extract hashtags
+     * @param checkUrlOverlap if true, check if extracted hashtags overlap URLs and
+     * remove overlapping ones
+     * @return List of hashtags referenced (without the leading # sign)
+     */
+    private fun extractHashtagsWithIndices(text: String, checkUrlOverlap: Boolean): List<Entity> {
+        if (text.isEmpty()) return emptyList()
+
+        // Performance optimization.
+        // If text contains neither '#' nor its fullwidth form U+FF03, text doesn't contain
+        // hashtag, so we can simply return an empty list.
+        var found = false
+        for (c in text.toCharArray()) {
+            if (c == '#' || c == '\uFF03') {
+                found = true
+                break
+            }
+        }
+        if (!found) {
+            return emptyList()
+        }
+        val extracted: MutableList<Entity> = ArrayList()
+        val matcher: Matcher = Regex.VALID_HASHTAG.matcher(text)
+        while (matcher.find()) {
+            val after = text.substring(matcher.end())
+            if (!Regex.INVALID_HASHTAG_MATCH_END.matcher(after).find()) {
+                extracted.add(
+                    Entity(
+                        matcher,
+                        Entity.Type.HASHTAG,
+                        Regex.VALID_HASHTAG_GROUP_TAG,
+                    ),
+                )
+            }
+        }
+        if (checkUrlOverlap) {
+            // extract URLs
+            val urls = extractURLsWithIndices(text)
+            if (urls.isNotEmpty()) {
+                extracted.addAll(urls)
+                // remove overlap
+                removeOverlappingEntities(extracted)
+                // remove URL entities
+                val it = extracted.iterator()
+                while (it.hasNext()) {
+                    val entity = it.next()
+                    if (entity.type != Entity.Type.HASHTAG) {
+                        it.remove()
+                    }
+                }
+            }
+        }
+        return extracted
+    }
+
+    /**
+     * An efficient converter of indices between code points and code units.
+     */
+    private class IndexConverter(val text: String) {
+        // Keep track of a single corresponding pair of code unit and code point
+        // offsets so that we can re-use counting work if the next requested
+        // entity is near the most recent entity.
+        private var codePointIndex = 0
+        private var charIndex = 0
+
+        /**
+         * Converts code units to code points
+         *
+         * @param charIndex Index into the string measured in code units.
+         * @return The code point index that corresponds to the specified character index.
+         */
+        fun codeUnitsToCodePoints(charIndex: Int): Int {
+            if (charIndex < this.charIndex) {
+                codePointIndex -= text.codePointCount(charIndex, this.charIndex)
+            } else {
+                codePointIndex += text.codePointCount(this.charIndex, charIndex)
+            }
+            this.charIndex = charIndex
+
+            // Make sure that charIndex never points to the second code unit of a
+            // surrogate pair.
+            if (charIndex > 0 && Character.isSupplementaryCodePoint(text.codePointAt(charIndex - 1))) {
+                this.charIndex -= 1
+            }
+            return codePointIndex
+        }
+
+        /**
+         * Converts code points to code units
+         *
+         * @param codePointIndex Index into the string measured in code points.
+         * @return the code unit index that corresponds to the specified code point index.
+         */
+        fun codePointsToCodeUnits(codePointIndex: Int): Int {
+            // Note that offsetByCodePoints accepts negative indices.
+            charIndex = text.offsetByCodePoints(charIndex, codePointIndex - this.codePointIndex)
+            this.codePointIndex = codePointIndex
+            return charIndex
+        }
+    }
+
+    companion object {
+        /**
+         * The maximum url length that the Twitter backend supports.
+         */
+        const val MAX_URL_LENGTH = 4096
+
+        /**
+         * The backend adds http:// for normal links and https to *.twitter.com URLs
+         * (it also rewrites http to https for URLs matching *.twitter.com).
+         * We're better off adding https:// all the time. By making the assumption that
+         * URL_GROUP_PROTOCOL_LENGTH is https, the trade off is we'll disallow a http URL
+         * that is 4096 characters.
+         */
+        private const val URL_GROUP_PROTOCOL_LENGTH = "https://".length
+
+        /**
+         * Verifies that the host name adheres to RFC 3490 and 1035
+         * Also, verifies that the entire url (including protocol) doesn't exceed MAX_URL_LENGTH
+         *
+         * @param originalUrlLength The length of the entire URL, including protocol if any
+         * @param protocol The protocol used
+         * @param originalHost The hostname to check validity of
+         * @return true if the host is valid
+         */
+        fun isValidHostAndLength(
+            originalUrlLength: Int,
+            protocol: String?,
+            originalHost: String?,
+        ): Boolean {
+            if (originalHost.isNullOrEmpty()) {
+                return false
+            }
+            val originalHostLength = originalHost.length
+            val host: String = try {
+                // Use IDN for all host names, if the host is all ASCII, it returns unchanged.
+                // It comes with an added benefit of checking host length to be between 1 and 63 characters.
+                IDN.toASCII(originalHost, IDN.ALLOW_UNASSIGNED)
+                // toASCII can throw IndexOutOfBoundsException when the domain name is longer than
+                // 256 characters, instead of the documented IllegalArgumentException.
+            } catch (e: IllegalArgumentException) {
+                return false
+            } catch (e: IndexOutOfBoundsException) {
+                return false
+            }
+            val punycodeEncodedHostLength = host.length
+            if (punycodeEncodedHostLength == 0) {
+                return false
+            }
+            // The punycodeEncoded host length might be different now, offset that length from the URL.
+            val urlLength = originalUrlLength + punycodeEncodedHostLength - originalHostLength
+            // Add the protocol to our length check, if there isn't one,
+            // to ensure it doesn't go over the limit.
+            val urlLengthWithProtocol =
+                urlLength + if (protocol == null) URL_GROUP_PROTOCOL_LENGTH else 0
+            return urlLengthWithProtocol <= MAX_URL_LENGTH
+        }
+    }
+}
diff --git a/core/ui/src/main/kotlin/com/twitter/twittertext/Regex.kt b/core/ui/src/main/kotlin/com/twitter/twittertext/Regex.kt
new file mode 100644
index 000000000..0d65b4f6b
--- /dev/null
+++ b/core/ui/src/main/kotlin/com/twitter/twittertext/Regex.kt
@@ -0,0 +1,296 @@
+// Copyright 2018 Twitter, Inc.
+// Licensed under the Apache License, Version 2.0 +// http://www.apache.org/licenses/LICENSE-2.0 + +package com.twitter.twittertext + +import java.util.regex.Pattern + +object Regex { + private val URL_VALID_GTLD = "(?:(?:" + + join(TldLists.GTLDS) + + ")(?=[^a-z0-9@+-]|$))" + private val URL_VALID_CCTLD = "(?:(?:" + + join(TldLists.CTLDS) + + ")(?=[^a-z0-9@+-]|$))" + private const val INVALID_CHARACTERS = "\\uFFFE" + // BOM + "\\uFEFF" + // BOM + "\\uFFFF" // Special + private const val DIRECTIONAL_CHARACTERS = "\\u061C" + // ARABIC LETTER MARK (ALM) + "\\u200E" + // LEFT-TO-RIGHT MARK (LRM) + "\\u200F" + // RIGHT-TO-LEFT MARK (RLM) + "\\u202A" + // LEFT-TO-RIGHT EMBEDDING (LRE) + "\\u202B" + // RIGHT-TO-LEFT EMBEDDING (RLE) + "\\u202C" + // POP DIRECTIONAL FORMATTING (PDF) + "\\u202D" + // LEFT-TO-RIGHT OVERRIDE (LRO) + "\\u202E" + // RIGHT-TO-LEFT OVERRIDE (RLO) + "\\u2066" + // LEFT-TO-RIGHT ISOLATE (LRI) + "\\u2067" + // RIGHT-TO-LEFT ISOLATE (RLI) + "\\u2068" + // FIRST STRONG ISOLATE (FSI) + "\\u2069" // POP DIRECTIONAL ISOLATE (PDI) + private const val UNICODE_SPACES = "[" + + "\\u0009-\\u000d" + // # White_Space # Cc [5] .. 
+ "\\u0020" + // White_Space # Zs SPACE + "\\u0085" + // White_Space # Cc + "\\u00a0" + // White_Space # Zs NO-BREAK SPACE + "\\u1680" + // White_Space # Zs OGHAM SPACE MARK + "\\u180E" + // White_Space # Zs MONGOLIAN VOWEL SEPARATOR + "\\u2000-\\u200a" + // # White_Space # Zs [11] EN QUAD..HAIR SPACE + "\\u2028" + // White_Space # Zl LINE SEPARATOR + "\\u2029" + // White_Space # Zp PARAGRAPH SEPARATOR + "\\u202F" + // White_Space # Zs NARROW NO-BREAK SPACE + "\\u205F" + // White_Space # Zs MEDIUM MATHEMATICAL SPACE + "\\u3000" + // White_Space # Zs IDEOGRAPHIC SPACE + "]" + private const val LATIN_ACCENTS_CHARS = // Latin-1 + "\\u00c0-\\u00d6\\u00d8-\\u00f6\\u00f8-\\u00ff" + // Latin Extended A and B + "\\u0100-\\u024f" + // IPA Extensions + "\\u0253\\u0254\\u0256\\u0257\\u0259\\u025b\\u0263\\u0268\\u026f\\u0272\\u0289\\u028b" + // Hawaiian + "\\u02bb" + // Combining diacritics + "\\u0300-\\u036f" + // Latin Extended Additional (mostly for Vietnamese) + "\\u1e00-\\u1eff" + private const val CYRILLIC_CHARS = "\\u0400-\\u04ff" + + // Generated from unicode_regex/unicode_regex_groups.scala, more inclusive than Java's \p{L}\p{M} + private const val HASHTAG_LETTERS_AND_MARKS = "\\p{L}\\p{M}" + + "\\u037f\\u0528-\\u052f\\u08a0-\\u08b2\\u08e4-\\u08ff\\u0978\\u0980\\u0c00\\u0c34\\u0c81" + + "\\u0d01\\u0ede\\u0edf\\u10c7\\u10cd\\u10fd-\\u10ff\\u16f1-\\u16f8\\u17b4\\u17b5\\u191d" + + "\\u191e\\u1ab0-\\u1abe\\u1bab-\\u1bad\\u1bba-\\u1bbf\\u1cf3-\\u1cf6\\u1cf8\\u1cf9" + + "\\u1de7-\\u1df5\\u2cf2\\u2cf3\\u2d27\\u2d2d\\u2d66\\u2d67\\u9fcc\\ua674-\\ua67b\\ua698" + + "-\\ua69d\\ua69f\\ua792-\\ua79f\\ua7aa-\\ua7ad\\ua7b0\\ua7b1\\ua7f7-\\ua7f9\\ua9e0-" + + "\\ua9ef\\ua9fa-\\ua9fe\\uaa7c-\\uaa7f\\uaae0-\\uaaef\\uaaf2-\\uaaf6\\uab30-\\uab5a" + + "\\uab5c-\\uab5f\\uab64\\uab65\\uf870-\\uf87f\\uf882\\uf884-\\uf89f\\uf8b8\\uf8c1-" + + "\\uf8d6\\ufa2e\\ufa2f\\ufe27-\\ufe2d\\ud800\\udee0\\ud800\\udf1f\\ud800\\udf50-\\ud800" + + 
"\\udf7a\\ud801\\udd00-\\ud801\\udd27\\ud801\\udd30-\\ud801\\udd63\\ud801\\ude00-\\ud801" + + "\\udf36\\ud801\\udf40-\\ud801\\udf55\\ud801\\udf60-\\ud801\\udf67\\ud802\\udc60-\\ud802" + + "\\udc76\\ud802\\udc80-\\ud802\\udc9e\\ud802\\udd80-\\ud802\\uddb7\\ud802\\uddbe\\ud802" + + "\\uddbf\\ud802\\ude80-\\ud802\\ude9c\\ud802\\udec0-\\ud802\\udec7\\ud802\\udec9-\\ud802" + + "\\udee6\\ud802\\udf80-\\ud802\\udf91\\ud804\\udc7f\\ud804\\udcd0-\\ud804\\udce8\\ud804" + + "\\udd00-\\ud804\\udd34\\ud804\\udd50-\\ud804\\udd73\\ud804\\udd76\\ud804\\udd80-\\ud804" + + "\\uddc4\\ud804\\uddda\\ud804\\ude00-\\ud804\\ude11\\ud804\\ude13-\\ud804\\ude37\\ud804" + + "\\udeb0-\\ud804\\udeea\\ud804\\udf01-\\ud804\\udf03\\ud804\\udf05-\\ud804\\udf0c\\ud804" + + "\\udf0f\\ud804\\udf10\\ud804\\udf13-\\ud804\\udf28\\ud804\\udf2a-\\ud804\\udf30\\ud804" + + "\\udf32\\ud804\\udf33\\ud804\\udf35-\\ud804\\udf39\\ud804\\udf3c-\\ud804\\udf44\\ud804" + + "\\udf47\\ud804\\udf48\\ud804\\udf4b-\\ud804\\udf4d\\ud804\\udf57\\ud804\\udf5d-\\ud804" + + "\\udf63\\ud804\\udf66-\\ud804\\udf6c\\ud804\\udf70-\\ud804\\udf74\\ud805\\udc80-\\ud805" + + "\\udcc5\\ud805\\udcc7\\ud805\\udd80-\\ud805\\uddb5\\ud805\\uddb8-\\ud805\\uddc0\\ud805" + + "\\ude00-\\ud805\\ude40\\ud805\\ude44\\ud805\\ude80-\\ud805\\udeb7\\ud806\\udca0-\\ud806" + + "\\udcdf\\ud806\\udcff\\ud806\\udec0-\\ud806\\udef8\\ud808\\udf6f-\\ud808\\udf98\\ud81a" + + "\\ude40-\\ud81a\\ude5e\\ud81a\\uded0-\\ud81a\\udeed\\ud81a\\udef0-\\ud81a\\udef4\\ud81a" + + "\\udf00-\\ud81a\\udf36\\ud81a\\udf40-\\ud81a\\udf43\\ud81a\\udf63-\\ud81a\\udf77\\ud81a" + + "\\udf7d-\\ud81a\\udf8f\\ud81b\\udf00-\\ud81b\\udf44\\ud81b\\udf50-\\ud81b\\udf7e\\ud81b" + + "\\udf8f-\\ud81b\\udf9f\\ud82f\\udc00-\\ud82f\\udc6a\\ud82f\\udc70-\\ud82f\\udc7c\\ud82f" + + "\\udc80-\\ud82f\\udc88\\ud82f\\udc90-\\ud82f\\udc99\\ud82f\\udc9d\\ud82f\\udc9e\\ud83a" + + "\\udc00-\\ud83a\\udcc4\\ud83a\\udcd0-\\ud83a\\udcd6\\ud83b\\ude00-\\ud83b\\ude03\\ud83b" + + 
"\\ude05-\\ud83b\\ude1f\\ud83b\\ude21\\ud83b\\ude22\\ud83b\\ude24\\ud83b\\ude27\\ud83b" + + "\\ude29-\\ud83b\\ude32\\ud83b\\ude34-\\ud83b\\ude37\\ud83b\\ude39\\ud83b\\ude3b\\ud83b" + + "\\ude42\\ud83b\\ude47\\ud83b\\ude49\\ud83b\\ude4b\\ud83b\\ude4d-\\ud83b\\ude4f\\ud83b" + + "\\ude51\\ud83b\\ude52\\ud83b\\ude54\\ud83b\\ude57\\ud83b\\ude59\\ud83b\\ude5b\\ud83b" + + "\\ude5d\\ud83b\\ude5f\\ud83b\\ude61\\ud83b\\ude62\\ud83b\\ude64\\ud83b\\ude67-\\ud83b" + + "\\ude6a\\ud83b\\ude6c-\\ud83b\\ude72\\ud83b\\ude74-\\ud83b\\ude77\\ud83b\\ude79-\\ud83b" + + "\\ude7c\\ud83b\\ude7e\\ud83b\\ude80-\\ud83b\\ude89\\ud83b\\ude8b-\\ud83b\\ude9b\\ud83b" + + "\\udea1-\\ud83b\\udea3\\ud83b\\udea5-\\ud83b\\udea9\\ud83b\\udeab-\\ud83b\\udebb" + + // Generated from unicode_regex/unicode_regex_groups.scala, more inclusive than Java's \p{Nd} + private const val HASHTAG_NUMERALS = "\\p{Nd}" + + "\\u0de6-\\u0def\\ua9f0-\\ua9f9\\ud804\\udcf0-\\ud804\\udcf9\\ud804\\udd36-\\ud804" + + "\\udd3f\\ud804\\uddd0-\\ud804\\uddd9\\ud804\\udef0-\\ud804\\udef9\\ud805\\udcd0-\\ud805" + + "\\udcd9\\ud805\\ude50-\\ud805\\ude59\\ud805\\udec0-\\ud805\\udec9\\ud806\\udce0-\\ud806" + + "\\udce9\\ud81a\\ude60-\\ud81a\\ude69\\ud81a\\udf50-\\ud81a\\udf59" + private const val HASHTAG_SPECIAL_CHARS = "_" + // underscore + "\\u200c" + // ZERO WIDTH NON-JOINER (ZWNJ) + "\\u200d" + // ZERO WIDTH JOINER (ZWJ) + "\\ua67e" + // CYRILLIC KAVYKA + "\\u05be" + // HEBREW PUNCTUATION MAQAF + "\\u05f3" + // HEBREW PUNCTUATION GERESH + "\\u05f4" + // HEBREW PUNCTUATION GERSHAYIM + "\\uff5e" + // FULLWIDTH TILDE + "\\u301c" + // WAVE DASH + "\\u309b" + // KATAKANA-HIRAGANA VOICED SOUND MARK + "\\u309c" + // KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK + "\\u30a0" + // KATAKANA-HIRAGANA DOUBLE HYPHEN + "\\u30fb" + // KATAKANA MIDDLE DOT + "\\u3003" + // DITTO MARK + "\\u0f0b" + // TIBETAN MARK INTERSYLLABIC TSHEG + "\\u0f0c" + // TIBETAN MARK DELIMITER TSHEG BSTAR + "\\u00b7" // MIDDLE DOT + private const val HASHTAG_LETTERS_NUMERALS = 
+ HASHTAG_LETTERS_AND_MARKS + HASHTAG_NUMERALS + HASHTAG_SPECIAL_CHARS + private const val HASHTAG_LETTERS_SET = "[$HASHTAG_LETTERS_AND_MARKS]" + private const val HASHTAG_LETTERS_NUMERALS_SET = "[$HASHTAG_LETTERS_NUMERALS]" + + /* URL related hash regex collection */ + private const val URL_VALID_PRECEDING_CHARS = + "(?:[^a-z0-9@@$##$INVALID_CHARACTERS]|[$DIRECTIONAL_CHARACTERS]|^)" + private const val URL_VALID_CHARS = "[a-z0-9$LATIN_ACCENTS_CHARS]" + private const val URL_VALID_SUBDOMAIN = + "(?>(?:$URL_VALID_CHARS[$URL_VALID_CHARS\\-_]*)?$URL_VALID_CHARS\\.)" + private const val URL_VALID_DOMAIN_NAME = + "(?:(?:$URL_VALID_CHARS[$URL_VALID_CHARS\\-]*)?$URL_VALID_CHARS\\.)" + private const val PUNCTUATION_CHARS = "-_!\"#$%&'\\(\\)*+,./:;<=>?@\\[\\]^`\\{|}~" + + // Any non-space, non-punctuation characters. + // \p{Z} = any kind of whitespace or invisible separator. + private const val URL_VALID_UNICODE_CHARS = + "[^$PUNCTUATION_CHARS\\s\\p{Z}\\p{InGeneralPunctuation}]" + private const val URL_VALID_UNICODE_DOMAIN_NAME = + "(?:(?:" + URL_VALID_UNICODE_CHARS + "[" + URL_VALID_UNICODE_CHARS + "\\-]*)?" + + URL_VALID_UNICODE_CHARS + "\\.)" + private const val URL_PUNYCODE = "(?:xn--[-0-9a-z]+)" + private val URL_VALID_DOMAIN = "(?:" + // optional sub-domain + domain + TLD + URL_VALID_SUBDOMAIN + "*" + URL_VALID_DOMAIN_NAME + // e.g. twitter.com, foo.co.jp ... + "(?:" + URL_VALID_GTLD + "|" + URL_VALID_CCTLD + "|" + URL_PUNYCODE + ")" + + ")" + + "|(?:" + "(?<=https?://)" + + "(?:" + + "(?:" + URL_VALID_DOMAIN_NAME + URL_VALID_CCTLD + ")" + // protocol + domain + ccTLD + "|(?:" + + URL_VALID_UNICODE_DOMAIN_NAME + // protocol + unicode domain + TLD + "(?:" + URL_VALID_GTLD + "|" + URL_VALID_CCTLD + ")" + + ")" + + ")" + + ")" + + "|(?:" + // domain + ccTLD + '/' + URL_VALID_DOMAIN_NAME + URL_VALID_CCTLD + "(?=/)" + // e.g. 
t.co/ + ")" + private const val URL_VALID_PORT_NUMBER = "[0-9]++" + private const val URL_VALID_GENERAL_PATH_CHARS = + "[a-z0-9!\\*';:=\\+,.\\$/%#\\[\\]\\-\\u2013_~\\|&@" + + LATIN_ACCENTS_CHARS + CYRILLIC_CHARS + "]" + + /** + * Allow URL paths to contain up to two nested levels of balanced parens + * 1. Used in Wikipedia URLs like /Primer_(film) + * 2. Used in IIS sessions like /S(dfd346)/ + * 3. Used in Rdio URLs like /track/We_Up_(Album_Version_(Edited))/ + */ + private const val URL_BALANCED_PARENS = "\\(" + + "(?:" + + URL_VALID_GENERAL_PATH_CHARS + "+" + + "|" + // allow one nested level of balanced parentheses + "(?:" + + URL_VALID_GENERAL_PATH_CHARS + "*" + + "\\(" + + URL_VALID_GENERAL_PATH_CHARS + "+" + + "\\)" + + URL_VALID_GENERAL_PATH_CHARS + "*" + + ")" + + ")" + + "\\)" + + /** + * Valid end-of-path characters (so /foo. does not gobble the period). + * 2. Allow =&# for empty URL parameters and other URL-join artifacts + */ + private const val URL_VALID_PATH_ENDING_CHARS = + "[a-z0-9=_#/\\-\\+" + LATIN_ACCENTS_CHARS + CYRILLIC_CHARS + "]|(?:" + + URL_BALANCED_PARENS + ")" + private const val URL_VALID_PATH = "(?:" + + "(?:" + + URL_VALID_GENERAL_PATH_CHARS + "*" + + "(?:" + URL_BALANCED_PARENS + URL_VALID_GENERAL_PATH_CHARS + "*)*" + + URL_VALID_PATH_ENDING_CHARS + + ")|(?:@" + URL_VALID_GENERAL_PATH_CHARS + "+/)" + + ")" + private const val URL_VALID_URL_QUERY_CHARS = + "[a-z0-9!?\\*'\\(\\);:&=\\+\\$/%#\\[\\]\\-_\\.,~\\|@]" + private const val URL_VALID_URL_QUERY_ENDING_CHARS = "[a-z0-9\\-_&=#/]" + private val VALID_URL_PATTERN_STRING = "(" + // $1 total match + "(" + URL_VALID_PRECEDING_CHARS + ")" + // $2 Preceding character + "(" + // $3 URL + "(https?://)?" + // $4 Protocol (optional) + "(" + URL_VALID_DOMAIN + ")" + // $5 Domain(s) + "(?::(" + URL_VALID_PORT_NUMBER + "))?" + // $6 Port number (optional) + "(/" + + URL_VALID_PATH + "*+" + + ")?" + // $7 URL Path and anchor + "(\\?" 
+ URL_VALID_URL_QUERY_CHARS + "*" + // $8 Query String + URL_VALID_URL_QUERY_ENDING_CHARS + ")?" + + ")" + + ")" + private const val AT_SIGNS_CHARS = "@\uFF20" + + /* Begin public constants */ + private val INVALID_CHARACTERS_PATTERN: Pattern + val VALID_HASHTAG: Pattern + const val VALID_HASHTAG_GROUP_TAG = 1 + val INVALID_HASHTAG_MATCH_END: Pattern + private val RTL_CHARACTERS: Pattern + private val AT_SIGNS: Pattern + val VALID_MENTION_OR_LIST: Pattern + const val VALID_MENTION_OR_LIST_GROUP_USERNAME = 3 + const val VALID_MENTION_OR_LIST_GROUP_LIST = 4 + private val VALID_REPLY: Pattern + val INVALID_MENTION_MATCH_END: Pattern + + /** + * Regex to extract URL (it also includes the text preceding the url). + * + * This regex does not reflect its name and [Regex.VALID_URL_GROUP_URL] match + * should be checked in order to match a valid url. This is not ideal, but the behavior is + * being kept to ensure backwards compatibility. Ideally this regex should be + * implemented with a negative lookbehind as opposed to a negated character class + * but lack of JS support increases maint overhead if the logic is different by + * platform. + */ + val VALID_URL: Pattern + const val VALID_URL_GROUP_BEFORE = 2 + const val VALID_URL_GROUP_URL = 3 + const val VALID_URL_GROUP_PROTOCOL = 4 + const val VALID_URL_GROUP_DOMAIN = 5 + val INVALID_URL_WITHOUT_PROTOCOL_MATCH_BEGIN: Pattern + private val VALID_DOMAIN: Pattern + + // Mastodon hashtag regular expressions. Different from the Twitter ones, in particular, + // they can be preceded by e.g., Hiragana characters without an intervening space. + // See HASHTAG_RE in https://github.com/mastodon/mastodon/blob/main/app/models/tag.rb + // (which is VALID_HASHTAG in this file). 
+ private const val HASHTAG_SEPARATORS = "_\u00B7\u30FB\u200c" + private const val HASHTAG_FIRST_SEQUENCE_CHUNK_ONE = """[\w_][\w$HASHTAG_SEPARATORS]*[\p{Alpha}$HASHTAG_SEPARATORS]""" + private const val HASHTAG_FIRST_SEQUENCE_CHUNK_TWO = """[\w$HASHTAG_SEPARATORS]*[\w_]""" + private const val HASHTAG_FIRST_SEQUENCE = "($HASHTAG_FIRST_SEQUENCE_CHUNK_ONE$HASHTAG_FIRST_SEQUENCE_CHUNK_TWO)" + private const val HASHTAG_LAST_SEQUENCE = """([\w_]*[\p{L}][\w_]*)""" + private const val HASHTAG_NAME_PAT: String = "$HASHTAG_FIRST_SEQUENCE|$HASHTAG_LAST_SEQUENCE" + + // initializing in a static synchronized block, + // there appear to be thread-safety issues with Pattern.compile on Android + init { + synchronized(Regex::class.java) { + INVALID_CHARACTERS_PATTERN = Pattern.compile(".*[$INVALID_CHARACTERS].*") + VALID_HASHTAG = Pattern.compile( + "(?): String { + val sb = StringBuilder() + val iter = col.iterator() + if (iter.hasNext()) { + sb.append(iter.next().toString()) + } + while (iter.hasNext()) { + sb.append("|") + sb.append(iter.next().toString()) + } + return sb.toString() + } +} diff --git a/core/ui/src/main/kotlin/com/twitter/twittertext/TldLists.kt b/core/ui/src/main/kotlin/com/twitter/twittertext/TldLists.kt new file mode 100644 index 000000000..dbb01b783 --- /dev/null +++ b/core/ui/src/main/kotlin/com/twitter/twittertext/TldLists.kt @@ -0,0 +1,1586 @@ +// Copyright 2018 Twitter, Inc. 
+// Licensed under the Apache License, Version 2.0 +// http://www.apache.org/licenses/LICENSE-2.0 +// Auto-generated by conformance/Rakefile +package com.twitter.twittertext + +object TldLists { + val GTLDS: List = mutableListOf( + "삼성", + "닷컴", + "닷넷", + "香格里拉", + "餐厅", + "食品", + "飞利浦", + "電訊盈科", + "集团", + "通販", + "购物", + "谷歌", + "诺基亚", + "联通", + "网络", + "网站", + "网店", + "网址", + "组织机构", + "移动", + "珠宝", + "点看", + "游戏", + "淡马锡", + "机构", + "書籍", + "时尚", + "新闻", + "政府", + "政务", + "招聘", + "手表", + "手机", + "我爱你", + "慈善", + "微博", + "广东", + "工行", + "家電", + "娱乐", + "天主教", + "大拿", + "大众汽车", + "在线", + "嘉里大酒店", + "嘉里", + "商标", + "商店", + "商城", + "公益", + "公司", + "八卦", + "健康", + "信息", + "佛山", + "企业", + "中文网", + "中信", + "世界", + "ポイント", + "ファッション", + "セール", + "ストア", + "コム", + "グーグル", + "クラウド", + "みんな", + "คอม", + "संगठन", + "नेट", + "कॉम", + "همراه", + "موقع", + "موبايلي", + "كوم", + "كاثوليك", + "عرب", + "شبكة", + "بيتك", + "بازار", + "العليان", + "ارامكو", + "اتصالات", + "ابوظبي", + "קום", + "сайт", + "рус", + "орг", + "онлайн", + "москва", + "ком", + "католик", + "дети", + "zuerich", + "zone", + "zippo", + "zip", + "zero", + "zara", + "zappos", + "yun", + "youtube", + "you", + "yokohama", + "yoga", + "yodobashi", + "yandex", + "yamaxun", + "yahoo", + "yachts", + "xyz", + "xxx", + "xperia", + "xin", + "xihuan", + "xfinity", + "xerox", + "xbox", + "wtf", + "wtc", + "wow", + "world", + "works", + "work", + "woodside", + "wolterskluwer", + "wme", + "winners", + "wine", + "windows", + "win", + "williamhill", + "wiki", + "wien", + "whoswho", + "weir", + "weibo", + "wedding", + "wed", + "website", + "weber", + "webcam", + "weatherchannel", + "weather", + "watches", + "watch", + "warman", + "wanggou", + "wang", + "walter", + "walmart", + "wales", + "vuelos", + "voyage", + "voto", + "voting", + "vote", + "volvo", + "volkswagen", + "vodka", + "vlaanderen", + "vivo", + "viva", + "vistaprint", + "vista", + "vision", + "visa", + "virgin", + "vip", + "vin", + "villas", + "viking", + "vig", + 
"video", + "viajes", + "vet", + "versicherung", + "vermögensberatung", + "vermögensberater", + "verisign", + "ventures", + "vegas", + "vanguard", + "vana", + "vacations", + "ups", + "uol", + "uno", + "university", + "unicom", + "uconnect", + "ubs", + "ubank", + "tvs", + "tushu", + "tunes", + "tui", + "tube", + "trv", + "trust", + "travelersinsurance", + "travelers", + "travelchannel", + "travel", + "training", + "trading", + "trade", + "toys", + "toyota", + "town", + "tours", + "total", + "toshiba", + "toray", + "top", + "tools", + "tokyo", + "today", + "tmall", + "tkmaxx", + "tjx", + "tjmaxx", + "tirol", + "tires", + "tips", + "tiffany", + "tienda", + "tickets", + "tiaa", + "theatre", + "theater", + "thd", + "teva", + "tennis", + "temasek", + "telefonica", + "telecity", + "tel", + "technology", + "tech", + "team", + "tdk", + "tci", + "taxi", + "tax", + "tattoo", + "tatar", + "tatamotors", + "target", + "taobao", + "talk", + "taipei", + "tab", + "systems", + "symantec", + "sydney", + "swiss", + "swiftcover", + "swatch", + "suzuki", + "surgery", + "surf", + "support", + "supply", + "supplies", + "sucks", + "style", + "study", + "studio", + "stream", + "store", + "storage", + "stockholm", + "stcgroup", + "stc", + "statoil", + "statefarm", + "statebank", + "starhub", + "star", + "staples", + "stada", + "srt", + "srl", + "spreadbetting", + "spot", + "sport", + "spiegel", + "space", + "soy", + "sony", + "song", + "solutions", + "solar", + "sohu", + "software", + "softbank", + "social", + "soccer", + "sncf", + "smile", + "smart", + "sling", + "skype", + "sky", + "skin", + "ski", + "site", + "singles", + "sina", + "silk", + "shriram", + "showtime", + "show", + "shouji", + "shopping", + "shop", + "shoes", + "shiksha", + "shia", + "shell", + "shaw", + "sharp", + "shangrila", + "sfr", + "sexy", + "sex", + "sew", + "seven", + "ses", + "services", + "sener", + "select", + "seek", + "security", + "secure", + "seat", + "search", + "scot", + "scor", + "scjohnson", + "science", + 
"schwarz", + "schule", + "school", + "scholarships", + "schmidt", + "schaeffler", + "scb", + "sca", + "sbs", + "sbi", + "saxo", + "save", + "sas", + "sarl", + "sapo", + "sap", + "sanofi", + "sandvikcoromant", + "sandvik", + "samsung", + "samsclub", + "salon", + "sale", + "sakura", + "safety", + "safe", + "saarland", + "ryukyu", + "rwe", + "run", + "ruhr", + "rugby", + "rsvp", + "room", + "rogers", + "rodeo", + "rocks", + "rocher", + "rmit", + "rip", + "rio", + "ril", + "rightathome", + "ricoh", + "richardli", + "rich", + "rexroth", + "reviews", + "review", + "restaurant", + "rest", + "republican", + "report", + "repair", + "rentals", + "rent", + "ren", + "reliance", + "reit", + "reisen", + "reise", + "rehab", + "redumbrella", + "redstone", + "red", + "recipes", + "realty", + "realtor", + "realestate", + "read", + "raid", + "radio", + "racing", + "qvc", + "quest", + "quebec", + "qpon", + "pwc", + "pub", + "prudential", + "pru", + "protection", + "property", + "properties", + "promo", + "progressive", + "prof", + "productions", + "prod", + "pro", + "prime", + "press", + "praxi", + "pramerica", + "post", + "porn", + "politie", + "poker", + "pohl", + "pnc", + "plus", + "plumbing", + "playstation", + "play", + "place", + "pizza", + "pioneer", + "pink", + "ping", + "pin", + "pid", + "pictures", + "pictet", + "pics", + "piaget", + "physio", + "photos", + "photography", + "photo", + "phone", + "philips", + "phd", + "pharmacy", + "pfizer", + "pet", + "pccw", + "pay", + "passagens", + "party", + "parts", + "partners", + "pars", + "paris", + "panerai", + "panasonic", + "pamperedchef", + "page", + "ovh", + "ott", + "otsuka", + "osaka", + "origins", + "orientexpress", + "organic", + "org", + "orange", + "oracle", + "open", + "ooo", + "onyourside", + "online", + "onl", + "ong", + "one", + "omega", + "ollo", + "oldnavy", + "olayangroup", + "olayan", + "okinawa", + "office", + "off", + "observer", + "obi", + "nyc", + "ntt", + "nrw", + "nra", + "nowtv", + "nowruz", + "now", + 
"norton", + "northwesternmutual", + "nokia", + "nissay", + "nissan", + "ninja", + "nikon", + "nike", + "nico", + "nhk", + "ngo", + "nfl", + "nexus", + "nextdirect", + "next", + "news", + "newholland", + "new", + "neustar", + "network", + "netflix", + "netbank", + "net", + "nec", + "nba", + "navy", + "natura", + "nationwide", + "name", + "nagoya", + "nadex", + "nab", + "mutuelle", + "mutual", + "museum", + "mtr", + "mtpc", + "mtn", + "msd", + "movistar", + "movie", + "mov", + "motorcycles", + "moto", + "moscow", + "mortgage", + "mormon", + "mopar", + "montblanc", + "monster", + "money", + "monash", + "mom", + "moi", + "moe", + "moda", + "mobily", + "mobile", + "mobi", + "mma", + "mls", + "mlb", + "mitsubishi", + "mit", + "mint", + "mini", + "mil", + "microsoft", + "miami", + "metlife", + "merckmsd", + "meo", + "menu", + "men", + "memorial", + "meme", + "melbourne", + "meet", + "media", + "med", + "mckinsey", + "mcdonalds", + "mcd", + "mba", + "mattel", + "maserati", + "marshalls", + "marriott", + "markets", + "marketing", + "market", + "map", + "mango", + "management", + "man", + "makeup", + "maison", + "maif", + "madrid", + "macys", + "luxury", + "luxe", + "lupin", + "lundbeck", + "ltda", + "ltd", + "lplfinancial", + "lpl", + "love", + "lotto", + "lotte", + "london", + "lol", + "loft", + "locus", + "locker", + "loans", + "loan", + "llp", + "llc", + "lixil", + "living", + "live", + "lipsy", + "link", + "linde", + "lincoln", + "limo", + "limited", + "lilly", + "like", + "lighting", + "lifestyle", + "lifeinsurance", + "life", + "lidl", + "liaison", + "lgbt", + "lexus", + "lego", + "legal", + "lefrak", + "leclerc", + "lease", + "lds", + "lawyer", + "law", + "latrobe", + "latino", + "lat", + "lasalle", + "lanxess", + "landrover", + "land", + "lancome", + "lancia", + "lancaster", + "lamer", + "lamborghini", + "ladbrokes", + "lacaixa", + "kyoto", + "kuokgroup", + "kred", + "krd", + "kpn", + "kpmg", + "kosher", + "komatsu", + "koeln", + "kiwi", + "kitchen", + "kindle", + 
"kinder", + "kim", + "kia", + "kfh", + "kerryproperties", + "kerrylogistics", + "kerryhotels", + "kddi", + "kaufen", + "juniper", + "juegos", + "jprs", + "jpmorgan", + "joy", + "jot", + "joburg", + "jobs", + "jnj", + "jmp", + "jll", + "jlc", + "jio", + "jewelry", + "jetzt", + "jeep", + "jcp", + "jcb", + "java", + "jaguar", + "iwc", + "iveco", + "itv", + "itau", + "istanbul", + "ist", + "ismaili", + "iselect", + "irish", + "ipiranga", + "investments", + "intuit", + "international", + "intel", + "int", + "insure", + "insurance", + "institute", + "ink", + "ing", + "info", + "infiniti", + "industries", + "inc", + "immobilien", + "immo", + "imdb", + "imamat", + "ikano", + "iinet", + "ifm", + "ieee", + "icu", + "ice", + "icbc", + "ibm", + "hyundai", + "hyatt", + "hughes", + "htc", + "hsbc", + "how", + "house", + "hotmail", + "hotels", + "hoteles", + "hot", + "hosting", + "host", + "hospital", + "horse", + "honeywell", + "honda", + "homesense", + "homes", + "homegoods", + "homedepot", + "holiday", + "holdings", + "hockey", + "hkt", + "hiv", + "hitachi", + "hisamitsu", + "hiphop", + "hgtv", + "hermes", + "here", + "helsinki", + "help", + "healthcare", + "health", + "hdfcbank", + "hdfc", + "hbo", + "haus", + "hangout", + "hamburg", + "hair", + "guru", + "guitars", + "guide", + "guge", + "gucci", + "guardian", + "group", + "grocery", + "gripe", + "green", + "gratis", + "graphics", + "grainger", + "gov", + "got", + "gop", + "google", + "goog", + "goodyear", + "goodhands", + "goo", + "golf", + "goldpoint", + "gold", + "godaddy", + "gmx", + "gmo", + "gmbh", + "gmail", + "globo", + "global", + "gle", + "glass", + "glade", + "giving", + "gives", + "gifts", + "gift", + "ggee", + "george", + "genting", + "gent", + "gea", + "gdn", + "gbiz", + "gay", + "garden", + "gap", + "games", + "game", + "gallup", + "gallo", + "gallery", + "gal", + "fyi", + "futbol", + "furniture", + "fund", + "fun", + "fujixerox", + "fujitsu", + "ftr", + "frontier", + "frontdoor", + "frogans", + "frl", + 
"fresenius", + "free", + "fox", + "foundation", + "forum", + "forsale", + "forex", + "ford", + "football", + "foodnetwork", + "food", + "foo", + "fly", + "flsmidth", + "flowers", + "florist", + "flir", + "flights", + "flickr", + "fitness", + "fit", + "fishing", + "fish", + "firmdale", + "firestone", + "fire", + "financial", + "finance", + "final", + "film", + "fido", + "fidelity", + "fiat", + "ferrero", + "ferrari", + "feedback", + "fedex", + "fast", + "fashion", + "farmers", + "farm", + "fans", + "fan", + "family", + "faith", + "fairwinds", + "fail", + "fage", + "extraspace", + "express", + "exposed", + "expert", + "exchange", + "everbank", + "events", + "eus", + "eurovision", + "etisalat", + "esurance", + "estate", + "esq", + "erni", + "ericsson", + "equipment", + "epson", + "epost", + "enterprises", + "engineering", + "engineer", + "energy", + "emerck", + "email", + "education", + "edu", + "edeka", + "eco", + "eat", + "earth", + "dvr", + "dvag", + "durban", + "dupont", + "duns", + "dunlop", + "duck", + "dubai", + "dtv", + "drive", + "download", + "dot", + "doosan", + "domains", + "doha", + "dog", + "dodge", + "doctor", + "docs", + "dnp", + "diy", + "dish", + "discover", + "discount", + "directory", + "direct", + "digital", + "diet", + "diamonds", + "dhl", + "dev", + "design", + "desi", + "dentist", + "dental", + "democrat", + "delta", + "deloitte", + "dell", + "delivery", + "degree", + "deals", + "dealer", + "deal", + "dds", + "dclk", + "day", + "datsun", + "dating", + "date", + "data", + "dance", + "dad", + "dabur", + "cyou", + "cymru", + "cuisinella", + "csc", + "cruises", + "cruise", + "crs", + "crown", + "cricket", + "creditunion", + "creditcard", + "credit", + "cpa", + "courses", + "coupons", + "coupon", + "country", + "corsica", + "coop", + "cool", + "cookingchannel", + "cooking", + "contractors", + "contact", + "consulting", + "construction", + "condos", + "comsec", + "computer", + "compare", + "company", + "community", + "commbank", + "comcast", + "com", 
+ "cologne", + "college", + "coffee", + "codes", + "coach", + "clubmed", + "club", + "cloud", + "clothing", + "clinique", + "clinic", + "click", + "cleaning", + "claims", + "cityeats", + "city", + "citic", + "citi", + "citadel", + "cisco", + "circle", + "cipriani", + "church", + "chrysler", + "chrome", + "christmas", + "chloe", + "chintai", + "cheap", + "chat", + "chase", + "charity", + "channel", + "chanel", + "cfd", + "cfa", + "cern", + "ceo", + "center", + "ceb", + "cbs", + "cbre", + "cbn", + "cba", + "catholic", + "catering", + "cat", + "casino", + "cash", + "caseih", + "case", + "casa", + "cartier", + "cars", + "careers", + "career", + "care", + "cards", + "caravan", + "car", + "capitalone", + "capital", + "capetown", + "canon", + "cancerresearch", + "camp", + "camera", + "cam", + "calvinklein", + "call", + "cal", + "cafe", + "cab", + "bzh", + "buzz", + "buy", + "business", + "builders", + "build", + "bugatti", + "budapest", + "brussels", + "brother", + "broker", + "broadway", + "bridgestone", + "bradesco", + "box", + "boutique", + "bot", + "boston", + "bostik", + "bosch", + "boots", + "booking", + "book", + "boo", + "bond", + "bom", + "bofa", + "boehringer", + "boats", + "bnpparibas", + "bnl", + "bmw", + "bms", + "blue", + "bloomberg", + "blog", + "blockbuster", + "blanco", + "blackfriday", + "black", + "biz", + "bio", + "bingo", + "bing", + "bike", + "bid", + "bible", + "bharti", + "bet", + "bestbuy", + "best", + "berlin", + "bentley", + "beer", + "beauty", + "beats", + "bcn", + "bcg", + "bbva", + "bbt", + "bbc", + "bayern", + "bauhaus", + "basketball", + "baseball", + "bargains", + "barefoot", + "barclays", + "barclaycard", + "barcelona", + "bar", + "bank", + "band", + "bananarepublic", + "banamex", + "baidu", + "baby", + "azure", + "axa", + "aws", + "avianca", + "autos", + "auto", + "author", + "auspost", + "audio", + "audible", + "audi", + "auction", + "attorney", + "athleta", + "associates", + "asia", + "asda", + "arte", + "art", + "arpa", + "army", + 
"archi", + "aramco", + "arab", + "aquarelle", + "apple", + "app", + "apartments", + "aol", + "anz", + "anquan", + "android", + "analytics", + "amsterdam", + "amica", + "amfam", + "amex", + "americanfamily", + "americanexpress", + "alstom", + "alsace", + "ally", + "allstate", + "allfinanz", + "alipay", + "alibaba", + "alfaromeo", + "akdn", + "airtel", + "airforce", + "airbus", + "aigo", + "aig", + "agency", + "agakhan", + "africa", + "afl", + "afamilycompany", + "aetna", + "aero", + "aeg", + "adult", + "ads", + "adac", + "actor", + "active", + "aco", + "accountants", + "accountant", + "accenture", + "academy", + "abudhabi", + "abogado", + "able", + "abc", + "abbvie", + "abbott", + "abb", + "abarth", + "aarp", + "aaa", + "onion", + ) + val CTLDS: List = mutableListOf( + "한국", + "香港", + "澳門", + "新加坡", + "台灣", + "台湾", + "中國", + "中国", + "გე", + "ລາວ", + "ไทย", + "ලංකා", + "ഭാരതം", + "ಭಾರತ", + "భారత్", + "சிங்கப்பூர்", + "இலங்கை", + "இந்தியா", + "ଭାରତ", + "ભારત", + "ਭਾਰਤ", + "ভাৰত", + "ভারত", + "বাংলা", + "भारोत", + "भारतम्", + "भारत", + "ڀارت", + "پاکستان", + "موريتانيا", + "مليسيا", + "مصر", + "قطر", + "فلسطين", + "عمان", + "عراق", + "سورية", + "سودان", + "تونس", + "بھارت", + "بارت", + "ایران", + "امارات", + "المغرب", + "السعودية", + "الجزائر", + "البحرين", + "الاردن", + "հայ", + "қаз", + "укр", + "срб", + "рф", + "мон", + "мкд", + "ею", + "бел", + "бг", + "ευ", + "ελ", + "zw", + "zm", + "za", + "yt", + "ye", + "ws", + "wf", + "vu", + "vn", + "vi", + "vg", + "ve", + "vc", + "va", + "uz", + "uy", + "us", + "um", + "uk", + "ug", + "ua", + "tz", + "tw", + "tv", + "tt", + "tr", + "tp", + "to", + "tn", + "tm", + "tl", + "tk", + "tj", + "th", + "tg", + "tf", + "td", + "tc", + "sz", + "sy", + "sx", + "sv", + "su", + "st", + "ss", + "sr", + "so", + "sn", + "sm", + "sl", + "sk", + "sj", + "si", + "sh", + "sg", + "se", + "sd", + "sc", + "sb", + "sa", + "rw", + "ru", + "rs", + "ro", + "re", + "qa", + "py", + "pw", + "pt", + "ps", + "pr", + "pn", + "pm", + "pl", + "pk", + "ph", + 
"pg", + "pf", + "pe", + "pa", + "om", + "nz", + "nu", + "nr", + "np", + "no", + "nl", + "ni", + "ng", + "nf", + "ne", + "nc", + "na", + "mz", + "my", + "mx", + "mw", + "mv", + "mu", + "mt", + "ms", + "mr", + "mq", + "mp", + "mo", + "mn", + "mm", + "ml", + "mk", + "mh", + "mg", + "mf", + "me", + "md", + "mc", + "ma", + "ly", + "lv", + "lu", + "lt", + "ls", + "lr", + "lk", + "li", + "lc", + "lb", + "la", + "kz", + "ky", + "kw", + "kr", + "kp", + "kn", + "km", + "ki", + "kh", + "kg", + "ke", + "jp", + "jo", + "jm", + "je", + "it", + "is", + "ir", + "iq", + "io", + "in", + "im", + "il", + "ie", + "id", + "hu", + "ht", + "hr", + "hn", + "hm", + "hk", + "gy", + "gw", + "gu", + "gt", + "gs", + "gr", + "gq", + "gp", + "gn", + "gm", + "gl", + "gi", + "gh", + "gg", + "gf", + "ge", + "gd", + "gb", + "ga", + "fr", + "fo", + "fm", + "fk", + "fj", + "fi", + "eu", + "et", + "es", + "er", + "eh", + "eg", + "ee", + "ec", + "dz", + "do", + "dm", + "dk", + "dj", + "de", + "cz", + "cy", + "cx", + "cw", + "cv", + "cu", + "cr", + "co", + "cn", + "cm", + "cl", + "ck", + "ci", + "ch", + "cg", + "cf", + "cd", + "cc", + "ca", + "bz", + "by", + "bw", + "bv", + "bt", + "bs", + "br", + "bq", + "bo", + "bn", + "bm", + "bl", + "bj", + "bi", + "bh", + "bg", + "bf", + "be", + "bd", + "bb", + "ba", + "az", + "ax", + "aw", + "au", + "at", + "as", + "ar", + "aq", + "ao", + "an", + "am", + "al", + "ai", + "ag", + "af", + "ae", + "ad", + "ac", + ) +} diff --git a/licenses/libraries/twittertext.json b/licenses/libraries/twittertext.json new file mode 100644 index 000000000..59a71dcaa --- /dev/null +++ b/licenses/libraries/twittertext.json @@ -0,0 +1,17 @@ +{ + "uniqueId": "com.twitter:twittertext", + "developers": [ + { + "name": "Twitter", + "organisationUrl": "https://www.twitter.com" + } + ], + "artifactVersion": "1.14.3", + "description": "Libraries and conformance tests to standardize parsing of Tweet text.", + "name": "Twitter text parsing", + "tag": "text", + "licenses": [ + "Apache_2_0" + ], + 
"website": "https://github.com/twitter/twitter-text" +}