fix: Improve URL / tag / mention extraction when composing (#564)

Previous code used custom regular expressions to extract URLs, hashtags,
and mentions from text while the user was writing a post. These were
inconsistent with the ones that Mastodon uses so the derived character
count could be wrong.

As well as being visually incorrect this could prevent the user from
posting a status that was within the length limit, or allow them to
attempt to post a status that was over the length limit (which would
then fail).

Fix this by dropping the homegrown regular expressions and using the
same text parsing library that Mastodon uses: twitter-text. This has
been converted to Kotlin and the functionality related to Twitter
specific features has been removed.

The hashtag handling has been adjusted, as Mastodon is more permissive
about the positions where hashtags can appear than Twitter is. In
particular, a hashtag does not need to be preceded by whitespace if
the tag appears after some scripts, such as Hiragana.
This commit is contained in:
Nik Clayton 2024-03-23 14:14:07 +01:00 committed by GitHub
parent 62fd47e862
commit 2a4126a542
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 2266 additions and 166 deletions

View File

@ -1,68 +1,33 @@
/*
* Copyright 2024 Pachli Association
*
* This file is a part of Pachli.
*
* This program is free software; you can redistribute it and/or modify it under the terms of the
* GNU General Public License as published by the Free Software Foundation; either version 3 of the
* License, or (at your option) any later version.
*
* Pachli is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
* the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
* Public License for more details.
*
* You should have received a copy of the GNU General Public License along with Pachli; if not,
* see <http://www.gnu.org/licenses>.
*/
package app.pachli.util
import android.text.Spannable
import android.text.Spanned
import android.text.style.CharacterStyle
import android.text.style.ForegroundColorSpan
import android.text.style.URLSpan
import app.pachli.core.ui.MentionSpan
import app.pachli.core.ui.NoUnderlineURLSpan
import java.util.regex.Pattern
import kotlin.math.max
/**
* @see <a href="https://github.com/tootsuite/mastodon/blob/master/app/models/tag.rb">
* Tag#HASHTAG_RE</a>.
*/
private const val HASHTAG_SEPARATORS = "_\\u00B7\\u200c"
private const val UNICODE_WORD = "\\p{L}\\p{Mn}\\p{Nd}\\p{Nl}\\p{Pc}" // Ugh, java ( https://stackoverflow.com/questions/4304928/unicode-equivalents-for-w-and-b-in-java-regular-expressions )
private const val TAG_REGEX = "(?:^|[^/)\\w])#(([${UNICODE_WORD}_][$UNICODE_WORD$HASHTAG_SEPARATORS]*[\\p{Alpha}$HASHTAG_SEPARATORS][$UNICODE_WORD$HASHTAG_SEPARATORS]*[${UNICODE_WORD}_])|([${UNICODE_WORD}_]*[\\p{Alpha}][${UNICODE_WORD}_]*))"
/**
* @see <a href="https://github.com/tootsuite/mastodon/blob/master/app/models/account.rb">
* Account#MENTION_RE</a>
*/
private const val USERNAME_REGEX = "[\\w]+([\\w\\.-]+[\\w]+)?"
private const val MENTION_REGEX = "(?<=^|[^\\/$UNICODE_WORD])@(($USERNAME_REGEX)(?:@[$UNICODE_WORD\\.\\-]+[$UNICODE_WORD]+)?)"
private const val HTTP_URL_REGEX = "(?:(^|\\b)http://[^\\s]+)"
private const val HTTPS_URL_REGEX = "(?:(^|\\b)https://[^\\s]+)"
/**
* Dump of android.util.Patterns.WEB_URL
*/
private val STRICT_WEB_URL_PATTERN = Pattern.compile("(((?:(?i:http|https|rtsp)://(?:(?:[a-zA-Z0-9\\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?(?:(([a-zA-Z0-9[ -\uD7FF豈-\uFDCFﷰ-\uFFEF\uD800\uDC00-\uD83F\uDFFD\uD840\uDC00-\uD87F\uDFFD\uD880\uDC00-\uD8BF\uDFFD\uD8C0\uDC00-\uD8FF\uDFFD\uD900\uDC00-\uD93F\uDFFD\uD940\uDC00-\uD97F\uDFFD\uD980\uDC00-\uD9BF\uDFFD\uD9C0\uDC00-\uD9FF\uDFFD\uDA00\uDC00-\uDA3F\uDFFD\uDA40\uDC00-\uDA7F\uDFFD\uDA80\uDC00-\uDABF\uDFFD\uDAC0\uDC00-\uDAFF\uDFFD\uDB00\uDC00-\uDB3F\uDFFD\uDB44\uDC00-\uDB7F\uDFFD&&[^ [ -]\u2028\u2029 ]]](?:[a-zA-Z0-9[ -\uD7FF豈-\uFDCFﷰ-\uFFEF\uD800\uDC00-\uD83F\uDFFD\uD840\uDC00-\uD87F\uDFFD\uD880\uDC00-\uD8BF\uDFFD\uD8C0\uDC00-\uD8FF\uDFFD\uD900\uDC00-\uD93F\uDFFD\uD940\uDC00-\uD97F\uDFFD\uD980\uDC00-\uD9BF\uDFFD\uD9C0\uDC00-\uD9FF\uDFFD\uDA00\uDC00-\uDA3F\uDFFD\uDA40\uDC00-\uDA7F\uDFFD\uDA80\uDC00-\uDABF\uDFFD\uDAC0\uDC00-\uDAFF\uDFFD\uDB00\uDC00-\uDB3F\uDFFD\uDB44\uDC00-\uDB7F\uDFFD&&[^ [ -]\u2028\u2029 ]]_\\-]{0,61}[a-zA-Z0-9[ -\uD7FF豈-\uFDCFﷰ-\uFFEF\uD800\uDC00-\uD83F\uDFFD\uD840\uDC00-\uD87F\uDFFD\uD880\uDC00-\uD8BF\uDFFD\uD8C0\uDC00-\uD8FF\uDFFD\uD900\uDC00-\uD93F\uDFFD\uD940\uDC00-\uD97F\uDFFD\uD980\uDC00-\uD9BF\uDFFD\uD9C0\uDC00-\uD9FF\uDFFD\uDA00\uDC00-\uDA3F\uDFFD\uDA40\uDC00-\uDA7F\uDFFD\uDA80\uDC00-\uDABF\uDFFD\uDAC0\uDC00-\uDAFF\uDFFD\uDB00\uDC00-\uDB3F\uDFFD\uDB44\uDC00-\uDB7F\uDFFD&&[^ [ -]\u2028\u2029 ]]]){0,1}\\.)+(xn\\-\\-[\\w\\-]{0,58}\\w|[a-zA-Z[ -\uD7FF豈-\uFDCFﷰ-\uFFEF\uD800\uDC00-\uD83F\uDFFD\uD840\uDC00-\uD87F\uDFFD\uD880\uDC00-\uD8BF\uDFFD\uD8C0\uDC00-\uD8FF\uDFFD\uD900\uDC00-\uD93F\uDFFD\uD940\uDC00-\uD97F\uDFFD\uD980\uDC00-\uD9BF\uDFFD\uD9C0\uDC00-\uD9FF\uDFFD\uDA00\uDC00-\uDA3F\uDFFD\uDA40\uDC00-\uDA7F\uDFFD\uDA80\uDC00-\uDABF\uDFFD\uDAC0\uDC00-\uDAFF\uDFFD\uDB00\uDC00-\uDB3F\uDFFD\uDB44\uDC00-\uDB7F\uDFFD&&[^ [ -]\u2028\u2029 
]]]{2,63})|((25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[0-9]))))(?:\\:\\d{1,5})?)([/\\?](?:(?:[a-zA-Z0-9[ -\uD7FF豈-\uFDCFﷰ-\uFFEF\uD800\uDC00-\uD83F\uDFFD\uD840\uDC00-\uD87F\uDFFD\uD880\uDC00-\uD8BF\uDFFD\uD8C0\uDC00-\uD8FF\uDFFD\uD900\uDC00-\uD93F\uDFFD\uD940\uDC00-\uD97F\uDFFD\uD980\uDC00-\uD9BF\uDFFD\uD9C0\uDC00-\uD9FF\uDFFD\uDA00\uDC00-\uDA3F\uDFFD\uDA40\uDC00-\uDA7F\uDFFD\uDA80\uDC00-\uDABF\uDFFD\uDAC0\uDC00-\uDAFF\uDFFD\uDB00\uDC00-\uDB3F\uDFFD\uDB44\uDC00-\uDB7F\uDFFD&&[^ [ -]\u2028\u2029 ]];/\\?:@&=#~\\-\\.\\+!\\*'\\(\\),_\\\$])|(?:%[a-fA-F0-9]{2}))*)?(?:\\b|\$|^))")
import com.twitter.twittertext.Extractor
private val spanClasses = listOf(ForegroundColorSpan::class.java, URLSpan::class.java)
private val finders = mapOf(
FoundMatchType.HTTP_URL to PatternFinder(':', HTTP_URL_REGEX, 5, Character::isWhitespace),
FoundMatchType.HTTPS_URL to PatternFinder(':', HTTPS_URL_REGEX, 6, Character::isWhitespace),
FoundMatchType.TAG to PatternFinder('#', TAG_REGEX, 1, ::isValidForTagPrefix),
// TODO: We also need a proper validator for mentions
FoundMatchType.MENTION to PatternFinder('@', MENTION_REGEX, 1, Character::isWhitespace),
)
/**
 * The kinds of highlightable text that a pattern search can report.
 */
private enum class FoundMatchType {
// A URL that uses the "http://" scheme.
HTTP_URL,
// A URL that uses the "https://" scheme.
HTTPS_URL,
// A hashtag.
TAG,
// A mention of an account.
MENTION,
}
/**
 * Mutable holder for the outcome of a pattern search.
 *
 * [start] and [end] are offsets into the searched string and stay at -1 until a search
 * records a position. [matchType] has no initial value and must be assigned before it is read.
 */
private class FindCharsResult {
lateinit var matchType: FoundMatchType
var start: Int = -1
var end: Int = -1
}
/**
 * Describes how to locate one kind of match.
 *
 * @property searchCharacter the character whose presence triggers an attempt to match
 * @param regex the regular expression source; it is compiled case-insensitively into [pattern]
 * @property searchPrefixWidth how many characters before [searchCharacter] a match may begin
 * @property prefixValidator predicate applied to the code point found [searchPrefixWidth]
 * characters before [searchCharacter]
 */
private class PatternFinder(
val searchCharacter: Char,
regex: String,
val searchPrefixWidth: Int,
val prefixValidator: (Int) -> Boolean,
) {
// Compiled once per finder so it can be reused for every search.
val pattern: Pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE)
}
private val extractor = Extractor().apply { isExtractURLWithoutProtocol = false }
/**
* Takes text containing mentions and hashtags and urls and makes them the given colour.
@ -75,18 +40,16 @@ fun highlightSpans(text: Spannable, colour: Int) {
// Colour the mentions and hashtags.
val string = text.toString()
val length = text.length
var start = 0
var end = 0
while (end in 0 until length && start >= 0) {
// Search for url first because it can contain the other characters
val found = findPattern(string, end)
start = found.start
end = found.end
if (start in 0 until end) {
text.setSpan(getSpan(found.matchType, string, colour, start, end), start, end, Spanned.SPAN_INCLUSIVE_EXCLUSIVE)
start += finders[found.matchType]!!.searchPrefixWidth
val entities = extractor.extractEntitiesWithIndices(string)
for (entity in entities) {
val span = when (entity.type) {
Extractor.Entity.Type.URL -> NoUnderlineURLSpan(string.substring(entity.start, entity.end))
Extractor.Entity.Type.HASHTAG -> ForegroundColorSpan(colour)
Extractor.Entity.Type.MENTION -> MentionSpan(string.substring(entity.start, entity.end))
}
text.setSpan(span, entity.start, entity.end, Spanned.SPAN_INCLUSIVE_EXCLUSIVE)
}
}
@ -95,87 +58,3 @@ private fun <T> clearSpans(text: Spannable, spanClass: Class<T>) {
text.removeSpan(span)
}
}
/**
 * Scans [string] from [fromIndex] for the first URL, tag, or mention.
 *
 * Every character is compared against each finder's search character. A candidate is only
 * attempted when fewer than `searchPrefixWidth` characters separate it from [fromIndex], or
 * when the finder's prefix validator accepts the code point `searchPrefixWidth` characters
 * before it.
 *
 * @param string the text to search
 * @param fromIndex the index at which scanning starts
 * @return the first candidate that passes the checks below. If none does, the shared result
 * object is returned in whatever state the last attempt left it (start and end of -1 if no
 * candidate was attempted at all).
 */
private fun findPattern(string: String, fromIndex: Int): FindCharsResult {
val result = FindCharsResult()
for (i in fromIndex..string.lastIndex) {
val c = string[i]
for (matchType in FoundMatchType.entries) {
val finder = finders[matchType]
if (finder!!.searchCharacter == c &&
(
(i - fromIndex) < finder.searchPrefixWidth ||
finder.prefixValidator(string.codePointAt(i - finder.searchPrefixWidth))
)
) {
// Start the regex search far enough back to include the prefix, clamped to the
// beginning of the string.
result.matchType = matchType
result.start = max(0, i - finder.searchPrefixWidth)
// May move result.start and set result.end.
findEndOfPattern(string, result, finder.pattern)
if (result.start + finder.searchPrefixWidth <= i + 1 && // The found result is actually triggered by the correct search character
result.end >= result.start
) { // ...and we actually found a valid result
return result
}
}
}
}
return result
}
/**
 * Searches [string] with [pattern], starting at `result.start`, and records the match in
 * [result].
 *
 * If the pattern matches, `result.start` is moved to the start of the match and then possibly
 * advanced by one character (see the comments below). `result.end` is set to the end of the
 * match, except for URLs, where it is only set if the candidate also passes the strict URL
 * pattern. If the pattern does not match, [result] is left unchanged.
 *
 * @param string the text being searched
 * @param result the in-progress result; its `matchType` and `start` must already be set
 * @param pattern the compiled pattern for `result.matchType`
 */
private fun findEndOfPattern(string: String, result: FindCharsResult, pattern: Pattern) {
val matcher = pattern.matcher(string)
if (matcher.find(result.start)) {
// Once we have API level 26+, we can use named captures...
val end = matcher.end()
result.start = matcher.start()
when (result.matchType) {
FoundMatchType.TAG -> {
// The match may begin with the character that precedes the tag; step over it when
// the match does not begin with '#', or begins with two consecutive '#'.
if (isValidForTagPrefix(string.codePointAt(result.start))) {
if (string[result.start] != '#' ||
(string[result.start] == '#' && string[result.start + 1] == '#')
) {
++result.start
}
}
}
else -> {
// Step over a whitespace character at the start of the match.
if (Character.isWhitespace(string.codePointAt(result.start))) {
++result.start
}
}
}
when (result.matchType) {
FoundMatchType.HTTP_URL, FoundMatchType.HTTPS_URL -> {
// Preliminary url patterns are fast/permissive, now we'll do full validation
if (STRICT_WEB_URL_PATTERN.matcher(string.substring(result.start, end)).matches()) {
result.end = end
}
}
else -> result.end = end
}
}
}
/**
 * Creates the span used to decorate a match of the given type.
 *
 * URLs and mentions get a span that carries the matched text (`string[start, end)`);
 * anything else is simply coloured with [colour].
 */
private fun getSpan(matchType: FoundMatchType, string: String, colour: Int, start: Int, end: Int): CharacterStyle =
    when (matchType) {
        FoundMatchType.HTTP_URL,
        FoundMatchType.HTTPS_URL,
        -> NoUnderlineURLSpan(string.substring(start, end))
        FoundMatchType.MENTION -> MentionSpan(string.substring(start, end))
        FoundMatchType.TAG -> ForegroundColorSpan(colour)
    }
/**
 * Returns true if [codePoint] is an ASCII digit, an ASCII letter, or an underscore.
 */
private fun isWordCharacters(codePoint: Int): Boolean = when (codePoint) {
    in 0x30..0x39 -> true // [0-9]
    in 0x41..0x5a -> true // [A-Z]
    0x5f -> true // _
    in 0x61..0x7a -> true // [a-z]
    else -> false
}
/**
 * Returns true if [codePoint] may appear immediately before a tag, i.e. it is not an ASCII
 * word character, a slash, or a closing parenthesis.
 */
private fun isValidForTagPrefix(codePoint: Int): Boolean =
    !isWordCharacters(codePoint) && // \w
        codePoint != 0x2f && // /
        codePoint != 0x29 // )

View File

@ -4,9 +4,11 @@ import app.pachli.core.testing.fakes.FakeSpannable
import app.pachli.util.highlightSpans
import org.junit.Assert
import org.junit.Test
import org.junit.experimental.runners.Enclosed
import org.junit.runner.RunWith
import org.junit.runners.Parameterized
@RunWith(Enclosed::class)
class SpanUtilsTest {
@Test
fun matchesMixedSpans() {
@ -19,8 +21,8 @@ class SpanUtilsTest {
@Test
fun doesntMergeAdjacentURLs() {
val firstURL = "http://first.thing"
val secondURL = "https://second.thing"
val firstURL = "http://first.bar"
val secondURL = "https://second.bar"
val inputSpannable = FakeSpannable("$firstURL $secondURL")
highlightSpans(inputSpannable, 0xffffff)
val spans = inputSpannable.spans
@ -71,14 +73,6 @@ class SpanUtilsTest {
Assert.assertTrue(spans.isEmpty())
}
@Test
fun doesNotMatchSpanEmbeddedInAnotherSpan() {
val inputSpannable = FakeSpannable("@aa${thingToHighlight}aa")
highlightSpans(inputSpannable, 0xffffff)
val spans = inputSpannable.spans
Assert.assertEquals(1, spans.size)
}
@Test
fun spansDoNotOverlap() {
val begin = "@begin"

View File

@ -288,7 +288,7 @@ class ComposeActivityTest {
@Test
fun whenTextContainsUrl_onlyEllipsizedURLIsCounted() {
val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM:"
val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM%3A"
val additionalContent = "Check out this @image #search result: "
rule.launch()
rule.getScenario().onActivity {
@ -303,7 +303,7 @@ class ComposeActivityTest {
@Test
fun whenTextContainsShortUrls_allUrlsGetEllipsized() {
val shortUrl = "https://pachli.app"
val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM:"
val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM%3A"
val additionalContent = " Check out this @image #search result: "
rule.launch()
rule.getScenario().onActivity {
@ -317,7 +317,7 @@ class ComposeActivityTest {
@Test
fun whenTextContainsMultipleURLs_allURLsGetEllipsized() {
val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM:"
val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM%3A"
val additionalContent = " Check out this @image #search result: "
rule.launch()
rule.getScenario().onActivity {
@ -331,7 +331,7 @@ class ComposeActivityTest {
@Test
fun whenTextContainsUrl_onlyEllipsizedURLIsCounted_withCustomConfiguration() {
val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM:"
val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM%3A"
val additionalContent = "Check out this @image #search result: "
val customUrlLength = 16
getInstanceCallback = { getInstanceWithCustomConfiguration(configuration = getCustomInstanceConfiguration(charactersReservedPerUrl = customUrlLength)) }
@ -348,7 +348,7 @@ class ComposeActivityTest {
@Test
fun whenTextContainsShortUrls_allUrlsGetEllipsized_withCustomConfiguration() {
val shortUrl = "https://pachli.app"
val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM:"
val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM%3A"
val additionalContent = " Check out this @image #search result: "
val customUrlLength = 18 // The intention is that this is longer than shortUrl.length
getInstanceCallback = { getInstanceWithCustomConfiguration(configuration = getCustomInstanceConfiguration(charactersReservedPerUrl = customUrlLength)) }
@ -364,7 +364,7 @@ class ComposeActivityTest {
@Test
fun whenTextContainsMultipleURLs_allURLsGetEllipsized_withCustomConfiguration() {
val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM:"
val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM%3A"
val additionalContent = " Check out this @image #search result: "
val customUrlLength = 16
getInstanceCallback = { getInstanceWithCustomConfiguration(configuration = getCustomInstanceConfiguration(charactersReservedPerUrl = customUrlLength)) }

View File

@ -40,9 +40,12 @@ class StatusLengthTest(
// "@user@server" should be treated as "@user"
arrayOf("123 @example@example.org", 12),
// URLs under 23 chars are treated as 23 chars
arrayOf("123 http://example.url", 27),
arrayOf("123 http://example.org", 27),
// URLs over 23 chars are treated as 23 chars
arrayOf("123 http://urlthatislongerthan23characters.example.org", 27),
// URLs end when they should (the ")." should be part of the status
// length, not considered to be part of the URL)
arrayOf("test (https://example.com). test", 36),
// Short hashtags are treated as is
arrayOf("123 #basictag", 13),
// Long hashtags are *also* treated as is (not treated as 23, like URLs)

View File

@ -0,0 +1,325 @@
// Copyright 2018 Twitter, Inc.
// Licensed under the Apache License, Version 2.0
// http://www.apache.org/licenses/LICENSE-2.0
package com.twitter.twittertext
import java.net.IDN
import java.util.regex.Matcher
/**
* A class to extract usernames, hashtags and URLs from Mastodon text.
*/
open class Extractor {
/**
 * A stretch of text recognised as a URL, hashtag, or mention.
 *
 * @property start offset in the source text at which the entity begins
 * @property end offset in the source text immediately after the entity
 * @property value the extracted text for the entity
 * @property type what kind of entity this is
 */
data class Entity(
    var start: Int,
    var end: Int,
    val value: String,
    val type: Type,
) {
    /**
     * Builds an entity from capture group [groupNumber] of [matcher].
     *
     * [startOffset] is added to the start of the group; with the default of -1 the entity
     * begins one character before the captured group (presumably so that a sigil preceding
     * the group is included in the range -- confirm against the patterns in use).
     */
    @JvmOverloads
    constructor(matcher: Matcher, type: Type, groupNumber: Int, startOffset: Int = -1) : this(
        start = matcher.start(groupNumber) + startOffset,
        end = matcher.end(groupNumber),
        value = matcher.group(groupNumber)!!,
        type = type,
    )

    /** The kinds of entity that can be extracted. */
    enum class Type {
        URL,
        HASHTAG,
        MENTION,
    }
}
/**
 * Whether URLs written without a protocol should be extracted. When false, URL matches
 * that have no protocol are skipped.
 */
var isExtractURLWithoutProtocol = true
/**
 * Sorts [entities] by start offset, in place, and drops every entity that begins before
 * the previously kept entity has ended.
 *
 * Two entities overlap only when one is a URL and the other is a hashtag/mention that is
 * part of that URL. Keeping the entity with the smaller start offset keeps the URL.
 */
private fun removeOverlappingEntities(entities: MutableList<Entity>) {
    entities.sortBy { it.start }

    val remaining = entities.iterator()
    if (!remaining.hasNext()) return

    var lastKept = remaining.next()
    for (candidate in remaining) {
        if (lastKept.end > candidate.start) {
            // Starts inside the entity we already kept.
            remaining.remove()
        } else {
            lastKept = candidate
        }
    }
}
/**
 * Extracts every URL, #hashtag and @mention from [text].
 *
 * @param text the text to examine
 * @return the extracted entities, sorted by start offset, with overlapping entities removed
 */
fun extractEntitiesWithIndices(text: String): List<Entity> {
    return buildList {
        addAll(extractURLsWithIndices(text))
        // URL overlap is resolved for all entity types at once, below.
        addAll(extractHashtagsWithIndices(text, checkUrlOverlap = false))
        addAll(extractMentionsOrListsWithIndices(text))
        removeOverlappingEntities(this)
    }
}
/**
 * Extracts @username mentions, each with an optional list reference, from [text].
 * A mention is an occurrence of @username anywhere in the text. A mention with a list is
 * a @username/list.
 *
 * @param text the text from which to extract mentions
 * @return the mentions found. Each entity's value is the username without the leading
 * @ sign, and its range starts one character before the username.
 */
private fun extractMentionsOrListsWithIndices(text: String): List<Entity> {
    // Performance optimization.
    // If text contains neither '@' nor the fullwidth '＠' (U+FF20) it cannot contain a
    // mention, so skip the regex entirely. This also covers empty text.
    if (text.none { it == '@' || it == '\uFF20' }) return emptyList()

    val extracted = mutableListOf<Entity>()
    val matcher: Matcher = Regex.VALID_MENTION_OR_LIST.matcher(text)
    while (matcher.find()) {
        // Reject the match if the text that follows it makes the mention invalid.
        val after = text.substring(matcher.end())
        if (Regex.INVALID_MENTION_MATCH_END.matcher(after).find()) continue

        val entity = if (matcher.group(Regex.VALID_MENTION_OR_LIST_GROUP_LIST) == null) {
            // Plain mention.
            Entity(
                matcher,
                Entity.Type.MENTION,
                Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME,
            )
        } else {
            // Mention with a list: the range runs from one character before the username
            // to the end of the list, but the value is still only the username.
            Entity(
                matcher.start(Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME) - 1,
                matcher.end(Regex.VALID_MENTION_OR_LIST_GROUP_LIST),
                matcher.group(Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME),
                Entity.Type.MENTION,
            )
        }
        extracted.add(entity)
    }
    return extracted
}
/**
 * Extracts URL references from [text].
 *
 * @param text the text from which to extract URLs; null or empty text yields no URLs
 * @return the URLs found
 */
private fun extractURLsWithIndices(text: String?): List<Entity> {
    if (text.isNullOrEmpty()) return emptyList()

    // Performance optimization.
    // A URL without a protocol needs at least a '.', and a URL with a protocol needs a ':'.
    // If the relevant character is absent the text cannot contain a URL.
    val requiredChar = if (isExtractURLWithoutProtocol) '.' else ':'
    if (requiredChar !in text) return emptyList()

    val found = mutableListOf<Entity>()
    val matcher: Matcher = Regex.VALID_URL.matcher(text)
    while (matcher.find()) {
        val protocol = matcher.group(Regex.VALID_URL_GROUP_PROTOCOL)

        // Skip if the protocol is not present and either protocol-less URLs are disabled
        // or the URL is preceded by an invalid character.
        val skip = protocol.isNullOrEmpty() &&
            (
                !isExtractURLWithoutProtocol ||
                    Regex.INVALID_URL_WITHOUT_PROTOCOL_MATCH_BEGIN
                        .matcher(matcher.group(Regex.VALID_URL_GROUP_BEFORE))
                        .matches()
                )
        if (skip) continue

        val url = matcher.group(Regex.VALID_URL_GROUP_URL)
        val host = matcher.group(Regex.VALID_URL_GROUP_DOMAIN)
        if (!isValidHostAndLength(url.length, protocol, host)) continue

        found.add(
            Entity(
                matcher.start(Regex.VALID_URL_GROUP_URL),
                matcher.end(Regex.VALID_URL_GROUP_URL),
                url,
                Entity.Type.URL,
            ),
        )
    }
    return found
}
/**
 * Extracts #hashtag references from [text].
 *
 * @param text the text from which to extract hashtags
 * @param checkUrlOverlap if true, check if extracted hashtags overlap URLs and
 * remove overlapping ones
 * @return the hashtags found. Each entity's value is the tag without the leading # sign,
 * and its range starts one character before the tag.
 */
private fun extractHashtagsWithIndices(text: String, checkUrlOverlap: Boolean): List<Entity> {
    // Performance optimization.
    // If text contains neither '#' nor the fullwidth '＃' (U+FF03) it cannot contain a
    // hashtag, so skip the regex entirely. This also covers empty text.
    if (text.none { it == '#' || it == '\uFF03' }) return emptyList()

    val extracted = mutableListOf<Entity>()
    val matcher: Matcher = Regex.VALID_HASHTAG.matcher(text)
    while (matcher.find()) {
        // Reject the match if the text that follows it makes the hashtag invalid.
        val after = text.substring(matcher.end())
        if (!Regex.INVALID_HASHTAG_MATCH_END.matcher(after).find()) {
            extracted.add(
                Entity(
                    matcher,
                    Entity.Type.HASHTAG,
                    Regex.VALID_HASHTAG_GROUP_TAG,
                ),
            )
        }
    }

    if (checkUrlOverlap) {
        val urls = extractURLsWithIndices(text)
        if (urls.isNotEmpty()) {
            // Mix the URLs in so the overlap removal can discard hashtags that sit inside
            // a URL, then take the URLs back out again.
            extracted.addAll(urls)
            removeOverlappingEntities(extracted)
            extracted.removeAll { it.type != Entity.Type.HASHTAG }
        }
    }
    return extracted
}
/**
 * Converts indices into [text] between code units (UTF-16 `Char` positions) and code points.
 *
 * The most recently converted pair of offsets is remembered, so a conversion only has to
 * count the characters between the previous position and the requested one.
 */
private class IndexConverter(val text: String) {
    // The remembered pair: `charIndex` code units correspond to `codePointIndex` code points.
    private var codePointIndex = 0
    private var charIndex = 0

    /**
     * Converts a code unit index to a code point index.
     *
     * @param charIndex Index into the string measured in code units.
     * @return The code point index that corresponds to the specified character index.
     */
    fun codeUnitsToCodePoints(charIndex: Int): Int {
        // Count only the code points between the remembered position and the new one.
        codePointIndex += if (charIndex < this.charIndex) {
            -text.codePointCount(charIndex, this.charIndex)
        } else {
            text.codePointCount(this.charIndex, charIndex)
        }

        // Never remember a position that points at the second code unit of a surrogate
        // pair; step back to the first unit instead.
        val splitsSurrogatePair =
            charIndex > 0 && Character.isSupplementaryCodePoint(text.codePointAt(charIndex - 1))
        this.charIndex = if (splitsSurrogatePair) charIndex - 1 else charIndex

        return codePointIndex
    }

    /**
     * Converts a code point index to a code unit index.
     *
     * @param codePointIndex Index into the string measured in code points.
     * @return the code unit index that corresponds to the specified code point index.
     */
    fun codePointsToCodeUnits(codePointIndex: Int): Int {
        // The distance may be negative; offsetByCodePoints accepts that.
        val distance = codePointIndex - this.codePointIndex
        charIndex = text.offsetByCodePoints(charIndex, distance)
        this.codePointIndex = codePointIndex
        return charIndex
    }
}
companion object {
    /**
     * The maximum url length that the Twitter backend supports.
     */
    const val MAX_URL_LENGTH = 4096

    /**
     * The length added to a URL that has no protocol before it is checked against
     * [MAX_URL_LENGTH].
     *
     * The backend adds http:// for normal links and https to *.twitter.com URLs
     * (it also rewrites http to https for URLs matching *.twitter.com), so assuming
     * https:// all the time is the safer choice. The trade off is that a http URL
     * that is exactly 4096 characters is disallowed.
     */
    private const val URL_GROUP_PROTOCOL_LENGTH = "https://".length

    /**
     * Verifies that the host name adheres to RFC 3490 and 1035, and that the entire url
     * (including protocol) doesn't exceed [MAX_URL_LENGTH].
     *
     * @param originalUrlLength The length of the entire URL, including protocol if any
     * @param protocol The protocol used, or null if the URL has none
     * @param originalHost The hostname to check validity of
     * @return true if the host is valid and the URL is not too long
     */
    fun isValidHostAndLength(
        originalUrlLength: Int,
        protocol: String?,
        originalHost: String?,
    ): Boolean {
        if (originalHost.isNullOrEmpty()) return false

        // Use IDN for all host names; an all-ASCII host is returned unchanged.
        // It comes with an added benefit of checking host length to be between 1 and 63
        // characters.
        val asciiHost: String = try {
            IDN.toASCII(originalHost, IDN.ALLOW_UNASSIGNED)
        } catch (e: IllegalArgumentException) {
            return false
        } catch (e: IndexOutOfBoundsException) {
            // toASCII can throw this when the domain name is longer than 256 characters,
            // instead of the documented IllegalArgumentException.
            return false
        }
        if (asciiHost.isEmpty()) return false

        // Punycode encoding may have changed the length of the host; apply the difference
        // to the length of the URL.
        val hostLengthDelta = asciiHost.length - originalHost.length

        // If there is no protocol, account for the one that will be added, to ensure the
        // URL doesn't go over the limit.
        val impliedProtocolLength = if (protocol == null) URL_GROUP_PROTOCOL_LENGTH else 0

        return originalUrlLength + hostLengthDelta + impliedProtocolLength <= MAX_URL_LENGTH
    }
}
}

View File

@ -0,0 +1,296 @@
// Copyright 2018 Twitter, Inc.
// Licensed under the Apache License, Version 2.0
// http://www.apache.org/licenses/LICENSE-2.0
package com.twitter.twittertext
import java.util.regex.Pattern
object Regex {
private val URL_VALID_GTLD = "(?:(?:" +
join(TldLists.GTLDS) +
")(?=[^a-z0-9@+-]|$))"
private val URL_VALID_CCTLD = "(?:(?:" +
join(TldLists.CTLDS) +
")(?=[^a-z0-9@+-]|$))"
private const val INVALID_CHARACTERS = "\\uFFFE" + // BOM
"\\uFEFF" + // BOM
"\\uFFFF" // Special
private const val DIRECTIONAL_CHARACTERS = "\\u061C" + // ARABIC LETTER MARK (ALM)
"\\u200E" + // LEFT-TO-RIGHT MARK (LRM)
"\\u200F" + // RIGHT-TO-LEFT MARK (RLM)
"\\u202A" + // LEFT-TO-RIGHT EMBEDDING (LRE)
"\\u202B" + // RIGHT-TO-LEFT EMBEDDING (RLE)
"\\u202C" + // POP DIRECTIONAL FORMATTING (PDF)
"\\u202D" + // LEFT-TO-RIGHT OVERRIDE (LRO)
"\\u202E" + // RIGHT-TO-LEFT OVERRIDE (RLO)
"\\u2066" + // LEFT-TO-RIGHT ISOLATE (LRI)
"\\u2067" + // RIGHT-TO-LEFT ISOLATE (RLI)
"\\u2068" + // FIRST STRONG ISOLATE (FSI)
"\\u2069" // POP DIRECTIONAL ISOLATE (PDI)
private const val UNICODE_SPACES = "[" +
"\\u0009-\\u000d" + // # White_Space # Cc [5] <control-0009>..<control-000D>
"\\u0020" + // White_Space # Zs SPACE
"\\u0085" + // White_Space # Cc <control-0085>
"\\u00a0" + // White_Space # Zs NO-BREAK SPACE
"\\u1680" + // White_Space # Zs OGHAM SPACE MARK
"\\u180E" + // White_Space # Zs MONGOLIAN VOWEL SEPARATOR
"\\u2000-\\u200a" + // # White_Space # Zs [11] EN QUAD..HAIR SPACE
"\\u2028" + // White_Space # Zl LINE SEPARATOR
"\\u2029" + // White_Space # Zp PARAGRAPH SEPARATOR
"\\u202F" + // White_Space # Zs NARROW NO-BREAK SPACE
"\\u205F" + // White_Space # Zs MEDIUM MATHEMATICAL SPACE
"\\u3000" + // White_Space # Zs IDEOGRAPHIC SPACE
"]"
private const val LATIN_ACCENTS_CHARS = // Latin-1
"\\u00c0-\\u00d6\\u00d8-\\u00f6\\u00f8-\\u00ff" + // Latin Extended A and B
"\\u0100-\\u024f" + // IPA Extensions
"\\u0253\\u0254\\u0256\\u0257\\u0259\\u025b\\u0263\\u0268\\u026f\\u0272\\u0289\\u028b" + // Hawaiian
"\\u02bb" + // Combining diacritics
"\\u0300-\\u036f" + // Latin Extended Additional (mostly for Vietnamese)
"\\u1e00-\\u1eff"
private const val CYRILLIC_CHARS = "\\u0400-\\u04ff"
// Generated from unicode_regex/unicode_regex_groups.scala, more inclusive than Java's \p{L}\p{M}
private const val HASHTAG_LETTERS_AND_MARKS = "\\p{L}\\p{M}" +
"\\u037f\\u0528-\\u052f\\u08a0-\\u08b2\\u08e4-\\u08ff\\u0978\\u0980\\u0c00\\u0c34\\u0c81" +
"\\u0d01\\u0ede\\u0edf\\u10c7\\u10cd\\u10fd-\\u10ff\\u16f1-\\u16f8\\u17b4\\u17b5\\u191d" +
"\\u191e\\u1ab0-\\u1abe\\u1bab-\\u1bad\\u1bba-\\u1bbf\\u1cf3-\\u1cf6\\u1cf8\\u1cf9" +
"\\u1de7-\\u1df5\\u2cf2\\u2cf3\\u2d27\\u2d2d\\u2d66\\u2d67\\u9fcc\\ua674-\\ua67b\\ua698" +
"-\\ua69d\\ua69f\\ua792-\\ua79f\\ua7aa-\\ua7ad\\ua7b0\\ua7b1\\ua7f7-\\ua7f9\\ua9e0-" +
"\\ua9ef\\ua9fa-\\ua9fe\\uaa7c-\\uaa7f\\uaae0-\\uaaef\\uaaf2-\\uaaf6\\uab30-\\uab5a" +
"\\uab5c-\\uab5f\\uab64\\uab65\\uf870-\\uf87f\\uf882\\uf884-\\uf89f\\uf8b8\\uf8c1-" +
"\\uf8d6\\ufa2e\\ufa2f\\ufe27-\\ufe2d\\ud800\\udee0\\ud800\\udf1f\\ud800\\udf50-\\ud800" +
"\\udf7a\\ud801\\udd00-\\ud801\\udd27\\ud801\\udd30-\\ud801\\udd63\\ud801\\ude00-\\ud801" +
"\\udf36\\ud801\\udf40-\\ud801\\udf55\\ud801\\udf60-\\ud801\\udf67\\ud802\\udc60-\\ud802" +
"\\udc76\\ud802\\udc80-\\ud802\\udc9e\\ud802\\udd80-\\ud802\\uddb7\\ud802\\uddbe\\ud802" +
"\\uddbf\\ud802\\ude80-\\ud802\\ude9c\\ud802\\udec0-\\ud802\\udec7\\ud802\\udec9-\\ud802" +
"\\udee6\\ud802\\udf80-\\ud802\\udf91\\ud804\\udc7f\\ud804\\udcd0-\\ud804\\udce8\\ud804" +
"\\udd00-\\ud804\\udd34\\ud804\\udd50-\\ud804\\udd73\\ud804\\udd76\\ud804\\udd80-\\ud804" +
"\\uddc4\\ud804\\uddda\\ud804\\ude00-\\ud804\\ude11\\ud804\\ude13-\\ud804\\ude37\\ud804" +
"\\udeb0-\\ud804\\udeea\\ud804\\udf01-\\ud804\\udf03\\ud804\\udf05-\\ud804\\udf0c\\ud804" +
"\\udf0f\\ud804\\udf10\\ud804\\udf13-\\ud804\\udf28\\ud804\\udf2a-\\ud804\\udf30\\ud804" +
"\\udf32\\ud804\\udf33\\ud804\\udf35-\\ud804\\udf39\\ud804\\udf3c-\\ud804\\udf44\\ud804" +
"\\udf47\\ud804\\udf48\\ud804\\udf4b-\\ud804\\udf4d\\ud804\\udf57\\ud804\\udf5d-\\ud804" +
"\\udf63\\ud804\\udf66-\\ud804\\udf6c\\ud804\\udf70-\\ud804\\udf74\\ud805\\udc80-\\ud805" +
"\\udcc5\\ud805\\udcc7\\ud805\\udd80-\\ud805\\uddb5\\ud805\\uddb8-\\ud805\\uddc0\\ud805" +
"\\ude00-\\ud805\\ude40\\ud805\\ude44\\ud805\\ude80-\\ud805\\udeb7\\ud806\\udca0-\\ud806" +
"\\udcdf\\ud806\\udcff\\ud806\\udec0-\\ud806\\udef8\\ud808\\udf6f-\\ud808\\udf98\\ud81a" +
"\\ude40-\\ud81a\\ude5e\\ud81a\\uded0-\\ud81a\\udeed\\ud81a\\udef0-\\ud81a\\udef4\\ud81a" +
"\\udf00-\\ud81a\\udf36\\ud81a\\udf40-\\ud81a\\udf43\\ud81a\\udf63-\\ud81a\\udf77\\ud81a" +
"\\udf7d-\\ud81a\\udf8f\\ud81b\\udf00-\\ud81b\\udf44\\ud81b\\udf50-\\ud81b\\udf7e\\ud81b" +
"\\udf8f-\\ud81b\\udf9f\\ud82f\\udc00-\\ud82f\\udc6a\\ud82f\\udc70-\\ud82f\\udc7c\\ud82f" +
"\\udc80-\\ud82f\\udc88\\ud82f\\udc90-\\ud82f\\udc99\\ud82f\\udc9d\\ud82f\\udc9e\\ud83a" +
"\\udc00-\\ud83a\\udcc4\\ud83a\\udcd0-\\ud83a\\udcd6\\ud83b\\ude00-\\ud83b\\ude03\\ud83b" +
"\\ude05-\\ud83b\\ude1f\\ud83b\\ude21\\ud83b\\ude22\\ud83b\\ude24\\ud83b\\ude27\\ud83b" +
"\\ude29-\\ud83b\\ude32\\ud83b\\ude34-\\ud83b\\ude37\\ud83b\\ude39\\ud83b\\ude3b\\ud83b" +
"\\ude42\\ud83b\\ude47\\ud83b\\ude49\\ud83b\\ude4b\\ud83b\\ude4d-\\ud83b\\ude4f\\ud83b" +
"\\ude51\\ud83b\\ude52\\ud83b\\ude54\\ud83b\\ude57\\ud83b\\ude59\\ud83b\\ude5b\\ud83b" +
"\\ude5d\\ud83b\\ude5f\\ud83b\\ude61\\ud83b\\ude62\\ud83b\\ude64\\ud83b\\ude67-\\ud83b" +
"\\ude6a\\ud83b\\ude6c-\\ud83b\\ude72\\ud83b\\ude74-\\ud83b\\ude77\\ud83b\\ude79-\\ud83b" +
"\\ude7c\\ud83b\\ude7e\\ud83b\\ude80-\\ud83b\\ude89\\ud83b\\ude8b-\\ud83b\\ude9b\\ud83b" +
"\\udea1-\\ud83b\\udea3\\ud83b\\udea5-\\ud83b\\udea9\\ud83b\\udeab-\\ud83b\\udebb"
// Generated from unicode_regex/unicode_regex_groups.scala, more inclusive than Java's \p{Nd}
//
// Body of a regex character class (no enclosing brackets) listing the numerals accepted in
// hashtags: `\p{Nd}` followed by extra `\uXXXX` ranges. Several of the extra ranges are
// spelled as pairs of `\ud8xx\udxxx` escapes (e.g. `\ud804\udcf0`).
private const val HASHTAG_NUMERALS = "\\p{Nd}" +
"\\u0de6-\\u0def\\ua9f0-\\ua9f9\\ud804\\udcf0-\\ud804\\udcf9\\ud804\\udd36-\\ud804" +
"\\udd3f\\ud804\\uddd0-\\ud804\\uddd9\\ud804\\udef0-\\ud804\\udef9\\ud805\\udcd0-\\ud805" +
"\\udcd9\\ud805\\ude50-\\ud805\\ude59\\ud805\\udec0-\\ud805\\udec9\\ud806\\udce0-\\ud806" +
"\\udce9\\ud81a\\ude60-\\ud81a\\ude69\\ud81a\\udf50-\\ud81a\\udf59"
// Non-alphanumeric characters that are still accepted inside a hashtag. Like the other
// HASHTAG_* fragments this is the body of a character class (no brackets of its own); every
// entry except the underscore is written as a regex `\uXXXX` escape.
private const val HASHTAG_SPECIAL_CHARS = "_" + // underscore
"\\u200c" + // ZERO WIDTH NON-JOINER (ZWNJ)
"\\u200d" + // ZERO WIDTH JOINER (ZWJ)
"\\ua67e" + // CYRILLIC KAVYKA
"\\u05be" + // HEBREW PUNCTUATION MAQAF
"\\u05f3" + // HEBREW PUNCTUATION GERESH
"\\u05f4" + // HEBREW PUNCTUATION GERSHAYIM
"\\uff5e" + // FULLWIDTH TILDE
"\\u301c" + // WAVE DASH
"\\u309b" + // KATAKANA-HIRAGANA VOICED SOUND MARK
"\\u309c" + // KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
"\\u30a0" + // KATAKANA-HIRAGANA DOUBLE HYPHEN
"\\u30fb" + // KATAKANA MIDDLE DOT
"\\u3003" + // DITTO MARK
"\\u0f0b" + // TIBETAN MARK INTERSYLLABIC TSHEG
"\\u0f0c" + // TIBETAN MARK DELIMITER TSHEG BSTAR
"\\u00b7" // MIDDLE DOT
// Character-class body combining every letter, mark, numeral and special character that may
// appear in a hashtag.
private const val HASHTAG_LETTERS_NUMERALS =
    "$HASHTAG_LETTERS_AND_MARKS$HASHTAG_NUMERALS$HASHTAG_SPECIAL_CHARS"

// The same bodies wrapped in brackets so they can be used directly as character classes.
private const val HASHTAG_LETTERS_SET = "[" + HASHTAG_LETTERS_AND_MARKS + "]"
private const val HASHTAG_LETTERS_NUMERALS_SET = "[" + HASHTAG_LETTERS_NUMERALS + "]"
/* URL related regex collection */

// What may come immediately before a URL: any character other than a-z, 0-9, '@', '$', '#'
// or an invalid character; or a directional character; or the start of the input.
private const val URL_VALID_PRECEDING_CHARS =
    "(?:[^a-z0-9@\$#" + INVALID_CHARACTERS + "]|[" + DIRECTIONAL_CHARACTERS + "]|^)"

// A single character allowed in a domain label.
private const val URL_VALID_CHARS = "[a-z0-9" + LATIN_ACCENTS_CHARS + "]"

// One sub-domain label plus its trailing dot, as an atomic group. The interior of the label
// may additionally contain '-' and '_'.
private const val URL_VALID_SUBDOMAIN =
    "(?>(?:" + URL_VALID_CHARS + "[" + URL_VALID_CHARS + "\\-_]*)?" + URL_VALID_CHARS + "\\.)"

// The domain label plus its trailing dot. Unlike a sub-domain, the interior may contain '-'
// but not '_'.
private const val URL_VALID_DOMAIN_NAME =
    "(?:(?:" + URL_VALID_CHARS + "[" + URL_VALID_CHARS + "\\-]*)?" + URL_VALID_CHARS + "\\.)"
// ASCII punctuation, written as the body of a character class.
private const val PUNCTUATION_CHARS = "-_!\"#$%&'\\(\\)*+,./:;<=>?@\\[\\]^`\\{|}~"

// One character that is neither punctuation nor space.
// \p{Z} = any kind of whitespace or invisible separator.
private const val URL_VALID_UNICODE_CHARS =
    "[^" + PUNCTUATION_CHARS + "\\s\\p{Z}\\p{InGeneralPunctuation}]"

// A domain label made of such characters (interior may also contain '-') plus its trailing dot.
private const val URL_VALID_UNICODE_DOMAIN_NAME =
    "(?:(?:$URL_VALID_UNICODE_CHARS[$URL_VALID_UNICODE_CHARS\\-]*)?$URL_VALID_UNICODE_CHARS\\.)"

// A punycode-encoded label: "xn--" followed by at least one of '-', 0-9, a-z.
private const val URL_PUNYCODE = "(?:xn--[-0-9a-z]+)"
// Domain part of a URL. Three alternatives:
private val URL_VALID_DOMAIN =
    // 1. optional sub-domain(s) + domain + TLD (gTLD, ccTLD or punycode),
    //    e.g. twitter.com, foo.co.jp ...
    "(?:$URL_VALID_SUBDOMAIN*$URL_VALID_DOMAIN_NAME(?:$URL_VALID_GTLD|$URL_VALID_CCTLD|$URL_PUNYCODE))" +
        // 2. only directly after "http://" or "https://": either domain + ccTLD,
        //    or unicode domain + (gTLD or ccTLD)
        "|(?:(?<=https?://)(?:" +
        "(?:$URL_VALID_DOMAIN_NAME$URL_VALID_CCTLD)" +
        "|(?:$URL_VALID_UNICODE_DOMAIN_NAME(?:$URL_VALID_GTLD|$URL_VALID_CCTLD))" +
        "))" +
        // 3. domain + ccTLD that is immediately followed by '/', e.g. t.co/
        "|(?:$URL_VALID_DOMAIN_NAME$URL_VALID_CCTLD(?=/))"
// One or more digits, matched possessively (no backtracking into the port number).
private const val URL_VALID_PORT_NUMBER = "[0-9]++"

// A single character allowed anywhere in a URL path: ASCII letters/digits, the listed
// punctuation (including U+2013), plus the Latin-accent and Cyrillic character sets.
private const val URL_VALID_GENERAL_PATH_CHARS =
    "[a-z0-9!\\*';:=\\+,.\\$/%#\\[\\]\\-\\u2013_~\\|&@$LATIN_ACCENTS_CHARS$CYRILLIC_CHARS]"
/**
 * A parenthesised path segment: "(" ... ")" whose content is either plain path characters, or
 * path characters surrounding one further balanced "(...)" pair. That gives at most two levels
 * of parentheses in total. Seen in:
 * 1. Wikipedia URLs like /Primer_(film)
 * 2. IIS sessions like /S(dfd346)/
 * 3. Rdio URLs like /track/We_Up_(Album_Version_(Edited))/
 */
private const val URL_BALANCED_PARENS =
    "\\((?:" +
        // no nesting: just path characters
        "$URL_VALID_GENERAL_PATH_CHARS+" +
        // or one nested level of balanced parentheses
        "|(?:$URL_VALID_GENERAL_PATH_CHARS*\\($URL_VALID_GENERAL_PATH_CHARS+\\)$URL_VALID_GENERAL_PATH_CHARS*)" +
        ")\\)"
/**
 * What a URL path may end with: one of a restricted set of characters, or a balanced
 * parentheses group. Restricting the final character means that e.g. the period in "/foo."
 * is not swallowed into the URL, while '=' and '#' stay permitted so that empty URL
 * parameters and other URL-join artifacts still match.
 */
private const val URL_VALID_PATH_ENDING_CHARS =
    "[a-z0-9=_#/\\-\\+$LATIN_ACCENTS_CHARS$CYRILLIC_CHARS]|(?:$URL_BALANCED_PARENS)"
// One chunk of a URL path. Either:
//  - path characters, optionally interleaved with balanced-parentheses groups, finishing on a
//    valid path-ending character; or
//  - an '@' followed by path characters and a closing '/'.
private const val URL_VALID_PATH =
    "(?:" +
        "(?:$URL_VALID_GENERAL_PATH_CHARS*(?:$URL_BALANCED_PARENS$URL_VALID_GENERAL_PATH_CHARS*)*$URL_VALID_PATH_ENDING_CHARS)" +
        "|(?:@$URL_VALID_GENERAL_PATH_CHARS+/)" +
        ")"
// A single character allowed inside a URL query string (the part following '?').
private const val URL_VALID_URL_QUERY_CHARS =
"[a-z0-9!?\\*'\\(\\);:&=\\+\\$/%#\\[\\]\\-_\\.,~\\|@]"
// The character a query string must end with. This set is narrower than the one above and
// omits punctuation such as '.', ',' and '!', so those cannot be the last character matched.
private const val URL_VALID_URL_QUERY_ENDING_CHARS = "[a-z0-9\\-_&=#/]"
// Source text of the VALID_URL pattern. The capture-group numbers noted below are the ones
// exposed through the VALID_URL_GROUP_* constants.
private val VALID_URL_PATTERN_STRING = buildString {
    append("(") // $1 total match
    append("($URL_VALID_PRECEDING_CHARS)") // $2 Preceding character
    append("(") // $3 URL
    append("(https?://)?") // $4 Protocol (optional)
    append("($URL_VALID_DOMAIN)") // $5 Domain(s)
    append("(?::($URL_VALID_PORT_NUMBER))?") // $6 Port number (optional)
    append("(/$URL_VALID_PATH*+)?") // $7 URL Path and anchor
    append("(\\?$URL_VALID_URL_QUERY_CHARS*$URL_VALID_URL_QUERY_ENDING_CHARS)?") // $8 Query String
    append(")") // end of $3
    append(")") // end of $1
}
// The characters treated as an "at sign": '@' and U+FF20.
private const val AT_SIGNS_CHARS = "@\uFF20"
/* Begin public constants */
// All Pattern properties below are declared without an initializer and are assigned exactly
// once, in the init block further down.

// Matches any text that contains at least one of INVALID_CHARACTERS.
private val INVALID_CHARACTERS_PATTERN: Pattern
// Matches a '#' followed by a Mastodon-style hashtag name (HASHTAG_NAME_PAT); the '#' must not
// directly follow '=', '/', ')' or an ASCII letter, digit or underscore.
val VALID_HASHTAG: Pattern
// Capture group of VALID_HASHTAG holding the tag name, without the leading '#'.
const val VALID_HASHTAG_GROUP_TAG = 1
// Matches text that starts with '#' or "://".
// NOTE(review): presumably applied to the text following a hashtag match to reject it — confirm at call sites.
val INVALID_HASHTAG_MATCH_END: Pattern
// Matches a single character from a fixed set of right-to-left script ranges.
private val RTL_CHARACTERS: Pattern
// Matches a single at-sign character (see AT_SIGNS_CHARS).
private val AT_SIGNS: Pattern
// Matches a mention. Groups: 1 = preceding text, 2 = the at sign(s), 3 = username,
// 4 = optional at-sign-prefixed suffix.
val VALID_MENTION_OR_LIST: Pattern
// Capture group of VALID_MENTION_OR_LIST holding the username.
const val VALID_MENTION_OR_LIST_GROUP_USERNAME = 3
// Capture group of VALID_MENTION_OR_LIST holding the optional suffix that starts with an at sign.
// NOTE(review): presumably the "@domain" part of a Mastodon mention; the "LIST" name looks
// inherited from twitter-text — confirm.
const val VALID_MENTION_OR_LIST_GROUP_LIST = 4
// Matches optional leading spaces / directional characters, an at sign, then 1-20 username characters.
private val VALID_REPLY: Pattern
// Matches text that starts with an at sign, a Latin accented character, or "://".
// NOTE(review): presumably applied to the text following a mention match to reject it — confirm at call sites.
val INVALID_MENTION_MATCH_END: Pattern
/**
 * Regex to extract URL (it also includes the text preceding the url).
 *
 * This regex does not reflect its name and [Regex.VALID_URL_GROUP_URL] match
 * should be checked in order to match a valid url. This is not ideal, but the behavior is
 * being kept to ensure backwards compatibility. Ideally this regex should be
 * implemented with a negative lookbehind as opposed to a negated character class
 * but lack of JS support increases maint overhead if the logic is different by
 * platform.
 *
 * Compiled case-insensitively from VALID_URL_PATTERN_STRING in the init block.
 */
val VALID_URL: Pattern
// Capture-group indices into VALID_URL; they mirror the "$n" comments on VALID_URL_PATTERN_STRING.
// Group 2: the character (or start of input) preceding the URL.
const val VALID_URL_GROUP_BEFORE = 2
// Group 3: the URL itself.
const val VALID_URL_GROUP_URL = 3
// Group 4: the optional "http://" or "https://" prefix.
const val VALID_URL_GROUP_PROTOCOL = 4
// Group 5: the domain.
const val VALID_URL_GROUP_DOMAIN = 5
// Matches text whose last character is one of '-', '_', '.', '/'.
// NOTE(review): presumably tested against the text preceding a protocol-less URL — confirm at call sites.
val INVALID_URL_WITHOUT_PROTOCOL_MATCH_BEGIN: Pattern
// URL_VALID_DOMAIN compiled on its own, case-insensitively.
private val VALID_DOMAIN: Pattern
// Mastodon hashtag regular expressions. These differ from the Twitter ones; in particular,
// a hashtag can be preceded by e.g. Hiragana characters without an intervening space.
// See HASHTAG_RE in https://github.com/mastodon/mastodon/blob/main/app/models/tag.rb
// (which is VALID_HASHTAG in this file).
//
// NOTE(review): VALID_HASHTAG is compiled with CASE_INSENSITIVE only. On a stock JVM `\w` and
// `\p{Alpha}` are ASCII-only unless Pattern.UNICODE_CHARACTER_CLASS is set, whereas Android
// documents them as always Unicode-aware — confirm JVM unit tests and devices agree.

// Separator characters allowed inside (but not at the start of) a hashtag name.
private const val HASHTAG_SEPARATORS = "_\u00B7\u30FB\u200c"

// First alternative, in two chunks: a word character, any run of word/separator characters,
// one alphabetic-or-separator character, ...
private const val HASHTAG_FIRST_SEQUENCE_CHUNK_ONE = """[\w_][\w$HASHTAG_SEPARATORS]*[\p{Alpha}$HASHTAG_SEPARATORS]"""

// ... then any run of word/separator characters ending on a word character.
private const val HASHTAG_FIRST_SEQUENCE_CHUNK_TWO = """[\w$HASHTAG_SEPARATORS]*[\w_]"""
private const val HASHTAG_FIRST_SEQUENCE =
    "(" + HASHTAG_FIRST_SEQUENCE_CHUNK_ONE + HASHTAG_FIRST_SEQUENCE_CHUNK_TWO + ")"

// Second alternative: word characters containing at least one letter, with no separators
// other than '_'.
private const val HASHTAG_LAST_SEQUENCE = """([\w_]*[\p{L}][\w_]*)"""

// A hashtag name (without the leading '#'): either alternative above.
private const val HASHTAG_NAME_PAT: String = HASHTAG_FIRST_SEQUENCE + "|" + HASHTAG_LAST_SEQUENCE
// All patterns are compiled here, inside a single synchronized block, because
// there appears to be thread safety issues with Pattern.compile in android
init {
    synchronized(Regex::class.java) {
        val caseInsensitive = Pattern.CASE_INSENSITIVE
        // Character class matching a single at sign; also the source text of AT_SIGNS.
        val atSignClass = "[" + AT_SIGNS_CHARS + "]"

        INVALID_CHARACTERS_PATTERN = Pattern.compile(".*[" + INVALID_CHARACTERS + "].*")

        // Hashtags
        VALID_HASHTAG = Pattern.compile(
            "(?<![=/)a-zA-Z0-9_])[#](" + HASHTAG_NAME_PAT + ")",
            caseInsensitive,
        )
        INVALID_HASHTAG_MATCH_END = Pattern.compile("^(?:[#]|://)")

        RTL_CHARACTERS = Pattern.compile("[\u0600-\u06FF\u0750-\u077F\u0590-\u05FF\uFE70-\uFEFF]")

        // Mentions
        AT_SIGNS = Pattern.compile(atSignClass)
        VALID_MENTION_OR_LIST = Pattern.compile(
            // group 1: what precedes the mention
            "([^a-z0-9_!#\$%&*=" + AT_SIGNS_CHARS + "]|^|(?:^|[^a-z0-9_+~.-])RT:?)" +
                // group 2: the at sign(s)
                "(" + atSignClass + "+)" +
                // group 3: username
                "([a-z0-9_]+)" +
                // group 4: optional suffix introduced by a further at sign
                "(" + atSignClass + "[a-z][a-z0-9_\\-.]+)?",
            caseInsensitive,
        )
        // NOTE(review): DIRECTIONAL_CHARACTERS is wrapped in [...] where URL_VALID_PRECEDING_CHARS
        // uses it but is a bare alternative here; if it is a plain list of characters this branch
        // only matches the whole list in sequence — confirm that is intended.
        VALID_REPLY = Pattern.compile(
            "^(?:$UNICODE_SPACES|$DIRECTIONAL_CHARACTERS)*$atSignClass([a-z0-9_]{1,20})",
            caseInsensitive,
        )
        INVALID_MENTION_MATCH_END =
            Pattern.compile("^(?:[" + AT_SIGNS_CHARS + LATIN_ACCENTS_CHARS + "]|://)")

        // URLs
        INVALID_URL_WITHOUT_PROTOCOL_MATCH_BEGIN = Pattern.compile("[-_./]$")
        VALID_URL = Pattern.compile(VALID_URL_PATTERN_STRING, caseInsensitive)
        VALID_DOMAIN = Pattern.compile(URL_VALID_DOMAIN, caseInsensitive)
    }
}
/**
 * Joins the string form of every element of [col] into a single regex alternation,
 * i.e. the elements separated by `|`.
 *
 * @param col the elements to join; each one is converted with `toString()`, so a `null`
 *   element contributes the text "null".
 * @return the joined string, or an empty string when [col] is empty.
 */
private fun join(col: Collection<*>): String =
    // The explicit transform keeps the original semantics of calling toString() on every element.
    col.joinToString(separator = "|") { it.toString() }
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,17 @@
{
"uniqueId": "com.twitter:twittertext",
"developers": [
{
"name": "Twitter",
"organisationUrl": "https://www.twitter.com"
}
],
"artifactVersion": "1.14.3",
"description": "Libraries and conformance tests to standardize parsing of Tweet text.",
"name": "Twitter text parsing",
"tag": "text",
"licenses": [
"Apache_2_0"
],
"website": "https://github.com/twitter/twitter-text"
}