fix: Improve URL / tag / mention extraction when composing (#564)
Previous code used custom regular expressions to extract URLs, hashtags, and mentions from the text while the user was writing a post. These were inconsistent with the ones Mastodon uses, so the derived character count could be wrong. As well as displaying an incorrect count, this could prevent the user from posting a status that was within the length limit, or allow them to attempt to post a status that was over the limit (which would then fail).

Fix this by dropping the homegrown regular expressions and using the same text parsing library that Mastodon uses: twitter-text. The library has been converted to Kotlin and the Twitter-specific functionality has been removed. The hashtag handling has been adjusted, as Mastodon is more permissive than Twitter about where hashtags can appear; in particular, a hashtag does not need to be preceded by whitespace if it appears after text in some scripts, such as Hiragana.
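To make the mechanism concrete, the following is a minimal sketch (not code from this change) of how the entities returned by the converted Extractor can drive a Mastodon-style character count. It assumes the usual Mastodon rules: every URL counts as a fixed number of characters (23 by default) and a remote mention such as @user@example.org is counted as @user only. The function name mastodonLength and the reservedCharsPerUrl parameter are illustrative, not part of the change.

import com.twitter.twittertext.Extractor

// Illustrative sketch only, assuming Mastodon's counting rules: URLs count as a
// fixed reserved length, mentions count as "@user" without the "@domain" suffix,
// and hashtags count as written.
private val extractor = Extractor().apply { isExtractURLWithoutProtocol = false }

fun mastodonLength(text: String, reservedCharsPerUrl: Int = 23): Int {
    var length = text.length
    for (entity in extractor.extractEntitiesWithIndices(text)) {
        val spanLength = entity.end - entity.start
        when (entity.type) {
            // Count the URL as the reserved length instead of its real length.
            Extractor.Entity.Type.URL -> length += reservedCharsPerUrl - spanLength
            // entity.value is the local "user" part; "+ 1" accounts for the leading "@".
            Extractor.Entity.Type.MENTION -> length += (entity.value.length + 1) - spanLength
            // Hashtags are counted exactly as written.
            Extractor.Entity.Type.HASHTAG -> Unit
        }
    }
    return length
}

Under those assumptions mastodonLength("123 @example@example.org") is 12 and mastodonLength("123 http://example.org") is 27, which matches the expectations in StatusLengthTest below.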
This commit is contained in:
parent 62fd47e862
commit 2a4126a542

@ -1,68 +1,33 @@
|
|||
/*
|
||||
* Copyright 2024 Pachli Association
|
||||
*
|
||||
* This file is a part of Pachli.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it under the terms of the
|
||||
* GNU General Public License as published by the Free Software Foundation; either version 3 of the
|
||||
* License, or (at your option) any later version.
|
||||
*
|
||||
* Pachli is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
|
||||
* the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
|
||||
* Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with Pachli; if not,
|
||||
* see <http://www.gnu.org/licenses>.
|
||||
*/
|
||||
|
||||
package app.pachli.util
|
||||
|
||||
import android.text.Spannable
|
||||
import android.text.Spanned
|
||||
import android.text.style.CharacterStyle
|
||||
import android.text.style.ForegroundColorSpan
|
||||
import android.text.style.URLSpan
|
||||
import app.pachli.core.ui.MentionSpan
|
||||
import app.pachli.core.ui.NoUnderlineURLSpan
|
||||
import java.util.regex.Pattern
|
||||
import kotlin.math.max
|
||||
|
||||
/**
|
||||
* @see <a href="https://github.com/tootsuite/mastodon/blob/master/app/models/tag.rb">
|
||||
* Tag#HASHTAG_RE</a>.
|
||||
*/
|
||||
private const val HASHTAG_SEPARATORS = "_\\u00B7\\u200c"
|
||||
private const val UNICODE_WORD = "\\p{L}\\p{Mn}\\p{Nd}\\p{Nl}\\p{Pc}" // Ugh, java ( https://stackoverflow.com/questions/4304928/unicode-equivalents-for-w-and-b-in-java-regular-expressions )
|
||||
private const val TAG_REGEX = "(?:^|[^/)\\w])#(([${UNICODE_WORD}_][$UNICODE_WORD$HASHTAG_SEPARATORS]*[\\p{Alpha}$HASHTAG_SEPARATORS][$UNICODE_WORD$HASHTAG_SEPARATORS]*[${UNICODE_WORD}_])|([${UNICODE_WORD}_]*[\\p{Alpha}][${UNICODE_WORD}_]*))"
|
||||
|
||||
/**
|
||||
* @see <a href="https://github.com/tootsuite/mastodon/blob/master/app/models/account.rb">
|
||||
* Account#MENTION_RE</a>
|
||||
*/
|
||||
private const val USERNAME_REGEX = "[\\w]+([\\w\\.-]+[\\w]+)?"
|
||||
private const val MENTION_REGEX = "(?<=^|[^\\/$UNICODE_WORD])@(($USERNAME_REGEX)(?:@[$UNICODE_WORD\\.\\-]+[$UNICODE_WORD]+)?)"
|
||||
|
||||
private const val HTTP_URL_REGEX = "(?:(^|\\b)http://[^\\s]+)"
|
||||
private const val HTTPS_URL_REGEX = "(?:(^|\\b)https://[^\\s]+)"
|
||||
|
||||
/**
|
||||
* Dump of android.util.Patterns.WEB_URL
|
||||
*/
|
||||
private val STRICT_WEB_URL_PATTERN = Pattern.compile("(((?:(?i:http|https|rtsp)://(?:(?:[a-zA-Z0-9\\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?(?:(([a-zA-Z0-9[ -\uD7FF豈-\uFDCFﷰ-\uFFEF\uD800\uDC00-\uD83F\uDFFD\uD840\uDC00-\uD87F\uDFFD\uD880\uDC00-\uD8BF\uDFFD\uD8C0\uDC00-\uD8FF\uDFFD\uD900\uDC00-\uD93F\uDFFD\uD940\uDC00-\uD97F\uDFFD\uD980\uDC00-\uD9BF\uDFFD\uD9C0\uDC00-\uD9FF\uDFFD\uDA00\uDC00-\uDA3F\uDFFD\uDA40\uDC00-\uDA7F\uDFFD\uDA80\uDC00-\uDABF\uDFFD\uDAC0\uDC00-\uDAFF\uDFFD\uDB00\uDC00-\uDB3F\uDFFD\uDB44\uDC00-\uDB7F\uDFFD&&[^ [ - ]\u2028\u2029 ]]](?:[a-zA-Z0-9[ -\uD7FF豈-\uFDCFﷰ-\uFFEF\uD800\uDC00-\uD83F\uDFFD\uD840\uDC00-\uD87F\uDFFD\uD880\uDC00-\uD8BF\uDFFD\uD8C0\uDC00-\uD8FF\uDFFD\uD900\uDC00-\uD93F\uDFFD\uD940\uDC00-\uD97F\uDFFD\uD980\uDC00-\uD9BF\uDFFD\uD9C0\uDC00-\uD9FF\uDFFD\uDA00\uDC00-\uDA3F\uDFFD\uDA40\uDC00-\uDA7F\uDFFD\uDA80\uDC00-\uDABF\uDFFD\uDAC0\uDC00-\uDAFF\uDFFD\uDB00\uDC00-\uDB3F\uDFFD\uDB44\uDC00-\uDB7F\uDFFD&&[^ [ - ]\u2028\u2029 ]]_\\-]{0,61}[a-zA-Z0-9[ -\uD7FF豈-\uFDCFﷰ-\uFFEF\uD800\uDC00-\uD83F\uDFFD\uD840\uDC00-\uD87F\uDFFD\uD880\uDC00-\uD8BF\uDFFD\uD8C0\uDC00-\uD8FF\uDFFD\uD900\uDC00-\uD93F\uDFFD\uD940\uDC00-\uD97F\uDFFD\uD980\uDC00-\uD9BF\uDFFD\uD9C0\uDC00-\uD9FF\uDFFD\uDA00\uDC00-\uDA3F\uDFFD\uDA40\uDC00-\uDA7F\uDFFD\uDA80\uDC00-\uDABF\uDFFD\uDAC0\uDC00-\uDAFF\uDFFD\uDB00\uDC00-\uDB3F\uDFFD\uDB44\uDC00-\uDB7F\uDFFD&&[^ [ - ]\u2028\u2029 ]]]){0,1}\\.)+(xn\\-\\-[\\w\\-]{0,58}\\w|[a-zA-Z[ -\uD7FF豈-\uFDCFﷰ-\uFFEF\uD800\uDC00-\uD83F\uDFFD\uD840\uDC00-\uD87F\uDFFD\uD880\uDC00-\uD8BF\uDFFD\uD8C0\uDC00-\uD8FF\uDFFD\uD900\uDC00-\uD93F\uDFFD\uD940\uDC00-\uD97F\uDFFD\uD980\uDC00-\uD9BF\uDFFD\uD9C0\uDC00-\uD9FF\uDFFD\uDA00\uDC00-\uDA3F\uDFFD\uDA40\uDC00-\uDA7F\uDFFD\uDA80\uDC00-\uDABF\uDFFD\uDAC0\uDC00-\uDAFF\uDFFD\uDB00\uDC00-\uDB3F\uDFFD\uDB44\uDC00-\uDB7F\uDFFD&&[^ [ - ]\u2028\u2029 ]]]{2,63})|((25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[0-9]))))(?:\\:\\d{1,5})?)([/\\?](?:(?:[a-zA-Z0-9[ -\uD7FF豈-\uFDCFﷰ-\uFFEF\uD800\uDC00-\uD83F\uDFFD\uD840\uDC00-\uD87F\uDFFD\uD880\uDC00-\uD8BF\uDFFD\uD8C0\uDC00-\uD8FF\uDFFD\uD900\uDC00-\uD93F\uDFFD\uD940\uDC00-\uD97F\uDFFD\uD980\uDC00-\uD9BF\uDFFD\uD9C0\uDC00-\uD9FF\uDFFD\uDA00\uDC00-\uDA3F\uDFFD\uDA40\uDC00-\uDA7F\uDFFD\uDA80\uDC00-\uDABF\uDFFD\uDAC0\uDC00-\uDAFF\uDFFD\uDB00\uDC00-\uDB3F\uDFFD\uDB44\uDC00-\uDB7F\uDFFD&&[^ [ - ]\u2028\u2029 ]];/\\?:@&=#~\\-\\.\\+!\\*'\\(\\),_\\\$])|(?:%[a-fA-F0-9]{2}))*)?(?:\\b|\$|^))")
|
||||
import com.twitter.twittertext.Extractor
|
||||
|
||||
private val spanClasses = listOf(ForegroundColorSpan::class.java, URLSpan::class.java)
|
||||
private val finders = mapOf(
|
||||
FoundMatchType.HTTP_URL to PatternFinder(':', HTTP_URL_REGEX, 5, Character::isWhitespace),
|
||||
FoundMatchType.HTTPS_URL to PatternFinder(':', HTTPS_URL_REGEX, 6, Character::isWhitespace),
|
||||
FoundMatchType.TAG to PatternFinder('#', TAG_REGEX, 1, ::isValidForTagPrefix),
|
||||
// TODO: We also need a proper validator for mentions
|
||||
FoundMatchType.MENTION to PatternFinder('@', MENTION_REGEX, 1, Character::isWhitespace),
|
||||
)
|
||||
|
||||
private enum class FoundMatchType {
|
||||
HTTP_URL,
|
||||
HTTPS_URL,
|
||||
TAG,
|
||||
MENTION,
|
||||
}
|
||||
|
||||
private class FindCharsResult {
|
||||
lateinit var matchType: FoundMatchType
|
||||
var start: Int = -1
|
||||
var end: Int = -1
|
||||
}
|
||||
|
||||
private class PatternFinder(
|
||||
val searchCharacter: Char,
|
||||
regex: String,
|
||||
val searchPrefixWidth: Int,
|
||||
val prefixValidator: (Int) -> Boolean,
|
||||
) {
|
||||
val pattern: Pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE)
|
||||
}
|
||||
private val extractor = Extractor().apply { isExtractURLWithoutProtocol = false }
|
||||
|
||||
/**
|
||||
* Takes text containing mentions and hashtags and urls and makes them the given colour.
|
||||
|
@ -75,18 +40,16 @@ fun highlightSpans(text: Spannable, colour: Int) {
|
|||
|
||||
// Colour the mentions and hashtags.
|
||||
val string = text.toString()
|
||||
val length = text.length
|
||||
var start = 0
|
||||
var end = 0
|
||||
while (end in 0 until length && start >= 0) {
|
||||
// Search for url first because it can contain the other characters
|
||||
val found = findPattern(string, end)
|
||||
start = found.start
|
||||
end = found.end
|
||||
if (start in 0 until end) {
|
||||
text.setSpan(getSpan(found.matchType, string, colour, start, end), start, end, Spanned.SPAN_INCLUSIVE_EXCLUSIVE)
|
||||
start += finders[found.matchType]!!.searchPrefixWidth
|
||||
|
||||
val entities = extractor.extractEntitiesWithIndices(string)
|
||||
|
||||
for (entity in entities) {
|
||||
val span = when (entity.type) {
|
||||
Extractor.Entity.Type.URL -> NoUnderlineURLSpan(string.substring(entity.start, entity.end))
|
||||
Extractor.Entity.Type.HASHTAG -> ForegroundColorSpan(colour)
|
||||
Extractor.Entity.Type.MENTION -> MentionSpan(string.substring(entity.start, entity.end))
|
||||
}
|
||||
text.setSpan(span, entity.start, entity.end, Spanned.SPAN_INCLUSIVE_EXCLUSIVE)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -95,87 +58,3 @@ private fun <T> clearSpans(text: Spannable, spanClass: Class<T>) {
|
|||
text.removeSpan(span)
|
||||
}
|
||||
}
|
||||
|
||||
private fun findPattern(string: String, fromIndex: Int): FindCharsResult {
|
||||
val result = FindCharsResult()
|
||||
for (i in fromIndex..string.lastIndex) {
|
||||
val c = string[i]
|
||||
for (matchType in FoundMatchType.entries) {
|
||||
val finder = finders[matchType]
|
||||
if (finder!!.searchCharacter == c &&
|
||||
(
|
||||
(i - fromIndex) < finder.searchPrefixWidth ||
|
||||
finder.prefixValidator(string.codePointAt(i - finder.searchPrefixWidth))
|
||||
)
|
||||
) {
|
||||
result.matchType = matchType
|
||||
result.start = max(0, i - finder.searchPrefixWidth)
|
||||
findEndOfPattern(string, result, finder.pattern)
|
||||
if (result.start + finder.searchPrefixWidth <= i + 1 && // The found result is actually triggered by the correct search character
|
||||
result.end >= result.start
|
||||
) { // ...and we actually found a valid result
|
||||
return result
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
private fun findEndOfPattern(string: String, result: FindCharsResult, pattern: Pattern) {
|
||||
val matcher = pattern.matcher(string)
|
||||
if (matcher.find(result.start)) {
|
||||
// Once we have API level 26+, we can use named captures...
|
||||
val end = matcher.end()
|
||||
result.start = matcher.start()
|
||||
when (result.matchType) {
|
||||
FoundMatchType.TAG -> {
|
||||
if (isValidForTagPrefix(string.codePointAt(result.start))) {
|
||||
if (string[result.start] != '#' ||
|
||||
(string[result.start] == '#' && string[result.start + 1] == '#')
|
||||
) {
|
||||
++result.start
|
||||
}
|
||||
}
|
||||
}
|
||||
else -> {
|
||||
if (Character.isWhitespace(string.codePointAt(result.start))) {
|
||||
++result.start
|
||||
}
|
||||
}
|
||||
}
|
||||
when (result.matchType) {
|
||||
FoundMatchType.HTTP_URL, FoundMatchType.HTTPS_URL -> {
|
||||
// Preliminary url patterns are fast/permissive, now we'll do full validation
|
||||
if (STRICT_WEB_URL_PATTERN.matcher(string.substring(result.start, end)).matches()) {
|
||||
result.end = end
|
||||
}
|
||||
}
|
||||
else -> result.end = end
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private fun getSpan(matchType: FoundMatchType, string: String, colour: Int, start: Int, end: Int): CharacterStyle {
|
||||
return when (matchType) {
|
||||
FoundMatchType.HTTP_URL -> NoUnderlineURLSpan(string.substring(start, end))
|
||||
FoundMatchType.HTTPS_URL -> NoUnderlineURLSpan(string.substring(start, end))
|
||||
FoundMatchType.MENTION -> MentionSpan(string.substring(start, end))
|
||||
else -> ForegroundColorSpan(colour)
|
||||
}
|
||||
}
|
||||
|
||||
private fun isWordCharacters(codePoint: Int): Boolean {
|
||||
return (codePoint in 0x30..0x39) || // [0-9]
|
||||
(codePoint in 0x41..0x5a) || // [A-Z]
|
||||
(codePoint == 0x5f) || // _
|
||||
(codePoint in 0x61..0x7a) // [a-z]
|
||||
}
|
||||
|
||||
private fun isValidForTagPrefix(codePoint: Int): Boolean {
|
||||
return !(
|
||||
isWordCharacters(codePoint) || // \w
|
||||
(codePoint == 0x2f) || // /
|
||||
(codePoint == 0x29)
|
||||
) // )
|
||||
}
|
||||
|
|
|
@ -4,9 +4,11 @@ import app.pachli.core.testing.fakes.FakeSpannable
|
|||
import app.pachli.util.highlightSpans
|
||||
import org.junit.Assert
|
||||
import org.junit.Test
|
||||
import org.junit.experimental.runners.Enclosed
|
||||
import org.junit.runner.RunWith
|
||||
import org.junit.runners.Parameterized
|
||||
|
||||
@RunWith(Enclosed::class)
|
||||
class SpanUtilsTest {
|
||||
@Test
|
||||
fun matchesMixedSpans() {
|
||||
|
@ -19,8 +21,8 @@ class SpanUtilsTest {
|
|||
|
||||
@Test
|
||||
fun doesntMergeAdjacentURLs() {
|
||||
val firstURL = "http://first.thing"
|
||||
val secondURL = "https://second.thing"
|
||||
val firstURL = "http://first.bar"
|
||||
val secondURL = "https://second.bar"
|
||||
val inputSpannable = FakeSpannable("$firstURL $secondURL")
|
||||
highlightSpans(inputSpannable, 0xffffff)
|
||||
val spans = inputSpannable.spans
|
||||
|
@ -71,14 +73,6 @@ class SpanUtilsTest {
|
|||
Assert.assertTrue(spans.isEmpty())
|
||||
}
|
||||
|
||||
@Test
|
||||
fun doesNotMatchSpanEmbeddedInAnotherSpan() {
|
||||
val inputSpannable = FakeSpannable("@aa${thingToHighlight}aa")
|
||||
highlightSpans(inputSpannable, 0xffffff)
|
||||
val spans = inputSpannable.spans
|
||||
Assert.assertEquals(1, spans.size)
|
||||
}
|
||||
|
||||
@Test
|
||||
fun spansDoNotOverlap() {
|
||||
val begin = "@begin"
|
||||
|
|
|
@ -288,7 +288,7 @@ class ComposeActivityTest {
|
|||
|
||||
@Test
|
||||
fun whenTextContainsUrl_onlyEllipsizedURLIsCounted() {
|
||||
val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM:"
|
||||
val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM%3A"
|
||||
val additionalContent = "Check out this @image #search result: "
|
||||
rule.launch()
|
||||
rule.getScenario().onActivity {
|
||||
|
@ -303,7 +303,7 @@ class ComposeActivityTest {
|
|||
@Test
|
||||
fun whenTextContainsShortUrls_allUrlsGetEllipsized() {
|
||||
val shortUrl = "https://pachli.app"
|
||||
val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM:"
|
||||
val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM%3A"
|
||||
val additionalContent = " Check out this @image #search result: "
|
||||
rule.launch()
|
||||
rule.getScenario().onActivity {
|
||||
|
@ -317,7 +317,7 @@ class ComposeActivityTest {
|
|||
|
||||
@Test
|
||||
fun whenTextContainsMultipleURLs_allURLsGetEllipsized() {
|
||||
val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM:"
|
||||
val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM%3A"
|
||||
val additionalContent = " Check out this @image #search result: "
|
||||
rule.launch()
|
||||
rule.getScenario().onActivity {
|
||||
|
@ -331,7 +331,7 @@ class ComposeActivityTest {
|
|||
|
||||
@Test
|
||||
fun whenTextContainsUrl_onlyEllipsizedURLIsCounted_withCustomConfiguration() {
|
||||
val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM:"
|
||||
val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM%3A"
|
||||
val additionalContent = "Check out this @image #search result: "
|
||||
val customUrlLength = 16
|
||||
getInstanceCallback = { getInstanceWithCustomConfiguration(configuration = getCustomInstanceConfiguration(charactersReservedPerUrl = customUrlLength)) }
|
||||
|
@ -348,7 +348,7 @@ class ComposeActivityTest {
|
|||
@Test
|
||||
fun whenTextContainsShortUrls_allUrlsGetEllipsized_withCustomConfiguration() {
|
||||
val shortUrl = "https://pachli.app"
|
||||
val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM:"
|
||||
val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM%3A"
|
||||
val additionalContent = " Check out this @image #search result: "
|
||||
val customUrlLength = 18 // The intention is that this is longer than shortUrl.length
|
||||
getInstanceCallback = { getInstanceWithCustomConfiguration(configuration = getCustomInstanceConfiguration(charactersReservedPerUrl = customUrlLength)) }
|
||||
|
@ -364,7 +364,7 @@ class ComposeActivityTest {
|
|||
|
||||
@Test
|
||||
fun whenTextContainsMultipleURLs_allURLsGetEllipsized_withCustomConfiguration() {
|
||||
val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM:"
|
||||
val url = "https://www.google.dk/search?biw=1920&bih=990&tbm=isch&sa=1&ei=bmDrWuOoKMv6kwWOkIaoDQ&q=indiana+jones+i+hate+snakes+animated&oq=indiana+jones+i+hate+snakes+animated&gs_l=psy-ab.3...54174.55443.0.55553.9.7.0.0.0.0.255.333.1j0j1.2.0....0...1c.1.64.psy-ab..7.0.0....0.40G-kcDkC6A#imgdii=PSp15hQjN1JqvM:&imgrc=H0hyE2JW5wrpBM%3A"
|
||||
val additionalContent = " Check out this @image #search result: "
|
||||
val customUrlLength = 16
|
||||
getInstanceCallback = { getInstanceWithCustomConfiguration(configuration = getCustomInstanceConfiguration(charactersReservedPerUrl = customUrlLength)) }
|
||||
|
|
|
@ -40,9 +40,12 @@ class StatusLengthTest(
|
|||
// "@user@server" should be treated as "@user"
|
||||
arrayOf("123 @example@example.org", 12),
|
||||
// URLs under 23 chars are treated as 23 chars
|
||||
arrayOf("123 http://example.url", 27),
|
||||
arrayOf("123 http://example.org", 27),
|
||||
// URLs over 23 chars are treated as 23 chars
|
||||
arrayOf("123 http://urlthatislongerthan23characters.example.org", 27),
|
||||
// URLs end when they should (the ")." should be part of the status
|
||||
// length, not considered to be part of the URL)
|
||||
arrayOf("test (https://example.com). test", 36),
|
||||
// Short hashtags are treated as is
|
||||
arrayOf("123 #basictag", 13),
|
||||
// Long hashtags are *also* treated as is (not treated as 23, like URLs)
|
||||
|
|
|
@ -0,0 +1,325 @@
|
|||
// Copyright 2018 Twitter, Inc.
|
||||
// Licensed under the Apache License, Version 2.0
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
package com.twitter.twittertext
|
||||
|
||||
import java.net.IDN
|
||||
import java.util.regex.Matcher
|
||||
|
||||
/**
|
||||
* A class to extract usernames, hashtags and URLs from Mastodon text.
|
||||
*/
|
||||
open class Extractor {
|
||||
data class Entity(
|
||||
var start: Int,
|
||||
var end: Int,
|
||||
val value: String,
|
||||
val type: Type,
|
||||
) {
|
||||
enum class Type {
|
||||
URL,
|
||||
HASHTAG,
|
||||
MENTION,
|
||||
}
|
||||
|
||||
@JvmOverloads
|
||||
constructor(matcher: Matcher, type: Type, groupNumber: Int, startOffset: Int = -1) : this(
|
||||
matcher.start(groupNumber) + startOffset,
|
||||
matcher.end(groupNumber),
|
||||
matcher.group(groupNumber)!!,
|
||||
type,
|
||||
)
|
||||
}
|
||||
|
||||
var isExtractURLWithoutProtocol = true
|
||||
|
||||
private fun removeOverlappingEntities(entities: MutableList<Entity>) {
|
||||
// sort by index
|
||||
entities.sortWith(Comparator { e1, e2 -> e1.start - e2.start })
|
||||
|
||||
// Remove overlapping entities.
|
||||
// Two entities overlap only when one is URL and the other is hashtag/mention
|
||||
// which is a part of the URL. When it happens, we choose URL over hashtag/mention
|
||||
// by selecting the one with smaller start index.
|
||||
if (!entities.isEmpty()) {
|
||||
val it = entities.iterator()
|
||||
var prev = it.next()
|
||||
while (it.hasNext()) {
|
||||
val cur = it.next()
|
||||
if (prev.end > cur.start) {
|
||||
it.remove()
|
||||
} else {
|
||||
prev = cur
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract URLs, @mentions, lists and #hashtag from a given text/tweet.
|
||||
*
|
||||
* @param text text of tweet
|
||||
* @return list of extracted entities
|
||||
*/
|
||||
fun extractEntitiesWithIndices(text: String): List<Entity> = buildList {
|
||||
addAll(extractURLsWithIndices(text))
|
||||
addAll(extractHashtagsWithIndices(text, false))
|
||||
addAll(extractMentionsOrListsWithIndices(text))
|
||||
removeOverlappingEntities(this)
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract @username and an optional list reference from Tweet text. A mention is an occurrence
|
||||
* of @username anywhere in a Tweet. A mention with a list is a @username/list.
|
||||
*
|
||||
* @param text of the tweet from which to extract usernames
|
||||
* @return List of usernames (without the leading @ sign) and an optional lists referenced
|
||||
*/
|
||||
private fun extractMentionsOrListsWithIndices(text: String): List<Entity> {
|
||||
if (text.isEmpty()) return emptyList()
|
||||
|
||||
// Performance optimization.
|
||||
// If text doesn't contain @/＠ at all, the text doesn't
|
||||
// contain @mention. So we can simply return an empty list.
|
||||
var found = false
|
||||
for (c in text.toCharArray()) {
|
||||
if (c == '@' || c == '＠') {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
return emptyList()
|
||||
}
|
||||
val extracted: MutableList<Entity> = ArrayList()
|
||||
val matcher: Matcher = Regex.VALID_MENTION_OR_LIST.matcher(text)
|
||||
while (matcher.find()) {
|
||||
val after = text.substring(matcher.end())
|
||||
if (!Regex.INVALID_MENTION_MATCH_END.matcher(after).find()) {
|
||||
if (matcher.group(Regex.VALID_MENTION_OR_LIST_GROUP_LIST) == null) {
|
||||
extracted.add(
|
||||
Entity(
|
||||
matcher,
|
||||
Entity.Type.MENTION,
|
||||
Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME,
|
||||
),
|
||||
)
|
||||
} else {
|
||||
extracted.add(
|
||||
Entity(
|
||||
matcher.start(Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME) - 1,
|
||||
matcher.end(Regex.VALID_MENTION_OR_LIST_GROUP_LIST),
|
||||
matcher.group(Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME),
|
||||
Entity.Type.MENTION,
|
||||
),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
return extracted
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract URL references from Tweet text.
|
||||
*
|
||||
* @param text of the tweet from which to extract URLs
|
||||
* @return List of URLs referenced.
|
||||
*/
|
||||
private fun extractURLsWithIndices(text: String?): List<Entity> {
|
||||
if (text.isNullOrEmpty() ||
|
||||
(if (isExtractURLWithoutProtocol) text.indexOf('.') else text.indexOf(':')) == -1
|
||||
) {
|
||||
// Performance optimization.
|
||||
// If text doesn't contain '.' or ':' at all, text doesn't contain URL,
|
||||
// so we can simply return an empty list.
|
||||
return emptyList()
|
||||
}
|
||||
val urls: MutableList<Entity> = ArrayList()
|
||||
val matcher: Matcher = Regex.VALID_URL.matcher(text)
|
||||
while (matcher.find()) {
|
||||
val protocol = matcher.group(Regex.VALID_URL_GROUP_PROTOCOL)
|
||||
if (protocol.isNullOrEmpty()) {
|
||||
// skip if protocol is not present and 'extractURLWithoutProtocol' is false
|
||||
// or URL is preceded by invalid character.
|
||||
if (!isExtractURLWithoutProtocol ||
|
||||
Regex.INVALID_URL_WITHOUT_PROTOCOL_MATCH_BEGIN
|
||||
.matcher(matcher.group(Regex.VALID_URL_GROUP_BEFORE))
|
||||
.matches()
|
||||
) {
|
||||
continue
|
||||
}
|
||||
}
|
||||
val url = matcher.group(Regex.VALID_URL_GROUP_URL)
|
||||
val start = matcher.start(Regex.VALID_URL_GROUP_URL)
|
||||
val end = matcher.end(Regex.VALID_URL_GROUP_URL)
|
||||
val host = matcher.group(Regex.VALID_URL_GROUP_DOMAIN)
|
||||
if (isValidHostAndLength(url.length, protocol, host)) {
|
||||
urls.add(Entity(start, end, url, Entity.Type.URL))
|
||||
}
|
||||
}
|
||||
return urls
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract #hashtag references from Tweet text.
|
||||
*
|
||||
* @param text of the tweet from which to extract hashtags
|
||||
* @param checkUrlOverlap if true, check if extracted hashtags overlap URLs and
|
||||
* remove overlapping ones
|
||||
* @return List of hashtags referenced (without the leading # sign)
|
||||
*/
|
||||
private fun extractHashtagsWithIndices(text: String, checkUrlOverlap: Boolean): List<Entity> {
|
||||
if (text.isEmpty()) return emptyList()
|
||||
|
||||
// Performance optimization.
|
||||
// If text doesn't contain #/＃ at all, text doesn't contain
|
||||
// hashtag, so we can simply return an empty list.
|
||||
var found = false
|
||||
for (c in text.toCharArray()) {
|
||||
if (c == '#' || c == '＃') {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
return emptyList()
|
||||
}
|
||||
val extracted: MutableList<Entity> = ArrayList()
|
||||
val matcher: Matcher = Regex.VALID_HASHTAG.matcher(text)
|
||||
while (matcher.find()) {
|
||||
val after = text.substring(matcher.end())
|
||||
if (!Regex.INVALID_HASHTAG_MATCH_END.matcher(after).find()) {
|
||||
extracted.add(
|
||||
Entity(
|
||||
matcher,
|
||||
Entity.Type.HASHTAG,
|
||||
Regex.VALID_HASHTAG_GROUP_TAG,
|
||||
),
|
||||
)
|
||||
}
|
||||
}
|
||||
if (checkUrlOverlap) {
|
||||
// extract URLs
|
||||
val urls = extractURLsWithIndices(text)
|
||||
if (urls.isNotEmpty()) {
|
||||
extracted.addAll(urls)
|
||||
// remove overlap
|
||||
removeOverlappingEntities(extracted)
|
||||
// remove URL entities
|
||||
val it = extracted.iterator()
|
||||
while (it.hasNext()) {
|
||||
val entity = it.next()
|
||||
if (entity.type != Entity.Type.HASHTAG) {
|
||||
it.remove()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return extracted
|
||||
}
|
||||
|
||||
/**
|
||||
* An efficient converter of indices between code points and code units.
|
||||
*/
|
||||
private class IndexConverter(val text: String) {
|
||||
// Keep track of a single corresponding pair of code unit and code point
|
||||
// offsets so that we can re-use counting work if the next requested
|
||||
// entity is near the most recent entity.
|
||||
private var codePointIndex = 0
|
||||
private var charIndex = 0
|
||||
|
||||
/**
|
||||
* Converts code units to code points
|
||||
*
|
||||
* @param charIndex Index into the string measured in code units.
|
||||
* @return The code point index that corresponds to the specified character index.
|
||||
*/
|
||||
fun codeUnitsToCodePoints(charIndex: Int): Int {
|
||||
if (charIndex < this.charIndex) {
|
||||
codePointIndex -= text.codePointCount(charIndex, this.charIndex)
|
||||
} else {
|
||||
codePointIndex += text.codePointCount(this.charIndex, charIndex)
|
||||
}
|
||||
this.charIndex = charIndex
|
||||
|
||||
// Make sure that charIndex never points to the second code unit of a
|
||||
// surrogate pair.
|
||||
if (charIndex > 0 && Character.isSupplementaryCodePoint(text.codePointAt(charIndex - 1))) {
|
||||
this.charIndex -= 1
|
||||
}
|
||||
return codePointIndex
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts code points to code units
|
||||
*
|
||||
* @param codePointIndex Index into the string measured in code points.
|
||||
* @return the code unit index that corresponds to the specified code point index.
|
||||
*/
|
||||
fun codePointsToCodeUnits(codePointIndex: Int): Int {
|
||||
// Note that offsetByCodePoints accepts negative indices.
|
||||
charIndex = text.offsetByCodePoints(charIndex, codePointIndex - this.codePointIndex)
|
||||
this.codePointIndex = codePointIndex
|
||||
return charIndex
|
||||
}
|
||||
}
|
||||
|
||||
companion object {
|
||||
/**
|
||||
* The maximum url length that the Twitter backend supports.
|
||||
*/
|
||||
const val MAX_URL_LENGTH = 4096
|
||||
|
||||
/**
|
||||
* The backend adds http:// for normal links and https to *.twitter.com URLs
|
||||
* (it also rewrites http to https for URLs matching *.twitter.com).
|
||||
* We're better off adding https:// all the time. By making the assumption that
|
||||
* URL_GROUP_PROTOCOL_LENGTH is https, the trade off is we'll disallow a http URL
|
||||
* that is 4096 characters.
|
||||
*/
|
||||
private const val URL_GROUP_PROTOCOL_LENGTH = "https://".length
|
||||
|
||||
/**
|
||||
* Verifies that the host name adheres to RFC 3490 and 1035
|
||||
* Also, verifies that the entire url (including protocol) doesn't exceed MAX_URL_LENGTH
|
||||
*
|
||||
* @param originalUrlLength The length of the entire URL, including protocol if any
|
||||
* @param protocol The protocol used
|
||||
* @param originalHost The hostname to check validity of
|
||||
* @return true if the host is valid
|
||||
*/
|
||||
fun isValidHostAndLength(
|
||||
originalUrlLength: Int,
|
||||
protocol: String?,
|
||||
originalHost: String?,
|
||||
): Boolean {
|
||||
if (originalHost.isNullOrEmpty()) {
|
||||
return false
|
||||
}
|
||||
val originalHostLength = originalHost.length
|
||||
val host: String = try {
|
||||
// Use IDN for all host names, if the host is all ASCII, it returns unchanged.
|
||||
// It comes with an added benefit of checking host length to be between 1 and 63 characters.
|
||||
IDN.toASCII(originalHost, IDN.ALLOW_UNASSIGNED)
|
||||
// toASCII can throw IndexOutOfBoundsException when the domain name is longer than
|
||||
// 256 characters, instead of the documented IllegalArgumentException.
|
||||
} catch (e: IllegalArgumentException) {
|
||||
return false
|
||||
} catch (e: IndexOutOfBoundsException) {
|
||||
return false
|
||||
}
|
||||
val punycodeEncodedHostLength = host.length
|
||||
if (punycodeEncodedHostLength == 0) {
|
||||
return false
|
||||
}
|
||||
// The punycodeEncoded host length might be different now, offset that length from the URL.
|
||||
val urlLength = originalUrlLength + punycodeEncodedHostLength - originalHostLength
|
||||
// Add the protocol to our length check, if there isn't one,
|
||||
// to ensure it doesn't go over the limit.
|
||||
val urlLengthWithProtocol =
|
||||
urlLength + if (protocol == null) URL_GROUP_PROTOCOL_LENGTH else 0
|
||||
return urlLengthWithProtocol <= MAX_URL_LENGTH
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,296 @@
|
|||
// Copyright 2018 Twitter, Inc.
|
||||
// Licensed under the Apache License, Version 2.0
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
package com.twitter.twittertext
|
||||
|
||||
import java.util.regex.Pattern
|
||||
|
||||
object Regex {
|
||||
private val URL_VALID_GTLD = "(?:(?:" +
|
||||
join(TldLists.GTLDS) +
|
||||
")(?=[^a-z0-9@+-]|$))"
|
||||
private val URL_VALID_CCTLD = "(?:(?:" +
|
||||
join(TldLists.CTLDS) +
|
||||
")(?=[^a-z0-9@+-]|$))"
|
||||
private const val INVALID_CHARACTERS = "\\uFFFE" + // BOM
|
||||
"\\uFEFF" + // BOM
|
||||
"\\uFFFF" // Special
|
||||
private const val DIRECTIONAL_CHARACTERS = "\\u061C" + // ARABIC LETTER MARK (ALM)
|
||||
"\\u200E" + // LEFT-TO-RIGHT MARK (LRM)
|
||||
"\\u200F" + // RIGHT-TO-LEFT MARK (RLM)
|
||||
"\\u202A" + // LEFT-TO-RIGHT EMBEDDING (LRE)
|
||||
"\\u202B" + // RIGHT-TO-LEFT EMBEDDING (RLE)
|
||||
"\\u202C" + // POP DIRECTIONAL FORMATTING (PDF)
|
||||
"\\u202D" + // LEFT-TO-RIGHT OVERRIDE (LRO)
|
||||
"\\u202E" + // RIGHT-TO-LEFT OVERRIDE (RLO)
|
||||
"\\u2066" + // LEFT-TO-RIGHT ISOLATE (LRI)
|
||||
"\\u2067" + // RIGHT-TO-LEFT ISOLATE (RLI)
|
||||
"\\u2068" + // FIRST STRONG ISOLATE (FSI)
|
||||
"\\u2069" // POP DIRECTIONAL ISOLATE (PDI)
|
||||
private const val UNICODE_SPACES = "[" +
|
||||
"\\u0009-\\u000d" + // # White_Space # Cc [5] <control-0009>..<control-000D>
|
||||
"\\u0020" + // White_Space # Zs SPACE
|
||||
"\\u0085" + // White_Space # Cc <control-0085>
|
||||
"\\u00a0" + // White_Space # Zs NO-BREAK SPACE
|
||||
"\\u1680" + // White_Space # Zs OGHAM SPACE MARK
|
||||
"\\u180E" + // White_Space # Zs MONGOLIAN VOWEL SEPARATOR
|
||||
"\\u2000-\\u200a" + // # White_Space # Zs [11] EN QUAD..HAIR SPACE
|
||||
"\\u2028" + // White_Space # Zl LINE SEPARATOR
|
||||
"\\u2029" + // White_Space # Zp PARAGRAPH SEPARATOR
|
||||
"\\u202F" + // White_Space # Zs NARROW NO-BREAK SPACE
|
||||
"\\u205F" + // White_Space # Zs MEDIUM MATHEMATICAL SPACE
|
||||
"\\u3000" + // White_Space # Zs IDEOGRAPHIC SPACE
|
||||
"]"
|
||||
private const val LATIN_ACCENTS_CHARS = // Latin-1
|
||||
"\\u00c0-\\u00d6\\u00d8-\\u00f6\\u00f8-\\u00ff" + // Latin Extended A and B
|
||||
"\\u0100-\\u024f" + // IPA Extensions
|
||||
"\\u0253\\u0254\\u0256\\u0257\\u0259\\u025b\\u0263\\u0268\\u026f\\u0272\\u0289\\u028b" + // Hawaiian
|
||||
"\\u02bb" + // Combining diacritics
|
||||
"\\u0300-\\u036f" + // Latin Extended Additional (mostly for Vietnamese)
|
||||
"\\u1e00-\\u1eff"
|
||||
private const val CYRILLIC_CHARS = "\\u0400-\\u04ff"
|
||||
|
||||
// Generated from unicode_regex/unicode_regex_groups.scala, more inclusive than Java's \p{L}\p{M}
|
||||
private const val HASHTAG_LETTERS_AND_MARKS = "\\p{L}\\p{M}" +
|
||||
"\\u037f\\u0528-\\u052f\\u08a0-\\u08b2\\u08e4-\\u08ff\\u0978\\u0980\\u0c00\\u0c34\\u0c81" +
|
||||
"\\u0d01\\u0ede\\u0edf\\u10c7\\u10cd\\u10fd-\\u10ff\\u16f1-\\u16f8\\u17b4\\u17b5\\u191d" +
|
||||
"\\u191e\\u1ab0-\\u1abe\\u1bab-\\u1bad\\u1bba-\\u1bbf\\u1cf3-\\u1cf6\\u1cf8\\u1cf9" +
|
||||
"\\u1de7-\\u1df5\\u2cf2\\u2cf3\\u2d27\\u2d2d\\u2d66\\u2d67\\u9fcc\\ua674-\\ua67b\\ua698" +
|
||||
"-\\ua69d\\ua69f\\ua792-\\ua79f\\ua7aa-\\ua7ad\\ua7b0\\ua7b1\\ua7f7-\\ua7f9\\ua9e0-" +
|
||||
"\\ua9ef\\ua9fa-\\ua9fe\\uaa7c-\\uaa7f\\uaae0-\\uaaef\\uaaf2-\\uaaf6\\uab30-\\uab5a" +
|
||||
"\\uab5c-\\uab5f\\uab64\\uab65\\uf870-\\uf87f\\uf882\\uf884-\\uf89f\\uf8b8\\uf8c1-" +
|
||||
"\\uf8d6\\ufa2e\\ufa2f\\ufe27-\\ufe2d\\ud800\\udee0\\ud800\\udf1f\\ud800\\udf50-\\ud800" +
|
||||
"\\udf7a\\ud801\\udd00-\\ud801\\udd27\\ud801\\udd30-\\ud801\\udd63\\ud801\\ude00-\\ud801" +
|
||||
"\\udf36\\ud801\\udf40-\\ud801\\udf55\\ud801\\udf60-\\ud801\\udf67\\ud802\\udc60-\\ud802" +
|
||||
"\\udc76\\ud802\\udc80-\\ud802\\udc9e\\ud802\\udd80-\\ud802\\uddb7\\ud802\\uddbe\\ud802" +
|
||||
"\\uddbf\\ud802\\ude80-\\ud802\\ude9c\\ud802\\udec0-\\ud802\\udec7\\ud802\\udec9-\\ud802" +
|
||||
"\\udee6\\ud802\\udf80-\\ud802\\udf91\\ud804\\udc7f\\ud804\\udcd0-\\ud804\\udce8\\ud804" +
|
||||
"\\udd00-\\ud804\\udd34\\ud804\\udd50-\\ud804\\udd73\\ud804\\udd76\\ud804\\udd80-\\ud804" +
|
||||
"\\uddc4\\ud804\\uddda\\ud804\\ude00-\\ud804\\ude11\\ud804\\ude13-\\ud804\\ude37\\ud804" +
|
||||
"\\udeb0-\\ud804\\udeea\\ud804\\udf01-\\ud804\\udf03\\ud804\\udf05-\\ud804\\udf0c\\ud804" +
|
||||
"\\udf0f\\ud804\\udf10\\ud804\\udf13-\\ud804\\udf28\\ud804\\udf2a-\\ud804\\udf30\\ud804" +
|
||||
"\\udf32\\ud804\\udf33\\ud804\\udf35-\\ud804\\udf39\\ud804\\udf3c-\\ud804\\udf44\\ud804" +
|
||||
"\\udf47\\ud804\\udf48\\ud804\\udf4b-\\ud804\\udf4d\\ud804\\udf57\\ud804\\udf5d-\\ud804" +
|
||||
"\\udf63\\ud804\\udf66-\\ud804\\udf6c\\ud804\\udf70-\\ud804\\udf74\\ud805\\udc80-\\ud805" +
|
||||
"\\udcc5\\ud805\\udcc7\\ud805\\udd80-\\ud805\\uddb5\\ud805\\uddb8-\\ud805\\uddc0\\ud805" +
|
||||
"\\ude00-\\ud805\\ude40\\ud805\\ude44\\ud805\\ude80-\\ud805\\udeb7\\ud806\\udca0-\\ud806" +
|
||||
"\\udcdf\\ud806\\udcff\\ud806\\udec0-\\ud806\\udef8\\ud808\\udf6f-\\ud808\\udf98\\ud81a" +
|
||||
"\\ude40-\\ud81a\\ude5e\\ud81a\\uded0-\\ud81a\\udeed\\ud81a\\udef0-\\ud81a\\udef4\\ud81a" +
|
||||
"\\udf00-\\ud81a\\udf36\\ud81a\\udf40-\\ud81a\\udf43\\ud81a\\udf63-\\ud81a\\udf77\\ud81a" +
|
||||
"\\udf7d-\\ud81a\\udf8f\\ud81b\\udf00-\\ud81b\\udf44\\ud81b\\udf50-\\ud81b\\udf7e\\ud81b" +
|
||||
"\\udf8f-\\ud81b\\udf9f\\ud82f\\udc00-\\ud82f\\udc6a\\ud82f\\udc70-\\ud82f\\udc7c\\ud82f" +
|
||||
"\\udc80-\\ud82f\\udc88\\ud82f\\udc90-\\ud82f\\udc99\\ud82f\\udc9d\\ud82f\\udc9e\\ud83a" +
|
||||
"\\udc00-\\ud83a\\udcc4\\ud83a\\udcd0-\\ud83a\\udcd6\\ud83b\\ude00-\\ud83b\\ude03\\ud83b" +
|
||||
"\\ude05-\\ud83b\\ude1f\\ud83b\\ude21\\ud83b\\ude22\\ud83b\\ude24\\ud83b\\ude27\\ud83b" +
|
||||
"\\ude29-\\ud83b\\ude32\\ud83b\\ude34-\\ud83b\\ude37\\ud83b\\ude39\\ud83b\\ude3b\\ud83b" +
|
||||
"\\ude42\\ud83b\\ude47\\ud83b\\ude49\\ud83b\\ude4b\\ud83b\\ude4d-\\ud83b\\ude4f\\ud83b" +
|
||||
"\\ude51\\ud83b\\ude52\\ud83b\\ude54\\ud83b\\ude57\\ud83b\\ude59\\ud83b\\ude5b\\ud83b" +
|
||||
"\\ude5d\\ud83b\\ude5f\\ud83b\\ude61\\ud83b\\ude62\\ud83b\\ude64\\ud83b\\ude67-\\ud83b" +
|
||||
"\\ude6a\\ud83b\\ude6c-\\ud83b\\ude72\\ud83b\\ude74-\\ud83b\\ude77\\ud83b\\ude79-\\ud83b" +
|
||||
"\\ude7c\\ud83b\\ude7e\\ud83b\\ude80-\\ud83b\\ude89\\ud83b\\ude8b-\\ud83b\\ude9b\\ud83b" +
|
||||
"\\udea1-\\ud83b\\udea3\\ud83b\\udea5-\\ud83b\\udea9\\ud83b\\udeab-\\ud83b\\udebb"
|
||||
|
||||
// Generated from unicode_regex/unicode_regex_groups.scala, more inclusive than Java's \p{Nd}
|
||||
private const val HASHTAG_NUMERALS = "\\p{Nd}" +
|
||||
"\\u0de6-\\u0def\\ua9f0-\\ua9f9\\ud804\\udcf0-\\ud804\\udcf9\\ud804\\udd36-\\ud804" +
|
||||
"\\udd3f\\ud804\\uddd0-\\ud804\\uddd9\\ud804\\udef0-\\ud804\\udef9\\ud805\\udcd0-\\ud805" +
|
||||
"\\udcd9\\ud805\\ude50-\\ud805\\ude59\\ud805\\udec0-\\ud805\\udec9\\ud806\\udce0-\\ud806" +
|
||||
"\\udce9\\ud81a\\ude60-\\ud81a\\ude69\\ud81a\\udf50-\\ud81a\\udf59"
|
||||
private const val HASHTAG_SPECIAL_CHARS = "_" + // underscore
|
||||
"\\u200c" + // ZERO WIDTH NON-JOINER (ZWNJ)
|
||||
"\\u200d" + // ZERO WIDTH JOINER (ZWJ)
|
||||
"\\ua67e" + // CYRILLIC KAVYKA
|
||||
"\\u05be" + // HEBREW PUNCTUATION MAQAF
|
||||
"\\u05f3" + // HEBREW PUNCTUATION GERESH
|
||||
"\\u05f4" + // HEBREW PUNCTUATION GERSHAYIM
|
||||
"\\uff5e" + // FULLWIDTH TILDE
|
||||
"\\u301c" + // WAVE DASH
|
||||
"\\u309b" + // KATAKANA-HIRAGANA VOICED SOUND MARK
|
||||
"\\u309c" + // KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
|
||||
"\\u30a0" + // KATAKANA-HIRAGANA DOUBLE HYPHEN
|
||||
"\\u30fb" + // KATAKANA MIDDLE DOT
|
||||
"\\u3003" + // DITTO MARK
|
||||
"\\u0f0b" + // TIBETAN MARK INTERSYLLABIC TSHEG
|
||||
"\\u0f0c" + // TIBETAN MARK DELIMITER TSHEG BSTAR
|
||||
"\\u00b7" // MIDDLE DOT
|
||||
private const val HASHTAG_LETTERS_NUMERALS =
|
||||
HASHTAG_LETTERS_AND_MARKS + HASHTAG_NUMERALS + HASHTAG_SPECIAL_CHARS
|
||||
private const val HASHTAG_LETTERS_SET = "[$HASHTAG_LETTERS_AND_MARKS]"
|
||||
private const val HASHTAG_LETTERS_NUMERALS_SET = "[$HASHTAG_LETTERS_NUMERALS]"
|
||||
|
||||
/* URL related hash regex collection */
|
||||
private const val URL_VALID_PRECEDING_CHARS =
|
||||
"(?:[^a-z0-9@@$##$INVALID_CHARACTERS]|[$DIRECTIONAL_CHARACTERS]|^)"
|
||||
private const val URL_VALID_CHARS = "[a-z0-9$LATIN_ACCENTS_CHARS]"
|
||||
private const val URL_VALID_SUBDOMAIN =
|
||||
"(?>(?:$URL_VALID_CHARS[$URL_VALID_CHARS\\-_]*)?$URL_VALID_CHARS\\.)"
|
||||
private const val URL_VALID_DOMAIN_NAME =
|
||||
"(?:(?:$URL_VALID_CHARS[$URL_VALID_CHARS\\-]*)?$URL_VALID_CHARS\\.)"
|
||||
private const val PUNCTUATION_CHARS = "-_!\"#$%&'\\(\\)*+,./:;<=>?@\\[\\]^`\\{|}~"
|
||||
|
||||
// Any non-space, non-punctuation characters.
|
||||
// \p{Z} = any kind of whitespace or invisible separator.
|
||||
private const val URL_VALID_UNICODE_CHARS =
|
||||
"[^$PUNCTUATION_CHARS\\s\\p{Z}\\p{InGeneralPunctuation}]"
|
||||
private const val URL_VALID_UNICODE_DOMAIN_NAME =
|
||||
"(?:(?:" + URL_VALID_UNICODE_CHARS + "[" + URL_VALID_UNICODE_CHARS + "\\-]*)?" +
|
||||
URL_VALID_UNICODE_CHARS + "\\.)"
|
||||
private const val URL_PUNYCODE = "(?:xn--[-0-9a-z]+)"
|
||||
private val URL_VALID_DOMAIN = "(?:" + // optional sub-domain + domain + TLD
|
||||
URL_VALID_SUBDOMAIN + "*" + URL_VALID_DOMAIN_NAME + // e.g. twitter.com, foo.co.jp ...
|
||||
"(?:" + URL_VALID_GTLD + "|" + URL_VALID_CCTLD + "|" + URL_PUNYCODE + ")" +
|
||||
")" +
|
||||
"|(?:" + "(?<=https?://)" +
|
||||
"(?:" +
|
||||
"(?:" + URL_VALID_DOMAIN_NAME + URL_VALID_CCTLD + ")" + // protocol + domain + ccTLD
|
||||
"|(?:" +
|
||||
URL_VALID_UNICODE_DOMAIN_NAME + // protocol + unicode domain + TLD
|
||||
"(?:" + URL_VALID_GTLD + "|" + URL_VALID_CCTLD + ")" +
|
||||
")" +
|
||||
")" +
|
||||
")" +
|
||||
"|(?:" + // domain + ccTLD + '/'
|
||||
URL_VALID_DOMAIN_NAME + URL_VALID_CCTLD + "(?=/)" + // e.g. t.co/
|
||||
")"
|
||||
private const val URL_VALID_PORT_NUMBER = "[0-9]++"
|
||||
private const val URL_VALID_GENERAL_PATH_CHARS =
|
||||
"[a-z0-9!\\*';:=\\+,.\\$/%#\\[\\]\\-\\u2013_~\\|&@" +
|
||||
LATIN_ACCENTS_CHARS + CYRILLIC_CHARS + "]"
|
||||
|
||||
/**
|
||||
* Allow URL paths to contain up to two nested levels of balanced parens
|
||||
* 1. Used in Wikipedia URLs like /Primer_(film)
|
||||
* 2. Used in IIS sessions like /S(dfd346)/
|
||||
* 3. Used in Rdio URLs like /track/We_Up_(Album_Version_(Edited))/
|
||||
*/
|
||||
private const val URL_BALANCED_PARENS = "\\(" +
|
||||
"(?:" +
|
||||
URL_VALID_GENERAL_PATH_CHARS + "+" +
|
||||
"|" + // allow one nested level of balanced parentheses
|
||||
"(?:" +
|
||||
URL_VALID_GENERAL_PATH_CHARS + "*" +
|
||||
"\\(" +
|
||||
URL_VALID_GENERAL_PATH_CHARS + "+" +
|
||||
"\\)" +
|
||||
URL_VALID_GENERAL_PATH_CHARS + "*" +
|
||||
")" +
|
||||
")" +
|
||||
"\\)"
|
||||
|
||||
/**
|
||||
* Valid end-of-path characters (so /foo. does not gobble the period).
|
||||
* 2. Allow =&# for empty URL parameters and other URL-join artifacts
|
||||
*/
|
||||
private const val URL_VALID_PATH_ENDING_CHARS =
|
||||
"[a-z0-9=_#/\\-\\+" + LATIN_ACCENTS_CHARS + CYRILLIC_CHARS + "]|(?:" +
|
||||
URL_BALANCED_PARENS + ")"
|
||||
private const val URL_VALID_PATH = "(?:" +
|
||||
"(?:" +
|
||||
URL_VALID_GENERAL_PATH_CHARS + "*" +
|
||||
"(?:" + URL_BALANCED_PARENS + URL_VALID_GENERAL_PATH_CHARS + "*)*" +
|
||||
URL_VALID_PATH_ENDING_CHARS +
|
||||
")|(?:@" + URL_VALID_GENERAL_PATH_CHARS + "+/)" +
|
||||
")"
|
||||
private const val URL_VALID_URL_QUERY_CHARS =
|
||||
"[a-z0-9!?\\*'\\(\\);:&=\\+\\$/%#\\[\\]\\-_\\.,~\\|@]"
|
||||
private const val URL_VALID_URL_QUERY_ENDING_CHARS = "[a-z0-9\\-_&=#/]"
|
||||
private val VALID_URL_PATTERN_STRING = "(" + // $1 total match
|
||||
"(" + URL_VALID_PRECEDING_CHARS + ")" + // $2 Preceding character
|
||||
"(" + // $3 URL
|
||||
"(https?://)?" + // $4 Protocol (optional)
|
||||
"(" + URL_VALID_DOMAIN + ")" + // $5 Domain(s)
|
||||
"(?::(" + URL_VALID_PORT_NUMBER + "))?" + // $6 Port number (optional)
|
||||
"(/" +
|
||||
URL_VALID_PATH + "*+" +
|
||||
")?" + // $7 URL Path and anchor
|
||||
"(\\?" + URL_VALID_URL_QUERY_CHARS + "*" + // $8 Query String
|
||||
URL_VALID_URL_QUERY_ENDING_CHARS + ")?" +
|
||||
")" +
|
||||
")"
|
||||
private const val AT_SIGNS_CHARS = "@\uFF20"
|
||||
|
||||
/* Begin public constants */
|
||||
private val INVALID_CHARACTERS_PATTERN: Pattern
|
||||
val VALID_HASHTAG: Pattern
|
||||
const val VALID_HASHTAG_GROUP_TAG = 1
|
||||
val INVALID_HASHTAG_MATCH_END: Pattern
|
||||
private val RTL_CHARACTERS: Pattern
|
||||
private val AT_SIGNS: Pattern
|
||||
val VALID_MENTION_OR_LIST: Pattern
|
||||
const val VALID_MENTION_OR_LIST_GROUP_USERNAME = 3
|
||||
const val VALID_MENTION_OR_LIST_GROUP_LIST = 4
|
||||
private val VALID_REPLY: Pattern
|
||||
val INVALID_MENTION_MATCH_END: Pattern
|
||||
|
||||
/**
|
||||
* Regex to extract URL (it also includes the text preceding the url).
|
||||
*
|
||||
* This regex does not reflect its name and [Regex.VALID_URL_GROUP_URL] match
|
||||
* should be checked in order to match a valid url. This is not ideal, but the behavior is
|
||||
* being kept to ensure backwards compatibility. Ideally this regex should be
|
||||
* implemented with a negative lookbehind as opposed to a negated character class
|
||||
* but lack of JS support increases maint overhead if the logic is different by
|
||||
* platform.
|
||||
*/
|
||||
val VALID_URL: Pattern
|
||||
const val VALID_URL_GROUP_BEFORE = 2
|
||||
const val VALID_URL_GROUP_URL = 3
|
||||
const val VALID_URL_GROUP_PROTOCOL = 4
|
||||
const val VALID_URL_GROUP_DOMAIN = 5
|
||||
val INVALID_URL_WITHOUT_PROTOCOL_MATCH_BEGIN: Pattern
|
||||
private val VALID_DOMAIN: Pattern
|
||||
|
||||
// Mastodon hashtag regular expressions. Different from the Twitter ones, in particular,
|
||||
// they can be preceded by e.g., Hiragana characters without an intervening space.
|
||||
// See HASHTAG_RE in https://github.com/mastodon/mastodon/blob/main/app/models/tag.rb
|
||||
// (which is VALID_HASHTAG in this file).
|
||||
private const val HASHTAG_SEPARATORS = "_\u00B7\u30FB\u200c"
|
||||
private const val HASHTAG_FIRST_SEQUENCE_CHUNK_ONE = """[\w_][\w$HASHTAG_SEPARATORS]*[\p{Alpha}$HASHTAG_SEPARATORS]"""
|
||||
private const val HASHTAG_FIRST_SEQUENCE_CHUNK_TWO = """[\w$HASHTAG_SEPARATORS]*[\w_]"""
|
||||
private const val HASHTAG_FIRST_SEQUENCE = "($HASHTAG_FIRST_SEQUENCE_CHUNK_ONE$HASHTAG_FIRST_SEQUENCE_CHUNK_TWO)"
|
||||
private const val HASHTAG_LAST_SEQUENCE = """([\w_]*[\p{L}][\w_]*)"""
|
||||
private const val HASHTAG_NAME_PAT: String = "$HASHTAG_FIRST_SEQUENCE|$HASHTAG_LAST_SEQUENCE"
|
||||
|
||||
// initializing in a static synchronized block,
|
||||
// there appear to be thread safety issues with Pattern.compile in android
|
||||
init {
|
||||
synchronized(Regex::class.java) {
|
||||
INVALID_CHARACTERS_PATTERN = Pattern.compile(".*[$INVALID_CHARACTERS].*")
|
||||
VALID_HASHTAG = Pattern.compile(
|
||||
"(?<![=/)a-zA-Z0-9_])[##]($HASHTAG_NAME_PAT)",
|
||||
Pattern.CASE_INSENSITIVE,
|
||||
)
|
||||
INVALID_HASHTAG_MATCH_END = Pattern.compile("^(?:[#＃]|://)")
|
||||
RTL_CHARACTERS =
|
||||
Pattern.compile("[\u0600-\u06FF\u0750-\u077F\u0590-\u05FF\uFE70-\uFEFF]")
|
||||
AT_SIGNS = Pattern.compile("[$AT_SIGNS_CHARS]")
|
||||
VALID_MENTION_OR_LIST = Pattern.compile(
|
||||
"([^a-z0-9_!#$%&*=$AT_SIGNS_CHARS]|^|(?:^|[^a-z0-9_+~.-])RT:?)($AT_SIGNS+)([a-z0-9_]+)($AT_SIGNS[a-z][a-z0-9_\\-.]+)?",
|
||||
Pattern.CASE_INSENSITIVE,
|
||||
)
|
||||
VALID_REPLY = Pattern.compile(
|
||||
"^(?:" + UNICODE_SPACES + "|" + DIRECTIONAL_CHARACTERS + ")*" +
|
||||
AT_SIGNS + "([a-z0-9_]{1,20})",
|
||||
Pattern.CASE_INSENSITIVE,
|
||||
)
|
||||
INVALID_MENTION_MATCH_END = Pattern.compile("^(?:[$AT_SIGNS_CHARS$LATIN_ACCENTS_CHARS]|://)")
|
||||
INVALID_URL_WITHOUT_PROTOCOL_MATCH_BEGIN = Pattern.compile("[-_./]$")
|
||||
VALID_URL = Pattern.compile(VALID_URL_PATTERN_STRING, Pattern.CASE_INSENSITIVE)
|
||||
VALID_DOMAIN = Pattern.compile(URL_VALID_DOMAIN, Pattern.CASE_INSENSITIVE)
|
||||
}
|
||||
}
|
||||
|
||||
private fun join(col: Collection<*>): String {
|
||||
val sb = StringBuilder()
|
||||
val iter = col.iterator()
|
||||
if (iter.hasNext()) {
|
||||
sb.append(iter.next().toString())
|
||||
}
|
||||
while (iter.hasNext()) {
|
||||
sb.append("|")
|
||||
sb.append(iter.next().toString())
|
||||
}
|
||||
return sb.toString()
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
@ -0,0 +1,17 @@
|
|||
{
|
||||
"uniqueId": "com.twitter:twittertext",
|
||||
"developers": [
|
||||
{
|
||||
"name": "Twitter",
|
||||
"organisationUrl": "https://www.twitter.com"
|
||||
}
|
||||
],
|
||||
"artifactVersion": "1.14.3",
|
||||
"description": "Libraries and conformance tests to standardize parsing of Tweet text.",
|
||||
"name": "Twitter text parsing",
|
||||
"tag": "text",
|
||||
"licenses": [
|
||||
"Apache_2_0"
|
||||
],
|
||||
"website": "https://github.com/twitter/twitter-text"
|
||||
}
|