[bugfix] Fix unicode-unaware word boundary check in hashtags (#1049)

* [bugfix] Fix unicode-unaware word boundary check in hashtag regex

Go's regexp `\b` only matches ASCII word boundaries, and because Go regexes have
no lookahead, the workarounds got very ugly. So I replaced the regex with a parser.

The parser runs in O(n) time and performance should not be affected.

* [bugfix] Add back hashtag max length and add tests for it
This commit is contained in:
ugla
2022-11-15 16:05:34 +01:00
committed by GitHub
parent fece7fa706
commit 52109776f6
4 changed files with 146 additions and 45 deletions

View File

@@ -19,11 +19,16 @@
package util
import (
"strings"
"unicode"
"unicode/utf8"
"github.com/superseriousbusiness/gotosocial/internal/regexes"
)
const (
// maximumHashtagLength is the upper bound on the length of a hashtag,
// not counting the leading `#`. Longer candidates are not reported as tags.
maximumHashtagLength = 30
)
// DeriveMentionNamesFromText takes a plaintext (ie., not html-formatted) text,
// and applies a regex to it to return a deduplicated list of account names
// mentioned in that text, in the format "@user@example.org" or "@username" for
@@ -36,16 +41,71 @@ func DeriveMentionNamesFromText(text string) []string {
return UniqueStrings(mentionedAccounts)
}
// Pair is a generic container for two values whose types may differ.
type Pair[A, B any] struct {
// First is the first of the two values.
First A
// Second is the second of the two values.
Second B
}
// Span is a pair of byte indices into the original string delimiting one hashtag.
// `First` is the index of the leading `#`; `Second` is the index just past the
// last byte of the tag, so text[First+1:Second] is the tag without its `#`.
type Span = Pair[int, int]
// Takes a plaintext (ie., not HTML-formatted) text,
// and returns a slice of unique hashtags.
func DeriveHashtagsFromText(text string) []string {
tagsMap := make(map[string]bool)
tags := []string{}
for _, m := range regexes.HashtagFinder.FindAllStringSubmatch(text, -1) {
tags = append(tags, strings.TrimPrefix(m[1], "#"))
for _, v := range FindHashtagSpansInText(text) {
t := text[v.First+1 : v.Second]
if _, value := tagsMap[t]; !value {
tagsMap[t] = true
tags = append(tags, t)
}
}
return tags
}
// FindHashtagSpansInText scans a plaintext (ie., not HTML-formatted) text
// in a single pass and returns, in order of appearance, the byte spans
// in the original string at which hashtags are located.
// The returned slice is never nil.
func FindHashtagSpansInText(text string) []Span {
	spans := []Span{}

	var (
		tagStart int
		tracking bool
		// One rune of lookbehind. Starting with a space lets a tag
		// open at the very first byte of text.
		last = ' '
	)

	for pos, cur := range text {
		switch {
		case cur == '#' && isHashtagBoundary(last):
			// A `#` directly after a boundary opens a (new) candidate tag.
			tracking = true
			tagStart = pos
		case !tracking:
			// Not inside a candidate tag: nothing to do for this rune.
		case isHashtagBoundary(cur):
			// A boundary closes the candidate; cur itself is not part of it.
			tracking = false
			appendTag(&spans, text, tagStart, pos)
		case !isPermittedInHashtag(cur):
			// Neither permitted nor a boundary: the candidate was not a tag.
			tracking = false
		case pos+utf8.RuneLen(cur) == len(text):
			// cur is the last rune of text, so the candidate ends with the text.
			appendTag(&spans, text, tagStart, pos+utf8.RuneLen(cur))
		}
		last = cur
	}

	return spans
}
// appendTag appends the span [start, end) to *tags if it delimits a hashtag
// of acceptable length.
//
// start is the byte index of the leading `#` in text and end is the byte
// index just past the tag. The tag body text[start+1:end] must contain at
// least one and at most maximumHashtagLength characters. The length is
// measured in runes rather than bytes, so tags written in non-ASCII scripts
// get the same character limit as ASCII ones. Spans that fall outside text
// are ignored.
func appendTag(tags *[]Span, text string, start int, end int) {
	// Reject spans that cannot be sliced out of text, or that are empty.
	if start < 0 || end > len(text) || start+1 >= end {
		return
	}

	// This check could be moved out into the parsing loop if necessary!
	n := utf8.RuneCountInString(text[start+1 : end])
	if 0 < n && n <= maximumHashtagLength {
		*tags = append(*tags, Span{First: start, Second: end})
	}
}
// DeriveEmojisFromText takes a plaintext (ie., not html-formatted) text,
@@ -58,3 +118,17 @@ func DeriveEmojisFromText(text string) []string {
}
return UniqueStrings(emojis)
}
// isPermittedInHashtag reports whether r may appear inside the body of a
// hashtag: any Unicode letter or any Unicode number.
func isPermittedInHashtag(r rune) bool {
	if unicode.IsLetter(r) {
		return true
	}
	return unicode.IsNumber(r)
}
// isHashtagBoundary reports whether r is a rune at which text may break
// immediately before or after a hashtag.
func isHashtagBoundary(r rune) bool {
	switch {
	case r == '#':
		// `###lol` should work.
		return true
	case unicode.IsSpace(r), unicode.IsControl(r):
		// All kinds of Unicode whitespace and control characters, like tab.
		return true
	case r == '/', r == '&':
		// `someurl/#fragment` should not match, neither should HTML
		// entities like `&#35;`.
		return false
	case unicode.Is(unicode.Pc, r):
		// "Pc" ("Punctuation, connecting", like `_`) is not a boundary.
		return false
	default:
		// Every other kind of punctuation is.
		return unicode.IsPunct(r)
	}
}