[bugfix] Extend parser to handle more non-Latin hashtags (#3700)

* Allow marks after NFC normalization

Includes regression test for the Tamil example from #3618

* Disallow just numbers + marks + underscore as hashtag
This commit is contained in:
Vyr Cossont
2025-01-31 02:42:55 -08:00
committed by GitHub
parent ab758cc233
commit b9e0689359
5 changed files with 48 additions and 37 deletions

View File

@@ -50,17 +50,16 @@ func NormalizeHashtag(text string) (string, bool) {
// Validate normalized result.
var (
notJustUnderscores = false
onlyPermittedChars = true
lengthOK = true
atLeastOneRequiredChar = false
onlyPermittedChars = true
lengthOK = true
)
for i, r := range normalized {
if r != '_' {
// This isn't an underscore,
// so the whole hashtag isn't
// just underscores.
notJustUnderscores = true
if !isPermittedIfNotEntireHashtag(r) {
// This isn't an underscore, mark, etc,
// so the hashtag contains at least one
// of the characters it is required to have.
atLeastOneRequiredChar = true
}
if i >= maximumHashtagLength {
@@ -74,5 +73,5 @@ func NormalizeHashtag(text string) (string, bool) {
}
}
return normalized, (lengthOK && onlyPermittedChars && notJustUnderscores)
return normalized, lengthOK && onlyPermittedChars && atLeastOneRequiredChar
}