[bugfix] Extend parser to handle more non-Latin hashtags (#3700)

* Allow marks after NFC normalization

Includes regression test for the Tamil example from #3618

* Disallow hashtags made up solely of numbers, marks, and underscores
This commit is contained in:
Vyr Cossont
2025-01-31 02:42:55 -08:00
committed by GitHub
parent ab758cc233
commit b9e0689359
5 changed files with 48 additions and 37 deletions

View File

@@ -19,19 +19,14 @@ package text
import "unicode"
func isPlausiblyInHashtag(r rune) bool {
// Marks are allowed during parsing
// prior to normalization, but not after,
// since they may be combined into letters
// during normalization.
return unicode.IsMark(r) ||
isPermittedInHashtag(r)
// isPermittedInHashtag reports whether r may appear in a hashtag:
// either it is a Unicode letter, or it is one of the runes accepted
// by isPermittedIfNotEntireHashtag.
func isPermittedInHashtag(r rune) bool {
	if unicode.IsLetter(r) {
		return true
	}
	return isPermittedIfNotEntireHashtag(r)
}
func isPermittedInHashtag(r rune) bool {
return unicode.IsLetter(r) ||
unicode.IsNumber(r) ||
r == '_'
// isPermittedIfNotEntireHashtag reports whether r is a rune that may
// appear inside a hashtag but is not allowed to be the only kind of
// character making up the hashtag: a Unicode number, a Unicode mark,
// or an underscore.
func isPermittedIfNotEntireHashtag(r rune) bool {
	switch {
	case r == '_':
		return true
	case unicode.IsMark(r):
		return true
	default:
		return unicode.IsNumber(r)
	}
}
// isHashtagBoundary returns true if rune r