GoToSocial/internal/util/statustools.go

/*
   GoToSocial
   Copyright (C) 2021-2022 GoToSocial Authors admin@gotosocial.org

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU Affero General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Affero General Public License for more details.

   You should have received a copy of the GNU Affero General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

package util

import (
	"unicode"
	"unicode/utf8"

	"github.com/superseriousbusiness/gotosocial/internal/regexes"
)

const (
	// maximumHashtagLength is the upper bound on the length of a hashtag's
	// text, not counting the leading `#`. Longer hashtags are not recorded
	// by appendTag.
	maximumHashtagLength = 30
)

// DeriveMentionNamesFromText takes a plaintext (ie., not html-formatted) text
// and runs regexes.MentionFinder over it. The first capture group of every
// match is collected, and the collected names are passed through
// UniqueStrings before being returned. Names are in the format
// "@user@example.org", or "@username" for local users.
func DeriveMentionNamesFromText(text string) []string {
	matches := regexes.MentionFinder.FindAllStringSubmatch(text, -1)

	// Always hand a non-nil slice to UniqueStrings, even with no matches.
	names := make([]string, 0, len(matches))
	for i := range matches {
		// Index 1 is the first capture group: the mentioned account name.
		names = append(names, matches[i][1])
	}

	return UniqueStrings(names)
}

// Pair is a generic struct holding two values: First of type A and
// Second of type B.
type Pair[A, B any] struct {
	First  A
	Second B
}

// Span locates a hashtag inside a string as a pair of byte indices into
// the original string. First is the index of the leading `#` (so it is
// included in the span); Second is the index just past the last byte of
// the hashtag, making text[First:Second] the hashtag with its `#`.
type Span = Pair[int, int]

// DeriveHashtagsFromText takes a plaintext (ie., not HTML-formatted) text
// and returns the hashtags found in it by FindHashtagSpansInText, without
// their leading `#`. Each distinct hashtag appears once, in order of first
// appearance; comparison is exact (case-sensitive). The result is an empty,
// non-nil slice when the text contains no hashtags.
func DeriveHashtagsFromText(text string) []string {
	seen := make(map[string]struct{})
	unique := []string{}

	for _, span := range FindHashtagSpansInText(text) {
		// span.First points at the `#`, so skip one byte to drop it.
		name := text[span.First+1 : span.Second]
		if _, dup := seen[name]; dup {
			continue
		}
		seen[name] = struct{}{}
		unique = append(unique, name)
	}

	return unique
}

// FindHashtagSpansInText takes a plaintext (ie., not HTML-formatted) text,
// walks it once rune by rune, and returns the locations of the hashtags in
// it as Spans of byte indices into text.
//
// A hashtag candidate opens at a `#` that is preceded by a boundary rune
// (the start of the text counts as one) and closes at the next boundary
// rune or at the end of the text. A rune that is neither permitted in a
// hashtag nor a boundary discards the candidate in progress. Closed
// candidates are handed to appendTag, which decides whether to record them.
func FindHashtagSpansInText(text string) []Span {
	spans := []Span{}

	// Byte index of the `#` that opened the current candidate.
	hashAt := 0
	// Whether we are currently inside a candidate.
	scanning := false
	// One rune of lookbehind; a space makes the text start act as a boundary.
	last := ' '

	for pos, r := range text {
		switch {
		case r == '#' && IsHashtagBoundary(last):
			// A fresh candidate begins here (this also restarts on `##`).
			scanning = true
			hashAt = pos

		case !scanning:
			// Outside any candidate: nothing to do for this rune.

		case IsHashtagBoundary(r):
			// The boundary rune closes the candidate; it is not part of it.
			scanning = false
			appendTag(&spans, text, hashAt, pos)

		case !IsPermittedInHashtag(r):
			// Not a hashtag after all; drop the candidate.
			scanning = false

		case pos+utf8.RuneLen(r) == len(text):
			// The text ends on this rune, which closes the candidate.
			appendTag(&spans, text, hashAt, pos+utf8.RuneLen(r))
		}

		last = r
	}

	return spans
}

// appendTag records the hashtag occupying text[start:end] in *tags, provided
// its length is acceptable.
//
// start is the byte index of the leading `#` and end is the byte index just
// past the hashtag. The hashtag's length is measured in runes (characters)
// of the text after the `#`, not in bytes, so that non-ASCII hashtags are
// held to the same limit as ASCII ones. A hashtag is recorded only when it
// has at least one rune after the `#` and at most maximumHashtagLength.
// Spans that do not fit inside text are ignored.
func appendTag(tags *[]Span, text string, start int, end int) {
	// Reject spans outside of text, and a lone `#` with nothing after it.
	if start < 0 || end > len(text) || end-start <= 1 {
		return
	}

	// This check could be moved out into the parsing loop if necessary!
	if n := utf8.RuneCountInString(text[start+1 : end]); n <= maximumHashtagLength {
		*tags = append(*tags, Span{First: start, Second: end})
	}
}

// DeriveEmojisFromText takes a plaintext (ie., not html-formatted) text and
// runs regexes.EmojiFinder over it. The first capture group of every match
// (the emoji without the surrounding `::`) is collected, and the collected
// values are passed through UniqueStrings before being returned.
func DeriveEmojisFromText(text string) []string {
	matches := regexes.EmojiFinder.FindAllStringSubmatch(text, -1)

	// Always hand a non-nil slice to UniqueStrings, even with no matches.
	shortcodes := make([]string, 0, len(matches))
	for i := range matches {
		// Index 1 is the first capture group.
		shortcodes = append(shortcodes, matches[i][1])
	}

	return UniqueStrings(shortcodes)
}

// IsPermittedInHashtag reports whether r may appear inside the text of a
// hashtag (after the leading `#`).
//
// Letters and numbers of any script are permitted. Unicode marks (combining
// accents, vowel signs and the like) are permitted too: they are part of
// how words are written in many scripts, and of decomposed forms such as
// `e` followed by U+0301, so rejecting them would invalidate otherwise
// ordinary hashtags.
func IsPermittedInHashtag(r rune) bool {
	return unicode.IsLetter(r) || unicode.IsNumber(r) || unicode.IsMark(r)
}

// IsHashtagBoundary reports whether r is a rune before which a hashtag may
// start and at which a hashtag in progress ends.
func IsHashtagBoundary(r rune) bool {
	switch {
	case r == '#':
		// `###lol` should work.
		return true
	case unicode.IsSpace(r), unicode.IsControl(r):
		// All kinds of Unicode whitespace and control characters, like tab.
		return true
	case r == '/' || r == '&':
		// `someurl/#fragment` should not match, neither should HTML
		// entities like `&#35;`.
		return false
	case unicode.Is(unicode.Pc, r):
		// "Pc" ("Punctuation, connecting", like `_`) is not a boundary.
		return false
	default:
		// Every other kind of punctuation is a boundary.
		return unicode.IsPunct(r)
	}
}
Api/v1/statuses (#11) This PR adds: Statuses New status creation. View existing status Delete a status Fave a status Unfave a status See who's faved a status Media Upload media attachment and store/retrieve it Upload custom emoji and store/retrieve it Fileserver Serve files from storage Testing Test models, testrig -- run a GTS test instance and play around with it. 2021-04-19 19:42:19 +02:00			`/*`
			`GoToSocial`
Extend license notices to 2022 (#354) 2021-12-20 18:42:19 +01:00			`Copyright (C) 2021-2022 GoToSocial Authors admin@gotosocial.org`
Api/v1/statuses (#11) This PR adds: Statuses New status creation. View existing status Delete a status Fave a status Unfave a status See who's faved a status Media Upload media attachment and store/retrieve it Upload custom emoji and store/retrieve it Fileserver Serve files from storage Testing Test models, testrig -- run a GTS test instance and play around with it. 2021-04-19 19:42:19 +02:00
			`This program is free software: you can redistribute it and/or modify`
			`it under the terms of the GNU Affero General Public License as published by`
			`the Free Software Foundation, either version 3 of the License, or`
			`(at your option) any later version.`

			`This program is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU Affero General Public License for more details.`

			`You should have received a copy of the GNU Affero General Public License`
			`along with this program. If not, see <http://www.gnu.org/licenses/>.`
			`*/`

			`package util`

			`import (`
[bugfix] Fix unicode-unaware word boundary check in hashtags (#1049) * [bugfix] Fix unicode-unaware word boundary check in hashtag regex Go `\b` does not care for Unicode, and without lookahead, the workarounds got very ugly. So I replaced the regex with a parser. The parser runs in O(n) time and performance should not be affected. * [bugfix] Add back hashtag max length and add tests for it 2022-11-15 16:05:34 +01:00			`"unicode"`
			`"unicode/utf8"`
moving stuff around 2021-09-01 18:29:25 +02:00
			`"github.com/superseriousbusiness/gotosocial/internal/regexes"`
Api/v1/statuses (#11) This PR adds: Statuses New status creation. View existing status Delete a status Fave a status Unfave a status See who's faved a status Media Upload media attachment and store/retrieve it Upload custom emoji and store/retrieve it Fileserver Serve files from storage Testing Test models, testrig -- run a GTS test instance and play around with it. 2021-04-19 19:42:19 +02:00			`)`

[bugfix] Fix unicode-unaware word boundary check in hashtags (#1049) * [bugfix] Fix unicode-unaware word boundary check in hashtag regex Go `\b` does not care for Unicode, and without lookahead, the workarounds got very ugly. So I replaced the regex with a parser. The parser runs in O(n) time and performance should not be affected. * [bugfix] Add back hashtag max length and add tests for it 2022-11-15 16:05:34 +01:00			`const (`
			`maximumHashtagLength = 30`
			`)`

[feature] Dereference remote mentions when the account is not already known (#442) * remove mention util function from db * add ParseMentionFunc to gtsmodel * add parseMentionFunc to processor * refactor search to simplify it a bit * add parseMentionFunc to account * add parseMentionFunc to status * some renaming for clarity * test dereference of unknown mentioned account 2022-03-29 11:54:56 +02:00			`// DeriveMentionNamesFromText takes a plaintext (ie., not html-formatted) text,`
			`// and applies a regex to it to return a deduplicated list of account names`
			`// mentioned in that text, in the format "@user@example.org" or "@username" for`
			`// local users.`
			`func DeriveMentionNamesFromText(text string) []string {`
Api/v1/statuses (#11) This PR adds: Statuses New status creation. View existing status Delete a status Fave a status Unfave a status See who's faved a status Media Upload media attachment and store/retrieve it Upload custom emoji and store/retrieve it Fileserver Serve files from storage Testing Test models, testrig -- run a GTS test instance and play around with it. 2021-04-19 19:42:19 +02:00			`mentionedAccounts := []string{}`
kim is a reply guy (#208) * bun debug * bun trace logging hooks * more tests * fix up some stuffffff * drop the frontend cache until a proper fix is made * go fmt 2021-09-11 13:19:06 +02:00			`for _, m := range regexes.MentionFinder.FindAllStringSubmatch(text, -1) {`
Api/v1/statuses (#11) This PR adds: Statuses New status creation. View existing status Delete a status Fave a status Unfave a status See who's faved a status Media Upload media attachment and store/retrieve it Upload custom emoji and store/retrieve it Fileserver Serve files from storage Testing Test models, testrig -- run a GTS test instance and play around with it. 2021-04-19 19:42:19 +02:00			`mentionedAccounts = append(mentionedAccounts, m[1])`
			`}`
Database updates (#144) * start moving some database stuff around * continue moving db stuff around * more fiddling * more updates * and some more * and yet more * i broke SOMETHING but what, it's a mystery * tidy up * vendor ttlcache * use ttlcache * fix up some tests * rename some stuff * little reminder * some more updates 2021-08-20 12:26:56 +02:00			`return UniqueStrings(mentionedAccounts)`
Api/v1/statuses (#11) This PR adds: Statuses New status creation. View existing status Delete a status Fave a status Unfave a status See who's faved a status Media Upload media attachment and store/retrieve it Upload custom emoji and store/retrieve it Fileserver Serve files from storage Testing Test models, testrig -- run a GTS test instance and play around with it. 2021-04-19 19:42:19 +02:00			`}`

[bugfix] Fix unicode-unaware word boundary check in hashtags (#1049) * [bugfix] Fix unicode-unaware word boundary check in hashtag regex Go `\b` does not care for Unicode, and without lookahead, the workarounds got very ugly. So I replaced the regex with a parser. The parser runs in O(n) time and performance should not be affected. * [bugfix] Add back hashtag max length and add tests for it 2022-11-15 16:05:34 +01:00			`type Pair[A, B any] struct {`
			`First A`
			`Second B`
			`}`

			`// Byte index in original string`
			// `First` includes `#`.
			`type Span = Pair[int, int]`

			`// Takes a plaintext (ie., not HTML-formatted) text,`
			`// and returns a slice of unique hashtags.`
kim is a reply guy (#208) * bun debug * bun trace logging hooks * more tests * fix up some stuffffff * drop the frontend cache until a proper fix is made * go fmt 2021-09-11 13:19:06 +02:00			`func DeriveHashtagsFromText(text string) []string {`
[bugfix] Fix unicode-unaware word boundary check in hashtags (#1049) * [bugfix] Fix unicode-unaware word boundary check in hashtag regex Go `\b` does not care for Unicode, and without lookahead, the workarounds got very ugly. So I replaced the regex with a parser. The parser runs in O(n) time and performance should not be affected. * [bugfix] Add back hashtag max length and add tests for it 2022-11-15 16:05:34 +01:00			`tagsMap := make(map[string]bool)`
Api/v1/statuses (#11) This PR adds: Statuses New status creation. View existing status Delete a status Fave a status Unfave a status See who's faved a status Media Upload media attachment and store/retrieve it Upload custom emoji and store/retrieve it Fileserver Serve files from storage Testing Test models, testrig -- run a GTS test instance and play around with it. 2021-04-19 19:42:19 +02:00			`tags := []string{}`
[bugfix] Fix unicode-unaware word boundary check in hashtags (#1049) * [bugfix] Fix unicode-unaware word boundary check in hashtag regex Go `\b` does not care for Unicode, and without lookahead, the workarounds got very ugly. So I replaced the regex with a parser. The parser runs in O(n) time and performance should not be affected. * [bugfix] Add back hashtag max length and add tests for it 2022-11-15 16:05:34 +01:00
			`for _, v := range FindHashtagSpansInText(text) {`
			`t := text[v.First+1 : v.Second]`
			`if _, value := tagsMap[t]; !value {`
			`tagsMap[t] = true`
			`tags = append(tags, t)`
			`}`
			`}`

			`return tags`
			`}`

			`// Takes a plaintext (ie., not HTML-formatted) text,`
			`// and returns a list of pairs of indices into the original string, where`
			`// hashtags are located.`
			`func FindHashtagSpansInText(text string) []Span {`
			`tags := []Span{}`
			`start := 0`
			`// Keep one rune of lookbehind.`
			`prev := ' '`
			`inTag := false`

			`for i, r := range text {`
[chore/bugfix] Switch markdown from blackfriday to goldmark (#1267) Co-authored-by: Autumn! <autumnull@posteo.net> 2022-12-16 12:20:22 +01:00			`if r == '#' && IsHashtagBoundary(prev) {`
[bugfix] Fix unicode-unaware word boundary check in hashtags (#1049) * [bugfix] Fix unicode-unaware word boundary check in hashtag regex Go `\b` does not care for Unicode, and without lookahead, the workarounds got very ugly. So I replaced the regex with a parser. The parser runs in O(n) time and performance should not be affected. * [bugfix] Add back hashtag max length and add tests for it 2022-11-15 16:05:34 +01:00			`// Start of hashtag.`
			`inTag = true`
			`start = i`
[chore/bugfix] Switch markdown from blackfriday to goldmark (#1267) Co-authored-by: Autumn! <autumnull@posteo.net> 2022-12-16 12:20:22 +01:00			`} else if inTag && !IsPermittedInHashtag(r) && !IsHashtagBoundary(r) {`
[bugfix] Fix unicode-unaware word boundary check in hashtags (#1049) * [bugfix] Fix unicode-unaware word boundary check in hashtag regex Go `\b` does not care for Unicode, and without lookahead, the workarounds got very ugly. So I replaced the regex with a parser. The parser runs in O(n) time and performance should not be affected. * [bugfix] Add back hashtag max length and add tests for it 2022-11-15 16:05:34 +01:00			`// Inside the hashtag, but it was a phoney, gottem.`
			`inTag = false`
[chore/bugfix] Switch markdown from blackfriday to goldmark (#1267) Co-authored-by: Autumn! <autumnull@posteo.net> 2022-12-16 12:20:22 +01:00			`} else if inTag && IsHashtagBoundary(r) {`
[bugfix] Fix unicode-unaware word boundary check in hashtags (#1049) * [bugfix] Fix unicode-unaware word boundary check in hashtag regex Go `\b` does not care for Unicode, and without lookahead, the workarounds got very ugly. So I replaced the regex with a parser. The parser runs in O(n) time and performance should not be affected. * [bugfix] Add back hashtag max length and add tests for it 2022-11-15 16:05:34 +01:00			`// End of hashtag.`
			`inTag = false`
			`appendTag(&tags, text, start, i)`
			`} else if irl := i + utf8.RuneLen(r); inTag && irl == len(text) {`
			`// End of text.`
			`appendTag(&tags, text, start, irl)`
			`}`

			`prev = r`
			`}`

			`return tags`
			`}`

			`func appendTag(tags *[]Span, text string, start int, end int) {`
			`l := end - start - 1`
			`// This check could be moved out into the parsing loop if necessary!`
			`if 0 < l && l <= maximumHashtagLength {`
			`tags = append(tags, Span{First: start, Second: end})`
Api/v1/statuses (#11) This PR adds: Statuses New status creation. View existing status Delete a status Fave a status Unfave a status See who's faved a status Media Upload media attachment and store/retrieve it Upload custom emoji and store/retrieve it Fileserver Serve files from storage Testing Test models, testrig -- run a GTS test instance and play around with it. 2021-04-19 19:42:19 +02:00			`}`
			`}`

kim is a reply guy (#208) * bun debug * bun trace logging hooks * more tests * fix up some stuffffff * drop the frontend cache until a proper fix is made * go fmt 2021-09-11 13:19:06 +02:00			`// DeriveEmojisFromText takes a plaintext (ie., not html-formatted) text,`
Api/v1/statuses (#11) This PR adds: Statuses New status creation. View existing status Delete a status Fave a status Unfave a status See who's faved a status Media Upload media attachment and store/retrieve it Upload custom emoji and store/retrieve it Fileserver Serve files from storage Testing Test models, testrig -- run a GTS test instance and play around with it. 2021-04-19 19:42:19 +02:00			`// and applies a regex to it to return a deduplicated list of emojis`
kim is a reply guy (#208) * bun debug * bun trace logging hooks * more tests * fix up some stuffffff * drop the frontend cache until a proper fix is made * go fmt 2021-09-11 13:19:06 +02:00			// used in that text, without the surrounding `::`
			`func DeriveEmojisFromText(text string) []string {`
Api/v1/statuses (#11) This PR adds: Statuses New status creation. View existing status Delete a status Fave a status Unfave a status See who's faved a status Media Upload media attachment and store/retrieve it Upload custom emoji and store/retrieve it Fileserver Serve files from storage Testing Test models, testrig -- run a GTS test instance and play around with it. 2021-04-19 19:42:19 +02:00			`emojis := []string{}`
kim is a reply guy (#208) * bun debug * bun trace logging hooks * more tests * fix up some stuffffff * drop the frontend cache until a proper fix is made * go fmt 2021-09-11 13:19:06 +02:00			`for _, m := range regexes.EmojiFinder.FindAllStringSubmatch(text, -1) {`
Api/v1/statuses (#11) This PR adds: Statuses New status creation. View existing status Delete a status Fave a status Unfave a status See who's faved a status Media Upload media attachment and store/retrieve it Upload custom emoji and store/retrieve it Fileserver Serve files from storage Testing Test models, testrig -- run a GTS test instance and play around with it. 2021-04-19 19:42:19 +02:00			`emojis = append(emojis, m[1])`
			`}`
Database updates (#144) * start moving some database stuff around * continue moving db stuff around * more fiddling * more updates * and some more * and yet more * i broke SOMETHING but what, it's a mystery * tidy up * vendor ttlcache * use ttlcache * fix up some tests * rename some stuff * little reminder * some more updates 2021-08-20 12:26:56 +02:00			`return UniqueStrings(emojis)`
Api/v1/statuses (#11) This PR adds: Statuses New status creation. View existing status Delete a status Fave a status Unfave a status See who's faved a status Media Upload media attachment and store/retrieve it Upload custom emoji and store/retrieve it Fileserver Serve files from storage Testing Test models, testrig -- run a GTS test instance and play around with it. 2021-04-19 19:42:19 +02:00			`}`
[bugfix] Fix unicode-unaware word boundary check in hashtags (#1049) * [bugfix] Fix unicode-unaware word boundary check in hashtag regex Go `\b` does not care for Unicode, and without lookahead, the workarounds got very ugly. So I replaced the regex with a parser. The parser runs in O(n) time and performance should not be affected. * [bugfix] Add back hashtag max length and add tests for it 2022-11-15 16:05:34 +01:00
[chore/bugfix] Switch markdown from blackfriday to goldmark (#1267) Co-authored-by: Autumn! <autumnull@posteo.net> 2022-12-16 12:20:22 +01:00			`func IsPermittedInHashtag(r rune) bool {`
[bugfix] Fix unicode-unaware word boundary check in hashtags (#1049) * [bugfix] Fix unicode-unaware word boundary check in hashtag regex Go `\b` does not care for Unicode, and without lookahead, the workarounds got very ugly. So I replaced the regex with a parser. The parser runs in O(n) time and performance should not be affected. * [bugfix] Add back hashtag max length and add tests for it 2022-11-15 16:05:34 +01:00			`return unicode.IsLetter(r) \|\| unicode.IsNumber(r)`
			`}`

			`// Decides where to break before or after a hashtag.`
[chore/bugfix] Switch markdown from blackfriday to goldmark (#1267) Co-authored-by: Autumn! <autumnull@posteo.net> 2022-12-16 12:20:22 +01:00			`func IsHashtagBoundary(r rune) bool {`
[bugfix] Fix unicode-unaware word boundary check in hashtags (#1049) * [bugfix] Fix unicode-unaware word boundary check in hashtag regex Go `\b` does not care for Unicode, and without lookahead, the workarounds got very ugly. So I replaced the regex with a parser. The parser runs in O(n) time and performance should not be affected. * [bugfix] Add back hashtag max length and add tests for it 2022-11-15 16:05:34 +01:00			return r == '#' \|\| // `###lol` should work
			`unicode.IsSpace(r) \|\| // All kinds of Unicode whitespace.`
			`unicode.IsControl(r) \|\| // All kinds of control characters, like tab.`
			// Most kinds of punctuation except "Pc" ("Punctuation, connecting", like `_`).
			// But `someurl/#fragment` should not match, neither should HTML entities like `#`.
			`('/' != r && '&' != r && !unicode.Is(unicode.Categories["Pc"], r) && unicode.IsPunct(r))`
			`}`