[bugfix] Use better plaintext representation of status for filtering (#3301)

* [bugfix] Use better plaintext representation of status for filtering

* add new deps to readme

* lint

* update tests

* update regexes

* address review comments

* remove now unused xxhash

* whoops, wrong logger

* Merge branch 'main' into status_filtering_bugfix

* put cache in caches struct

* pain
This commit is contained in:
tobi
2024-09-16 14:00:23 +02:00
committed by GitHub
parent 6dd936fbe1
commit efd1a4f717
15 changed files with 2685 additions and 64 deletions

View File

@@ -27,6 +27,7 @@ import (
"strconv"
"strings"
"github.com/k3a/html2text"
apimodel "github.com/superseriousbusiness/gotosocial/internal/api/model"
"github.com/superseriousbusiness/gotosocial/internal/config"
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
@@ -284,3 +285,64 @@ func ContentToContentLanguage(
return contentStr, langTagStr
}
// filterableFields gathers the non-empty pieces of
// text on a status that filters may be applied to,
// in this order:
//
//   - content warning
//   - content (HTML converted to plaintext)
//   - media descriptions
//   - poll options
//
// The pieces are deliberately returned as separate
// strings instead of one joined string: callers
// should match against each one on its own, so that
// a multi-word filter can't produce a false positive
// by spanning the end of one field and the start of
// the next.
func filterableFields(s *gtsmodel.Status) []string {
	// Work out an upper bound for the number of
	// fields: content warning + content, plus one
	// per attachment, plus one per poll option.
	capacity := 2 + len(s.Attachments)
	if s.Poll != nil {
		capacity += len(s.Poll.Options)
	}

	out := make([]string, 0, capacity)

	// Content warning / title.
	if cw := s.ContentWarning; cw != "" {
		out = append(out, cw)
	}

	// Status content. Locally-created statuses also
	// carry their raw text, but the plaintext derived
	// from the HTML is used instead: it is free of
	// markdown-formatting characters and keeps the
	// filtering consistent.
	if s.Content != "" {
		plain := html2text.HTML2TextWithOptions(
			s.Content,
			html2text.WithLinksInnerText(),
			html2text.WithUnixLineBreaks(),
		)

		// Conversion may leave nothing behind.
		if plain != "" {
			out = append(out, plain)
		}
	}

	// Media descriptions.
	for _, media := range s.Attachments {
		if media.Description == "" {
			continue
		}
		out = append(out, media.Description)
	}

	// Without a poll there's nothing more to add.
	if s.Poll == nil {
		return out
	}

	// Poll options.
	for _, option := range s.Poll.Options {
		if option == "" {
			continue
		}
		out = append(out, option)
	}

	return out
}