[bugfix] Use better plaintext representation of status for filtering (#3301)

* [bugfix] Use better plaintext representation of status for filtering

* add new deps to readme

* lint

* update tests

* update regexes

* address review comments

* remove now unused xxhash

* whoops, wrong logger

* Merge branch 'main' into status_filtering_bugfix

* put cache in caches struct

* pain
This commit is contained in:
tobi
2024-09-16 14:00:23 +02:00
committed by GitHub
parent 6dd936fbe1
commit efd1a4f717
15 changed files with 2685 additions and 64 deletions

View File

@@ -21,6 +21,8 @@ import (
"context"
"errors"
"fmt"
"slices"
"strconv"
"strings"
"time"
@@ -35,7 +37,6 @@ import (
"github.com/superseriousbusiness/gotosocial/internal/language"
"github.com/superseriousbusiness/gotosocial/internal/log"
"github.com/superseriousbusiness/gotosocial/internal/media"
"github.com/superseriousbusiness/gotosocial/internal/text"
"github.com/superseriousbusiness/gotosocial/internal/uris"
"github.com/superseriousbusiness/gotosocial/internal/util"
)
@@ -939,32 +940,48 @@ func (c *Converter) statusToAPIFilterResults(
return nil, nil
}
// Extract text fields from the status that we will match filters against.
fields := filterableTextFields(s)
// Key this status based on ID + last updated time,
// to ensure we always filter on latest version.
statusKey := s.ID + strconv.FormatInt(s.UpdatedAt.Unix(), 10)
// Check if we have filterable fields cached for this status.
cache := c.state.Caches.StatusesFilterableFields
fields, stored := cache.Get(statusKey)
if !stored {
// We don't have filterable fields
// cached, calculate + cache now.
fields = filterableFields(s)
cache.Set(statusKey, fields)
}
// Record all matching warn filters and the reasons they matched.
filterResults := make([]apimodel.FilterResult, 0, len(filters))
for _, filter := range filters {
if !filterAppliesInContext(filter, filterContext) {
// Filter doesn't apply to this context.
continue
}
if filter.Expired(now) {
// Filter doesn't apply
// to this context.
continue
}
// List all matching keywords.
if filter.Expired(now) {
// Filter doesn't
// apply anymore.
continue
}
// Assemble matching keywords (if any) from this filter.
keywordMatches := make([]string, 0, len(filter.Keywords))
for _, filterKeyword := range filter.Keywords {
var isMatch bool
for _, field := range fields {
if filterKeyword.Regexp.MatchString(field) {
isMatch = true
break
}
}
if isMatch {
keywordMatches = append(keywordMatches, filterKeyword.Keyword)
for _, keyword := range filter.Keywords {
// Check if at least one filterable field
// in the status matches on this filter.
if slices.ContainsFunc(
fields,
func(field string) bool {
return keyword.Regexp.MatchString(field)
},
) {
// At least one field matched on this filter.
keywordMatches = append(keywordMatches, keyword.Keyword)
}
}
@@ -1001,40 +1018,6 @@ func (c *Converter) statusToAPIFilterResults(
return filterResults, nil
}
// filterableTextFields returns all text from a status that we might want to filter on:
// - content
// - content warning
// - media descriptions
// - poll options
func filterableTextFields(s *gtsmodel.Status) []string {
fieldCount := 2 + len(s.Attachments)
if s.Poll != nil {
fieldCount += len(s.Poll.Options)
}
fields := make([]string, 0, fieldCount)
if s.Content != "" {
fields = append(fields, text.SanitizeToPlaintext(s.Content))
}
if s.ContentWarning != "" {
fields = append(fields, s.ContentWarning)
}
for _, attachment := range s.Attachments {
if attachment.Description != "" {
fields = append(fields, attachment.Description)
}
}
if s.Poll != nil {
for _, option := range s.Poll.Options {
if option != "" {
fields = append(fields, option)
}
}
}
return fields
}
// filterAppliesInContext returns whether a given filter applies in a given context.
func filterAppliesInContext(filter *gtsmodel.Filter, filterContext statusfilter.FilterContext) bool {
switch filterContext {