mirror of
https://github.com/superseriousbusiness/gotosocial
synced 2025-06-05 21:59:39 +02:00
[bugfix] Use better plaintext representation of status for filtering (#3301)
* [bugfix] Use better plaintext representation of status for filtering * add new deps to readme * lint * update tests * update regexes * address review comments * remove now unused xxhash * whoops, wrong logger * Merge branch 'main' into status_filtering_bugfix * put cache in caches struct * pain
This commit is contained in:
@@ -21,6 +21,8 @@ import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@@ -35,7 +37,6 @@ import (
|
||||
"github.com/superseriousbusiness/gotosocial/internal/language"
|
||||
"github.com/superseriousbusiness/gotosocial/internal/log"
|
||||
"github.com/superseriousbusiness/gotosocial/internal/media"
|
||||
"github.com/superseriousbusiness/gotosocial/internal/text"
|
||||
"github.com/superseriousbusiness/gotosocial/internal/uris"
|
||||
"github.com/superseriousbusiness/gotosocial/internal/util"
|
||||
)
|
||||
@@ -939,32 +940,48 @@ func (c *Converter) statusToAPIFilterResults(
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// Extract text fields from the status that we will match filters against.
|
||||
fields := filterableTextFields(s)
|
||||
// Key this status based on ID + last updated time,
|
||||
// to ensure we always filter on latest version.
|
||||
statusKey := s.ID + strconv.FormatInt(s.UpdatedAt.Unix(), 10)
|
||||
|
||||
// Check if we have filterable fields cached for this status.
|
||||
cache := c.state.Caches.StatusesFilterableFields
|
||||
fields, stored := cache.Get(statusKey)
|
||||
if !stored {
|
||||
// We don't have filterable fields
|
||||
// cached, calculate + cache now.
|
||||
fields = filterableFields(s)
|
||||
cache.Set(statusKey, fields)
|
||||
}
|
||||
|
||||
// Record all matching warn filters and the reasons they matched.
|
||||
filterResults := make([]apimodel.FilterResult, 0, len(filters))
|
||||
for _, filter := range filters {
|
||||
if !filterAppliesInContext(filter, filterContext) {
|
||||
// Filter doesn't apply to this context.
|
||||
continue
|
||||
}
|
||||
if filter.Expired(now) {
|
||||
// Filter doesn't apply
|
||||
// to this context.
|
||||
continue
|
||||
}
|
||||
|
||||
// List all matching keywords.
|
||||
if filter.Expired(now) {
|
||||
// Filter doesn't
|
||||
// apply anymore.
|
||||
continue
|
||||
}
|
||||
|
||||
// Assemble matching keywords (if any) from this filter.
|
||||
keywordMatches := make([]string, 0, len(filter.Keywords))
|
||||
for _, filterKeyword := range filter.Keywords {
|
||||
var isMatch bool
|
||||
for _, field := range fields {
|
||||
if filterKeyword.Regexp.MatchString(field) {
|
||||
isMatch = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if isMatch {
|
||||
keywordMatches = append(keywordMatches, filterKeyword.Keyword)
|
||||
for _, keyword := range filter.Keywords {
|
||||
// Check if at least one filterable field
|
||||
// in the status matches on this filter.
|
||||
if slices.ContainsFunc(
|
||||
fields,
|
||||
func(field string) bool {
|
||||
return keyword.Regexp.MatchString(field)
|
||||
},
|
||||
) {
|
||||
// At least one field matched on this filter.
|
||||
keywordMatches = append(keywordMatches, keyword.Keyword)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1001,40 +1018,6 @@ func (c *Converter) statusToAPIFilterResults(
|
||||
return filterResults, nil
|
||||
}
|
||||
|
||||
// filterableTextFields returns all text from a status that we might want to filter on:
|
||||
// - content
|
||||
// - content warning
|
||||
// - media descriptions
|
||||
// - poll options
|
||||
func filterableTextFields(s *gtsmodel.Status) []string {
|
||||
fieldCount := 2 + len(s.Attachments)
|
||||
if s.Poll != nil {
|
||||
fieldCount += len(s.Poll.Options)
|
||||
}
|
||||
fields := make([]string, 0, fieldCount)
|
||||
|
||||
if s.Content != "" {
|
||||
fields = append(fields, text.SanitizeToPlaintext(s.Content))
|
||||
}
|
||||
if s.ContentWarning != "" {
|
||||
fields = append(fields, s.ContentWarning)
|
||||
}
|
||||
for _, attachment := range s.Attachments {
|
||||
if attachment.Description != "" {
|
||||
fields = append(fields, attachment.Description)
|
||||
}
|
||||
}
|
||||
if s.Poll != nil {
|
||||
for _, option := range s.Poll.Options {
|
||||
if option != "" {
|
||||
fields = append(fields, option)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return fields
|
||||
}
|
||||
|
||||
// filterAppliesInContext returns whether a given filter applies in a given context.
|
||||
func filterAppliesInContext(filter *gtsmodel.Filter, filterContext statusfilter.FilterContext) bool {
|
||||
switch filterContext {
|
||||
|
@@ -1063,15 +1063,21 @@ func (suite *InternalToFrontendTestSuite) TestHideFilteredBoostToFrontend() {
|
||||
|
||||
// Test that a hashtag filter for a hashtag in Mastodon HTML content works the way most users would expect.
|
||||
func (suite *InternalToFrontendTestSuite) testHashtagFilteredStatusToFrontend(wholeWord bool, boost bool) {
|
||||
testStatus := suite.testStatuses["admin_account_status_1"]
|
||||
testStatus := new(gtsmodel.Status)
|
||||
*testStatus = *suite.testStatuses["admin_account_status_1"]
|
||||
testStatus.Content = `<p>doggo doggin' it</p><p><a href="https://example.test/tags/dogsofmastodon" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>dogsofmastodon</span></a></p>`
|
||||
|
||||
if boost {
|
||||
// Modify a fixture boost into a boost of the above status.
|
||||
boostStatus := suite.testStatuses["admin_account_status_4"]
|
||||
boostStatus.BoostOf = testStatus
|
||||
boostStatus.BoostOfID = testStatus.ID
|
||||
testStatus = boostStatus
|
||||
boost, err := suite.typeconverter.StatusToBoost(
|
||||
context.Background(),
|
||||
testStatus,
|
||||
suite.testAccounts["admin_account"],
|
||||
"",
|
||||
)
|
||||
if err != nil {
|
||||
suite.FailNow(err.Error())
|
||||
}
|
||||
testStatus = boost
|
||||
}
|
||||
|
||||
requestingAccount := suite.testAccounts["local_account_1"]
|
||||
@@ -1103,9 +1109,11 @@ func (suite *InternalToFrontendTestSuite) testHashtagFilteredStatusToFrontend(wh
|
||||
[]*gtsmodel.Filter{filter},
|
||||
nil,
|
||||
)
|
||||
if suite.NoError(err) {
|
||||
suite.NotEmpty(apiStatus.Filtered)
|
||||
if err != nil {
|
||||
suite.FailNow(err.Error())
|
||||
}
|
||||
|
||||
suite.NotEmpty(apiStatus.Filtered)
|
||||
}
|
||||
|
||||
func (suite *InternalToFrontendTestSuite) TestHashtagWholeWordFilteredStatusToFrontend() {
|
||||
|
@@ -27,6 +27,7 @@ import (
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/k3a/html2text"
|
||||
apimodel "github.com/superseriousbusiness/gotosocial/internal/api/model"
|
||||
"github.com/superseriousbusiness/gotosocial/internal/config"
|
||||
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
|
||||
@@ -284,3 +285,64 @@ func ContentToContentLanguage(
|
||||
|
||||
return contentStr, langTagStr
|
||||
}
|
||||
|
||||
// filterableFields returns text fields from
|
||||
// a status that we might want to filter on:
|
||||
//
|
||||
// - content warning
|
||||
// - content (converted to plaintext from HTML)
|
||||
// - media descriptions
|
||||
// - poll options
|
||||
//
|
||||
// Each field should be filtered separately.
|
||||
// This avoids scenarios where false-positive
|
||||
// multiple-word matches can be made by matching
|
||||
// the last word of one field + the first word
|
||||
// of the next field together.
|
||||
func filterableFields(s *gtsmodel.Status) []string {
|
||||
// Estimate length of fields.
|
||||
fieldCount := 2 + len(s.Attachments)
|
||||
if s.Poll != nil {
|
||||
fieldCount += len(s.Poll.Options)
|
||||
}
|
||||
fields := make([]string, 0, fieldCount)
|
||||
|
||||
// Content warning / title.
|
||||
if s.ContentWarning != "" {
|
||||
fields = append(fields, s.ContentWarning)
|
||||
}
|
||||
|
||||
// Status content. Though we have raw text
|
||||
// available for statuses created on our
|
||||
// instance, use the html2text version to
|
||||
// remove markdown-formatting characters
|
||||
// and ensure more consistent filtering.
|
||||
if s.Content != "" {
|
||||
text := html2text.HTML2TextWithOptions(
|
||||
s.Content,
|
||||
html2text.WithLinksInnerText(),
|
||||
html2text.WithUnixLineBreaks(),
|
||||
)
|
||||
if text != "" {
|
||||
fields = append(fields, text)
|
||||
}
|
||||
}
|
||||
|
||||
// Media descriptions.
|
||||
for _, attachment := range s.Attachments {
|
||||
if attachment.Description != "" {
|
||||
fields = append(fields, attachment.Description)
|
||||
}
|
||||
}
|
||||
|
||||
// Poll options.
|
||||
if s.Poll != nil {
|
||||
for _, opt := range s.Poll.Options {
|
||||
if opt != "" {
|
||||
fields = append(fields, opt)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return fields
|
||||
}
|
||||
|
@@ -21,6 +21,7 @@ import (
|
||||
"context"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/superseriousbusiness/gotosocial/internal/config"
|
||||
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
|
||||
"github.com/superseriousbusiness/gotosocial/internal/language"
|
||||
@@ -158,3 +159,62 @@ func TestContentToContentLanguage(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestFilterableText(t *testing.T) {
|
||||
type testcase struct {
|
||||
status *gtsmodel.Status
|
||||
expectedFields []string
|
||||
}
|
||||
|
||||
for _, testcase := range []testcase{
|
||||
{
|
||||
status: >smodel.Status{
|
||||
ContentWarning: "This is a test status",
|
||||
Content: `<p>Import / export of account data via CSV files will be coming in 0.17.0 :) No more having to run scripts + CLI tools to import a list of accounts you follow, after doing a migration to a <a href="https://gts.superseriousbusiness.org/tags/gotosocial" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>GoToSocial</span></a> instance.</p>`,
|
||||
},
|
||||
expectedFields: []string{
|
||||
"This is a test status",
|
||||
"Import / export of account data via CSV files will be coming in 0.17.0 :) No more having to run scripts + CLI tools to import a list of accounts you follow, after doing a migration to a #GoToSocial <https://gts.superseriousbusiness.org/tags/gotosocial> instance.",
|
||||
},
|
||||
},
|
||||
{
|
||||
status: >smodel.Status{
|
||||
Content: `<p><span class="h-card"><a href="https://example.org/@zlatko" class="u-url mention" rel="nofollow noreferrer noopener" target="_blank">@<span>zlatko</span></a></span> currently we used modernc/sqlite3 for our sqlite driver, but we've been experimenting with wasm sqlite, and will likely move to that permanently in future; in the meantime, both options are available (the latter with a build tag)</p><p><a href="https://github.com/superseriousbusiness/gotosocial/pull/2863" rel="nofollow noreferrer noopener" target="_blank">https://github.com/superseriousbusiness/gotosocial/pull/2863</a></p>`,
|
||||
},
|
||||
expectedFields: []string{
|
||||
"@zlatko <https://example.org/@zlatko> currently we used modernc/sqlite3 for our sqlite driver, but we've been experimenting with wasm sqlite, and will likely move to that permanently in future; in the meantime, both options are available (the latter with a build tag)\n\nhttps://github.com/superseriousbusiness/gotosocial/pull/2863 <https://github.com/superseriousbusiness/gotosocial/pull/2863>",
|
||||
},
|
||||
},
|
||||
{
|
||||
status: >smodel.Status{
|
||||
ContentWarning: "Nerd stuff",
|
||||
Content: `<p>Latest graphs for <a href="https://gts.superseriousbusiness.org/tags/gotosocial" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>GoToSocial</span></a> on <a href="https://github.com/ncruces/go-sqlite3" rel="nofollow noreferrer noopener" target="_blank">Wasm sqlite3</a> with <a href="https://codeberg.org/gruf/go-ffmpreg" rel="nofollow noreferrer noopener" target="_blank">embedded Wasm ffmpeg</a>, both running on <a href="https://wazero.io/" rel="nofollow noreferrer noopener" target="_blank">Wazero</a>, and configured with a <a href="https://github.com/superseriousbusiness/gotosocial/blob/20fe430ef9ff3012a7a4dc2d01b68020c20e13bb/example/config.yaml#L259-L266" rel="nofollow noreferrer noopener" target="_blank">50MiB db cache target</a>. This is the version we'll be releasing soonish, now we're happy with how we've tamed everything.</p>`,
|
||||
Attachments: []*gtsmodel.MediaAttachment{
|
||||
{
|
||||
Description: `Graph showing GtS using between 150-300 MiB of memory, steadily, over a few days.`,
|
||||
},
|
||||
{
|
||||
Description: `Another media attachment`,
|
||||
},
|
||||
},
|
||||
Poll: >smodel.Poll{
|
||||
Options: []string{
|
||||
"Poll option 1",
|
||||
"Poll option 2",
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedFields: []string{
|
||||
"Nerd stuff",
|
||||
"Latest graphs for #GoToSocial <https://gts.superseriousbusiness.org/tags/gotosocial> on Wasm sqlite3 <https://github.com/ncruces/go-sqlite3> with embedded Wasm ffmpeg <https://codeberg.org/gruf/go-ffmpreg>, both running on Wazero <https://wazero.io/>, and configured with a 50MiB db cache target <https://github.com/superseriousbusiness/gotosocial/blob/20fe430ef9ff3012a7a4dc2d01b68020c20e13bb/example/config.yaml#L259-L266>. This is the version we'll be releasing soonish, now we're happy with how we've tamed everything.",
|
||||
"Graph showing GtS using between 150-300 MiB of memory, steadily, over a few days.",
|
||||
"Another media attachment",
|
||||
"Poll option 1",
|
||||
"Poll option 2",
|
||||
},
|
||||
},
|
||||
} {
|
||||
fields := filterableFields(testcase.status)
|
||||
assert.Equal(t, testcase.expectedFields, fields)
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user