[feature] Federate status language in and out (#2366)

* [feature] Federate status language in + out

* go fmt

* tests, little fix

* improve comments

* unnest a bit

* avoid unnecessary nil check

* use more descriptive variable for contentMap

* prefer instance languages when selecting from contentMap

* update docs to reflect lang selection

* rename rdfLangString -> rdfLangs

* update comments to mention Pollable

* iter through slice instead of map
This commit is contained in:
tobi
2023-11-21 15:13:30 +01:00
committed by GitHub
parent 1f962372af
commit cfefbc08d8
15 changed files with 758 additions and 168 deletions

View File

@@ -244,9 +244,15 @@ func (c *Converter) ASStatusToStatus(ctx context.Context, statusable ap.Statusab
}
// status.Content
// status.Language
//
// The (html-formatted) content of this status.
status.Content = ap.ExtractContent(statusable)
// Many implementations set both content
// and contentMap; we can use these to
// infer the language of the status.
status.Content, status.Language = ContentToContentLanguage(
ctx,
ap.ExtractContent(statusable),
)
// status.Attachments
//
@@ -396,9 +402,6 @@ func (c *Converter) ASStatusToStatus(ctx context.Context, statusable ap.Statusab
return &s
}()
// language
// TODO: we might be able to extract this from the contentMap field
// ActivityStreamsType
status.ActivityStreamsType = statusable.GetTypeName()
@@ -707,7 +710,7 @@ func (c *Converter) ASFlagToReport(ctx context.Context, flaggable ap.Flaggable)
// For Mastodon, this will just be a string, or nothing.
// In Misskey's case, it may also contain the URLs of
// one or more reported statuses, so extract these too.
content := ap.ExtractContent(flaggable)
content := ap.ExtractContent(flaggable).Content
statusURIs := []*url.URL{}
inlineURLs := misskeyReportInlineURLs(content)
statusURIs = append(statusURIs, inlineURLs...)

View File

@@ -45,6 +45,10 @@ func (suite *ASToInternalTestSuite) jsonToType(in string) vocab.Type {
suite.FailNow(err.Error())
}
if statusable, ok := t.(ap.Statusable); ok {
ap.NormalizeIncomingContent(statusable, m)
}
return t
}
@@ -103,7 +107,8 @@ func (suite *ASToInternalTestSuite) TestParsePublicStatus() {
suite.NoError(err)
suite.Equal("reading: Punishment and Reward in the Corporate University", status.ContentWarning)
suite.Equal(`<p>&gt; So we have to examine critical thinking as a signifier, dynamic and ambiguous. It has a normative definition, a tacit definition, and an ideal definition. One of the hallmarks of graduate training is learning to comprehend those definitions and applying the correct one as needed for professional success.</p>`, status.Content)
suite.Equal(`<p>> So we have to examine critical thinking as a signifier, dynamic and ambiguous. It has a normative definition, a tacit definition, and an ideal definition. One of the hallmarks of graduate training is learning to comprehend those definitions and applying the correct one as needed for professional success.</p>`, status.Content)
suite.Equal("en", status.Language)
}
func (suite *ASToInternalTestSuite) TestParsePublicStatusNoURL() {
@@ -117,7 +122,7 @@ func (suite *ASToInternalTestSuite) TestParsePublicStatusNoURL() {
suite.NoError(err)
suite.Equal("reading: Punishment and Reward in the Corporate University", status.ContentWarning)
suite.Equal(`<p>&gt; So we have to examine critical thinking as a signifier, dynamic and ambiguous. It has a normative definition, a tacit definition, and an ideal definition. One of the hallmarks of graduate training is learning to comprehend those definitions and applying the correct one as needed for professional success.</p>`, status.Content)
suite.Equal(`<p>> So we have to examine critical thinking as a signifier, dynamic and ambiguous. It has a normative definition, a tacit definition, and an ideal definition. One of the hallmarks of graduate training is learning to comprehend those definitions and applying the correct one as needed for professional success.</p>`, status.Content)
// on statuses with no URL in them (like ones we get from pleroma sometimes) we should use the AP URI of the status as URL
suite.Equal("http://fossbros-anonymous.io/users/foss_satan/statuses/108138763199405167", status.URL)

View File

@@ -607,9 +607,17 @@ func (c *Converter) StatusToAS(ctx context.Context, s *gtsmodel.Status) (ap.Stat
// conversation
// TODO
// content -- the actual post itself
// content -- the actual post
// itself, plus the language
contentProp := streams.NewActivityStreamsContentProperty()
contentProp.AppendXMLSchemaString(s.Content)
if s.Language != "" {
contentProp.AppendRDFLangString(map[string]string{
s.Language: s.Content,
})
}
status.SetActivityStreamsContent(contentProp)
// attachments

View File

@@ -340,6 +340,9 @@ func (suite *InternalToASTestSuite) TestStatusToAS() {
"attributedTo": "http://localhost:8080/users/the_mighty_zork",
"cc": "http://localhost:8080/users/the_mighty_zork/followers",
"content": "hello everyone!",
"contentMap": {
"en": "hello everyone!"
},
"id": "http://localhost:8080/users/the_mighty_zork/statuses/01F8MHAMCHF6Y650WCRSCP4WMY",
"published": "2021-10-20T12:40:37+02:00",
"replies": {
@@ -379,16 +382,21 @@ func (suite *InternalToASTestSuite) TestStatusWithTagsToASWithIDs() {
// http://joinmastodon.org/ns, https://www.w3.org/ns/activitystreams --
// will appear, so trim them out of the string for consistency
trimmed := strings.SplitAfter(string(bytes), `"attachment":`)[1]
suite.Equal(` {
"blurhash": "LNJRdVM{00Rj%Mayt7j[4nWBofRj",
"mediaType": "image/jpeg",
"name": "Black and white image of some 50's style text saying: Welcome On Board",
"type": "Document",
"url": "http://localhost:8080/fileserver/01F8MH17FWEB39HZJ76B6VXSKF/attachment/original/01F8MH6NEM8D7527KZAECTCR76.jpg"
},
suite.Equal(` [
{
"blurhash": "LNJRdVM{00Rj%Mayt7j[4nWBofRj",
"mediaType": "image/jpeg",
"name": "Black and white image of some 50's style text saying: Welcome On Board",
"type": "Document",
"url": "http://localhost:8080/fileserver/01F8MH17FWEB39HZJ76B6VXSKF/attachment/original/01F8MH6NEM8D7527KZAECTCR76.jpg"
}
],
"attributedTo": "http://localhost:8080/users/admin",
"cc": "http://localhost:8080/users/admin/followers",
"content": "hello world! #welcome ! first post on the instance :rainbow: !",
"contentMap": {
"en": "hello world! #welcome ! first post on the instance :rainbow: !"
},
"id": "http://localhost:8080/users/admin/statuses/01F8MH75CBF9JFX4ZAD54N0W0R",
"published": "2021-10-20T11:36:45Z",
"replies": {
@@ -446,16 +454,21 @@ func (suite *InternalToASTestSuite) TestStatusWithTagsToASFromDB() {
// http://joinmastodon.org/ns, https://www.w3.org/ns/activitystreams --
// will appear, so trim them out of the string for consistency
trimmed := strings.SplitAfter(string(bytes), `"attachment":`)[1]
suite.Equal(` {
"blurhash": "LNJRdVM{00Rj%Mayt7j[4nWBofRj",
"mediaType": "image/jpeg",
"name": "Black and white image of some 50's style text saying: Welcome On Board",
"type": "Document",
"url": "http://localhost:8080/fileserver/01F8MH17FWEB39HZJ76B6VXSKF/attachment/original/01F8MH6NEM8D7527KZAECTCR76.jpg"
},
suite.Equal(` [
{
"blurhash": "LNJRdVM{00Rj%Mayt7j[4nWBofRj",
"mediaType": "image/jpeg",
"name": "Black and white image of some 50's style text saying: Welcome On Board",
"type": "Document",
"url": "http://localhost:8080/fileserver/01F8MH17FWEB39HZJ76B6VXSKF/attachment/original/01F8MH6NEM8D7527KZAECTCR76.jpg"
}
],
"attributedTo": "http://localhost:8080/users/admin",
"cc": "http://localhost:8080/users/admin/followers",
"content": "hello world! #welcome ! first post on the instance :rainbow: !",
"contentMap": {
"en": "hello world! #welcome ! first post on the instance :rainbow: !"
},
"id": "http://localhost:8080/users/admin/statuses/01F8MH75CBF9JFX4ZAD54N0W0R",
"published": "2021-10-20T11:36:45Z",
"replies": {
@@ -519,6 +532,9 @@ func (suite *InternalToASTestSuite) TestStatusToASWithMentions() {
"http://localhost:8080/users/the_mighty_zork"
],
"content": "hi @the_mighty_zork welcome to the instance!",
"contentMap": {
"en": "hi @the_mighty_zork welcome to the instance!"
},
"id": "http://localhost:8080/users/admin/statuses/01FF25D5Q0DH7CHD57CTRS6WK0",
"inReplyTo": "http://localhost:8080/users/the_mighty_zork/statuses/01F8MHAMCHF6Y650WCRSCP4WMY",
"published": "2021-11-20T13:32:16Z",

View File

@@ -31,6 +31,8 @@ import (
apimodel "github.com/superseriousbusiness/gotosocial/internal/api/model"
"github.com/superseriousbusiness/gotosocial/internal/config"
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
"github.com/superseriousbusiness/gotosocial/internal/language"
"github.com/superseriousbusiness/gotosocial/internal/log"
"github.com/superseriousbusiness/gotosocial/internal/regexes"
"github.com/superseriousbusiness/gotosocial/internal/text"
)
@@ -184,3 +186,102 @@ func placeholdUnknownAttachments(arr []apimodel.Attachment) (string, []apimodel.
return text.SanitizeToHTML(aside.String()), arr
}
// ContentToContentLanguage tries to
// extract a content string and language
// tag string from the given intermediary
// content.
//
// Selection rules, applied in order:
//
//   - `contentMap` is nil or empty: return
//     `content` as-is, with no language.
//   - `content` is set: return `content`, plus
//     the tag of a `contentMap` entry whose value
//     is identical to it (if there is one).
//   - `contentMap` has exactly one entry:
//     return that entry's tag and content.
//   - Otherwise: return the entry for the first
//     of the instance's configured languages that
//     has non-empty content in `contentMap`; if
//     none match, return an arbitrary entry.
//
// A non-empty language tag is replaced by its
// normalized form if it parses as BCP47. If it
// does not parse, a warning is logged and the
// tag is returned unchanged.
//
// Either/both of the returned strings may
// be empty, depending on how things go.
func ContentToContentLanguage(
	ctx context.Context,
	content gtsmodel.Content,
) (
	string, // content
	string, // language
) {
	var (
		contentStr string
		langTagStr string
	)

	switch contentMap := content.ContentMap; {
	// Simplest case: no usable `contentMap`
	// (nil, or present but without entries).
	// Return `content`, even if empty. There
	// is nothing to infer a language from, so
	// never report one here.
	case len(contentMap) == 0:
		return content.Content, ""

	// `content` and `contentMap` set.
	// Try to infer "primary" language.
	case content.Content != "":
		// Assume `content` is intended
		// primary content, and look for
		// corresponding language tag.
		contentStr = content.Content
		for t, c := range contentMap {
			if contentStr == c {
				langTagStr = t
				break
			}
		}

	// `content` not set; `contentMap`
	// is set with only one value.
	// This must be the "primary" lang.
	case len(contentMap) == 1:
		// Use an empty loop to
		// get the values we want.
		// nolint:revive
		for langTagStr, contentStr = range contentMap {
		}

	// Only `contentMap` is set, with more
	// than one value. Map order is not
	// guaranteed so we can't know the
	// "primary" language.
	//
	// Try to select content using our
	// instance's configured languages.
	//
	// In case of no hits, just take the
	// first tag and content in the map.
	default:
		instanceLangs := config.GetInstanceLanguages()

		// Iterate with a scoped variable and only
		// commit tag + content together on a hit,
		// so a miss can never leave a stale instance
		// language behind in langTagStr.
		for _, tag := range instanceLangs.TagStrs() {
			if c := contentMap[tag]; c != "" {
				// Hit!
				contentStr, langTagStr = c, tag
				break
			}
		}

		// If nothing found, just take
		// the first entry we can get by
		// breaking after the first iter.
		if contentStr == "" {
			for langTagStr, contentStr = range contentMap {
				break
			}
		}
	}

	if langTagStr != "" {
		// Found a lang tag for this content,
		// make sure it's valid / parseable.
		lang, err := language.Parse(langTagStr)
		if err != nil {
			// Not parseable: warn, but still hand
			// back the tag exactly as we received it.
			log.Warnf(
				ctx,
				"could not parse %s as BCP47 language tag in status contentMap: %v",
				langTagStr, err,
			)
		} else {
			// Inferred the language!
			// Use normalized version.
			langTagStr = lang.TagStr
		}
	}

	return contentStr, langTagStr
}

View File

@@ -18,7 +18,12 @@
package typeutils
import (
"context"
"testing"
"github.com/superseriousbusiness/gotosocial/internal/config"
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
"github.com/superseriousbusiness/gotosocial/internal/language"
)
func TestMisskeyReportContentURLs1(t *testing.T) {
@@ -44,3 +49,112 @@ misskey-formatted`
t.Fatalf("wanted 0 urls, got %d", l)
}
}
// TestContentToContentLanguage runs ContentToContentLanguage
// against a table of content / contentMap combinations and
// instance language configurations, checking the selected
// content string and language tag for each.
func TestContentToContentLanguage(t *testing.T) {
	type testcase struct {
		content           gtsmodel.Content
		instanceLanguages language.Languages
		expectedContent   string
		expectedLang      string
	}

	// Each case overwrites the globally configured
	// instance languages; put the previous value back
	// when done so other tests in the package don't
	// inherit whatever the last case happened to set.
	prevLangs := config.GetInstanceLanguages()
	t.Cleanup(func() {
		config.SetInstanceLanguages(prevLangs)
	})

	ctx := context.Background()

	for i, tc := range []testcase{
		// No contentMap: content returned
		// as-is, no language inferred.
		{
			content: gtsmodel.Content{
				Content:    "hello world",
				ContentMap: nil,
			},
			expectedContent: "hello world",
			expectedLang:    "",
		},
		// Only a single-entry contentMap:
		// that entry is used.
		{
			content: gtsmodel.Content{
				Content: "",
				ContentMap: map[string]string{
					"en": "hello world",
				},
			},
			expectedContent: "hello world",
			expectedLang:    "en",
		},
		// content matches one contentMap
		// value: that value's tag is used.
		{
			content: gtsmodel.Content{
				Content: "bonjour le monde",
				ContentMap: map[string]string{
					"en": "hello world",
					"fr": "bonjour le monde",
				},
			},
			expectedContent: "bonjour le monde",
			expectedLang:    "fr",
		},
		// content matches no contentMap value:
		// content is kept, no language inferred.
		{
			content: gtsmodel.Content{
				Content: "bonjour le monde",
				ContentMap: map[string]string{
					"en": "hello world",
				},
			},
			expectedContent: "bonjour le monde",
			expectedLang:    "",
		},
		// Only a multi-entry contentMap: the first
		// matching instance language wins ("en" here).
		{
			content: gtsmodel.Content{
				Content: "",
				ContentMap: map[string]string{
					"en": "hello world",
					"ru": "Привет, мир!",
					"nl": "hallo wereld!",
					"ca": "Hola món!",
				},
			},
			instanceLanguages: language.Languages{
				{TagStr: "en"},
				{TagStr: "ca"},
			},
			expectedContent: "hello world",
			expectedLang:    "en",
		},
		// Same map, instance language order
		// reversed: "ca" now wins instead.
		{
			content: gtsmodel.Content{
				Content: "",
				ContentMap: map[string]string{
					"en": "hello world",
					"ru": "Привет, мир!",
					"nl": "hallo wereld!",
					"ca": "Hola món!",
				},
			},
			instanceLanguages: language.Languages{
				{TagStr: "ca"},
				{TagStr: "en"},
			},
			expectedContent: "Hola món!",
			expectedLang:    "ca",
		},
	} {
		langs, err := language.InitLangs(tc.instanceLanguages.TagStrs())
		if err != nil {
			t.Fatal(err)
		}
		config.SetInstanceLanguages(langs)

		// Result names deliberately avoid `language`,
		// which would shadow the imported package.
		gotContent, gotLang := ContentToContentLanguage(ctx, tc.content)

		if gotContent != tc.expectedContent {
			t.Errorf(
				"test %d expected content '%s' got '%s'",
				i, tc.expectedContent, gotContent,
			)
		}

		if gotLang != tc.expectedLang {
			t.Errorf(
				"test %d expected language '%s' got '%s'",
				i, tc.expectedLang, gotLang,
			)
		}
	}
}

View File

@@ -85,6 +85,9 @@ func (suite *WrapTestSuite) TestWrapNoteInCreate() {
"attributedTo": "http://localhost:8080/users/the_mighty_zork",
"cc": "http://localhost:8080/users/the_mighty_zork/followers",
"content": "hello everyone!",
"contentMap": {
"en": "hello everyone!"
},
"id": "http://localhost:8080/users/the_mighty_zork/statuses/01F8MHAMCHF6Y650WCRSCP4WMY",
"published": "2021-10-20T12:40:37+02:00",
"replies": {