From dc96562b4084e058846aea9102ef0257461717d6 Mon Sep 17 00:00:00 2001 From: tobi <31960611+tsmethurst@users.noreply.github.com> Date: Fri, 11 Aug 2023 14:40:11 +0200 Subject: [PATCH] [bugfix] Use custom bluemonday policy to disallow inline img tags (#2100) --- internal/ap/normalize.go | 30 ++- internal/ap/normalize_test.go | 16 +- .../api/client/statuses/statuscreate_test.go | 2 +- internal/processing/account/create.go | 2 +- internal/processing/account/update.go | 8 +- internal/processing/admin/domainblock.go | 4 +- internal/processing/instance.go | 8 +- internal/processing/media/update.go | 2 +- internal/processing/status/create.go | 2 +- internal/text/emojionly.go | 7 +- internal/text/markdown.go | 7 +- internal/text/markdown_test.go | 2 +- internal/text/minify.go | 21 +++ internal/text/plain.go | 7 +- internal/text/sanitize.go | 171 +++++++++++++++--- internal/text/sanitize_test.go | 28 +-- internal/web/opengraph.go | 4 +- 17 files changed, 243 insertions(+), 78 deletions(-) diff --git a/internal/ap/normalize.go b/internal/ap/normalize.go index 38861a1b9..8bc2a70e8 100644 --- a/internal/ap/normalize.go +++ b/internal/ap/normalize.go @@ -20,6 +20,7 @@ package ap import ( "github.com/superseriousbusiness/activity/pub" "github.com/superseriousbusiness/activity/streams" + "github.com/superseriousbusiness/gotosocial/internal/text" ) /* @@ -126,7 +127,8 @@ func NormalizeIncomingActivityObject(activity pub.Activity, rawJSON map[string]i } // NormalizeIncomingContent replaces the Content of the given item -// with the raw 'content' value from the raw json object map. +// with the sanitized version of the raw 'content' value from the +// raw json object map. // // noop if there was no content in the json object map or the // content was not a plain string. 
@@ -145,6 +147,14 @@ func NormalizeIncomingContent(item WithSetContent, rawJSON map[string]interface{ return } + // Content should be HTML encoded by default: + // https://www.w3.org/TR/activitystreams-vocabulary/#dfn-content + // + // TODO: sanitize differently based on mediaType. + // https://www.w3.org/TR/activitystreams-vocabulary/#dfn-mediatype + content = text.SanitizeToHTML(content) + content = text.MinifyHTML(content) + // Set normalized content property from the raw string; // this replaces any existing content property on the item. contentProp := streams.NewActivityStreamsContentProperty() @@ -154,7 +164,8 @@ func NormalizeIncomingContent(item WithSetContent, rawJSON map[string]interface{ // NormalizeIncomingAttachments normalizes all attachments (if any) of the given // item, replacing the 'name' (aka content warning) field of each attachment -// with the raw 'name' value from the raw json object map. +// with the raw 'name' value from the raw json object map, and doing sanitization +// on the result. // // noop if there are no attachments; noop if attachment is not a format // we can understand. @@ -212,7 +223,8 @@ func NormalizeIncomingAttachments(item WithAttachment, rawJSON map[string]interf } // NormalizeIncomingSummary replaces the Summary of the given item -// with the raw 'summary' value from the raw json object map. +// with the sanitized version of the raw 'summary' value from the +// raw json object map. // // noop if there was no summary in the json object map or the // summary was not a plain string. @@ -229,6 +241,11 @@ func NormalizeIncomingSummary(item WithSetSummary, rawJSON map[string]interface{ return } + // Summary should be HTML encoded: + // https://www.w3.org/TR/activitystreams-vocabulary/#dfn-summary + summary = text.SanitizeToHTML(summary) + summary = text.MinifyHTML(summary) + // Set normalized summary property from the raw string; this // will replace any existing summary property on the item. 
summaryProp := streams.NewActivityStreamsSummaryProperty() @@ -254,6 +271,13 @@ func NormalizeIncomingName(item WithSetName, rawJSON map[string]interface{}) { return } + // Name *must not* include any HTML markup: + // https://www.w3.org/TR/activitystreams-vocabulary/#dfn-name + // + // todo: We probably want to update this to allow + // *escaped* HTML markup, but for now just nuke it. + name = text.SanitizeToPlaintext(name) + // Set normalized name property from the raw string; this // will replace any existing name property on the item. nameProp := streams.NewActivityStreamsNameProperty() diff --git a/internal/ap/normalize_test.go b/internal/ap/normalize_test.go index cde807f21..cefaf4d38 100644 --- a/internal/ap/normalize_test.go +++ b/internal/ap/normalize_test.go @@ -146,7 +146,7 @@ func (suite *NormalizeTestSuite) getStatusableWithMultipleAttachments() (vocab.A "type": "Document", "mediaType": "image/jpeg", "url": "https://files.example.org/media_attachments/files/110/258/459/579/509/026/original/b65392ebe0fb04ef.jpeg", - "name": "danger: #cute but will claw you :(" + "name": "image of a cat & there's a note saying: <danger: #cute but will claw you :(>" } ] }`) @@ -192,7 +192,7 @@ func (suite *NormalizeTestSuite) TestNormalizeActivityObject() { ) ap.NormalizeIncomingActivityObject(create, map[string]interface{}{"object": rawNote}) - suite.Equal(`UPDATE: As of this morning there are now more than 7 million Mastodon users, most from the #TwitterMigration.

In fact, 100,000 new accounts have been created since last night.

Since last night's spike 8,000-12,000 new accounts are being created every hour.

Yesterday, I estimated that Mastodon would have 8 million users by the end of the week. That might happen a lot sooner if this trend continues.`, ap.ExtractContent(note)) + suite.Equal(`UPDATE: As of this morning there are now more than 7 million Mastodon users, most from the #TwitterMigration.

In fact, 100,000 new accounts have been created since last night.

Since last night's spike 8,000-12,000 new accounts are being created every hour.

Yesterday, I estimated that Mastodon would have 8 million users by the end of the week. That might happen a lot sooner if this trend continues.`, ap.ExtractContent(note)) } func (suite *NormalizeTestSuite) TestNormalizeStatusableAttachmentsOneAttachment() { @@ -224,7 +224,7 @@ func (suite *NormalizeTestSuite) TestNormalizeStatusableAttachmentsOneAttachment "@context": "https://www.w3.org/ns/activitystreams", "attachment": { "mediaType": "image/jpeg", - "name": "DESCRIPTION: here's \u003c\u003ca\u003e\u003e picture of a #cat, it's cute! here's some special characters: \"\" \\ weeee''''", + "name": "DESCRIPTION: here's \u003c\u003e picture of a #cat, it's cute! here's some special characters: \"\" \\ weeee''''", "type": "Document", "url": "https://files.example.org/media_attachments/files/110/258/459/579/509/026/original/b65392ebe0fb04ef.jpeg" }, @@ -265,7 +265,7 @@ func (suite *NormalizeTestSuite) TestNormalizeStatusableAttachmentsOneAttachment "@context": "https://www.w3.org/ns/activitystreams", "attachment": { "mediaType": "image/jpeg", - "name": "DESCRIPTION: here's \u003c\u003ca\u003e\u003e picture of a #cat, it's cute! here's some special characters: \"\" \\ weeee''''", + "name": "DESCRIPTION: here's \u003c\u003e picture of a #cat, it's cute! 
here's some special characters: \"\" \\ weeee''''", "type": "Document", "url": "https://files.example.org/media_attachments/files/110/258/459/579/509/026/original/b65392ebe0fb04ef.jpeg" }, @@ -304,7 +304,7 @@ func (suite *NormalizeTestSuite) TestNormalizeStatusableAttachmentsMultipleAttac }, { "mediaType": "image/jpeg", - "name": "danger: #cute%20but%20will%20claw%20you%20:(", + "name": "image of a cat \u0026amp; there's a note saying: \u0026lt;danger: #cute but will claw you :(\u0026gt;", "type": "Document", "url": "https://files.example.org/media_attachments/files/110/258/459/579/509/026/original/b65392ebe0fb04ef.jpeg" } @@ -326,7 +326,7 @@ func (suite *NormalizeTestSuite) TestNormalizeStatusableAttachmentsMultipleAttac "attachment": [ { "mediaType": "image/jpeg", - "name": "DESCRIPTION: here's \u003c\u003ca\u003e\u003e picture of a #cat, it's cute! here's some special characters: \"\" \\ weeee''''", + "name": "DESCRIPTION: here's \u003c\u003e picture of a #cat, it's cute! here's some special characters: \"\" \\ weeee''''", "type": "Document", "url": "https://files.example.org/media_attachments/files/110/258/459/579/509/026/original/b65392ebe0fb04ef.jpeg" }, @@ -343,7 +343,7 @@ func (suite *NormalizeTestSuite) TestNormalizeStatusableAttachmentsMultipleAttac }, { "mediaType": "image/jpeg", - "name": "danger: #cute but will claw you :(", + "name": "image of a cat \u0026 there's a note saying:", "type": "Document", "url": "https://files.example.org/media_attachments/files/110/258/459/579/509/026/original/b65392ebe0fb04ef.jpeg" } @@ -380,7 +380,7 @@ func (suite *NormalizeTestSuite) TestNormalizeStatusableSummary() { suite.Equal(`warning: #WEIRD%20%23SUMMARY%20;;;;a;;a;asv%20%20%20%20khop8273987(*%5E&%5E)`, ap.ExtractSummary(statusable)) ap.NormalizeIncomingSummary(statusable, rawAccount) - suite.Equal(`warning: #WEIRD #SUMMARY ;;;;a;;a;asv khop8273987(*^&^)`, ap.ExtractSummary(statusable)) + suite.Equal(`warning: #WEIRD #SUMMARY ;;;;a;;a;asv khop8273987(*^&^)`, 
ap.ExtractSummary(statusable)) } func (suite *NormalizeTestSuite) TestNormalizeStatusableName() { diff --git a/internal/api/client/statuses/statuscreate_test.go b/internal/api/client/statuses/statuscreate_test.go index d47a74bbc..6238b111a 100644 --- a/internal/api/client/statuses/statuscreate_test.go +++ b/internal/api/client/statuses/statuscreate_test.go @@ -43,7 +43,7 @@ type StatusCreateTestSuite struct { const ( statusWithLinksAndTags = "#test alright, should be able to post #links with fragments in them now, let's see........\n\nhttps://docs.gotosocial.org/en/latest/user_guide/posts/#links\n\n#gotosocial\n\n(tobi remember to pull the docker image challenge)" statusMarkdown = "# Title\n\n## Smaller title\n\nThis is a post written in [markdown](https://www.markdownguide.org/)\n\n" - statusMarkdownExpected = "

Title

Smaller title

This is a post written in markdown

" + statusMarkdownExpected = "

Title

Smaller title

This is a post written in markdown

" ) // Post a new status with some custom visibility settings diff --git a/internal/processing/account/create.go b/internal/processing/account/create.go index 32a59d1ef..1925feb63 100644 --- a/internal/processing/account/create.go +++ b/internal/processing/account/create.go @@ -71,7 +71,7 @@ func (p *Processor) Create( Username: form.Username, Email: form.Email, Password: form.Password, - Reason: text.SanitizePlaintext(reason), + Reason: text.SanitizeToPlaintext(reason), PreApproved: !config.GetAccountsApprovalRequired(), // Mark as approved if no approval required. SignUpIP: form.IP, Locale: form.Locale, diff --git a/internal/processing/account/update.go b/internal/processing/account/update.go index f75b3c8d9..ec343f160 100644 --- a/internal/processing/account/update.go +++ b/internal/processing/account/update.go @@ -67,7 +67,7 @@ func (p *Processor) Update(ctx context.Context, account *gtsmodel.Account, form } // Parse new display name (always from plaintext). - account.DisplayName = text.SanitizePlaintext(displayName) + account.DisplayName = text.SanitizeToPlaintext(displayName) // If display name has changed, account emojis may have also changed. emojisChanged = true @@ -110,8 +110,8 @@ func (p *Processor) Update(ctx context.Context, account *gtsmodel.Account, form // Sanitize raw field values. 
fieldRaw := >smodel.Field{ - Name: text.SanitizePlaintext(name), - Value: text.SanitizePlaintext(value), + Name: text.SanitizeToPlaintext(name), + Value: text.SanitizeToPlaintext(value), } fieldsRaw = append(fieldsRaw, fieldRaw) } @@ -255,7 +255,7 @@ func (p *Processor) Update(ctx context.Context, account *gtsmodel.Account, form if err := validate.CustomCSS(customCSS); err != nil { return nil, gtserror.NewErrorBadRequest(err, err.Error()) } - account.CustomCSS = text.SanitizePlaintext(customCSS) + account.CustomCSS = text.SanitizeToPlaintext(customCSS) } if form.EnableRSS != nil { diff --git a/internal/processing/admin/domainblock.go b/internal/processing/admin/domainblock.go index c645f287a..a85d78a56 100644 --- a/internal/processing/admin/domainblock.go +++ b/internal/processing/admin/domainblock.go @@ -67,8 +67,8 @@ func (p *Processor) DomainBlockCreate( ID: id.NewULID(), Domain: domain, CreatedByAccountID: account.ID, - PrivateComment: text.SanitizePlaintext(privateComment), - PublicComment: text.SanitizePlaintext(publicComment), + PrivateComment: text.SanitizeToPlaintext(privateComment), + PublicComment: text.SanitizeToPlaintext(publicComment), Obfuscate: &obfuscate, SubscriptionID: subscriptionID, } diff --git a/internal/processing/instance.go b/internal/processing/instance.go index ac63814cd..edcfe5418 100644 --- a/internal/processing/instance.go +++ b/internal/processing/instance.go @@ -159,7 +159,7 @@ func (p *Processor) InstancePatch(ctx context.Context, form *apimodel.InstanceSe return nil, gtserror.NewErrorBadRequest(err, fmt.Sprintf("site title invalid: %s", err)) } updatingColumns = append(updatingColumns, "title") - instance.Title = text.SanitizePlaintext(*form.Title) // don't allow html in site title + instance.Title = text.SanitizeToPlaintext(*form.Title) // don't allow html in site title } // validate & update site contact account if it's set on the form @@ -215,7 +215,7 @@ func (p *Processor) InstancePatch(ctx context.Context, form 
*apimodel.InstanceSe return nil, gtserror.NewErrorBadRequest(err, err.Error()) } updatingColumns = append(updatingColumns, "short_description") - instance.ShortDescription = text.SanitizeHTML(*form.ShortDescription) // html is OK in site description, but we should sanitize it + instance.ShortDescription = text.SanitizeToHTML(*form.ShortDescription) // html is OK in site description, but we should sanitize it } // validate & update site description if it's set on the form @@ -224,7 +224,7 @@ func (p *Processor) InstancePatch(ctx context.Context, form *apimodel.InstanceSe return nil, gtserror.NewErrorBadRequest(err, err.Error()) } updatingColumns = append(updatingColumns, "description") - instance.Description = text.SanitizeHTML(*form.Description) // html is OK in site description, but we should sanitize it + instance.Description = text.SanitizeToHTML(*form.Description) // html is OK in site description, but we should sanitize it } // validate & update site terms if it's set on the form @@ -233,7 +233,7 @@ func (p *Processor) InstancePatch(ctx context.Context, form *apimodel.InstanceSe return nil, gtserror.NewErrorBadRequest(err, err.Error()) } updatingColumns = append(updatingColumns, "terms") - instance.Terms = text.SanitizeHTML(*form.Terms) // html is OK in site terms, but we should sanitize it + instance.Terms = text.SanitizeToHTML(*form.Terms) // html is OK in site terms, but we should sanitize it } var updateInstanceAccount bool diff --git a/internal/processing/media/update.go b/internal/processing/media/update.go index 33649f201..59ade9ca5 100644 --- a/internal/processing/media/update.go +++ b/internal/processing/media/update.go @@ -47,7 +47,7 @@ func (p *Processor) Update(ctx context.Context, account *gtsmodel.Account, media var updatingColumns []string if form.Description != nil { - attachment.Description = text.SanitizePlaintext(*form.Description) + attachment.Description = text.SanitizeToPlaintext(*form.Description) updatingColumns = 
append(updatingColumns, "description") } diff --git a/internal/processing/status/create.go b/internal/processing/status/create.go index 36842ee07..d671ea8c4 100644 --- a/internal/processing/status/create.go +++ b/internal/processing/status/create.go @@ -54,7 +54,7 @@ func (p *Processor) Create(ctx context.Context, account *gtsmodel.Account, appli Local: &local, AccountID: account.ID, AccountURI: account.URI, - ContentWarning: text.SanitizePlaintext(form.SpoilerText), + ContentWarning: text.SanitizeToPlaintext(form.SpoilerText), ActivityStreamsType: ap.ObjectNote, Sensitive: &sensitive, CreatedWithApplicationID: application.ID, diff --git a/internal/text/emojionly.go b/internal/text/emojionly.go index ba7555716..f4f200b21 100644 --- a/internal/text/emojionly.go +++ b/internal/text/emojionly.go @@ -61,13 +61,10 @@ func (f *formatter) FromPlainEmojiOnly(ctx context.Context, pmf gtsmodel.ParseMe result.HTML = htmlContentBytes.String() // clean anything dangerous out of the HTML - result.HTML = SanitizeHTML(result.HTML) + result.HTML = SanitizeToHTML(result.HTML) // shrink ray - result.HTML, err = m.String("text/html", result.HTML) - if err != nil { - log.Errorf(ctx, "error minifying HTML: %s", err) - } + result.HTML = MinifyHTML(result.HTML) return result } diff --git a/internal/text/markdown.go b/internal/text/markdown.go index c7d4958f4..ecc49673b 100644 --- a/internal/text/markdown.go +++ b/internal/text/markdown.go @@ -57,13 +57,10 @@ func (f *formatter) FromMarkdown(ctx context.Context, pmf gtsmodel.ParseMentionF result.HTML = htmlContentBytes.String() // clean anything dangerous out of the HTML - result.HTML = SanitizeHTML(result.HTML) + result.HTML = SanitizeToHTML(result.HTML) // shrink ray - result.HTML, err = m.String("text/html", result.HTML) - if err != nil { - log.Errorf(ctx, "error minifying HTML: %s", err) - } + result.HTML = MinifyHTML(result.HTML) return result } diff --git a/internal/text/markdown_test.go b/internal/text/markdown_test.go index 
2602506ca..cc466df6c 100644 --- a/internal/text/markdown_test.go +++ b/internal/text/markdown_test.go @@ -51,7 +51,7 @@ const ( withHashtag = "# Title\n\nhere's a simple status that uses hashtag #Hashtag!" withHashtagExpected = "

Title

here's a simple status that uses hashtag #Hashtag!

" mdWithHTML = "# Title\n\nHere's a simple text in markdown.\n\nHere's a link.\n\nHere's an image: \"The" - mdWithHTMLExpected = "

Title

Here's a simple text in markdown.

Here's a link.

Here's an image: \"The

" + mdWithHTMLExpected = "

Title

Here's a simple text in markdown.

Here's a link.

Here's an image:

" mdWithCheekyHTML = "# Title\n\nHere's a simple text in markdown.\n\nHere's a cheeky little script: " mdWithCheekyHTMLExpected = "

Title

Here's a simple text in markdown.

Here's a cheeky little script:

" mdWithHashtagInitial = "#welcome #Hashtag" diff --git a/internal/text/minify.go b/internal/text/minify.go index 83780d5c1..da61bdcf9 100644 --- a/internal/text/minify.go +++ b/internal/text/minify.go @@ -18,6 +18,7 @@ package text import ( + "github.com/superseriousbusiness/gotosocial/internal/log" "github.com/tdewolff/minify/v2" "github.com/tdewolff/minify/v2/html" ) @@ -31,3 +32,23 @@ var m = func() *minify.M { }) return m }() + +// MinifyHTML minifies the given string +// under the assumption that it's HTML. +// +// If input is not HTML encoded, this +// function will try to do minimization +// anyway, but this may produce unexpected +// results. +// +// If an error occurs during minimization, +// it will be logged and the original string +// returned unmodified. +func MinifyHTML(in string) string { + out, err := m.String("text/html", in) + if err != nil { + log.Error(nil, err) + } + + return out +} diff --git a/internal/text/plain.go b/internal/text/plain.go index b1c2a2c33..330ebfb15 100644 --- a/internal/text/plain.go +++ b/internal/text/plain.go @@ -65,13 +65,10 @@ func (f *formatter) fromPlain( result.HTML = htmlContentBytes.String() // Clean anything dangerous out of resulting HTML. - result.HTML = SanitizeHTML(result.HTML) + result.HTML = SanitizeToHTML(result.HTML) // Shrink ray! - var err error - if result.HTML, err = m.String("text/html", result.HTML); err != nil { - log.Errorf(ctx, "error minifying HTML: %s", err) - } + result.HTML = MinifyHTML(result.HTML) return result } diff --git a/internal/text/sanitize.go b/internal/text/sanitize.go index 7e857b533..81c436264 100644 --- a/internal/text/sanitize.go +++ b/internal/text/sanitize.go @@ -25,44 +25,167 @@ import ( "github.com/microcosm-cc/bluemonday" ) -// '[A]llows a broad selection of HTML elements and attributes that are safe for user generated content. -// Note that this policy does not allow iframes, object, embed, styles, script, etc. 
-// An example usage scenario would be blog post bodies where a variety of formatting is expected along with the potential for TABLEs and IMGs.' -// -// Source: https://github.com/microcosm-cc/bluemonday#usage -var regular *bluemonday.Policy = bluemonday.UGCPolicy(). - RequireNoReferrerOnLinks(true). - RequireNoFollowOnLinks(false). // remove the global default which adds rel="nofollow" to all links including local relative - RequireNoFollowOnFullyQualifiedLinks(true). // add rel="nofollow" on all external links - RequireCrossOriginAnonymous(true). - AddTargetBlankToFullyQualifiedLinks(true). - AllowAttrs("class", "href", "rel").OnElements("a"). - AllowAttrs("class").OnElements("span"). - AllowAttrs("class").Matching(regexp.MustCompile("^language-[a-zA-Z0-9]+$")).OnElements("code"). - SkipElementsContent("code", "pre") +// Regular HTML policy is an adapted version of the default +// bluemonday UGC policy, with some tweaks of our own. +// See: https://github.com/microcosm-cc/bluemonday#usage +var regular *bluemonday.Policy = func() *bluemonday.Policy { + p := bluemonday.NewPolicy() -// '[C]an be thought of as equivalent to stripping all HTML elements and their attributes as it has nothing on its allowlist. -// An example usage scenario would be blog post titles where HTML tags are not expected at all -// and if they are then the elements and the content of the elements should be stripped. This is a very strict policy.' + // AllowStandardAttributes will enable "id", "title" and + // the language specific attributes "dir" and "lang" on + // all elements that are allowed. + p.AllowStandardAttributes() + + /* + LAYOUT AND FORMATTING + */ + + // "article" and "aside" are permitted and take no attributes. + // See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/aside + p.AllowElements("article", "aside") + + // "details" is permitted, including the "open" attribute + // which can either be blank or the value "open". 
+ // See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/details + p.AllowAttrs("open").Matching(regexp.MustCompile(`(?i)^(|open)$`)).OnElements("details") + + // "section" is permitted and takes no attributes. + // See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/section + p.AllowElements("section") + + // "summary" is permitted and takes no attributes. + // See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/summary + p.AllowElements("summary") + + // "h1" through "h6" are permitted and take no attributes. + p.AllowElements("h1", "h2", "h3", "h4", "h5", "h6") + + // "hgroup" is permitted and takes no attributes. + // See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/hgroup + p.AllowElements("hgroup") + + // "blockquote" is permitted, including the "cite" + // attribute which must be a standard URL. + p.AllowAttrs("cite").OnElements("blockquote") + + // "br" "div" "hr" "p" "span" "wbr" are permitted and take no attributes + p.AllowElements("br", "div", "hr", "p", "span", "wbr") + + // The following are all inline phrasing elements: + p.AllowElements("abbr", "acronym", "cite", "code", "dfn", "em", + "figcaption", "mark", "s", "samp", "strong", "sub", "sup", "var") + + // "q" is permitted and "cite" is a URL and handled by URL policies + p.AllowAttrs("cite").OnElements("q") + + // "time" is permitted + p.AllowAttrs("datetime").Matching(bluemonday.ISO8601).OnElements("time") + + // Block and inline elements that impart no + // semantic meaning but style the document. + // Underlines, italics, bold, strikethrough etc. + p.AllowElements("b", "i", "pre", "small", "strike", "tt", "u") + + // "del" "ins" are permitted + p.AllowAttrs("cite").Matching(bluemonday.Paragraph).OnElements("del", "ins") + p.AllowAttrs("datetime").Matching(bluemonday.ISO8601).OnElements("del", "ins") + + // Enable ordered, unordered, and definition lists. 
+ p.AllowLists() + + // Class needed on span for mentions, which look like this when assembled: + // `@someusername` + p.AllowAttrs("class").OnElements("span") + + /* + LANGUAGE FORMATTING + */ + + // "dir" is permitted on "bdi" and "bdo". + // See: https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes/dir + p.AllowAttrs("dir").Matching(bluemonday.Direction).OnElements("bdi", "bdo") + + // "rp" "rt" "ruby" are permitted. See: + // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/rp + // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/rt + // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/ruby + p.AllowElements("rp", "rt", "ruby") + + /* + CODE BLOCKS + */ + + // Permit language tags for code elements. + p.AllowAttrs("class").Matching(regexp.MustCompile("^language-[a-zA-Z0-9]+$")).OnElements("code") + + // Drop the content of "code"/"pre" too if the tags themselves get stripped. NOTE(review): this does not exempt code blocks from sanitization; confirm intent. + p.SkipElementsContent("code", "pre") + + /* + LINKS AND LINK SAFETY. + */ + + // Permit hyperlinks. + p.AllowAttrs("class", "href", "rel").OnElements("a") + + // URLs must be parseable by net/url.Parse(). + p.RequireParseableURLs(true) + + // Most common URL schemes only. + p.AllowURLSchemes("mailto", "http", "https") + + // Force rel="noreferrer". + // See: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/rel/noreferrer + p.RequireNoReferrerOnLinks(true) + + // Add rel="nofollow" on all fully qualified (not relative) links. + // See: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/rel#nofollow + p.RequireNoFollowOnFullyQualifiedLinks(true) + + // Force crossorigin="anonymous" + // See: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/crossorigin#anonymous + p.RequireCrossOriginAnonymous(true) + + // Force target="_blank" on fully qualified links. 
+ // See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/a#target + p.AddTargetBlankToFullyQualifiedLinks(true) + + return p +}() + +// '[C]an be thought of as equivalent to stripping all HTML +// elements and their attributes as it has nothing on its allowlist. +// An example usage scenario would be blog post titles where HTML +// tags are not expected at all and if they are then the elements +// and the content of the elements should be stripped. This is a +// very strict policy.' // // Source: https://github.com/microcosm-cc/bluemonday#usage var strict *bluemonday.Policy = bluemonday.StrictPolicy() -// removeHTML strictly removes *all* recognized HTML elements from the given string. +// removeHTML strictly removes *all* recognized +// HTML elements from the given string. func removeHTML(in string) string { return strict.Sanitize(in) } -// SanitizeHTML sanitizes risky html elements from the given string, allowing only safe ones through. -func SanitizeHTML(in string) string { +// SanitizeToHTML sanitizes only risky html elements +// from the given string, allowing safe ones through. +func SanitizeToHTML(in string) string { return regular.Sanitize(in) } -// SanitizePlaintext runs text through basic sanitization. This removes -// any html elements that were in the string, and returns clean plaintext. -func SanitizePlaintext(in string) string { +// SanitizeToPlaintext runs text through basic sanitization. +// This removes any html elements that were in the string, +// and returns clean plaintext. +func SanitizeToPlaintext(in string) string { + // Unescape first to catch any tricky critters. content := html.UnescapeString(in) + + // Remove all detected HTML. content = removeHTML(content) + + // Unescape again to return plaintext. 
content = html.UnescapeString(content) return strings.TrimSpace(content) } diff --git a/internal/text/sanitize_test.go b/internal/text/sanitize_test.go index f299c2923..ae49c942c 100644 --- a/internal/text/sanitize_test.go +++ b/internal/text/sanitize_test.go @@ -36,30 +36,30 @@ type SanitizeTestSuite struct { } func (suite *SanitizeTestSuite) TestSanitizeOutgoing() { - s := text.SanitizeHTML(sanitizeOutgoing) + s := text.SanitizeToHTML(sanitizeOutgoing) suite.Equal(sanitizedOutgoing, s) } func (suite *SanitizeTestSuite) TestSanitizeHTML() { - s := text.SanitizeHTML(sanitizeHTML) + s := text.SanitizeToHTML(sanitizeHTML) suite.Equal(sanitizedHTML, s) } func (suite *SanitizeTestSuite) TestSanitizeCaption1() { dodgyCaption := "this is just a normal caption ;)" - sanitized := text.SanitizePlaintext(dodgyCaption) + sanitized := text.SanitizeToPlaintext(dodgyCaption) suite.Equal("this is just a normal caption ;)", sanitized) } func (suite *SanitizeTestSuite) TestSanitizeCaption2() { dodgyCaption := "here's a LOUD caption" - sanitized := text.SanitizePlaintext(dodgyCaption) + sanitized := text.SanitizeToPlaintext(dodgyCaption) suite.Equal("here's a LOUD caption", sanitized) } func (suite *SanitizeTestSuite) TestSanitizeCaption3() { dodgyCaption := "" - sanitized := text.SanitizePlaintext(dodgyCaption) + sanitized := text.SanitizeToPlaintext(dodgyCaption) suite.Equal("", sanitized) } @@ -75,21 +75,21 @@ with some newlines ` - sanitized := text.SanitizePlaintext(dodgyCaption) + sanitized := text.SanitizeToPlaintext(dodgyCaption) suite.Equal("here is\na multi line\ncaption\nwith some newlines", sanitized) } func (suite *SanitizeTestSuite) TestSanitizeCaption5() { // html-escaped: " hello world" dodgyCaption := `<script>console.log('aha!')</script> hello world` - sanitized := text.SanitizePlaintext(dodgyCaption) + sanitized := text.SanitizeToPlaintext(dodgyCaption) suite.Equal("hello world", sanitized) } func (suite *SanitizeTestSuite) TestSanitizeCaption6() { // 
html-encoded: " hello world" dodgyCaption := `<script>console.log('aha!')</script> hello world` - sanitized := text.SanitizePlaintext(dodgyCaption) + sanitized := text.SanitizeToPlaintext(dodgyCaption) suite.Equal("hello world", sanitized) } @@ -104,24 +104,30 @@ func (suite *SanitizeTestSuite) TestSanitizeCustomCSS() { overflow: hidden; text-overflow: ellipsis; }` - sanitized := text.SanitizePlaintext(customCSS) + sanitized := text.SanitizeToPlaintext(customCSS) suite.Equal(customCSS, sanitized) // should be the same as it was before } func (suite *SanitizeTestSuite) TestSanitizeNaughtyCustomCSS1() { // try to break out of pee pee poo poo