mirror of
https://github.com/superseriousbusiness/gotosocial
synced 2025-06-05 21:59:39 +02:00
[feature] Parse content warning to HTML, serialize via client API as plaintext (#3876)
* [feature] Parse content warning as HTML, serialize via API to plaintext * tidy up some cruft * whoops * oops * i'm da joker baybee * clemency muy lorde * rename some of the text functions for clarity * jiggle the opts * fiddle de deee * hopefully the last test fix i ever have to do in my beautiful life
This commit is contained in:
@@ -20,8 +20,11 @@ package text
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
gohtml "html"
|
||||
"strings"
|
||||
|
||||
"codeberg.org/gruf/go-byteutil"
|
||||
"github.com/k3a/html2text"
|
||||
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
|
||||
"github.com/superseriousbusiness/gotosocial/internal/log"
|
||||
"github.com/superseriousbusiness/gotosocial/internal/regexes"
|
||||
@@ -52,7 +55,7 @@ func (f *Formatter) FromPlain(
|
||||
return f.fromPlain(
|
||||
ctx,
|
||||
plainTextParser,
|
||||
false, // emojiOnly = false
|
||||
false, // basic = false
|
||||
parseMention,
|
||||
authorID,
|
||||
statusID,
|
||||
@@ -85,7 +88,7 @@ func (f *Formatter) FromPlainNoParagraph(
|
||||
return f.fromPlain(
|
||||
ctx,
|
||||
plainTextParser,
|
||||
false, // emojiOnly = false
|
||||
false, // basic = false
|
||||
parseMention,
|
||||
authorID,
|
||||
statusID,
|
||||
@@ -93,12 +96,14 @@ func (f *Formatter) FromPlainNoParagraph(
|
||||
)
|
||||
}
|
||||
|
||||
// FromPlainEmojiOnly fulfils FormatFunc by parsing
|
||||
// FromPlainBasic fulfils FormatFunc by parsing
|
||||
// the given plaintext input into a FormatResult.
|
||||
//
|
||||
// Unlike FromPlain, it will only parse emojis with
|
||||
// the custom renderer, leaving aside mentions and tags.
|
||||
func (f *Formatter) FromPlainEmojiOnly(
|
||||
//
|
||||
// Resulting HTML will also NOT be wrapped in <p> tags.
|
||||
func (f *Formatter) FromPlainBasic(
|
||||
ctx context.Context,
|
||||
parseMention gtsmodel.ParseMentionFunc,
|
||||
authorID string,
|
||||
@@ -116,7 +121,7 @@ func (f *Formatter) FromPlainEmojiOnly(
|
||||
return f.fromPlain(
|
||||
ctx,
|
||||
plainTextParser,
|
||||
true, // emojiOnly = true
|
||||
true, // basic = true
|
||||
parseMention,
|
||||
authorID,
|
||||
statusID,
|
||||
@@ -130,7 +135,7 @@ func (f *Formatter) FromPlainEmojiOnly(
|
||||
func (f *Formatter) fromPlain(
|
||||
ctx context.Context,
|
||||
plainTextParser parser.Parser,
|
||||
emojiOnly bool,
|
||||
basic bool,
|
||||
parseMention gtsmodel.ParseMentionFunc,
|
||||
authorID string,
|
||||
statusID string,
|
||||
@@ -156,7 +161,9 @@ func (f *Formatter) fromPlain(
|
||||
parseMention,
|
||||
authorID,
|
||||
statusID,
|
||||
emojiOnly,
|
||||
// If basic, pass
|
||||
// emojiOnly = true.
|
||||
basic,
|
||||
result,
|
||||
},
|
||||
// Turns URLs into links.
|
||||
@@ -181,8 +188,51 @@ func (f *Formatter) fromPlain(
|
||||
|
||||
// Clean and shrink HTML.
|
||||
result.HTML = byteutil.B2S(htmlBytes.Bytes())
|
||||
result.HTML = SanitizeToHTML(result.HTML)
|
||||
result.HTML = SanitizeHTML(result.HTML)
|
||||
result.HTML = MinifyHTML(result.HTML)
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// ParseHTMLToPlain parses the given HTML string, then
|
||||
// outputs it to equivalent plaintext while trying to
|
||||
// keep as much of the smenantic intent of the input
|
||||
// HTML as possible, ie., titles are placed on separate
|
||||
// lines, `<br>`s are converted to newlines, text inside
|
||||
// `<strong>` and `<em>` tags is retained, but without
|
||||
// emphasis, `<a>` links are unnested and the URL they
|
||||
// link to is placed in angle brackets next to them,
|
||||
// lists are replaced with newline-separated indented
|
||||
// items, etc.
|
||||
//
|
||||
// This function is useful when you need to filter on
|
||||
// HTML and want to avoid catching tags in the filter,
|
||||
// or when you want to serve something in a plaintext
|
||||
// format that may contain HTML tags (eg., CWs).
|
||||
func ParseHTMLToPlain(html string) string {
|
||||
plain := html2text.HTML2TextWithOptions(
|
||||
html,
|
||||
html2text.WithLinksInnerText(),
|
||||
html2text.WithUnixLineBreaks(),
|
||||
html2text.WithListSupport(),
|
||||
)
|
||||
return strings.TrimSpace(plain)
|
||||
}
|
||||
|
||||
// StripHTMLFromText runs text through strict sanitization
|
||||
// to completely remove any HTML from the input without
|
||||
// trying to preserve the semantic intent of any HTML tags.
|
||||
//
|
||||
// This is useful in cases where the input was not allowed
|
||||
// to contain HTML at all, and the output isn't either.
|
||||
func StripHTMLFromText(text string) string {
|
||||
// Unescape first to catch any tricky critters.
|
||||
content := gohtml.UnescapeString(text)
|
||||
|
||||
// Remove all detected HTML.
|
||||
content = strict.Sanitize(content)
|
||||
|
||||
// Unescape again to return plaintext.
|
||||
content = gohtml.UnescapeString(content)
|
||||
return strings.TrimSpace(content)
|
||||
}
|
||||
|
Reference in New Issue
Block a user