[bugfix] Fix existing bio text showing as HTML (#531)

* fix existing bio text showing as HTML

- updated replaced mentions to include instance
- strips HTML from account source note in Verify handler
- update text formatter to use buffers for string writes

Signed-off-by: kim <grufwub@gmail.com>

* go away linter

Signed-off-by: kim <grufwub@gmail.com>

* change buf reset location, change html mention tags

Signed-off-by: kim <grufwub@gmail.com>

* reduce FindLinks code complexity

Signed-off-by: kim <grufwub@gmail.com>

* fix HTML to text conversion

Signed-off-by: kim <grufwub@gmail.com>

* Update internal/regexes/regexes.go

Co-authored-by: Mina Galić <mina.galic@puppet.com>

* use improved html2text lib with more options

Signed-off-by: kim <grufwub@gmail.com>

* fix to produce actual plaintext from html

Signed-off-by: kim <grufwub@gmail.com>

* fix span tags instead written as space

Signed-off-by: kim <grufwub@gmail.com>

* performance improvements to regex replacements, fix link replace logic for un-html-ing in the future

Signed-off-by: kim <grufwub@gmail.com>

* fix tag/mention replacements to use input string, fix link replace to not include scheme

Signed-off-by: kim <grufwub@gmail.com>

* use matched input string for link replace href text

Signed-off-by: kim <grufwub@gmail.com>

* remove unused code (to appease linter :sobs:)

Signed-off-by: kim <grufwub@gmail.com>

* improve hashtagFinder regex to be more compliant

Signed-off-by: kim <grufwub@gmail.com>

* update breakReplacer to include both unix and windows line endings

Signed-off-by: kim <grufwub@gmail.com>

* add NoteRaw field to Account to store plaintext account bio, add migration for this, set for sensitive accounts

Signed-off-by: kim <grufwub@gmail.com>

* drop unnecessary code

Signed-off-by: kim <grufwub@gmail.com>

* update text package tests to fix logic changes

Signed-off-by: kim <grufwub@gmail.com>

* add raw note content testing to account update and account verify

Signed-off-by: kim <grufwub@gmail.com>

* remove unused modules

Signed-off-by: kim <grufwub@gmail.com>

* fix emoji regex

Signed-off-by: kim <grufwub@gmail.com>

* fix replacement of hashtags

Signed-off-by: kim <grufwub@gmail.com>

* update code comment

Signed-off-by: kim <grufwub@gmail.com>

Co-authored-by: Mina Galić <mina.galic@puppet.com>
This commit is contained in:
kim
2022-05-07 16:55:27 +01:00
committed by GitHub
parent 08eb271a4c
commit 26b74aefaf
16 changed files with 180 additions and 108 deletions

View File

@@ -19,34 +19,28 @@
package text
import (
"bytes"
"context"
"fmt"
"net/url"
"strings"
"mvdan.cc/xurls/v2"
"github.com/superseriousbusiness/gotosocial/internal/regexes"
)
// schemes is the regex for schemes we accept when looking for links.
// Basically, we accept https or http.
var schemes = `(((http|https))://)`
// FindLinks parses the given string looking for recognizable URLs (including scheme).
// It returns a deduplicated list of those URLs, without changing the string; matches that cannot be parsed as URLs are skipped.
// If no URLs are found within the given string, nil will be returned.
func FindLinks(in string) ([]*url.URL, error) {
rxStrict, err := xurls.StrictMatchingScheme(schemes)
if err != nil {
return nil, err
}
urls := []*url.URL{}
func FindLinks(in string) []*url.URL {
var urls []*url.URL
// bail already if we don't find anything
found := rxStrict.FindAllString(in, -1)
found := regexes.LinkScheme.FindAllString(in, -1)
if len(found) == 0 {
return urls, nil
return nil
}
urlmap := map[string]struct{}{}
// for each string we find, we want to parse it into a URL if we can
// if we fail to parse it, just ignore this match and continue
for _, f := range found {
@@ -54,29 +48,18 @@ func FindLinks(in string) ([]*url.URL, error) {
if err != nil {
continue
}
urls = append(urls, u)
}
// deduplicate the URLs
urlsDeduped := []*url.URL{}
// Calculate the URL's string form, used as the deduplication key
ustr := u.String()
for _, u := range urls {
if !contains(urlsDeduped, u) {
urlsDeduped = append(urlsDeduped, u)
if _, ok := urlmap[ustr]; !ok {
// Has not been encountered yet
urls = append(urls, u)
urlmap[ustr] = struct{}{}
}
}
return urlsDeduped, nil
}
// contains checks if the given url is already within a slice of URLs
func contains(urls []*url.URL, url *url.URL) bool {
for _, u := range urls {
if u.String() == url.String() {
return true
}
}
return false
return urls
}
// ReplaceLinks replaces all detected links in a piece of text with their HTML (href) equivalents.
@@ -84,33 +67,20 @@ func contains(urls []*url.URL, url *url.URL) bool {
// href will end up double-formatted, if the text you pass here contains one or more hrefs already.
// To avoid this, you should sanitize any HTML out of text before you pass it into this function.
func (f *formatter) ReplaceLinks(ctx context.Context, in string) string {
rxStrict, err := xurls.StrictMatchingScheme(schemes)
if err != nil {
panic(err)
}
replaced := rxStrict.ReplaceAllStringFunc(in, func(urlString string) string {
return regexes.ReplaceAllStringFunc(regexes.LinkScheme, in, func(urlString string, buf *bytes.Buffer) string {
thisURL, err := url.Parse(urlString)
if err != nil {
return urlString // we can't parse it as a URL so don't replace it
}
shortString := thisURL.Hostname()
if thisURL.Path != "" {
shortString += thisURL.Path
}
if thisURL.Fragment != "" {
shortString = shortString + "#" + thisURL.Fragment
}
if thisURL.RawQuery != "" {
shortString = shortString + "?" + thisURL.RawQuery
}
replacement := fmt.Sprintf(`<a href="%s" rel="noopener">%s</a>`, urlString, shortString)
return replacement
// <a href="thisURL.String()" rel="noopener">urlString</a>
urlString = thisURL.String()
buf.WriteString(`<a href="`)
buf.WriteString(thisURL.String())
buf.WriteString(`" rel="noopener">`)
urlString = strings.TrimPrefix(urlString, thisURL.Scheme)
urlString = strings.TrimPrefix(urlString, "://")
buf.WriteString(urlString)
buf.WriteString(`</a>`)
return buf.String()
})
return replaced
}