[bugfix] Fix existing bio text showing as HTML (#531)

* fix existing bio text showing as HTML

- updated replaced mentions to include instance
- strips HTML from account source note in Verify handler
- update text formatter to use buffers for string writes

Signed-off-by: kim <grufwub@gmail.com>

* go away linter

Signed-off-by: kim <grufwub@gmail.com>

* change buf reset location, change html mention tags

Signed-off-by: kim <grufwub@gmail.com>

* reduce FindLinks code complexity

Signed-off-by: kim <grufwub@gmail.com>

* fix HTML to text conversion

Signed-off-by: kim <grufwub@gmail.com>

* Update internal/regexes/regexes.go

Co-authored-by: Mina Galić <mina.galic@puppet.com>

* use improved html2text lib with more options

Signed-off-by: kim <grufwub@gmail.com>

* fix to produce actual plaintext from html

Signed-off-by: kim <grufwub@gmail.com>

* fix span tags being written as spaces instead

Signed-off-by: kim <grufwub@gmail.com>

* performance improvements to regex replacements, fix link replace logic for un-html-ing in the future

Signed-off-by: kim <grufwub@gmail.com>

* fix tag/mention replacements to use input string, fix link replace to not include scheme

Signed-off-by: kim <grufwub@gmail.com>

* use matched input string for link replace href text

Signed-off-by: kim <grufwub@gmail.com>

* remove unused code (to appease linter :sobs:)

Signed-off-by: kim <grufwub@gmail.com>

* improve hashtagFinder regex to be more compliant

Signed-off-by: kim <grufwub@gmail.com>

* update breakReplacer to include both unix and windows line endings

Signed-off-by: kim <grufwub@gmail.com>

* add NoteRaw field to Account to store plaintext account bio, add migration for this, set for sensitive accounts

Signed-off-by: kim <grufwub@gmail.com>

* drop unnecessary code

Signed-off-by: kim <grufwub@gmail.com>

* update text package tests to fix logic changes

Signed-off-by: kim <grufwub@gmail.com>

* add raw note content testing to account update and account verify

Signed-off-by: kim <grufwub@gmail.com>

* remove unused modules

Signed-off-by: kim <grufwub@gmail.com>

* fix emoji regex

Signed-off-by: kim <grufwub@gmail.com>

* fix replacement of hashtags

Signed-off-by: kim <grufwub@gmail.com>

* update code comment

Signed-off-by: kim <grufwub@gmail.com>

Co-authored-by: Mina Galić <mina.galic@puppet.com>
This commit is contained in:
kim
2022-05-07 16:55:27 +01:00
committed by GitHub
parent 08eb271a4c
commit 26b74aefaf
16 changed files with 180 additions and 108 deletions

View File

@@ -19,8 +19,12 @@
package regexes
import (
"bytes"
"fmt"
"regexp"
"sync"
"mvdan.cc/xurls/v2"
)
const (
@@ -47,6 +51,16 @@ const (
)
var (
schemes = `(http|https)://`
// LinkScheme captures http/https schemes in URLs.
// The regex is built by xurls.StrictMatchingScheme from the schemes pattern
// when this variable is initialised; a build error panics.
LinkScheme = func() *regexp.Regexp {
	compiled, err := xurls.StrictMatchingScheme(schemes)
	if err == nil {
		return compiled
	}
	panic(err)
}()
mentionName = `^@(\w+)(?:@([a-zA-Z0-9_\-\.:]+))?$`
// MentionName captures the username and domain part from a mention string
// such as @whatever_user@example.org, returning whatever_user and example.org (without the @ symbols)
@@ -58,7 +72,7 @@ var (
MentionFinder = regexp.MustCompile(mentionFinder)
// hashtag regex can be played with here: https://regex101.com/r/bPxeca/1
hashtagFinder = fmt.Sprintf(`(?:^|\n|\s)(#[a-zA-Z0-9]{1,%d})(?:\b)`, maximumHashtagLength)
hashtagFinder = fmt.Sprintf(`(?:^|\s)(?:#*)(#[a-zA-Z0-9]{1,%d})(?:#|\b)`, maximumHashtagLength)
// HashtagFinder finds possible hashtags in a string.
// It returns just the string part of the hashtag, not the # symbol.
HashtagFinder = regexp.MustCompile(hashtagFinder)
@@ -68,7 +82,7 @@ var (
EmojiShortcode = regexp.MustCompile(fmt.Sprintf("^%s$", emojiShortcode))
// emoji regex can be played with here: https://regex101.com/r/478XGM/1
emojiFinderString = fmt.Sprintf(`(?:\B)?:(%s):(?:\B)?`, emojiShortcode)
emojiFinderString = fmt.Sprintf(`(?:\b)?:(%s):(?:\b)?`, emojiShortcode)
// EmojiFinder extracts emoji strings from a piece of text.
EmojiFinder = regexp.MustCompile(emojiFinderString)
@@ -134,3 +148,21 @@ var (
// from eg /users/example_username/blocks/01F7XT5JZW1WMVSW1KADS8PVDH
BlockPath = regexp.MustCompile(blockPath)
)
// bufpool recycles byte buffers for the regex utility functions in this package.
// A buffer freshly created by the pool is empty and has 512 bytes of capacity.
var bufpool = sync.Pool{
	New: func() any {
		return bytes.NewBuffer(make([]byte, 0, 512))
	},
}
// ReplaceAllStringFunc calls rgx.ReplaceAllStringFunc on src, handing repl a
// pooled scratch buffer alongside each match so replacements can be assembled
// with buffered writes.
//
// The same buffer is shared by every match within one call and is reset before
// each invocation of repl, so repl always starts with an empty buffer. The
// buffer goes back to the pool when this function returns.
func ReplaceAllStringFunc(rgx *regexp.Regexp, src string, repl func(match string, buf *bytes.Buffer) string) string {
	scratch := bufpool.Get().(*bytes.Buffer) //nolint
	defer bufpool.Put(scratch)

	// Adapt the two-argument repl to the single-argument callback regexp expects.
	withScratch := func(match string) string {
		scratch.Reset() // clear whatever the previous match wrote
		return repl(match, scratch)
	}

	return rgx.ReplaceAllStringFunc(src, withScratch)
}