mirror of
https://github.com/superseriousbusiness/gotosocial
synced 2025-06-05 21:59:39 +02:00
[bugfix] Use better plaintext representation of status for filtering (#3301)
* [bugfix] Use better plaintext representation of status for filtering * add new deps to readme * lint * update tests * update regexes * address review comments * remove now unused xxhash * whoops, wrong logger * Merge branch 'main' into status_filtering_bugfix * put cache in caches struct * pain
This commit is contained in:
10
vendor/github.com/k3a/html2text/.travis.yml
generated
vendored
Normal file
10
vendor/github.com/k3a/html2text/.travis.yml
generated
vendored
Normal file
@ -0,0 +1,10 @@
|
||||
language: go
|
||||
go:
|
||||
- master
|
||||
before_install:
|
||||
- go get github.com/axw/gocov/gocov
|
||||
- go get github.com/mattn/goveralls
|
||||
- if ! go get github.com/golang/tools/cmd/cover; then go get golang.org/x/tools/cmd/cover; fi
|
||||
script:
|
||||
- $HOME/gopath/bin/goveralls -service=travis-ci
|
||||
|
21
vendor/github.com/k3a/html2text/LICENSE
generated
vendored
Normal file
21
vendor/github.com/k3a/html2text/LICENSE
generated
vendored
Normal file
@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2017 Mario K3A Hros (www.k3a.me)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
60
vendor/github.com/k3a/html2text/README.md
generated
vendored
Normal file
60
vendor/github.com/k3a/html2text/README.md
generated
vendored
Normal file
@ -0,0 +1,60 @@
|
||||
[GoDoc](https://godoc.org/github.com/k3a/html2text)
|
||||
[Build Status](https://travis-ci.org/k3a/html2text)
|
||||
[Coverage Status](https://coveralls.io/github/k3a/html2text?branch=master)
|
||||
[Go Report Card](https://goreportcard.com/report/github.com/k3a/html2text)
|
||||
|
||||
# html2text
|
||||
|
||||
A simple Golang package to convert HTML to plain text (without non-standard dependencies).
|
||||
|
||||
It converts HTML tags to text and also parses HTML entities into characters they represent.
|
||||
A `<head>` section of the HTML document, as well as most other tags are stripped out but
|
||||
links are properly converted into their href attribute.
|
||||
|
||||
It can be used for converting HTML emails into text.
|
||||
|
||||
Some tests are installed as well.
|
||||
Uses semantic versioning and no breaking changes are planned.
|
||||
|
||||
Feel free to publish a pull request if you have suggestions for improvement, but please note that the library can now be considered feature-complete and API stable. If you need more than this basic conversion, please use an alternative mentioned at the bottom.
|
||||
|
||||
## Install
|
||||
```bash
|
||||
go get github.com/k3a/html2text
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
```go
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/k3a/html2text"
|
||||
)
|
||||
|
||||
func main() {
|
||||
html := `<html><head><title>Good</title></head><body><strong>clean</strong> text</body>`
|
||||
|
||||
plain := html2text.HTML2Text(html)
|
||||
|
||||
fmt.Println(plain)
|
||||
}
|
||||
|
||||
/* Outputs:
|
||||
|
||||
clean text
|
||||
*/
|
||||
|
||||
```
|
||||
|
||||
To see all features, please look into `html2text_test.go`.
|
||||
|
||||
## Alternatives
|
||||
- https://github.com/jaytaylor/html2text (heavier, with more features)
|
||||
- https://git.alexwennerberg.com/nanohtml2text (rewrite of this module in Rust)
|
||||
|
||||
## License
|
||||
|
||||
MIT
|
||||
|
2046
vendor/github.com/k3a/html2text/entity.go
generated
vendored
Normal file
2046
vendor/github.com/k3a/html2text/entity.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load Diff
333
vendor/github.com/k3a/html2text/html2text.go
generated
vendored
Normal file
333
vendor/github.com/k3a/html2text/html2text.go
generated
vendored
Normal file
@ -0,0 +1,333 @@
|
||||
package html2text
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Line break constants
//
// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreaks())
const (
	WIN_LBR  = "\r\n" // Windows-style line break (the default)
	UNIX_LBR = "\n"   // Unix-style line break
)
|
||||
|
||||
// legacyLBR is the line break style selected through the deprecated
// SetUnixLbr; HTML2Text consults it on every call.
var legacyLBR = WIN_LBR
|
||||
var (
	// badTagnamesRE matches the (lowercased) opening tags whose content is
	// suppressed: <head>, <script>, <style> and <a>.
	badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`)

	// linkTagRE matches an <a ...> tag carrying an href attribute. Submatches
	// 2, 3 and 4 hold the single-quoted, double-quoted and unquoted href value
	// respectively.
	linkTagRE = regexp.MustCompile(`^(?i:a)(?:$|\s).*(?i:href)\s*=\s*('([^']*?)'|"([^"]*?)"|([^\s"'` + "`" + `=<>]+))`)

	// badLinkHrefRE matches hrefs that must never be emitted. URL schemes are
	// case-insensitive, so "JavaScript:" and "JAVASCRIPT:" are rejected too.
	badLinkHrefRE = regexp.MustCompile(`(?i)javascript:`)

	// headersRE matches (lowercased) opening and closing <h1>..<h6> tags.
	headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)

	// numericEntityRE matches numeric character references without the
	// surrounding '&' and ';', e.g. "#65" or "#x41". Submatch 1 holds the
	// digits, including the optional leading 'x'.
	numericEntityRE = regexp.MustCompile(`(?i)^#(x?[a-f0-9]+)$`)
)
|
||||
|
||||
// options holds the conversion settings assembled from the Option values
// passed to HTML2TextWithOptions.
type options struct {
	lbr            string // line break sequence written to the output
	linksInnerText bool   // keep <a> inner text and append the href in angle brackets
	listPrefix     string // text written after the line break of each <li>; empty for none
}
|
||||
|
||||
func newOptions() *options {
|
||||
// apply defaults
|
||||
return &options{
|
||||
lbr: WIN_LBR,
|
||||
}
|
||||
}
|
||||
|
||||
// Option is a functional option that adjusts the conversion settings used by
// HTML2TextWithOptions.
type Option func(*options)
|
||||
|
||||
// WithUnixLineBreaks instructs the converter to use unix line breaks ("\n" instead of "\r\n" default)
|
||||
func WithUnixLineBreaks() Option {
|
||||
return func(o *options) {
|
||||
o.lbr = UNIX_LBR
|
||||
}
|
||||
}
|
||||
|
||||
// WithLinksInnerText instructs the converter to retain link tag inner text and append href URLs in angle brackets after the text
|
||||
// Example: click news <http://bit.ly/2n4wXRs>
|
||||
func WithLinksInnerText() Option {
|
||||
return func(o *options) {
|
||||
o.linksInnerText = true
|
||||
}
|
||||
}
|
||||
|
||||
// WithListSupportPrefix formats <ul> and <li> lists with the specified prefix
|
||||
func WithListSupportPrefix(prefix string) Option {
|
||||
return func(o *options) {
|
||||
o.listPrefix = prefix
|
||||
}
|
||||
}
|
||||
|
||||
// WithListSupport formats <ul> and <li> lists with " - " prefix
|
||||
func WithListSupport() Option {
|
||||
return WithListSupportPrefix(" - ")
|
||||
}
|
||||
|
||||
func parseHTMLEntity(entName string) (string, bool) {
|
||||
if r, ok := entity[entName]; ok {
|
||||
return string(r), true
|
||||
}
|
||||
|
||||
if match := numericEntityRE.FindStringSubmatch(entName); len(match) == 2 {
|
||||
var (
|
||||
err error
|
||||
n int64
|
||||
digits = match[1]
|
||||
)
|
||||
|
||||
if digits != "" && (digits[0] == 'x' || digits[0] == 'X') {
|
||||
n, err = strconv.ParseInt(digits[1:], 16, 64)
|
||||
} else {
|
||||
n, err = strconv.ParseInt(digits, 10, 64)
|
||||
}
|
||||
|
||||
if err == nil && (n == 9 || n == 10 || n == 13 || n > 31) {
|
||||
return string(rune(n)), true
|
||||
}
|
||||
}
|
||||
|
||||
return "", false
|
||||
}
|
||||
|
||||
// SetUnixLbr with argument true sets Unix-style line-breaks in output ("\n")
|
||||
// with argument false sets Windows-style line-breaks in output ("\r\n", the default)
|
||||
// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreak())
|
||||
func SetUnixLbr(b bool) {
|
||||
if b {
|
||||
legacyLBR = UNIX_LBR
|
||||
} else {
|
||||
legacyLBR = WIN_LBR
|
||||
}
|
||||
}
|
||||
|
||||
// HTMLEntitiesToText decodes HTML entities inside a provided
|
||||
// string and returns decoded text
|
||||
func HTMLEntitiesToText(htmlEntsText string) string {
|
||||
outBuf := bytes.NewBufferString("")
|
||||
inEnt := false
|
||||
|
||||
for i, r := range htmlEntsText {
|
||||
switch {
|
||||
case r == ';' && inEnt:
|
||||
inEnt = false
|
||||
continue
|
||||
|
||||
case r == '&': //possible html entity
|
||||
entName := ""
|
||||
isEnt := false
|
||||
|
||||
// parse the entity name - max 10 chars
|
||||
chars := 0
|
||||
for _, er := range htmlEntsText[i+1:] {
|
||||
if er == ';' {
|
||||
isEnt = true
|
||||
break
|
||||
} else {
|
||||
entName += string(er)
|
||||
}
|
||||
|
||||
chars++
|
||||
if chars == 10 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if isEnt {
|
||||
if ent, isEnt := parseHTMLEntity(entName); isEnt {
|
||||
outBuf.WriteString(ent)
|
||||
inEnt = true
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !inEnt {
|
||||
outBuf.WriteRune(r)
|
||||
}
|
||||
}
|
||||
|
||||
return outBuf.String()
|
||||
}
|
||||
|
||||
// writeSpace appends a single space to outBuf unless the buffer is empty or
// already ends with a space.
func writeSpace(outBuf *bytes.Buffer) {
	n := outBuf.Len()
	if n == 0 {
		return
	}
	if outBuf.Bytes()[n-1] == ' ' {
		return
	}
	outBuf.WriteByte(' ')
}
|
||||
|
||||
// HTML2Text converts html into a text form
|
||||
func HTML2Text(html string) string {
|
||||
var opts []Option
|
||||
if legacyLBR == UNIX_LBR {
|
||||
opts = append(opts, WithUnixLineBreaks())
|
||||
}
|
||||
return HTML2TextWithOptions(html, opts...)
|
||||
}
|
||||
|
||||
// HTML2TextWithOptions converts html into a text form with additional options
|
||||
func HTML2TextWithOptions(html string, reqOpts ...Option) string {
|
||||
opts := newOptions()
|
||||
for _, opt := range reqOpts {
|
||||
opt(opts)
|
||||
}
|
||||
|
||||
inLen := len(html)
|
||||
tagStart := 0
|
||||
inEnt := false
|
||||
badTagStackDepth := 0 // if == 1 it means we are inside <head>...</head>
|
||||
shouldOutput := true
|
||||
// maintain a stack of <a> tag href links and output it after the tag's inner text (for opts.linksInnerText only)
|
||||
hrefs := []string{}
|
||||
// new line cannot be printed at the beginning or
|
||||
// for <p> after a new line created by previous <p></p>
|
||||
canPrintNewline := false
|
||||
|
||||
outBuf := bytes.NewBufferString("")
|
||||
|
||||
for i, r := range html {
|
||||
if inLen > 0 && i == inLen-1 {
|
||||
// prevent new line at the end of the document
|
||||
canPrintNewline = false
|
||||
}
|
||||
|
||||
switch {
|
||||
// skip new lines and spaces adding a single space if not there yet
|
||||
case r <= 0xD, r == 0x85, r == 0x2028, r == 0x2029, // new lines
|
||||
r == ' ', r >= 0x2008 && r <= 0x200B: // spaces
|
||||
if shouldOutput && badTagStackDepth == 0 && !inEnt {
|
||||
//outBuf.WriteString(fmt.Sprintf("{DBG r:%c, inEnt:%t, tag:%s}", r, inEnt, html[tagStart:i]))
|
||||
writeSpace(outBuf)
|
||||
}
|
||||
continue
|
||||
|
||||
case r == ';' && inEnt: // end of html entity
|
||||
inEnt = false
|
||||
continue
|
||||
|
||||
case r == '&' && shouldOutput: // possible html entity
|
||||
entName := ""
|
||||
isEnt := false
|
||||
|
||||
// parse the entity name - max 10 chars
|
||||
chars := 0
|
||||
for _, er := range html[i+1:] {
|
||||
if er == ';' {
|
||||
isEnt = true
|
||||
break
|
||||
} else {
|
||||
entName += string(er)
|
||||
}
|
||||
|
||||
chars++
|
||||
if chars == 10 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if isEnt {
|
||||
if ent, isEnt := parseHTMLEntity(entName); isEnt {
|
||||
outBuf.WriteString(ent)
|
||||
inEnt = true
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
case r == '<': // start of a tag
|
||||
tagStart = i + 1
|
||||
shouldOutput = false
|
||||
continue
|
||||
|
||||
case r == '>': // end of a tag
|
||||
shouldOutput = true
|
||||
tag := html[tagStart:i]
|
||||
tagNameLowercase := strings.ToLower(tag)
|
||||
|
||||
if tagNameLowercase == "/ul" || tagNameLowercase == "/ol" {
|
||||
outBuf.WriteString(opts.lbr)
|
||||
} else if tagNameLowercase == "li" || tagNameLowercase == "li/" {
|
||||
if opts.listPrefix != "" {
|
||||
outBuf.WriteString(opts.lbr + opts.listPrefix)
|
||||
} else {
|
||||
outBuf.WriteString(opts.lbr)
|
||||
}
|
||||
} else if headersRE.MatchString(tagNameLowercase) {
|
||||
if canPrintNewline {
|
||||
outBuf.WriteString(opts.lbr + opts.lbr)
|
||||
}
|
||||
canPrintNewline = false
|
||||
} else if tagNameLowercase == "br" || tagNameLowercase == "br/" {
|
||||
// new line
|
||||
outBuf.WriteString(opts.lbr)
|
||||
} else if tagNameLowercase == "p" || tagNameLowercase == "/p" {
|
||||
if canPrintNewline {
|
||||
outBuf.WriteString(opts.lbr + opts.lbr)
|
||||
}
|
||||
canPrintNewline = false
|
||||
} else if opts.linksInnerText && tagNameLowercase == "/a" {
|
||||
// end of link
|
||||
// links can be empty can happen if the link matches the badLinkHrefRE
|
||||
if len(hrefs) > 0 {
|
||||
outBuf.WriteString(" <")
|
||||
outBuf.WriteString(HTMLEntitiesToText(hrefs[0]))
|
||||
outBuf.WriteString(">")
|
||||
hrefs = hrefs[1:]
|
||||
}
|
||||
} else if opts.linksInnerText && linkTagRE.MatchString(tagNameLowercase) {
|
||||
// parse link href
|
||||
// add special handling for a tags
|
||||
m := linkTagRE.FindStringSubmatch(tag)
|
||||
if len(m) == 5 {
|
||||
link := m[2]
|
||||
if len(link) == 0 {
|
||||
link = m[3]
|
||||
if len(link) == 0 {
|
||||
link = m[4]
|
||||
}
|
||||
}
|
||||
|
||||
if opts.linksInnerText && !badLinkHrefRE.MatchString(link) {
|
||||
hrefs = append(hrefs, link)
|
||||
}
|
||||
}
|
||||
} else if badTagnamesRE.MatchString(tagNameLowercase) {
|
||||
// unwanted block
|
||||
badTagStackDepth++
|
||||
|
||||
// if link inner text preservation is not enabled
|
||||
// and the current tag is a link tag, parse its href and output that
|
||||
if !opts.linksInnerText {
|
||||
// parse link href
|
||||
m := linkTagRE.FindStringSubmatch(tag)
|
||||
if len(m) == 5 {
|
||||
link := m[2]
|
||||
if len(link) == 0 {
|
||||
link = m[3]
|
||||
if len(link) == 0 {
|
||||
link = m[4]
|
||||
}
|
||||
}
|
||||
|
||||
if !badLinkHrefRE.MatchString(link) {
|
||||
outBuf.WriteString(HTMLEntitiesToText(link))
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if len(tagNameLowercase) > 0 && tagNameLowercase[0] == '/' &&
|
||||
badTagnamesRE.MatchString(tagNameLowercase[1:]) {
|
||||
// end of unwanted block
|
||||
badTagStackDepth--
|
||||
}
|
||||
continue
|
||||
|
||||
} // switch end
|
||||
|
||||
if shouldOutput && badTagStackDepth == 0 && !inEnt {
|
||||
canPrintNewline = true
|
||||
outBuf.WriteRune(r)
|
||||
}
|
||||
}
|
||||
|
||||
return outBuf.String()
|
||||
}
|
Reference in New Issue
Block a user