[bugfix] Use better plaintext representation of status for filtering (#3301)

* [bugfix] Use better plaintext representation of status for filtering * add new deps to readme * lint * update tests * update regexes * address review comments * remove now unused xxhash * whoops, wrong logger * Merge branch 'main' into status_filtering_bugfix * put cache in caches struct * pain
2025-06-05 21:59:39 +02:00 · 2024-09-16 14:00:23 +02:00
parent 6dd936fbe1
commit efd1a4f717
15 changed files with 2685 additions and 64 deletions
--- a/vendor/github.com/k3a/html2text/.travis.yml
+++ b/vendor/github.com/k3a/html2text/.travis.yml
@ -0,0 +1,10 @@
+language: go
+go:
+  - master
+before_install:
+  - go get github.com/axw/gocov/gocov
+  - go get github.com/mattn/goveralls
+  - if ! go get github.com/golang/tools/cmd/cover; then go get golang.org/x/tools/cmd/cover; fi
+script:
+  - $HOME/gopath/bin/goveralls -service=travis-ci
+
--- a/vendor/github.com/k3a/html2text/LICENSE
+++ b/vendor/github.com/k3a/html2text/LICENSE
@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017 Mario K3A Hros (www.k3a.me)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/vendor/github.com/k3a/html2text/README.md
+++ b/vendor/github.com/k3a/html2text/README.md
@ -0,0 +1,60 @@
+[![GoDoc](https://godoc.org/github.com/k3a/html2text?status.svg)](https://godoc.org/github.com/k3a/html2text)
+[![Build Status](https://travis-ci.org/k3a/html2text.svg?branch=master)](https://travis-ci.org/k3a/html2text)
+[![Coverage Status](https://coveralls.io/repos/github/k3a/html2text/badge.svg?branch=master)](https://coveralls.io/github/k3a/html2text?branch=master)
+[![Report Card](https://goreportcard.com/badge/github.com/k3a/html2text)](https://goreportcard.com/report/github.com/k3a/html2text)
+
+# html2text
+
+A simple Golang package to convert HTML to plain text (without non-standard dependencies).
+
+It converts HTML tags to text and also parses HTML entities into characters they represent.
+A `<head>` section of the HTML document, as well as most other tags are stripped out but 
+links are properly converted into their href attribute.
+
+It can be used for converting HTML emails into text.
+
+Some tests are installed as well.
+Uses semantic versioning and no breaking changes are planned.
+
+Feel free to publish a pull request if you have suggestions for improvement, but please note that the library can now be considered feature-complete and API stable. If you need more than this basic conversion, please use an alternative mentioned at the bottom.
+
+## Install
+```bash
+go get github.com/k3a/html2text
+```
+
+## Usage
+
+```go
+package main
+
+import (
+	"fmt"
+	"github.com/k3a/html2text"
+)
+
+func main() {
+	html := `<html><head><title>Good</title></head><body><strong>clean</strong> text</body>`
+	
+	plain := html2text.HTML2Text(html)
+			  
+	fmt.Println(plain)
+}
+
+/*	Outputs:
+
+	clean text
+*/
+
+```
+
+To see all features, please look into `html2text_test.go`.
+
+## Alternatives
+- https://github.com/jaytaylor/html2text (heavier, with more features)
+- https://git.alexwennerberg.com/nanohtml2text (rewrite of this module in Rust)
+
+## License
+
+MIT
+
--- a/vendor/github.com/k3a/html2text/entity.go
+++ b/vendor/github.com/k3a/html2text/entity.go
--- a/vendor/github.com/k3a/html2text/html2text.go
+++ b/vendor/github.com/k3a/html2text/html2text.go
@ -0,0 +1,333 @@
+package html2text
+
+import (
+	"bytes"
+	"regexp"
+	"strconv"
+	"strings"
+)
+
+// Line break constants.
+//
+// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreaks())
+const (
+	WIN_LBR  = "\r\n" // Windows-style line break (the default)
+	UNIX_LBR = "\n"   // Unix-style line break
+)
+
+var (
+	// legacyLBR is the line break used by the legacy HTML2Text entry point.
+	legacyLBR = WIN_LBR
+
+	// badTagnamesRE matches lowercased tag contents that open a block whose
+	// inner text is not emitted (head, script, style and a).
+	badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`)
+
+	// linkTagRE matches the contents of an <a> tag carrying an href attribute.
+	// The href value is captured single-quoted (group 2), double-quoted
+	// (group 3) or unquoted (group 4).
+	linkTagRE = regexp.MustCompile(`^(?i:a)(?:$|\s).*(?i:href)\s*=\s*('([^']*?)'|"([^"]*?)"|([^\s"'` + "`" + `=<>]+))`)
+
+	// badLinkHrefRE matches href values that must never be written to the output.
+	badLinkHrefRE = regexp.MustCompile(`javascript:`)
+
+	// headersRE matches lowercased opening and closing h1-h6 tags.
+	headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)
+
+	// numericEntityRE matches numeric entity names such as "#65" or "#x41".
+	numericEntityRE = regexp.MustCompile(`(?i)^#(x?[a-f0-9]+)$`)
+)
+
+// options holds the conversion settings assembled from Option values.
+type options struct {
+	lbr            string // line break sequence written to the output
+	linksInnerText bool   // keep link inner text and append the href in angle brackets
+	listPrefix     string // prefix written before each <li> item; empty means none
+}
+
+// newOptions returns a fresh options value populated with the defaults:
+// Windows-style line breaks, link inner text dropped and no list prefix.
+func newOptions() *options {
+	defaults := new(options)
+	defaults.lbr = WIN_LBR
+	return defaults
+}
+
+// Option is a functional option that adjusts the conversion settings used by
+// HTML2TextWithOptions.
+type Option func(*options)
+
+// WithUnixLineBreaks makes the converter emit unix line breaks ("\n") in
+// place of the default "\r\n".
+func WithUnixLineBreaks() Option {
+	useUnix := func(cfg *options) {
+		cfg.lbr = UNIX_LBR
+	}
+	return useUnix
+}
+
+// WithLinksInnerText makes the converter keep the inner text of link tags and
+// append the href URL in angle brackets after that text.
+// Example: click news <http://bit.ly/2n4wXRs>
+func WithLinksInnerText() Option {
+	keepText := func(cfg *options) {
+		cfg.linksInnerText = true
+	}
+	return keepText
+}
+
+// WithListSupportPrefix makes the converter start every <li> item of a list
+// with the given prefix.
+func WithListSupportPrefix(prefix string) Option {
+	setPrefix := func(cfg *options) {
+		cfg.listPrefix = prefix
+	}
+	return setPrefix
+}
+
+// WithListSupport formats <ul> and <li> lists with " - " prefix
+func WithListSupport() Option {
+	return WithListSupportPrefix(" - ")
+}
+
+func parseHTMLEntity(entName string) (string, bool) {
+	if r, ok := entity[entName]; ok {
+		return string(r), true
+	}
+
+	if match := numericEntityRE.FindStringSubmatch(entName); len(match) == 2 {
+		var (
+			err    error
+			n      int64
+			digits = match[1]
+		)
+
+		if digits != "" && (digits[0] == 'x' || digits[0] == 'X') {
+			n, err = strconv.ParseInt(digits[1:], 16, 64)
+		} else {
+			n, err = strconv.ParseInt(digits, 10, 64)
+		}
+
+		if err == nil && (n == 9 || n == 10 || n == 13 || n > 31) {
+			return string(rune(n)), true
+		}
+	}
+
+	return "", false
+}
+
+// SetUnixLbr selects the line break style used by HTML2Text: true selects
+// Unix-style line breaks ("\n"), false selects Windows-style line breaks
+// ("\r\n", the default).
+//
+// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreaks())
+func SetUnixLbr(b bool) {
+	if !b {
+		legacyLBR = WIN_LBR
+		return
+	}
+	legacyLBR = UNIX_LBR
+}
+
+// HTMLEntitiesToText decodes HTML entities inside a provided
+// string and returns decoded text
+func HTMLEntitiesToText(htmlEntsText string) string {
+	outBuf := bytes.NewBufferString("")
+	inEnt := false
+
+	for i, r := range htmlEntsText {
+		switch {
+		case r == ';' && inEnt:
+			inEnt = false
+			continue
+
+		case r == '&': //possible html entity
+			entName := ""
+			isEnt := false
+
+			// parse the entity name - max 10 chars
+			chars := 0
+			for _, er := range htmlEntsText[i+1:] {
+				if er == ';' {
+					isEnt = true
+					break
+				} else {
+					entName += string(er)
+				}
+
+				chars++
+				if chars == 10 {
+					break
+				}
+			}
+
+			if isEnt {
+				if ent, isEnt := parseHTMLEntity(entName); isEnt {
+					outBuf.WriteString(ent)
+					inEnt = true
+					continue
+				}
+			}
+		}
+
+		if !inEnt {
+			outBuf.WriteRune(r)
+		}
+	}
+
+	return outBuf.String()
+}
+
+// writeSpace appends a single space to outBuf, unless the buffer is empty or
+// already ends with a space.
+func writeSpace(outBuf *bytes.Buffer) {
+	written := outBuf.Bytes()
+	if n := len(written); n == 0 || written[n-1] == ' ' {
+		return
+	}
+	outBuf.WriteByte(' ')
+}
+
+// HTML2Text converts html into a text form
+func HTML2Text(html string) string {
+	var opts []Option
+	if legacyLBR == UNIX_LBR {
+		opts = append(opts, WithUnixLineBreaks())
+	}
+	return HTML2TextWithOptions(html, opts...)
+}
+
+// HTML2TextWithOptions converts html into a text form with additional options
+func HTML2TextWithOptions(html string, reqOpts ...Option) string {
+	opts := newOptions()
+	for _, opt := range reqOpts {
+		opt(opts)
+	}
+
+	inLen := len(html)
+	tagStart := 0
+	inEnt := false
+	badTagStackDepth := 0 // if == 1 it means we are inside <head>...</head>
+	shouldOutput := true
+	// maintain a stack of <a> tag href links and output it after the tag's inner text (for opts.linksInnerText only)
+	hrefs := []string{}
+	// new line cannot be printed at the beginning or
+	// for <p> after a new line created by previous <p></p>
+	canPrintNewline := false
+
+	outBuf := bytes.NewBufferString("")
+
+	for i, r := range html {
+		if inLen > 0 && i == inLen-1 {
+			// prevent new line at the end of the document
+			canPrintNewline = false
+		}
+
+		switch {
+		// skip new lines and spaces adding a single space if not there yet
+		case r <= 0xD, r == 0x85, r == 0x2028, r == 0x2029, // new lines
+			r == ' ', r >= 0x2008 && r <= 0x200B: // spaces
+			if shouldOutput && badTagStackDepth == 0 && !inEnt {
+				//outBuf.WriteString(fmt.Sprintf("{DBG r:%c, inEnt:%t, tag:%s}", r, inEnt, html[tagStart:i]))
+				writeSpace(outBuf)
+			}
+			continue
+
+		case r == ';' && inEnt: // end of html entity
+			inEnt = false
+			continue
+
+		case r == '&' && shouldOutput: // possible html entity
+			entName := ""
+			isEnt := false
+
+			// parse the entity name - max 10 chars
+			chars := 0
+			for _, er := range html[i+1:] {
+				if er == ';' {
+					isEnt = true
+					break
+				} else {
+					entName += string(er)
+				}
+
+				chars++
+				if chars == 10 {
+					break
+				}
+			}
+
+			if isEnt {
+				if ent, isEnt := parseHTMLEntity(entName); isEnt {
+					outBuf.WriteString(ent)
+					inEnt = true
+					continue
+				}
+			}
+
+		case r == '<': // start of a tag
+			tagStart = i + 1
+			shouldOutput = false
+			continue
+
+		case r == '>': // end of a tag
+			shouldOutput = true
+			tag := html[tagStart:i]
+			tagNameLowercase := strings.ToLower(tag)
+
+			if tagNameLowercase == "/ul" || tagNameLowercase == "/ol" {
+				outBuf.WriteString(opts.lbr)
+			} else if tagNameLowercase == "li" || tagNameLowercase == "li/" {
+				if opts.listPrefix != "" {
+					outBuf.WriteString(opts.lbr + opts.listPrefix)
+				} else {
+					outBuf.WriteString(opts.lbr)
+				}
+			} else if headersRE.MatchString(tagNameLowercase) {
+				if canPrintNewline {
+					outBuf.WriteString(opts.lbr + opts.lbr)
+				}
+				canPrintNewline = false
+			} else if tagNameLowercase == "br" || tagNameLowercase == "br/" {
+				// new line
+				outBuf.WriteString(opts.lbr)
+			} else if tagNameLowercase == "p" || tagNameLowercase == "/p" {
+				if canPrintNewline {
+					outBuf.WriteString(opts.lbr + opts.lbr)
+				}
+				canPrintNewline = false
+			} else if opts.linksInnerText && tagNameLowercase == "/a" {
+				// end of link
+				// links can be empty can happen if the link matches the badLinkHrefRE
+				if len(hrefs) > 0 {
+					outBuf.WriteString(" <")
+					outBuf.WriteString(HTMLEntitiesToText(hrefs[0]))
+					outBuf.WriteString(">")
+					hrefs = hrefs[1:]
+				}
+			} else if opts.linksInnerText && linkTagRE.MatchString(tagNameLowercase) {
+				// parse link href
+				// add special handling for a tags
+				m := linkTagRE.FindStringSubmatch(tag)
+				if len(m) == 5 {
+					link := m[2]
+					if len(link) == 0 {
+						link = m[3]
+						if len(link) == 0 {
+							link = m[4]
+						}
+					}
+
+					if opts.linksInnerText && !badLinkHrefRE.MatchString(link) {
+						hrefs = append(hrefs, link)
+					}
+				}
+			} else if badTagnamesRE.MatchString(tagNameLowercase) {
+				// unwanted block
+				badTagStackDepth++
+
+				// if link inner text preservation is not enabled
+				// and the current tag is a link tag, parse its href and output that
+				if !opts.linksInnerText {
+					// parse link href
+					m := linkTagRE.FindStringSubmatch(tag)
+					if len(m) == 5 {
+						link := m[2]
+						if len(link) == 0 {
+							link = m[3]
+							if len(link) == 0 {
+								link = m[4]
+							}
+						}
+
+						if !badLinkHrefRE.MatchString(link) {
+							outBuf.WriteString(HTMLEntitiesToText(link))
+						}
+					}
+				}
+			} else if len(tagNameLowercase) > 0 && tagNameLowercase[0] == '/' &&
+				badTagnamesRE.MatchString(tagNameLowercase[1:]) {
+				// end of unwanted block
+				badTagStackDepth--
+			}
+			continue
+
+		} // switch end
+
+		if shouldOutput && badTagStackDepth == 0 && !inEnt {
+			canPrintNewline = true
+			outBuf.WriteRune(r)
+		}
+	}
+
+	return outBuf.String()
+}