[bug] respect X-Robots-Tag and robots.txt on api/v1/instance and nodeinfo (#3756)

* feat: check X-Robots-Tag when accessing /api/v1/instance or /nodeinfo endpoints respect X-Robots-Tag * chore: go fmt ./... * Check robots.txt as well, add tests --------- Co-authored-by: tobi <tobi.smethurst@protonmail.com>
2025-06-05 21:59:39 +02:00 · 2025-02-11 13:16:14 +01:00
parent 2c95fd4115
commit d0de3ad492
20 changed files with 1404 additions and 24 deletions
--- a/vendor/github.com/temoto/robotstxt/scanner.go
+++ b/vendor/github.com/temoto/robotstxt/scanner.go
@ -0,0 +1,185 @@
+package robotstxt
+
+import (
+	"bytes"
+	"fmt"
+	"go/token"
+	"os"
+	"sync"
+	"unicode/utf8"
+)
+
+type byteScanner struct {
+	pos           token.Position
+	buf           []byte
+	ErrorCount    int
+	ch            rune
+	Quiet         bool
+	keyTokenFound bool
+	lastChunk     bool
+}
+
+const tokEOL = "\n"
+
+var WhitespaceChars = []rune{' ', '\t', '\v'}
+var tokBuffers = sync.Pool{New: func() interface{} { return bytes.NewBuffer(make([]byte, 32)) }}
+
+func newByteScanner(srcname string, quiet bool) *byteScanner {
+	return &byteScanner{
+		Quiet: quiet,
+		ch:    -1,
+		pos:   token.Position{Filename: srcname},
+	}
+}
+
+func (s *byteScanner) feed(input []byte, end bool) {
+	s.buf = input
+	s.pos.Offset = 0
+	s.pos.Line = 1
+	s.pos.Column = 1
+	s.lastChunk = end
+
+	// Read first char into look-ahead buffer `s.ch`.
+	if !s.nextChar() {
+		return
+	}
+
+	// Skip UTF-8 byte order mark
+	if s.ch == 65279 {
+		s.nextChar()
+		s.pos.Column = 1
+	}
+}
+
+func (s *byteScanner) GetPosition() token.Position {
+	return s.pos
+}
+
+func (s *byteScanner) scan() string {
+	// Note Offset > len, not >=, so we can scan last character.
+	if s.lastChunk && s.pos.Offset > len(s.buf) {
+		return ""
+	}
+
+	s.skipSpace()
+
+	if s.ch == -1 {
+		return ""
+	}
+
+	// EOL
+	if s.isEol() {
+		s.keyTokenFound = false
+		// skip subsequent newline chars
+		for s.ch != -1 && s.isEol() {
+			s.nextChar()
+		}
+		// emit newline as separate token
+		return tokEOL
+	}
+
+	// skip comments
+	if s.ch == '#' {
+		s.keyTokenFound = false
+		s.skipUntilEol()
+		if s.ch == -1 {
+			return ""
+		}
+		// emit newline as separate token
+		return tokEOL
+	}
+
+	// else we found something
+	tok := tokBuffers.Get().(*bytes.Buffer)
+	defer tokBuffers.Put(tok)
+	tok.Reset()
+	tok.WriteRune(s.ch)
+	s.nextChar()
+	for s.ch != -1 && !s.isSpace() && !s.isEol() {
+		// Do not consider ":" to be a token separator if a first key token
+		// has already been found on this line (avoid cutting an absolute URL
+		// after the "http:")
+		if s.ch == ':' && !s.keyTokenFound {
+			s.nextChar()
+			s.keyTokenFound = true
+			break
+		}
+
+		tok.WriteRune(s.ch)
+		s.nextChar()
+	}
+	return tok.String()
+}
+
+func (s *byteScanner) scanAll() []string {
+	results := make([]string, 0, 64) // random guess of average tokens length
+	for {
+		token := s.scan()
+		if token != "" {
+			results = append(results, token)
+		} else {
+			break
+		}
+	}
+	return results
+}
+
+func (s *byteScanner) error(pos token.Position, msg string) {
+	s.ErrorCount++
+	if !s.Quiet {
+		fmt.Fprintf(os.Stderr, "robotstxt from %s: %s\n", pos.String(), msg)
+	}
+}
+
+func (s *byteScanner) isEol() bool {
+	return s.ch == '\n' || s.ch == '\r'
+}
+
+func (s *byteScanner) isSpace() bool {
+	for _, r := range WhitespaceChars {
+		if s.ch == r {
+			return true
+		}
+	}
+	return false
+}
+
+func (s *byteScanner) skipSpace() {
+	for s.ch != -1 && s.isSpace() {
+		s.nextChar()
+	}
+}
+
+func (s *byteScanner) skipUntilEol() {
+	for s.ch != -1 && !s.isEol() {
+		s.nextChar()
+	}
+	// skip subsequent newline chars
+	for s.ch != -1 && s.isEol() {
+		s.nextChar()
+	}
+}
+
+// Reads next Unicode char.
+func (s *byteScanner) nextChar() bool {
+	if s.pos.Offset >= len(s.buf) {
+		s.ch = -1
+		return false
+	}
+	s.pos.Column++
+	if s.ch == '\n' {
+		s.pos.Line++
+		s.pos.Column = 1
+	}
+	r, w := rune(s.buf[s.pos.Offset]), 1
+	if r >= 0x80 {
+		r, w = utf8.DecodeRune(s.buf[s.pos.Offset:])
+		if r == utf8.RuneError && w == 1 {
+			s.error(s.pos, "illegal UTF-8 encoding")
+		}
+	}
+	s.pos.Column++
+	s.pos.Offset += w
+	s.ch = r
+	return true
+}