[bug] respect X-Robots-Tag and robots.txt on api/v1/instance and nodeinfo (#3756)

* feat: respect X-Robots-Tag when accessing the /api/v1/instance or /nodeinfo endpoints
* chore: go fmt ./...
* check robots.txt as well, add tests

Co-authored-by: tobi <tobi.smethurst@protonmail.com>
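For context, here is a minimal client-side sketch of the behaviour the commit message describes: before using another instance's /api/v1/instance or /nodeinfo response, consult that host's robots.txt and its X-Robots-Tag response header. This is not GoToSocial's actual code; the helper name allowedByRobots and the userAgent value are hypothetical, and only the vendored robotstxt API shown below (FromResponse, TestAgent) plus the standard library are used.

package main

import (
    "fmt"
    "net/http"
    "strings"

    "github.com/temoto/robotstxt"
)

// Hypothetical crawler name; GoToSocial's real user-agent string differs.
const userAgent = "gotosocial"

// allowedByRobots reports whether the remote host's robots.txt permits
// fetching path for our user-agent, and whether the response's
// X-Robots-Tag header opts out of indexing.
func allowedByRobots(host, path string) (bool, error) {
    resp, err := http.Get("https://" + host + "/robots.txt")
    if err != nil {
        return false, err
    }
    robots, err := robotstxt.FromResponse(resp)
    resp.Body.Close() // FromResponse does not close the body
    if err != nil {
        return false, err
    }
    if !robots.TestAgent(path, userAgent) {
        return false, nil // disallowed by robots.txt
    }
    resp, err = http.Get("https://" + host + path)
    if err != nil {
        return false, err
    }
    defer resp.Body.Close()
    // A "noindex" in X-Robots-Tag means the instance opts out.
    if strings.Contains(strings.ToLower(resp.Header.Get("X-Robots-Tag")), "noindex") {
        return false, nil
    }
    return true, nil
}

func main() {
    ok, err := allowedByRobots("example.org", "/nodeinfo/2.0")
    fmt.Println(ok, err)
}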
vendor/github.com/temoto/robotstxt/.gitignore (generated, vendored, new file)
@@ -0,0 +1,15 @@
*.cgo?.*
*.o
*.so
*.sublime-*
*.zip
.DS_Store
.idea/
.tags*
_cgo_*
_gofuzz/crashers/
_gofuzz/suppressions/
_obj
_test
coverage.txt
robots.txt-check/robots.txt-check
vendor/github.com/temoto/robotstxt/.golangci.yml (generated, vendored, new file)
@@ -0,0 +1,20 @@
linters:
  enable:
    - goconst
    - gofmt
    - gosec
    - maligned
    - prealloc
    - staticcheck
  disable:
    - deadcode
    - structcheck
    - varcheck

linters-settings:
  gofmt:
    simplify: true
  govet:
    check-shadowing: true
  maligned:
    suggest-new: true
vendor/github.com/temoto/robotstxt/.travis.yml (generated, vendored, new file)
@@ -0,0 +1,30 @@
cache:
  go: true
  directories:
    - $HOME/.cache
    - $HOME/bin
    - $HOME/gopath/pkg/mod
language: go
go:
  - 1.11
  - 1.12
  - 1.13
  - 1.14
  - 1.x
  - master
install: true
script: GO111MODULE=on go test -race

matrix:
  include:
    - go: 1.x
      env: task=coverage
      script: GO111MODULE=on go test -race -covermode=atomic -coverprofile=coverage.txt
      after_success: bash <(curl -s https://codecov.io/bash)
    - go: 1.x
      env: task=bench
      script: GO111MODULE=on ./script/bench
    - go: 1.x
      install: curl -sfL https://install.goreleaser.com/github.com/golangci/golangci-lint.sh | bash -s -- -b $HOME/bin v1.19.1
      env: task=clean
      script: GO111MODULE=on ./script/clean
vendor/github.com/temoto/robotstxt/LICENSE (generated, vendored, new file)
@@ -0,0 +1,21 @@
The MIT License

Copyright (c) 2010 Sergey Shepelev <temotor@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
vendor/github.com/temoto/robotstxt/README.rst (generated, vendored, new file)
@@ -0,0 +1,115 @@
What
====

This is a robots.txt exclusion protocol implementation for Go language (golang).


Build
=====

To build and run tests run `go test` in source directory.


Contribute
==========

Warm welcome.

* If desired, add your name in README.rst, section Who.
* Run `script/test && script/clean && echo ok`
* You can ignore linter warnings, but everything else must pass.
* Send your change as pull request or just a regular patch to current maintainer (see section Who).

Thank you.


Usage
=====

As usual, no special installation is required, just

    import "github.com/temoto/robotstxt"

run `go get` and you're ready.

1. Parse
^^^^^^^^

First of all, you need to parse robots.txt data. You can do it with
functions `FromBytes(body []byte) (*RobotsData, error)` or same for `string`::

    robots, err := robotstxt.FromBytes([]byte("User-agent: *\nDisallow:"))
    robots, err := robotstxt.FromString("User-agent: *\nDisallow:")

As of 2012-10-03, `FromBytes` is the most efficient method, everything else
is a wrapper for this core function.

There are few convenient constructors for various purposes:

* `FromResponse(*http.Response) (*RobotsData, error)` to init robots data
  from HTTP response. It *does not* call `response.Body.Close()`::

    robots, err := robotstxt.FromResponse(resp)
    resp.Body.Close()
    if err != nil {
        log.Println("Error parsing robots.txt:", err.Error())
    }

* `FromStatusAndBytes(statusCode int, body []byte) (*RobotsData, error)` or
  `FromStatusAndString` if you prefer to read bytes (string) yourself.
  Passing status code applies following logic in line with Google's interpretation
  of robots.txt files:

  * status 2xx -> parse body with `FromBytes` and apply rules listed there.
  * status 4xx -> allow all (even 401/403, as recommended by Google).
  * other (5xx) -> disallow all, consider this a temporary unavailability.

2. Query
^^^^^^^^

Parsing robots.txt content builds a kind of logic database, which you can
query with `(r *RobotsData) TestAgent(url, agent string) (bool)`.

Explicit passing of agent is useful if you want to query for different agents. For
single agent users there is an efficient option: `RobotsData.FindGroup(userAgent string)`
returns a structure with `.Test(path string)` method and `.CrawlDelay time.Duration`.

Simple query with explicit user agent. Each call will scan all rules.

::

    allow := robots.TestAgent("/", "FooBot")

Or query several paths against same user agent for performance.

::

    group := robots.FindGroup("BarBot")
    group.Test("/")
    group.Test("/download.mp3")
    group.Test("/news/article-2012-1")


Who
===

Honorable contributors (in undefined order):

* Ilya Grigorik (igrigorik)
* Martin Angers (PuerkitoBio)
* Micha Gorelick (mynameisfiber)

Initial commit and other: Sergey Shepelev temotor@gmail.com


Flair
=====

.. image:: https://travis-ci.org/temoto/robotstxt.svg?branch=master
    :target: https://travis-ci.org/temoto/robotstxt

.. image:: https://codecov.io/gh/temoto/robotstxt/branch/master/graph/badge.svg
    :target: https://codecov.io/gh/temoto/robotstxt

.. image:: https://goreportcard.com/badge/github.com/temoto/robotstxt
    :target: https://goreportcard.com/report/github.com/temoto/robotstxt
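To tie the README's two steps together, here is the parse-and-query flow as one self-contained, runnable program. It uses only calls documented in the README above (FromString, TestAgent, FindGroup, Test, CrawlDelay); the robots.txt body is an illustrative example.

package main

import (
    "fmt"
    "log"

    "github.com/temoto/robotstxt"
)

func main() {
    robots, err := robotstxt.FromString(
        "User-agent: *\nDisallow: /private\nCrawl-delay: 1.5\n")
    if err != nil {
        log.Fatal(err)
    }

    // One-off query: each call scans all rules.
    fmt.Println(robots.TestAgent("/private/file", "FooBot")) // false

    // Repeated queries against one agent: resolve the group once.
    group := robots.FindGroup("BarBot")
    fmt.Println(group.Test("/"))             // true
    fmt.Println(group.Test("/private/file")) // false
    fmt.Println(group.CrawlDelay)            // 1.5s
}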
vendor/github.com/temoto/robotstxt/codecov.yml (generated, vendored, new file)
@@ -0,0 +1,2 @@
codecov:
  token: 6bf9c7eb-69ff-4b74-8464-e2fb452d0f04
vendor/github.com/temoto/robotstxt/fuzz.go (generated, vendored, new file)
@@ -0,0 +1,29 @@
// +build gofuzz

package robotstxt

import "testing/quick"

func Fuzz(data []byte) int {
    r, err := FromBytes(data)
    if err != nil {
        if r != nil {
            panic("r != nil on error")
        }
        return 0
    }

    // FindGroup must never return nil
    f1 := func(agent string) bool { return r.FindGroup(agent) != nil }
    if err := quick.Check(f1, nil); err != nil {
        panic(err)
    }

    // just check TestAgent doesn't panic
    f2 := func(path, agent string) bool { r.TestAgent(path, agent); return true }
    if err := quick.Check(f2, nil); err != nil {
        panic(err)
    }

    return 1
}
vendor/github.com/temoto/robotstxt/parser.go (generated, vendored, new file)
@@ -0,0 +1,271 @@
package robotstxt

// Comments explaining the logic are taken from either the google's spec:
// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
//
// or the Wikipedia's entry on robots.txt:
// http://en.wikipedia.org/wiki/Robots.txt

import (
    "fmt"
    "io"
    "math"
    "regexp"
    "strconv"
    "strings"
    "time"
)

type lineType uint

const (
    lIgnore lineType = iota
    lUnknown
    lUserAgent
    lAllow
    lDisallow
    lCrawlDelay
    lSitemap
    lHost
)

type parser struct {
    tokens []string
    pos    int
}

type lineInfo struct {
    t  lineType       // Type of line key
    k  string         // String representation of the type of key
    vs string         // String value of the key
    vf float64        // Float value of the key
    vr *regexp.Regexp // Regexp value of the key
}

func newParser(tokens []string) *parser {
    return &parser{tokens: tokens}
}

func parseGroupMap(groups map[string]*Group, agents []string, fun func(*Group)) {
    var g *Group
    for _, a := range agents {
        if g = groups[a]; g == nil {
            g = new(Group)
            groups[a] = g
        }
        fun(g)
    }
}

func (p *parser) parseAll() (groups map[string]*Group, host string, sitemaps []string, errs []error) {
    groups = make(map[string]*Group, 16)
    agents := make([]string, 0, 4)
    isEmptyGroup := true

    // Reset internal fields, tokens are assigned at creation time, never change
    p.pos = 0

    for {
        if li, err := p.parseLine(); err != nil {
            if err == io.EOF {
                break
            }
            errs = append(errs, err)
        } else {
            switch li.t {
            case lUserAgent:
                // Two successive user-agent lines are part of the same group.
                if !isEmptyGroup {
                    // End previous group
                    agents = make([]string, 0, 4)
                }
                if len(agents) == 0 {
                    isEmptyGroup = true
                }
                agents = append(agents, li.vs)

            case lDisallow:
                // Error if no current group
                if len(agents) == 0 {
                    errs = append(errs, fmt.Errorf("Disallow before User-agent at token #%d.", p.pos))
                } else {
                    isEmptyGroup = false
                    var r *rule
                    if li.vr != nil {
                        r = &rule{"", false, li.vr}
                    } else {
                        r = &rule{li.vs, false, nil}
                    }
                    parseGroupMap(groups, agents, func(g *Group) { g.rules = append(g.rules, r) })
                }

            case lAllow:
                // Error if no current group
                if len(agents) == 0 {
                    errs = append(errs, fmt.Errorf("Allow before User-agent at token #%d.", p.pos))
                } else {
                    isEmptyGroup = false
                    var r *rule
                    if li.vr != nil {
                        r = &rule{"", true, li.vr}
                    } else {
                        r = &rule{li.vs, true, nil}
                    }
                    parseGroupMap(groups, agents, func(g *Group) { g.rules = append(g.rules, r) })
                }

            case lHost:
                host = li.vs

            case lSitemap:
                sitemaps = append(sitemaps, li.vs)

            case lCrawlDelay:
                if len(agents) == 0 {
                    errs = append(errs, fmt.Errorf("Crawl-delay before User-agent at token #%d.", p.pos))
                } else {
                    isEmptyGroup = false
                    delay := time.Duration(li.vf * float64(time.Second))
                    parseGroupMap(groups, agents, func(g *Group) { g.CrawlDelay = delay })
                }
            }
        }
    }
    return
}

func (p *parser) parseLine() (li *lineInfo, err error) {
    t1, ok1 := p.popToken()
    if !ok1 {
        // proper EOF
        return nil, io.EOF
    }

    t2, ok2 := p.peekToken()
    if !ok2 {
        // EOF, no value associated with the token, so ignore token and return
        return nil, io.EOF
    }

    // Helper closure for all string-based tokens, common behaviour:
    // - Consume t2 token
    // - If empty, return unknown line info
    // - Otherwise return the specified line info
    returnStringVal := func(t lineType) (*lineInfo, error) {
        p.popToken()
        if t2 != "" {
            return &lineInfo{t: t, k: t1, vs: t2}, nil
        }
        return &lineInfo{t: lIgnore}, nil
    }

    // Helper closure for all path tokens (allow/disallow), common behaviour:
    // - Consume t2 token
    // - If empty, return unknown line info
    // - Otherwise, normalize the path (add leading "/" if missing, remove trailing "*")
    // - Detect if wildcards are present, if so, compile into a regexp
    // - Return the specified line info
    returnPathVal := func(t lineType) (*lineInfo, error) {
        p.popToken()
        if t2 != "" {
            if !strings.HasPrefix(t2, "*") && !strings.HasPrefix(t2, "/") {
                t2 = "/" + t2
            }
            t2 = strings.TrimRightFunc(t2, isAsterisk)
            // From google's spec:
            // Google, Bing, Yahoo, and Ask support a limited form of
            // "wildcards" for path values. These are:
            //   * designates 0 or more instances of any valid character
            //   $ designates the end of the URL
            if strings.ContainsAny(t2, "*$") {
                // Must compile a regexp, this is a pattern.
                // Escape string before compile.
                t2 = regexp.QuoteMeta(t2)
                t2 = strings.Replace(t2, `\*`, `.*`, -1)
                t2 = strings.Replace(t2, `\$`, `$`, -1)
                if r, e := regexp.Compile(t2); e != nil {
                    return nil, e
                } else {
                    return &lineInfo{t: t, k: t1, vr: r}, nil
                }
            } else {
                // Simple string path
                return &lineInfo{t: t, k: t1, vs: t2}, nil
            }
        }
        return &lineInfo{t: lIgnore}, nil
    }

    switch strings.ToLower(t1) {
    case tokEOL:
        // Don't consume t2 and continue parsing
        return &lineInfo{t: lIgnore}, nil

    case "user-agent", "useragent":
        // From google's spec:
        // Handling of <field> elements with simple errors / typos (eg "useragent"
        // instead of "user-agent") is undefined and may be interpreted as correct
        // directives by some user-agents.
        // The user-agent is non-case-sensitive.
        t2 = strings.ToLower(t2)
        return returnStringVal(lUserAgent)

    case "disallow":
        // From google's spec:
        // When no path is specified, the directive is ignored (so an empty Disallow
        // CAN be an allow, since allow is the default. The actual result depends
        // on the other rules in the group).
        return returnPathVal(lDisallow)

    case "allow":
        // From google's spec:
        // When no path is specified, the directive is ignored.
        return returnPathVal(lAllow)

    case "host":
        // Host directive to specify main site mirror
        // Read more: https://help.yandex.com/webmaster/controlling-robot/robots-txt.xml#host
        return returnStringVal(lHost)

    case "sitemap":
        // Non-group field, applies to the host as a whole, not to a specific user-agent
        return returnStringVal(lSitemap)

    case "crawl-delay", "crawldelay":
        // From http://en.wikipedia.org/wiki/Robots_exclusion_standard#Nonstandard_extensions
        // Several major crawlers support a Crawl-delay parameter, set to the
        // number of seconds to wait between successive requests to the same server.
        p.popToken()
        if cd, e := strconv.ParseFloat(t2, 64); e != nil {
            return nil, e
        } else if cd < 0 || math.IsInf(cd, 0) || math.IsNaN(cd) {
            return nil, fmt.Errorf("Crawl-delay invalid value '%s'", t2)
        } else {
            return &lineInfo{t: lCrawlDelay, k: t1, vf: cd}, nil
        }
    }

    // Consume t2 token
    p.popToken()
    return &lineInfo{t: lUnknown, k: t1}, nil
}

func (p *parser) popToken() (tok string, ok bool) {
    tok, ok = p.peekToken()
    if !ok {
        return
    }
    p.pos++
    return tok, true
}

func (p *parser) peekToken() (tok string, ok bool) {
    if p.pos >= len(p.tokens) {
        return "", false
    }
    return p.tokens[p.pos], true
}

func isAsterisk(r rune) bool {
    return r == '*'
}
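A quick sanity check of the wildcard handling in returnPathVal above, exercised through the package's public API: "*" is rewritten to ".*" and "$" anchors the end of the URL, so the rule below blocks only paths ending in ".mp3". This is an illustrative sketch, not part of the vendored package.

package main

import (
    "fmt"
    "log"

    "github.com/temoto/robotstxt"
)

func main() {
    robots, err := robotstxt.FromString("User-agent: *\nDisallow: /*.mp3$\n")
    if err != nil {
        log.Fatal(err)
    }
    // Matches the compiled pattern /.*\.mp3$ -> disallowed.
    fmt.Println(robots.TestAgent("/music/track.mp3", "AnyBot")) // false
    // The "$" anchor fails, so the rule does not apply -> allowed.
    fmt.Println(robots.TestAgent("/music/track.mp3.html", "AnyBot")) // true
}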
vendor/github.com/temoto/robotstxt/robotstxt.go (generated, vendored, new file)
@@ -0,0 +1,227 @@
// Package robotstxt implements the robots.txt Exclusion Protocol
// as specified in http://www.robotstxt.org/wc/robots.html
// with various extensions.
package robotstxt

// Comments explaining the logic are taken from either the Google's spec:
// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt

import (
    "bytes"
    "errors"
    "io/ioutil"
    "net/http"
    "regexp"
    "strconv"
    "strings"
    "time"
)

type RobotsData struct {
    // private
    groups      map[string]*Group
    allowAll    bool
    disallowAll bool
    Host        string
    Sitemaps    []string
}

type Group struct {
    rules      []*rule
    Agent      string
    CrawlDelay time.Duration
}

type rule struct {
    path    string
    allow   bool
    pattern *regexp.Regexp
}

type ParseError struct {
    Errs []error
}

func newParseError(errs []error) *ParseError {
    return &ParseError{errs}
}

func (e ParseError) Error() string {
    var b bytes.Buffer

    b.WriteString("Parse error(s): " + "\n")
    for _, er := range e.Errs {
        b.WriteString(er.Error() + "\n")
    }
    return b.String()
}

var allowAll = &RobotsData{allowAll: true}
var disallowAll = &RobotsData{disallowAll: true}
var emptyGroup = &Group{}

func FromStatusAndBytes(statusCode int, body []byte) (*RobotsData, error) {
    switch {
    case statusCode >= 200 && statusCode < 300:
        return FromBytes(body)

    // From https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
    //
    // Google treats all 4xx errors in the same way and assumes that no valid
    // robots.txt file exists. It is assumed that there are no restrictions.
    // This is a "full allow" for crawling. Note: this includes 401
    // "Unauthorized" and 403 "Forbidden" HTTP result codes.
    case statusCode >= 400 && statusCode < 500:
        return allowAll, nil

    // From Google's spec:
    // Server errors (5xx) are seen as temporary errors that result in a "full
    // disallow" of crawling.
    case statusCode >= 500 && statusCode < 600:
        return disallowAll, nil
    }

    return nil, errors.New("Unexpected status: " + strconv.Itoa(statusCode))
}

func FromStatusAndString(statusCode int, body string) (*RobotsData, error) {
    return FromStatusAndBytes(statusCode, []byte(body))
}

func FromResponse(res *http.Response) (*RobotsData, error) {
    if res == nil {
        // Edge case, if res is nil, return nil data
        return nil, nil
    }
    buf, e := ioutil.ReadAll(res.Body)
    if e != nil {
        return nil, e
    }
    return FromStatusAndBytes(res.StatusCode, buf)
}

func FromBytes(body []byte) (r *RobotsData, err error) {
    var errs []error

    // special case (probably not worth optimization?)
    trimmed := bytes.TrimSpace(body)
    if len(trimmed) == 0 {
        return allowAll, nil
    }

    sc := newByteScanner("bytes", true)
    //sc.Quiet = !print_errors
    sc.feed(body, true)
    tokens := sc.scanAll()

    // special case worth optimization
    if len(tokens) == 0 {
        return allowAll, nil
    }

    r = &RobotsData{}
    parser := newParser(tokens)
    r.groups, r.Host, r.Sitemaps, errs = parser.parseAll()
    if len(errs) > 0 {
        return nil, newParseError(errs)
    }

    return r, nil
}

func FromString(body string) (r *RobotsData, err error) {
    return FromBytes([]byte(body))
}

func (r *RobotsData) TestAgent(path, agent string) bool {
    if r.allowAll {
        return true
    }
    if r.disallowAll {
        return false
    }

    // Find a group of rules that applies to this agent
    // From Google's spec:
    // The user-agent is non-case-sensitive.
    g := r.FindGroup(agent)
    return g.Test(path)
}

// FindGroup searches block of declarations for specified user-agent.
// From Google's spec:
// Only one group of group-member records is valid for a particular crawler.
// The crawler must determine the correct group of records by finding the group
// with the most specific user-agent that still matches. All other groups of
// records are ignored by the crawler. The user-agent is non-case-sensitive.
// The order of the groups within the robots.txt file is irrelevant.
func (r *RobotsData) FindGroup(agent string) (ret *Group) {
    var prefixLen int

    agent = strings.ToLower(agent)
    if ret = r.groups["*"]; ret != nil {
        // Weakest match possible
        prefixLen = 1
    }
    for a, g := range r.groups {
        if a != "*" && strings.HasPrefix(agent, a) {
            if l := len(a); l > prefixLen {
                prefixLen = l
                ret = g
            }
        }
    }

    if ret == nil {
        return emptyGroup
    }
    return
}

func (g *Group) Test(path string) bool {
    if r := g.findRule(path); r != nil {
        return r.allow
    }

    // From Google's spec:
    // By default, there are no restrictions for crawling for the designated crawlers.
    return true
}

// From Google's spec:
// The path value is used as a basis to determine whether or not a rule applies
// to a specific URL on a site. With the exception of wildcards, the path is
// used to match the beginning of a URL (and any valid URLs that start with the
// same path).
//
// At a group-member level, in particular for allow and disallow directives,
// the most specific rule based on the length of the [path] entry will trump
// the less specific (shorter) rule. The order of precedence for rules with
// wildcards is undefined.
func (g *Group) findRule(path string) (ret *rule) {
    var prefixLen int

    for _, r := range g.rules {
        if r.pattern != nil {
            if r.pattern.MatchString(path) {
                // Consider this a match equal to the length of the pattern.
                // From Google's spec:
                // The order of precedence for rules with wildcards is undefined.
                if l := len(r.pattern.String()); l > prefixLen {
                    prefixLen = l
                    ret = r
                }
            }
        } else if r.path == "/" && prefixLen == 0 {
            // Weakest match possible
            prefixLen = 1
            ret = r
        } else if strings.HasPrefix(path, r.path) {
            if l := len(r.path); l > prefixLen {
                prefixLen = l
                ret = r
            }
        }
    }
    return
}
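The status-code policy in FromStatusAndBytes and the most-specific-group rule in FindGroup, demonstrated end to end. A sketch using only the exported API shown above; the robots.txt bodies are illustrative.

package main

import (
    "fmt"
    "log"

    "github.com/temoto/robotstxt"
)

func main() {
    // 404: no valid robots.txt, full allow (the body is ignored).
    r4, err := robotstxt.FromStatusAndString(404, "User-agent: *\nDisallow: /")
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(r4.TestAgent("/anything", "FooBot")) // true

    // 503: temporary server error, full disallow.
    r5, err := robotstxt.FromStatusAndString(503, "")
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(r5.TestAgent("/", "FooBot")) // false

    // 200: body is parsed; the most specific user-agent group wins.
    r2, err := robotstxt.FromStatusAndString(200,
        "User-agent: *\nDisallow: /\n\nUser-agent: foobot\nAllow: /\n")
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(r2.TestAgent("/page", "FooBot"))   // true: "foobot" group applies
    fmt.Println(r2.TestAgent("/page", "OtherBot")) // false: falls back to "*"
}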
vendor/github.com/temoto/robotstxt/scanner.go (generated, vendored, new file)
@@ -0,0 +1,185 @@
package robotstxt

import (
    "bytes"
    "fmt"
    "go/token"
    "os"
    "sync"
    "unicode/utf8"
)

type byteScanner struct {
    pos           token.Position
    buf           []byte
    ErrorCount    int
    ch            rune
    Quiet         bool
    keyTokenFound bool
    lastChunk     bool
}

const tokEOL = "\n"

var WhitespaceChars = []rune{' ', '\t', '\v'}
var tokBuffers = sync.Pool{New: func() interface{} { return bytes.NewBuffer(make([]byte, 32)) }}

func newByteScanner(srcname string, quiet bool) *byteScanner {
    return &byteScanner{
        Quiet: quiet,
        ch:    -1,
        pos:   token.Position{Filename: srcname},
    }
}

func (s *byteScanner) feed(input []byte, end bool) {
    s.buf = input
    s.pos.Offset = 0
    s.pos.Line = 1
    s.pos.Column = 1
    s.lastChunk = end

    // Read first char into look-ahead buffer `s.ch`.
    if !s.nextChar() {
        return
    }

    // Skip UTF-8 byte order mark
    if s.ch == 65279 {
        s.nextChar()
        s.pos.Column = 1
    }
}

func (s *byteScanner) GetPosition() token.Position {
    return s.pos
}

func (s *byteScanner) scan() string {
    // Note Offset > len, not >=, so we can scan last character.
    if s.lastChunk && s.pos.Offset > len(s.buf) {
        return ""
    }

    s.skipSpace()

    if s.ch == -1 {
        return ""
    }

    // EOL
    if s.isEol() {
        s.keyTokenFound = false
        // skip subsequent newline chars
        for s.ch != -1 && s.isEol() {
            s.nextChar()
        }
        // emit newline as separate token
        return tokEOL
    }

    // skip comments
    if s.ch == '#' {
        s.keyTokenFound = false
        s.skipUntilEol()
        if s.ch == -1 {
            return ""
        }
        // emit newline as separate token
        return tokEOL
    }

    // else we found something
    tok := tokBuffers.Get().(*bytes.Buffer)
    defer tokBuffers.Put(tok)
    tok.Reset()
    tok.WriteRune(s.ch)
    s.nextChar()
    for s.ch != -1 && !s.isSpace() && !s.isEol() {
        // Do not consider ":" to be a token separator if a first key token
        // has already been found on this line (avoid cutting an absolute URL
        // after the "http:")
        if s.ch == ':' && !s.keyTokenFound {
            s.nextChar()
            s.keyTokenFound = true
            break
        }

        tok.WriteRune(s.ch)
        s.nextChar()
    }
    return tok.String()
}

func (s *byteScanner) scanAll() []string {
    results := make([]string, 0, 64) // random guess of average tokens length
    for {
        token := s.scan()
        if token != "" {
            results = append(results, token)
        } else {
            break
        }
    }
    return results
}

func (s *byteScanner) error(pos token.Position, msg string) {
    s.ErrorCount++
    if !s.Quiet {
        fmt.Fprintf(os.Stderr, "robotstxt from %s: %s\n", pos.String(), msg)
    }
}

func (s *byteScanner) isEol() bool {
    return s.ch == '\n' || s.ch == '\r'
}

func (s *byteScanner) isSpace() bool {
    for _, r := range WhitespaceChars {
        if s.ch == r {
            return true
        }
    }
    return false
}

func (s *byteScanner) skipSpace() {
    for s.ch != -1 && s.isSpace() {
        s.nextChar()
    }
}

func (s *byteScanner) skipUntilEol() {
    for s.ch != -1 && !s.isEol() {
        s.nextChar()
    }
    // skip subsequent newline chars
    for s.ch != -1 && s.isEol() {
        s.nextChar()
    }
}

// Reads next Unicode char.
func (s *byteScanner) nextChar() bool {
    if s.pos.Offset >= len(s.buf) {
        s.ch = -1
        return false
    }
    s.pos.Column++
    if s.ch == '\n' {
        s.pos.Line++
        s.pos.Column = 1
    }
    r, w := rune(s.buf[s.pos.Offset]), 1
    if r >= 0x80 {
        r, w = utf8.DecodeRune(s.buf[s.pos.Offset:])
        if r == utf8.RuneError && w == 1 {
            s.error(s.pos, "illegal UTF-8 encoding")
        }
    }
    s.pos.Column++
    s.pos.Offset += w
    s.ch = r
    return true
}