GoToSocial/internal/regexes/regexes.go

// GoToSocial
// Copyright (C) GoToSocial Authors admin@gotosocial.org
// SPDX-License-Identifier: AGPL-3.0-or-later
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.

package regexes

import (
	"bytes"
	"regexp"
	"sync"

	"mvdan.cc/xurls/v2"
)

const (
	users     = "users"
	actors    = "actors"
	statuses  = "statuses"
	inbox     = "inbox"
	outbox    = "outbox"
	followers = "followers"
	following = "following"
	liked     = "liked"
	publicKey = "main-key"
	follow    = "follow"
	blocks    = "blocks"
	reports   = "reports"

	schemes                  = `(http|https)://`                                         // Allowed URI protocols for parsing links in text.
	alphaNumeric             = `\p{L}\p{M}*|\p{N}`                                       // A single number or script character in any language, including chars with accents.
	usernameGrp              = `(?:` + alphaNumeric + `|\.|\-|\_)`                       // Non-capturing group that matches against a single valid username character.
	domainGrp                = `(?:` + alphaNumeric + `|\.|\-|\:)`                       // Non-capturing group that matches against a single valid domain character.
	mentionName              = `^@(` + usernameGrp + `+)(?:@(` + domainGrp + `+))?$`     // Extract parts of one mention, maybe including domain.
	mentionFinder            = `(?:^|\s)(@` + usernameGrp + `+(?:@` + domainGrp + `+)?)` // Extract all mentions from a text, each mention may include domain.
	emojiShortcode           = `\w{2,30}`                                                // Pattern for emoji shortcodes. maximumEmojiShortcodeLength = 30
	emojiFinder              = `(?:\b)?:(` + emojiShortcode + `):(?:\b)?`                // Extract all emoji shortcodes from a text.
	usernameStrict           = `^[a-z0-9_]{1,64}$`                                       // Pattern for usernames on THIS instance. maximumUsernameLength = 64
	usernameRelaxed          = `[a-z0-9_\.]{1,}`                                         // Relaxed version of username that can match instance accounts too.
	misskeyReportNotesFinder = `(?m)(?:^Note: ((?:http|https):\/\/.*)$)`                 // Extract reported Note URIs from the text of a Misskey report/flag.
	ulid                     = `[0123456789ABCDEFGHJKMNPQRSTVWXYZ]{26}`                  // Pattern for ULID.
	ulidValidate             = `^` + ulid + `$`                                          // Validate one ULID.

	/*
		Path parts / capture.
	*/

	userPathPrefix    = `^/?` + users + `/(` + usernameRelaxed + `)`
	userPath          = userPathPrefix + `$`
	userWebPathPrefix = `^/?` + `@(` + usernameRelaxed + `)`
	userWebPath       = userWebPathPrefix + `$`
	publicKeyPath     = userPathPrefix + `/` + publicKey + `$`
	inboxPath         = userPathPrefix + `/` + inbox + `$`
	outboxPath        = userPathPrefix + `/` + outbox + `$`
	followersPath     = userPathPrefix + `/` + followers + `$`
	followingPath     = userPathPrefix + `/` + following + `$`
	likedPath         = userPathPrefix + `/` + liked + `$`
	followPath        = userPathPrefix + `/` + follow + `/(` + ulid + `)$`
	likePath          = userPathPrefix + `/` + liked + `/(` + ulid + `)$`
	statusesPath      = userPathPrefix + `/` + statuses + `/(` + ulid + `)$`
	blockPath         = userPathPrefix + `/` + blocks + `/(` + ulid + `)$`
	reportPath        = `^/?` + reports + `/(` + ulid + `)$`
	filePath          = `^/?(` + ulid + `)/([a-z]+)/([a-z]+)/(` + ulid + `)\.([a-z0-9]+)$`
)

var (
	// LinkScheme captures http/https schemes in URLs.
	LinkScheme = func() *regexp.Regexp {
		rgx, err := xurls.StrictMatchingScheme(schemes)
		if err != nil {
			panic(err)
		}
		return rgx
	}()

	// MentionName captures the username and domain part from
	// a mention string such as @whatever_user@example.org,
	// returning whatever_user and example.org (without the @ symbols).
	// Will also work for characters with umlauts and other accents.
	// See: https://regex101.com/r/9tjNUy/1 for explanation and examples.
	MentionName = regexp.MustCompile(mentionName)

	// MentionFinder extracts whole mentions from a piece of text.
	MentionFinder = regexp.MustCompile(mentionFinder)

	// EmojiShortcode validates an emoji name.
	EmojiShortcode = regexp.MustCompile(emojiShortcode)

	// EmojiFinder extracts emoji strings from a piece of text.
	// See: https://regex101.com/r/478XGM/1
	EmojiFinder = regexp.MustCompile(emojiFinder)

	// Username can be used to validate usernames of new signups on this instance.
	Username = regexp.MustCompile(usernameStrict)

	// MisskeyReportNotes captures a list of Note URIs from report content created by Misskey.
	// See: https://regex101.com/r/EnTOBV/1
	MisskeyReportNotes = regexp.MustCompile(misskeyReportNotesFinder)

	// UserPath validates and captures the username part from eg /users/example_username.
	UserPath = regexp.MustCompile(userPath)

	// UserWebPath validates and captures the username part from eg /@example_username.
	UserWebPath = regexp.MustCompile(userWebPath)

	// PublicKeyPath parses a path that validates and captures the username part from eg /users/example_username/main-key
	PublicKeyPath = regexp.MustCompile(publicKeyPath)

	// InboxPath parses a path that validates and captures the username part from eg /users/example_username/inbox
	InboxPath = regexp.MustCompile(inboxPath)

	// OutboxPath parses a path that validates and captures the username part from eg /users/example_username/outbox
	OutboxPath = regexp.MustCompile(outboxPath)

	// FollowersPath parses a path that validates and captures the username part from eg /users/example_username/followers
	FollowersPath = regexp.MustCompile(followersPath)

	// FollowingPath parses a path that validates and captures the username part from eg /users/example_username/following
	FollowingPath = regexp.MustCompile(followingPath)

	// LikedPath parses a path that validates and captures the username part from eg /users/example_username/liked
	LikedPath = regexp.MustCompile(likedPath)

	// ULID parses and validate a ULID.
	ULID = regexp.MustCompile(ulidValidate)

	// FollowPath parses a path that validates and captures the username part and the ulid part
	// from eg /users/example_username/follow/01F7XT5JZW1WMVSW1KADS8PVDH
	FollowPath = regexp.MustCompile(followPath)

	// LikePath parses a path that validates and captures the username part and the ulid part
	// from eg /users/example_username/liked/01F7XT5JZW1WMVSW1KADS8PVDH
	LikePath = regexp.MustCompile(likePath)

	// StatusesPath parses a path that validates and captures the username part and the ulid part
	// from eg /users/example_username/statuses/01F7XT5JZW1WMVSW1KADS8PVDH
	// The regex can be played with here: https://regex101.com/r/G9zuxQ/1
	StatusesPath = regexp.MustCompile(statusesPath)

	// BlockPath parses a path that validates and captures the username part and the ulid part
	// from eg /users/example_username/blocks/01F7XT5JZW1WMVSW1KADS8PVDH
	BlockPath = regexp.MustCompile(blockPath)

	// ReportPath parses a path that validates and captures the ulid part
	// from eg /reports/01GP3AWY4CRDVRNZKW0TEAMB5R
	ReportPath = regexp.MustCompile(reportPath)

	// FilePath parses a file storage path of the form [ACCOUNT_ID]/[MEDIA_TYPE]/[MEDIA_SIZE]/[FILE_NAME]
	// eg 01F8MH1H7YV1Z7D2C8K2730QBF/attachment/small/01F8MH8RMYQ6MSNY3JM2XT1CQ5.jpeg
	// It captures the account id, media type, media size, file name, and file extension, eg
	// `01F8MH1H7YV1Z7D2C8K2730QBF`, `attachment`, `small`, `01F8MH8RMYQ6MSNY3JM2XT1CQ5`, `jpeg`.
	FilePath = regexp.MustCompile(filePath)
)

// bufpool is a memory pool of byte buffers for use in our regex utility functions.
var bufpool = sync.Pool{
	New: func() any {
		buf := bytes.NewBuffer(make([]byte, 0, 512))
		return buf
	},
}

// ReplaceAllStringFunc will call through to .ReplaceAllStringFunc in the provided regex, but provide you a clean byte buffer for optimized string writes.
func ReplaceAllStringFunc(rgx *regexp.Regexp, src string, repl func(match string, buf *bytes.Buffer) string) string {
	buf := bufpool.Get().(*bytes.Buffer) //nolint
	defer bufpool.Put(buf)
	return rgx.ReplaceAllStringFunc(src, func(match string) string {
		buf.Reset() // reset use
		return repl(match, buf)
	})
}