Fix image extraction for URLs with query strings

Previously, image extraction wouldn't catch images with a query string
(or anything else) appended. This fixes that by parsing extracted URLs
and only checking the path for what looks like an image file.
This commit is contained in:
Matt Baer 2020-02-19 17:07:02 -05:00
parent 34d196376e
commit 563ea5b25b
1 changed file with 11 additions and 4 deletions

View File

@@ -16,6 +16,7 @@ import (
"fmt" "fmt"
"html/template" "html/template"
"net/http" "net/http"
"net/url"
"regexp" "regexp"
"strings" "strings"
"time" "time"
@@ -1543,7 +1544,7 @@ func (rp *RawPost) Created8601() string {
return rp.Created.Format("2006-01-02T15:04:05Z") return rp.Created.Format("2006-01-02T15:04:05Z")
} }
var imageURLRegex = regexp.MustCompile(`(?i)^https?:\/\/[^ ]*\.(gif|png|jpg|jpeg|image)$`) var imageURLRegex = regexp.MustCompile(`(?i)[^ ]+\.(gif|png|jpg|jpeg|image)$`)
func (p *Post) extractImages() { func (p *Post) extractImages() {
p.Images = extractImages(p.Content) p.Images = extractImages(p.Content)
@@ -1553,11 +1554,17 @@ func extractImages(content string) []string {
matches := extract.ExtractUrls(content) matches := extract.ExtractUrls(content)
urls := map[string]bool{} urls := map[string]bool{}
for i := range matches { for i := range matches {
u := matches[i].Text uRaw := matches[i].Text
if !imageURLRegex.MatchString(u) { // Parse the extracted text so we can examine the path
u, err := url.Parse(uRaw)
if err != nil {
continue continue
} }
urls[u] = true // Ensure the path looks like it leads to an image file
if !imageURLRegex.MatchString(u.Path) {
continue
}
urls[uRaw] = true
} }
resURLs := make([]string, 0) resURLs := make([]string, 0)