Fix image extraction for URLs with query strings
Previously, image extraction did not detect image URLs that had a query string (or anything else) appended after the file extension. This commit fixes that by parsing each extracted URL and checking only its path component for an image file extension.
This commit is contained in:
parent
34d196376e
commit
563ea5b25b
15
posts.go
15
posts.go
|
@ -16,6 +16,7 @@ import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"html/template"
|
"html/template"
|
||||||
"net/http"
|
"net/http"
|
||||||
|
"net/url"
|
||||||
"regexp"
|
"regexp"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
@ -1543,7 +1544,7 @@ func (rp *RawPost) Created8601() string {
|
||||||
return rp.Created.Format("2006-01-02T15:04:05Z")
|
return rp.Created.Format("2006-01-02T15:04:05Z")
|
||||||
}
|
}
|
||||||
|
|
||||||
var imageURLRegex = regexp.MustCompile(`(?i)^https?:\/\/[^ ]*\.(gif|png|jpg|jpeg|image)$`)
|
var imageURLRegex = regexp.MustCompile(`(?i)[^ ]+\.(gif|png|jpg|jpeg|image)$`)
|
||||||
|
|
||||||
func (p *Post) extractImages() {
|
func (p *Post) extractImages() {
|
||||||
p.Images = extractImages(p.Content)
|
p.Images = extractImages(p.Content)
|
||||||
|
@ -1553,11 +1554,17 @@ func extractImages(content string) []string {
|
||||||
matches := extract.ExtractUrls(content)
|
matches := extract.ExtractUrls(content)
|
||||||
urls := map[string]bool{}
|
urls := map[string]bool{}
|
||||||
for i := range matches {
|
for i := range matches {
|
||||||
u := matches[i].Text
|
uRaw := matches[i].Text
|
||||||
if !imageURLRegex.MatchString(u) {
|
// Parse the extracted text so we can examine the path
|
||||||
|
u, err := url.Parse(uRaw)
|
||||||
|
if err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
urls[u] = true
|
// Ensure the path looks like it leads to an image file
|
||||||
|
if !imageURLRegex.MatchString(u.Path) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
urls[uRaw] = true
|
||||||
}
|
}
|
||||||
|
|
||||||
resURLs := make([]string, 0)
|
resURLs := make([]string, 0)
|
||||||
|
|
Loading…
Reference in New Issue