From 563ea5b25b306ce0cde59f156a10eb4369a70c64 Mon Sep 17 00:00:00 2001
From: Matt Baer
Date: Wed, 19 Feb 2020 17:07:02 -0500
Subject: [PATCH] Fix image extraction for URLs with query strings

Previously, image extraction wouldn't catch images with a query string
(or anything else) appended. This fixes that by parsing extracted URLs
and only checking the path for what looks like an image file.
---
 posts.go | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/posts.go b/posts.go
index a9c8c11..35e9bd3 100644
--- a/posts.go
+++ b/posts.go
@@ -16,6 +16,7 @@ import (
 	"fmt"
 	"html/template"
 	"net/http"
+	"net/url"
 	"regexp"
 	"strings"
 	"time"
@@ -1543,7 +1544,7 @@ func (rp *RawPost) Created8601() string {
 	return rp.Created.Format("2006-01-02T15:04:05Z")
 }
 
-var imageURLRegex = regexp.MustCompile(`(?i)^https?:\/\/[^ ]*\.(gif|png|jpg|jpeg|image)$`)
+var imageURLRegex = regexp.MustCompile(`(?i)[^ ]+\.(gif|png|jpg|jpeg|image)$`)
 
 func (p *Post) extractImages() {
 	p.Images = extractImages(p.Content)
@@ -1553,11 +1554,17 @@ func extractImages(content string) []string {
 	matches := extract.ExtractUrls(content)
 	urls := map[string]bool{}
 	for i := range matches {
-		u := matches[i].Text
-		if !imageURLRegex.MatchString(u) {
+		uRaw := matches[i].Text
+		// Parse the extracted text so we can examine the path
+		u, err := url.Parse(uRaw)
+		if err != nil {
 			continue
 		}
-		urls[u] = true
+		// Ensure the path looks like it leads to an image file
+		if !imageURLRegex.MatchString(u.Path) {
+			continue
+		}
+		urls[uRaw] = true
 	}
 
 	resURLs := make([]string, 0)