Delete the image from the content or description only if it is a cover image, i.e. an img tag (optionally wrapped in a p or div tag) at the beginning of the string

This commit is contained in:
Shinokuni 2019-02-10 21:36:38 +00:00
parent 1a78feec81
commit c9b3e3f09f
3 changed files with 25 additions and 15 deletions

View File

@ -201,22 +201,26 @@ public class LocalFeedRepository extends ARepository implements QueryCallback {
if (dbItem.getImageLink() == null) { if (dbItem.getImageLink() == null) {
String imageUrl = HtmlParser.getDescImageLink(dbItem.getDescription(), feed.getSiteUrl()); String imageUrl = HtmlParser.getDescImageLink(dbItem.getDescription(), feed.getSiteUrl());
if (imageUrl != null) {
dbItem.setImageLink(imageUrl);
if (imageUrl != null)
dbItem.setImageLink(imageUrl);
}
}
// we check a second time because imageLink could have been set earlier with media:content tag value
if (dbItem.getImageLink() != null) {
if (dbItem.getContent() != null) { if (dbItem.getContent() != null) {
// removing cover image in content if found in description // removing cover image in content if found in description
dbItem.setContent(HtmlParser.deleteCoverImage(dbItem.getContent())); dbItem.setContent(HtmlParser.deleteCoverImage(dbItem.getContent()));
dbItem.setReadTime(Utils.readTimeFromString(dbItem.getContent())); dbItem.setReadTime(Utils.readTimeFromString(Jsoup.parse(dbItem.getContent()).text()));
} else } else if (dbItem.getDescription() != null) {
dbItem.setDescription(HtmlParser.deleteCoverImage(dbItem.getDescription()));
dbItem.setReadTime(Utils.readTimeFromString(dbItem.getCleanDescription())); dbItem.setReadTime(Utils.readTimeFromString(dbItem.getCleanDescription()));
} }
} }
}
database.itemDao().insert(dbItem); database.itemDao().insert(dbItem);
Log.d(TAG, "adding " + dbItem.getTitle());
} }
} }
} }

View File

@ -13,11 +13,14 @@ import org.jsoup.select.Elements;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.regex.Pattern;
public final class HtmlParser { public final class HtmlParser {
private static final String TAG = HtmlParser.class.getSimpleName(); private static final String TAG = HtmlParser.class.getSimpleName();
public static final String COVER_IMAGE_REGEX = "^(<p>|(<div.*>))?<img.*>";
/** /**
* Parse the html page to get all rss urls * Parse the html page to get all rss urls
* @param url url to request * @param url url to request
@ -113,6 +116,7 @@ public final class HtmlParser {
} }
public static String deleteCoverImage(String content) { public static String deleteCoverImage(String content) {
if (Pattern.compile(COVER_IMAGE_REGEX).matcher(content).find()) {
Document document = Jsoup.parse(content); Document document = Jsoup.parse(content);
Elements elements = document.select("img"); Elements elements = document.select("img");
@ -120,5 +124,7 @@ public final class HtmlParser {
elements.first().remove(); elements.first().remove();
return document.toString(); return document.toString();
} else
return content;
} }
} }

View File

@ -29,6 +29,6 @@ public class RSSMediaContent {
} }
public boolean isContentAnImage() { public boolean isContentAnImage() {
return medium.equals("image"); return medium.equals("image") || medium.equals("image/jpeg") || medium.equals("image/png");
} }
} }