Only mark items as duplicates if duration and date are similar

This commit is contained in:
ByteHamster 2021-12-14 22:11:36 +01:00
parent 19dfa08905
commit dde499f5b1
3 changed files with 137 additions and 30 deletions

View File

@ -352,42 +352,13 @@ public final class DBTasks {
*/
private static FeedItem searchFeedItemGuessDuplicate(List<FeedItem> items, FeedItem searchItem) {
// NOTE(review): this listing looks like a rendered diff in which removed and added lines are
// interleaved and the +/- markers were lost; the braces below do not balance as displayed.
// Confirm the real control flow against the actual file before relying on these comments.
for (FeedItem item : items) {
// Requires media on both items, with stream URLs that are non-empty and equal.
if ((item.getMedia() != null)
&& (searchItem.getMedia() != null)
&& !TextUtils.isEmpty(item.getMedia().getStreamUrl())
&& !TextUtils.isEmpty(searchItem.getMedia().getStreamUrl())
&& TextUtils.equals(item.getMedia().getStreamUrl(), searchItem.getMedia().getStreamUrl())) {
// Hands the duplicate decision for this candidate to FeedItemDuplicateGuesser.
if (FeedItemDuplicateGuesser.seemDuplicates(item, searchItem)) {
return item;
} else if (titlesLookSimilar(item.getTitle(), searchItem.getTitle())) {
// A missing publication date on either side makes the date comparison impossible,
// so this candidate is skipped.
if (searchItem.getPubDate() == null || item.getPubDate() == null) {
continue;
}
long dateOriginal = item.getPubDate().getTime();
long dateNew = searchItem.getPubDate().getTime();
// The threshold is 7 days expressed in milliseconds.
if (Math.abs(dateOriginal - dateNew) < 7L * 24L * 3600L * 1000L) { // Same week
return item;
}
}
}
return null;
}
/**
 * Compares two titles after canonicalization.
 *
 * @param title1 first title
 * @param title2 second title
 * @return {@code false} if {@code TextUtils.isEmpty} reports either title as empty, otherwise
 *         whether the canonicalized titles are equal
 */
private static boolean titlesLookSimilar(String title1, String title2) {
    boolean bothPresent = !TextUtils.isEmpty(title1) && !TextUtils.isEmpty(title2);
    return bothPresent && canonicalizeTitle(title1).equals(canonicalizeTitle(title2));
}
/**
 * Produces a canonical form of a title: surrounding whitespace is trimmed, the typographic
 * quotes “ ” „ become a plain double quote, and the em dash becomes a hyphen-minus.
 *
 * @param title title to canonicalize; must not be {@code null}
 * @return the canonicalized title
 */
private static String canonicalizeTitle(String title) {
    String canonical = title.trim();
    canonical = canonical.replace('“', '"');
    canonical = canonical.replace('”', '"');
    canonical = canonical.replace('„', '"');
    canonical = canonical.replace('—', '-');
    return canonical;
}
/**
* Adds new Feeds to the database or updates the old versions if they already exist. If another Feed with the same
* identifying value already exists, this method will add new FeedItems from the new Feed to the existing Feed.

View File

@ -0,0 +1,70 @@
package de.danoeh.antennapod.core.storage;
import android.text.TextUtils;
import de.danoeh.antennapod.model.feed.FeedItem;
import de.danoeh.antennapod.model.feed.FeedMedia;
import java.text.DateFormat;
import java.util.Locale;
/**
 * Heuristic duplicate detection for feed items.
 *
 * <p>Publishers sometimes break their feed by listing an episode twice or by changing the ID of an
 * episode that already exists. This class guesses whether two items are really the same episode,
 * even though the feed presents them as different ones.
 */
public class FeedItemDuplicateGuesser {
    /**
     * Two durations whose absolute difference is below this value count as similar.
     * The value is 10 * 60 * 1000 — presumably ten minutes in milliseconds; confirm the unit
     * returned by {@code FeedMedia.getDuration()}.
     */
    private static final long DURATION_TOLERANCE = 10 * 60L * 1000L;

    /**
     * Guesses whether two feed items describe the same episode.
     *
     * <p>The checks run in this order: equal non-empty item identifiers mean duplicate; a missing
     * media object on either item means no duplicate; equal non-empty stream URLs mean duplicate;
     * otherwise title, publication date, duration and MIME type must all match.
     *
     * @param item1 first item, must not be {@code null}
     * @param item2 second item, must not be {@code null}
     * @return {@code true} if the items look like duplicates of each other
     */
    public static boolean seemDuplicates(FeedItem item1, FeedItem item2) {
        if (sameAndNotEmpty(item1.getItemIdentifier(), item2.getItemIdentifier())) {
            return true;
        }

        final FeedMedia firstMedia = item1.getMedia();
        final FeedMedia secondMedia = item2.getMedia();
        final boolean bothHaveMedia = firstMedia != null && secondMedia != null;
        if (!bothHaveMedia) {
            return false;
        }

        if (sameAndNotEmpty(firstMedia.getStreamUrl(), secondMedia.getStreamUrl())) {
            return true;
        }

        // From here on every remaining attribute has to agree.
        if (!titlesLookSimilar(item1, item2)) {
            return false;
        }
        if (!datesLookSimilar(item1, item2)) {
            return false;
        }
        if (!durationsLookSimilar(firstMedia, secondMedia)) {
            return false;
        }
        return TextUtils.equals(firstMedia.getMime_type(), secondMedia.getMime_type());
    }

    /**
     * Returns {@code true} only if neither string is empty according to
     * {@code TextUtils.isEmpty} and both strings are equal.
     */
    private static boolean sameAndNotEmpty(String string1, String string2) {
        return !TextUtils.isEmpty(string1)
                && !TextUtils.isEmpty(string2)
                && string1.equals(string2);
    }

    /**
     * Returns {@code true} if both items have a publication date and the two dates render to the
     * same short US date string, so the time of day plays no role.
     */
    private static boolean datesLookSimilar(FeedItem item1, FeedItem item2) {
        boolean bothDated = item1.getPubDate() != null && item2.getPubDate() != null;
        if (!bothDated) {
            return false;
        }
        // A new formatter is created on every call; the pattern is MM/DD/YY.
        DateFormat dayFormat = DateFormat.getDateInstance(DateFormat.SHORT, Locale.US);
        String secondDay = dayFormat.format(item2.getPubDate());
        String firstDay = dayFormat.format(item1.getPubDate());
        return TextUtils.equals(secondDay, firstDay);
    }

    /**
     * Returns {@code true} if the two durations differ by less than {@link #DURATION_TOLERANCE}.
     */
    private static boolean durationsLookSimilar(FeedMedia media1, FeedMedia media2) {
        return Math.abs(media1.getDuration() - media2.getDuration()) < DURATION_TOLERANCE;
    }

    /**
     * Returns {@code true} if both canonicalized titles are non-empty and equal.
     */
    private static boolean titlesLookSimilar(FeedItem item1, FeedItem item2) {
        String firstTitle = canonicalizeTitle(item1.getTitle());
        String secondTitle = canonicalizeTitle(item2.getTitle());
        return sameAndNotEmpty(firstTitle, secondTitle);
    }

    /**
     * Produces a canonical form of a title: {@code null} becomes the empty string, surrounding
     * whitespace is trimmed, the typographic quotes “ ” „ become a plain double quote, and the
     * em dash becomes a hyphen-minus.
     */
    private static String canonicalizeTitle(String title) {
        if (title == null) {
            return "";
        }
        String canonical = title.trim();
        canonical = canonical.replace('“', '"');
        canonical = canonical.replace('”', '"');
        canonical = canonical.replace('„', '"');
        canonical = canonical.replace('—', '-');
        return canonical;
    }
}

View File

@ -0,0 +1,66 @@
package de.danoeh.antennapod.core.storage;
import de.danoeh.antennapod.model.feed.FeedItem;
import de.danoeh.antennapod.model.feed.FeedMedia;
import org.junit.Test;
import java.util.Date;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
/**
 * Unit tests for {@link FeedItemDuplicateGuesser}.
 */
public class FeedItemDuplicateGuesserTest {
    /** One minute, in milliseconds. */
    private static final long MINUTES = 1000 * 60;
    /** One day, in milliseconds. */
    private static final long DAYS = 24 * 60 * MINUTES;

    /** Items sharing an identifier are duplicates even when every other attribute differs. */
    @Test
    public void testSameId() {
        FeedItem first = item("id", "Title1", "example.com/episode1", 0, 5 * MINUTES, "audio/*");
        FeedItem second = item("id", "Title2", "example.com/episode2", 0, 20 * MINUTES, "video/*");
        assertTrue(FeedItemDuplicateGuesser.seemDuplicates(first, second));
    }

    /** With different ids and titles, only an identical download URL makes items duplicates. */
    @Test
    public void testDuplicateDownloadUrl() {
        FeedItem sameUrlFirst = item("id1", "Title1", "example.com/episode", 0, 5 * MINUTES, "audio/*");
        FeedItem sameUrlSecond = item("id2", "Title2", "example.com/episode", 0, 5 * MINUTES, "audio/*");
        assertTrue(FeedItemDuplicateGuesser.seemDuplicates(sameUrlFirst, sameUrlSecond));

        FeedItem otherUrlFirst = item("id1", "Title1", "example.com/episode1", 0, 5 * MINUTES, "audio/*");
        FeedItem otherUrlSecond = item("id2", "Title2", "example.com/episode2", 0, 5 * MINUTES, "audio/*");
        assertFalse(FeedItemDuplicateGuesser.seemDuplicates(otherUrlFirst, otherUrlSecond));
    }

    /** With different ids and URLs, the title, date, duration and MIME type decide. */
    @Test
    public void testOtherAttributes() {
        // Everything except id and URL is identical.
        FeedItem identicalFirst = item("id1", "Title", "example.com/episode1", 10, 5 * MINUTES, "audio/*");
        FeedItem identicalSecond = item("id2", "Title", "example.com/episode2", 10, 5 * MINUTES, "audio/*");
        assertTrue(FeedItemDuplicateGuesser.seemDuplicates(identicalFirst, identicalSecond));

        // Slightly different timestamp and duration value.
        FeedItem closeFirst = item("id1", "Title", "example.com/episode1", 10, 5 * MINUTES, "audio/*");
        FeedItem closeSecond = item("id2", "Title", "example.com/episode2", 20, 6 * MINUTES, "audio/*");
        assertTrue(FeedItemDuplicateGuesser.seemDuplicates(closeFirst, closeSecond));

        // Different MIME type.
        FeedItem audioItem = item("id1", "Title", "example.com/episode1", 10, 5 * MINUTES, "audio/*");
        FeedItem videoItem = item("id2", "Title", "example.com/episode2", 10, 5 * MINUTES, "video/*");
        assertFalse(FeedItemDuplicateGuesser.seemDuplicates(audioItem, videoItem));

        // Publication dates several days apart.
        FeedItem laterItem = item("id1", "Title", "example.com/episode1", 5 * DAYS, 5 * MINUTES, "audio/*");
        FeedItem earlierItem = item("id2", "Title", "example.com/episode2", 2 * DAYS, 5 * MINUTES, "audio/*");
        assertFalse(FeedItemDuplicateGuesser.seemDuplicates(laterItem, earlierItem));
    }

    /** An empty MIME type on both items does not prevent them from being duplicates. */
    @Test
    public void testNoMediaType() {
        FeedItem first = item("id1", "Title", "example.com/episode1", 2 * DAYS, 5 * MINUTES, "");
        FeedItem second = item("id2", "Title", "example.com/episode2", 2 * DAYS, 5 * MINUTES, "");
        assertTrue(FeedItemDuplicateGuesser.seemDuplicates(first, second));
    }

    /**
     * Builds a feed item with an attached media object from the given attributes.
     *
     * <p>NOTE(review): {@code duration} is passed as the third {@code FeedMedia} constructor
     * argument — confirm that this parameter really is the duration and not another field.
     */
    private FeedItem item(String guid, String title, String downloadUrl,
                          long date, long duration, String mime) {
        FeedItem feedItem = new FeedItem(0, title, guid, "link", new Date(date), FeedItem.PLAYED, null);
        feedItem.setMedia(new FeedMedia(feedItem, downloadUrl, duration, mime));
        return feedItem;
    }
}