Transcript semantic parsing (#6852)

2024-02-05 04:42:59 +08:00 · 2024-02-05 04:42:59 +08:00 · 7c4f19c979
commit 7c4f19c979
parent 27e9bf36b1
11 changed files with 479 additions and 0 deletions
--- a/model/src/main/java/de/danoeh/antennapod/model/feed/FeedItem.java
+++ b/model/src/main/java/de/danoeh/antennapod/model/feed/FeedItem.java
@ -46,6 +46,7 @@ public class FeedItem implements Serializable {
    private String podcastIndexTranscriptUrl;
    private String podcastIndexTranscriptType;
    private String podcastIndexTranscriptText;
    private Transcript transcript;
    private int state;
    public static final int NEW = -1;
@ -463,6 +464,14 @@ public class FeedItem implements Serializable {
        }
    }
    /**
     * Returns the transcript currently stored on this item.
     *
     * @return the value last passed to {@link #setTranscript(Transcript)}; may be {@code null}
     */
    public Transcript getTranscript() {
        return this.transcript;
    }
    /**
     * Stores the given transcript on this item, replacing any previous one.
     *
     * @param t the transcript to keep; stored as-is without copying
     */
    public void setTranscript(Transcript t) {
        this.transcript = t;
    }
    /**
     * Returns the transcript text stored on this item.
     *
     * @return the current value of the {@code podcastIndexTranscriptText} field; may be {@code null}
     */
    public String getPodcastIndexTranscriptText() {
        return this.podcastIndexTranscriptText;
    }
--- a/model/src/main/java/de/danoeh/antennapod/model/feed/Transcript.java
+++ b/model/src/main/java/de/danoeh/antennapod/model/feed/Transcript.java
@ -0,0 +1,28 @@
 package de.danoeh.antennapod.model.feed;
 import java.util.Map;
 import java.util.TreeMap;
 /**
  * A transcript represented as segments ordered by their start time.
  *
  * <p>Times passed to the lookup methods are compared directly against
  * {@link TranscriptSegment#getStartTime()}, so they must use the same unit as the stored segments.
  */
 public class Transcript {
    // Sorted by segment start time so that floor/ceiling lookups can locate the segment around a position.
    private final TreeMap<Long, TranscriptSegment> segmentsMap = new TreeMap<>();

    /**
     * Adds a segment, keyed by its start time. A segment that was previously added with the same
     * start time is replaced.
     *
     * @param segment the segment to add; must not be {@code null}
     */
    public void addSegment(TranscriptSegment segment) {
        segmentsMap.put(segment.getStartTime(), segment);
    }

    /**
     * Returns the segment with the greatest start time that is less than or equal to {@code time}.
     * The end time of the segment is not checked.
     *
     * @param time the position to look up
     * @return the matching segment, or {@code null} if every segment starts after {@code time}
     */
    public TranscriptSegment getSegmentAtTime(long time) {
        // Look the entry up once instead of once for the null check and again for the value.
        Map.Entry<Long, TranscriptSegment> entry = segmentsMap.floorEntry(time);
        return entry == null ? null : entry.getValue();
    }

    /**
     * Returns the number of segments stored in this transcript.
     */
    public int getSegmentCount() {
        return segmentsMap.size();
    }

    /**
     * Returns the entry with the smallest start time that is greater than or equal to {@code time}.
     * Despite the method name, a segment starting exactly at {@code time} is returned as well.
     *
     * @param time the position to look up
     * @return the matching entry (start time and segment), or {@code null} if no segment starts at or
     *         after {@code time}
     */
    public Map.Entry<Long, TranscriptSegment> getEntryAfterTime(long time) {
        return segmentsMap.ceilingEntry(time);
    }
 }
--- a/model/src/main/java/de/danoeh/antennapod/model/feed/TranscriptSegment.java
+++ b/model/src/main/java/de/danoeh/antennapod/model/feed/TranscriptSegment.java
@ -0,0 +1,31 @@
 package de.danoeh.antennapod.model.feed;
 /**
  * Immutable value holder for one piece of a transcript: a time range, the spoken words,
  * and the speaker they are attributed to.
  */
 public class TranscriptSegment {
    private final long startTime;
    private final long endTime;
    private final String words;
    private final String speaker;

    /**
     * Creates a segment. All values are stored exactly as given; nothing is validated.
     *
     * @param startTime start of the segment
     * @param endTime end of the segment
     * @param words text spoken during the segment
     * @param speaker name of the speaker
     */
    public TranscriptSegment(long startTime, long endTime, String words, String speaker) {
        this.startTime = startTime;
        this.endTime = endTime;
        this.words = words;
        this.speaker = speaker;
    }

    /** Returns the start time given at construction. */
    public long getStartTime() {
        return this.startTime;
    }

    /** Returns the end time given at construction. */
    public long getEndTime() {
        return this.endTime;
    }

    /** Returns the text given at construction. */
    public String getWords() {
        return this.words;
    }

    /** Returns the speaker given at construction. */
    public String getSpeaker() {
        return this.speaker;
    }
 }
--- a/parser/transcript/README.md
+++ b/parser/transcript/README.md
@ -0,0 +1,3 @@
 # :parser:transcript
 This module provides parsers that turn podcast transcripts (JSON and SRT) into `Transcript` model objects.
--- a/parser/transcript/build.gradle
+++ b/parser/transcript/build.gradle
@ -0,0 +1,23 @@
 // Build script for the :parser:transcript Android library module.
 plugins {
    id("com.android.library")
 }
 // Settings shared between modules live in the common script two directories up.
 apply from: "../../common.gradle"
 android {
    namespace "de.danoeh.antennapod.parser.transcript"
 }
 dependencies {
    // Provides the Transcript and TranscriptSegment model classes that the parsers produce.
    implementation project(':model')
    annotationProcessor "androidx.annotation:annotation:$annotationVersion"
    implementation "androidx.core:core:$coreVersion"
    // StringUtils (isBlank, trim) is used by the parsers.
    implementation "org.apache.commons:commons-lang3:$commonslangVersion"
    implementation "commons-io:commons-io:$commonsioVersion"
    // The parsers use jsoup's StringUtil.isBlank.
    implementation "org.jsoup:jsoup:$jsoupVersion"
    // The unit tests run with JUnit 4 under the Robolectric test runner.
    testImplementation "junit:junit:$junitVersion"
    testImplementation "org.robolectric:robolectric:$robolectricVersion"
 }
--- a/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/JsonTranscriptParser.java
+++ b/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/JsonTranscriptParser.java
@ -0,0 +1,65 @@
 package de.danoeh.antennapod.parser.transcript;
 import org.apache.commons.lang3.StringUtils;
 import org.json.JSONArray;
 import org.json.JSONObject;
 import org.jsoup.internal.StringUtil;
 import de.danoeh.antennapod.model.feed.Transcript;
 import de.danoeh.antennapod.model.feed.TranscriptSegment;
 /**
  * Parser for transcripts delivered as JSON.
  */
 public class JsonTranscriptParser {
    /**
     * Parses a JSON transcript into a {@link Transcript}.
     *
     * <p>The document must be a JSON object with a {@code segments} array. For each entry the
     * {@code startTime} and {@code endTime} values are multiplied by 1000 and truncated to a {@code long};
     * entries where either resulting value is negative (including missing or non-numeric values) are
     * skipped. Consecutive valid entries are merged until their accumulated duration reaches
     * {@link TranscriptParser#MIN_SPAN}. A merged segment starts at the first merged entry, ends at the
     * last merged entry and carries the {@code speaker} of the last merged entry (empty if absent).
     *
     * @param jsonStr the JSON document; may be {@code null} or blank
     * @return the parsed transcript, or {@code null} if the input is blank, is not valid JSON, has no
     *         {@code segments} array, or yields no segment
     */
    public static Transcript parse(String jsonStr) {
        if (StringUtils.isBlank(jsonStr)) {
            return null;
        }
        try {
            Transcript transcript = new Transcript();
            long segmentStartTime = -1L;
            // Tracked separately from the per-entry end time so that an invalid entry, which is skipped,
            // can never leak its bogus end time into the segment that is still being accumulated.
            long segmentEndTime = -1L;
            long duration = 0L;
            String speaker = "";
            StringBuilder segmentBody = new StringBuilder();

            JSONObject obj = new JSONObject(jsonStr);
            JSONArray objSegments = obj.getJSONArray("segments");

            for (int i = 0; i < objSegments.length(); i++) {
                JSONObject jsonObject = objSegments.getJSONObject(i);
                long startTime = (long) (jsonObject.optDouble("startTime", -1) * 1000L);
                long endTime = (long) (jsonObject.optDouble("endTime", -1) * 1000L);
                if (startTime < 0 || endTime < 0) {
                    continue;
                }

                if (segmentStartTime == -1L) {
                    segmentStartTime = startTime;
                }
                segmentEndTime = endTime;
                duration += endTime - startTime;
                speaker = jsonObject.optString("speaker");
                segmentBody.append(jsonObject.optString("body")).append(" ");

                if (duration >= TranscriptParser.MIN_SPAN) {
                    transcript.addSegment(new TranscriptSegment(segmentStartTime, segmentEndTime,
                            StringUtils.trim(segmentBody.toString()), speaker));
                    duration = 0L;
                    segmentBody.setLength(0);
                    segmentStartTime = -1L;
                    segmentEndTime = -1L;
                }
            }

            // Flush whatever is left over that did not reach MIN_SPAN.
            if (!StringUtil.isBlank(segmentBody.toString())) {
                transcript.addSegment(new TranscriptSegment(segmentStartTime, segmentEndTime,
                        StringUtils.trim(segmentBody.toString()), speaker));
            }

            if (transcript.getSegmentCount() > 0) {
                return transcript;
            } else {
                return null;
            }
        } catch (org.json.JSONException e) {
            // Malformed input is reported to the caller as "no transcript".
            e.printStackTrace();
        }
        return null;
    }
 }
--- a/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/SrtTranscriptParser.java
+++ b/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/SrtTranscriptParser.java
@ -0,0 +1,118 @@
 package de.danoeh.antennapod.parser.transcript;
 import org.apache.commons.lang3.StringUtils;
 import org.jsoup.internal.StringUtil;
 import java.util.Arrays;
 import java.util.Iterator;
 import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import de.danoeh.antennapod.model.feed.Transcript;
 import de.danoeh.antennapod.model.feed.TranscriptSegment;
 /**
  * Parser for transcripts delivered in SubRip (SRT) format.
  */
 public class SrtTranscriptParser {
    private static final Pattern TIMECODE_PATTERN = Pattern.compile("^([0-9]{2}):([0-9]{2}):([0-9]{2}),([0-9]{3})$");

    /**
     * Parses SRT text into a {@link Transcript}.
     *
     * <p>Every line containing {@code -->} is treated as a cue header; the lines that follow it, up to the
     * next blank line, form the cue text. Cues whose start or end timecode cannot be parsed are skipped,
     * as are all lines outside a cue (such as cue numbers). If the cue text contains a colon, the part
     * before the first colon is taken as the speaker and the remainder as the text; the speaker is kept
     * for following cues until another one is found. Consecutive cues are merged until their accumulated
     * duration reaches {@link TranscriptParser#MIN_SPAN}.
     *
     * @param str the SRT document; may be {@code null} or blank
     * @return the parsed transcript, or {@code null} if the input is blank or yields no segment
     */
    public static Transcript parse(String str) {
        if (StringUtils.isBlank(str)) {
            return null;
        }
        str = str.replaceAll("\r\n", "\n");
        Transcript transcript = new Transcript();
        List<String> lines = Arrays.asList(str.split("\n"));
        Iterator<String> iter = lines.iterator();
        String speaker = "";
        String segmentBody = "";
        long spanStartTimecode = -1L;
        // End of the last VALID cue merged into the pending span. Kept apart from the per-cue values so
        // that a later cue with broken timecodes cannot invalidate text that is still waiting to be flushed.
        long spanEndTimecode = -1L;
        long duration = 0L;

        while (iter.hasNext()) {
            String line = iter.next();
            if (!line.contains("-->")) {
                continue; // blank line, cue number, or text belonging to a skipped cue
            }
            String[] timecodes = line.split("-->");
            if (timecodes.length < 2) {
                continue;
            }
            long startTimecode = parseTimecode(timecodes[0].trim());
            long endTimecode = parseTimecode(timecodes[1].trim());
            if (startTimecode == -1 || endTimecode == -1) {
                continue;
            }

            if (spanStartTimecode == -1) {
                spanStartTimecode = startTimecode;
            }
            spanEndTimecode = endTimecode;
            duration += endTimecode - startTimecode;

            // Collect the cue text. The iterator is checked BEFORE advancing so that a cue header on the
            // very last line of the input does not throw NoSuchElementException.
            StringBuilder body = new StringBuilder();
            while (iter.hasNext()) {
                String textLine = iter.next();
                if (StringUtil.isBlank(textLine)) {
                    break;
                }
                // StringUtils.strip rather than String.strip(), which only exists since Java 11.
                body.append(StringUtils.strip(textLine));
                body.append(" ");
            }

            String text = body.toString().trim();
            int colon = text.indexOf(':');
            if (colon != -1) {
                // Split on the FIRST colon only, so that later colons (e.g. "10:30") stay in the text.
                speaker = text.substring(0, colon);
                text = StringUtils.strip(text.substring(colon + 1));
            }
            if (StringUtil.isBlank(text)) {
                continue;
            }

            segmentBody = StringUtils.trim(segmentBody + " " + text);
            if (duration >= TranscriptParser.MIN_SPAN && spanEndTimecode > spanStartTimecode) {
                transcript.addSegment(new TranscriptSegment(spanStartTimecode,
                        spanEndTimecode,
                        segmentBody,
                        speaker));
                duration = 0L;
                spanStartTimecode = -1L;
                spanEndTimecode = -1L;
                segmentBody = "";
            }
        }

        // Flush whatever is left over that did not reach MIN_SPAN.
        if (!StringUtil.isBlank(segmentBody) && spanEndTimecode > spanStartTimecode) {
            segmentBody = StringUtils.trim(segmentBody);
            transcript.addSegment(new TranscriptSegment(spanStartTimecode,
                    spanEndTimecode,
                    segmentBody,
                    speaker));
        }

        if (transcript.getSegmentCount() > 0) {
            return transcript;
        } else {
            return null;
        }
    }

    /**
     * Converts a timecode of the exact form {@code 00:00:00,000} (hours:minutes:seconds,milliseconds)
     * into milliseconds.
     *
     * @param timecode the timecode text, without surrounding whitespace
     * @return the timecode in milliseconds, or {@code -1} if the text does not match the expected form
     */
    static long parseTimecode(String timecode) {
        Matcher matcher = TIMECODE_PATTERN.matcher(timecode);
        if (!matcher.matches()) {
            return -1;
        }
        long hours = Integer.parseInt(matcher.group(1));
        long minutes = Integer.parseInt(matcher.group(2));
        long seconds = Integer.parseInt(matcher.group(3));
        long milliseconds = Integer.parseInt(matcher.group(4));
        return (hours * 60 * 60 * 1000) + (minutes * 60 * 1000) + (seconds * 1000) + milliseconds;
    }
 }
--- a/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/TranscriptParser.java
+++ b/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/TranscriptParser.java
@ -0,0 +1,24 @@
 package de.danoeh.antennapod.parser.transcript;
 import org.apache.commons.lang3.StringUtils;
 import de.danoeh.antennapod.model.feed.Transcript;
 /**
  * Chooses the transcript parser that matches a given MIME type.
  */
 public class TranscriptParser {
    static final long MIN_SPAN = 1000L; // merge short segments together to form a span of 1 second

    /**
     * Parses transcript text according to its MIME type.
     *
     * @param str the raw transcript text
     * @param type the MIME type of {@code str}
     * @return the parsed transcript, or {@code null} if {@code str} is {@code null} or blank, the type is
     *         {@code null} or not supported, or the selected parser returns {@code null}
     */
    public static Transcript parse(String str, String type) {
        // isBlank also covers null; a null type can never match and would break the switch below.
        if (StringUtils.isBlank(str) || type == null) {
            return null;
        }
        switch (type) {
            case "application/json":
                return JsonTranscriptParser.parse(str);
            case "application/srt":
            case "application/srr":
            case "application/x-subrip":
                return SrtTranscriptParser.parse(str);
            default:
                return null;
        }
    }
 }
--- a/parser/transcript/src/test/java/de/danoeh/antennapod/parser/transcript/JsonTranscriptParserTest.java
+++ b/parser/transcript/src/test/java/de/danoeh/antennapod/parser/transcript/JsonTranscriptParserTest.java
@ -0,0 +1,84 @@
 package de.danoeh.antennapod.parser.transcript;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.robolectric.RobolectricTestRunner;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNull;
 import de.danoeh.antennapod.model.feed.Transcript;
/**
 * Tests for {@link JsonTranscriptParser} and for the JSON branch of {@link TranscriptParser}.
 */
@RunWith(RobolectricTestRunner.class)
 public class JsonTranscriptParserTest {
    private static String jsonStr = "{'version': '1.0.0', "
            + "'segments': [ "
            + "{ 'speaker' : 'John Doe', 'startTime': 0.8, 'endTime': 1.9, 'body': 'And' },"
            + "{ 'speaker' : 'Sally Green', 'startTime': 1.91, 'endTime': 2.8, 'body': 'this merges' },"
            + "{ 'startTime': 2.9, 'endTime': 3.4, 'body': 'the' },"
            + "{ 'startTime': 3.5, 'endTime': 3.6, 'body': 'person' }]}";

    @Test
    public void testParseJson() {
        Transcript result = JsonTranscriptParser.parse(jsonStr);

        // Nothing starts at or before 0 ms
        assertNull(result.getSegmentAtTime(0L));
        assertEquals("John Doe", result.getSegmentAtTime(800L).getSpeaker());
        assertEquals(800L, result.getSegmentAtTime(800L).getStartTime());
        assertEquals(1900L, result.getSegmentAtTime(800L).getEndTime());
        assertEquals(1910L, (long) result.getEntryAfterTime(1800L).getKey());
        // 2 segments get merged into at least 1 second
        assertEquals("this merges the", result.getEntryAfterTime(1800L).getValue().getWords());
    }

    @Test
    public void testParse() {
        String type = "application/json";
        Transcript result = TranscriptParser.parse(jsonStr, type);
        // There isn't a segment at 900L, so go backwards and get the segment at 800L
        assertEquals("John Doe", result.getSegmentAtTime(900L).getSpeaker());
        assertEquals("And", result.getSegmentAtTime(930L).getWords());

        // blank string
        String blankStr = "";
        result = TranscriptParser.parse(blankStr, type);
        assertNull(result);

        result = TranscriptParser.parse(null, type);
        assertNull(result);

        // All blank lines
        String allNewlinesStr = "\r\n\r\n\r\n\r\n";
        result = TranscriptParser.parse(allNewlinesStr, type);
        assertNull(result);

        // segments is missing
        String jsonStrBad1 = "{'version': '1.0.0', "
                + "'segmentsX': [ "
                + "{ 'speaker' : 'John Doe', 'startTime': 0.8, 'endTime': 1.9, 'body': 'And' },"
                + "{ 'startTime': 2.9, 'endTime': 3.4, 'body': 'the' },"
                + "{ 'startTime': 3.5, 'endTime': 3.6, 'body': 'person' }]}";
        result = TranscriptParser.parse(jsonStrBad1, type);
        assertNull(result);

        // invalid time formatting
        String jsonStrBad2 = "{'version': '1.0.0', "
                + "'segments': [ "
                + "{ 'speaker' : 'XJohn Doe', 'startTime': stringTime, 'endTime': stringTime, 'body': 'And' },"
                + "{ 'XstartTime': 2.9, 'XendTime': 3.4, 'body': 'the' },"
                + "{ 'startTime': '-2.9', 'endTime': '-3.4', 'body': 'the' },"
                + "{ 'startTime': 'bad_time', 'endTime': '-3.4', 'body': 'the' }]}";
        result = TranscriptParser.parse(jsonStrBad2, type);
        assertNull(result);

        // Just plain text
        String strBad3 = "John Doe: Promoting your podcast in a new\n\n"
                + "way. The latest from PogNews.";
        result = TranscriptParser.parse(strBad3, type);
        assertNull(result);

        // passing the wrong type
        type = "application/srt";
        result = TranscriptParser.parse(jsonStr, type);
        assertNull(result);
    }
 }
--- a/parser/transcript/src/test/java/de/danoeh/antennapod/parser/transcript/SrtTranscriptParserTest.java
+++ b/parser/transcript/src/test/java/de/danoeh/antennapod/parser/transcript/SrtTranscriptParserTest.java
@ -0,0 +1,93 @@
 package de.danoeh.antennapod.parser.transcript;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.robolectric.RobolectricTestRunner;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNull;
 import de.danoeh.antennapod.model.feed.Transcript;
/**
 * Tests for {@link SrtTranscriptParser} and for the SRT branch of {@link TranscriptParser}.
 */
@RunWith(RobolectricTestRunner.class)
 public class SrtTranscriptParserTest {
    private static String srtStr = "1\n"
            + "00:00:00,000 --> 00:00:02,730\n"
            + "John Doe: Promoting your podcast in a new\n\n"
            + "2\n"
            + "00:00:02,730 --> 00:00:04,600\n"
            + "way. The latest from PogNews.\n\n"
            + "00:00:04,730 --> 00:00:05,600\n"
            + "way. The latest from PogNews.";

    @Test
    public void testParseSrt() {
        Transcript result = SrtTranscriptParser.parse(srtStr);

        assertEquals("Promoting your podcast in a new", result.getSegmentAtTime(0L).getWords());
        assertEquals("John Doe", result.getSegmentAtTime(0L).getSpeaker());
        assertEquals(0L, result.getSegmentAtTime(0L).getStartTime());
        assertEquals(2730L, result.getSegmentAtTime(0L).getEndTime());
        assertEquals(2730L, (long) result.getEntryAfterTime(1000L).getKey());
        assertEquals("way. The latest from PogNews.", result.getEntryAfterTime(1000L).getValue().getWords());
    }

    @Test
    public void testParse() {
        String type = "application/srr";
        Transcript result;

        result = TranscriptParser.parse(srtStr, type);
        // There isn't a segment at 800L, so go backwards and get the segment at 0L
        assertEquals("Promoting your podcast in a new", result.getSegmentAtTime(800L).getWords());

        result = TranscriptParser.parse(null, type);
        assertNull(result);

        // blank string
        String blankStr = "";
        result = TranscriptParser.parse(blankStr, type);
        assertNull(result);

        // All empty lines
        String allNewlinesStr = "\r\n\r\n\r\n\r\n";
        result = TranscriptParser.parse(allNewlinesStr, type);
        assertNull(result);

        // first segment has invalid time formatting, so the entire segment will be thrown out
        String srtStrBad1 = "00:0000,000 --> 00:00:02,730\n"
                + "John Doe: Promoting your podcast in a new\n\n"
                + "2\n"
                + "00:00:02,730 --> 00:00:04,600\n"
                + "way. The latest from PogNews.";
        result = TranscriptParser.parse(srtStrBad1, type);
        assertEquals("way. The latest from PogNews.", result.getSegmentAtTime(2730L).getWords());

        // first segment has invalid time in end time, 2nd segment has invalid time in both start time and end time
        String srtStrBad2 = "00:00:00,000 --> 00:0002,730\n"
                + "Jane Doe: Promoting your podcast in a new\n\n"
                + "2\n"
                + "badstarttime --> badendtime\n"
                + "way. The latest from PogNews.\n"
                + "badstarttime -->\n"
                + "Jane Doe says something\n"
                + "00:00:00,000 --> 00:00:02,730\n"
                + "Jane Doe:";
        result = TranscriptParser.parse(srtStrBad2, type);
        assertNull(result);

        // Just plain text
        String strBad3 = "John Doe: Promoting your podcast in a new\n\n"
                + "way. The latest from PogNews.";
        result = TranscriptParser.parse(strBad3, type);
        assertNull(result);

        // passing the wrong type
        type = "application/json";
        result = TranscriptParser.parse(srtStr, type);
        assertNull(result);

        type = "unknown";
        result = TranscriptParser.parse(srtStr, type);
        assertNull(result);
    }
 }
--- a/settings.gradle
+++ b/settings.gradle
@ -30,6 +30,7 @@ include ':net:sync:service'
 include ':parser:feed'
 include ':parser:media'
 include ':parser:transcript'
 include ':playback:base'
 include ':playback:cast'
		`@ -0,0 +1,3 @@`
							`# :parser:transcript`

							`This module provides parsing for transcripts`