Transcript semantic parsing (#6852)

2024-02-05 04:42:59 +08:00 · 2024-02-05 04:42:59 +08:00 · 7c4f19c979
parent 27e9bf36b1
commit 7c4f19c979
11 changed files with 479 additions and 0 deletions
--- a/model/src/main/java/de/danoeh/antennapod/model/feed/FeedItem.java
+++ b/model/src/main/java/de/danoeh/antennapod/model/feed/FeedItem.java
@ -46,6 +46,7 @@ public class FeedItem implements Serializable {
    private String podcastIndexTranscriptUrl;
    private String podcastIndexTranscriptType;
    private String podcastIndexTranscriptText;
+    private Transcript transcript;

    private int state;
    public static final int NEW = -1;
@ -463,6 +464,14 @@ public class FeedItem implements Serializable {
        }
    }

+    /**
+     * Returns the value of the {@code transcript} field.
+     * NOTE(review): the field is declared without an initializer, so this presumably returns
+     * {@code null} until {@link #setTranscript(Transcript)} has been called - confirm against the
+     * constructors, which are not visible here.
+     */
+    public Transcript getTranscript() {
+        return transcript;
+    }
+
+    /**
+     * Replaces the value of the {@code transcript} field.
+     *
+     * @param t the transcript to store; the value is stored as given, without any check
+     */
+    public void setTranscript(Transcript t) {
+        transcript = t;
+    }
+
    public String getPodcastIndexTranscriptText() {
        return podcastIndexTranscriptText;
    }
--- a/model/src/main/java/de/danoeh/antennapod/model/feed/Transcript.java
+++ b/model/src/main/java/de/danoeh/antennapod/model/feed/Transcript.java
@ -0,0 +1,28 @@
+package de.danoeh.antennapod.model.feed;
+
+import java.util.Map;
+import java.util.TreeMap;
+
+/**
+ * Collection of {@link TranscriptSegment}s ordered by, and looked up through, their start time.
+ */
+public class Transcript {
+
+    /** Segments keyed by the value of {@link TranscriptSegment#getStartTime()}. */
+    private final TreeMap<Long, TranscriptSegment> segmentsByStart = new TreeMap<>();
+
+    /**
+     * Stores a segment under its start time. A segment that was previously stored under the
+     * same start time is replaced.
+     *
+     * @param segment the segment to store
+     */
+    public void addSegment(TranscriptSegment segment) {
+        final long key = segment.getStartTime();
+        segmentsByStart.put(key, segment);
+    }
+
+    /**
+     * Finds the segment with the greatest start time that is less than or equal to {@code time}.
+     * Only start times are compared; the end time of the segment is not checked.
+     *
+     * @param time position to look up, in the same unit as the segments' start times
+     * @return the matching segment, or {@code null} if no segment starts at or before {@code time}
+     */
+    public TranscriptSegment getSegmentAtTime(long time) {
+        final Map.Entry<Long, TranscriptSegment> floor = segmentsByStart.floorEntry(time);
+        return floor != null ? floor.getValue() : null;
+    }
+
+    /**
+     * @return the number of segments currently stored
+     */
+    public int getSegmentCount() {
+        return segmentsByStart.size();
+    }
+
+    /**
+     * Finds the entry with the smallest start time that is greater than or equal to {@code time}.
+     *
+     * @param time position to look up, in the same unit as the segments' start times
+     * @return the start time / segment pair, or {@code null} if no segment starts at or after {@code time}
+     */
+    public Map.Entry<Long, TranscriptSegment> getEntryAfterTime(long time) {
+        final Map.Entry<Long, TranscriptSegment> ceiling = segmentsByStart.ceilingEntry(time);
+        return ceiling;
+    }
+}
--- a/model/src/main/java/de/danoeh/antennapod/model/feed/TranscriptSegment.java
+++ b/model/src/main/java/de/danoeh/antennapod/model/feed/TranscriptSegment.java
@ -0,0 +1,31 @@
+package de.danoeh.antennapod.model.feed;
+
+/**
+ * Immutable piece of a transcript: a text, the speaker it is attributed to, and the
+ * start and end time it covers.
+ */
+public class TranscriptSegment {
+    private final long startTime;
+    private final long endTime;
+    private final String words;
+    private final String speaker;
+
+    /**
+     * Creates a segment. All values are stored exactly as given.
+     *
+     * @param startTime time at which the segment begins
+     * @param endTime time at which the segment ends
+     * @param words text of the segment
+     * @param speaker name of the speaker the text is attributed to
+     */
+    public TranscriptSegment(long startTime, long endTime, String words, String speaker) {
+        this.startTime = startTime;
+        this.endTime = endTime;
+        this.words = words;
+        this.speaker = speaker;
+    }
+
+    /** @return the start time passed to the constructor */
+    public long getStartTime() {
+        return this.startTime;
+    }
+
+    /** @return the end time passed to the constructor */
+    public long getEndTime() {
+        return this.endTime;
+    }
+
+    /** @return the text passed to the constructor */
+    public String getWords() {
+        return this.words;
+    }
+
+    /** @return the speaker passed to the constructor */
+    public String getSpeaker() {
+        return this.speaker;
+    }
+}
--- a/parser/transcript/README.md
+++ b/parser/transcript/README.md
@ -0,0 +1,3 @@
+# :parser:transcript
+
+This module provides parsers for transcripts in the JSON and SRT (SubRip) formats.
--- a/parser/transcript/build.gradle
+++ b/parser/transcript/build.gradle
@ -0,0 +1,23 @@
+plugins {
+    id("com.android.library")
+}
+apply from: "../../common.gradle"
+
+android {
+    namespace "de.danoeh.antennapod.parser.transcript"
+}
+
+dependencies {
+    implementation project(':model')
+
+    annotationProcessor "androidx.annotation:annotation:$annotationVersion"
+
+    implementation "androidx.core:core:$coreVersion"
+
+    implementation "org.apache.commons:commons-lang3:$commonslangVersion"
+    implementation "commons-io:commons-io:$commonsioVersion"
+    implementation "org.jsoup:jsoup:$jsoupVersion"
+
+    testImplementation "junit:junit:$junitVersion"
+    testImplementation "org.robolectric:robolectric:$robolectricVersion"
+}
--- a/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/JsonTranscriptParser.java
+++ b/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/JsonTranscriptParser.java
@ -0,0 +1,65 @@
+package de.danoeh.antennapod.parser.transcript;
+
+import org.apache.commons.lang3.StringUtils;
+import org.json.JSONArray;
+import org.json.JSONObject;
+import org.jsoup.internal.StringUtil;
+
+import de.danoeh.antennapod.model.feed.Transcript;
+import de.danoeh.antennapod.model.feed.TranscriptSegment;
+
+/**
+ * Parser for JSON transcripts: a top-level object with a {@code segments} array whose entries
+ * carry {@code startTime}, {@code endTime}, {@code body} and optionally {@code speaker}.
+ */
+public class JsonTranscriptParser {
+    /**
+     * Builds a {@link Transcript} from a JSON document.
+     *
+     * <p>{@code startTime} and {@code endTime} are multiplied by 1000 before being stored. Entries
+     * whose start or end time is missing, unparsable or negative are skipped. Consecutive accepted
+     * entries are merged until their summed duration reaches {@link TranscriptParser#MIN_SPAN}; the
+     * merged segment starts at the first merged entry, ends at the last merged entry and carries
+     * the speaker of the last merged entry.
+     *
+     * @param jsonStr the JSON document
+     * @return the transcript, or {@code null} if the document is not valid JSON, has no
+     *         {@code segments} array, or yields no segment at all
+     */
+    public static Transcript parse(String jsonStr) {
+        try {
+            Transcript transcript = new Transcript();
+            long spanStartTime = -1L;
+            long spanEndTime = -1L;
+            long duration = 0L;
+            String speaker = "";
+            StringBuilder spanBody = new StringBuilder();
+            JSONObject obj = new JSONObject(jsonStr);
+            JSONArray objSegments = obj.getJSONArray("segments");
+
+            for (int i = 0; i < objSegments.length(); i++) {
+                JSONObject jsonObject = objSegments.getJSONObject(i);
+                long startTime = (long) (jsonObject.optDouble("startTime", -1) * 1000L);
+                long endTime = (long) (jsonObject.optDouble("endTime", -1) * 1000L);
+                if (startTime < 0 || endTime < 0) {
+                    continue;
+                }
+                if (spanStartTime == -1L) {
+                    spanStartTime = startTime;
+                }
+                // The span end is only advanced by accepted entries, so that a later entry with
+                // invalid times cannot turn the end time of the pending span into a negative value.
+                spanEndTime = endTime;
+                duration += endTime - startTime;
+
+                speaker = jsonObject.optString("speaker");
+                spanBody.append(jsonObject.optString("body")).append(" ");
+
+                if (duration >= TranscriptParser.MIN_SPAN) {
+                    String words = StringUtils.trim(spanBody.toString());
+                    transcript.addSegment(new TranscriptSegment(spanStartTime, spanEndTime, words, speaker));
+                    duration = 0L;
+                    spanBody.setLength(0);
+                    spanStartTime = -1L;
+                }
+            }
+
+            // Flush the text that never reached MIN_SPAN.
+            if (!StringUtil.isBlank(spanBody.toString())) {
+                String words = StringUtils.trim(spanBody.toString());
+                transcript.addSegment(new TranscriptSegment(spanStartTime, spanEndTime, words, speaker));
+            }
+
+            if (transcript.getSegmentCount() > 0) {
+                return transcript;
+            } else {
+                return null;
+            }
+
+        } catch (org.json.JSONException e) {
+            e.printStackTrace();
+        }
+        return null;
+    }
+}
--- a/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/SrtTranscriptParser.java
+++ b/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/SrtTranscriptParser.java
@ -0,0 +1,118 @@
+package de.danoeh.antennapod.parser.transcript;
+
+import org.apache.commons.lang3.StringUtils;
+import org.jsoup.internal.StringUtil;
+
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import de.danoeh.antennapod.model.feed.Transcript;
+import de.danoeh.antennapod.model.feed.TranscriptSegment;
+
+/**
+ * Parser for SubRip (SRT) style transcripts: cues made of a
+ * {@code hh:mm:ss,SSS --> hh:mm:ss,SSS} line followed by one or more lines of text.
+ */
+public class SrtTranscriptParser {
+    private static final Pattern TIMECODE_PATTERN = Pattern.compile("^([0-9]{2}):([0-9]{2}):([0-9]{2}),([0-9]{3})$");
+
+    /**
+     * Builds a {@link Transcript} from SRT text.
+     *
+     * <p>Cues whose timecode line cannot be parsed are skipped together with their text. The text
+     * of consecutive valid cues is merged until the summed cue duration reaches
+     * {@link TranscriptParser#MIN_SPAN}. If the text of a cue contains a colon, the part before
+     * the first colon is taken as the speaker and the remainder as the text; the speaker stays
+     * in effect for following cues until another one is found.
+     *
+     * @param str the SRT document
+     * @return the transcript, or {@code null} if {@code str} is blank or yields no segment
+     */
+    public static Transcript parse(String str) {
+        if (StringUtils.isBlank(str)) {
+            return null;
+        }
+        str = str.replaceAll("\r\n", "\n");
+
+        Transcript transcript = new Transcript();
+        List<String> lines = Arrays.asList(str.split("\n"));
+        Iterator<String> iter = lines.iterator();
+        String speaker = "";
+        StringBuilder body = new StringBuilder();
+        String line;
+        String segmentBody = "";
+        long spanStartTimecode = -1L;
+        long endTimecode = -1L;
+        long duration = 0L;
+
+        while (iter.hasNext()) {
+            line = iter.next();
+
+            if (line.isEmpty()) {
+                continue;
+            }
+
+            if (line.contains("-->")) {
+                String[] timecodes = line.split("-->");
+                if (timecodes.length < 2) {
+                    continue;
+                }
+                // Parse into locals first: an invalid cue must not overwrite the end time of the
+                // span that is still pending, otherwise its text would be lost in the final flush.
+                long cueStart = parseTimecode(timecodes[0].trim());
+                long cueEnd = parseTimecode(timecodes[1].trim());
+                if (cueStart == -1 || cueEnd == -1) {
+                    continue;
+                }
+                endTimecode = cueEnd;
+
+                if (spanStartTimecode == -1) {
+                    spanStartTimecode = cueStart;
+                }
+                duration += cueEnd - cueStart;
+                // Collect the text lines of the cue. hasNext() is checked before every next() so
+                // that a timecode line at the very end of the input does not throw.
+                while (iter.hasNext()) {
+                    line = iter.next();
+                    if (StringUtil.isBlank(line)) {
+                        break;
+                    }
+                    body.append(line.strip());
+                    body.append(" ");
+                }
+            }
+
+            if (body.indexOf(":") != -1) {
+                // Limit of 2: only the first colon separates the speaker, later colons
+                // (for example in "10:30") belong to the text and must be kept.
+                String[] parts = body.toString().trim().split(":", 2);
+                speaker = parts[0];
+                body = new StringBuilder(parts[1].strip());
+            }
+            if (!StringUtil.isBlank(body.toString())) {
+                segmentBody += " " + body;
+                segmentBody = StringUtils.trim(segmentBody);
+                if (duration >= TranscriptParser.MIN_SPAN && endTimecode > spanStartTimecode) {
+                    transcript.addSegment(new TranscriptSegment(spanStartTimecode,
+                            endTimecode,
+                            segmentBody,
+                            speaker));
+                    duration = 0L;
+                    spanStartTimecode = -1L;
+                    segmentBody = "";
+                }
+                body = new StringBuilder();
+            }
+        }
+
+        // Flush the text that never reached MIN_SPAN.
+        if (!StringUtil.isBlank(segmentBody) && endTimecode > spanStartTimecode) {
+            segmentBody = StringUtils.trim(segmentBody);
+            transcript.addSegment(new TranscriptSegment(spanStartTimecode,
+                    endTimecode,
+                    segmentBody,
+                    speaker));
+        }
+        if (transcript.getSegmentCount() > 0) {
+            return transcript;
+        } else {
+            return null;
+        }
+    }
+
+    /**
+     * Converts a timecode of the form {@code 00:00:00,000} (hours, minutes, seconds, milliseconds)
+     * to milliseconds.
+     *
+     * @param timecode the timecode, without surrounding whitespace
+     * @return the number of milliseconds, or {@code -1} if the timecode does not have that form
+     */
+    static long parseTimecode(String timecode) {
+        Matcher matcher = TIMECODE_PATTERN.matcher(timecode);
+        if (!matcher.matches()) {
+            return -1;
+        }
+        long hours = Long.parseLong(matcher.group(1));
+        long minutes = Long.parseLong(matcher.group(2));
+        long seconds = Long.parseLong(matcher.group(3));
+        long milliseconds = Long.parseLong(matcher.group(4));
+        return (hours * 60 * 60 * 1000) + (minutes * 60 * 1000) + (seconds * 1000) + milliseconds;
+    }
+}
--- a/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/TranscriptParser.java
+++ b/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/TranscriptParser.java
@ -0,0 +1,24 @@
+package de.danoeh.antennapod.parser.transcript;
+
+import org.apache.commons.lang3.StringUtils;
+
+import de.danoeh.antennapod.model.feed.Transcript;
+
+/**
+ * Entry point that picks the transcript parser matching a MIME type.
+ */
+public class TranscriptParser {
+    /** Short segments are merged together until they form a span of 1 second. */
+    static final long MIN_SPAN = 1000L;
+
+    /**
+     * Parses a transcript with the parser registered for {@code type}.
+     *
+     * @param str the transcript document
+     * @param type MIME type of the document
+     * @return the result of the selected parser, or {@code null} if {@code str} is null or blank
+     *         or if {@code type} is not one of the supported types
+     */
+    public static Transcript parse(String str, String type) {
+        // isBlank also covers null
+        if (StringUtils.isBlank(str)) {
+            return null;
+        }
+
+        final boolean isJson = "application/json".equals(type);
+        final boolean isSrt = "application/srt".equals(type)
+                || "application/srr".equals(type)
+                || "application/x-subrip".equals(type);
+
+        if (isJson) {
+            return JsonTranscriptParser.parse(str);
+        } else if (isSrt) {
+            return SrtTranscriptParser.parse(str);
+        } else {
+            return null;
+        }
+    }
+}
--- a/parser/transcript/src/test/java/de/danoeh/antennapod/parser/transcript/JsonTranscriptParserTest.java
+++ b/parser/transcript/src/test/java/de/danoeh/antennapod/parser/transcript/JsonTranscriptParserTest.java
@ -0,0 +1,84 @@
+package de.danoeh.antennapod.parser.transcript;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.robolectric.RobolectricTestRunner;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+import de.danoeh.antennapod.model.feed.Transcript;
+
+@RunWith(RobolectricTestRunner.class)
+/**
+ * Tests for {@link JsonTranscriptParser} and for the JSON branch of {@link TranscriptParser}.
+ */
+public class JsonTranscriptParserTest {
+    private static String jsonStr = "{'version': '1.0.0', "
+            + "'segments': [ "
+            + "{ 'speaker' : 'John Doe', 'startTime': 0.8, 'endTime': 1.9, 'body': 'And' },"
+            + "{ 'speaker' : 'Sally Green', 'startTime': 1.91, 'endTime': 2.8, 'body': 'this merges' },"
+            + "{ 'startTime': 2.9, 'endTime': 3.4, 'body': 'the' },"
+            + "{ 'startTime': 3.5, 'endTime': 3.6, 'body': 'person' }]}";
+
+    /** Valid document: checks segment lookup and the merging of short entries. */
+    @Test
+    public void testParseJson() {
+        Transcript result = JsonTranscriptParser.parse(jsonStr);
+
+        // Assertions take (expected, actual) so that failure messages read correctly
+        assertNull(result.getSegmentAtTime(0L));
+        assertEquals("John Doe", result.getSegmentAtTime(800L).getSpeaker());
+        assertEquals(800L, result.getSegmentAtTime(800L).getStartTime());
+        assertEquals(1900L, result.getSegmentAtTime(800L).getEndTime());
+        assertEquals(1910L, (long) result.getEntryAfterTime(1800L).getKey());
+        // 2 segments get merged into at least 1 second
+        assertEquals("this merges the", result.getEntryAfterTime(1800L).getValue().getWords());
+    }
+
+    /** Dispatch through {@link TranscriptParser} with valid, empty and malformed input. */
+    @Test
+    public void testParse() {
+        String type = "application/json";
+        Transcript result = TranscriptParser.parse(jsonStr, type);
+        // There isn't a segment at 900L, so go backwards and get the segment at 800L
+        assertEquals("John Doe", result.getSegmentAtTime(900L).getSpeaker());
+        assertEquals("And", result.getSegmentAtTime(930L).getWords());
+
+        // blank string
+        String blankStr = "";
+        result = TranscriptParser.parse(blankStr, type);
+        assertNull(result);
+
+        result = TranscriptParser.parse(null, type);
+        assertNull(result);
+
+        // All blank lines
+        String allNewlinesStr = "\r\n\r\n\r\n\r\n";
+        result = TranscriptParser.parse(allNewlinesStr, type);
+        assertNull(result);
+
+        // segments is missing
+        String jsonStrBad1 = "{'version': '1.0.0', "
+                + "'segmentsX': [ "
+                + "{ 'speaker' : 'John Doe', 'startTime': 0.8, 'endTime': 1.9, 'body': 'And' },"
+                + "{ 'startTime': 2.9, 'endTime': 3.4, 'body': 'the' },"
+                + "{ 'startTime': 3.5, 'endTime': 3.6, 'body': 'person' }]}";
+        result = TranscriptParser.parse(jsonStrBad1, type);
+        assertNull(result);
+
+        // invalid time formatting
+        String jsonStrBad2 = "{'version': '1.0.0', "
+                + "'segments': [ "
+                + "{ 'speaker' : 'XJohn Doe', 'startTime': stringTime, 'endTime': stringTime, 'body': 'And' },"
+                + "{ 'XstartTime': 2.9, 'XendTime': 3.4, 'body': 'the' },"
+                + "{ 'startTime': '-2.9', 'endTime': '-3.4', 'body': 'the' },"
+                + "{ 'startTime': 'bad_time', 'endTime': '-3.4', 'body': 'the' }]}";
+        result = TranscriptParser.parse(jsonStrBad2, type);
+        assertNull(result);
+
+        // Just plain text
+        String strBad3 = "John Doe: Promoting your podcast in a new\n\n"
+                + "way. The latest from PogNews.";
+        result = TranscriptParser.parse(strBad3, type);
+        assertNull(result);
+
+        // passing the wrong type
+        type = "application/srt";
+        result = TranscriptParser.parse(jsonStr, type);
+        assertNull(result);
+    }
+}
--- a/parser/transcript/src/test/java/de/danoeh/antennapod/parser/transcript/SrtTranscriptParserTest.java
+++ b/parser/transcript/src/test/java/de/danoeh/antennapod/parser/transcript/SrtTranscriptParserTest.java
@ -0,0 +1,93 @@
+package de.danoeh.antennapod.parser.transcript;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.robolectric.RobolectricTestRunner;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import de.danoeh.antennapod.model.feed.Transcript;
+
+@RunWith(RobolectricTestRunner.class)
+/**
+ * Tests for {@link SrtTranscriptParser} and for the SRT branch of {@link TranscriptParser}.
+ */
+public class SrtTranscriptParserTest {
+    private static String srtStr = "1\n"
+            + "00:00:00,000 --> 00:00:02,730\n"
+            + "John Doe: Promoting your podcast in a new\n\n"
+            + "2\n"
+            + "00:00:02,730 --> 00:00:04,600\n"
+            + "way. The latest from PogNews.\n\n"
+            + "00:00:04,730 --> 00:00:05,600\n"
+            + "way. The latest from PogNews.";
+
+    /** Valid document: checks speaker extraction, times and segment lookup. */
+    @Test
+    public void testParseSrt() {
+        Transcript result = SrtTranscriptParser.parse(srtStr);
+
+        // Assertions take (expected, actual) so that failure messages read correctly
+        assertEquals("Promoting your podcast in a new", result.getSegmentAtTime(0L).getWords());
+        assertEquals("John Doe", result.getSegmentAtTime(0L).getSpeaker());
+        assertEquals(0L, result.getSegmentAtTime(0L).getStartTime());
+        assertEquals(2730L, result.getSegmentAtTime(0L).getEndTime());
+        assertEquals(2730L, (long) result.getEntryAfterTime(1000L).getKey());
+        assertEquals("way. The latest from PogNews.", result.getEntryAfterTime(1000L).getValue().getWords());
+    }
+
+    /** Dispatch through {@link TranscriptParser} with valid, empty and malformed input. */
+    @Test
+    public void testParse() {
+        String type = "application/srr";
+        Transcript result;
+
+        result = TranscriptParser.parse(srtStr, type);
+        // There isn't a segment at 800L, so go backwards and get the segment at 0L
+        assertEquals("Promoting your podcast in a new", result.getSegmentAtTime(800L).getWords());
+
+        result = TranscriptParser.parse(null, type);
+        assertNull(result);
+
+        // blank string
+        String blankStr = "";
+        result = TranscriptParser.parse(blankStr, type);
+        assertNull(result);
+
+        // All empty lines
+        String allNewlinesStr = "\r\n\r\n\r\n\r\n";
+        result = TranscriptParser.parse(allNewlinesStr, type);
+        assertNull(result);
+
+        // first segment has invalid time formatting, so the entire segment will be thrown out
+        String srtStrBad1 = "00:0000,000 --> 00:00:02,730\n"
+                + "John Doe: Promoting your podcast in a new\n\n"
+                + "2\n"
+                + "00:00:02,730 --> 00:00:04,600\n"
+                + "way. The latest from PogNews.";
+        result = TranscriptParser.parse(srtStrBad1, type);
+        assertEquals("way. The latest from PogNews.", result.getSegmentAtTime(2730L).getWords());
+
+        // first segment has invalid time in end time, 2nd segment has invalid time in both start time and end time
+        String srtStrBad2 = "00:00:00,000 --> 00:0002,730\n"
+                + "Jane Doe: Promoting your podcast in a new\n\n"
+                + "2\n"
+                + "badstarttime --> badendtime\n"
+                + "way. The latest from PogNews.\n"
+                + "badstarttime -->\n"
+                + "Jane Doe says something\n"
+                + "00:00:00,000 --> 00:00:02,730\n"
+                + "Jane Doe:";
+        result = TranscriptParser.parse(srtStrBad2, type);
+        assertNull(result);
+
+        // Just plain text
+        String strBad3 = "John Doe: Promoting your podcast in a new\n\n"
+                + "way. The latest from PogNews.";
+        result = TranscriptParser.parse(strBad3, type);
+        assertNull(result);
+
+        // passing the wrong type
+        type = "application/json";
+        result = TranscriptParser.parse(srtStr, type);
+        assertNull(result);
+
+        type = "unknown";
+        result = TranscriptParser.parse(srtStr, type);
+        assertNull(result);
+    }
+}
+
--- a/settings.gradle
+++ b/settings.gradle
@ -30,6 +30,7 @@ include ':net:sync:service'

 include ':parser:feed'
 include ':parser:media'
+include ':parser:transcript'

 include ':playback:base'
 include ':playback:cast'