From 7c4f19c9798b7c2c0c059a17fdfe843731cba5b4 Mon Sep 17 00:00:00 2001
From: Tony Tam <149837+tonytamsf@users.noreply.github.com>
Date: Mon, 5 Feb 2024 04:42:59 +0800
Subject: [PATCH] Transcript semantic parsing (#6852)

---
 .../antennapod/model/feed/FeedItem.java       |   9 ++
 .../antennapod/model/feed/Transcript.java     |  40 ++++++
 .../model/feed/TranscriptSegment.java         |  34 +++++
 parser/transcript/README.md                   |   3 +
 parser/transcript/build.gradle                |  23 ++++
 .../transcript/JsonTranscriptParser.java      |  68 ++++++++++
 .../transcript/SrtTranscriptParser.java       | 119 ++++++++++++++++++
 .../parser/transcript/TranscriptParser.java   |  34 +++++
 .../transcript/JsonTranscriptParserTest.java  |  98 +++++++++++++++
 .../transcript/SrtTranscriptParserTest.java   | 106 ++++++++++++++++
 settings.gradle                               |   1 +
 11 files changed, 535 insertions(+)
 create mode 100644 model/src/main/java/de/danoeh/antennapod/model/feed/Transcript.java
 create mode 100644 model/src/main/java/de/danoeh/antennapod/model/feed/TranscriptSegment.java
 create mode 100644 parser/transcript/README.md
 create mode 100644 parser/transcript/build.gradle
 create mode 100644 parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/JsonTranscriptParser.java
 create mode 100644 parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/SrtTranscriptParser.java
 create mode 100644 parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/TranscriptParser.java
 create mode 100644 parser/transcript/src/test/java/de/danoeh/antennapod/parser/transcript/JsonTranscriptParserTest.java
 create mode 100644 parser/transcript/src/test/java/de/danoeh/antennapod/parser/transcript/SrtTranscriptParserTest.java

diff --git a/model/src/main/java/de/danoeh/antennapod/model/feed/FeedItem.java b/model/src/main/java/de/danoeh/antennapod/model/feed/FeedItem.java
index 0f5a3f4bb..1e623fd8e 100644
--- a/model/src/main/java/de/danoeh/antennapod/model/feed/FeedItem.java
+++ b/model/src/main/java/de/danoeh/antennapod/model/feed/FeedItem.java
@@ -46,6 +46,7 @@ public class FeedItem implements Serializable {
     private String podcastIndexTranscriptUrl;
     private String podcastIndexTranscriptType;
     private String podcastIndexTranscriptText;
+    private transient Transcript transcript; // Transcript is not Serializable, FeedItem is

     private int state;
     public static final int NEW = -1;
@@ -463,6 +464,14 @@ public class FeedItem implements Serializable {
         }
     }

+    public Transcript getTranscript() {
+        return transcript;
+    }
+
+    public void setTranscript(Transcript t) {
+        transcript = t;
+    }
+
     public String getPodcastIndexTranscriptText() {
         return podcastIndexTranscriptText;
     }
diff --git a/model/src/main/java/de/danoeh/antennapod/model/feed/Transcript.java b/model/src/main/java/de/danoeh/antennapod/model/feed/Transcript.java
new file mode 100644
index 000000000..da01c0e58
--- /dev/null
+++ b/model/src/main/java/de/danoeh/antennapod/model/feed/Transcript.java
@@ -0,0 +1,40 @@
+package de.danoeh.antennapod.model.feed;
+
+import java.util.Map;
+import java.util.TreeMap;
+
+/**
+ * Transcript of an episode: segments ordered by their start time in milliseconds.
+ */
+public class Transcript {
+
+    private final TreeMap<Long, TranscriptSegment> segmentsMap = new TreeMap<>();
+
+    /**
+     * Adds a segment, replacing any segment that has the same start time.
+     */
+    public void addSegment(TranscriptSegment segment) {
+        segmentsMap.put(segment.getStartTime(), segment);
+    }
+
+    /**
+     * Returns the segment with the latest start time that is not after {@code time},
+     * or {@code null} if every segment starts later.
+     */
+    public TranscriptSegment getSegmentAtTime(long time) {
+        Map.Entry<Long, TranscriptSegment> entry = segmentsMap.floorEntry(time);
+        return entry == null ? null : entry.getValue();
+    }
+
+    public int getSegmentCount() {
+        return segmentsMap.size();
+    }
+
+    /**
+     * Returns the entry with the earliest start time that is not before {@code time},
+     * or {@code null} if every segment starts earlier.
+     */
+    public Map.Entry<Long, TranscriptSegment> getEntryAfterTime(long time) {
+        return segmentsMap.ceilingEntry(time);
+    }
+}
diff --git a/model/src/main/java/de/danoeh/antennapod/model/feed/TranscriptSegment.java b/model/src/main/java/de/danoeh/antennapod/model/feed/TranscriptSegment.java
new file mode 100644
index 000000000..0101bb8ed
--- /dev/null
+++ b/model/src/main/java/de/danoeh/antennapod/model/feed/TranscriptSegment.java
@@ -0,0 +1,34 @@
+package de.danoeh.antennapod.model.feed;
+
+/**
+ * Immutable span of transcript text with the time range it covers and its speaker.
+ */
+public class TranscriptSegment {
+    private final long startTime;
+    private final long endTime;
+    private final String words;
+    private final String speaker;
+
+    public TranscriptSegment(long start, long end, String w, String s) {
+        startTime = start;
+        endTime = end;
+        words = w;
+        speaker = s;
+    }
+
+    public long getStartTime() {
+        return startTime;
+    }
+
+    public long getEndTime() {
+        return endTime;
+    }
+
+    public String getWords() {
+        return words;
+    }
+
+    public String getSpeaker() {
+        return speaker;
+    }
+}
diff --git a/parser/transcript/README.md b/parser/transcript/README.md
new file mode 100644
index 000000000..a6ca61612
--- /dev/null
+++ b/parser/transcript/README.md
@@ -0,0 +1,3 @@
+# :parser:transcript
+
+This module provides parsing for transcripts
diff --git a/parser/transcript/build.gradle b/parser/transcript/build.gradle
new file mode 100644
index 000000000..122c74025
--- /dev/null
+++ b/parser/transcript/build.gradle
@@ -0,0 +1,23 @@
+plugins {
+    id("com.android.library")
+}
+apply from: "../../common.gradle"
+
+android {
+    namespace "de.danoeh.antennapod.parser.transcript"
+}
+
+dependencies {
+    implementation project(':model')
+
+    annotationProcessor "androidx.annotation:annotation:$annotationVersion"
+
+    implementation "androidx.core:core:$coreVersion"
+
+    implementation "org.apache.commons:commons-lang3:$commonslangVersion"
+    implementation "commons-io:commons-io:$commonsioVersion"
+    implementation "org.jsoup:jsoup:$jsoupVersion"
+
+    testImplementation "junit:junit:$junitVersion"
+    testImplementation "org.robolectric:robolectric:$robolectricVersion"
+}
diff --git a/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/JsonTranscriptParser.java b/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/JsonTranscriptParser.java
new file mode 100644
index 000000000..78f3bf9c8
--- /dev/null
+++ b/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/JsonTranscriptParser.java
@@ -0,0 +1,68 @@
+package de.danoeh.antennapod.parser.transcript;
+
+import org.apache.commons.lang3.StringUtils;
+import org.json.JSONArray;
+import org.json.JSONObject;
+
+import de.danoeh.antennapod.model.feed.Transcript;
+import de.danoeh.antennapod.model.feed.TranscriptSegment;
+
+/**
+ * Parses JSON transcripts: an object with a "segments" array whose entries carry
+ * "startTime" and "endTime" in seconds plus optional "speaker" and "body".
+ */
+public class JsonTranscriptParser {
+    /**
+     * Merges consecutive entries until a span covers at least {@link TranscriptParser#MIN_SPAN}.
+     *
+     * @return the transcript, or {@code null} if the JSON is malformed or has no usable entry
+     */
+    public static Transcript parse(String jsonStr) {
+        try {
+            Transcript transcript = new Transcript();
+            long spanStartTime = -1L;
+            long spanEndTime = -1L;
+            long duration = 0L;
+            String speaker = "";
+            StringBuilder spanBody = new StringBuilder();
+            JSONArray objSegments = new JSONObject(jsonStr).getJSONArray("segments");
+
+            for (int i = 0; i < objSegments.length(); i++) {
+                JSONObject jsonObject = objSegments.getJSONObject(i);
+                long startTime = (long) (jsonObject.optDouble("startTime", -1) * 1000);
+                long endTime = (long) (jsonObject.optDouble("endTime", -1) * 1000);
+                if (startTime < 0 || endTime < 0) {
+                    continue;
+                }
+                if (spanStartTime == -1L) {
+                    spanStartTime = startTime;
+                }
+                // Only a valid entry may move the span end; a skipped entry must not clobber it.
+                spanEndTime = endTime;
+                duration += endTime - startTime;
+
+                speaker = jsonObject.optString("speaker");
+                spanBody.append(jsonObject.optString("body")).append(' ');
+
+                if (duration >= TranscriptParser.MIN_SPAN) {
+                    transcript.addSegment(new TranscriptSegment(spanStartTime, spanEndTime,
+                            StringUtils.trim(spanBody.toString()), speaker));
+                    duration = 0L;
+                    spanBody.setLength(0);
+                    spanStartTime = -1L;
+                }
+            }
+
+            // Flush the text of a last span that stayed shorter than MIN_SPAN.
+            if (!StringUtils.isBlank(spanBody)) {
+                transcript.addSegment(new TranscriptSegment(spanStartTime, spanEndTime,
+                        StringUtils.trim(spanBody.toString()), speaker));
+            }
+
+            return transcript.getSegmentCount() > 0 ? transcript : null;
+        } catch (org.json.JSONException e) {
+            e.printStackTrace();
+        }
+        return null;
+    }
+}
diff --git a/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/SrtTranscriptParser.java b/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/SrtTranscriptParser.java
new file mode 100644
index 000000000..098dadd99
--- /dev/null
+++ b/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/SrtTranscriptParser.java
@@ -0,0 +1,119 @@
+package de.danoeh.antennapod.parser.transcript;
+
+import org.apache.commons.lang3.StringUtils;
+
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import de.danoeh.antennapod.model.feed.Transcript;
+import de.danoeh.antennapod.model.feed.TranscriptSegment;
+
+/**
+ * Parses SubRip (SRT) transcripts: cues made of a "start --> end" timecode line followed by text lines.
+ */
+public class SrtTranscriptParser {
+    private static final Pattern TIMECODE_PATTERN = Pattern.compile("^([0-9]{2}):([0-9]{2}):([0-9]{2}),([0-9]{3})$");
+
+    /**
+     * Merges consecutive cues until a span covers at least {@link TranscriptParser#MIN_SPAN}.
+     * A "name:" prefix in the cue text is taken as the speaker.
+     *
+     * @return the transcript, or {@code null} if the input is blank or has no usable cue
+     */
+    public static Transcript parse(String str) {
+        if (StringUtils.isBlank(str)) {
+            return null;
+        }
+        str = str.replace("\r\n", "\n");
+
+        Transcript transcript = new Transcript();
+        Iterator<String> iter = Arrays.asList(str.split("\n")).iterator();
+        String speaker = "";
+        StringBuilder body = new StringBuilder();
+        String segmentBody = "";
+        long spanStartTimecode = -1L;
+        long endTimecode = -1L;
+        long duration = 0L;
+
+        while (iter.hasNext()) {
+            String line = iter.next();
+
+            if (line.isEmpty()) {
+                continue;
+            }
+
+            if (line.contains("-->")) {
+                String[] timecodes = line.split("-->");
+                if (timecodes.length < 2) {
+                    continue;
+                }
+                long cueStart = parseTimecode(timecodes[0].trim());
+                long cueEnd = parseTimecode(timecodes[1].trim());
+                if (cueStart == -1 || cueEnd == -1) {
+                    // Skip the malformed cue without clobbering the end of the pending span.
+                    continue;
+                }
+
+                if (spanStartTimecode == -1) {
+                    spanStartTimecode = cueStart;
+                }
+                endTimecode = cueEnd;
+                duration += cueEnd - cueStart;
+                // Check hasNext() before every next(): the timecode may be the last line.
+                while (iter.hasNext()) {
+                    line = iter.next();
+                    if (StringUtils.isBlank(line)) {
+                        break;
+                    }
+                    body.append(line.strip());
+                    body.append(" ");
+                }
+            }
+
+            if (body.indexOf(":") != -1) {
+                // Split on the first colon only, so colons in the text itself (e.g. "10:30") survive.
+                String[] parts = body.toString().trim().split(":", 2);
+                speaker = parts[0];
+                body = new StringBuilder(parts[1].strip());
+            }
+            if (!StringUtils.isBlank(body)) {
+                segmentBody += " " + body;
+                segmentBody = StringUtils.trim(segmentBody);
+                if (duration >= TranscriptParser.MIN_SPAN && endTimecode > spanStartTimecode) {
+                    transcript.addSegment(new TranscriptSegment(spanStartTimecode,
+                            endTimecode,
+                            segmentBody,
+                            speaker));
+                    duration = 0L;
+                    spanStartTimecode = -1L;
+                    segmentBody = "";
+                }
+                body = new StringBuilder();
+            }
+        }
+
+        if (!StringUtils.isBlank(segmentBody) && endTimecode > spanStartTimecode) {
+            segmentBody = StringUtils.trim(segmentBody);
+            transcript.addSegment(new TranscriptSegment(spanStartTimecode,
+                    endTimecode,
+                    segmentBody,
+                    speaker));
+        }
+        return transcript.getSegmentCount() > 0 ? transcript : null;
+    }
+
+    // Time format 00:00:00,000
+    static long parseTimecode(String timecode) {
+        Matcher matcher = TIMECODE_PATTERN.matcher(timecode);
+        if (!matcher.matches()) {
+            return -1;
+        }
+        long hours = Integer.parseInt(matcher.group(1));
+        long minutes = Integer.parseInt(matcher.group(2));
+        long seconds = Integer.parseInt(matcher.group(3));
+        long milliseconds = Integer.parseInt(matcher.group(4));
+        return (hours * 60 * 60 * 1000) + (minutes * 60 * 1000) + (seconds * 1000) + milliseconds;
+    }
+}
diff --git a/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/TranscriptParser.java b/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/TranscriptParser.java
new file mode 100644
index 000000000..0a4025d96
--- /dev/null
+++ b/parser/transcript/src/main/java/de/danoeh/antennapod/parser/transcript/TranscriptParser.java
@@ -0,0 +1,34 @@
+package de.danoeh.antennapod.parser.transcript;
+
+import org.apache.commons.lang3.StringUtils;
+
+import de.danoeh.antennapod.model.feed.Transcript;
+
+/**
+ * Entry point that picks the transcript parser matching a MIME type.
+ */
+public class TranscriptParser {
+    static final long MIN_SPAN = 1000L; // merge short segments together to form a span of 1 second
+
+    /**
+     * Parses {@code str} with the parser that handles the MIME type {@code type}.
+     *
+     * @return the parsed transcript, or {@code null} if {@code str} is blank, {@code type} is
+     *         not supported, or nothing usable could be parsed
+     */
+    public static Transcript parse(String str, String type) {
+        // isBlank() also covers null
+        if (StringUtils.isBlank(str)) {
+            return null;
+        }
+
+        if ("application/json".equals(type)) {
+            return JsonTranscriptParser.parse(str);
+        }
+
+        if ("application/srt".equals(type) || "application/srr".equals(type) || "application/x-subrip".equals(type)) {
+            return SrtTranscriptParser.parse(str);
+        }
+        return null;
+    }
+}
diff --git a/parser/transcript/src/test/java/de/danoeh/antennapod/parser/transcript/JsonTranscriptParserTest.java b/parser/transcript/src/test/java/de/danoeh/antennapod/parser/transcript/JsonTranscriptParserTest.java
new file mode 100644
index 000000000..48996f492
--- /dev/null
+++ b/parser/transcript/src/test/java/de/danoeh/antennapod/parser/transcript/JsonTranscriptParserTest.java
@@ -0,0 +1,98 @@
+package de.danoeh.antennapod.parser.transcript;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.robolectric.RobolectricTestRunner;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+import de.danoeh.antennapod.model.feed.Transcript;
+
+@RunWith(RobolectricTestRunner.class)
+public class JsonTranscriptParserTest {
+    private static String jsonStr = "{'version': '1.0.0', "
+            + "'segments': [ "
+            + "{ 'speaker' : 'John Doe', 'startTime': 0.8, 'endTime': 1.9, 'body': 'And' },"
+            + "{ 'speaker' : 'Sally Green', 'startTime': 1.91, 'endTime': 2.8, 'body': 'this merges' },"
+            + "{ 'startTime': 2.9, 'endTime': 3.4, 'body': 'the' },"
+            + "{ 'startTime': 3.5, 'endTime': 3.6, 'body': 'person' }]}";
+
+    @Test
+    public void testParseJson() {
+        Transcript result = JsonTranscriptParser.parse(jsonStr);
+
+        assertEquals(result.getSegmentAtTime(0L), null);
+        assertEquals(result.getSegmentAtTime(800L).getSpeaker(), "John Doe");
+        assertEquals(result.getSegmentAtTime(800L).getStartTime(), 800L);
+        assertEquals(result.getSegmentAtTime(800L).getEndTime(), 1900L);
+        assertEquals(1910L, (long) result.getEntryAfterTime(1800L).getKey());
+        // 2 segments get merged into at least 1 second
+        assertEquals("this merges the", result.getEntryAfterTime(1800L).getValue().getWords());
+    }
+
+    @Test
+    public void testParseJsonKeepsEndTimeOfLastValidSegment() {
+        // The trailing entry has no timing and must not overwrite the end time of the pending span
+        String json = "{'segments': [ "
+                + "{ 'startTime': 0.5, 'endTime': 0.75, 'body': 'short' },"
+                + "{ 'body': 'untimed' }]}";
+        Transcript result = JsonTranscriptParser.parse(json);
+
+        assertEquals(1, result.getSegmentCount());
+        assertEquals(500L, result.getSegmentAtTime(500L).getStartTime());
+        assertEquals(750L, result.getSegmentAtTime(500L).getEndTime());
+        assertEquals("short", result.getSegmentAtTime(500L).getWords());
+    }
+
+    @Test
+    public void testParse() {
+        String type = "application/json";
+        Transcript result = TranscriptParser.parse(jsonStr, type);
+        // There isn't a segment at 900L, so go backwards and get the segment at 800L
+        assertEquals(result.getSegmentAtTime(900L).getSpeaker(), "John Doe");
+        assertEquals(result.getSegmentAtTime(930L).getWords(), "And");
+
+        // blank string
+        String blankStr = "";
+        result = TranscriptParser.parse(blankStr, type);
+        assertEquals(result, null);
+
+        result = TranscriptParser.parse(null, type);
+        assertEquals(result, null);
+
+        // All blank lines
+        String allNewlinesStr = "\r\n\r\n\r\n\r\n";
+        result = TranscriptParser.parse(allNewlinesStr, type);
+        assertEquals(result, null);
+
+        // segments is missing
+        String jsonStrBad1 = "{'version': '1.0.0', "
+                + "'segmentsX': [ "
+                + "{ 'speaker' : 'John Doe', 'startTime': 0.8, 'endTime': 1.9, 'body': 'And' },"
+                + "{ 'startTime': 2.9, 'endTime': 3.4, 'body': 'the' },"
+                + "{ 'startTime': 3.5, 'endTime': 3.6, 'body': 'person' }]}";
+        result = TranscriptParser.parse(jsonStrBad1, type);
+        assertEquals(result, null);
+
+        // invalid time formatting
+        String jsonStrBad2 = "{'version': '1.0.0', "
+                + "'segments': [ "
+                + "{ 'speaker' : 'XJohn Doe', 'startTime': stringTime, 'endTime': stringTime, 'body': 'And' },"
+                + "{ 'XstartTime': 2.9, 'XendTime': 3.4, 'body': 'the' },"
+                + "{ 'startTime': '-2.9', 'endTime': '-3.4', 'body': 'the' },"
+                + "{ 'startTime': 'bad_time', 'endTime': '-3.4', 'body': 'the' }]}";
+        result = TranscriptParser.parse(jsonStrBad2, type);
+        assertNull(result);
+
+        // Just plain text
+        String strBad3 = "John Doe: Promoting your podcast in a new\n\n"
+                + "way. The latest from PogNews.";
+        result = TranscriptParser.parse(strBad3, type);
+        assertNull(result);
+
+        // passing the wrong type
+        type = "application/srt";
+        result = TranscriptParser.parse(jsonStr, type);
+        assertEquals(result, null);
+    }
+}
diff --git a/parser/transcript/src/test/java/de/danoeh/antennapod/parser/transcript/SrtTranscriptParserTest.java b/parser/transcript/src/test/java/de/danoeh/antennapod/parser/transcript/SrtTranscriptParserTest.java
new file mode 100644
index 000000000..f7854c5bf
--- /dev/null
+++ b/parser/transcript/src/test/java/de/danoeh/antennapod/parser/transcript/SrtTranscriptParserTest.java
@@ -0,0 +1,106 @@
+package de.danoeh.antennapod.parser.transcript;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.robolectric.RobolectricTestRunner;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import de.danoeh.antennapod.model.feed.Transcript;
+
+@RunWith(RobolectricTestRunner.class)
+public class SrtTranscriptParserTest {
+    private static String srtStr = "1\n"
+            + "00:00:00,000 --> 00:00:02,730\n"
+            + "John Doe: Promoting your podcast in a new\n\n"
+            + "2\n"
+            + "00:00:02,730 --> 00:00:04,600\n"
+            + "way. The latest from PogNews.\n\n"
+            + "00:00:04,730 --> 00:00:05,600\n"
+            + "way. The latest from PogNews.";
+
+    @Test
+    public void testParseSrt() {
+        Transcript result = SrtTranscriptParser.parse(srtStr);
+
+        assertEquals(result.getSegmentAtTime(0L).getWords(), "Promoting your podcast in a new");
+        assertEquals(result.getSegmentAtTime(0L).getSpeaker(), "John Doe");
+        assertEquals(result.getSegmentAtTime(0L).getStartTime(), 0L);
+        assertEquals(result.getSegmentAtTime(0L).getEndTime(), 2730L);
+        assertEquals((long) result.getEntryAfterTime(1000L).getKey(), 2730L);
+        assertEquals(result.getEntryAfterTime(1000L).getValue().getWords(), "way. The latest from PogNews.");
+    }
+
+    @Test
+    public void testParseSrtTimecodeOnLastLine() {
+        // A cue that ends right after its timecode line must not crash the parser,
+        // and colons after the speaker prefix belong to the text
+        String srt = "00:00:00,000 --> 00:00:02,000\n"
+                + "Jane Doe: It is 10:30 now\n\n"
+                + "00:00:02,000 --> 00:00:03,000";
+        Transcript result = SrtTranscriptParser.parse(srt);
+
+        assertEquals(1, result.getSegmentCount());
+        assertEquals("Jane Doe", result.getSegmentAtTime(0L).getSpeaker());
+        assertEquals("It is 10:30 now", result.getSegmentAtTime(0L).getWords());
+    }
+
+    @Test
+    public void testParse() {
+        String type = "application/srr";
+        Transcript result;
+
+        result = TranscriptParser.parse(srtStr, type);
+        // There isn't a segment at 800L, so go backwards and get the segment at 0L
+        assertEquals(result.getSegmentAtTime(800L).getWords(), "Promoting your podcast in a new");
+
+        result = TranscriptParser.parse(null, type);
+        assertEquals(result, null);
+
+        // blank string
+        String blankStr = "";
+        result = TranscriptParser.parse(blankStr, type);
+        assertNull(result);
+
+        // All empty lines
+        String allNewlinesStr = "\r\n\r\n\r\n\r\n";
+        result = TranscriptParser.parse(allNewlinesStr, type);
+        assertEquals(result, null);
+
+        // first segment has invalid time formatting, so the entire segment will be thrown out
+        String srtStrBad1 = "00:0000,000 --> 00:00:02,730\n"
+                + "John Doe: Promoting your podcast in a new\n\n"
+                + "2\n"
+                + "00:00:02,730 --> 00:00:04,600\n"
+                + "way. The latest from PogNews.";
+        result = TranscriptParser.parse(srtStrBad1, type);
+        assertEquals(result.getSegmentAtTime(2730L).getWords(), "way. The latest from PogNews.");
+
+        // first segment has invalid time in end time, 2nd segment has invalid time in both start time and end time
+        String srtStrBad2 = "00:00:00,000 --> 00:0002,730\n"
+                + "Jane Doe: Promoting your podcast in a new\n\n"
+                + "2\n"
+                + "badstarttime --> badendtime\n"
+                + "way. The latest from PogNews.\n"
+                + "badstarttime -->\n"
+                + "Jane Doe says something\n"
+                + "00:00:00,000 --> 00:00:02,730\n"
+                + "Jane Doe:";
+        result = TranscriptParser.parse(srtStrBad2, type);
+        assertNull(result);
+
+        // Just plain text
+        String strBad3 = "John Doe: Promoting your podcast in a new\n\n"
+                + "way. The latest from PogNews.";
+        result = TranscriptParser.parse(strBad3, type);
+        assertNull(result);
+
+        // passing the wrong type
+        type = "application/json";
+        result = TranscriptParser.parse(srtStr, type);
+        assertEquals(result, null);
+
+        type = "unknown";
+        result = TranscriptParser.parse(srtStr, type);
+        assertEquals(result, null);
+    }
+}
diff --git a/settings.gradle b/settings.gradle
index 8cf8baf3e..3b3df7ba8 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -30,6 +30,7 @@ include ':net:sync:service'

 include ':parser:feed'
 include ':parser:media'
+include ':parser:transcript'

 include ':playback:base'
 include ':playback:cast'