Transcript semantic parsing (#6852)
This commit is contained in:
parent
27e9bf36b1
commit
7c4f19c979
|
@ -46,6 +46,7 @@ public class FeedItem implements Serializable {
|
|||
private String podcastIndexTranscriptUrl;
|
||||
private String podcastIndexTranscriptType;
|
||||
private String podcastIndexTranscriptText;
|
||||
private Transcript transcript;
|
||||
|
||||
private int state;
|
||||
public static final int NEW = -1;
|
||||
|
@ -463,6 +464,14 @@ public class FeedItem implements Serializable {
|
|||
}
|
||||
}
|
||||
|
||||
public Transcript getTranscript() {
|
||||
return transcript;
|
||||
}
|
||||
|
||||
public void setTranscript(Transcript t) {
|
||||
transcript = t;
|
||||
}
|
||||
|
||||
public String getPodcastIndexTranscriptText() {
|
||||
return podcastIndexTranscriptText;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,28 @@
|
|||
package de.danoeh.antennapod.model.feed;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
|
||||
public class Transcript {
|
||||
|
||||
private final TreeMap<Long, TranscriptSegment> segmentsMap = new TreeMap<>();
|
||||
|
||||
public void addSegment(TranscriptSegment segment) {
|
||||
segmentsMap.put(segment.getStartTime(), segment);
|
||||
}
|
||||
|
||||
public TranscriptSegment getSegmentAtTime(long time) {
|
||||
if (segmentsMap.floorEntry(time) == null) {
|
||||
return null;
|
||||
}
|
||||
return segmentsMap.floorEntry(time).getValue();
|
||||
}
|
||||
|
||||
public int getSegmentCount() {
|
||||
return segmentsMap.size();
|
||||
}
|
||||
|
||||
public Map.Entry<Long, TranscriptSegment> getEntryAfterTime(long time) {
|
||||
return segmentsMap.ceilingEntry(time);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
package de.danoeh.antennapod.model.feed;
|
||||
|
||||
/**
 * Immutable value holder for one span of a transcript: its time range, its text and
 * the speaker it is attributed to.
 *
 * <p>The class itself does not fix a time unit; the transcript parsers in this change
 * pass milliseconds.
 */
public class TranscriptSegment {
    private final long startTime;
    private final long endTime;
    private final String words;
    private final String speaker;

    /**
     * @param start start of the span
     * @param end   end of the span
     * @param w     text spoken during the span
     * @param s     name of the speaker; stored as given, without validation
     */
    public TranscriptSegment(long start, long end, String w, String s) {
        this.startTime = start;
        this.endTime = end;
        this.words = w;
        this.speaker = s;
    }

    /** @return the start of the span, as passed to the constructor */
    public long getStartTime() {
        return this.startTime;
    }

    /** @return the end of the span, as passed to the constructor */
    public long getEndTime() {
        return this.endTime;
    }

    /** @return the text of the span, as passed to the constructor */
    public String getWords() {
        return this.words;
    }

    /** @return the speaker of the span, as passed to the constructor */
    public String getSpeaker() {
        return this.speaker;
    }
}
|
|
@ -0,0 +1,3 @@
|
|||
# :parser:transcript
|
||||
|
||||
This module parses podcast transcripts (JSON and SRT formats) into the `Transcript` model.
|
|
@ -0,0 +1,23 @@
|
|||
plugins {
|
||||
id("com.android.library")
|
||||
}
|
||||
apply from: "../../common.gradle"
|
||||
|
||||
android {
|
||||
namespace "de.danoeh.antennapod.parser.transcript"
|
||||
}
|
||||
|
||||
dependencies {
|
||||
implementation project(':model')
|
||||
|
||||
annotationProcessor "androidx.annotation:annotation:$annotationVersion"
|
||||
|
||||
implementation "androidx.core:core:$coreVersion"
|
||||
|
||||
implementation "org.apache.commons:commons-lang3:$commonslangVersion"
|
||||
implementation "commons-io:commons-io:$commonsioVersion"
|
||||
implementation "org.jsoup:jsoup:$jsoupVersion"
|
||||
|
||||
testImplementation "junit:junit:$junitVersion"
|
||||
testImplementation "org.robolectric:robolectric:$robolectricVersion"
|
||||
}
|
|
@ -0,0 +1,65 @@
|
|||
package de.danoeh.antennapod.parser.transcript;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.json.JSONArray;
|
||||
import org.json.JSONObject;
|
||||
import org.jsoup.internal.StringUtil;
|
||||
|
||||
import de.danoeh.antennapod.model.feed.Transcript;
|
||||
import de.danoeh.antennapod.model.feed.TranscriptSegment;
|
||||
|
||||
public class JsonTranscriptParser {
|
||||
public static Transcript parse(String jsonStr) {
|
||||
try {
|
||||
Transcript transcript = new Transcript();
|
||||
long startTime = -1L;
|
||||
long endTime = -1L;
|
||||
long segmentStartTime = -1L;
|
||||
long duration = 0L;
|
||||
String speaker = "";
|
||||
String segmentBody = "";
|
||||
JSONObject obj = new JSONObject(jsonStr);
|
||||
JSONArray objSegments = obj.getJSONArray("segments");
|
||||
|
||||
for (int i = 0; i < objSegments.length(); i++) {
|
||||
JSONObject jsonObject = objSegments.getJSONObject(i);
|
||||
startTime = Double.valueOf(jsonObject.optDouble("startTime", -1) * 1000L).longValue();
|
||||
endTime = Double.valueOf(jsonObject.optDouble("endTime", -1) * 1000L).longValue();
|
||||
if (startTime < 0 || endTime < 0) {
|
||||
continue;
|
||||
}
|
||||
if (segmentStartTime == -1L) {
|
||||
segmentStartTime = startTime;
|
||||
}
|
||||
duration += endTime - startTime;
|
||||
|
||||
speaker = jsonObject.optString("speaker");
|
||||
String body = jsonObject.optString("body");
|
||||
segmentBody += body + " ";
|
||||
|
||||
if (duration >= TranscriptParser.MIN_SPAN) {
|
||||
segmentBody = StringUtils.trim(segmentBody);
|
||||
transcript.addSegment(new TranscriptSegment(segmentStartTime, endTime, segmentBody, speaker));
|
||||
duration = 0L;
|
||||
segmentBody = "";
|
||||
segmentStartTime = -1L;
|
||||
}
|
||||
}
|
||||
|
||||
if (!StringUtil.isBlank(segmentBody)) {
|
||||
segmentBody = StringUtils.trim(segmentBody);
|
||||
transcript.addSegment(new TranscriptSegment(segmentStartTime, endTime, segmentBody, speaker));
|
||||
}
|
||||
|
||||
if (transcript.getSegmentCount() > 0) {
|
||||
return transcript;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
|
||||
} catch (org.json.JSONException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,118 @@
|
|||
package de.danoeh.antennapod.parser.transcript;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jsoup.internal.StringUtil;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import de.danoeh.antennapod.model.feed.Transcript;
|
||||
import de.danoeh.antennapod.model.feed.TranscriptSegment;
|
||||
|
||||
public class SrtTranscriptParser {
|
||||
private static final Pattern TIMECODE_PATTERN = Pattern.compile("^([0-9]{2}):([0-9]{2}):([0-9]{2}),([0-9]{3})$");
|
||||
|
||||
public static Transcript parse(String str) {
|
||||
if (StringUtils.isBlank(str)) {
|
||||
return null;
|
||||
}
|
||||
str = str.replaceAll("\r\n", "\n");
|
||||
|
||||
Transcript transcript = new Transcript();
|
||||
List<String> lines = Arrays.asList(str.split("\n"));
|
||||
Iterator<String> iter = lines.iterator();
|
||||
String speaker = "";
|
||||
StringBuilder body = new StringBuilder();
|
||||
String line;
|
||||
String segmentBody = "";
|
||||
long startTimecode = -1L;
|
||||
long spanStartTimecode = -1L;
|
||||
long endTimecode = -1L;
|
||||
long duration = 0L;
|
||||
|
||||
while (iter.hasNext()) {
|
||||
line = iter.next();
|
||||
|
||||
if (line.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (line.contains("-->")) {
|
||||
String[] timecodes = line.split("-->");
|
||||
if (timecodes.length < 2) {
|
||||
continue;
|
||||
}
|
||||
startTimecode = parseTimecode(timecodes[0].trim());
|
||||
endTimecode = parseTimecode(timecodes[1].trim());
|
||||
if (startTimecode == -1 || endTimecode == -1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (spanStartTimecode == -1) {
|
||||
spanStartTimecode = startTimecode;
|
||||
}
|
||||
duration += endTimecode - startTimecode;
|
||||
do {
|
||||
line = iter.next();
|
||||
if (StringUtil.isBlank(line)) {
|
||||
break;
|
||||
}
|
||||
body.append(line.strip());
|
||||
body.append(" ");
|
||||
} while (iter.hasNext());
|
||||
}
|
||||
|
||||
if (body.indexOf(":") != -1) {
|
||||
String [] parts = body.toString().trim().split(":");
|
||||
if (parts.length < 2) {
|
||||
continue;
|
||||
}
|
||||
speaker = parts[0];
|
||||
body = new StringBuilder(parts[1].strip());
|
||||
}
|
||||
if (!StringUtil.isBlank(body.toString())) {
|
||||
segmentBody += " " + body;
|
||||
segmentBody = StringUtils.trim(segmentBody);
|
||||
if (duration >= TranscriptParser.MIN_SPAN && endTimecode > spanStartTimecode) {
|
||||
transcript.addSegment(new TranscriptSegment(spanStartTimecode,
|
||||
endTimecode,
|
||||
segmentBody,
|
||||
speaker));
|
||||
duration = 0L;
|
||||
spanStartTimecode = -1L;
|
||||
segmentBody = "";
|
||||
}
|
||||
body = new StringBuilder();
|
||||
}
|
||||
}
|
||||
|
||||
if (!StringUtil.isBlank(segmentBody) && endTimecode > spanStartTimecode) {
|
||||
segmentBody = StringUtils.trim(segmentBody);
|
||||
transcript.addSegment(new TranscriptSegment(spanStartTimecode,
|
||||
endTimecode,
|
||||
segmentBody,
|
||||
speaker));
|
||||
}
|
||||
if (transcript.getSegmentCount() > 0) {
|
||||
return transcript;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
// Time format 00:00:00,000
|
||||
static long parseTimecode(String timecode) {
|
||||
Matcher matcher = TIMECODE_PATTERN.matcher(timecode);
|
||||
if (!matcher.matches()) {
|
||||
return -1;
|
||||
}
|
||||
long hours = Integer.parseInt(matcher.group(1));
|
||||
long minutes = Integer.parseInt(matcher.group(2));
|
||||
long seconds = Integer.parseInt(matcher.group(3));
|
||||
long milliseconds = Integer.parseInt(matcher.group(4));
|
||||
return (hours * 60 * 60 * 1000) + (minutes * 60 * 1000) + (seconds * 1000) + milliseconds;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
package de.danoeh.antennapod.parser.transcript;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import de.danoeh.antennapod.model.feed.Transcript;
|
||||
|
||||
public class TranscriptParser {
|
||||
static final long MIN_SPAN = 1000L; // merge short segments together to form a span of 1 second
|
||||
|
||||
public static Transcript parse(String str, String type) {
|
||||
if (str == null || StringUtils.isBlank(str)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if ("application/json".equals(type)) {
|
||||
return JsonTranscriptParser.parse(str);
|
||||
}
|
||||
|
||||
if ("application/srt".equals(type) || "application/srr".equals(type) || "application/x-subrip".equals(type)) {
|
||||
return SrtTranscriptParser.parse(str);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,84 @@
|
|||
package de.danoeh.antennapod.parser.transcript;
|
||||
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.robolectric.RobolectricTestRunner;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertNull;
|
||||
|
||||
import de.danoeh.antennapod.model.feed.Transcript;
|
||||
|
||||
@RunWith(RobolectricTestRunner.class)
|
||||
public class JsonTranscriptParserTest {
|
||||
private static String jsonStr = "{'version': '1.0.0', "
|
||||
+ "'segments': [ "
|
||||
+ "{ 'speaker' : 'John Doe', 'startTime': 0.8, 'endTime': 1.9, 'body': 'And' },"
|
||||
+ "{ 'speaker' : 'Sally Green', 'startTime': 1.91, 'endTime': 2.8, 'body': 'this merges' },"
|
||||
+ "{ 'startTime': 2.9, 'endTime': 3.4, 'body': 'the' },"
|
||||
+ "{ 'startTime': 3.5, 'endTime': 3.6, 'body': 'person' }]}";
|
||||
|
||||
@Test
|
||||
public void testParseJson() {
|
||||
Transcript result = JsonTranscriptParser.parse(jsonStr);
|
||||
|
||||
assertEquals(result.getSegmentAtTime(0L), null);
|
||||
assertEquals(result.getSegmentAtTime(800L).getSpeaker(), "John Doe");
|
||||
assertEquals(result.getSegmentAtTime(800L).getStartTime(), 800L);
|
||||
assertEquals(result.getSegmentAtTime(800L).getEndTime(), 1900L);
|
||||
assertEquals(1910L, (long) result.getEntryAfterTime(1800L).getKey());
|
||||
// 2 segments get merged into at least 1 second
|
||||
assertEquals("this merges the", result.getEntryAfterTime(1800L).getValue().getWords());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParse() {
|
||||
String type = "application/json";
|
||||
Transcript result = TranscriptParser.parse(jsonStr, type);
|
||||
// There isn't a segment at 900L, so go backwards and get the segment at 800L
|
||||
assertEquals(result.getSegmentAtTime(900L).getSpeaker(), "John Doe");
|
||||
assertEquals(result.getSegmentAtTime(930L).getWords(), "And");
|
||||
|
||||
// blank string
|
||||
String blankStr = "";
|
||||
result = TranscriptParser.parse(blankStr, type);
|
||||
assertEquals(result, null);
|
||||
|
||||
result = TranscriptParser.parse(null, type);
|
||||
assertEquals(result, null);
|
||||
|
||||
// All blank lines
|
||||
String allNewlinesStr = "\r\n\r\n\r\n\r\n";
|
||||
result = TranscriptParser.parse(allNewlinesStr, type);
|
||||
assertEquals(result, null);
|
||||
|
||||
// segments is missing
|
||||
String jsonStrBad1 = "{'version': '1.0.0', "
|
||||
+ "'segmentsX': [ "
|
||||
+ "{ 'speaker' : 'John Doe', 'startTime': 0.8, 'endTime': 1.9, 'body': 'And' },"
|
||||
+ "{ 'startTime': 2.9, 'endTime': 3.4, 'body': 'the' },"
|
||||
+ "{ 'startTime': 3.5, 'endTime': 3.6, 'body': 'person' }]}";
|
||||
result = TranscriptParser.parse(jsonStrBad1, type);
|
||||
assertEquals(result, null);
|
||||
|
||||
// invalid time formatting
|
||||
String jsonStrBad2 = "{'version': '1.0.0', "
|
||||
+ "'segments': [ "
|
||||
+ "{ 'speaker' : 'XJohn Doe', 'startTime': stringTime, 'endTime': stringTime, 'body': 'And' },"
|
||||
+ "{ 'XstartTime': 2.9, 'XendTime': 3.4, 'body': 'the' },"
|
||||
+ "{ 'startTime': '-2.9', 'endTime': '-3.4', 'body': 'the' },"
|
||||
+ "{ 'startTime': 'bad_time', 'endTime': '-3.4', 'body': 'the' }]}";
|
||||
result = TranscriptParser.parse(jsonStrBad2, type);
|
||||
assertNull(result);
|
||||
|
||||
// Just plain text
|
||||
String strBad3 = "John Doe: Promoting your podcast in a new\n\n"
|
||||
+ "way. The latest from PogNews.";
|
||||
result = TranscriptParser.parse(strBad3, type);
|
||||
assertNull(result);
|
||||
|
||||
// passing the wrong type
|
||||
type = "application/srt";
|
||||
result = TranscriptParser.parse(jsonStr, type);
|
||||
assertEquals(result, null);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,93 @@
|
|||
package de.danoeh.antennapod.parser.transcript;
|
||||
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.robolectric.RobolectricTestRunner;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertNull;
|
||||
import de.danoeh.antennapod.model.feed.Transcript;
|
||||
|
||||
@RunWith(RobolectricTestRunner.class)
|
||||
public class SrtTranscriptParserTest {
|
||||
private static String srtStr = "1\n"
|
||||
+ "00:00:00,000 --> 00:00:02,730\n"
|
||||
+ "John Doe: Promoting your podcast in a new\n\n"
|
||||
+ "2\n"
|
||||
+ "00:00:02,730 --> 00:00:04,600\n"
|
||||
+ "way. The latest from PogNews.\n\n"
|
||||
+ "00:00:04,730 --> 00:00:05,600\n"
|
||||
+ "way. The latest from PogNews.";
|
||||
|
||||
@Test
|
||||
public void testParseSrt() {
|
||||
Transcript result = SrtTranscriptParser.parse(srtStr);
|
||||
|
||||
assertEquals(result.getSegmentAtTime(0L).getWords(), "Promoting your podcast in a new");
|
||||
assertEquals(result.getSegmentAtTime(0L).getSpeaker(), "John Doe");
|
||||
assertEquals(result.getSegmentAtTime(0L).getStartTime(), 0L);
|
||||
assertEquals(result.getSegmentAtTime(0L).getEndTime(), 2730L);
|
||||
assertEquals((long) result.getEntryAfterTime(1000L).getKey(), 2730L);
|
||||
assertEquals(result.getEntryAfterTime(1000L).getValue().getWords(), "way. The latest from PogNews.");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParse() {
|
||||
String type = "application/srr";
|
||||
Transcript result;
|
||||
|
||||
result = TranscriptParser.parse(srtStr, type);
|
||||
// There isn't a segment at 800L, so go backwards and get the segment at 0L
|
||||
assertEquals(result.getSegmentAtTime(800L).getWords(), "Promoting your podcast in a new");
|
||||
|
||||
result = TranscriptParser.parse(null, type);
|
||||
assertEquals(result, null);
|
||||
|
||||
// blank string
|
||||
String blankStr = "";
|
||||
result = TranscriptParser.parse(blankStr, type);
|
||||
assertNull(result);
|
||||
|
||||
// All empty lines
|
||||
String allNewlinesStr = "\r\n\r\n\r\n\r\n";
|
||||
result = TranscriptParser.parse(allNewlinesStr, type);
|
||||
assertEquals(result, null);
|
||||
|
||||
// first segment has invalid time formatting, so the entire segment will be thrown out
|
||||
String srtStrBad1 = "00:0000,000 --> 00:00:02,730\n"
|
||||
+ "John Doe: Promoting your podcast in a new\n\n"
|
||||
+ "2\n"
|
||||
+ "00:00:02,730 --> 00:00:04,600\n"
|
||||
+ "way. The latest from PogNews.";
|
||||
result = TranscriptParser.parse(srtStrBad1, type);
|
||||
assertEquals(result.getSegmentAtTime(2730L).getWords(), "way. The latest from PogNews.");
|
||||
|
||||
// first segment has invalid time in end time, 2nd segment has invalid time in both start time and end time
|
||||
String srtStrBad2 = "00:00:00,000 --> 00:0002,730\n"
|
||||
+ "Jane Doe: Promoting your podcast in a new\n\n"
|
||||
+ "2\n"
|
||||
+ "badstarttime --> badendtime\n"
|
||||
+ "way. The latest from PogNews.\n"
|
||||
+ "badstarttime -->\n"
|
||||
+ "Jane Doe says something\n"
|
||||
+ "00:00:00,000 --> 00:00:02,730\n"
|
||||
+ "Jane Doe:";
|
||||
result = TranscriptParser.parse(srtStrBad2, type);
|
||||
assertNull(result);
|
||||
|
||||
// Just plain text
|
||||
String strBad3 = "John Doe: Promoting your podcast in a new\n\n"
|
||||
+ "way. The latest from PogNews.";
|
||||
result = TranscriptParser.parse(strBad3, type);
|
||||
assertNull(result);
|
||||
|
||||
// passing the wrong type
|
||||
type = "application/json";
|
||||
result = TranscriptParser.parse(srtStr, type);
|
||||
assertEquals(result, null);
|
||||
|
||||
type = "unknown";
|
||||
result = TranscriptParser.parse(srtStr, type);
|
||||
assertEquals(result, null);
|
||||
}
|
||||
}
|
||||
|
|
@ -30,6 +30,7 @@ include ':net:sync:service'
|
|||
|
||||
include ':parser:feed'
|
||||
include ':parser:media'
|
||||
include ':parser:transcript'
|
||||
|
||||
include ':playback:base'
|
||||
include ':playback:cast'
|
||||
|
|
Loading…
Reference in New Issue