Transcript semantic parsing (#6852)

This commit is contained in:
Tony Tam 2024-02-05 04:42:59 +08:00 committed by ByteHamster
parent c3877ff1f4
commit 1edceb1b97
11 changed files with 479 additions and 0 deletions

View File

@ -46,6 +46,7 @@ public class FeedItem implements Serializable {
private String podcastIndexTranscriptUrl;
private String podcastIndexTranscriptType;
private String podcastIndexTranscriptText;
private Transcript transcript;
private int state;
public static final int NEW = -1;
@ -463,6 +464,14 @@ public class FeedItem implements Serializable {
}
}
/**
 * Gives access to the parsed transcript object held by this item.
 *
 * @return the current value of the {@code transcript} field
 */
public Transcript getTranscript() {
    return this.transcript;
}
/**
 * Stores the given transcript object on this item, replacing any previous value.
 *
 * @param t the transcript to store in the {@code transcript} field
 */
public void setTranscript(Transcript t) {
    this.transcript = t;
}
/**
 * Gives access to the transcript text held by this item.
 *
 * @return the current value of the {@code podcastIndexTranscriptText} field
 */
public String getPodcastIndexTranscriptText() {
    return this.podcastIndexTranscriptText;
}

View File

@ -0,0 +1,28 @@
package de.danoeh.antennapod.model.feed;
import java.util.Map;
import java.util.TreeMap;
/**
 * Collection of {@link TranscriptSegment}s ordered by their start time.
 */
public class Transcript {
    /**
     * Segments keyed by start time. Adding a segment whose start time is already
     * present replaces the previous segment for that key.
     */
    private final TreeMap<Long, TranscriptSegment> segmentsMap = new TreeMap<>();

    /**
     * Registers a segment under its own start time.
     *
     * @param segment the segment to add
     */
    public void addSegment(TranscriptSegment segment) {
        final long key = segment.getStartTime();
        segmentsMap.put(key, segment);
    }

    /**
     * Looks up the segment with the greatest start time that is less than or equal to {@code time}.
     *
     * @param time the position to look up, in the same unit as the segment start times
     * @return the matching segment, or {@code null} if no segment starts at or before {@code time}
     */
    public TranscriptSegment getSegmentAtTime(long time) {
        final Map.Entry<Long, TranscriptSegment> floor = segmentsMap.floorEntry(time);
        return floor == null ? null : floor.getValue();
    }

    /**
     * @return the number of segments currently stored
     */
    public int getSegmentCount() {
        return segmentsMap.size();
    }

    /**
     * Looks up the entry with the smallest start time that is greater than or equal to {@code time}.
     *
     * @param time the position to look up, in the same unit as the segment start times
     * @return the matching map entry, or {@code null} if no segment starts at or after {@code time}
     */
    public Map.Entry<Long, TranscriptSegment> getEntryAfterTime(long time) {
        return segmentsMap.ceilingEntry(time);
    }
}

View File

@ -0,0 +1,31 @@
package de.danoeh.antennapod.model.feed;
/**
 * Immutable value holder for one piece of a transcript: a time range, the
 * text spoken in it and the speaker it is attributed to.
 */
public class TranscriptSegment {
    private final long startTime;
    private final long endTime;
    private final String words;
    private final String speaker;

    /**
     * @param start start time of the segment
     * @param end   end time of the segment
     * @param w     the text of the segment
     * @param s     the speaker the text is attributed to
     */
    public TranscriptSegment(long start, long end, String w, String s) {
        this.startTime = start;
        this.endTime = end;
        this.words = w;
        this.speaker = s;
    }

    /** @return the start time passed to the constructor */
    public long getStartTime() {
        return this.startTime;
    }

    /** @return the end time passed to the constructor */
    public long getEndTime() {
        return this.endTime;
    }

    /** @return the text passed to the constructor */
    public String getWords() {
        return this.words;
    }

    /** @return the speaker passed to the constructor */
    public String getSpeaker() {
        return this.speaker;
    }
}

View File

@ -0,0 +1,3 @@
# :parser:transcript
This module provides parsers that turn podcast transcripts (JSON and SRT) into `Transcript` objects.

View File

@ -0,0 +1,23 @@
// Build configuration for the :parser:transcript Android library module.
plugins {
id("com.android.library")
}
// NOTE(review): presumably pulls in build settings shared by all modules — confirm in common.gradle.
apply from: "../../common.gradle"
android {
namespace "de.danoeh.antennapod.parser.transcript"
}
dependencies {
// The parsers import Transcript and TranscriptSegment from de.danoeh.antennapod.model.feed;
// presumably those classes live in the :model module.
implementation project(':model')
annotationProcessor "androidx.annotation:annotation:$annotationVersion"
implementation "androidx.core:core:$coreVersion"
// StringUtils (commons-lang3) and StringUtil (jsoup) are used by the transcript parsers.
implementation "org.apache.commons:commons-lang3:$commonslangVersion"
implementation "commons-io:commons-io:$commonsioVersion"
implementation "org.jsoup:jsoup:$jsoupVersion"
// NOTE(review): JsonTranscriptParser imports org.json, which is not declared here —
// presumably supplied by the Android platform (and by Robolectric in unit tests); confirm.
testImplementation "junit:junit:$junitVersion"
// The parser unit tests run with RobolectricTestRunner.
testImplementation "org.robolectric:robolectric:$robolectricVersion"
}

View File

@ -0,0 +1,65 @@
package de.danoeh.antennapod.parser.transcript;
import org.apache.commons.lang3.StringUtils;
import org.json.JSONArray;
import org.json.JSONObject;
import org.jsoup.internal.StringUtil;
import de.danoeh.antennapod.model.feed.Transcript;
import de.danoeh.antennapod.model.feed.TranscriptSegment;
/**
 * Parses transcripts given as a JSON document with a top-level {@code segments} array.
 * Each array entry is read for {@code startTime} and {@code endTime} (seconds, converted
 * to milliseconds here), {@code body} and an optional {@code speaker}.
 */
public class JsonTranscriptParser {
    /**
     * Converts a JSON transcript into a {@link Transcript}.
     *
     * <p>Consecutive entries are merged until their accumulated duration reaches
     * {@code TranscriptParser.MIN_SPAN}; whatever is still pending after the last entry is
     * emitted as a final segment. Entries with a missing, non-numeric or negative start or
     * end time are skipped.
     *
     * @param jsonStr the JSON document; may be {@code null} or blank
     * @return the parsed transcript, or {@code null} if the input is blank, is not valid JSON,
     *         has no {@code segments} array, or yields no segments
     */
    public static Transcript parse(String jsonStr) {
        // Guard here so that direct callers passing null get null instead of an exception.
        if (StringUtils.isBlank(jsonStr)) {
            return null;
        }
        try {
            Transcript transcript = new Transcript();
            long segmentStartTime = -1L;
            // End time of the last VALID entry of the pending span. Tracked separately so that a
            // skipped (invalid) trailing entry cannot leak its bogus end time into the final flush.
            long segmentEndTime = -1L;
            long duration = 0L;
            String speaker = "";
            StringBuilder segmentBody = new StringBuilder();

            JSONObject obj = new JSONObject(jsonStr);
            JSONArray objSegments = obj.getJSONArray("segments");
            for (int i = 0; i < objSegments.length(); i++) {
                JSONObject jsonObject = objSegments.getJSONObject(i);
                long startTime = (long) (jsonObject.optDouble("startTime", -1) * 1000L);
                long endTime = (long) (jsonObject.optDouble("endTime", -1) * 1000L);
                if (startTime < 0 || endTime < 0) {
                    continue;
                }
                if (segmentStartTime == -1L) {
                    segmentStartTime = startTime;
                }
                segmentEndTime = endTime;
                duration += endTime - startTime;

                // Keep the last known speaker when an entry omits the field, so that merging
                // "speaker A: ..." with a following unlabeled entry does not erase the speaker.
                // This matches SrtTranscriptParser, where the speaker also carries forward.
                String entrySpeaker = jsonObject.optString("speaker");
                if (StringUtils.isNotBlank(entrySpeaker)) {
                    speaker = entrySpeaker;
                }
                segmentBody.append(jsonObject.optString("body")).append(' ');

                if (duration >= TranscriptParser.MIN_SPAN) {
                    transcript.addSegment(new TranscriptSegment(segmentStartTime, segmentEndTime,
                            StringUtils.trim(segmentBody.toString()), speaker));
                    duration = 0L;
                    segmentBody.setLength(0);
                    segmentStartTime = -1L;
                }
            }

            // Flush whatever did not reach MIN_SPAN before the input ended.
            String pendingBody = segmentBody.toString();
            if (!StringUtil.isBlank(pendingBody)) {
                transcript.addSegment(new TranscriptSegment(segmentStartTime, segmentEndTime,
                        StringUtils.trim(pendingBody), speaker));
            }

            return transcript.getSegmentCount() > 0 ? transcript : null;
        } catch (org.json.JSONException e) {
            // Malformed input is reported as "no transcript" rather than propagated.
            e.printStackTrace();
        }
        return null;
    }
}

View File

@ -0,0 +1,118 @@
package de.danoeh.antennapod.parser.transcript;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.internal.StringUtil;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import de.danoeh.antennapod.model.feed.Transcript;
import de.danoeh.antennapod.model.feed.TranscriptSegment;
/**
 * Parses transcripts in SubRip (SRT) format: cues consisting of a
 * {@code HH:MM:SS,mmm --> HH:MM:SS,mmm} line followed by one or more text lines.
 */
public class SrtTranscriptParser {
    private static final Pattern TIMECODE_PATTERN = Pattern.compile("^([0-9]{2}):([0-9]{2}):([0-9]{2}),([0-9]{3})$");

    /**
     * Converts an SRT document into a {@link Transcript}.
     *
     * <p>Consecutive cues are merged until their accumulated duration reaches
     * {@code TranscriptParser.MIN_SPAN}; whatever is still pending after the last cue is
     * emitted as a final segment. Cues whose start or end timecode cannot be parsed are
     * skipped together with their text. If a cue's text contains a colon, everything before
     * the first colon is taken as the speaker name; the speaker carries over to following cues
     * until another one is found.
     *
     * @param str the SRT document; may be {@code null} or blank
     * @return the parsed transcript, or {@code null} if the input is blank or yields no segments
     */
    public static Transcript parse(String str) {
        if (StringUtils.isBlank(str)) {
            return null;
        }
        // Literal replacement; no regular expression is needed here.
        str = str.replace("\r\n", "\n");

        Transcript transcript = new Transcript();
        List<String> lines = Arrays.asList(str.split("\n"));
        Iterator<String> iter = lines.iterator();
        String speaker = "";
        String segmentBody = "";
        long spanStartTimecode = -1L;
        // End of the last VALID cue in the pending span. Kept apart from the per-cue values so
        // that an unparsable trailing cue cannot cause already collected text to be dropped.
        long spanEndTimecode = -1L;
        long duration = 0L;

        while (iter.hasNext()) {
            String line = iter.next();
            // Only timecode lines start a cue; sequence numbers, blank lines and the text of
            // skipped cues are ignored.
            if (!line.contains("-->")) {
                continue;
            }
            String[] timecodes = line.split("-->");
            if (timecodes.length < 2) {
                continue;
            }
            long startTimecode = parseTimecode(timecodes[0].trim());
            long endTimecode = parseTimecode(timecodes[1].trim());
            if (startTimecode == -1 || endTimecode == -1) {
                continue;
            }
            if (spanStartTimecode == -1) {
                spanStartTimecode = startTimecode;
            }
            spanEndTimecode = endTimecode;
            duration += endTimecode - startTimecode;

            // Collect the cue text up to the next blank line. hasNext() is checked BEFORE
            // next() so that a timecode on the very last line does not throw.
            StringBuilder body = new StringBuilder();
            while (iter.hasNext()) {
                String textLine = iter.next();
                if (StringUtil.isBlank(textLine)) {
                    break;
                }
                // StringUtils.strip instead of String.strip(): the latter is a Java 11 API
                // that is not available on older Android runtimes.
                body.append(StringUtils.strip(textLine));
                body.append(" ");
            }

            String text = body.toString().trim();
            // Split on the FIRST colon only, so that text such as "10:30" after the speaker
            // name is kept intact.
            int colonIndex = text.indexOf(':');
            if (colonIndex != -1) {
                speaker = StringUtils.strip(text.substring(0, colonIndex));
                text = StringUtils.strip(text.substring(colonIndex + 1));
            }
            if (StringUtil.isBlank(text)) {
                continue;
            }

            segmentBody = StringUtils.trim(segmentBody + " " + text);
            if (duration >= TranscriptParser.MIN_SPAN && spanEndTimecode > spanStartTimecode) {
                transcript.addSegment(new TranscriptSegment(spanStartTimecode,
                        spanEndTimecode,
                        segmentBody,
                        speaker));
                duration = 0L;
                spanStartTimecode = -1L;
                segmentBody = "";
            }
        }

        // Flush whatever did not reach MIN_SPAN before the input ended.
        if (!StringUtil.isBlank(segmentBody) && spanEndTimecode > spanStartTimecode) {
            segmentBody = StringUtils.trim(segmentBody);
            transcript.addSegment(new TranscriptSegment(spanStartTimecode,
                    spanEndTimecode,
                    segmentBody,
                    speaker));
        }

        if (transcript.getSegmentCount() > 0) {
            return transcript;
        } else {
            return null;
        }
    }

    /**
     * Converts a timecode of the form {@code 00:00:00,000} into milliseconds.
     *
     * @param timecode the timecode text, without surrounding whitespace
     * @return the time in milliseconds, or {@code -1} if the text does not match the format
     */
    static long parseTimecode(String timecode) {
        Matcher matcher = TIMECODE_PATTERN.matcher(timecode);
        if (!matcher.matches()) {
            return -1;
        }
        long hours = Integer.parseInt(matcher.group(1));
        long minutes = Integer.parseInt(matcher.group(2));
        long seconds = Integer.parseInt(matcher.group(3));
        long milliseconds = Integer.parseInt(matcher.group(4));
        return (hours * 60 * 60 * 1000) + (minutes * 60 * 1000) + (seconds * 1000) + milliseconds;
    }
}

View File

@ -0,0 +1,24 @@
package de.danoeh.antennapod.parser.transcript;
import org.apache.commons.lang3.StringUtils;
import de.danoeh.antennapod.model.feed.Transcript;
/**
 * Entry point that picks a transcript parser based on the declared MIME type.
 */
public class TranscriptParser {
    static final long MIN_SPAN = 1000L; // merge short segments together to form a span of 1 second

    /**
     * Parses a transcript with the parser matching {@code type}.
     *
     * @param str  the raw transcript text; may be {@code null}
     * @param type the MIME type of the transcript; compared exactly against
     *             {@code application/json}, {@code application/srt}, {@code application/srr}
     *             and {@code application/x-subrip}
     * @return the result of the selected parser, or {@code null} if {@code str} is
     *         {@code null} or blank or the type is not one of the above
     */
    public static Transcript parse(String str, String type) {
        // isBlank is null-safe, so it covers the null case as well.
        if (StringUtils.isBlank(str)) {
            return null;
        }

        final boolean jsonType = "application/json".equals(type);
        if (jsonType) {
            return JsonTranscriptParser.parse(str);
        }

        final boolean srtType = "application/srt".equals(type)
                || "application/srr".equals(type)
                || "application/x-subrip".equals(type);
        return srtType ? SrtTranscriptParser.parse(str) : null;
    }
}

View File

@ -0,0 +1,84 @@
package de.danoeh.antennapod.parser.transcript;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.robolectric.RobolectricTestRunner;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import de.danoeh.antennapod.model.feed.Transcript;
/**
 * Tests for {@link JsonTranscriptParser} and for the JSON path of {@link TranscriptParser}.
 * Assertions use JUnit's (expected, actual) argument order so that failure messages
 * report the right values.
 */
@RunWith(RobolectricTestRunner.class)
public class JsonTranscriptParserTest {
    private static String jsonStr = "{'version': '1.0.0', "
            + "'segments': [ "
            + "{ 'speaker' : 'John Doe', 'startTime': 0.8, 'endTime': 1.9, 'body': 'And' },"
            + "{ 'speaker' : 'Sally Green', 'startTime': 1.91, 'endTime': 2.8, 'body': 'this merges' },"
            + "{ 'startTime': 2.9, 'endTime': 3.4, 'body': 'the' },"
            + "{ 'startTime': 3.5, 'endTime': 3.6, 'body': 'person' }]}";

    @Test
    public void testParseJson() {
        Transcript result = JsonTranscriptParser.parse(jsonStr);

        // Nothing starts at or before 0 ms
        assertNull(result.getSegmentAtTime(0L));
        assertEquals("John Doe", result.getSegmentAtTime(800L).getSpeaker());
        assertEquals(800L, result.getSegmentAtTime(800L).getStartTime());
        assertEquals(1900L, result.getSegmentAtTime(800L).getEndTime());
        assertEquals(1910L, (long) result.getEntryAfterTime(1800L).getKey());
        // 2 segments get merged into at least 1 second
        assertEquals("this merges the", result.getEntryAfterTime(1800L).getValue().getWords());
    }

    @Test
    public void testParse() {
        String type = "application/json";
        Transcript result = TranscriptParser.parse(jsonStr, type);
        // There isn't a segment at 900L, so go backwards and get the segment at 800L
        assertEquals("John Doe", result.getSegmentAtTime(900L).getSpeaker());
        assertEquals("And", result.getSegmentAtTime(930L).getWords());

        // blank string
        String blankStr = "";
        result = TranscriptParser.parse(blankStr, type);
        assertNull(result);

        result = TranscriptParser.parse(null, type);
        assertNull(result);

        // All blank lines
        String allNewlinesStr = "\r\n\r\n\r\n\r\n";
        result = TranscriptParser.parse(allNewlinesStr, type);
        assertNull(result);

        // segments is missing
        String jsonStrBad1 = "{'version': '1.0.0', "
                + "'segmentsX': [ "
                + "{ 'speaker' : 'John Doe', 'startTime': 0.8, 'endTime': 1.9, 'body': 'And' },"
                + "{ 'startTime': 2.9, 'endTime': 3.4, 'body': 'the' },"
                + "{ 'startTime': 3.5, 'endTime': 3.6, 'body': 'person' }]}";
        result = TranscriptParser.parse(jsonStrBad1, type);
        assertNull(result);

        // invalid time formatting
        String jsonStrBad2 = "{'version': '1.0.0', "
                + "'segments': [ "
                + "{ 'speaker' : 'XJohn Doe', 'startTime': stringTime, 'endTime': stringTime, 'body': 'And' },"
                + "{ 'XstartTime': 2.9, 'XendTime': 3.4, 'body': 'the' },"
                + "{ 'startTime': '-2.9', 'endTime': '-3.4', 'body': 'the' },"
                + "{ 'startTime': 'bad_time', 'endTime': '-3.4', 'body': 'the' }]}";
        result = TranscriptParser.parse(jsonStrBad2, type);
        assertNull(result);

        // Just plain text
        String strBad3 = "John Doe: Promoting your podcast in a new\n\n"
                + "way. The latest from PogNews.";
        result = TranscriptParser.parse(strBad3, type);
        assertNull(result);

        // passing the wrong type
        type = "application/srt";
        result = TranscriptParser.parse(jsonStr, type);
        assertNull(result);
    }
}

View File

@ -0,0 +1,93 @@
package de.danoeh.antennapod.parser.transcript;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.robolectric.RobolectricTestRunner;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import de.danoeh.antennapod.model.feed.Transcript;
/**
 * Tests for {@link SrtTranscriptParser} and for the SRT path of {@link TranscriptParser}.
 * Assertions use JUnit's (expected, actual) argument order so that failure messages
 * report the right values.
 */
@RunWith(RobolectricTestRunner.class)
public class SrtTranscriptParserTest {
    private static String srtStr = "1\n"
            + "00:00:00,000 --> 00:00:02,730\n"
            + "John Doe: Promoting your podcast in a new\n\n"
            + "2\n"
            + "00:00:02,730 --> 00:00:04,600\n"
            + "way. The latest from PogNews.\n\n"
            + "00:00:04,730 --> 00:00:05,600\n"
            + "way. The latest from PogNews.";

    @Test
    public void testParseSrt() {
        Transcript result = SrtTranscriptParser.parse(srtStr);

        assertEquals("Promoting your podcast in a new", result.getSegmentAtTime(0L).getWords());
        assertEquals("John Doe", result.getSegmentAtTime(0L).getSpeaker());
        assertEquals(0L, result.getSegmentAtTime(0L).getStartTime());
        assertEquals(2730L, result.getSegmentAtTime(0L).getEndTime());
        assertEquals(2730L, (long) result.getEntryAfterTime(1000L).getKey());
        assertEquals("way. The latest from PogNews.", result.getEntryAfterTime(1000L).getValue().getWords());
    }

    @Test
    public void testParse() {
        String type = "application/srr";
        Transcript result;

        result = TranscriptParser.parse(srtStr, type);
        // There isn't a segment at 800L, so go backwards and get the segment at 0L
        assertEquals("Promoting your podcast in a new", result.getSegmentAtTime(800L).getWords());

        result = TranscriptParser.parse(null, type);
        assertNull(result);

        // blank string
        String blankStr = "";
        result = TranscriptParser.parse(blankStr, type);
        assertNull(result);

        // All empty lines
        String allNewlinesStr = "\r\n\r\n\r\n\r\n";
        result = TranscriptParser.parse(allNewlinesStr, type);
        assertNull(result);

        // first segment has invalid time formatting, so the entire segment will be thrown out
        String srtStrBad1 = "00:0000,000 --> 00:00:02,730\n"
                + "John Doe: Promoting your podcast in a new\n\n"
                + "2\n"
                + "00:00:02,730 --> 00:00:04,600\n"
                + "way. The latest from PogNews.";
        result = TranscriptParser.parse(srtStrBad1, type);
        assertEquals("way. The latest from PogNews.", result.getSegmentAtTime(2730L).getWords());

        // first segment has invalid time in end time, 2nd segment has invalid time in both start time and end time
        String srtStrBad2 = "00:00:00,000 --> 00:0002,730\n"
                + "Jane Doe: Promoting your podcast in a new\n\n"
                + "2\n"
                + "badstarttime --> badendtime\n"
                + "way. The latest from PogNews.\n"
                + "badstarttime -->\n"
                + "Jane Doe says something\n"
                + "00:00:00,000 --> 00:00:02,730\n"
                + "Jane Doe:";
        result = TranscriptParser.parse(srtStrBad2, type);
        assertNull(result);

        // Just plain text
        String strBad3 = "John Doe: Promoting your podcast in a new\n\n"
                + "way. The latest from PogNews.";
        result = TranscriptParser.parse(strBad3, type);
        assertNull(result);

        // passing the wrong type
        type = "application/json";
        result = TranscriptParser.parse(srtStr, type);
        assertNull(result);

        type = "unknown";
        result = TranscriptParser.parse(srtStr, type);
        assertNull(result);
    }
}

View File

@ -30,6 +30,7 @@ include ':net:sync:service'
include ':parser:feed'
include ':parser:media'
include ':parser:transcript'
include ':playback:base'
include ':playback:cast'