From 42ec6f0810332904fbb22d1d3dd87a8ce442c655 Mon Sep 17 00:00:00 2001 From: kapodamy Date: Tue, 14 Jan 2020 00:00:45 -0300 Subject: [PATCH 1/4] ttml to srt conversion rewrite SubtitleConverter (use JSoup library instead, remove unused methods) --- .../newpipe/download/DownloadDialog.java | 1 - .../newpipe/streams/SrtFromTtmlWriter.java | 95 +++++ .../newpipe/streams/SubtitleConverter.java | 369 ------------------ .../giga/postprocessing/TtmlConverter.java | 32 +- 4 files changed, 100 insertions(+), 397 deletions(-) create mode 100644 app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java delete mode 100644 app/src/main/java/org/schabi/newpipe/streams/SubtitleConverter.java diff --git a/app/src/main/java/org/schabi/newpipe/download/DownloadDialog.java b/app/src/main/java/org/schabi/newpipe/download/DownloadDialog.java index 853bb6d68..44966744b 100644 --- a/app/src/main/java/org/schabi/newpipe/download/DownloadDialog.java +++ b/app/src/main/java/org/schabi/newpipe/download/DownloadDialog.java @@ -832,7 +832,6 @@ public class DownloadDialog extends DialogFragment implements RadioGroup.OnCheck psArgs = new String[]{ selectedStream.getFormat().getSuffix(), "false",// ignore empty frames - "false",// detect youtube duplicate lines }; } break; diff --git a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java new file mode 100644 index 000000000..75e16edad --- /dev/null +++ b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java @@ -0,0 +1,95 @@ +package org.schabi.newpipe.streams; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; +import org.jsoup.parser.Parser; +import org.jsoup.select.Elements; +import org.schabi.newpipe.streams.io.SharpStream; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.nio.charset.Charset; +import java.text.ParseException; + +/** + * @author kapodamy + */ +public class SrtFromTtmlWriter { + private static final String NEW_LINE = "\r\n"; + + private SharpStream out; + private boolean ignoreEmptyFrames; + private final Charset charset = Charset.forName("utf-8"); + + private int frameIndex = 0; + + public SrtFromTtmlWriter(SharpStream out, boolean ignoreEmptyFrames) { + this.out = out; + this.ignoreEmptyFrames = true; + } + + private static String getTimestamp(Element frame, String attr) { + return frame + .attr(attr) + .replace('.', ',');// Str uses comma as decimal separator + } + + private void writeFrame(String begin, String end, StringBuilder text) throws IOException { + writeString(String.valueOf(frameIndex++)); + writeString(NEW_LINE); + writeString(begin); + writeString(" --> "); + writeString(end); + writeString(NEW_LINE); + writeString(text.toString()); + writeString(NEW_LINE); + writeString(NEW_LINE); + } + + private void writeString(String text) throws IOException { + out.write(text.getBytes(charset)); + } + + public void build(SharpStream ttml) throws IOException { + /* + * TTML parser with BASIC support + * multiple CUE is not supported + * styling is not supported + * tag timestamps (in auto-generated subtitles) are not supported, maybe in the future + * also TimestampTagOption enum is not applicable + * Language parsing is not supported + */ + + // parse XML + byte[] buffer = new byte[(int) ttml.available()]; + ttml.read(buffer); + Document doc = Jsoup.parse(new ByteArrayInputStream(buffer), "UTF-8", "", Parser.xmlParser()); + + StringBuilder text = new StringBuilder(128); + Elements paragraph_list = doc.select("body>div>p"); + + // check if has frames + if (paragraph_list.size() < 1) return; + + for (Element paragraph : paragraph_list) { + text.setLength(0); + + for (Node children : paragraph.childNodes()) { + if (children instanceof TextNode) + text.append(((TextNode) children).text()); + else if (children instanceof Element && ((Element) children).tagName().equalsIgnoreCase("br")) + text.append(NEW_LINE); + } + + if (ignoreEmptyFrames && text.length() < 1) continue; + + String begin = getTimestamp(paragraph, "begin"); + String end = getTimestamp(paragraph, "end"); + + writeFrame(begin, end, text); + } + } +} diff --git a/app/src/main/java/org/schabi/newpipe/streams/SubtitleConverter.java b/app/src/main/java/org/schabi/newpipe/streams/SubtitleConverter.java deleted file mode 100644 index c41db4373..000000000 --- a/app/src/main/java/org/schabi/newpipe/streams/SubtitleConverter.java +++ /dev/null @@ -1,369 +0,0 @@ -package org.schabi.newpipe.streams; - -import org.schabi.newpipe.streams.io.SharpStream; -import org.w3c.dom.Document; -import org.w3c.dom.Element; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; -import org.xml.sax.SAXException; - -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.nio.charset.Charset; -import java.text.ParseException; -import java.util.Locale; - -import javax.xml.parsers.DocumentBuilder; -import javax.xml.parsers.DocumentBuilderFactory; -import javax.xml.parsers.ParserConfigurationException; -import javax.xml.xpath.XPathExpressionException; - -/** - * @author kapodamy - */ -public class SubtitleConverter { - private static final String NEW_LINE = "\r\n"; - - public void dumpTTML(SharpStream in, final SharpStream out, final boolean ignoreEmptyFrames, final boolean detectYoutubeDuplicateLines - ) throws IOException, ParseException, SAXException, ParserConfigurationException, XPathExpressionException { - - final FrameWriter callback = new FrameWriter() { - int frameIndex = 0; - final Charset charset = Charset.forName("utf-8"); - - @Override - public void yield(SubtitleFrame frame) throws IOException { - if (ignoreEmptyFrames && frame.isEmptyText()) { - return; - } - out.write(String.valueOf(frameIndex++).getBytes(charset)); - out.write(NEW_LINE.getBytes(charset)); - out.write(getTime(frame.start, true).getBytes(charset)); - out.write(" --> ".getBytes(charset)); - out.write(getTime(frame.end, true).getBytes(charset)); - out.write(NEW_LINE.getBytes(charset)); - out.write(frame.text.getBytes(charset)); - out.write(NEW_LINE.getBytes(charset)); - out.write(NEW_LINE.getBytes(charset)); - } - }; - - read_xml_based(in, callback, detectYoutubeDuplicateLines, - "tt", "xmlns", "http://www.w3.org/ns/ttml", - new String[]{"timedtext", "head", "wp"}, - new String[]{"body", "div", "p"}, - "begin", "end", true - ); - } - - private void read_xml_based(SharpStream source, FrameWriter callback, boolean detectYoutubeDuplicateLines, - String root, String formatAttr, String formatVersion, String[] cuePath, String[] framePath, - String timeAttr, String durationAttr, boolean hasTimestamp - ) throws IOException, ParseException, SAXException, ParserConfigurationException, XPathExpressionException { - /* - * XML based subtitles parser with BASIC support - * multiple CUE is not supported - * styling is not supported - * tag timestamps (in auto-generated subtitles) are not supported, maybe in the future - * also TimestampTagOption enum is not applicable - * Language parsing is not supported - */ - - byte[] buffer = new byte[(int) source.available()]; - source.read(buffer); - - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); - factory.setNamespaceAware(true); - DocumentBuilder builder = factory.newDocumentBuilder(); - Document xml = builder.parse(new ByteArrayInputStream(buffer)); - - String attr; - - // get the format version or namespace - Element node = xml.getDocumentElement(); - - if (node == null) { - throw new ParseException("Can't get the format version. ¿wrong namespace?", -1); - } else if (!node.getNodeName().equals(root)) { - throw new ParseException("Invalid root", -1); - } - - if (formatAttr.equals("xmlns")) { - if (!node.getNamespaceURI().equals(formatVersion)) { - throw new UnsupportedOperationException("Expected xml namespace: " + formatVersion); - } - } else { - attr = node.getAttributeNS(formatVersion, formatAttr); - if (attr == null) { - throw new ParseException("Can't get the format attribute", -1); - } - if (!attr.equals(formatVersion)) { - throw new ParseException("Invalid format version : " + attr, -1); - } - } - - NodeList node_list; - - int line_break = 0;// Maximum characters per line if present (valid for TranScript v3) - - if (!hasTimestamp) { - node_list = selectNodes(xml, cuePath, formatVersion); - - if (node_list != null) { - // if the subtitle has multiple CUEs, use the highest value - for (int i = 0; i < node_list.getLength(); i++) { - try { - int tmp = Integer.parseInt(((Element) node_list.item(i)).getAttributeNS(formatVersion, "ah")); - if (tmp > line_break) { - line_break = tmp; - } - } catch (Exception err) { - } - } - } - } - - // parse every frame - node_list = selectNodes(xml, framePath, formatVersion); - - if (node_list == null) { - return;// no frames detected - } - - int fs_ff = -1;// first timestamp of first frame - boolean limit_lines = false; - - for (int i = 0; i < node_list.getLength(); i++) { - Element elem = (Element) node_list.item(i); - SubtitleFrame obj = new SubtitleFrame(); - obj.text = elem.getTextContent(); - - attr = elem.getAttribute(timeAttr);// ¡this cant be null! - obj.start = hasTimestamp ? parseTimestamp(attr) : Integer.parseInt(attr); - - attr = elem.getAttribute(durationAttr); - if (obj.text == null || attr == null) { - continue;// normally is a blank line (on auto-generated subtitles) ignore - } - - if (hasTimestamp) { - obj.end = parseTimestamp(attr); - - if (detectYoutubeDuplicateLines) { - if (limit_lines) { - int swap = obj.end; - obj.end = fs_ff; - fs_ff = swap; - } else { - if (fs_ff < 0) { - fs_ff = obj.end; - } else { - if (fs_ff < obj.start) { - limit_lines = true;// the subtitles has duplicated lines - } else { - detectYoutubeDuplicateLines = false; - } - } - } - } - } else { - obj.end = obj.start + Integer.parseInt(attr); - } - - if (/*node.getAttribute("w").equals("1") &&*/line_break > 1 && obj.text.length() > line_break) { - - // implement auto line breaking (once) - StringBuilder text = new StringBuilder(obj.text); - obj.text = null; - - switch (text.charAt(line_break)) { - case ' ': - case '\t': - putBreakAt(line_break, text); - break; - default:// find the word start position - for (int j = line_break - 1; j > 0; j--) { - switch (text.charAt(j)) { - case ' ': - case '\t': - putBreakAt(j, text); - j = -1; - break; - case '\r': - case '\n': - j = -1;// long word, just ignore - break; - } - } - break; - } - - obj.text = text.toString();// set the processed text - } - - callback.yield(obj); - } - } - - private static NodeList selectNodes(Document xml, String[] path, String namespaceUri) { - Element ref = xml.getDocumentElement(); - - for (int i = 0; i < path.length - 1; i++) { - NodeList nodes = ref.getChildNodes(); - if (nodes.getLength() < 1) { - return null; - } - - Element elem; - for (int j = 0; j < nodes.getLength(); j++) { - if (nodes.item(j).getNodeType() == Node.ELEMENT_NODE) { - elem = (Element) nodes.item(j); - if (elem.getNodeName().equals(path[i]) && elem.getNamespaceURI().equals(namespaceUri)) { - ref = elem; - break; - } - } - } - } - - return ref.getElementsByTagNameNS(namespaceUri, path[path.length - 1]); - } - - private static int parseTimestamp(String multiImpl) throws NumberFormatException, ParseException { - if (multiImpl.length() < 1) { - return 0; - } else if (multiImpl.length() == 1) { - return Integer.parseInt(multiImpl) * 1000;// ¡this must be a number in seconds! - } - - // detect wallclock-time - if (multiImpl.startsWith("wallclock(")) { - throw new UnsupportedOperationException("Parsing wallclock timestamp is not implemented"); - } - - // detect offset-time - if (multiImpl.indexOf(':') < 0) { - int multiplier = 1000; - char metric = multiImpl.charAt(multiImpl.length() - 1); - switch (metric) { - case 'h': - multiplier *= 3600000; - break; - case 'm': - multiplier *= 60000; - break; - case 's': - if (multiImpl.charAt(multiImpl.length() - 2) == 'm') { - multiplier = 1;// ms - } - break; - default: - if (!Character.isDigit(metric)) { - throw new NumberFormatException("Invalid metric suffix found on : " + multiImpl); - } - metric = '\0'; - break; - } - try { - String offset_time = multiImpl; - - if (multiplier == 1) { - offset_time = offset_time.substring(0, offset_time.length() - 2); - } else if (metric != '\0') { - offset_time = offset_time.substring(0, offset_time.length() - 1); - } - - double time_metric_based = Double.parseDouble(offset_time); - if (Math.abs(time_metric_based) <= Double.MAX_VALUE) { - return (int) (time_metric_based * multiplier); - } - } catch (Exception err) { - throw new UnsupportedOperationException("Invalid or not implemented timestamp on: " + multiImpl); - } - } - - // detect clock-time - int time = 0; - String[] units = multiImpl.split(":"); - - if (units.length < 3) { - throw new ParseException("Invalid clock-time timestamp", -1); - } - - time += Integer.parseInt(units[0]) * 3600000;// hours - time += Integer.parseInt(units[1]) * 60000;//minutes - time += Float.parseFloat(units[2]) * 1000f;// seconds and milliseconds (if present) - - // frames and sub-frames are ignored (not implemented) - // time += units[3] * fps; - return time; - } - - private static void putBreakAt(int idx, StringBuilder str) { - // this should be optimized at compile time - - if (NEW_LINE.length() > 1) { - str.delete(idx, idx + 1);// remove after replace - str.insert(idx, NEW_LINE); - } else { - str.setCharAt(idx, NEW_LINE.charAt(0)); - } - } - - private static String getTime(int time, boolean comma) { - // cast every value to integer to avoid auto-round in ToString("00"). - StringBuilder str = new StringBuilder(12); - str.append(numberToString(time / 1000 / 3600, 2));// hours - str.append(':'); - str.append(numberToString(time / 1000 / 60 % 60, 2));// minutes - str.append(':'); - str.append(numberToString(time / 1000 % 60, 2));// seconds - str.append(comma ? ',' : '.'); - str.append(numberToString(time % 1000, 3));// miliseconds - - return str.toString(); - } - - private static String numberToString(int nro, int pad) { - return String.format(Locale.ENGLISH, "%0".concat(String.valueOf(pad)).concat("d"), nro); - } - - - /****************** - * helper classes * - ******************/ - - private interface FrameWriter { - - void yield(SubtitleFrame frame) throws IOException; - } - - private static class SubtitleFrame { - //Java no support unsigned int - - public int end; - public int start; - public String text = ""; - - private boolean isEmptyText() { - if (text == null) { - return true; - } - - for (int i = 0; i < text.length(); i++) { - switch (text.charAt(i)) { - case ' ': - case '\t': - case '\r': - case '\n': - break; - default: - return false; - } - } - - return true; - } - } - -} diff --git a/app/src/main/java/us/shandian/giga/postprocessing/TtmlConverter.java b/app/src/main/java/us/shandian/giga/postprocessing/TtmlConverter.java index 5a5b687f7..8ed0dfae5 100644 --- a/app/src/main/java/us/shandian/giga/postprocessing/TtmlConverter.java +++ b/app/src/main/java/us/shandian/giga/postprocessing/TtmlConverter.java @@ -2,15 +2,10 @@ package us.shandian.giga.postprocessing; import android.util.Log; -import org.schabi.newpipe.streams.SubtitleConverter; +import org.schabi.newpipe.streams.SrtFromTtmlWriter; import org.schabi.newpipe.streams.io.SharpStream; -import org.xml.sax.SAXException; import java.io.IOException; -import java.text.ParseException; - -import javax.xml.parsers.ParserConfigurationException; -import javax.xml.xpath.XPathExpressionException; /** * @author kapodamy @@ -27,33 +22,16 @@ class TtmlConverter extends Postprocessing { int process(SharpStream out, SharpStream... sources) throws IOException { // check if the subtitle is already in srt and copy, this should never happen String format = getArgumentAt(0, null); + boolean ignoreEmptyFrames = getArgumentAt(1, "true").equals("true"); if (format == null || format.equals("ttml")) { - SubtitleConverter ttmlDumper = new SubtitleConverter(); + SrtFromTtmlWriter writer = new SrtFromTtmlWriter(out, ignoreEmptyFrames); try { - ttmlDumper.dumpTTML( - sources[0], - out, - getArgumentAt(1, "true").equals("true"), - getArgumentAt(2, "true").equals("true") - ); + writer.build(sources[0]); } catch (Exception err) { Log.e(TAG, "subtitle parse failed", err); - - if (err instanceof IOException) { - return 1; - } else if (err instanceof ParseException) { - return 2; - } else if (err instanceof SAXException) { - return 3; - } else if (err instanceof ParserConfigurationException) { - return 4; - } else if (err instanceof XPathExpressionException) { - return 7; - } - - return 8; + return err instanceof IOException ? 1 : 8; } return OK_RESULT; From 845767e2f8f62dc34452e54723a2dd3670c93ff7 Mon Sep 17 00:00:00 2001 From: kapodamy Date: Sat, 18 Jan 2020 00:43:38 -0300 Subject: [PATCH 2/4] StandardCharsets.UTF_8 instead of Charset.forName("utf-8") --- .../main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java index 75e16edad..696f24d05 100644 --- a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java +++ b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java @@ -22,7 +22,7 @@ public class SrtFromTtmlWriter { private SharpStream out; private boolean ignoreEmptyFrames; - private final Charset charset = Charset.forName("utf-8"); + private final Charset charset = StandardCharsets.UTF_8; private int frameIndex = 0; From a2d3e2c7e0799fbc9fba717d7cc586dbf63ac241 Mon Sep 17 00:00:00 2001 From: kapodamy Date: Sat, 18 Jan 2020 01:10:25 -0300 Subject: [PATCH 3/4] 2 typo fixup * add missing namespace of StandardCharsets * use an unused constructor argument --- .../java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java index 696f24d05..e20b06352 100644 --- a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java +++ b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java @@ -12,7 +12,7 @@ import org.schabi.newpipe.streams.io.SharpStream; import java.io.ByteArrayInputStream; import java.io.IOException; import java.nio.charset.Charset; -import java.text.ParseException; +import java.nio.charset.StandardCharsets; /** * @author kapodamy @@ -28,7 +28,7 @@ public class SrtFromTtmlWriter { public SrtFromTtmlWriter(SharpStream out, boolean ignoreEmptyFrames) { this.out = out; - this.ignoreEmptyFrames = true; + this.ignoreEmptyFrames = ignoreEmptyFrames; } private static String getTimestamp(Element frame, String attr) { From afc362d2b613773df1896dbb1a3f6179f2d7beb6 Mon Sep 17 00:00:00 2001 From: kapodamy Date: Mon, 20 Jan 2020 23:33:30 -0300 Subject: [PATCH 4/4] readability changes --- .../java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java index e20b06352..6f1cceeed 100644 --- a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java +++ b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java @@ -34,7 +34,7 @@ public class SrtFromTtmlWriter { private static String getTimestamp(Element frame, String attr) { return frame .attr(attr) - .replace('.', ',');// Str uses comma as decimal separator + .replace('.', ',');// SRT subtitles uses comma as decimal separator } private void writeFrame(String begin, String end, StringBuilder text) throws IOException { @@ -69,7 +69,7 @@ public class SrtFromTtmlWriter { Document doc = Jsoup.parse(new ByteArrayInputStream(buffer), "UTF-8", "", Parser.xmlParser()); StringBuilder text = new StringBuilder(128); - Elements paragraph_list = doc.select("body>div>p"); + Elements paragraph_list = doc.select("body > div > p"); // check if has frames if (paragraph_list.size() < 1) return;