ttml to srt conversion

rewrite SubtitleConverter (use JSoup library instead, remove unused methods)
2020-01-14 00:00:45 -03:00 · 2020-01-14 00:00:45 -03:00 · 42ec6f0810
parent 9f47a274a8
commit 42ec6f0810
4 changed files with 100 additions and 397 deletions
--- a/app/src/main/java/org/schabi/newpipe/download/DownloadDialog.java
+++ b/app/src/main/java/org/schabi/newpipe/download/DownloadDialog.java
@ -832,7 +832,6 @@ public class DownloadDialog extends DialogFragment implements RadioGroup.OnCheck
                    psArgs = new String[]{
                            selectedStream.getFormat().getSuffix(),
                            "false",// ignore empty frames
-                            "false",// detect youtube duplicate lines
                    };
                }
                break;
--- a/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java
+++ b/app/src/main/java/org/schabi/newpipe/streams/SrtFromTtmlWriter.java
@ -0,0 +1,95 @@
+package org.schabi.newpipe.streams;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.nodes.TextNode;
+import org.jsoup.parser.Parser;
+import org.jsoup.select.Elements;
+import org.schabi.newpipe.streams.io.SharpStream;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.text.ParseException;
+
+/**
+ * Converts a TTML subtitle document into SubRip (SRT) text and writes the
+ * result to a {@link SharpStream}.
+ * <p>
+ * Only {@code <p>} elements matched by the selector {@code body>div>p} are
+ * converted. Inside a paragraph, text nodes are copied and {@code <br>}
+ * elements become line breaks; any other child element is skipped.
+ *
+ * @author kapodamy
+ */
+public class SrtFromTtmlWriter {
+    private static final String NEW_LINE = "\r\n";
+
+    private final SharpStream out;
+    private final boolean ignoreEmptyFrames;
+    private final Charset charset = Charset.forName("utf-8");
+
+    // NOTE(review): numbering starts at 0; SRT files conventionally start at 1 — confirm
+    private int frameIndex = 0;
+
+    /**
+     * Creates a writer that emits SRT frames to the given stream.
+     *
+     * @param out               destination of the generated SRT bytes
+     * @param ignoreEmptyFrames {@code true} to skip paragraphs that produce no text,
+     *                          {@code false} to write them as frames with an empty body
+     */
+    public SrtFromTtmlWriter(SharpStream out, boolean ignoreEmptyFrames) {
+        this.out = out;
+        // honour the caller's choice (this was previously hard-coded to true)
+        this.ignoreEmptyFrames = ignoreEmptyFrames;
+    }
+
+    /**
+     * Reads a timestamp attribute from a TTML paragraph in the form SRT expects.
+     *
+     * @param frame the paragraph element
+     * @param attr  name of the attribute holding the timestamp
+     * @return the attribute value with every '.' replaced by ','
+     */
+    private static String getTimestamp(Element frame, String attr) {
+        return frame
+                .attr(attr)
+                .replace('.', ',');// SRT uses comma as decimal separator
+    }
+
+    /**
+     * Writes one SRT frame: the counter line, the time range line, the text,
+     * and a blank separator line.
+     *
+     * @param begin start timestamp, already in SRT notation
+     * @param end   end timestamp, already in SRT notation
+     * @param text  frame text
+     * @throws IOException if writing to the output stream fails
+     */
+    private void writeFrame(String begin, String end, StringBuilder text) throws IOException {
+        writeString(String.valueOf(frameIndex++));
+        writeString(NEW_LINE);
+        writeString(begin);
+        writeString(" --> ");
+        writeString(end);
+        writeString(NEW_LINE);
+        writeString(text.toString());
+        writeString(NEW_LINE);
+        writeString(NEW_LINE);
+    }
+
+    /**
+     * Encodes the text with the writer's charset and writes it to the output stream.
+     *
+     * @param text the text to write
+     * @throws IOException if writing to the output stream fails
+     */
+    private void writeString(String text) throws IOException {
+        out.write(text.getBytes(charset));
+    }
+
+    /**
+     * Parses the given TTML stream and writes every converted frame to the output.
+     *
+     * @param ttml source stream containing the TTML document
+     * @throws IOException if reading the source or writing the output fails
+     */
+    public void build(SharpStream ttml) throws IOException {
+        /*
+         * TTML parser with BASIC support
+         * multiple CUE is not supported
+         * styling is not supported
+         * tag timestamps (in auto-generated subtitles) are not supported, maybe in the future
+         * also TimestampTagOption enum is not applicable
+         * Language parsing is not supported
+         */
+
+        // load the whole document into memory and parse it as XML
+        // NOTE(review): a single read() is assumed to fill the buffer — confirm SharpStream's contract
+        byte[] buffer = new byte[(int) ttml.available()];
+        ttml.read(buffer);
+        Document doc = Jsoup.parse(new ByteArrayInputStream(buffer), "UTF-8", "", Parser.xmlParser());
+
+        Elements paragraphs = doc.select("body>div>p");
+
+        // nothing to convert when the document has no frames
+        if (paragraphs.size() < 1) {
+            return;
+        }
+
+        // one builder is reused for every frame
+        StringBuilder text = new StringBuilder(128);
+
+        for (Element paragraph : paragraphs) {
+            text.setLength(0);
+
+            for (Node child : paragraph.childNodes()) {
+                if (child instanceof TextNode) {
+                    text.append(((TextNode) child).text());
+                } else if (child instanceof Element
+                        && ((Element) child).tagName().equalsIgnoreCase("br")) {
+                    text.append(NEW_LINE);
+                }
+            }
+
+            if (ignoreEmptyFrames && text.length() < 1) {
+                continue;
+            }
+
+            String begin = getTimestamp(paragraph, "begin");
+            String end = getTimestamp(paragraph, "end");
+
+            writeFrame(begin, end, text);
+        }
+    }
+}
--- a/app/src/main/java/org/schabi/newpipe/streams/SubtitleConverter.java
+++ b/app/src/main/java/org/schabi/newpipe/streams/SubtitleConverter.java
@ -1,369 +0,0 @@
-package org.schabi.newpipe.streams;
-
-import org.schabi.newpipe.streams.io.SharpStream;
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
-import org.xml.sax.SAXException;
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.nio.charset.Charset;
-import java.text.ParseException;
-import java.util.Locale;
-
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.parsers.ParserConfigurationException;
-import javax.xml.xpath.XPathExpressionException;
-
-/**
- * @author kapodamy
- */
-public class SubtitleConverter {
-    private static final String NEW_LINE = "\r\n";
-
-    public void dumpTTML(SharpStream in, final SharpStream out, final boolean ignoreEmptyFrames, final boolean detectYoutubeDuplicateLines
-    ) throws IOException, ParseException, SAXException, ParserConfigurationException, XPathExpressionException {
-
-        final FrameWriter callback = new FrameWriter() {
-            int frameIndex = 0;
-            final Charset charset = Charset.forName("utf-8");
-
-            @Override
-            public void yield(SubtitleFrame frame) throws IOException {
-                if (ignoreEmptyFrames && frame.isEmptyText()) {
-                    return;
-                }
-                out.write(String.valueOf(frameIndex++).getBytes(charset));
-                out.write(NEW_LINE.getBytes(charset));
-                out.write(getTime(frame.start, true).getBytes(charset));
-                out.write(" --> ".getBytes(charset));
-                out.write(getTime(frame.end, true).getBytes(charset));
-                out.write(NEW_LINE.getBytes(charset));
-                out.write(frame.text.getBytes(charset));
-                out.write(NEW_LINE.getBytes(charset));
-                out.write(NEW_LINE.getBytes(charset));
-            }
-        };
-
-        read_xml_based(in, callback, detectYoutubeDuplicateLines,
-                "tt", "xmlns", "http://www.w3.org/ns/ttml",
-                new String[]{"timedtext", "head", "wp"},
-                new String[]{"body", "div", "p"},
-                "begin", "end", true
-        );
-    }
-
-    private void read_xml_based(SharpStream source, FrameWriter callback, boolean detectYoutubeDuplicateLines,
-                                String root, String formatAttr, String formatVersion, String[] cuePath, String[] framePath,
-                                String timeAttr, String durationAttr, boolean hasTimestamp
-    ) throws IOException, ParseException, SAXException, ParserConfigurationException, XPathExpressionException {
-        /*
-         * XML based subtitles parser with BASIC support
-         * multiple CUE is not supported
-         * styling is not supported
-         * tag timestamps (in auto-generated subtitles) are not supported, maybe in the future
-         * also TimestampTagOption enum is not applicable
-         * Language parsing is not supported
-         */
-
-        byte[] buffer = new byte[(int) source.available()];
-        source.read(buffer);
-
-        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
-        factory.setNamespaceAware(true);
-        DocumentBuilder builder = factory.newDocumentBuilder();
-        Document xml = builder.parse(new ByteArrayInputStream(buffer));
-
-        String attr;
-
-        // get the format version or namespace
-        Element node = xml.getDocumentElement();
-
-        if (node == null) {
-            throw new ParseException("Can't get the format version. ¿wrong namespace?", -1);
-        } else if (!node.getNodeName().equals(root)) {
-            throw new ParseException("Invalid root", -1);
-        }
-
-        if (formatAttr.equals("xmlns")) {
-            if (!node.getNamespaceURI().equals(formatVersion)) {
-                throw new UnsupportedOperationException("Expected xml namespace: " + formatVersion);
-            }
-        } else {
-            attr = node.getAttributeNS(formatVersion, formatAttr);
-            if (attr == null) {
-                throw new ParseException("Can't get the format attribute", -1);
-            }
-            if (!attr.equals(formatVersion)) {
-                throw new ParseException("Invalid format version : " + attr, -1);
-            }
-        }
-
-        NodeList node_list;
-
-        int line_break = 0;// Maximum characters per line if present (valid for TranScript v3)
-
-        if (!hasTimestamp) {
-            node_list = selectNodes(xml, cuePath, formatVersion);
-
-            if (node_list != null) {
-                // if the subtitle has multiple CUEs, use the highest value
-                for (int i = 0; i < node_list.getLength(); i++) {
-                    try {
-                        int tmp = Integer.parseInt(((Element) node_list.item(i)).getAttributeNS(formatVersion, "ah"));
-                        if (tmp > line_break) {
-                            line_break = tmp;
-                        }
-                    } catch (Exception err) {
-                    }
-                }
-            }
-        }
-
-        // parse every frame
-        node_list = selectNodes(xml, framePath, formatVersion);
-
-        if (node_list == null) {
-            return;// no frames detected
-        }
-
-        int fs_ff = -1;// first timestamp of first frame
-        boolean limit_lines = false;
-
-        for (int i = 0; i < node_list.getLength(); i++) {
-            Element elem = (Element) node_list.item(i);
-            SubtitleFrame obj = new SubtitleFrame();
-            obj.text = elem.getTextContent();
-
-            attr = elem.getAttribute(timeAttr);// ¡this cant be null!
-            obj.start = hasTimestamp ? parseTimestamp(attr) : Integer.parseInt(attr);
-
-            attr = elem.getAttribute(durationAttr);
-            if (obj.text == null || attr == null) {
-                continue;// normally is a blank line (on auto-generated subtitles) ignore
-            }
-
-            if (hasTimestamp) {
-                obj.end = parseTimestamp(attr);
-
-                if (detectYoutubeDuplicateLines) {
-                    if (limit_lines) {
-                        int swap = obj.end;
-                        obj.end = fs_ff;
-                        fs_ff = swap;
-                    } else {
-                        if (fs_ff < 0) {
-                            fs_ff = obj.end;
-                        } else {
-                            if (fs_ff < obj.start) {
-                                limit_lines = true;// the subtitles has duplicated lines
-                            } else {
-                                detectYoutubeDuplicateLines = false;
-                            }
-                        }
-                    }
-                }
-            } else {
-                obj.end = obj.start + Integer.parseInt(attr);
-            }
-
-            if (/*node.getAttribute("w").equals("1") &&*/line_break > 1 && obj.text.length() > line_break) {
-
-                // implement auto line breaking (once)
-                StringBuilder text = new StringBuilder(obj.text);
-                obj.text = null;
-
-                switch (text.charAt(line_break)) {
-                    case ' ':
-                    case '\t':
-                        putBreakAt(line_break, text);
-                        break;
-                    default:// find the word start position
-                        for (int j = line_break - 1; j > 0; j--) {
-                            switch (text.charAt(j)) {
-                                case ' ':
-                                case '\t':
-                                    putBreakAt(j, text);
-                                    j = -1;
-                                    break;
-                                case '\r':
-                                case '\n':
-                                    j = -1;// long word, just ignore
-                                    break;
-                            }
-                        }
-                        break;
-                }
-
-                obj.text = text.toString();// set the processed text
-            }
-
-            callback.yield(obj);
-        }
-    }
-
-    private static NodeList selectNodes(Document xml, String[] path, String namespaceUri) {
-        Element ref = xml.getDocumentElement();
-
-        for (int i = 0; i < path.length - 1; i++) {
-            NodeList nodes = ref.getChildNodes();
-            if (nodes.getLength() < 1) {
-                return null;
-            }
-
-            Element elem;
-            for (int j = 0; j < nodes.getLength(); j++) {
-                if (nodes.item(j).getNodeType() == Node.ELEMENT_NODE) {
-                    elem = (Element) nodes.item(j);
-                    if (elem.getNodeName().equals(path[i]) && elem.getNamespaceURI().equals(namespaceUri)) {
-                        ref = elem;
-                        break;
-                    }
-                }
-            }
-        }
-
-        return ref.getElementsByTagNameNS(namespaceUri, path[path.length - 1]);
-    }
-
-    private static int parseTimestamp(String multiImpl) throws NumberFormatException, ParseException {
-        if (multiImpl.length() < 1) {
-            return 0;
-        } else if (multiImpl.length() == 1) {
-            return Integer.parseInt(multiImpl) * 1000;// ¡this must be a number in seconds!
-        }
-
-        // detect wallclock-time
-        if (multiImpl.startsWith("wallclock(")) {
-            throw new UnsupportedOperationException("Parsing wallclock timestamp is not implemented");
-        }
-
-        // detect offset-time
-        if (multiImpl.indexOf(':') < 0) {
-            int multiplier = 1000;
-            char metric = multiImpl.charAt(multiImpl.length() - 1);
-            switch (metric) {
-                case 'h':
-                    multiplier *= 3600000;
-                    break;
-                case 'm':
-                    multiplier *= 60000;
-                    break;
-                case 's':
-                    if (multiImpl.charAt(multiImpl.length() - 2) == 'm') {
-                        multiplier = 1;// ms
-                    }
-                    break;
-                default:
-                    if (!Character.isDigit(metric)) {
-                        throw new NumberFormatException("Invalid metric suffix found on : " + multiImpl);
-                    }
-                    metric = '\0';
-                    break;
-            }
-            try {
-                String offset_time = multiImpl;
-
-                if (multiplier == 1) {
-                    offset_time = offset_time.substring(0, offset_time.length() - 2);
-                } else if (metric != '\0') {
-                    offset_time = offset_time.substring(0, offset_time.length() - 1);
-                }
-
-                double time_metric_based = Double.parseDouble(offset_time);
-                if (Math.abs(time_metric_based) <= Double.MAX_VALUE) {
-                    return (int) (time_metric_based * multiplier);
-                }
-            } catch (Exception err) {
-                throw new UnsupportedOperationException("Invalid or not implemented timestamp on: " + multiImpl);
-            }
-        }
-
-        // detect clock-time
-        int time = 0;
-        String[] units = multiImpl.split(":");
-
-        if (units.length < 3) {
-            throw new ParseException("Invalid clock-time timestamp", -1);
-        }
-
-        time += Integer.parseInt(units[0]) * 3600000;// hours
-        time += Integer.parseInt(units[1]) * 60000;//minutes
-        time += Float.parseFloat(units[2]) * 1000f;// seconds and milliseconds (if present)
-
-        // frames and sub-frames are ignored (not implemented)
-        // time += units[3] * fps;
-        return time;
-    }
-
-    private static void putBreakAt(int idx, StringBuilder str) {
-        // this should be optimized at compile time
-
-        if (NEW_LINE.length() > 1) {
-            str.delete(idx, idx + 1);// remove after replace
-            str.insert(idx, NEW_LINE);
-        } else {
-            str.setCharAt(idx, NEW_LINE.charAt(0));
-        }
-    }
-
-    private static String getTime(int time, boolean comma) {
-        // cast every value to integer to avoid auto-round in ToString("00").
-        StringBuilder str = new StringBuilder(12);
-        str.append(numberToString(time / 1000 / 3600, 2));// hours
-        str.append(':');
-        str.append(numberToString(time / 1000 / 60 % 60, 2));// minutes
-        str.append(':');
-        str.append(numberToString(time / 1000 % 60, 2));// seconds
-        str.append(comma ? ',' : '.');
-        str.append(numberToString(time % 1000, 3));// miliseconds
-
-        return str.toString();
-    }
-
-    private static String numberToString(int nro, int pad) {
-        return String.format(Locale.ENGLISH, "%0".concat(String.valueOf(pad)).concat("d"), nro);
-    }
-
-
-    /******************
-     * helper classes *
-     ******************/
-
-    private interface FrameWriter {
-
-        void yield(SubtitleFrame frame) throws IOException;
-    }
-
-    private static class SubtitleFrame {
-        //Java no support unsigned int
-
-        public int end;
-        public int start;
-        public String text = "";
-
-        private boolean isEmptyText() {
-            if (text == null) {
-                return true;
-            }
-
-            for (int i = 0; i < text.length(); i++) {
-                switch (text.charAt(i)) {
-                    case ' ':
-                    case '\t':
-                    case '\r':
-                    case '\n':
-                        break;
-                    default:
-                        return false;
-                }
-            }
-
-            return true;
-        }
-    }
-
-}
--- a/app/src/main/java/us/shandian/giga/postprocessing/TtmlConverter.java
+++ b/app/src/main/java/us/shandian/giga/postprocessing/TtmlConverter.java
@ -2,15 +2,10 @@ package us.shandian.giga.postprocessing;

 import android.util.Log;

-import org.schabi.newpipe.streams.SubtitleConverter;
+import org.schabi.newpipe.streams.SrtFromTtmlWriter;
 import org.schabi.newpipe.streams.io.SharpStream;
-import org.xml.sax.SAXException;

 import java.io.IOException;
-import java.text.ParseException;
-
-import javax.xml.parsers.ParserConfigurationException;
-import javax.xml.xpath.XPathExpressionException;

 /**
 * @author kapodamy
@ -27,33 +22,16 @@ class TtmlConverter extends Postprocessing {
    int process(SharpStream out, SharpStream... sources) throws IOException {
        // check if the subtitle is already in srt and copy, this should never happen
        String format = getArgumentAt(0, null);
+        boolean ignoreEmptyFrames = getArgumentAt(1, "true").equals("true");

        if (format == null || format.equals("ttml")) {
-            SubtitleConverter ttmlDumper = new SubtitleConverter();
+            SrtFromTtmlWriter writer = new SrtFromTtmlWriter(out, ignoreEmptyFrames);

            try {
-                ttmlDumper.dumpTTML(
-                        sources[0],
-                        out,
-                        getArgumentAt(1, "true").equals("true"),
-                        getArgumentAt(2, "true").equals("true")
-                );
+                writer.build(sources[0]);
            } catch (Exception err) {
                Log.e(TAG, "subtitle parse failed", err);
-
-                if (err instanceof IOException) {
-                    return 1;
-                } else if (err instanceof ParseException) {
-                    return 2;
-                } else if (err instanceof SAXException) {
-                    return 3;
-                } else if (err instanceof ParserConfigurationException) {
-                    return 4;
-                } else if (err instanceof XPathExpressionException) {
-                    return 7;
-                }
-
-                return 8;
+                return err instanceof IOException ? 1 : 8;
            }

            return OK_RESULT;