Stream filters (first part)

2018-12-29 18:22:13 +01:00 · 2018-12-29 18:22:13 +01:00 · a9292a4c02
parent 9239d663e6
commit a9292a4c02
3 changed files with 493 additions and 2 deletions
--- a/PdfForQtLib/PdfForQtLib.pro
+++ b/PdfForQtLib/PdfForQtLib.pro
@ -44,7 +44,8 @@ SOURCES += \
    sources/pdfvisitor.cpp \
    sources/pdfencoding.cpp \
    sources/pdfcatalog.cpp \
-    sources/pdfpage.cpp
+    sources/pdfpage.cpp \
+    sources/pdfstreamfilters.cpp

 HEADERS += \
    sources/pdfobject.h \
@ -59,7 +60,8 @@ HEADERS += \
    sources/pdfencoding.h \
    sources/pdfcatalog.h \
    sources/pdfnumbertreeloader.h \
-    sources/pdfpage.h
+    sources/pdfpage.h \
+    sources/pdfstreamfilters.h

 unix {
    target.path = /usr/lib
--- a/PdfForQtLib/sources/pdfstreamfilters.cpp
+++ b/PdfForQtLib/sources/pdfstreamfilters.cpp
@ -0,0 +1,404 @@
+//    Copyright (C) 2018 Jakub Melka
+//
+//    This file is part of PdfForQt.
+//
+//    PdfForQt is free software: you can redistribute it and/or modify
+//    it under the terms of the GNU Lesser General Public License as published by
+//    the Free Software Foundation, either version 3 of the License, or
+//    (at your option) any later version.
+//
+//    PdfForQt is distributed in the hope that it will be useful,
+//    but WITHOUT ANY WARRANTY; without even the implied warranty of
+//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//    GNU Lesser General Public License for more details.
+//
+//    You should have received a copy of the GNU Lesser General Public License
+//    along with PDFForQt.  If not, see <https://www.gnu.org/licenses/>.
+
+#include "pdfstreamfilters.h"
+#include "pdfdocument.h"
+#include "pdfparser.h"
+
+#include <QtEndian>
+
+namespace pdf
+{
+
+QByteArray PDFAsciiHexDecodeFilter::apply(const QByteArray& data, const PDFDocument* document, const PDFObject& parameters) const
+{
+    Q_UNUSED(document);
+    Q_UNUSED(parameters);
+
+    const int indexOfEnd = data.indexOf('>');
+    const int size = (indexOfEnd == -1) ? data.size() : indexOfEnd;
+
+    if (size % 2 == 1)
+    {
+        // We must add trailing zero to the buffer
+        QByteArray temporaryData(data.constData(), size);
+        temporaryData.push_back('0');
+        return QByteArray::fromHex(temporaryData);
+    }
+    else if (size == data.size())
+    {
+        // We do this, because we do not want to allocate unnecessary buffer for this case.
+        // This case should be common.
+        return QByteArray::fromHex(data);
+    }
+
+    return QByteArray::fromHex(QByteArray::fromRawData(data.constData(), size));
+}
+
+QByteArray PDFAscii85DecodeFilter::apply(const QByteArray& data, const PDFDocument* document, const PDFObject& parameters) const
+{
+    Q_UNUSED(document);
+    Q_UNUSED(parameters);
+
+    const unsigned char* dataBegin = reinterpret_cast<const unsigned char*>(data.constData());
+    const unsigned char* dataEnd = reinterpret_cast<const unsigned char*>(data.constData() + data.size());
+
+    const unsigned char* it = dataBegin;
+    const constexpr uint32_t STREAM_END = 0xFFFFFFFF;
+
+    auto getChar = [&it, dataEnd, STREAM_END]() -> uint32_t
+    {
+        // Skip whitespace characters
+        while (it != dataEnd && PDFLexicalAnalyzer::isWhitespace(*it))
+        {
+            ++it;
+        }
+
+        if (it == dataEnd || (*it == '~'))
+        {
+            return STREAM_END;
+        }
+
+        return *it++;
+    };
+
+    QByteArray result;
+    result.reserve(data.size() * 4 / 5);
+
+    while (true)
+    {
+        const uint32_t scannedChar = getChar();
+        if (scannedChar == STREAM_END)
+        {
+            break;
+        }
+        else if (scannedChar == 'z')
+        {
+            result.append(4, static_cast<char>(0));
+        }
+        else
+        {
+            // Scan all 5 characters, some of then can be equal to STREAM_END constant. We will
+            // treat all these characters as last character.
+            std::array<uint32_t, 5> scannedChars;
+            scannedChars.fill(84);
+            scannedChars[0] = scannedChar - 33;
+            int validBytes = 0;
+            for (auto it = std::next(scannedChars.begin()); it != scannedChars.end(); ++it)
+            {
+                uint32_t character = getChar();
+                if (character == STREAM_END)
+                {
+                    break;
+                }
+                *it = character - 33;
+                ++validBytes;
+            }
+
+            // Decode bytes using 85 base
+            uint32_t decodedBytesPacked = 0;
+            for (const uint32_t value : scannedChars)
+            {
+                decodedBytesPacked = decodedBytesPacked * 85 + value;
+            }
+
+            // Decode bytes into byte array
+            std::array<char, 4> decodedBytesUnpacked;
+            decodedBytesUnpacked.fill(0);
+            for (auto byteIt = decodedBytesUnpacked.rbegin(); byteIt != decodedBytesUnpacked.rend(); ++byteIt)
+            {
+                *byteIt = static_cast<char>(decodedBytesPacked & 0xFF);
+                decodedBytesPacked = decodedBytesPacked >> 8;
+            }
+
+            Q_ASSERT(validBytes <= decodedBytesUnpacked.size());
+            for (int i = 0; i < validBytes; ++i)
+            {
+                result.push_back(decodedBytesUnpacked[i]);
+            }
+        }
+    }
+
+    return result;
+}
+
+class PDFLzwStreamDecoder
+{
+public:
+    explicit PDFLzwStreamDecoder(const QByteArray& inputByteArray, uint32_t early);
+
+    QByteArray decompress();
+
+private:
+    static constexpr const uint32_t CODE_TABLE_RESET = 256;
+    static constexpr const uint32_t CODE_END_OF_STREAM = 257;
+
+    // Maximal code size is 12 bits. so we can have 2^12 = 4096 items
+    // in the table (some items are unused, for example 256, 257). We also
+    // need to initialize items under code 256, because we treat them specially,
+    // they are not initialized in the decompress.
+    static constexpr const uint32_t TABLE_SIZE = 4096;
+
+    /// Clears the input data table
+    void clearTable();
+
+    /// Returns a newly scanned code
+    uint32_t getCode();
+
+    struct TableItem
+    {
+        uint32_t previous = TABLE_SIZE;
+        char character = 0;
+    };
+
+    std::array<TableItem, TABLE_SIZE> m_table;
+    std::array<char, TABLE_SIZE> m_sequence;
+
+    uint32_t m_nextCode;        ///< Next code value (to be written into the table)
+    uint32_t m_nextBits;        ///< Number of bits of the next code
+    uint32_t m_early;           ///< Early (see PDF 1.7 Specification, this constant is 0 or 1, based on the dictionary value)
+    uint32_t m_inputBuffer;     ///< Input buffer, containing bits, which were read from the input byte array
+    uint32_t m_inputBits;       ///< Number of bits in the input buffer.
+    std::array<char, TABLE_SIZE>::iterator m_currentSequenceEnd;
+    bool m_first;               ///< Are we reading from stream for first time after the reset
+    char m_newCharacter;        ///< New character to be written
+    int m_position;             ///< Position in the input array
+    const QByteArray& m_inputByteArray;
+};
+
+PDFLzwStreamDecoder::PDFLzwStreamDecoder(const QByteArray& inputByteArray, uint32_t early) :
+    m_table(),
+    m_sequence(),
+    m_nextCode(0),
+    m_nextBits(0),
+    m_early(early),
+    m_inputBuffer(0),
+    m_inputBits(0),
+    m_currentSequenceEnd(m_sequence.begin()),
+    m_first(false),
+    m_newCharacter(0),
+    m_position(0),
+    m_inputByteArray(inputByteArray)
+{
+    for (size_t i = 0; i < 256; ++i)
+    {
+        m_table[i].character = static_cast<char>(i);
+        m_table[i].previous = TABLE_SIZE;
+    }
+
+    clearTable();
+}
+
+QByteArray PDFLzwStreamDecoder::decompress()
+{
+    QByteArray result;
+
+    // Guess output byte array size - assume compress ratio is 2:1
+    result.reserve(m_inputByteArray.size() * 2);
+
+    uint32_t previousCode = TABLE_SIZE;
+    while (true)
+    {
+        const uint32_t code = getCode();
+
+        if (code == CODE_END_OF_STREAM)
+        {
+            // We are at end of stream
+            break;
+        }
+        else if (code == CODE_TABLE_RESET)
+        {
+            // Just reset the table
+            clearTable();
+            continue;
+        }
+
+        // Normal operation code
+        if (code < m_nextCode)
+        {
+            m_currentSequenceEnd = m_sequence.begin();
+
+            for (uint32_t currentCode = code; currentCode != TABLE_SIZE; currentCode = m_table[currentCode].previous)
+            {
+                *m_currentSequenceEnd++ = m_table[currentCode].character;
+            }
+
+            // We must reverse the sequence, because we stored it in the
+            // linked list, which we traversed from last to first item.
+            std::reverse(m_sequence.begin(), m_currentSequenceEnd);
+        }
+        else if (code == m_nextCode)
+        {
+            // We use the buffer from previous run, just add a new
+            // character to the end.
+            *m_currentSequenceEnd++ = m_newCharacter;
+        }
+        else
+        {
+            // Unknown code
+            throw PDFParserException(PDFTranslationContext::tr("Invalid code in the LZW stream."));
+        }
+        m_newCharacter = m_sequence.front();
+
+        if (m_first)
+        {
+            m_first = false;
+        }
+        else
+        {
+            // Add a new word in the dictionary, if we have it
+            if (m_nextCode < TABLE_SIZE)
+            {
+                m_table[m_nextCode].character = m_newCharacter;
+                m_table[m_nextCode].previous = previousCode;
+                ++m_nextCode;
+            }
+
+            // Change bit size of the code, if it is neccessary
+            switch (m_nextCode + m_early)
+            {
+                case 512:
+                    m_nextBits = 10;
+                    break;
+
+                case 1024:
+                    m_nextBits = 11;
+                    break;
+
+                case 2048:
+                    m_nextBits = 12;
+                    break;
+
+                default:
+                    break;
+            }
+        }
+
+        previousCode = code;
+
+        // Copy the input array to the buffer
+        std::copy(m_sequence.begin(), m_currentSequenceEnd, std::back_inserter(result));
+    }
+
+    result.shrink_to_fit();
+    return result;
+}
+
+void PDFLzwStreamDecoder::clearTable()
+{
+    // We do not clear the m_table array here. It is for performance reasons, we assume
+    // the input is correct. We also do not clear the sequence buffer here.
+
+    m_nextCode = 258;
+    m_nextBits = 9;
+    m_first = true;
+    m_newCharacter = 0;
+}
+
+uint32_t PDFLzwStreamDecoder::getCode()
+{
+    while (m_inputBits < m_nextBits)
+    {
+        // Did we reach end of array?
+        if (m_position == m_inputByteArray.size())
+        {
+            return CODE_END_OF_STREAM;
+        }
+
+        m_inputBuffer = (m_inputBuffer << 8) | static_cast<unsigned char>(m_inputByteArray[m_position++]);
+        m_inputBits += 8;
+    }
+
+    // We must omit bits from left (old ones) and right (newly scanned ones) and
+    // read just m_nextBits bits. Mask should omit the old ones and shift (m_inputBits - m_nextBits)
+    // should omit the new ones.
+    const uint32_t mask = ((1 << m_nextBits) - 1);
+    const uint32_t code = (m_inputBuffer >> (m_inputBits - m_nextBits)) & mask;
+    m_inputBits -= m_nextBits;
+    return code;
+}
+
+QByteArray PDFLzwDecodeFilter::apply(const QByteArray& data, const PDFDocument* document, const PDFObject& parameters) const
+{
+    uint32_t early = 1;
+
+    const PDFObject& dereferencedParameters = document->getObject(parameters);
+    if (dereferencedParameters.isDictionary())
+    {
+        const PDFDictionary* dictionary = dereferencedParameters.getDictionary();
+
+        PDFDocumentDataLoaderDecorator loader(document);
+        early = loader.readInteger(dictionary->get("EarlyChange"), 1);
+    }
+
+    PDFLzwStreamDecoder decoder(data, early);
+    return decoder.decompress();
+}
+
+QByteArray PDFFlateDecodeFilter::apply(const QByteArray& data, const PDFDocument* document, const PDFObject& parameters) const
+{
+    Q_UNUSED(document);
+    Q_UNUSED(parameters);
+
+    uint32_t size = data.size();
+
+    QByteArray dataToUncompress;
+    dataToUncompress.resize(sizeof(decltype(size)) + data.size());
+
+    qToBigEndian(size, dataToUncompress.data());
+    std::copy(data.cbegin(), data.cend(), std::next(dataToUncompress.begin(), sizeof(decltype(size))));
+
+    return qUncompress(dataToUncompress);
+}
+
+QByteArray PDFRunLengthDecodeFilter::apply(const QByteArray& data, const PDFDocument* document, const PDFObject& parameters) const
+{
+    Q_UNUSED(document);
+    Q_UNUSED(parameters);
+
+    QByteArray result;
+    result.reserve(data.size() * 2);
+
+    auto itEnd = data.cend();
+    for (auto it = data.cbegin(); it != itEnd;)
+    {
+        const unsigned char current = *it++;
+        if (current == 128)
+        {
+            // End of stream marker
+            break;
+        }
+        else if (current < 128)
+        {
+            // Copy n + 1 characters from the input array literally (and advance iterators)
+            const int count = static_cast<int>(current) + 1;
+            std::copy(it, std::next(it, count), std::back_inserter(result));
+            std::advance(it, count);
+        }
+        else if (current > 128)
+        {
+            // Copy 257 - n copies of single character
+            const int count = 257 - current;
+            const char toBeCopied = *it++;
+            std::fill_n(std::back_inserter(result), count, toBeCopied);
+        }
+    }
+
+    return result;
+}
+
+}   // namespace pdf
--- a/PdfForQtLib/sources/pdfstreamfilters.h
+++ b/PdfForQtLib/sources/pdfstreamfilters.h
@ -0,0 +1,85 @@
+//    Copyright (C) 2018 Jakub Melka
+//
+//    This file is part of PdfForQt.
+//
+//    PdfForQt is free software: you can redistribute it and/or modify
+//    it under the terms of the GNU Lesser General Public License as published by
+//    the Free Software Foundation, either version 3 of the License, or
+//    (at your option) any later version.
+//
+//    PdfForQt is distributed in the hope that it will be useful,
+//    but WITHOUT ANY WARRANTY; without even the implied warranty of
+//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//    GNU Lesser General Public License for more details.
+//
+//    You should have received a copy of the GNU Lesser General Public License
+//    along with PDFForQt.  If not, see <https://www.gnu.org/licenses/>.
+
+#ifndef PDFSTREAMFILTERS_H
+#define PDFSTREAMFILTERS_H
+
+#include "pdfobject.h"
+
+#include <QByteArray>
+
+namespace pdf
+{
+class PDFDocument;
+
+class PDFStreamFilter
+{
+public:
+    explicit PDFStreamFilter() = default;
+    virtual ~PDFStreamFilter() = default;
+
+    virtual QByteArray apply(const QByteArray& data, const PDFDocument* document, const PDFObject& parameters) const = 0;
+};
+
+class PDFAsciiHexDecodeFilter : public PDFStreamFilter
+{
+public:
+    explicit PDFAsciiHexDecodeFilter() = default;
+    virtual ~PDFAsciiHexDecodeFilter() override = default;
+
+    virtual QByteArray apply(const QByteArray& data, const PDFDocument* document, const PDFObject& parameters) const override;
+};
+
+class PDFAscii85DecodeFilter : public PDFStreamFilter
+{
+public:
+    explicit PDFAscii85DecodeFilter() = default;
+    virtual ~PDFAscii85DecodeFilter() override = default;
+
+    virtual QByteArray apply(const QByteArray& data, const PDFDocument* document, const PDFObject& parameters) const override;
+};
+
+class PDFLzwDecodeFilter : public PDFStreamFilter
+{
+public:
+    explicit PDFLzwDecodeFilter() = default;
+    virtual ~PDFLzwDecodeFilter() override = default;
+
+    virtual QByteArray apply(const QByteArray& data, const PDFDocument* document, const PDFObject& parameters) const override;
+};
+
+class PDFFlateDecodeFilter : public PDFStreamFilter
+{
+public:
+    explicit PDFFlateDecodeFilter() = default;
+    virtual ~PDFFlateDecodeFilter() override = default;
+
+    virtual QByteArray apply(const QByteArray& data, const PDFDocument* document, const PDFObject& parameters) const override;
+};
+
+class PDFRunLengthDecodeFilter : public PDFStreamFilter
+{
+public:
+    explicit PDFRunLengthDecodeFilter() = default;
+    virtual ~PDFRunLengthDecodeFilter() override = default;
+
+    virtual QByteArray apply(const QByteArray& data, const PDFDocument* document, const PDFObject& parameters) const override;
+};
+
+}   // namespace pdf
+
+#endif // PDFSTREAMFILTERS_H