// Copyright (C) 2018-2022 Jakub Melka // // This file is part of PDF4QT. // // PDF4QT is free software: you can redistribute it and/or modify // it under the terms of the GNU Lesser General Public License as published by // the Free Software Foundation, either version 3 of the License, or // with the written consent of the copyright owner, any later version. // // PDF4QT is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public License // along with PDF4QT. If not, see . #include "pdfstreamfilters.h" #include "pdfexception.h" #include "pdfconstants.h" #include "pdfparser.h" #include "pdfsecurityhandler.h" #include "pdfutils.h" #include "pdfdbgheap.h" #include #include namespace pdf { QByteArray PDFAsciiHexDecodeFilter::apply(const QByteArray& data, const PDFObjectFetcher& objectFetcher, const PDFObject& parameters, const PDFSecurityHandler* securityHandler) const { Q_UNUSED(objectFetcher); Q_UNUSED(parameters); Q_UNUSED(securityHandler); const int indexOfEnd = data.indexOf('>'); const int size = (indexOfEnd == -1) ? data.size() : indexOfEnd; if (size % 2 == 1) { // We must add trailing zero to the buffer QByteArray temporaryData(data.constData(), size); temporaryData.push_back('0'); return QByteArray::fromHex(temporaryData); } else if (size == data.size()) { // We do this, because we do not want to allocate unnecessary buffer for this case. // This case should be common. return QByteArray::fromHex(data); } return QByteArray::fromHex(QByteArray::fromRawData(data.constData(), size)); } QByteArray PDFAscii85DecodeFilter::apply(const QByteArray& data, const PDFObjectFetcher& objectFetcher, const PDFObject& parameters, const PDFSecurityHandler* securityHandler) const { Q_UNUSED(objectFetcher); Q_UNUSED(parameters); Q_UNUSED(securityHandler); const unsigned char* dataBegin = reinterpret_cast(data.constData()); const unsigned char* dataEnd = reinterpret_cast(data.constData() + data.size()); const unsigned char* it = dataBegin; const constexpr uint32_t STREAM_END = 0xFFFFFFFF; auto getChar = [&it, dataEnd]() -> uint32_t { // Skip whitespace characters while (it != dataEnd && PDFLexicalAnalyzer::isWhitespace(*it)) { ++it; } if (it == dataEnd || (*it == '~')) { return STREAM_END; } return *it++; }; QByteArray result; result.reserve(data.size() * 4 / 5); while (true) { const uint32_t scannedChar = getChar(); if (scannedChar == STREAM_END) { break; } else if (scannedChar == 'z') { result.append(4, static_cast(0)); } else { // Scan all 5 characters, some of then can be equal to STREAM_END constant. We will // treat all these characters as last character. std::array scannedChars; scannedChars.fill(84); scannedChars[0] = scannedChar - 33; int validBytes = 0; for (auto it = std::next(scannedChars.begin()); it != scannedChars.end(); ++it) { uint32_t character = getChar(); if (character == STREAM_END) { break; } *it = character - 33; ++validBytes; } // Decode bytes using 85 base uint32_t decodedBytesPacked = 0; for (const uint32_t value : scannedChars) { decodedBytesPacked = decodedBytesPacked * 85 + value; } // Decode bytes into byte array std::array decodedBytesUnpacked; decodedBytesUnpacked.fill(0); for (auto byteIt = decodedBytesUnpacked.rbegin(); byteIt != decodedBytesUnpacked.rend(); ++byteIt) { *byteIt = static_cast(decodedBytesPacked & 0xFF); decodedBytesPacked = decodedBytesPacked >> 8; } Q_ASSERT(validBytes <= decodedBytesUnpacked.size()); for (int i = 0; i < validBytes; ++i) { result.push_back(decodedBytesUnpacked[i]); } } } return result; } class PDFLzwStreamDecoder { public: explicit PDFLzwStreamDecoder(const QByteArray& inputByteArray, uint32_t early); QByteArray decompress(); private: static constexpr const uint32_t CODE_TABLE_RESET = 256; static constexpr const uint32_t CODE_END_OF_STREAM = 257; // Maximal code size is 12 bits. so we can have 2^12 = 4096 items // in the table (some items are unused, for example 256, 257). We also // need to initialize items under code 256, because we treat them specially, // they are not initialized in the decompress. static constexpr const uint32_t TABLE_SIZE = 4096; /// Clears the input data table void clearTable(); /// Returns a newly scanned code uint32_t getCode(); struct TableItem { uint32_t previous = TABLE_SIZE; char character = 0; }; std::array m_table; std::array m_sequence; uint32_t m_nextCode; ///< Next code value (to be written into the table) uint32_t m_nextBits; ///< Number of bits of the next code uint32_t m_early; ///< Early (see PDF 1.7 Specification, this constant is 0 or 1, based on the dictionary value) uint32_t m_inputBuffer; ///< Input buffer, containing bits, which were read from the input byte array uint32_t m_inputBits; ///< Number of bits in the input buffer. std::array::iterator m_currentSequenceEnd; bool m_first; ///< Are we reading from stream for first time after the reset char m_newCharacter; ///< New character to be written int m_position; ///< Position in the input array const QByteArray& m_inputByteArray; }; PDFLzwStreamDecoder::PDFLzwStreamDecoder(const QByteArray& inputByteArray, uint32_t early) : m_table(), m_sequence(), m_nextCode(0), m_nextBits(0), m_early(early), m_inputBuffer(0), m_inputBits(0), m_currentSequenceEnd(m_sequence.begin()), m_first(false), m_newCharacter(0), m_position(0), m_inputByteArray(inputByteArray) { for (size_t i = 0; i < 256; ++i) { m_table[i].character = static_cast(i); m_table[i].previous = TABLE_SIZE; } clearTable(); } QByteArray PDFLzwStreamDecoder::decompress() { QByteArray result; // Guess output byte array size - assume compress ratio is 2:1 result.reserve(m_inputByteArray.size() * 2); uint32_t previousCode = TABLE_SIZE; while (true) { const uint32_t code = getCode(); if (code == CODE_END_OF_STREAM) { // We are at end of stream break; } else if (code == CODE_TABLE_RESET) { // Just reset the table clearTable(); continue; } // Normal operation code if (code < m_nextCode) { m_currentSequenceEnd = m_sequence.begin(); for (uint32_t currentCode = code; currentCode != TABLE_SIZE; currentCode = m_table[currentCode].previous) { *m_currentSequenceEnd++ = m_table[currentCode].character; } // We must reverse the sequence, because we stored it in the // linked list, which we traversed from last to first item. std::reverse(m_sequence.begin(), m_currentSequenceEnd); } else if (code == m_nextCode) { // We use the buffer from previous run, just add a new // character to the end. *m_currentSequenceEnd++ = m_newCharacter; } else { // Unknown code throw PDFException(PDFTranslationContext::tr("Invalid code in the LZW stream.")); } m_newCharacter = m_sequence.front(); if (m_first) { m_first = false; } else { // Add a new word in the dictionary, if we have it if (m_nextCode < TABLE_SIZE) { m_table[m_nextCode].character = m_newCharacter; m_table[m_nextCode].previous = previousCode; ++m_nextCode; } // Change bit size of the code, if it is neccessary switch (m_nextCode + m_early) { case 512: m_nextBits = 10; break; case 1024: m_nextBits = 11; break; case 2048: m_nextBits = 12; break; default: break; } } previousCode = code; // Copy the input array to the buffer std::copy(m_sequence.begin(), m_currentSequenceEnd, std::back_inserter(result)); } result.shrink_to_fit(); return result; } void PDFLzwStreamDecoder::clearTable() { // We do not clear the m_table array here. It is for performance reasons, we assume // the input is correct. We also do not clear the sequence buffer here. m_nextCode = 258; m_nextBits = 9; m_first = true; m_newCharacter = 0; } uint32_t PDFLzwStreamDecoder::getCode() { while (m_inputBits < m_nextBits) { // Did we reach end of array? if (m_position == m_inputByteArray.size()) { return CODE_END_OF_STREAM; } m_inputBuffer = (m_inputBuffer << 8) | static_cast(m_inputByteArray[m_position++]); m_inputBits += 8; } // We must omit bits from left (old ones) and right (newly scanned ones) and // read just m_nextBits bits. Mask should omit the old ones and shift (m_inputBits - m_nextBits) // should omit the new ones. const uint32_t mask = ((1 << m_nextBits) - 1); const uint32_t code = (m_inputBuffer >> (m_inputBits - m_nextBits)) & mask; m_inputBits -= m_nextBits; return code; } QByteArray PDFLzwDecodeFilter::apply(const QByteArray& data, const PDFObjectFetcher& objectFetcher, const PDFObject& parameters, const PDFSecurityHandler* securityHandler) const { Q_UNUSED(securityHandler); uint32_t early = 1; const PDFObject& dereferencedParameters = objectFetcher(parameters); if (dereferencedParameters.isDictionary()) { const PDFDictionary* dictionary = dereferencedParameters.getDictionary(); const PDFObject& earlyChangeObject = objectFetcher(dictionary->get("EarlyChange")); if (earlyChangeObject.isInt()) { early = earlyChangeObject.getInteger(); } } PDFStreamPredictor predictor = PDFStreamPredictor::createPredictor(objectFetcher, parameters); PDFLzwStreamDecoder decoder(data, early); return predictor.apply(decoder.decompress()); } QByteArray PDFFlateDecodeFilter::apply(const QByteArray& data, const PDFObjectFetcher& objectFetcher, const PDFObject& parameters, const PDFSecurityHandler* securityHandler) const { Q_UNUSED(securityHandler); PDFStreamPredictor predictor = PDFStreamPredictor::createPredictor(objectFetcher, parameters); return predictor.apply(uncompress(data)); } QByteArray PDFFlateDecodeFilter::compress(const QByteArray& decompressedData) { QByteArray result; z_stream stream = { }; stream.next_in = const_cast(convertByteArrayToUcharPtr(decompressedData)); stream.avail_in = decompressedData.size(); std::array outputBuffer = { }; int error = deflateInit(&stream, Z_BEST_COMPRESSION); if (error != Z_OK) { throw PDFException(PDFTranslationContext::tr("Failed to initialize flate compression stream.")); } do { stream.next_out = outputBuffer.data(); stream.avail_out = static_cast(outputBuffer.size()); error = deflate(&stream, Z_FINISH); int bytesWritten = int(outputBuffer.size()) - stream.avail_out; result.append(reinterpret_cast(outputBuffer.data()), bytesWritten); } while (error == Z_OK); QString errorMessage; if (stream.msg) { errorMessage = QString::fromLatin1(stream.msg); } deflateEnd(&stream); switch (error) { case Z_STREAM_END: break; // No error, normal behaviour default: { if (errorMessage.isEmpty()) { errorMessage = PDFTranslationContext::tr("zlib code: %1").arg(error); } throw PDFException(PDFTranslationContext::tr("Error compressing by flate method: %1").arg(errorMessage)); } } return result; } QByteArray PDFFlateDecodeFilter::recompress(const QByteArray& data) { QByteArray decompressedData = uncompress(data); return compress(decompressedData); } PDFInteger PDFFlateDecodeFilter::getStreamDataLength(const QByteArray& data, PDFInteger offset) const { if (offset < 0 || offset >= data.size()) { return -1; } z_stream stream = { }; stream.next_in = const_cast(convertByteArrayToUcharPtr(data) + offset); stream.avail_in = data.size() - offset; std::array outputBuffer = { }; int error = inflateInit(&stream); if (error != Z_OK) { return -1; } do { stream.next_out = outputBuffer.data(); stream.avail_out = static_cast(outputBuffer.size()); error = inflate(&stream, Z_NO_FLUSH); } while (error == Z_OK); PDFInteger dataLength = stream.total_in; inflateEnd(&stream); if (error == Z_STREAM_END) { return dataLength; } return -1; } QByteArray PDFFlateDecodeFilter::uncompress(const QByteArray& data) { QByteArray result; z_stream stream = { }; stream.next_in = const_cast(convertByteArrayToUcharPtr(data)); stream.avail_in = data.size(); std::array outputBuffer = { }; int error = inflateInit(&stream); if (error != Z_OK) { throw PDFException(PDFTranslationContext::tr("Failed to initialize flate decompression stream.")); } do { stream.next_out = outputBuffer.data(); stream.avail_out = static_cast(outputBuffer.size()); error = inflate(&stream, Z_NO_FLUSH); int bytesWritten = int(outputBuffer.size()) - stream.avail_out; result.append(reinterpret_cast(outputBuffer.data()), bytesWritten); } while (error == Z_OK); QString errorMessage; if (stream.msg) { errorMessage = QString::fromLatin1(stream.msg); } inflateEnd(&stream); switch (error) { case Z_STREAM_END: break; // No error, normal behaviour default: { if (errorMessage.isEmpty()) { errorMessage = PDFTranslationContext::tr("zlib code: %1").arg(error); } throw PDFException(PDFTranslationContext::tr("Error decompressing by flate method: %1").arg(errorMessage)); } } return result; } QByteArray PDFRunLengthDecodeFilter::apply(const QByteArray& data, const PDFObjectFetcher& objectFetcher, const PDFObject& parameters, const PDFSecurityHandler* securityHandler) const { Q_UNUSED(objectFetcher); Q_UNUSED(parameters); Q_UNUSED(securityHandler); QByteArray result; result.reserve(data.size() * 2); auto itEnd = data.cend(); for (auto it = data.cbegin(); it != itEnd;) { const unsigned char current = *it++; if (current == 128) { // End of stream marker break; } else if (current < 128) { // Copy n + 1 characters from the input array literally (and advance iterators) const int count = static_cast(current) + 1; std::copy(it, std::next(it, count), std::back_inserter(result)); std::advance(it, count); } else if (current > 128) { // Copy 257 - n copies of single character const int count = 257 - current; const char toBeCopied = *it++; std::fill_n(std::back_inserter(result), count, toBeCopied); } } return result; } const PDFStreamFilter* PDFStreamFilterStorage::getFilter(const QByteArray& filterName) { const PDFStreamFilterStorage* instance = getInstance(); auto it = instance->m_filters.find(filterName); if (it != instance->m_filters.cend()) { return it->second.get(); } auto itNameDecoded = instance->m_abbreviations.find(filterName); if (itNameDecoded != instance->m_abbreviations.cend()) { return getFilter(itNameDecoded->second); } return nullptr; } PDFStreamFilterStorage::StreamFilters PDFStreamFilterStorage::getStreamFilters(const PDFStream* stream, const PDFObjectFetcher& objectFetcher) { StreamFilters result; const PDFDictionary* dictionary = stream->getDictionary(); // Retrieve filters PDFObject filters; if (dictionary->hasKey(PDF_STREAM_DICT_FILTER)) { filters = objectFetcher(dictionary->get(PDF_STREAM_DICT_FILTER)); } else if (dictionary->hasKey(PDF_STREAM_DICT_FILE_FILTER)) { filters = objectFetcher(dictionary->get(PDF_STREAM_DICT_FILE_FILTER)); } // Retrieve filter parameters PDFObject filterParameters; if (dictionary->hasKey(PDF_STREAM_DICT_DECODE_PARMS)) { filterParameters = objectFetcher(dictionary->get(PDF_STREAM_DICT_DECODE_PARMS)); } else if (dictionary->hasKey(PDF_STREAM_DICT_FDECODE_PARMS)) { filterParameters = objectFetcher(dictionary->get(PDF_STREAM_DICT_FDECODE_PARMS)); } if (filters.isName()) { result.filterObjects.push_back(PDFStreamFilterStorage::getFilter(filters.getString())); } else if (filters.isArray()) { const PDFArray* filterArray = filters.getArray(); const size_t filterCount = filterArray->getCount(); for (size_t i = 0; i < filterCount; ++i) { const PDFObject& object = objectFetcher(filterArray->getItem(i)); if (object.isName()) { result.filterObjects.push_back(PDFStreamFilterStorage::getFilter(object.getString())); } else { result.valid = false; return result; } } } else if (!filters.isNull()) { result.valid = false; return result; } if (filterParameters.isArray()) { const PDFArray* filterParameterArray = filterParameters.getArray(); const size_t filterParameterCount = filterParameterArray->getCount(); for (size_t i = 0; i < filterParameterCount; ++i) { const PDFObject& object = objectFetcher(filterParameterArray->getItem(i)); result.filterParameterObjects.push_back(object); } } else { result.filterParameterObjects.push_back(filterParameters); } result.filterParameterObjects.resize(result.filterObjects.size()); std::reverse(result.filterObjects.begin(), result.filterObjects.end()); std::reverse(result.filterParameterObjects.begin(), result.filterParameterObjects.end()); return result; } QByteArray PDFStreamFilterStorage::getDecodedStream(const PDFStream* stream, const PDFObjectFetcher& objectFetcher, const PDFSecurityHandler* securityHandler) { StreamFilters streamFilters = getStreamFilters(stream, objectFetcher); QByteArray result = *stream->getContent(); if (!streamFilters.valid) { // Stream filters are invalid return QByteArray(); } for (size_t i = 0, count = streamFilters.filterObjects.size(); i < count; ++i) { const PDFStreamFilter* streamFilter = streamFilters.filterObjects[i]; const PDFObject& streamFilterParameters = streamFilters.filterParameterObjects[i]; if (streamFilter) { result = streamFilter->apply(result, objectFetcher, streamFilterParameters, securityHandler); } } return result; } QByteArray PDFStreamFilterStorage::getDecodedStream(const PDFStream* stream, const PDFSecurityHandler* securityHandler) { return getDecodedStream(stream, [](const PDFObject& object) -> const PDFObject& { return object; }, securityHandler); } PDFInteger PDFStreamFilterStorage::getStreamDataLength(const QByteArray& data, const QByteArray& filterName, PDFInteger offset) { if (const PDFStreamFilter* filter = getFilter(filterName)) { return filter->getStreamDataLength(data, offset); } return -1; } PDFStreamFilterStorage::PDFStreamFilterStorage() { // Initialize map with the filters m_filters["ASCIIHexDecode"] = std::make_unique(); m_filters["ASCII85Decode"] = std::make_unique(); m_filters["LZWDecode"] = std::make_unique(); m_filters["FlateDecode"] = std::make_unique(); m_filters["RunLengthDecode"] = std::make_unique(); m_filters["Crypt"] = std::make_unique(); m_abbreviations["AHx"] = "ASCIIHexDecode"; m_abbreviations["A85"] = "ASCII85Decode"; m_abbreviations["LZW"] = "LZWDecode"; m_abbreviations["Fl"] = "FlateDecode"; m_abbreviations["RL"] = "RunLengthDecode"; m_abbreviations["CCF"] = "CCITFaxDecode"; m_abbreviations["DCT"] = "DCTDecode"; } const PDFStreamFilterStorage* PDFStreamFilterStorage::getInstance() { static PDFStreamFilterStorage instance; return &instance; } PDFStreamPredictor PDFStreamPredictor::createPredictor(const PDFObjectFetcher& objectFetcher, const PDFObject& parameters) { const PDFObject& dereferencedParameters = objectFetcher(parameters); if (dereferencedParameters.isDictionary()) { const PDFDictionary* dictionary = dereferencedParameters.getDictionary(); auto getInteger = [dictionary, &objectFetcher](const char* key, int min, int max, int defaultValue) -> int { const PDFObject& object = objectFetcher(dictionary->get(key)); if (object.isInt()) { PDFInteger value = object.getInteger(); if (value < min || value > max) { throw PDFException(PDFTranslationContext::tr("Property '%1' should be in range from %2 to %3.").arg(QString::fromLatin1(key)).arg(min).arg(max)); } return value; } else if (object.isNull()) { return defaultValue; } throw PDFException(PDFTranslationContext::tr("Invalid property '%1' of the stream predictor parameters.").arg(QString::fromLatin1(key))); return 0; }; int predictor = getInteger("Predictor", 1, 15, 1); int components = getInteger("Colors", 1, PDF_MAX_COLOR_COMPONENTS, 1); int bitsPerComponent = getInteger("BitsPerComponent", 1, 16, 8); int columns = getInteger("Columns", 1, std::numeric_limits::max(), 1); return PDFStreamPredictor(static_cast(predictor), components, bitsPerComponent, columns); } return PDFStreamPredictor(); } QByteArray PDFStreamPredictor::apply(const QByteArray& data) const { switch (m_predictor) { case NoPredictor: return data; case TIFF: return applyTIFFPredictor(data); default: { if (m_predictor >= 10) { return applyPNGPredictor(data); } break; } } throw PDFException(PDFTranslationContext::tr("Invalid predictor algorithm.")); return QByteArray(); } QByteArray PDFStreamPredictor::applyPNGPredictor(const QByteArray& data) const { QByteArray outputData; outputData.reserve(data.size()); auto it = data.cbegin(); auto itEnd = data.cend(); int pixelBytes = (m_components * m_bitsPerComponent + 7) / 8; auto readByte = [&it, &itEnd]() -> uint8_t { if (it != itEnd) { return static_cast(*it++); } // According to the PDF specification, incomplete line is completed. For this // reason, we behave as we have zero data in the buffer. return 0; }; // Idea: to avoid using if for many cases, we use larger buffer filled with zeros const int totalBytes = m_stride + pixelBytes; std::vector line(totalBytes, 0); std::vector lineOld(totalBytes, 0); Predictor currentPredictor = m_predictor; while (it != itEnd) { // First, read the predictor data for current line currentPredictor = static_cast(readByte() + 10); for (int i = 0; i < m_stride; ++i) { uint8_t currentByte = readByte(); int lineIndex = i + pixelBytes; switch (currentPredictor) { case PNG_Sub: { line[lineIndex] = line[i] + currentByte; break; } case PNG_Up: { line[lineIndex] = lineOld[lineIndex] + currentByte; break; } case PNG_Average: { line[lineIndex] = (lineOld[lineIndex] + line[i]) / 2 + currentByte; break; } case PNG_Paeth: { // a = left, // b = upper, // c = upper left const int a = line[i]; const int b = lineOld[lineIndex]; const int c = lineOld[i]; const int p = a + b - c; const int pa = std::abs(p - a); const int pb = std::abs(p - b); const int pc = std::abs(p - c); if (pa <= pb && pa <= pc) { line[lineIndex] = a + currentByte; } else if (pb <= pc) { line[lineIndex] = b + currentByte; } else { line[lineIndex] = c + currentByte; } break; } case PNG_None: default: { line[lineIndex] = currentByte; break; } } // Fill the output buffer outputData.push_back(static_cast(line[lineIndex])); } // Swap the buffers std::swap(line, lineOld); } return outputData; } QByteArray PDFStreamPredictor::applyTIFFPredictor(const QByteArray& data) const { Q_UNUSED(data); PDFBitWriter writer(m_bitsPerComponent); PDFBitReader reader(&data, m_bitsPerComponent); writer.reserve(data.size()); std::vector leftValues(m_components, 0); while (!reader.isAtEnd()) { for (int i = 0; i < m_columns; ++i) { for (int componentIndex = 0; componentIndex < m_components; ++componentIndex) { leftValues[componentIndex] = (leftValues[componentIndex] + reader.read()) & reader.max(); writer.write(leftValues[componentIndex]); } } std::fill(leftValues.begin(), leftValues.end(), 0); reader.alignToBytes(); writer.finishLine(); } return writer.takeByteArray(); } QByteArray PDFCryptFilter::apply(const QByteArray& data, const PDFObjectFetcher& objectFetcher, const PDFObject& parameters, const PDFSecurityHandler* securityHandler) const { if (!securityHandler) { throw PDFException(PDFTranslationContext::tr("Security handler required, but not provided.")); } PDFObjectReference objectReference; QByteArray cryptFilterName = PDFSecurityHandler::IDENTITY_FILTER_NAME; const PDFObject& dereferencedParameters = objectFetcher(parameters); if (dereferencedParameters.isDictionary()) { const PDFDictionary* dictionary = dereferencedParameters.getDictionary(); const PDFObject& cryptFilterNameObject = objectFetcher(dictionary->get("Name")); if (cryptFilterNameObject.isName()) { cryptFilterName = cryptFilterNameObject.getString(); } const PDFObject& objectReferenceObject = dictionary->get(PDFSecurityHandler::OBJECT_REFERENCE_DICTIONARY_NAME); if (objectReferenceObject.isReference()) { objectReference = objectReferenceObject.getReference(); } } return securityHandler->decryptByFilter(data, cryptFilterName, objectReference); } PDFInteger PDFStreamFilter::getStreamDataLength(const QByteArray& data, PDFInteger offset) const { Q_UNUSED(data); Q_UNUSED(offset); return -1; } } // namespace pdf