Handling object reference streams

This commit is contained in:
Jakub Melka 2019-07-01 12:29:57 +02:00
parent 8c130ca013
commit 4d782af971
7 changed files with 383 additions and 20 deletions

View File

@ -1,4 +1,4 @@
// Copyright (C) 2018 Jakub Melka
// Copyright (C) 2018-2019 Jakub Melka
//
// This file is part of PdfForQt.
//
@ -55,6 +55,9 @@ static constexpr const char* PDF_XREF_OCCUPIED = "n";
static constexpr const char* PDF_OBJECT_START_MARK = "obj";
static constexpr const char* PDF_OBJECT_END_MARK = "endobj";
// Colors
static constexpr const int PDF_MAX_COLOR_COMPONENTS = 32;
} // namespace pdf
#endif // PDFCONSTANTS_H

View File

@ -21,6 +21,7 @@
#include "pdfxreftable.h"
#include "pdfexception.h"
#include "pdfparser.h"
#include "pdfstreamfilters.h"
#include <QFile>
@ -239,6 +240,7 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
std::vector<PDFXRefTable::Entry> occupiedEntries = xrefTable.getOccupiedEntries();
// First, process regular objects
auto processEntry = [this, &getObject, &objectFetcher, &objects](const PDFXRefTable::Entry& entry)
{
Q_ASSERT(entry.type == PDFXRefTable::EntryType::Occupied);
@ -248,7 +250,10 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
try
{
PDFParsingContext context(objectFetcher);
objects[entry.reference.objectNumber] = PDFObjectStorage::Entry(entry.reference.generation, getObject(&context, entry.offset, entry.reference));
PDFObject object = getObject(&context, entry.offset, entry.reference);
QMutexLocker lock(&m_mutex);
objects[entry.reference.objectNumber] = PDFObjectStorage::Entry(entry.reference.generation, object);
}
catch (PDFParserException exception)
{
@ -262,6 +267,108 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
// Now, we are ready to scan all objects
std::for_each(std::execution::parallel_policy(), occupiedEntries.cbegin(), occupiedEntries.cend(), processEntry);
// Then process object streams
std::vector<PDFXRefTable::Entry> objectStreamEntries = xrefTable.getObjectStreamEntries();
std::set<PDFObjectReference> objectStreams;
for (const PDFXRefTable::Entry& entry : objectStreamEntries)
{
Q_ASSERT(entry.type == PDFXRefTable::EntryType::InObjectStream);
objectStreams.insert(entry.objectStream);
}
// Decodes a single object stream (a stream whose dictionary has /Type /ObjStm) and
// stores every object contained in it into 'objects'. The lambda is invoked in parallel
// for different object streams, so all writes to shared state (objects, m_successfull,
// m_errorMessage) are done under m_mutex. On a parsing error the reader is flagged as
// failed and the remaining object streams are skipped.
auto processObjectStream = [this, &objectFetcher, &objects, &objectStreamEntries] (const PDFObjectReference& objectStreamReference)
{
    {
        // Other workers write m_successfull under m_mutex, so it must be read under
        // the same lock; an unsynchronized read would be a data race.
        QMutexLocker lock(&m_mutex);
        if (!m_successfull)
        {
            return;
        }
    }

    try
    {
        // All structural problems of the object stream are reported with the same message
        const auto invalidObjectStream = [&objectStreamReference]()
        {
            return PDFParserException(PDFTranslationContext::tr("Object stream %1 is invalid.").arg(objectStreamReference.objectNumber));
        };

        PDFParsingContext context(objectFetcher);
        if (objectStreamReference.objectNumber < 0 || objectStreamReference.objectNumber >= static_cast<PDFInteger>(objects.size()))
        {
            throw PDFParserException(PDFTranslationContext::tr("Object stream %1 not found.").arg(objectStreamReference.objectNumber));
        }

        const PDFObject& object = objects[objectStreamReference.objectNumber].object;
        if (!object.isStream())
        {
            throw invalidObjectStream();
        }

        const PDFStream* objectStream = object.getStream();
        const PDFDictionary* objectStreamDictionary = objectStream->getDictionary();

        const PDFObject& objectStreamType = objectStreamDictionary->get("Type");
        if (!objectStreamType.isName() || objectStreamType.getString() != "ObjStm")
        {
            throw invalidObjectStream();
        }

        const PDFObject& nObject = objectStreamDictionary->get("N");
        const PDFObject& firstObject = objectStreamDictionary->get("First");
        if (!nObject.isInt() || !firstObject.isInt())
        {
            throw invalidObjectStream();
        }

        // Number of objects in the object stream and offset of the first object
        const PDFInteger n = nObject.getInteger();
        const PDFInteger first = firstObject.getInteger();

        QByteArray objectStreamData = PDFStreamFilterStorage::getDecodedStream(objectStream);

        // Both values come from the (untrusted) file. A negative or absurdly large count
        // would make reserve() below throw a standard exception, which is not handled here
        // and would terminate the parallel algorithm. Each header item occupies at least
        // one byte, so a count above the data size can never be satisfied anyway.
        if (n < 0 || n > static_cast<PDFInteger>(objectStreamData.size()) || first < 0)
        {
            throw invalidObjectStream();
        }

        PDFParsingContext::PDFParsingContextGuard guard(&context, objectStreamReference);
        PDFParser parser(objectStreamData, &context, PDFParser::AllowStreams);

        // The stream starts with n pairs (object number, offset relative to 'first')
        std::vector<std::pair<PDFInteger, PDFInteger>> objectNumberAndOffset;
        objectNumberAndOffset.reserve(n);
        for (PDFInteger i = 0; i < n; ++i)
        {
            PDFObject currentObjectNumber = parser.getObject();
            PDFObject currentOffset = parser.getObject();
            if (!currentObjectNumber.isInt() || !currentOffset.isInt())
            {
                throw invalidObjectStream();
            }

            const PDFInteger objectNumber = currentObjectNumber.getInteger();
            const PDFInteger offset = currentOffset.getInteger() + first;
            objectNumberAndOffset.emplace_back(objectNumber, offset);
        }

        for (const std::pair<PDFInteger, PDFInteger>& item : objectNumberAndOffset)
        {
            const PDFInteger objectNumber = item.first;
            const PDFInteger offset = item.second;

            parser.seek(offset);
            PDFObject parsedObject = parser.getObject();

            // Accept the object only if the cross-reference table says that this object
            // number really lives in this object stream.
            auto predicate = [objectNumber, objectStreamReference](const PDFXRefTable::Entry& entry) -> bool
            {
                return entry.reference.objectNumber == objectNumber && entry.objectStream == objectStreamReference;
            };
            if (std::find_if(objectStreamEntries.cbegin(), objectStreamEntries.cend(), predicate) == objectStreamEntries.cend())
            {
                throw invalidObjectStream();
            }

            QMutexLocker lock(&m_mutex);
            objects[objectNumber].object = qMove(parsedObject);
        }
    }
    catch (PDFParserException& exception)
    {
        // Caught by reference to avoid copying the exception object
        QMutexLocker lock(&m_mutex);
        m_successfull = false;
        m_errorMessage = exception.getMessage();
    }
};
// Now, we are ready to scan all object streams
std::for_each(std::execution::parallel_policy(), objectStreams.cbegin(), objectStreams.cend(), processObjectStream);
PDFObjectStorage storage(std::move(objects), PDFObject(xrefTable.getTrailerDictionary()));
return PDFDocument(std::move(storage));
}

View File

@ -894,7 +894,7 @@ PDFObject PDFParser::getObject(PDFObjectReference reference)
/// Reports a parsing error by throwing PDFParserException carrying \p message.
/// The exception is thrown by value: throwing a pointer created with 'new' would
/// leak the object and would not be caught by handlers of type PDFParserException.
/// \param message Description of the error
void PDFParser::error(const QString& message) const
{
    throw PDFParserException(message);
}
void PDFParser::seek(PDFInteger offset)

View File

@ -354,16 +354,11 @@ QByteArray PDFLzwDecodeFilter::apply(const QByteArray& data, const PDFObjectFetc
{
early = earlyChangeObject.getInteger();
}
if (predictor != 1)
{
// TODO: Implement Predictor algorithm
return QByteArray();
}
}
PDFStreamPredictor predictor = PDFStreamPredictor::createPredictor(objectFetcher, parameters);
PDFLzwStreamDecoder decoder(data, early);
return decoder.decompress();
return predictor.apply(decoder.decompress());
}
QByteArray PDFFlateDecodeFilter::apply(const QByteArray& data, const PDFObjectFetcher& objectFetcher, const PDFObject& parameters) const
@ -379,12 +374,6 @@ QByteArray PDFFlateDecodeFilter::apply(const QByteArray& data, const PDFObjectFe
{
predictor = predictorObject.getInteger();
}
if (predictor != 1)
{
// TODO: Implement Predictor algorithm
return QByteArray();
}
}
uint32_t size = data.size();
@ -395,7 +384,8 @@ QByteArray PDFFlateDecodeFilter::apply(const QByteArray& data, const PDFObjectFe
qToBigEndian(size, dataToUncompress.data());
std::copy(data.cbegin(), data.cend(), std::next(dataToUncompress.begin(), sizeof(decltype(size))));
return qUncompress(dataToUncompress);
PDFStreamPredictor predictor = PDFStreamPredictor::createPredictor(objectFetcher, parameters);
return predictor.apply(qUncompress(dataToUncompress));
}
QByteArray PDFRunLengthDecodeFilter::apply(const QByteArray& data, const PDFObjectFetcher& objectFetcher, const PDFObject& parameters) const
@ -571,4 +561,182 @@ const PDFStreamFilterStorage* PDFStreamFilterStorage::getInstance()
return &instance;
}
/// Creates a predictor from the decode parameters of a stream filter. If the parameters
/// (after dereferencing) are not a dictionary, a predictor which leaves the data
/// untouched is returned.
/// \param objectFetcher Function used to dereference the parameters and their items
/// \param parameters Decode parameters of the filter
/// \throws PDFParserException if a property has wrong type or is out of its range
PDFStreamPredictor PDFStreamPredictor::createPredictor(const PDFObjectFetcher& objectFetcher, const PDFObject& parameters)
{
    const PDFObject& dereferencedParameters = objectFetcher(parameters);
    if (!dereferencedParameters.isDictionary())
    {
        return PDFStreamPredictor();
    }

    const PDFDictionary* dictionary = dereferencedParameters.getDictionary();

    // Reads an optional integer property; a missing (null) property yields the default value
    auto getInteger = [dictionary, &objectFetcher](const char* key, int min, int max, int defaultValue) -> int
    {
        const PDFObject& object = objectFetcher(dictionary->get(key));
        if (object.isNull())
        {
            return defaultValue;
        }
        if (!object.isInt())
        {
            throw PDFParserException(PDFTranslationContext::tr("Invalid property '%1' of the stream predictor parameters.").arg(QString::fromLatin1(key)));
        }

        const PDFInteger value = object.getInteger();
        if (value < min || value > max)
        {
            throw PDFParserException(PDFTranslationContext::tr("Property '%1' should be in range from %2 to %3.").arg(QString::fromLatin1(key)).arg(min).arg(max));
        }

        // The range check above guarantees that the value fits into int
        return static_cast<int>(value);
    };

    const int predictor = getInteger("Predictor", 1, 15, 1);
    const int components = getInteger("Colors", 1, PDF_MAX_COLOR_COMPONENTS, 1);
    const int bitsPerComponent = getInteger("BitsPerComponent", 1, 16, 8);

    // The predictor computes its row size as (columns * components * bitsPerComponent + 7) / 8
    // using int arithmetic. Limit the number of columns so this expression cannot overflow
    // for values read from a malformed or malicious document.
    const int maxColumns = (std::numeric_limits<int>::max() - 7) / (components * bitsPerComponent);
    const int columns = getInteger("Columns", 1, maxColumns, 1);

    return PDFStreamPredictor(static_cast<Predictor>(predictor), components, bitsPerComponent, columns);
}
/// Reverses the prediction on already decompressed data, using the algorithm
/// selected when the predictor was created.
/// \param data Decompressed data which still contain the prediction
/// \throws PDFParserException if the predictor algorithm is not recognized
QByteArray PDFStreamPredictor::apply(const QByteArray& data) const
{
    // No prediction was used, data are already final
    if (m_predictor == NoPredictor)
    {
        return data;
    }

    if (m_predictor == TIFF)
    {
        return applyTIFFPredictor(data);
    }

    // All values from 10 upwards denote the PNG family of predictors
    if (m_predictor >= 10)
    {
        return applyPNGPredictor(data);
    }

    throw PDFParserException(PDFTranslationContext::tr("Invalid predictor algorithm."));
}
/// Reverses PNG prediction. The input consists of rows; each row starts with one tag
/// byte selecting the PNG filter of that row, followed by m_stride bytes of row data.
/// The tag bytes are removed and the reconstructed rows are concatenated into the result.
/// \param data Decompressed data with PNG prediction
QByteArray PDFStreamPredictor::applyPNGPredictor(const QByteArray& data) const
{
    QByteArray outputData;
    outputData.reserve(data.size());

    auto it = data.cbegin();
    auto itEnd = data.cend();

    // Number of bytes of one pixel, rounded up; this is the distance to the "left" byte
    const int pixelBytes = (m_components * m_bitsPerComponent + 7) / 8;

    auto readByte = [&it, &itEnd]() -> uint8_t
    {
        if (it != itEnd)
        {
            return static_cast<uint8_t>(*it++);
        }

        // According to the PDF specification, incomplete line is completed. For this
        // reason, we behave as we have zero data in the buffer.
        return 0;
    };

    // Idea: to avoid special handling of the first pixel of a row, both row buffers are
    // prefixed with pixelBytes zero bytes, which are never overwritten. The byte at
    // index i is then the "left" neighbour of the byte at index i + pixelBytes.
    const int totalBytes = m_stride + pixelBytes;
    std::vector<uint8_t> line(totalBytes, 0);
    std::vector<uint8_t> lineOld(totalBytes, 0);

    while (it != itEnd)
    {
        // First, read the filter tag of the current row. It is kept as a plain integer:
        // the tag is an arbitrary byte from the file and converting an out-of-range value
        // to the Predictor enumeration would be undefined behaviour. Unknown tags are
        // handled as "no prediction" by the default label below.
        const int rowPredictor = static_cast<int>(readByte()) + 10;

        for (int i = 0; i < m_stride; ++i)
        {
            const uint8_t currentByte = readByte();
            const int lineIndex = i + pixelBytes;

            switch (rowPredictor)
            {
                case PNG_Sub:
                {
                    line[lineIndex] = static_cast<uint8_t>(line[i] + currentByte);
                    break;
                }

                case PNG_Up:
                {
                    line[lineIndex] = static_cast<uint8_t>(lineOld[lineIndex] + currentByte);
                    break;
                }

                case PNG_Average:
                {
                    line[lineIndex] = static_cast<uint8_t>((lineOld[lineIndex] + line[i]) / 2 + currentByte);
                    break;
                }

                case PNG_Paeth:
                {
                    // a = left,
                    // b = upper,
                    // c = upper left
                    const int a = line[i];
                    const int b = lineOld[lineIndex];
                    const int c = lineOld[i];

                    const int p = a + b - c;
                    const int pa = std::abs(p - a);
                    const int pb = std::abs(p - b);
                    const int pc = std::abs(p - c);

                    // Use the neighbour which is closest to the estimate p
                    int nearest = c;
                    if (pa <= pb && pa <= pc)
                    {
                        nearest = a;
                    }
                    else if (pb <= pc)
                    {
                        nearest = b;
                    }

                    line[lineIndex] = static_cast<uint8_t>(nearest + currentByte);
                    break;
                }

                case PNG_None:
                default:
                {
                    line[lineIndex] = currentByte;
                    break;
                }
            }

            // Fill the output buffer
            outputData.push_back(static_cast<char>(line[lineIndex]));
        }

        // The current row becomes the "upper" row of the next one
        std::swap(line, lineOld);
    }

    return outputData;
}
/// Placeholder for the TIFF predictor, which is not implemented yet. The input is
/// ignored and an exception is always thrown.
/// \param data Decompressed data with TIFF prediction (currently unused)
/// \throws PDFParserException always
QByteArray PDFStreamPredictor::applyTIFFPredictor(const QByteArray& data) const
{
    // TODO: Implement TIFF algorithm filter
    Q_UNUSED(data);

    throw PDFParserException(PDFTranslationContext::tr("Invalid predictor algorithm."));
}
} // namespace pdf

View File

@ -64,6 +64,56 @@ private:
std::map<QByteArray, QByteArray> m_abbreviations;
};
/// Reverses the prediction step which may follow the decompression in the LZW and
/// Flate stream filters.
class PDFStreamPredictor
{
public:
    /// Creates a predictor from the stream filter parameters. An exception is thrown
    /// if the parameters are invalid.
    /// \param objectFetcher Function which retrieves objects (for example, reads objects from reference)
    /// \param parameters Parameters of the predictor (expected to be a dictionary)
    static PDFStreamPredictor createPredictor(const PDFObjectFetcher& objectFetcher, const PDFObject& parameters);

    /// Reverses the prediction on the given data.
    /// \param data Data to be decoded using the predictor
    QByteArray apply(const QByteArray& data) const;

private:
    /// Prediction algorithm; numeric values are those of the 'Predictor' stream parameter
    enum Predictor
    {
        NoPredictor = 1,    ///< Data are stored without prediction
        TIFF = 2,           ///< TIFF prediction
        PNG_None = 10,      ///< PNG, no prediction
        PNG_Sub = 11,       ///< PNG, prediction based on the byte to the left
        PNG_Up = 12,        ///< PNG, prediction based on the byte above
        PNG_Average = 13,   ///< PNG, prediction based on the average of the left byte and the byte above
        PNG_Paeth = 14,     ///< PNG, nonlinear (Paeth) function of left, upper and upper left byte
    };

    /// Constructs a predictor which leaves the data unchanged
    explicit PDFStreamPredictor() = default;

    /// Constructs a predictor with given algorithm and row layout
    /// \param predictor Prediction algorithm
    /// \param components Number of color components per sample
    /// \param bitsPerComponent Number of bits of one color component
    /// \param columns Number of samples in a row
    explicit PDFStreamPredictor(Predictor predictor, int components, int bitsPerComponent, int columns) :
        m_predictor(predictor),
        m_components(components),
        m_bitsPerComponent(bitsPerComponent),
        m_columns(columns),
        m_stride((columns * components * bitsPerComponent + 7) / 8)
    {

    }

    /// Applies PNG predictor
    QByteArray applyPNGPredictor(const QByteArray& data) const;

    /// Applies TIFF predictor
    QByteArray applyTIFFPredictor(const QByteArray& data) const;

    Predictor m_predictor = NoPredictor;    ///< Selected algorithm
    int m_components = 0;                   ///< Color components per sample
    int m_bitsPerComponent = 0;             ///< Bits per color component
    int m_columns = 0;                      ///< Samples per row
    int m_stride = 0;                       ///< Bytes per row (row bits rounded up to whole bytes)
};
class PDFFORQTLIBSHARED_EXPORT PDFStreamFilter
{
public:

View File

@ -44,7 +44,8 @@ void PDFXRefTable::readXRefTable(PDFParsingContext* context, const QByteArray& b
// Check, if we have cyclical references between tables
if (processedOffsets.count(currentOffset))
{
throw PDFParserException(tr("Cyclic reference found in reference table."));
// If cyclical reference occurs, do not report error, just ignore it.
continue;
}
else
{
@ -300,6 +301,7 @@ void PDFXRefTable::readXRefTable(PDFParsingContext* context, const QByteArray& b
case 0:
// Free object
break;
case 1:
{
Entry entry;
@ -313,7 +315,23 @@ void PDFXRefTable::readXRefTable(PDFParsingContext* context, const QByteArray& b
}
break;
}
case 2:
{
Entry entry;
entry.reference = PDFObjectReference(objectNumber, 0);
entry.objectStream = PDFObjectReference(itemObjectNumberOfObjectStreamOrByteOffset, 0);
entry.indexInObjectStream = itemGenerationNumberOrObjectIndex;
entry.type = EntryType::InObjectStream;
if (m_entries[objectNumber].type == EntryType::Free)
{
m_entries[objectNumber] = std::move(entry);
}
break;
}
default:
// According to the specification, treat this object as null object
break;
@ -341,6 +359,17 @@ std::vector<PDFXRefTable::Entry> PDFXRefTable::getOccupiedEntries() const
return result;
}
std::vector<PDFXRefTable::Entry> PDFXRefTable::getObjectStreamEntries() const
{
std::vector<PDFXRefTable::Entry> result;
// Suppose majority of items are occupied
result.reserve(m_entries.size());
std::copy_if(m_entries.cbegin(), m_entries.cend(), std::back_inserter(result), [](const Entry& entry) { return entry.type == EntryType::InObjectStream; });
return result;
}
const PDFXRefTable::Entry& PDFXRefTable::getEntry(PDFObjectReference reference) const
{
// We must also check generation number here. For this reason, we compare references of the entry at given position.

View File

@ -49,14 +49,17 @@ public:
/// Kind of an entry of the cross-reference table
enum class EntryType
{
    Free,           ///< Entry represents a free item (no object)
    Occupied,       ///< Entry represents a occupied item (object)
    InObjectStream  ///< Entry represents an object stored inside an object stream
};
/// Single entry of the cross-reference table. Which members carry meaningful
/// data depends on the type of the entry.
struct Entry
{
/// Reference (object number and generation) of the object described by this entry
PDFObjectReference reference;
/// Reference of the object stream containing the object; set for entries of type InObjectStream
PDFObjectReference objectStream;
/// Offset at which the object is read by the document reader (used for occupied entries); -1 if not set
PDFInteger offset = -1;
/// Index of the object within its object stream (entries of type InObjectStream); -1 if not set
PDFInteger indexInObjectStream = -1;
/// Type of the entry; a default constructed entry is free
EntryType type = EntryType::Free;
};
@ -70,6 +73,9 @@ public:
/// Filters only occupied entries and returns them
std::vector<Entry> getOccupiedEntries() const;
/// Filters only object stream entries and returns them
std::vector<Entry> getObjectStreamEntries() const;
/// Returns size of the reference table
std::size_t getSize() const { return m_entries.size(); }