mirror of https://github.com/JakubMelka/PDF4QT.git
Handling object reference streams
This commit is contained in:
parent
8c130ca013
commit
4d782af971
|
@ -1,4 +1,4 @@
|
|||
// Copyright (C) 2018 Jakub Melka
|
||||
// Copyright (C) 2018-2019 Jakub Melka
|
||||
//
|
||||
// This file is part of PdfForQt.
|
||||
//
|
||||
|
@ -55,6 +55,9 @@ static constexpr const char* PDF_XREF_OCCUPIED = "n";
|
|||
static constexpr const char* PDF_OBJECT_START_MARK = "obj";
|
||||
static constexpr const char* PDF_OBJECT_END_MARK = "endobj";
|
||||
|
||||
// Colors
|
||||
static constexpr const int PDF_MAX_COLOR_COMPONENTS = 32;
|
||||
|
||||
} // namespace pdf
|
||||
|
||||
#endif // PDFCONSTANTS_H
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
#include "pdfxreftable.h"
|
||||
#include "pdfexception.h"
|
||||
#include "pdfparser.h"
|
||||
#include "pdfstreamfilters.h"
|
||||
|
||||
#include <QFile>
|
||||
|
||||
|
@ -239,6 +240,7 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
|
|||
|
||||
std::vector<PDFXRefTable::Entry> occupiedEntries = xrefTable.getOccupiedEntries();
|
||||
|
||||
// First, process regular objects
|
||||
auto processEntry = [this, &getObject, &objectFetcher, &objects](const PDFXRefTable::Entry& entry)
|
||||
{
|
||||
Q_ASSERT(entry.type == PDFXRefTable::EntryType::Occupied);
|
||||
|
@ -248,7 +250,10 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
|
|||
try
|
||||
{
|
||||
PDFParsingContext context(objectFetcher);
|
||||
objects[entry.reference.objectNumber] = PDFObjectStorage::Entry(entry.reference.generation, getObject(&context, entry.offset, entry.reference));
|
||||
PDFObject object = getObject(&context, entry.offset, entry.reference);
|
||||
|
||||
QMutexLocker lock(&m_mutex);
|
||||
objects[entry.reference.objectNumber] = PDFObjectStorage::Entry(entry.reference.generation, object);
|
||||
}
|
||||
catch (PDFParserException exception)
|
||||
{
|
||||
|
@ -262,6 +267,108 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
|
|||
// Now, we are ready to scan all objects
|
||||
std::for_each(std::execution::parallel_policy(), occupiedEntries.cbegin(), occupiedEntries.cend(), processEntry);
|
||||
|
||||
// Then process object streams
|
||||
std::vector<PDFXRefTable::Entry> objectStreamEntries = xrefTable.getObjectStreamEntries();
|
||||
std::set<PDFObjectReference> objectStreams;
|
||||
for (const PDFXRefTable::Entry& entry : objectStreamEntries)
|
||||
{
|
||||
Q_ASSERT(entry.type == PDFXRefTable::EntryType::InObjectStream);
|
||||
objectStreams.insert(entry.objectStream);
|
||||
}
|
||||
|
||||
// Decodes a single object stream (/Type /ObjStm) and stores the objects it contains
// into 'objects'. The functor is executed by a parallel std::for_each, so every write
// to shared state (objects, m_successfull, m_errorMessage) is done under m_mutex.
// Parsing failures are not propagated; they are recorded in m_successfull/m_errorMessage.
auto processObjectStream = [this, &getObject, &objectFetcher, &objects, &objectStreamEntries] (const PDFObjectReference& objectStreamReference)
{
    // Some other object stream has already failed, there is no point in continuing
    if (!m_successfull)
    {
        return;
    }

    try
    {
        PDFParsingContext context(objectFetcher);
        if (objectStreamReference.objectNumber >= static_cast<PDFInteger>(objects.size()))
        {
            throw PDFParserException(PDFTranslationContext::tr("Object stream %1 not found.").arg(objectStreamReference.objectNumber));
        }

        const PDFObject& object = objects[objectStreamReference.objectNumber].object;
        if (!object.isStream())
        {
            throw PDFParserException(PDFTranslationContext::tr("Object stream %1 is invalid.").arg(objectStreamReference.objectNumber));
        }

        const PDFStream* objectStream = object.getStream();
        const PDFDictionary* objectStreamDictionary = objectStream->getDictionary();

        const PDFObject& objectStreamType = objectStreamDictionary->get("Type");
        if (!objectStreamType.isName() || objectStreamType.getString() != "ObjStm")
        {
            throw PDFParserException(PDFTranslationContext::tr("Object stream %1 is invalid.").arg(objectStreamReference.objectNumber));
        }

        const PDFObject& nObject = objectStreamDictionary->get("N");
        const PDFObject& firstObject = objectStreamDictionary->get("First");
        if (!nObject.isInt() || !firstObject.isInt())
        {
            throw PDFParserException(PDFTranslationContext::tr("Object stream %1 is invalid.").arg(objectStreamReference.objectNumber));
        }

        // Number of objects in object stream dictionary, and offset of the first object
        const PDFInteger n = nObject.getInteger();
        const PDFInteger first = firstObject.getInteger();

        QByteArray objectStreamData = PDFStreamFilterStorage::getDecodedStream(objectStream);

        // Both values come from the (untrusted) document. Negative values are meaningless and
        // a count larger than the decoded data cannot be satisfied; reject them before they
        // are used as a container capacity below.
        if (n < 0 || first < 0 || n > static_cast<PDFInteger>(objectStreamData.size()))
        {
            throw PDFParserException(PDFTranslationContext::tr("Object stream %1 is invalid.").arg(objectStreamReference.objectNumber));
        }

        PDFParsingContext::PDFParsingContextGuard guard(&context, objectStreamReference);
        PDFParser parser(objectStreamData, &context, PDFParser::AllowStreams);

        // The stream starts with n pairs (object number, offset relative to 'first')
        std::vector<std::pair<PDFInteger, PDFInteger>> objectNumberAndOffset;
        objectNumberAndOffset.reserve(static_cast<size_t>(n));
        for (PDFInteger i = 0; i < n; ++i)
        {
            PDFObject currentObjectNumber = parser.getObject();
            PDFObject currentOffset = parser.getObject();

            if (!currentObjectNumber.isInt() || !currentOffset.isInt())
            {
                throw PDFParserException(PDFTranslationContext::tr("Object stream %1 is invalid.").arg(objectStreamReference.objectNumber));
            }

            const PDFInteger objectNumber = currentObjectNumber.getInteger();
            const PDFInteger offset = currentOffset.getInteger() + first;
            objectNumberAndOffset.emplace_back(objectNumber, offset);
        }

        // Now read the objects themselves
        for (size_t i = 0; i < objectNumberAndOffset.size(); ++i)
        {
            const PDFInteger objectNumber = objectNumberAndOffset[i].first;
            const PDFInteger offset = objectNumberAndOffset[i].second;
            parser.seek(offset);

            PDFObject object = parser.getObject();

            // The object is accepted only if the reference table says that this object
            // number is stored in this very object stream.
            auto predicate = [objectNumber, objectStreamReference](const PDFXRefTable::Entry& entry) -> bool { return entry.reference.objectNumber == objectNumber && entry.objectStream == objectStreamReference; };
            if (std::find_if(objectStreamEntries.cbegin(), objectStreamEntries.cend(), predicate) != objectStreamEntries.cend())
            {
                QMutexLocker lock(&m_mutex);
                objects[objectNumber].object = qMove(object);
            }
            else
            {
                throw PDFParserException(PDFTranslationContext::tr("Object stream %1 is invalid.").arg(objectStreamReference.objectNumber));
            }
        }
    }
    catch (const PDFParserException& exception)
    {
        QMutexLocker lock(&m_mutex);
        m_successfull = false;
        m_errorMessage = exception.getMessage();
    }
};
|
||||
|
||||
// Now, we are ready to scan all object streams
|
||||
std::for_each(std::execution::parallel_policy(), objectStreams.cbegin(), objectStreams.cend(), processObjectStream);
|
||||
|
||||
PDFObjectStorage storage(std::move(objects), PDFObject(xrefTable.getTrailerDictionary()));
|
||||
return PDFDocument(std::move(storage));
|
||||
}
|
||||
|
|
|
@ -894,7 +894,7 @@ PDFObject PDFParser::getObject(PDFObjectReference reference)
|
|||
|
||||
void PDFParser::error(const QString& message) const
|
||||
{
|
||||
throw new PDFParserException(message);
|
||||
throw PDFParserException(message);
|
||||
}
|
||||
|
||||
void PDFParser::seek(PDFInteger offset)
|
||||
|
|
|
@ -354,16 +354,11 @@ QByteArray PDFLzwDecodeFilter::apply(const QByteArray& data, const PDFObjectFetc
|
|||
{
|
||||
early = earlyChangeObject.getInteger();
|
||||
}
|
||||
|
||||
if (predictor != 1)
|
||||
{
|
||||
// TODO: Implement Predictor algorithm
|
||||
return QByteArray();
|
||||
}
|
||||
}
|
||||
|
||||
PDFStreamPredictor predictor = PDFStreamPredictor::createPredictor(objectFetcher, parameters);
|
||||
PDFLzwStreamDecoder decoder(data, early);
|
||||
return decoder.decompress();
|
||||
return predictor.apply(decoder.decompress());
|
||||
}
|
||||
|
||||
QByteArray PDFFlateDecodeFilter::apply(const QByteArray& data, const PDFObjectFetcher& objectFetcher, const PDFObject& parameters) const
|
||||
|
@ -379,12 +374,6 @@ QByteArray PDFFlateDecodeFilter::apply(const QByteArray& data, const PDFObjectFe
|
|||
{
|
||||
predictor = predictorObject.getInteger();
|
||||
}
|
||||
|
||||
if (predictor != 1)
|
||||
{
|
||||
// TODO: Implement Predictor algorithm
|
||||
return QByteArray();
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t size = data.size();
|
||||
|
@ -395,7 +384,8 @@ QByteArray PDFFlateDecodeFilter::apply(const QByteArray& data, const PDFObjectFe
|
|||
qToBigEndian(size, dataToUncompress.data());
|
||||
std::copy(data.cbegin(), data.cend(), std::next(dataToUncompress.begin(), sizeof(decltype(size))));
|
||||
|
||||
return qUncompress(dataToUncompress);
|
||||
PDFStreamPredictor predictor = PDFStreamPredictor::createPredictor(objectFetcher, parameters);
|
||||
return predictor.apply(qUncompress(dataToUncompress));
|
||||
}
|
||||
|
||||
QByteArray PDFRunLengthDecodeFilter::apply(const QByteArray& data, const PDFObjectFetcher& objectFetcher, const PDFObject& parameters) const
|
||||
|
@ -571,4 +561,182 @@ const PDFStreamFilterStorage* PDFStreamFilterStorage::getInstance()
|
|||
return &instance;
|
||||
}
|
||||
|
||||
/// Creates predictor from the decode parameters of a stream. If \p parameters is not
/// a dictionary (after dereferencing), a predictor which leaves data untouched is returned.
/// \param objectFetcher Function used to dereference objects
/// \param parameters Decode parameters of the stream
/// \throws PDFParserException if some parameter has invalid type or is out of range
PDFStreamPredictor PDFStreamPredictor::createPredictor(const PDFObjectFetcher& objectFetcher, const PDFObject& parameters)
{
    const PDFObject& dereferencedParameters = objectFetcher(parameters);
    if (dereferencedParameters.isDictionary())
    {
        const PDFDictionary* dictionary = dereferencedParameters.getDictionary();

        // Reads integer stored under 'key'. Missing (null) value yields 'defaultValue',
        // value outside of <min, max> or value of other type causes an exception.
        auto getInteger = [dictionary, &objectFetcher](const char* key, int min, int max, int defaultValue) -> int
        {
            const PDFObject& object = objectFetcher(dictionary->get(key));

            if (object.isInt())
            {
                PDFInteger value = object.getInteger();
                if (value < min || value > max)
                {
                    throw PDFParserException(PDFTranslationContext::tr("Property '%1' should be in range from %2 to %3.").arg(QString::fromLatin1(key)).arg(min).arg(max));
                }

                // Range has been checked above, so the conversion can't lose information
                return static_cast<int>(value);
            }
            else if (object.isNull())
            {
                return defaultValue;
            }

            throw PDFParserException(PDFTranslationContext::tr("Invalid property '%1' of the stream predictor parameters.").arg(QString::fromLatin1(key)));
            return 0;
        };

        int predictor = getInteger("Predictor", 1, 15, 1);
        int components = getInteger("Colors", 1, PDF_MAX_COLOR_COMPONENTS, 1);
        int bitsPerComponent = getInteger("BitsPerComponent", 1, 16, 8);

        // Row size is computed as (columns * components * bitsPerComponent + 7) / 8 using
        // int arithmetic. Limit the number of columns so this expression can't overflow.
        const int maxColumns = (std::numeric_limits<int>::max() - 7) / (components * bitsPerComponent);
        int columns = getInteger("Columns", 1, maxColumns, 1);

        return PDFStreamPredictor(static_cast<Predictor>(predictor), components, bitsPerComponent, columns);
    }

    return PDFStreamPredictor();
}
|
||||
|
||||
/// Decodes \p data according to the configured predictor algorithm.
/// \param data Data to be decoded
/// \throws PDFParserException if the predictor algorithm is not recognized
QByteArray PDFStreamPredictor::apply(const QByteArray& data) const
{
    // Nothing to do, data are stored as they are
    if (m_predictor == NoPredictor)
    {
        return data;
    }

    if (m_predictor == TIFF)
    {
        return applyTIFFPredictor(data);
    }

    // All values from 10 upwards select the PNG family of predictors
    if (m_predictor >= 10)
    {
        return applyPNGPredictor(data);
    }

    throw PDFParserException(PDFTranslationContext::tr("Invalid predictor algorithm."));
}
|
||||
|
||||
/// Decodes data encoded by PNG predictor. Each row of the input consists of one
/// byte selecting the algorithm for that row, followed by m_stride bytes of row data.
/// Incomplete last row is completed with zero bytes.
/// \param data Data to be decoded
/// \returns Decoded data (without the leading algorithm byte of each row)
QByteArray PDFStreamPredictor::applyPNGPredictor(const QByteArray& data) const
{
    QByteArray outputData;
    outputData.reserve(data.size());

    auto it = data.cbegin();
    const auto itEnd = data.cend();

    // Distance (in bytes) between a byte and the corresponding byte of the pixel on the left
    const int pixelBytes = (m_components * m_bitsPerComponent + 7) / 8;

    auto readByte = [&it, &itEnd]() -> uint8_t
    {
        if (it != itEnd)
        {
            return static_cast<uint8_t>(*it++);
        }

        // According to the PDF specification, incomplete line is completed. For this
        // reason, we behave as we have zero data in the buffer.
        return 0;
    };

    // Idea: to avoid using if for many cases, we use larger buffer filled with zeros.
    // First pixelBytes bytes of both buffers are never written, so "left" and "upper left"
    // bytes of the first pixel in the row are always zero.
    const int totalBytes = m_stride + pixelBytes;
    std::vector<uint8_t> line(totalBytes, 0);
    std::vector<uint8_t> lineOld(totalBytes, 0);

    while (it != itEnd)
    {
        // First, read the predictor data for current line. The value is kept as plain
        // integer, because the byte comes from the document and can be anything from
        // 0 to 255; converting such value to the Predictor enum would be undefined
        // behavior for values which the enum can't represent.
        const int currentPredictor = readByte() + 10;

        for (int i = 0; i < m_stride; ++i)
        {
            const uint8_t currentByte = readByte();
            const int lineIndex = i + pixelBytes;

            // Predicted value, which is added (modulo 256) to the byte read from the input
            uint8_t prediction = 0;
            switch (currentPredictor)
            {
                case PNG_Sub:
                {
                    prediction = line[i];
                    break;
                }

                case PNG_Up:
                {
                    prediction = lineOld[lineIndex];
                    break;
                }

                case PNG_Average:
                {
                    prediction = static_cast<uint8_t>((lineOld[lineIndex] + line[i]) / 2);
                    break;
                }

                case PNG_Paeth:
                {
                    // a = left,
                    // b = upper,
                    // c = upper left
                    const int a = line[i];
                    const int b = lineOld[lineIndex];
                    const int c = lineOld[i];
                    const int p = a + b - c;
                    const int pa = std::abs(p - a);
                    const int pb = std::abs(p - b);
                    const int pc = std::abs(p - c);
                    if (pa <= pb && pa <= pc)
                    {
                        prediction = static_cast<uint8_t>(a);
                    }
                    else if (pb <= pc)
                    {
                        prediction = static_cast<uint8_t>(b);
                    }
                    else
                    {
                        prediction = static_cast<uint8_t>(c);
                    }
                    break;
                }

                case PNG_None:
                default:
                {
                    // Unknown algorithms are treated as no prediction
                    prediction = 0;
                    break;
                }
            }

            line[lineIndex] = static_cast<uint8_t>(prediction + currentByte);

            // Fill the output buffer
            outputData.push_back(static_cast<char>(line[lineIndex]));
        }

        // Swap the buffers, current line becomes the upper line of the next row
        std::swap(line, lineOld);
    }

    return outputData;
}
|
||||
|
||||
/// Placeholder for the TIFF predictor. The algorithm is not implemented,
/// so the function always throws PDFParserException.
/// \param data Data to be decoded (currently unused)
QByteArray PDFStreamPredictor::applyTIFFPredictor(const QByteArray& data) const
{
    // TODO: Implement TIFF algorithm filter
    Q_UNUSED(data);

    throw PDFParserException(PDFTranslationContext::tr("Invalid predictor algorithm."));
}
|
||||
|
||||
} // namespace pdf
|
||||
|
|
|
@ -64,6 +64,56 @@ private:
|
|||
std::map<QByteArray, QByteArray> m_abbreviations;
|
||||
};
|
||||
|
||||
class PDFStreamPredictor
|
||||
{
|
||||
public:
|
||||
/// Create predictor from stream parameters. If error occurs, exception is thrown.
|
||||
/// \param objectFetcher Function which retrieves objects (for example, reads objects from reference)
|
||||
/// \param parameters Parameters of the predictor (must be an dictionary)
|
||||
static PDFStreamPredictor createPredictor(const PDFObjectFetcher& objectFetcher, const PDFObject& parameters);
|
||||
|
||||
/// Applies the predictor to the data.
|
||||
/// \param data Data to be decoded using predictor
|
||||
QByteArray apply(const QByteArray& data) const;
|
||||
|
||||
private:
|
||||
|
||||
enum Predictor
|
||||
{
|
||||
NoPredictor = 1,
|
||||
TIFF = 2,
|
||||
PNG_None = 10, ///< No prediction
|
||||
PNG_Sub = 11, ///< Prediction based on previous byte
|
||||
PNG_Up = 12, ///< Prediction based on byte above
|
||||
PNG_Average = 13, ///< Prediction based on average of previous nad current byte
|
||||
PNG_Paeth = 14, ///< Nonlinear function
|
||||
};
|
||||
|
||||
inline explicit PDFStreamPredictor() = default;
|
||||
|
||||
inline explicit PDFStreamPredictor(Predictor predictor, int components, int bitsPerComponent, int columns) :
|
||||
m_predictor(predictor),
|
||||
m_components(components),
|
||||
m_bitsPerComponent(bitsPerComponent),
|
||||
m_columns(columns),
|
||||
m_stride(0)
|
||||
{
|
||||
m_stride = (m_columns * m_components * m_bitsPerComponent + 7) / 8;
|
||||
}
|
||||
|
||||
/// Applies PNG predictor
|
||||
QByteArray applyPNGPredictor(const QByteArray& data) const;
|
||||
|
||||
/// Applies TIFF predictor
|
||||
QByteArray applyTIFFPredictor(const QByteArray& data) const;
|
||||
|
||||
Predictor m_predictor = NoPredictor;
|
||||
int m_components = 0;
|
||||
int m_bitsPerComponent = 0;
|
||||
int m_columns = 0;
|
||||
int m_stride = 0;
|
||||
};
|
||||
|
||||
class PDFFORQTLIBSHARED_EXPORT PDFStreamFilter
|
||||
{
|
||||
public:
|
||||
|
|
|
@ -44,7 +44,8 @@ void PDFXRefTable::readXRefTable(PDFParsingContext* context, const QByteArray& b
|
|||
// Check, if we have cyclical references between tables
|
||||
if (processedOffsets.count(currentOffset))
|
||||
{
|
||||
throw PDFParserException(tr("Cyclic reference found in reference table."));
|
||||
// If cyclical reference occurs, do not report error, just ignore it.
|
||||
continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -300,6 +301,7 @@ void PDFXRefTable::readXRefTable(PDFParsingContext* context, const QByteArray& b
|
|||
case 0:
|
||||
// Free object
|
||||
break;
|
||||
|
||||
case 1:
|
||||
{
|
||||
Entry entry;
|
||||
|
@ -313,7 +315,23 @@ void PDFXRefTable::readXRefTable(PDFParsingContext* context, const QByteArray& b
|
|||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case 2:
|
||||
{
|
||||
Entry entry;
|
||||
entry.reference = PDFObjectReference(objectNumber, 0);
|
||||
entry.objectStream = PDFObjectReference(itemObjectNumberOfObjectStreamOrByteOffset, 0);
|
||||
entry.indexInObjectStream = itemGenerationNumberOrObjectIndex;
|
||||
entry.type = EntryType::InObjectStream;
|
||||
|
||||
if (m_entries[objectNumber].type == EntryType::Free)
|
||||
{
|
||||
m_entries[objectNumber] = std::move(entry);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
// According to the specification, treat this object as null object
|
||||
break;
|
||||
|
@ -341,6 +359,17 @@ std::vector<PDFXRefTable::Entry> PDFXRefTable::getOccupiedEntries() const
|
|||
return result;
|
||||
}
|
||||
|
||||
std::vector<PDFXRefTable::Entry> PDFXRefTable::getObjectStreamEntries() const
|
||||
{
|
||||
std::vector<PDFXRefTable::Entry> result;
|
||||
|
||||
// Suppose majority of items are occupied
|
||||
result.reserve(m_entries.size());
|
||||
std::copy_if(m_entries.cbegin(), m_entries.cend(), std::back_inserter(result), [](const Entry& entry) { return entry.type == EntryType::InObjectStream; });
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
const PDFXRefTable::Entry& PDFXRefTable::getEntry(PDFObjectReference reference) const
|
||||
{
|
||||
// We must also check generation number here. For this reason, we compare references of the entry at given position.
|
||||
|
|
|
@ -49,14 +49,17 @@ public:
|
|||
|
||||
/// Type of the entry in the cross-reference table
enum class EntryType
{
    Free,           ///< Entry represents a free item (no object)
    Occupied,       ///< Entry represents a occupied item (object)
    InObjectStream  ///< Entry in object stream
};
|
||||
|
||||
/// Single entry of the cross-reference table. A default constructed entry is free.
struct Entry
|
||||
{
|
||||
/// Reference (object number and generation) of the object described by this entry
PDFObjectReference reference;
|
||||
/// Reference to the object stream containing the object; filled in for entries
/// of type InObjectStream
PDFObjectReference objectStream;
|
||||
/// Offset passed to the object reader when an occupied entry is loaded; -1 if not set
PDFInteger offset = -1;
|
||||
/// Index of the object within its object stream, as read from the reference
/// table for entries of type InObjectStream; -1 if not set
PDFInteger indexInObjectStream = -1;
|
||||
/// Type of the entry
EntryType type = EntryType::Free;
|
||||
};
|
||||
|
||||
|
@ -70,6 +73,9 @@ public:
|
|||
/// Filters only occupied entries and returns them
|
||||
std::vector<Entry> getOccupiedEntries() const;
|
||||
|
||||
/// Filters only object stream entries and returns them
|
||||
std::vector<Entry> getObjectStreamEntries() const;
|
||||
|
||||
/// Returns size of the reference table
|
||||
std::size_t getSize() const { return m_entries.size(); }
|
||||
|
||||
|
|
Loading…
Reference in New Issue