Refactoring of document reading

This commit is contained in:
Jakub Melka
2020-09-17 19:17:16 +02:00
parent eb253f8417
commit 2637dfed17
4 changed files with 432 additions and 355 deletions

View File

@ -979,7 +979,7 @@ PDFContentStreamBuilder::ContentStream PDFContentStreamBuilder::end(QPainter* pa
delete m_buffer; delete m_buffer;
m_buffer = nullptr; m_buffer = nullptr;
PDFDocumentReader reader(nullptr, nullptr); PDFDocumentReader reader(nullptr, nullptr, false);
result.document = reader.readFromBuffer(bufferData); result.document = reader.readFromBuffer(bufferData);
if (result.document.getCatalog()->getPageCount() > 0) if (result.document.getCatalog()->getPageCount() > 0)

View File

@ -34,10 +34,11 @@
namespace pdf namespace pdf
{ {
/// Constructs the reader: the result state starts as Result::OK, and the password callback and
/// the progress pointer are stored in the corresponding members.
/// Where a line below appears twice, the first copy is the pre-commit revision and the second
/// the post-commit one; only the latter takes \p permissive and stores it in m_permissive.
PDFDocumentReader::PDFDocumentReader(PDFProgress* progress, const std::function<QString(bool*)>& getPasswordCallback) : PDFDocumentReader::PDFDocumentReader(PDFProgress* progress, const std::function<QString(bool*)>& getPasswordCallback, bool permissive) :
m_result(Result::OK), m_result(Result::OK),
m_getPasswordCallback(getPasswordCallback), m_getPasswordCallback(getPasswordCallback),
m_progress(progress) m_progress(progress),
m_permissive(permissive)
{ {
} }
@ -103,44 +104,29 @@ PDFDocument PDFDocumentReader::readFromDevice(QIODevice* device)
return PDFDocument(); return PDFDocument();
} }
/// checkFooter (the second signature on the next line; the first one is the pre-commit
/// readFromBuffer, whose inline footer code is shown as removed text in this span).
/// Searches for PDF_END_OF_FILE_MARK using findFromEnd with PDF_FOOTER_SCAN_LIMIT as the limit.
/// If the mark is not found, a permissive reader appends the message to m_warnings while
/// holding m_mutex; a non-permissive reader throws PDFException with the same message.
PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer) void PDFDocumentReader::checkFooter(const QByteArray& buffer)
{ {
try
{
m_source = buffer;
// FOOTER CHECKING
// 1) Check, if EOF marking is present
// 2) Find start of cross reference table
if (findFromEnd(PDF_END_OF_FILE_MARK, buffer, PDF_FOOTER_SCAN_LIMIT) == FIND_NOT_FOUND_RESULT) if (findFromEnd(PDF_END_OF_FILE_MARK, buffer, PDF_FOOTER_SCAN_LIMIT) == FIND_NOT_FOUND_RESULT)
{ {
throw PDFException(tr("End of file marking was not found.")); QString message = tr("End of file marking was not found.");
} if (m_permissive)
const int startXRefPosition = findFromEnd(PDF_START_OF_XREF_MARK, buffer, PDF_FOOTER_SCAN_LIMIT);
if (startXRefPosition == FIND_NOT_FOUND_RESULT)
{ {
throw PDFException(tr("Start of object reference table not found.")); QMutexLocker lock(&m_mutex);
m_warnings << message;
} }
else
Q_ASSERT(startXRefPosition + std::strlen(PDF_START_OF_XREF_MARK) < buffer.size());
PDFLexicalAnalyzer analyzer(buffer.constData() + startXRefPosition + std::strlen(PDF_START_OF_XREF_MARK), buffer.constData() + buffer.size());
const PDFLexicalAnalyzer::Token token = analyzer.fetch();
if (token.type != PDFLexicalAnalyzer::TokenType::Integer)
{ {
throw PDFException(tr("Start of object reference table not found.")); throw PDFException(message);
}
}
} }
const PDFInteger firstXrefTableOffset = token.data.toLongLong();
// HEADER CHECKING
// 1) Check if header is present
// 2) Scan header version
void PDFDocumentReader::checkHeader(const QByteArray& buffer)
{
// According to PDF Reference 1.7, Appendix H, file header can have two formats: // According to PDF Reference 1.7, Appendix H, file header can have two formats:
// - %PDF-x.x // - %PDF-x.x
// - %!PS-Adobe-y.y PDF-x.x // - %!PS-Adobe-y.y PDF-x.x
// We will search for both of these formats. // We will search for both of these formats.
std::regex headerRegExp(PDF_FILE_HEADER_REGEXP); std::regex headerRegExp(PDF_FILE_HEADER_REGEXP);
std::cmatch headerMatch; std::cmatch headerMatch;
@ -173,18 +159,33 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
{ {
throw PDFException(tr("Version of the PDF file is not valid.")); throw PDFException(tr("Version of the PDF file is not valid."));
} }
}
/// findXrefTableOffset (post-commit text; where a line carries two fragments, the first one is
/// removed pre-commit readFromBuffer code).
/// Locates PDF_START_OF_XREF_MARK using findFromEnd with PDF_FOOTER_SCAN_LIMIT as the limit, then
/// lexes the token that starts right after the mark.
/// \returns Value of that token (toLongLong), used by the caller as the first xref table offset
/// \throws PDFException if the mark is not found or the token after it is not an integer
/// NOTE(review): the 'const' qualifier on the by-value return type has no effect for callers.
// Now, we are ready to scan xref table const PDFInteger PDFDocumentReader::findXrefTableOffset(const QByteArray& buffer)
PDFXRefTable xrefTable; {
xrefTable.readXRefTable(nullptr, buffer, firstXrefTableOffset); const int startXRefPosition = findFromEnd(PDF_START_OF_XREF_MARK, buffer, PDF_FOOTER_SCAN_LIMIT);
if (startXRefPosition == FIND_NOT_FOUND_RESULT)
{
throw PDFException(tr("Start of object reference table not found."));
}
// This lambda function fetches object from the buffer from the specified offset. Q_ASSERT(startXRefPosition + std::strlen(PDF_START_OF_XREF_MARK) < buffer.size());
// Can throw exception, returns a pair of scanned reference and object content. PDFLexicalAnalyzer analyzer(buffer.constData() + startXRefPosition + std::strlen(PDF_START_OF_XREF_MARK), buffer.constData() + buffer.size());
auto getObject = [&buffer](PDFParsingContext* context, PDFInteger offset, PDFObjectReference reference) -> PDFObject const PDFLexicalAnalyzer::Token token = analyzer.fetch();
if (token.type != PDFLexicalAnalyzer::TokenType::Integer)
{
throw PDFException(tr("Start of object reference table not found."));
}
const PDFInteger firstXrefTableOffset = token.data.toLongLong();
return firstXrefTableOffset;
}
PDFObject PDFDocumentReader::getObject(PDFParsingContext* context, PDFInteger offset, PDFObjectReference reference) const
{ {
PDFParsingContext::PDFParsingContextGuard guard(context, reference); PDFParsingContext::PDFParsingContextGuard guard(context, reference);
PDFParser parser(buffer, context, PDFParser::AllowStreams); PDFParser parser(m_source, context, PDFParser::AllowStreams);
parser.seek(offset); parser.seek(offset);
PDFObject objectNumber = parser.getObject(); PDFObject objectNumber = parser.getObject();
@ -214,11 +215,11 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
} }
return object; return object;
}; }
auto objectFetcher = [&getObject, &xrefTable](PDFParsingContext* context, PDFObjectReference reference) -> PDFObject PDFObject PDFDocumentReader::getObjectFromXrefTable(PDFXRefTable* xrefTable, PDFParsingContext* context, PDFObjectReference reference) const
{ {
const PDFXRefTable::Entry& entry = xrefTable.getEntry(reference); const PDFXRefTable::Entry& entry = xrefTable->getEntry(reference);
switch (entry.type) switch (entry.type)
{ {
case PDFXRefTable::EntryType::Free: case PDFXRefTable::EntryType::Free:
@ -238,15 +239,12 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
} }
return PDFObject(); return PDFObject();
}; }
PDFObjectStorage::PDFObjects objects; PDFDocumentReader::Result PDFDocumentReader::processReferenceTableEntries(PDFXRefTable* xrefTable, const std::vector<PDFXRefTable::Entry>& occupiedEntries, PDFObjectStorage::PDFObjects& objects)
objects.resize(xrefTable.getSize()); {
auto objectFetcher = [this, xrefTable](PDFParsingContext* context, PDFObjectReference reference) { return getObjectFromXrefTable(xrefTable, context, reference); };
std::vector<PDFXRefTable::Entry> occupiedEntries = xrefTable.getOccupiedEntries(); auto processEntry = [this, &objectFetcher, &objects](const PDFXRefTable::Entry& entry)
// First, process regular objects
auto processEntry = [this, &getObject, &objectFetcher, &objects](const PDFXRefTable::Entry& entry)
{ {
Q_ASSERT(entry.type == PDFXRefTable::EntryType::Occupied); Q_ASSERT(entry.type == PDFXRefTable::EntryType::Occupied);
@ -265,10 +263,18 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
catch (PDFException exception) catch (PDFException exception)
{ {
QMutexLocker lock(&m_mutex); QMutexLocker lock(&m_mutex);
if (m_permissive)
{
m_warnings << exception.getMessage();
}
else
{
m_result = Result::Failed; m_result = Result::Failed;
m_errorMessage = exception.getMessage(); m_errorMessage = exception.getMessage();
} }
} }
}
}; };
// Now, we are ready to scan all objects // Now, we are ready to scan all objects
@ -279,17 +285,13 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
progressFinish(); progressFinish();
} }
if (m_result != Result::OK) return m_result;
{
// Do not proceed further, if document loading failed
return PDFDocument();
} }
// ------------------------------------------------------------------------------------------ PDFDocumentReader::Result PDFDocumentReader::processSecurityHandler(const PDFObject& trailerDictionaryObject,
// SECURITY - handle encrypted documents const std::vector<PDFXRefTable::Entry>& occupiedEntries,
// ------------------------------------------------------------------------------------------ PDFObjectStorage::PDFObjects& objects)
const PDFObject& trailerDictionaryObject = xrefTable.getTrailerDictionary(); {
const PDFDictionary* trailerDictionary = nullptr; const PDFDictionary* trailerDictionary = nullptr;
if (trailerDictionaryObject.isDictionary()) if (trailerDictionaryObject.isDictionary())
{ {
@ -334,14 +336,14 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
} }
// Read the security handler // Read the security handler
PDFSecurityHandlerPointer securityHandler = PDFSecurityHandler::createSecurityHandler(encryptObject, id); m_securityHandler = PDFSecurityHandler::createSecurityHandler(encryptObject, id);
PDFSecurityHandler::AuthorizationResult authorizationResult = securityHandler->authenticate(m_getPasswordCallback); PDFSecurityHandler::AuthorizationResult authorizationResult = m_securityHandler->authenticate(m_getPasswordCallback);
if (authorizationResult == PDFSecurityHandler::AuthorizationResult::Cancelled) if (authorizationResult == PDFSecurityHandler::AuthorizationResult::Cancelled)
{ {
// User cancelled the document reading // User cancelled the document reading
m_result = Result::Cancelled; m_result = Result::Cancelled;
return PDFDocument(); return m_result;
} }
if (authorizationResult == PDFSecurityHandler::AuthorizationResult::Failed) if (authorizationResult == PDFSecurityHandler::AuthorizationResult::Failed)
@ -360,9 +362,9 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
// because it needs object number and generation for generating the decrypt key. So 1) is handled // because it needs object number and generation for generating the decrypt key. So 1) is handled
// automatically. 2) is handled in the code below. 3) is handled also automatically, because we do not // automatically. 2) is handled in the code below. 3) is handled also automatically, because we do not
// decipher object streams here. 4) must be handled in the security handler. // decipher object streams here. 4) must be handled in the security handler.
if (securityHandler->getMode() != EncryptionMode::None) if (m_securityHandler->getMode() != EncryptionMode::None)
{ {
auto decryptEntry = [this, encryptObjectReference, &securityHandler, &objects](const PDFXRefTable::Entry& entry) auto decryptEntry = [this, encryptObjectReference, &objects](const PDFXRefTable::Entry& entry)
{ {
progressStep(); progressStep();
@ -372,7 +374,7 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
return; return;
} }
objects[entry.reference.objectNumber].object = securityHandler->decryptObject(objects[entry.reference.objectNumber].object, entry.reference); objects[entry.reference.objectNumber].object = m_securityHandler->decryptObject(objects[entry.reference.objectNumber].object, entry.reference);
}; };
progressStart(occupiedEntries.size(), PDFTranslationContext::tr("Decrypting encrypted contents of document...")); progressStart(occupiedEntries.size(), PDFTranslationContext::tr("Decrypting encrypted contents of document..."));
@ -380,12 +382,13 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
progressFinish(); progressFinish();
} }
// ------------------------------------------------------------------------------------------ return m_result;
// SECURITY - security handler created }
// ------------------------------------------------------------------------------------------
void PDFDocumentReader::processObjectStreams(PDFXRefTable* xrefTable, PDFObjectStorage::PDFObjects& objects)
{
// Then process object streams // Then process object streams
std::vector<PDFXRefTable::Entry> objectStreamEntries = xrefTable.getObjectStreamEntries(); std::vector<PDFXRefTable::Entry> objectStreamEntries = xrefTable->getObjectStreamEntries();
std::set<PDFObjectReference> objectStreams; std::set<PDFObjectReference> objectStreams;
for (const PDFXRefTable::Entry& entry : objectStreamEntries) for (const PDFXRefTable::Entry& entry : objectStreamEntries)
{ {
@ -393,7 +396,8 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
objectStreams.insert(entry.objectStream); objectStreams.insert(entry.objectStream);
} }
auto processObjectStream = [this, &getObject, &objectFetcher, &objects, &objectStreamEntries, &securityHandler] (const PDFObjectReference& objectStreamReference) auto objectFetcher = [this, xrefTable](PDFParsingContext* context, PDFObjectReference reference) { return getObjectFromXrefTable(xrefTable, context, reference); };
auto processObjectStream = [this, &objectFetcher, &objects, &objectStreamEntries] (const PDFObjectReference& objectStreamReference)
{ {
if (m_result != Result::OK) if (m_result != Result::OK)
{ {
@ -434,7 +438,7 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
const PDFInteger n = nObject.getInteger(); const PDFInteger n = nObject.getInteger();
const PDFInteger first = firstObject.getInteger(); const PDFInteger first = firstObject.getInteger();
QByteArray objectStreamData = PDFStreamFilterStorage::getDecodedStream(objectStream, securityHandler.data()); QByteArray objectStreamData = PDFStreamFilterStorage::getDecodedStream(objectStream, m_securityHandler.data());
PDFParsingContext::PDFParsingContextGuard guard(&context, objectStreamReference); PDFParsingContext::PDFParsingContextGuard guard(&context, objectStreamReference);
PDFParser parser(objectStreamData, &context, PDFParser::AllowStreams); PDFParser parser(objectStreamData, &context, PDFParser::AllowStreams);
@ -485,14 +489,56 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
// Now, we are ready to scan all object streams // Now, we are ready to scan all object streams
PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Unknown, objectStreams.cbegin(), objectStreams.cend(), processObjectStream); PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Unknown, objectStreams.cbegin(), objectStreams.cend(), processObjectStream);
}
PDFObjectStorage storage(std::move(objects), PDFObject(xrefTable.getTrailerDictionary()), std::move(securityHandler)); PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
{
try
{
m_source = buffer;
// FOOTER CHECKING
// 1) Check, if EOF marking is present
// 2) Find start of cross reference table
checkFooter(buffer);
const PDFInteger firstXrefTableOffset = findXrefTableOffset(buffer);
// HEADER CHECKING
// 1) Check if header is present
// 2) Scan header version
checkHeader(buffer);
// Now, we are ready to scan xref table
PDFXRefTable xrefTable;
xrefTable.readXRefTable(nullptr, buffer, firstXrefTableOffset);
PDFObjectStorage::PDFObjects objects;
objects.resize(xrefTable.getSize());
std::vector<PDFXRefTable::Entry> occupiedEntries = xrefTable.getOccupiedEntries();
// First, process regular objects
if (processReferenceTableEntries(&xrefTable, occupiedEntries, objects) != Result::OK)
{
// Do not proceed further, if document loading failed
return PDFDocument();
}
if (processSecurityHandler(xrefTable.getTrailerDictionary(), occupiedEntries, objects) == Result::Cancelled)
{
return PDFDocument();
}
processObjectStreams(&xrefTable, objects);
PDFObjectStorage storage(std::move(objects), PDFObject(xrefTable.getTrailerDictionary()), qMove(m_securityHandler));
return PDFDocument(std::move(storage), m_version); return PDFDocument(std::move(storage), m_version);
} }
catch (PDFException parserException) catch (PDFException parserException)
{ {
m_result = Result::Failed; m_result = Result::Failed;
m_errorMessage = parserException.getMessage(); m_errorMessage = parserException.getMessage();
m_warnings << m_errorMessage;
} }
return PDFDocument(); return PDFDocument();
@ -503,6 +549,8 @@ void PDFDocumentReader::reset()
m_result = Result::OK; m_result = Result::OK;
m_errorMessage = QString(); m_errorMessage = QString();
m_version = PDFVersion(); m_version = PDFVersion();
m_source = QByteArray();
m_securityHandler = nullptr;
} }
int PDFDocumentReader::findFromEnd(const char* what, const QByteArray& byteArray, int limit) int PDFDocumentReader::findFromEnd(const char* what, const QByteArray& byteArray, int limit)

View File

@ -22,12 +22,15 @@
#include "pdfglobal.h" #include "pdfglobal.h"
#include "pdfdocument.h" #include "pdfdocument.h"
#include "pdfprogress.h" #include "pdfprogress.h"
#include "pdfxreftable.h"
#include <QtCore> #include <QtCore>
#include <QIODevice> #include <QIODevice>
namespace pdf namespace pdf
{ {
class PDFXRefTable;
class PDFParsingContext;
/// This class is a reader of PDF document from various devices (file, io device, /// This class is a reader of PDF document from various devices (file, io device,
/// byte buffer). This class doesn't throw exceptions, to check errors, use /// byte buffer). This class doesn't throw exceptions, to check errors, use
@ -37,7 +40,7 @@ class PDFFORQTLIBSHARED_EXPORT PDFDocumentReader
Q_DECLARE_TR_FUNCTIONS(pdf::PDFDocumentReader) Q_DECLARE_TR_FUNCTIONS(pdf::PDFDocumentReader)
public: public:
explicit PDFDocumentReader(PDFProgress* progress, const std::function<QString(bool*)>& getPasswordCallback); explicit PDFDocumentReader(PDFProgress* progress, const std::function<QString(bool*)>& getPasswordCallback, bool permissive);
constexpr inline PDFDocumentReader(const PDFDocumentReader&) = delete; constexpr inline PDFDocumentReader(const PDFDocumentReader&) = delete;
constexpr inline PDFDocumentReader(PDFDocumentReader&&) = delete; constexpr inline PDFDocumentReader(PDFDocumentReader&&) = delete;
@ -75,6 +78,9 @@ public:
/// Get source data of the document /// Get source data of the document
const QByteArray& getSource() const { return m_source; } const QByteArray& getSource() const { return m_source; }
/// Returns the warning messages collected so far. The reader appends to this list when a
/// permissive read tolerates a problem, and also when reading from a buffer fails with
/// an exception (the error message is added as well).
const QStringList& getWarnings() const { return m_warnings; }
private: private:
static constexpr const int FIND_NOT_FOUND_RESULT = -1; static constexpr const int FIND_NOT_FOUND_RESULT = -1;
@ -90,6 +96,20 @@ private:
/// \returns Position of string, or FIND_NOT_FOUND_RESULT /// \returns Position of string, or FIND_NOT_FOUND_RESULT
int findFromEnd(const char* what, const QByteArray& byteArray, int limit); int findFromEnd(const char* what, const QByteArray& byteArray, int limit);
/// Checks that the end-of-file mark is present in \p buffer. When it is missing, a permissive
/// reader records a warning; otherwise PDFException is thrown.
void checkFooter(const QByteArray& buffer);
/// Searches \p buffer for the file header using a regular expression. Throws PDFException
/// if the version of the PDF file is not valid (other failure paths are not visible in this change).
void checkHeader(const QByteArray& buffer);
/// Finds the start-of-xref mark in \p buffer and returns the integer token that follows it.
/// Throws PDFException if the mark is not found or is not followed by an integer.
const PDFInteger findXrefTableOffset(const QByteArray& buffer);
/// Processes the occupied entries of the reference table, working on \p objects. An exception
/// raised for an entry becomes a warning in permissive mode; otherwise the result is set
/// to Result::Failed and the error message is stored.
/// \returns Current reading result
Result processReferenceTableEntries(PDFXRefTable* xrefTable, const std::vector<PDFXRefTable::Entry>& occupiedEntries, PDFObjectStorage::PDFObjects& objects);
/// Creates the security handler (m_securityHandler) and authenticates it with the password
/// callback. If the handler's encryption mode is not None, decrypts the objects of the occupied entries.
/// \returns Result::Cancelled if the user cancelled the authentication, otherwise current reading result
Result processSecurityHandler(const PDFObject& trailerDictionaryObject, const std::vector<PDFXRefTable::Entry>& occupiedEntries, PDFObjectStorage::PDFObjects& objects);
/// Processes the object streams referenced by the object stream entries of \p xrefTable;
/// stream data are decoded with the security handler and then parsed.
void processObjectStreams(PDFXRefTable* xrefTable, PDFObjectStorage::PDFObjects& objects);
/// This function fetches object from the source data (m_source) from the specified offset.
/// Can throw exception, returns the content of the object.
PDFObject getObject(PDFParsingContext* context, PDFInteger offset, PDFObjectReference reference) const;
/// Fetch object from reference table: looks up the entry for \p reference in \p xrefTable
/// and dispatches on the entry type.
PDFObject getObjectFromXrefTable(PDFXRefTable* xrefTable, PDFParsingContext* context, PDFObjectReference reference) const;
void progressStart(size_t stepCount, QString text); void progressStart(size_t stepCount, QString text);
void progressStep(); void progressStep();
void progressFinish(); void progressFinish();
@ -115,6 +135,15 @@ private:
/// Raw document data (byte array containing source data for created document) /// Raw document data (byte array containing source data for created document)
QByteArray m_source; QByteArray m_source;
/// Security handler
PDFSecurityHandlerPointer m_securityHandler;
/// Be permissive when reading, tolerate errors and try to fix broken document
bool m_permissive;
/// Warnings collected during reading (see getWarnings())
/// NOTE(review): the reset() lines visible in this change clear m_source and m_securityHandler,
/// but not this list - confirm whether warnings are meant to survive a reset.
QStringList m_warnings;
}; };
} // namespace pdf } // namespace pdf

View File

@ -997,7 +997,7 @@ void PDFViewerMainWindow::openDocument(const QString& fileName)
}; };
// Try to open a new document // Try to open a new document
pdf::PDFDocumentReader reader(m_progress, qMove(queryPassword)); pdf::PDFDocumentReader reader(m_progress, qMove(queryPassword), true);
pdf::PDFDocument document = reader.readFromFile(fileName); pdf::PDFDocument document = reader.readFromFile(fileName);
result.errorMessage = reader.getErrorMessage(); result.errorMessage = reader.getErrorMessage();