mirror of
https://github.com/JakubMelka/PDF4QT.git
synced 2025-06-05 21:59:17 +02:00
Parsing X Reference table
This commit is contained in:
@@ -42,6 +42,14 @@ static constexpr const char* PDF_STREAM_DICT_FILE_FILTER = "FFilter";
|
||||
static constexpr const char* PDF_STREAM_DICT_FDECODE_PARMS = "FDecodeParms";
|
||||
static constexpr const char* PDF_STREAM_DICT_DECODED_LENGTH = "DL";
|
||||
|
||||
// xref table constants
|
||||
static constexpr const char* PDF_XREF_HEADER = "xref";
|
||||
static constexpr const char* PDF_XREF_TRAILER = "trailer";
|
||||
static constexpr const char* PDF_XREF_TRAILER_PREVIOUS = "Prev";
|
||||
static constexpr const char* PDF_XREF_TRAILER_XREFSTM = "XRefStm";
|
||||
static constexpr const char* PDF_XREF_FREE = "f";
|
||||
static constexpr const char* PDF_XREF_OCCUPIED = "n";
|
||||
|
||||
} // namespace pdf
|
||||
|
||||
#endif // PDFCONSTANTS_H
|
||||
|
@@ -19,6 +19,7 @@
|
||||
#include "pdfdocumentreader.h"
|
||||
#include "pdfparser.h"
|
||||
#include "pdfconstants.h"
|
||||
#include "pdfxreftable.h"
|
||||
|
||||
#include <QFile>
|
||||
|
||||
@@ -114,6 +115,15 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
|
||||
throw PDFParserException(tr("Start of object reference table not found."));
|
||||
}
|
||||
|
||||
Q_ASSERT(startXRefPosition + std::strlen(PDF_START_OF_XREF_MARK) < buffer.size());
|
||||
PDFLexicalAnalyzer analyzer(buffer.constData() + startXRefPosition + std::strlen(PDF_START_OF_XREF_MARK), buffer.constData() + buffer.size());
|
||||
const PDFLexicalAnalyzer::Token token = analyzer.fetch();
|
||||
if (token.type != PDFLexicalAnalyzer::TokenType::Integer)
|
||||
{
|
||||
throw PDFParserException(tr("Start of object reference table not found."));
|
||||
}
|
||||
const PDFInteger firstXrefTableOffset = token.data.toLongLong();
|
||||
|
||||
// HEADER CHECKING
|
||||
// 1) Check if header is present
|
||||
// 2) Scan header version
|
||||
@@ -123,7 +133,7 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
|
||||
// - %!PS-Adobe-y.y PDF-x.x
|
||||
// We will search for both of these formats.
|
||||
|
||||
std::regex headerRegExp("(%PDF-[[:digit:]]\\.[[:digit:]])|(%!PS-Adobe-[[:digit:]]\\.[[:digit:]] PDF-[[:digit:]]\\.[[:digit:]])");
|
||||
std::regex headerRegExp(PDF_FILE_HEADER_REGEXP);
|
||||
std::cmatch headerMatch;
|
||||
|
||||
auto itBegin = buffer.cbegin();
|
||||
@@ -156,6 +166,9 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
|
||||
throw PDFParserException(tr("Version of the PDF file is not valid."));
|
||||
}
|
||||
|
||||
// Now, we are ready to scan xref table
|
||||
PDFXRefTable xrefTable;
|
||||
xrefTable.readXRefTable(nullptr, buffer, firstXrefTableOffset);
|
||||
|
||||
}
|
||||
catch (PDFParserException parserException)
|
||||
|
@@ -31,13 +31,18 @@ namespace pdf
|
||||
/// This class is a reader of PDF document from various devices (file, io device,
|
||||
/// byte buffer). This class doesn't throw exceptions, to check errors, use
|
||||
/// appropriate functions.
|
||||
class PDFDocumentReader
|
||||
class PDFFORQTLIBSHARED_EXPORT PDFDocumentReader
|
||||
{
|
||||
Q_DECLARE_TR_FUNCTIONS(pdf::PDFDocumentReader)
|
||||
|
||||
public:
|
||||
explicit PDFDocumentReader();
|
||||
|
||||
constexpr inline PDFDocumentReader(const PDFDocumentReader&) = delete;
|
||||
constexpr inline PDFDocumentReader(PDFDocumentReader&&) = delete;
|
||||
constexpr inline PDFDocumentReader& operator=(const PDFDocumentReader&) = delete;
|
||||
constexpr inline PDFDocumentReader& operator=(PDFDocumentReader&&) = delete;
|
||||
|
||||
/// Reads a PDF document from the specified file. If file doesn't exist,
|
||||
/// cannot be opened or contain invalid pdf, empty PDF file is returned.
|
||||
/// No exception is thrown.
|
||||
@@ -56,6 +61,9 @@ public:
|
||||
/// Returns true, if document was successfully read from device
|
||||
bool isSuccessfull() const { return m_successfull; }
|
||||
|
||||
/// Returns error message, if document reading was unsuccessful
|
||||
const QString& getErrorMessage() const { return m_errorMessage; }
|
||||
|
||||
private:
|
||||
static constexpr const int FIND_NOT_FOUND_RESULT = -1;
|
||||
|
||||
|
@@ -30,6 +30,14 @@ QByteArray PDFObject::getString() const
|
||||
return string->getString();
|
||||
}
|
||||
|
||||
/// Returns the content of this object viewed as a dictionary. The content
/// pointer is read from the variant m_data; the downcast itself is unchecked,
/// only the assertion verifies that the content really is a PDFDictionary.
const PDFDictionary* PDFObject::getDictionary() const
{
    const auto* rawContent = std::get<PDFObjectContentPointer>(m_data).get();

    // Sanity check of the dynamic type before the unchecked static downcast
    Q_ASSERT(dynamic_cast<const PDFDictionary*>(rawContent));
    return static_cast<const PDFDictionary*>(rawContent);
}
|
||||
|
||||
bool PDFObject::operator==(const PDFObject &other) const
|
||||
{
|
||||
if (m_type == other.m_type)
|
||||
|
@@ -29,6 +29,7 @@
|
||||
|
||||
namespace pdf
|
||||
{
|
||||
class PDFDictionary;
|
||||
|
||||
/// This class represents a content of the PDF object. It can be
|
||||
/// array of objects, dictionary, content stream data, or string data.
|
||||
@@ -98,6 +99,7 @@ public:
|
||||
|
||||
inline PDFInteger getInteger() const { return std::get<PDFInteger>(m_data); }
|
||||
QByteArray getString() const;
|
||||
const PDFDictionary* getDictionary() const;
|
||||
|
||||
bool operator==(const PDFObject& other) const;
|
||||
bool operator!=(const PDFObject& other) const { return !(*this == other); }
|
||||
|
@@ -460,6 +460,19 @@ PDFLexicalAnalyzer::Token PDFLexicalAnalyzer::fetch()
|
||||
return Token(TokenType::EndOfFile);
|
||||
}
|
||||
|
||||
/// Moves the read position to \p offset bytes from the start of the analyzed
/// range. An offset outside of the interval [0, size) is reported via error().
void PDFLexicalAnalyzer::seek(PDFInteger offset)
{
    const PDFInteger streamSize = std::distance(m_begin, m_end);
    const bool isOffsetValid = offset >= 0 && offset < streamSize;

    if (!isOffsetValid)
    {
        error(tr("Trying to seek stream position to %1 bytes from the start, byte offset is invalid.").arg(offset));
        return;
    }

    m_current = std::next(m_begin, offset);
}
|
||||
|
||||
void PDFLexicalAnalyzer::skipWhitespaceAndComments()
|
||||
{
|
||||
bool isComment = false;
|
||||
@@ -601,9 +614,19 @@ void PDFParsingContext::endParsingObject(PDFObjectReference reference)
|
||||
m_activeParsedObjectSet.erase(reference);
|
||||
}
|
||||
|
||||
PDFParser::PDFParser(const char* begin, const char* end, PDFParsingContext* context) :
|
||||
PDFParser::PDFParser(const QByteArray& data, PDFParsingContext* context, Features features) :
|
||||
m_context(context),
|
||||
m_lexicalAnalyzer(begin, end)
|
||||
m_lexicalAnalyzer(data.constData(), data.constData() + data.size()),
|
||||
m_features(features)
|
||||
{
|
||||
m_lookAhead1 = m_lexicalAnalyzer.fetch();
|
||||
m_lookAhead2 = m_lexicalAnalyzer.fetch();
|
||||
}
|
||||
|
||||
PDFParser::PDFParser(const char* begin, const char* end, PDFParsingContext* context, Features features) :
|
||||
m_context(context),
|
||||
m_lexicalAnalyzer(begin, end),
|
||||
m_features(features)
|
||||
{
|
||||
m_lookAhead1 = m_lexicalAnalyzer.fetch();
|
||||
m_lookAhead2 = m_lexicalAnalyzer.fetch();
|
||||
@@ -611,13 +634,6 @@ PDFParser::PDFParser(const char* begin, const char* end, PDFParsingContext* cont
|
||||
|
||||
PDFObject PDFParser::getObject()
|
||||
{
|
||||
/*
|
||||
*
|
||||
// Complex PDF objects
|
||||
,
|
||||
Dictionary,
|
||||
Stream,
|
||||
*/
|
||||
switch (m_lookAhead1.type)
|
||||
{
|
||||
case PDFLexicalAnalyzer::TokenType::Boolean:
|
||||
@@ -744,6 +760,11 @@ PDFObject PDFParser::getObject()
|
||||
if (m_lookAhead2.type == PDFLexicalAnalyzer::TokenType::Command &&
|
||||
m_lookAhead2.data.toByteArray() == PDF_STREAM_START_COMMAND)
|
||||
{
|
||||
if (!m_features.testFlag(AllowStreams))
|
||||
{
|
||||
error(tr("Streams are not allowed in this context."));
|
||||
}
|
||||
|
||||
// Read stream content. According to the PDF Reference 1.7, chapter 3.2.7, stream
|
||||
// content can be placed in the file. If this is the case, then try to load file
|
||||
// content in the memory. But even in this case, stream content should be skipped.
|
||||
@@ -852,6 +873,27 @@ void PDFParser::error(const QString& message) const
|
||||
throw new PDFParserException(message);
|
||||
}
|
||||
|
||||
/// Repositions the parser to \p offset bytes from the start of the parsed data
/// and reloads both lookahead tokens from the new position.
void PDFParser::seek(PDFInteger offset)
{
    // Move the underlying lexical analyzer first (it validates the offset)
    m_lexicalAnalyzer.seek(offset);

    // Both lookahead tokens were fetched at the old position, so they are stale
    // now. Refill them in order: the first fetched token is the current one.
    m_lookAhead1 = m_lexicalAnalyzer.fetch();
    m_lookAhead2 = m_lexicalAnalyzer.fetch();
}
|
||||
|
||||
bool PDFParser::fetchCommand(const char* command)
|
||||
{
|
||||
if (m_lookAhead1.type == PDFLexicalAnalyzer::TokenType::Command &&
|
||||
m_lookAhead1.data.toByteArray() == command)
|
||||
{
|
||||
shift();
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void PDFParser::shift()
|
||||
{
|
||||
m_lookAhead1 = std::move(m_lookAhead2);
|
||||
|
@@ -138,6 +138,10 @@ public:
|
||||
/// stream, then EndOfFile token is returned.
|
||||
Token fetch();
|
||||
|
||||
/// Seeks stream from the start. If stream cannot be seeked (position is invalid),
|
||||
/// then exception is thrown.
|
||||
void seek(PDFInteger offset);
|
||||
|
||||
/// Skips whitespace and comments
|
||||
void skipWhitespaceAndComments();
|
||||
|
||||
@@ -242,14 +246,23 @@ class PDFParser
|
||||
Q_DECLARE_TR_FUNCTIONS(pdf::PDFParser)
|
||||
|
||||
public:
|
||||
explicit PDFParser(const char* begin, const char* end, PDFParsingContext* context);
|
||||
enum Feature
|
||||
{
|
||||
None = 0x0000,
|
||||
AllowStreams = 0x0001,
|
||||
};
|
||||
|
||||
Q_DECLARE_FLAGS(Features, Feature)
|
||||
|
||||
explicit PDFParser(const QByteArray& data, PDFParsingContext* context, Features features);
|
||||
explicit PDFParser(const char* begin, const char* end, PDFParsingContext* context, Features features);
|
||||
|
||||
/// Fetches single object from the stream. Does not check
|
||||
/// cyclical references. If object cannot be fetched, then
|
||||
/// exception is thrown.
|
||||
PDFObject getObject();
|
||||
|
||||
/// Fetches signle object from the stream. Performs check for
|
||||
/// Fetches single object from the stream. Performs check for
|
||||
/// cyclical references. If object cannot be fetched, then
|
||||
/// exception is thrown.
|
||||
PDFObject getObject(PDFObjectReference reference);
|
||||
@@ -257,12 +270,27 @@ public:
|
||||
/// Throws an error exception
|
||||
void error(const QString& message) const;
|
||||
|
||||
/// Seeks stream from the start. If stream cannot be seeked (position is invalid),
|
||||
/// then exception is thrown.
|
||||
void seek(PDFInteger offset);
|
||||
|
||||
/// Returns currently scanned token
|
||||
const PDFLexicalAnalyzer::Token& lookahead() const { return m_lookAhead1; }
|
||||
|
||||
/// If current token is a command with same string, then eat this command
|
||||
/// and return true. Otherwise do nothing and return false.
|
||||
/// \param command Command to be fetched
|
||||
bool fetchCommand(const char* command);
|
||||
|
||||
private:
|
||||
void shift();
|
||||
|
||||
/// Parsing context (multiple parsers can share it)
|
||||
PDFParsingContext* m_context;
|
||||
|
||||
/// Enabled features
|
||||
Features m_features;
|
||||
|
||||
/// Lexical analyzer for scanning tokens
|
||||
PDFLexicalAnalyzer m_lexicalAnalyzer;
|
||||
|
||||
|
148
PdfForQtLib/sources/pdfxreftable.cpp
Normal file
148
PdfForQtLib/sources/pdfxreftable.cpp
Normal file
@@ -0,0 +1,148 @@
|
||||
// Copyright (C) 2018 Jakub Melka
|
||||
//
|
||||
// This file is part of PdfForQt.
|
||||
//
|
||||
// PdfForQt is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Lesser General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// PdfForQt is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Lesser General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Lesser General Public License
|
||||
// along with PDFForQt. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
#include "pdfxreftable.h"
|
||||
#include "pdfconstants.h"
|
||||
#include "pdfparser.h"
|
||||
|
||||
#include <stack>
|
||||
|
||||
namespace pdf
|
||||
{
|
||||
|
||||
/// Reads the chain of classic cross-reference tables from \p byteArray. Reading starts
/// at \p startTableOffset and continues with older tables referenced by the "Prev" key
/// of each trailer dictionary. Tables are visited from the newest to the oldest one, so
/// the first definition of an object number wins and older tables can only fill entries,
/// which were not defined yet. Trailer dictionary of the first visited table is stored.
/// \param context Parsing context passed to the parser
/// \param byteArray Input byte array (containing the PDF file)
/// \param startTableOffset Offset of the first (newest) reference table
/// \throws PDFParserException if the table is malformed, cyclic, or hybrid (XRefStm)
void PDFXRefTable::readXRefTable(PDFParsingContext* context, const QByteArray& byteArray, PDFInteger startTableOffset)
{
    PDFParser parser(byteArray, context, PDFParser::None);

    // Start from a clean state, so a repeated call cannot keep a stale trailer dictionary
    m_entries.clear();
    m_trailerDictionary = PDFObject();

    // Marks object numbers already defined by a visited (newer) table; parallel to m_entries
    std::vector<char> isEntryDefined;

    std::set<PDFInteger> processedOffsets;
    std::stack<PDFInteger> workSet;
    workSet.push(startTableOffset);

    while (!workSet.empty())
    {
        const PDFInteger currentOffset = workSet.top();
        workSet.pop();

        // Check, if we have cyclical references between tables (insertion fails
        // for an offset, which was already visited)
        if (!processedOffsets.insert(currentOffset).second)
        {
            throw PDFParserException(tr("Cyclic reference found in reference table."));
        }

        // Now, we are ready to scan the table. Seek to the start of the reference table.
        parser.seek(currentOffset);

        if (!parser.fetchCommand(PDF_XREF_HEADER))
        {
            throw PDFParserException(tr("Invalid format of reference table."));
        }

        while (!parser.fetchCommand(PDF_XREF_TRAILER))
        {
            // Now, first number is start offset, second number is count of table items
            PDFObject firstObject = parser.getObject();
            PDFObject countObject = parser.getObject();

            if (!firstObject.isInt() || !countObject.isInt())
            {
                throw PDFParserException(tr("Invalid format of reference table."));
            }

            const PDFInteger firstObjectNumber = firstObject.getInteger();
            const PDFInteger count = countObject.getInteger();

            // Negative values would index before the start of the table. Every record
            // occupies some bytes of the input, so count above the input size can never
            // be read successfully - reject it before any memory is allocated.
            if (firstObjectNumber < 0 || count < 0 || count > byteArray.size())
            {
                throw PDFParserException(tr("Invalid format of reference table."));
            }

            // Guard the addition below against signed overflow and impossible sizes
            if (firstObjectNumber > static_cast<PDFInteger>(m_entries.max_size()) - count)
            {
                throw PDFParserException(tr("Invalid format of reference table."));
            }

            const PDFInteger desiredSize = firstObjectNumber + count;

            if (static_cast<PDFInteger>(m_entries.size()) < desiredSize)
            {
                m_entries.resize(desiredSize);
                isEntryDefined.resize(desiredSize, 0);
            }

            // Now, read the records
            for (PDFInteger i = 0; i < count; ++i)
            {
                const PDFInteger objectNumber = firstObjectNumber + i;

                PDFObject offset = parser.getObject();
                PDFObject generation = parser.getObject();

                const bool occupied = parser.fetchCommand(PDF_XREF_OCCUPIED);
                if (!occupied && !parser.fetchCommand(PDF_XREF_FREE))
                {
                    throw PDFParserException(tr("Bad format of reference table entry."));
                }

                if (!offset.isInt() || !generation.isInt())
                {
                    throw PDFParserException(tr("Bad format of reference table entry."));
                }

                // Entry defined by a newer table (occupied or free) has precedence,
                // the record from the older table is parsed, but ignored.
                if (isEntryDefined[objectNumber])
                {
                    continue;
                }
                isEntryDefined[objectNumber] = 1;

                Entry entry;
                if (occupied)
                {
                    entry.reference = PDFObjectReference(objectNumber, generation.getInteger());
                    entry.offset = offset.getInteger();
                    entry.type = EntryType::Occupied;
                }

                m_entries[objectNumber] = std::move(entry);
            }
        }

        PDFObject trailerDictionary = parser.getObject();
        if (!trailerDictionary.isDictionary())
        {
            throw PDFParserException(tr("Trailer dictionary is invalid."));
        }

        // Now, we have scanned the table. If we didn't have a trailer dictionary yet, then
        // store it (the first visited table is the newest one).
        if (m_trailerDictionary.isNull())
        {
            m_trailerDictionary = trailerDictionary;
        }

        const PDFDictionary* dictionary = trailerDictionary.getDictionary();
        if (dictionary->hasKey(PDF_XREF_TRAILER_PREVIOUS))
        {
            PDFObject previousOffset = dictionary->get(PDF_XREF_TRAILER_PREVIOUS);

            if (!previousOffset.isInt())
            {
                throw PDFParserException(tr("Offset of previous reference table is invalid."));
            }

            // Older table is processed in a following iteration
            workSet.push(previousOffset.getInteger());
        }

        if (dictionary->hasKey(PDF_XREF_TRAILER_XREFSTM))
        {
            throw PDFParserException(tr("Hybrid reference tables not supported."));
        }
    }
}
|
||||
|
||||
} // namespace pdf
|
80
PdfForQtLib/sources/pdfxreftable.h
Normal file
80
PdfForQtLib/sources/pdfxreftable.h
Normal file
@@ -0,0 +1,80 @@
|
||||
// Copyright (C) 2018 Jakub Melka
|
||||
//
|
||||
// This file is part of PdfForQt.
|
||||
//
|
||||
// PdfForQt is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Lesser General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// PdfForQt is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Lesser General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Lesser General Public License
|
||||
// along with PDFForQt. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
#ifndef PDFXREFTABLE_H
|
||||
#define PDFXREFTABLE_H
|
||||
|
||||
#include "pdfglobal.h"
|
||||
#include "pdfobject.h"
|
||||
|
||||
#include <QtCore>
|
||||
|
||||
#include <vector>
|
||||
|
||||
namespace pdf
|
||||
{
|
||||
class PDFParsingContext;
|
||||
|
||||
/// Represents table of references in the PDF file. It contains
|
||||
/// scanned table in the PDF file, together with information, if entry
|
||||
/// is occupied, or it is free.
|
||||
class PDFXRefTable
|
||||
{
|
||||
Q_DECLARE_TR_FUNCTIONS(pdf::PDFXRefTable)
|
||||
|
||||
public:
|
||||
constexpr inline explicit PDFXRefTable() = default;
|
||||
|
||||
// Enforce default copy constructor and default move constructor
|
||||
constexpr inline PDFXRefTable(const PDFXRefTable&) = default;
|
||||
constexpr inline PDFXRefTable(PDFXRefTable&&) = default;
|
||||
|
||||
// Enforce default copy assignment operator and move assignment operator
|
||||
constexpr inline PDFXRefTable& operator=(const PDFXRefTable&) = default;
|
||||
constexpr inline PDFXRefTable& operator=(PDFXRefTable&&) = default;
|
||||
|
||||
enum class EntryType
|
||||
{
|
||||
Free, ///< Entry represents a free item (no object)
|
||||
Occupied ///< Entry represents a occupied item (object)
|
||||
};
|
||||
|
||||
struct Entry
|
||||
{
|
||||
PDFObjectReference reference;
|
||||
PDFInteger offset = -1;
|
||||
EntryType type = EntryType::Free;
|
||||
};
|
||||
|
||||
/// Tries to read reference table from the byte array. If error occurs, then exception
|
||||
/// is raised. This fuction also checks redundant entries.
|
||||
/// \param context Current parsing context
|
||||
/// \param byteArray Input byte array (containing the PDF file)
|
||||
/// \param startTableOffset Offset of first reference table
|
||||
void readXRefTable(PDFParsingContext* context, const QByteArray& byteArray, PDFInteger startTableOffset);
|
||||
|
||||
private:
|
||||
/// Reference table entries
|
||||
std::vector<Entry> m_entries;
|
||||
|
||||
/// Trailer dictionary
|
||||
PDFObject m_trailerDictionary;
|
||||
};
|
||||
|
||||
} // namespace pdf
|
||||
|
||||
#endif // PDFXREFTABLE_H
|
Reference in New Issue
Block a user