Parsing X Reference table

This commit is contained in:
Jakub Melka
2018-11-21 19:30:15 +01:00
parent 58ad59e407
commit 8c93c82228
19 changed files with 625 additions and 58 deletions

View File

@@ -42,6 +42,14 @@ static constexpr const char* PDF_STREAM_DICT_FILE_FILTER = "FFilter";
static constexpr const char* PDF_STREAM_DICT_FDECODE_PARMS = "FDecodeParms";
static constexpr const char* PDF_STREAM_DICT_DECODED_LENGTH = "DL";
// xref table constants
// Command which opens a cross-reference table
static constexpr const char* PDF_XREF_HEADER = "xref";
// Command which closes the table entries; the trailer dictionary follows it
static constexpr const char* PDF_XREF_TRAILER = "trailer";
// Trailer dictionary key holding the offset of the previous cross-reference table
static constexpr const char* PDF_XREF_TRAILER_PREVIOUS = "Prev";
// Trailer dictionary key which marks a hybrid reference table
static constexpr const char* PDF_XREF_TRAILER_XREFSTM = "XRefStm";
// Command which marks a table entry as free (no object)
static constexpr const char* PDF_XREF_FREE = "f";
// Command which marks a table entry as occupied (object is present)
static constexpr const char* PDF_XREF_OCCUPIED = "n";
} // namespace pdf
#endif // PDFCONSTANTS_H

View File

@@ -19,6 +19,7 @@
#include "pdfdocumentreader.h"
#include "pdfparser.h"
#include "pdfconstants.h"
#include "pdfxreftable.h"
#include <QFile>
@@ -114,6 +115,15 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
throw PDFParserException(tr("Start of object reference table not found."));
}
Q_ASSERT(startXRefPosition + std::strlen(PDF_START_OF_XREF_MARK) < buffer.size());
PDFLexicalAnalyzer analyzer(buffer.constData() + startXRefPosition + std::strlen(PDF_START_OF_XREF_MARK), buffer.constData() + buffer.size());
const PDFLexicalAnalyzer::Token token = analyzer.fetch();
if (token.type != PDFLexicalAnalyzer::TokenType::Integer)
{
throw PDFParserException(tr("Start of object reference table not found."));
}
const PDFInteger firstXrefTableOffset = token.data.toLongLong();
// HEADER CHECKING
// 1) Check if header is present
// 2) Scan header version
@@ -123,7 +133,7 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
// - %!PS-Adobe-y.y PDF-x.x
// We will search for both of these formats.
std::regex headerRegExp("(%PDF-[[:digit:]]\\.[[:digit:]])|(%!PS-Adobe-[[:digit:]]\\.[[:digit:]] PDF-[[:digit:]]\\.[[:digit:]])");
std::regex headerRegExp(PDF_FILE_HEADER_REGEXP);
std::cmatch headerMatch;
auto itBegin = buffer.cbegin();
@@ -156,6 +166,9 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
throw PDFParserException(tr("Version of the PDF file is not valid."));
}
// Now, we are ready to scan xref table
PDFXRefTable xrefTable;
xrefTable.readXRefTable(nullptr, buffer, firstXrefTableOffset);
}
catch (PDFParserException parserException)

View File

@@ -31,13 +31,18 @@ namespace pdf
/// This class is a reader of PDF document from various devices (file, io device,
/// byte buffer). This class doesn't throw exceptions, to check errors, use
/// appropriate functions.
class PDFDocumentReader
class PDFFORQTLIBSHARED_EXPORT PDFDocumentReader
{
Q_DECLARE_TR_FUNCTIONS(pdf::PDFDocumentReader)
public:
explicit PDFDocumentReader();
constexpr inline PDFDocumentReader(const PDFDocumentReader&) = delete;
constexpr inline PDFDocumentReader(PDFDocumentReader&&) = delete;
constexpr inline PDFDocumentReader& operator=(const PDFDocumentReader&) = delete;
constexpr inline PDFDocumentReader& operator=(PDFDocumentReader&&) = delete;
/// Reads a PDF document from the specified file. If file doesn't exist,
/// cannot be opened or contain invalid pdf, empty PDF file is returned.
/// No exception is thrown.
@@ -56,6 +61,9 @@ public:
/// Returns true, if document was successfully read from device
bool isSuccessfull() const { return m_successfull; }
/// Returns error message, if document reading was unsuccessful
const QString& getErrorMessage() const { return m_errorMessage; }
private:
static constexpr const int FIND_NOT_FOUND_RESULT = -1;

View File

@@ -30,6 +30,14 @@ QByteArray PDFObject::getString() const
return string->getString();
}
/// Returns the content of this object viewed as a dictionary. The object must
/// hold a content pointer which points to a dictionary; this is verified only
/// by an assertion, no run-time type check is done in release builds.
const PDFDictionary* PDFObject::getDictionary() const
{
    // Borrow the raw pointer, ownership stays in the stored content pointer
    const auto* content = std::get<PDFObjectContentPointer>(m_data).get();

    Q_ASSERT(dynamic_cast<const PDFDictionary*>(content));
    return static_cast<const PDFDictionary*>(content);
}
bool PDFObject::operator==(const PDFObject &other) const
{
if (m_type == other.m_type)

View File

@@ -29,6 +29,7 @@
namespace pdf
{
class PDFDictionary;
/// This class represents a content of the PDF object. It can be
/// array of objects, dictionary, content stream data, or string data.
@@ -98,6 +99,7 @@ public:
inline PDFInteger getInteger() const { return std::get<PDFInteger>(m_data); }
QByteArray getString() const;
const PDFDictionary* getDictionary() const;
bool operator==(const PDFObject& other) const;
bool operator!=(const PDFObject& other) const { return !(*this == other); }

View File

@@ -460,6 +460,19 @@ PDFLexicalAnalyzer::Token PDFLexicalAnalyzer::fetch()
return Token(TokenType::EndOfFile);
}
/// Moves the reading position to \p offset bytes from the start of the analyzed
/// data. Offsets outside of the range [0, size of data) are reported by error().
void PDFLexicalAnalyzer::seek(PDFInteger offset)
{
    const PDFInteger dataSize = std::distance(m_begin, m_end);
    const bool isOffsetValid = (offset >= 0) && (offset < dataSize);

    if (!isOffsetValid)
    {
        error(tr("Trying to seek stream position to %1 bytes from the start, byte offset is invalid.").arg(offset));
        return;
    }

    m_current = std::next(m_begin, offset);
}
void PDFLexicalAnalyzer::skipWhitespaceAndComments()
{
bool isComment = false;
@@ -601,9 +614,19 @@ void PDFParsingContext::endParsingObject(PDFObjectReference reference)
m_activeParsedObjectSet.erase(reference);
}
PDFParser::PDFParser(const char* begin, const char* end, PDFParsingContext* context) :
PDFParser::PDFParser(const QByteArray& data, PDFParsingContext* context, Features features) :
m_context(context),
m_lexicalAnalyzer(begin, end)
m_lexicalAnalyzer(data.constData(), data.constData() + data.size()),
m_features(features)
{
m_lookAhead1 = m_lexicalAnalyzer.fetch();
m_lookAhead2 = m_lexicalAnalyzer.fetch();
}
PDFParser::PDFParser(const char* begin, const char* end, PDFParsingContext* context, Features features) :
m_context(context),
m_lexicalAnalyzer(begin, end),
m_features(features)
{
m_lookAhead1 = m_lexicalAnalyzer.fetch();
m_lookAhead2 = m_lexicalAnalyzer.fetch();
@@ -611,13 +634,6 @@ PDFParser::PDFParser(const char* begin, const char* end, PDFParsingContext* cont
PDFObject PDFParser::getObject()
{
/*
*
// Complex PDF objects
,
Dictionary,
Stream,
*/
switch (m_lookAhead1.type)
{
case PDFLexicalAnalyzer::TokenType::Boolean:
@@ -744,6 +760,11 @@ PDFObject PDFParser::getObject()
if (m_lookAhead2.type == PDFLexicalAnalyzer::TokenType::Command &&
m_lookAhead2.data.toByteArray() == PDF_STREAM_START_COMMAND)
{
if (!m_features.testFlag(AllowStreams))
{
error(tr("Streams are not allowed in this context."));
}
// Read stream content. According to the PDF Reference 1.7, chapter 3.2.7, stream
// content can be placed in the file. If this is the case, then try to load file
// content in the memory. But even in this case, stream content should be skipped.
@@ -852,6 +873,27 @@ void PDFParser::error(const QString& message) const
throw new PDFParserException(message);
}
/// Moves the underlying lexical analyzer to \p offset (counted from the start
/// of the data) and refills both lookahead tokens from the new position.
/// An invalid offset is handled by the lexical analyzer's seek function.
void PDFParser::seek(PDFInteger offset)
{
m_lexicalAnalyzer.seek(offset);
// We must read lookahead symbols, because we invalidated them.
// The order matters: the first fetched token is the current one,
// the second fetched token is the one following it.
m_lookAhead1 = m_lexicalAnalyzer.fetch();
m_lookAhead2 = m_lexicalAnalyzer.fetch();
}
/// Consumes the current token, if it is a command equal to \p command.
/// \param command Command to be fetched
/// \returns true, if the command was consumed; false, if the current
///          token is something else (the token is then left untouched)
bool PDFParser::fetchCommand(const char* command)
{
    // The token data are compared only when the token really is a command
    const bool isRequestedCommand = m_lookAhead1.type == PDFLexicalAnalyzer::TokenType::Command &&
                                    m_lookAhead1.data.toByteArray() == command;

    if (isRequestedCommand)
    {
        shift();
    }

    return isRequestedCommand;
}
void PDFParser::shift()
{
m_lookAhead1 = std::move(m_lookAhead2);

View File

@@ -138,6 +138,10 @@ public:
/// stream, then EndOfFile token is returned.
Token fetch();
/// Seeks stream from the start. If stream cannot be seeked (position is invalid),
/// then exception is thrown.
void seek(PDFInteger offset);
/// Skips whitespace and comments
void skipWhitespaceAndComments();
@@ -242,14 +246,23 @@ class PDFParser
Q_DECLARE_TR_FUNCTIONS(pdf::PDFParser)
public:
explicit PDFParser(const char* begin, const char* end, PDFParsingContext* context);
/// Optional capabilities of the parser. Single values are combined
/// into the flag set Features, which is passed to the constructors.
enum Feature
{
/// No optional capability is enabled
None = 0x0000,
/// Stream objects may be parsed. If this flag is not set and a stream
/// is encountered, then the parser reports an error.
AllowStreams = 0x0001,
};
Q_DECLARE_FLAGS(Features, Feature)
explicit PDFParser(const QByteArray& data, PDFParsingContext* context, Features features);
explicit PDFParser(const char* begin, const char* end, PDFParsingContext* context, Features features);
/// Fetches single object from the stream. Does not check
/// cyclical references. If object cannot be fetched, then
/// exception is thrown.
PDFObject getObject();
/// Fetches signle object from the stream. Performs check for
/// Fetches single object from the stream. Performs check for
/// cyclical references. If object cannot be fetched, then
/// exception is thrown.
PDFObject getObject(PDFObjectReference reference);
@@ -257,12 +270,27 @@ public:
/// Throws an error exception
void error(const QString& message) const;
/// Seeks stream from the start. If stream cannot be seeked (position is invalid),
/// then exception is thrown.
void seek(PDFInteger offset);
/// Returns currently scanned token
const PDFLexicalAnalyzer::Token& lookahead() const { return m_lookAhead1; }
/// If current token is a command with same string, then eat this command
/// and return true. Otherwise do nothing and return false.
/// \param command Command to be fetched
bool fetchCommand(const char* command);
private:
void shift();
/// Parsing context (multiple parsers can share it)
PDFParsingContext* m_context;
/// Enabled features
Features m_features;
/// Lexical analyzer for scanning tokens
PDFLexicalAnalyzer m_lexicalAnalyzer;

View File

@@ -0,0 +1,148 @@
// Copyright (C) 2018 Jakub Melka
//
// This file is part of PdfForQt.
//
// PdfForQt is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// PdfForQt is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with PDFForQt. If not, see <https://www.gnu.org/licenses/>.
#include "pdfxreftable.h"
#include "pdfconstants.h"
#include "pdfparser.h"
#include <cstddef>
#include <limits>
#include <new>
#include <set>
#include <stack>
#include <utility>
#include <vector>
namespace pdf
{
/// Reads the cross-reference table which starts at \p startTableOffset, and all
/// older tables linked from it by the trailer dictionary key Prev. Previously
/// stored entries and trailer dictionary are discarded first. The trailer
/// dictionary of the first table read is the one which is stored.
///
/// Tables are visited from the newest one to the oldest one, so an entry which
/// was already defined by a newer table is never replaced by an older table.
///
/// \param context Parsing context passed to the parser
/// \param byteArray Input byte array (containing the PDF file)
/// \param startTableOffset Offset of the first (newest) reference table
/// \throws PDFParserException if the table is malformed, if the tables form
///         a cycle, or if a hybrid reference table (key XRefStm) is found
void PDFXRefTable::readXRefTable(PDFParsingContext* context, const QByteArray& byteArray, PDFInteger startTableOffset)
{
    PDFParser parser(byteArray, context, PDFParser::None);

    // Start from a clean state, so the same object can be used repeatedly
    m_entries.clear();
    m_trailerDictionary = PDFObject();

    // Flags parallel to m_entries; nonzero means, that the entry was already
    // defined by some table and must not be touched by an older table.
    std::vector<char> isEntryDefined;

    std::set<PDFInteger> processedOffsets;
    std::stack<PDFInteger> workSet;
    workSet.push(startTableOffset);

    while (!workSet.empty())
    {
        const PDFInteger currentOffset = workSet.top();
        workSet.pop();

        // Check, if we have cyclical references between tables
        if (!processedOffsets.insert(currentOffset).second)
        {
            throw PDFParserException(tr("Cyclic reference found in reference table."));
        }

        // Now, we are ready to scan the table. Seek to the start of the reference table.
        parser.seek(currentOffset);
        if (!parser.fetchCommand(PDF_XREF_HEADER))
        {
            throw PDFParserException(tr("Invalid format of reference table."));
        }

        while (!parser.fetchCommand(PDF_XREF_TRAILER))
        {
            // Now, first number is start offset, second number is count of table items
            PDFObject firstObject = parser.getObject();
            PDFObject countObject = parser.getObject();

            if (!firstObject.isInt() || !countObject.isInt())
            {
                throw PDFParserException(tr("Invalid format of reference table."));
            }

            const PDFInteger firstObjectNumber = firstObject.getInteger();
            const PDFInteger count = countObject.getInteger();

            // Both numbers come from the file, so they cannot be trusted. Negative
            // values would lead to access out of bounds. Each record occupies more
            // than one byte, so count can never exceed the size of the file. The last
            // condition guards the addition below against an overflow.
            if (firstObjectNumber < 0 || count < 0 ||
                count > static_cast<PDFInteger>(byteArray.size()) ||
                count > std::numeric_limits<PDFInteger>::max() - firstObjectNumber)
            {
                throw PDFParserException(tr("Invalid format of reference table."));
            }

            const PDFInteger desiredSize = firstObjectNumber + count;
            if (static_cast<unsigned long long>(desiredSize) > static_cast<unsigned long long>(m_entries.max_size()))
            {
                throw PDFParserException(tr("Invalid format of reference table."));
            }

            if (static_cast<PDFInteger>(m_entries.size()) < desiredSize)
            {
                try
                {
                    m_entries.resize(static_cast<std::size_t>(desiredSize));
                    isEntryDefined.resize(static_cast<std::size_t>(desiredSize), 0);
                }
                catch (const std::bad_alloc&)
                {
                    // Absurdly large object number; report it as a malformed file
                    throw PDFParserException(tr("Invalid format of reference table."));
                }
            }

            // Now, read the records
            for (PDFInteger i = 0; i < count; ++i)
            {
                const PDFInteger objectNumber = firstObjectNumber + i;

                PDFObject offset = parser.getObject();
                PDFObject generation = parser.getObject();

                const bool occupied = parser.fetchCommand(PDF_XREF_OCCUPIED);
                if (!occupied && !parser.fetchCommand(PDF_XREF_FREE))
                {
                    throw PDFParserException(tr("Bad format of reference table entry."));
                }

                if (!offset.isInt() || !generation.isInt())
                {
                    throw PDFParserException(tr("Bad format of reference table entry."));
                }

                // The record is always parsed (to keep the parser position correct),
                // but it is stored only if no newer table has defined this entry.
                const std::size_t entryIndex = static_cast<std::size_t>(objectNumber);
                if (isEntryDefined[entryIndex])
                {
                    continue;
                }
                isEntryDefined[entryIndex] = 1;

                Entry entry;
                if (occupied)
                {
                    entry.reference = PDFObjectReference(objectNumber, generation.getInteger());
                    entry.offset = offset.getInteger();
                    entry.type = EntryType::Occupied;
                }

                m_entries[entryIndex] = std::move(entry);
            }
        }

        PDFObject trailerDictionary = parser.getObject();
        if (!trailerDictionary.isDictionary())
        {
            throw PDFParserException(tr("Trailer dictionary is invalid."));
        }

        // Now, we have scanned the table. If we didn't have a trailer dictionary yet, then
        // try to load it. We must also check, that trailer dictionary is OK.
        if (m_trailerDictionary.isNull())
        {
            m_trailerDictionary = trailerDictionary;
        }

        const PDFDictionary* dictionary = trailerDictionary.getDictionary();
        if (dictionary->hasKey(PDF_XREF_TRAILER_PREVIOUS))
        {
            PDFObject previousOffset = dictionary->get(PDF_XREF_TRAILER_PREVIOUS);

            if (!previousOffset.isInt())
            {
                throw PDFParserException(tr("Offset of previous reference table is invalid."));
            }

            // Older table is scanned in the next iteration
            workSet.push(previousOffset.getInteger());
        }

        if (dictionary->hasKey(PDF_XREF_TRAILER_XREFSTM))
        {
            throw PDFParserException(tr("Hybrid reference tables not supported."));
        }
    }
}
} // namespace pdf

View File

@@ -0,0 +1,80 @@
// Copyright (C) 2018 Jakub Melka
//
// This file is part of PdfForQt.
//
// PdfForQt is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// PdfForQt is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with PDFForQt. If not, see <https://www.gnu.org/licenses/>.
#ifndef PDFXREFTABLE_H
#define PDFXREFTABLE_H
#include "pdfglobal.h"
#include "pdfobject.h"
#include <QtCore>
#include <vector>
namespace pdf
{
class PDFParsingContext;
/// Represents table of references in the PDF file. It contains
/// scanned table in the PDF file, together with information, if entry
/// is occupied, or it is free.
class PDFXRefTable
{
    Q_DECLARE_TR_FUNCTIONS(pdf::PDFXRefTable)

public:
    // The special member functions are defaulted without constexpr. The class
    // holds a std::vector, which is not a literal type in C++17, so a defaulted
    // function declared constexpr is rejected by conforming compilers.
    explicit PDFXRefTable() = default;

    // Enforce default copy constructor and default move constructor
    PDFXRefTable(const PDFXRefTable&) = default;
    PDFXRefTable(PDFXRefTable&&) = default;

    // Enforce default copy assignment operator and move assignment operator
    PDFXRefTable& operator=(const PDFXRefTable&) = default;
    PDFXRefTable& operator=(PDFXRefTable&&) = default;

    enum class EntryType
    {
        Free,       ///< Entry represents a free item (no object)
        Occupied    ///< Entry represents a occupied item (object)
    };

    /// Single record of the reference table. Default constructed
    /// entry is a free entry with invalid offset.
    struct Entry
    {
        /// Reference (object number and generation) of the object, set for occupied entries
        PDFObjectReference reference;

        /// First number of the table record, set for occupied entries, otherwise -1
        PDFInteger offset = -1;

        /// Tells, if the entry is free, or occupied
        EntryType type = EntryType::Free;
    };

    /// Tries to read reference table from the byte array. If error occurs, then exception
    /// is raised. Previous reference tables (trailer dictionary key Prev) are read too;
    /// if they form a cycle, then exception is raised. Hybrid reference tables (trailer
    /// dictionary key XRefStm) are not supported and are reported as an error.
    /// \param context Current parsing context
    /// \param byteArray Input byte array (containing the PDF file)
    /// \param startTableOffset Offset of first reference table
    void readXRefTable(PDFParsingContext* context, const QByteArray& byteArray, PDFInteger startTableOffset);

private:
    /// Reference table entries
    std::vector<Entry> m_entries;

    /// Trailer dictionary
    PDFObject m_trailerDictionary;
};
} // namespace pdf
#endif // PDFXREFTABLE_H