Parsing X Reference table

2025-06-05 21:59:17 +02:00 · 2018-11-21 19:30:15 +01:00
parent 58ad59e407
commit 8c93c82228
19 changed files with 625 additions and 58 deletions
--- a/PdfForQtLib/sources/pdfconstants.h
+++ b/PdfForQtLib/sources/pdfconstants.h
@@ -42,6 +42,14 @@ static constexpr const char* PDF_STREAM_DICT_FILE_FILTER = "FFilter";
 static constexpr const char* PDF_STREAM_DICT_FDECODE_PARMS = "FDecodeParms";
 static constexpr const char* PDF_STREAM_DICT_DECODED_LENGTH = "DL";

+// Keywords and trailer dictionary keys used when reading the cross-reference (xref) table
+static constexpr const char* PDF_XREF_HEADER = "xref";   ///< Command which opens a reference table
+static constexpr const char* PDF_XREF_TRAILER = "trailer";   ///< Command which ends the table subsections and precedes the trailer dictionary
+static constexpr const char* PDF_XREF_TRAILER_PREVIOUS = "Prev";   ///< Trailer dictionary key holding the offset of the previous reference table
+static constexpr const char* PDF_XREF_TRAILER_XREFSTM = "XRefStm";   ///< Trailer dictionary key of a hybrid reference table (rejected by PDFXRefTable)
+static constexpr const char* PDF_XREF_FREE = "f";   ///< Command marking a table entry as free
+static constexpr const char* PDF_XREF_OCCUPIED = "n";   ///< Command marking a table entry as occupied
+
 }   // namespace pdf

 #endif // PDFCONSTANTS_H
--- a/PdfForQtLib/sources/pdfdocumentreader.cpp
+++ b/PdfForQtLib/sources/pdfdocumentreader.cpp
@@ -19,6 +19,7 @@
 #include "pdfdocumentreader.h"
 #include "pdfparser.h"
 #include "pdfconstants.h"
+#include "pdfxreftable.h"

 #include <QFile>

@@ -114,6 +115,15 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
            throw PDFParserException(tr("Start of object reference table not found."));
        }

+        Q_ASSERT(startXRefPosition + std::strlen(PDF_START_OF_XREF_MARK) < buffer.size());
+        PDFLexicalAnalyzer analyzer(buffer.constData() + startXRefPosition + std::strlen(PDF_START_OF_XREF_MARK), buffer.constData() + buffer.size());
+        const PDFLexicalAnalyzer::Token token = analyzer.fetch();
+        if (token.type != PDFLexicalAnalyzer::TokenType::Integer)
+        {
+            throw PDFParserException(tr("Start of object reference table not found."));
+        }
+        const PDFInteger firstXrefTableOffset = token.data.toLongLong();
+
        // HEADER CHECKING
        //  1) Check if header is present
        //  2) Scan header version
@@ -123,7 +133,7 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
        //  - %!PS-Adobe-y.y PDF-x.x
        // We will search for both of these formats.

-        std::regex headerRegExp("(%PDF-[[:digit:]]\\.[[:digit:]])|(%!PS-Adobe-[[:digit:]]\\.[[:digit:]] PDF-[[:digit:]]\\.[[:digit:]])");
+        std::regex headerRegExp(PDF_FILE_HEADER_REGEXP);
        std::cmatch headerMatch;

        auto itBegin = buffer.cbegin();
@@ -156,6 +166,9 @@ PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer)
            throw PDFParserException(tr("Version of the PDF file is not valid."));
        }

+        // Now, we are ready to scan xref table
+        PDFXRefTable xrefTable;
+        xrefTable.readXRefTable(nullptr, buffer, firstXrefTableOffset);

    }
    catch (PDFParserException parserException)
--- a/PdfForQtLib/sources/pdfdocumentreader.h
+++ b/PdfForQtLib/sources/pdfdocumentreader.h
@@ -31,13 +31,18 @@ namespace pdf
 /// This class is a reader of PDF document from various devices (file, io device,
 /// byte buffer). This class doesn't throw exceptions, to check errors, use
 /// appropriate functions.
-class PDFDocumentReader
+class PDFFORQTLIBSHARED_EXPORT PDFDocumentReader
 {
    Q_DECLARE_TR_FUNCTIONS(pdf::PDFDocumentReader)

 public:
    explicit PDFDocumentReader();

+    constexpr inline PDFDocumentReader(const PDFDocumentReader&) = delete;
+    constexpr inline PDFDocumentReader(PDFDocumentReader&&) = delete;
+    constexpr inline PDFDocumentReader& operator=(const PDFDocumentReader&) = delete;
+    constexpr inline PDFDocumentReader& operator=(PDFDocumentReader&&) = delete;
+
    /// Reads a PDF document from the specified file. If file doesn't exist,
    /// cannot be opened or contain invalid pdf, empty PDF file is returned.
    /// No exception is thrown.
@@ -56,6 +61,9 @@ public:
    /// Returns true, if document was successfully read from device
    bool isSuccessfull() const { return m_successfull; }

+    /// Returns the error message, if reading of the document was unsuccessful
+    const QString& getErrorMessage() const { return m_errorMessage; }
+
 private:
    static constexpr const int FIND_NOT_FOUND_RESULT = -1;

--- a/PdfForQtLib/sources/pdfobject.cpp
+++ b/PdfForQtLib/sources/pdfobject.cpp
@@ -30,6 +30,14 @@ QByteArray PDFObject::getString() const
    return string->getString();
 }

+/// Returns the content of this object as a dictionary. The content must
+/// really be a dictionary; this is verified only by a debug assertion.
+const PDFDictionary* PDFObject::getDictionary() const
+{
+    const auto* content = std::get<PDFObjectContentPointer>(m_data).get();
+    Q_ASSERT(dynamic_cast<const PDFDictionary*>(content));
+
+    return static_cast<const PDFDictionary*>(content);
+}
+
 bool PDFObject::operator==(const PDFObject &other) const
 {
    if (m_type == other.m_type)
--- a/PdfForQtLib/sources/pdfobject.h
+++ b/PdfForQtLib/sources/pdfobject.h
@@ -29,6 +29,7 @@

 namespace pdf
 {
+class PDFDictionary;

 /// This class represents a content of the PDF object. It can be
 /// array of objects, dictionary, content stream data, or string data.
@@ -98,6 +99,7 @@ public:

    inline PDFInteger getInteger() const { return std::get<PDFInteger>(m_data); }
    QByteArray getString() const;
+    const PDFDictionary* getDictionary() const;

    bool operator==(const PDFObject& other) const;
    bool operator!=(const PDFObject& other) const { return !(*this == other); }
--- a/PdfForQtLib/sources/pdfparser.cpp
+++ b/PdfForQtLib/sources/pdfparser.cpp
@@ -460,6 +460,19 @@ PDFLexicalAnalyzer::Token PDFLexicalAnalyzer::fetch()
    return Token(TokenType::EndOfFile);
 }

+void PDFLexicalAnalyzer::seek(PDFInteger offset)
+{
+    // Only positions inside [0, length) are accepted; the end position itself is rejected
+    const PDFInteger length = std::distance(m_begin, m_end);
+    const bool isOffsetValid = (offset >= 0) && (offset < length);
+
+    if (!isOffsetValid)
+    {
+        error(tr("Trying to seek stream position to %1 bytes from the start, byte offset is invalid.").arg(offset));
+    }
+    else
+    {
+        m_current = std::next(m_begin, offset);
+    }
+}
+
 void PDFLexicalAnalyzer::skipWhitespaceAndComments()
 {
    bool isComment = false;
@@ -601,9 +614,19 @@ void PDFParsingContext::endParsingObject(PDFObjectReference reference)
    m_activeParsedObjectSet.erase(reference);
 }

-PDFParser::PDFParser(const char* begin, const char* end, PDFParsingContext* context) :
+PDFParser::PDFParser(const QByteArray& data, PDFParsingContext* context, Features features) :   // Parses the whole byte array
    m_context(context),
-    m_lexicalAnalyzer(begin, end)
+    m_lexicalAnalyzer(data.constData(), data.constData() + data.size()),
+    m_features(features)   // NOTE(review): m_features is declared before m_lexicalAnalyzer in the class, so it is initialized first regardless of this order
+{
+    m_lookAhead1 = m_lexicalAnalyzer.fetch();   // Preload both lookahead tokens; getObject() dispatches on them
+    m_lookAhead2 = m_lexicalAnalyzer.fetch();
+}
+
+PDFParser::PDFParser(const char* begin, const char* end, PDFParsingContext* context, Features features) :
+    m_context(context),
+    m_lexicalAnalyzer(begin, end),
+    m_features(features)
 {
    m_lookAhead1 = m_lexicalAnalyzer.fetch();
    m_lookAhead2 = m_lexicalAnalyzer.fetch();
@@ -611,13 +634,6 @@ PDFParser::PDFParser(const char* begin, const char* end, PDFParsingContext* cont

 PDFObject PDFParser::getObject()
 {
-    /*
-     *
-        // Complex PDF objects
-        ,
-        Dictionary,
-        Stream,
-        */
    switch (m_lookAhead1.type)
    {
        case PDFLexicalAnalyzer::TokenType::Boolean:
@@ -744,6 +760,11 @@ PDFObject PDFParser::getObject()
            if (m_lookAhead2.type == PDFLexicalAnalyzer::TokenType::Command &&
                m_lookAhead2.data.toByteArray() == PDF_STREAM_START_COMMAND)
            {
+                if (!m_features.testFlag(AllowStreams))
+                {
+                    error(tr("Streams are not allowed in this context."));
+                }
+
                // Read stream content. According to the PDF Reference 1.7, chapter 3.2.7, stream
                // content can be placed in the file. If this is the case, then try to load file
                // content in the memory. But even in this case, stream content should be skipped.
@@ -852,6 +873,27 @@ void PDFParser::error(const QString& message) const
    throw new PDFParserException(message);
 }

+void PDFParser::seek(PDFInteger offset)
+{
+    m_lexicalAnalyzer.seek(offset);
+
+    // Both lookahead tokens belong to the old position, so they are
+    // refilled (in order) from the new position.
+    const auto nextToken = [this]() { return m_lexicalAnalyzer.fetch(); };
+    m_lookAhead1 = nextToken();
+    m_lookAhead2 = nextToken();
+}
+
+bool PDFParser::fetchCommand(const char* command)
+{
+    // The token text is compared only when the token really is a command
+    const bool isRequestedCommand = m_lookAhead1.type == PDFLexicalAnalyzer::TokenType::Command &&
+                                    m_lookAhead1.data.toByteArray() == command;
+    if (!isRequestedCommand)
+    {
+        return false;
+    }
+
+    // Consume the matched command
+    shift();
+    return true;
+}
+
 void PDFParser::shift()
 {
    m_lookAhead1 = std::move(m_lookAhead2);
--- a/PdfForQtLib/sources/pdfparser.h
+++ b/PdfForQtLib/sources/pdfparser.h
@@ -138,6 +138,10 @@ public:
    /// stream, then EndOfFile token is returned.
    Token fetch();

+    /// Seeks to the given byte offset, measured from the start of the stream.
+    /// If the offset is invalid, an exception is thrown.
+    void seek(PDFInteger offset);
+
    /// Skips whitespace and comments
    void skipWhitespaceAndComments();

@@ -242,14 +246,23 @@ class PDFParser
    Q_DECLARE_TR_FUNCTIONS(pdf::PDFParser)

 public:
-    explicit PDFParser(const char* begin, const char* end, PDFParsingContext* context);
+    enum Feature
+    {
+        None            = 0x0000,
+        AllowStreams    = 0x0001,
+    };
+
+    Q_DECLARE_FLAGS(Features, Feature)
+
+    explicit PDFParser(const QByteArray& data, PDFParsingContext* context, Features features);
+    explicit PDFParser(const char* begin, const char* end, PDFParsingContext* context, Features features);

    /// Fetches single object from the stream. Does not check
    /// cyclical references. If object cannot be fetched, then
    /// exception is thrown.
    PDFObject getObject();

-    /// Fetches signle object from the stream. Performs check for
+    /// Fetches single object from the stream. Performs check for
    /// cyclical references. If object cannot be fetched, then
    /// exception is thrown.
    PDFObject getObject(PDFObjectReference reference);
@@ -257,12 +270,27 @@ public:
    /// Throws an error exception
    void error(const QString& message) const;

+    /// Seeks to the given byte offset, measured from the start of the stream.
+    /// If the offset is invalid, an exception is thrown.
+    void seek(PDFInteger offset);
+
+    /// Returns currently scanned token
+    const PDFLexicalAnalyzer::Token& lookahead() const { return m_lookAhead1; }
+
+    /// If current token is a command with same string, then eat this command
+    /// and return true. Otherwise do nothing and return false.
+    /// \param command Command to be fetched
+    bool fetchCommand(const char* command);
+
 private:
    void shift();

    /// Parsing context (multiple parsers can share it)
    PDFParsingContext* m_context;

+    /// Enabled features
+    Features m_features;
+
    /// Lexical analyzer for scanning tokens
    PDFLexicalAnalyzer m_lexicalAnalyzer;

--- a/PdfForQtLib/sources/pdfxreftable.cpp
+++ b/PdfForQtLib/sources/pdfxreftable.cpp
@@ -0,0 +1,148 @@
+//    Copyright (C) 2018 Jakub Melka
+//
+//    This file is part of PdfForQt.
+//
+//    PdfForQt is free software: you can redistribute it and/or modify
+//    it under the terms of the GNU Lesser General Public License as published by
+//    the Free Software Foundation, either version 3 of the License, or
+//    (at your option) any later version.
+//
+//    PdfForQt is distributed in the hope that it will be useful,
+//    but WITHOUT ANY WARRANTY; without even the implied warranty of
+//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//    GNU Lesser General Public License for more details.
+//
+//    You should have received a copy of the GNU Lesser General Public License
+//    along with PDFForQt.  If not, see <https://www.gnu.org/licenses/>.
+
+#include "pdfxreftable.h"
+#include "pdfconstants.h"
+#include "pdfparser.h"
+
+#include <stack>
+
+namespace pdf
+{
+
+/// Reads the reference table located at \p startTableOffset and then all older
+/// tables linked by the "Prev" key of the trailer dictionaries. Tables are
+/// processed from the newest to the oldest one, so an entry which was already
+/// described by a newer table is never replaced by an entry of an older table.
+/// The trailer dictionary of the first (newest) table is stored.
+/// \param context Current parsing context (passed to the parser)
+/// \param byteArray Input byte array (containing the PDF file)
+/// \param startTableOffset Offset of first reference table
+/// \throws PDFParserException if a table is malformed, tables reference each
+///         other cyclically, or a hybrid table ("XRefStm" key) is found
+void PDFXRefTable::readXRefTable(PDFParsingContext* context, const QByteArray& byteArray, PDFInteger startTableOffset)
+{
+    PDFParser parser(byteArray, context, PDFParser::None);
+
+    // Start from a clean state, so that repeated calls do not mix data of two reads
+    m_entries.clear();
+    m_trailerDictionary = PDFObject();
+
+    // Parallel to m_entries: non-zero, if the entry was already described by some
+    // (newer) table. Free entries are marked too, so deleted objects stay deleted.
+    std::vector<char> describedEntries;
+
+    std::set<PDFInteger> processedOffsets;
+    std::stack<PDFInteger> workSet;
+    workSet.push(startTableOffset);
+
+    while (!workSet.empty())
+    {
+        const PDFInteger currentOffset = workSet.top();
+        workSet.pop();
+
+        // Check, if we have cyclical references between tables
+        if (!processedOffsets.insert(currentOffset).second)
+        {
+            throw PDFParserException(tr("Cyclic reference found in reference table."));
+        }
+
+        // Now, we are ready to scan the table. Seek to the start of the reference table.
+        parser.seek(currentOffset);
+
+        if (!parser.fetchCommand(PDF_XREF_HEADER))
+        {
+            throw PDFParserException(tr("Invalid format of reference table."));
+        }
+
+        while (!parser.fetchCommand(PDF_XREF_TRAILER))
+        {
+            // Now, first number is start offset, second number is count of table items
+            PDFObject firstObject = parser.getObject();
+            PDFObject countObject = parser.getObject();
+
+            if (!firstObject.isInt() || !countObject.isInt())
+            {
+                throw PDFParserException(tr("Invalid format of reference table."));
+            }
+
+            const PDFInteger firstObjectNumber = firstObject.getInteger();
+            const PDFInteger count = countObject.getInteger();
+
+            // The numbers come from the file and are used for indexing, so negative
+            // values and ranges which cannot be stored (or overflow) must be rejected.
+            const PDFInteger maxEntryCount = static_cast<PDFInteger>(m_entries.max_size());
+            if (firstObjectNumber < 0 || count < 0 ||
+                firstObjectNumber > maxEntryCount || count > maxEntryCount - firstObjectNumber)
+            {
+                throw PDFParserException(tr("Invalid format of reference table."));
+            }
+
+            const PDFInteger desiredSize = firstObjectNumber + count;
+            if (static_cast<PDFInteger>(m_entries.size()) < desiredSize)
+            {
+                m_entries.resize(desiredSize);
+                describedEntries.resize(desiredSize, 0);
+            }
+
+            // Now, read the records
+            for (PDFInteger i = 0; i < count; ++i)
+            {
+                const PDFInteger objectNumber = firstObjectNumber + i;
+
+                PDFObject offset = parser.getObject();
+                PDFObject generation = parser.getObject();
+
+                const bool occupied = parser.fetchCommand(PDF_XREF_OCCUPIED);
+                if (!occupied && !parser.fetchCommand(PDF_XREF_FREE))
+                {
+                    throw PDFParserException(tr("Bad format of reference table entry."));
+                }
+
+                if (!offset.isInt() || !generation.isInt())
+                {
+                    throw PDFParserException(tr("Bad format of reference table entry."));
+                }
+
+                // Entry was already described by a newer table, which has precedence
+                if (describedEntries[objectNumber])
+                {
+                    continue;
+                }
+
+                Entry entry;
+                if (occupied)
+                {
+                    entry.reference = PDFObjectReference(objectNumber, generation.getInteger());
+                    entry.offset = offset.getInteger();
+                    entry.type = EntryType::Occupied;
+                }
+
+                m_entries[objectNumber] = std::move(entry);
+                describedEntries[objectNumber] = 1;
+            }
+        }
+
+        PDFObject trailerDictionary = parser.getObject();
+        if (!trailerDictionary.isDictionary())
+        {
+            throw PDFParserException(tr("Trailer dictionary is invalid."));
+        }
+
+        // Now, we have scanned the table. Only the trailer dictionary of the first
+        // processed (newest) table is stored.
+        if (m_trailerDictionary.isNull())
+        {
+            m_trailerDictionary = trailerDictionary;
+        }
+
+        const PDFDictionary* dictionary = trailerDictionary.getDictionary();
+        if (dictionary->hasKey(PDF_XREF_TRAILER_PREVIOUS))
+        {
+            PDFObject previousOffset = dictionary->get(PDF_XREF_TRAILER_PREVIOUS);
+
+            if (!previousOffset.isInt())
+            {
+                throw PDFParserException(tr("Offset of previous reference table is invalid."));
+            }
+
+            workSet.push(previousOffset.getInteger());
+        }
+
+        if (dictionary->hasKey(PDF_XREF_TRAILER_XREFSTM))
+        {
+            throw PDFParserException(tr("Hybrid reference tables not supported."));
+        }
+    }
+}
+
+}   // namespace pdf
--- a/PdfForQtLib/sources/pdfxreftable.h
+++ b/PdfForQtLib/sources/pdfxreftable.h
@@ -0,0 +1,80 @@
+//    Copyright (C) 2018 Jakub Melka
+//
+//    This file is part of PdfForQt.
+//
+//    PdfForQt is free software: you can redistribute it and/or modify
+//    it under the terms of the GNU Lesser General Public License as published by
+//    the Free Software Foundation, either version 3 of the License, or
+//    (at your option) any later version.
+//
+//    PdfForQt is distributed in the hope that it will be useful,
+//    but WITHOUT ANY WARRANTY; without even the implied warranty of
+//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//    GNU Lesser General Public License for more details.
+//
+//    You should have received a copy of the GNU Lesser General Public License
+//    along with PDFForQt.  If not, see <https://www.gnu.org/licenses/>.
+
+#ifndef PDFXREFTABLE_H
+#define PDFXREFTABLE_H
+
+#include "pdfglobal.h"
+#include "pdfobject.h"
+
+#include <QtCore>
+
+#include <vector>
+
+namespace pdf
+{
+class PDFParsingContext;
+
+/// Represents table of references in the PDF file. It contains
+/// scanned table in the PDF file, together with information, if entry
+/// is occupied, or it is free.
+class PDFXRefTable
+{
+    Q_DECLARE_TR_FUNCTIONS(pdf::PDFXRefTable)
+
+public:
+    // The special member functions are defaulted without constexpr: the class
+    // owns a std::vector, whose special members are not constexpr in C++17, so
+    // a constexpr defaulted declaration would be ill-formed.
+    explicit PDFXRefTable() = default;
+
+    // Enforce default copy constructor and default move constructor
+    PDFXRefTable(const PDFXRefTable&) = default;
+    PDFXRefTable(PDFXRefTable&&) = default;
+
+    // Enforce default copy assignment operator and move assignment operator
+    PDFXRefTable& operator=(const PDFXRefTable&) = default;
+    PDFXRefTable& operator=(PDFXRefTable&&) = default;
+
+    enum class EntryType
+    {
+        Free,       ///< Entry represents a free item (no object)
+        Occupied    ///< Entry represents an occupied item (object)
+    };
+
+    /// Single item of the reference table. A default constructed entry is free
+    /// and has an invalid offset (-1).
+    struct Entry
+    {
+        PDFObjectReference reference;       ///< Reference to the object (set for occupied entries)
+        PDFInteger offset = -1;             ///< Offset taken from the table record (set for occupied entries)
+        EntryType type = EntryType::Free;   ///< Free or occupied
+    };
+
+    /// Tries to read reference table from the byte array. If error occurs, then exception
+    /// is raised. This function also checks redundant entries.
+    /// \param context Current parsing context
+    /// \param byteArray Input byte array (containing the PDF file)
+    /// \param startTableOffset Offset of first reference table
+    void readXRefTable(PDFParsingContext* context, const QByteArray& byteArray, PDFInteger startTableOffset);
+
+private:
+    /// Reference table entries (indexed by the object number)
+    std::vector<Entry> m_entries;
+
+    /// Trailer dictionary
+    PDFObject m_trailerDictionary;
+};
+
+}   // namespace pdf
+
+#endif // PDFXREFTABLE_H