Parsing X Reference table

2025-06-05 21:59:17 +02:00 · 2018-11-21 19:30:15 +01:00
parent 58ad59e407
commit 8c93c82228
19 changed files with 625 additions and 58 deletions
--- a/PdfForQtLib/sources/pdfxreftable.cpp
+++ b/PdfForQtLib/sources/pdfxreftable.cpp
@@ -0,0 +1,148 @@
+//    Copyright (C) 2018 Jakub Melka
+//
+//    This file is part of PdfForQt.
+//
+//    PdfForQt is free software: you can redistribute it and/or modify
+//    it under the terms of the GNU Lesser General Public License as published by
+//    the Free Software Foundation, either version 3 of the License, or
+//    (at your option) any later version.
+//
+//    PdfForQt is distributed in the hope that it will be useful,
+//    but WITHOUT ANY WARRANTY; without even the implied warranty of
+//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//    GNU Lesser General Public License for more details.
+//
+//    You should have received a copy of the GNU Lesser General Public License
+//    along with PDFForQt.  If not, see <https://www.gnu.org/licenses/>.
+
+#include "pdfxreftable.h"
+#include "pdfconstants.h"
+#include "pdfparser.h"
+
+#include <stack>
+
+namespace pdf
+{
+
+// Reads the cross-reference table located at startTableOffset in byteArray, and then
+// every further table reachable through the PDF_XREF_TRAILER_PREVIOUS key of the trailer
+// dictionaries. Existing content of m_entries is discarded first.
+//
+// Tables are visited starting with the one at startTableOffset and then following the
+// chain of previous tables. When the same object number is described by several tables,
+// the record from the table visited first is kept, so that older tables cannot override
+// records of the table the caller asked for.
+//
+// The trailer dictionary of the first visited table is stored into m_trailerDictionary,
+// but only if m_trailerDictionary is null at that moment.
+// NOTE(review): m_trailerDictionary is not reset here (unlike m_entries), so a trailer
+// left over from an earlier call would be kept - confirm callers use a fresh object.
+//
+// context          - parsing context passed to the parser
+// byteArray        - buffer, in which the reference tables are stored
+// startTableOffset - offset of the first reference table to be read
+//
+// Throws PDFParserException, if a table is malformed, if tables reference each other
+// cyclically, or if the trailer has key PDF_XREF_TRAILER_XREFSTM (hybrid tables are
+// not supported).
+void PDFXRefTable::readXRefTable(PDFParsingContext* context, const QByteArray& byteArray, PDFInteger startTableOffset)
+{
+    PDFParser parser(byteArray, context, PDFParser::None);
+
+    m_entries.clear();
+
+    // Object numbers, for which some already visited table provided a record (free or occupied).
+    std::set<PDFInteger> definedObjects;
+
+    // Offsets of tables we have already visited, used to detect cycles.
+    std::set<PDFInteger> processedOffsets;
+    std::stack<PDFInteger> workSet;
+    workSet.push(startTableOffset);
+
+    while (!workSet.empty())
+    {
+        const PDFInteger currentOffset = workSet.top();
+        workSet.pop();
+
+        // insert() reports, whether the offset is new; a repeated offset means a cycle
+        if (!processedOffsets.insert(currentOffset).second)
+        {
+            throw PDFParserException(tr("Cyclic reference found in reference table."));
+        }
+
+        // Now, we are ready to scan the table. Seek to the start of the reference table.
+        parser.seek(currentOffset);
+
+        if (!parser.fetchCommand(PDF_XREF_HEADER))
+        {
+            throw PDFParserException(tr("Invalid format of reference table."));
+        }
+
+        // Read subsections, until the trailer keyword is found
+        while (!parser.fetchCommand(PDF_XREF_TRAILER))
+        {
+            // First number is the first object number of the subsection, second number is count of records
+            PDFObject firstObject = parser.getObject();
+            PDFObject countObject = parser.getObject();
+
+            if (!firstObject.isInt() || !countObject.isInt())
+            {
+                throw PDFParserException(tr("Invalid format of reference table."));
+            }
+
+            const PDFInteger firstObjectNumber = firstObject.getInteger();
+            const PDFInteger count = countObject.getInteger();
+
+            // Both values come from the file. Negative numbers would lead to indexing
+            // m_entries out of bounds below, so reject them as malformed input.
+            if (firstObjectNumber < 0 || count < 0)
+            {
+                throw PDFParserException(tr("Invalid format of reference table."));
+            }
+
+            // Make sure, that m_entries can be indexed by every object number of this subsection
+            const PDFInteger desiredSize = firstObjectNumber + count;
+            if (static_cast<PDFInteger>(m_entries.size()) < desiredSize)
+            {
+                m_entries.resize(desiredSize);
+            }
+
+            // Now, read the records
+            for (PDFInteger i = 0; i < count; ++i)
+            {
+                const PDFInteger objectNumber = firstObjectNumber + i;
+
+                PDFObject offset = parser.getObject();
+                PDFObject generation = parser.getObject();
+
+                const bool occupied = parser.fetchCommand(PDF_XREF_OCCUPIED);
+                if (!occupied && !parser.fetchCommand(PDF_XREF_FREE))
+                {
+                    throw PDFParserException(tr("Bad format of reference table entry."));
+                }
+
+                if (!offset.isInt() || !generation.isInt())
+                {
+                    throw PDFParserException(tr("Bad format of reference table entry."));
+                }
+
+                // Record of a table visited earlier has priority. This also holds for free
+                // records, so a later visited table cannot bring a freed object back.
+                if (!definedObjects.insert(objectNumber).second)
+                {
+                    continue;
+                }
+
+                // Free record is stored as default constructed entry
+                Entry entry;
+                if (occupied)
+                {
+                    entry.reference = PDFObjectReference(objectNumber, generation.getInteger());
+                    entry.offset = offset.getInteger();
+                    entry.type = EntryType::Occupied;
+                }
+
+                m_entries[objectNumber] = std::move(entry);
+            }
+        }
+
+        PDFObject trailerDictionary = parser.getObject();
+        if (!trailerDictionary.isDictionary())
+        {
+            throw PDFParserException(tr("Trailer dictionary is invalid."));
+        }
+
+        // Now, we have scanned the table. If we didn't have a trailer dictionary yet, then
+        // use this one. Trailers of tables visited later are used only to follow the chain.
+        if (m_trailerDictionary.isNull())
+        {
+            m_trailerDictionary = trailerDictionary;
+        }
+
+        const PDFDictionary* dictionary = trailerDictionary.getDictionary();
+        if (dictionary->hasKey(PDF_XREF_TRAILER_PREVIOUS))
+        {
+            PDFObject previousOffset = dictionary->get(PDF_XREF_TRAILER_PREVIOUS);
+
+            if (!previousOffset.isInt())
+            {
+                throw PDFParserException(tr("Offset of previous reference table is invalid."));
+            }
+
+            // Schedule the previous table for reading
+            workSet.push(previousOffset.getInteger());
+        }
+
+        if (dictionary->hasKey(PDF_XREF_TRAILER_XREFSTM))
+        {
+            throw PDFParserException(tr("Hybrid reference tables not supported."));
+        }
+    }
+}
+
+}   // namespace pdf