Object classifier

2025-06-05 21:59:17 +02:00 · 2021-06-11 19:01:18 +02:00
parent 2745c7828c
commit eb5f904842
9 changed files with 428 additions and 40 deletions
--- a/Pdf4QtLib/sources/pdfencoding.cpp
+++ b/Pdf4QtLib/sources/pdfencoding.cpp
@@ -2413,6 +2413,28 @@ QString PDFEncoding::convertSmartFromByteStringToUnicode(const QByteArray& strea
    return QString::fromLatin1(stream.toHex()).toUpper();
 }

+QString PDFEncoding::convertSmartFromByteStringToRepresentableQString(const QByteArray& stream)
+{
+    if (stream.startsWith("D:")) // Possibly a date string, try to parse it
+    {
+        QDateTime dateTime = convertToDateTime(stream);
+        if (dateTime.isValid())
+        {
+            return dateTime.toString(Qt::TextDate);
+        }
+    }
+    // Not a valid date, try the heuristic conversion to unicode text
+    bool isBinary = false;
+    QString text = convertSmartFromByteStringToUnicode(stream, &isBinary);
+
+    if (!isBinary)
+    {
+        return text;
+    }
+    // Binary data: percent-encoded output is pure ASCII, convert it explicitly (no implicit cast)
+    return QString::fromLatin1(stream.toPercentEncoding(" ", QByteArray(), '%'));
+}
+
 QString PDFEncoding::getEncodingCharacters(Encoding encoding)
 {
    QString string;
--- a/Pdf4QtLib/sources/pdfencoding.h
+++ b/Pdf4QtLib/sources/pdfencoding.h
@@ -117,10 +117,16 @@ public:
    /// Function checks if stream can be converted to unicode by heuristic
    /// way, it is not always reliable.
    /// \param stream Stream
-    /// \param isBinary If specified, it is set to true if conversion failed
+    /// \param[out] isBinary If specified, it is set to true if conversion failed
    /// \returns Unicode string or string converted to hexadecimal representation
    static QString convertSmartFromByteStringToUnicode(const QByteArray& stream, bool* isBinary);

+    /// Tries to convert stream to representable string. Valid date strings (prefix "D:")
+    /// are formatted as text; if the data are binary, percent-encoding is used.
+    /// \param stream Stream
+    /// \returns Unicode string or string converted to percent-encoded representation
+    static QString convertSmartFromByteStringToRepresentableQString(const QByteArray& stream);
+
    /// Returns all characters of the given encoding
    /// \param encoding Encoding
    /// \returns All characters reprezentable by encoding.
--- a/Pdf4QtLib/sources/pdfobjectutils.cpp
+++ b/Pdf4QtLib/sources/pdfobjectutils.cpp
@@ -15,7 +15,6 @@
 //    You should have received a copy of the GNU Lesser General Public License
 //    along with Pdf4Qt. If not, see <https://www.gnu.org/licenses/>.

-
 #include "pdfobjectutils.h"
 #include "pdfvisitor.h"

@@ -210,6 +209,16 @@ std::set<PDFObjectReference> PDFObjectUtils::getReferences(const std::vector<PDF
    return references;
 }

+std::set<PDFObjectReference> PDFObjectUtils::getDirectReferences(const PDFObject& object)
+{
+    // The visitor stores every reference it finds in the object into the result set
+    std::set<PDFObjectReference> result;
+    PDFCollectReferencesVisitor visitor(result);
+
+    object.accept(&visitor);
+    return result;
+}
+
 PDFObject PDFObjectUtils::replaceReferences(const PDFObject& object, const std::map<PDFObjectReference, PDFObjectReference>& referenceMapping)
 {
    PDFReplaceReferencesVisitor replaceReferencesVisitor(referenceMapping);
@@ -217,4 +226,162 @@ PDFObject PDFObjectUtils::replaceReferences(const PDFObject& object, const std::
    return replaceReferencesVisitor.getObject();
 }

+void PDFObjectClassifier::classify(const PDFDocument* document)
+{
+    // Clear old classification, if it exists
+    m_classification.clear();
+    m_allTypesUsed = None;
+
+    if (!document)
+    {
+        return;
+    }
+
+    PDFDocumentDataLoaderDecorator loader(document);
+    const PDFObjectStorage& storage = document->getStorage();
+    const PDFObjectStorage::PDFObjects& objects = storage.getObjects();
+
+    m_classification.resize(objects.size(), Classification()); // One entry per object, indexed by object number
+    for (size_t i = 0; i < objects.size(); ++i)
+    {
+        PDFObjectReference reference(i, objects[i].generation);
+        m_classification[i].reference = reference;
+    }
+
+    // First, iterate through pages of the document
+    const PDFCatalog* catalog = document->getCatalog();
+    const size_t pageCount = catalog->getPageCount();
+    for (size_t i = 0; i < pageCount; ++i)
+    {
+        const PDFPage* page = catalog->getPage(i);
+
+        if (!page)
+        {
+            continue;
+        }
+
+        // Handle page itself
+        if (hasObject(page->getPageReference()))
+        {
+            mark(page->getPageReference(), Page);
+        }
+
+        // Handle annotations
+        for (const PDFObjectReference& reference : page->getAnnotations())
+        {
+            if (hasObject(reference))
+            {
+                mark(reference, Annotation);
+            }
+        }
+
+        // Handle contents (only a directly referenced, existing object is marked)
+        PDFObject pageObject = document->getObjectByReference(page->getPageReference());
+        Q_ASSERT(pageObject.isDictionary());
+
+        const PDFDictionary* dictionary = pageObject.getDictionary();
+        const PDFObject& contentsObject = dictionary->get("Contents");
+        if (contentsObject.isReference() && hasObject(contentsObject.getReference()))
+        {
+            mark(contentsObject.getReference(), ContentStream);
+        }
+
+        // Handle resources
+        if (const PDFDictionary* resourcesDictionary = document->getDictionaryFromObject(dictionary->get("Resources")))
+        {
+            markDictionary(document, resourcesDictionary->get("ExtGState"), GraphicState);
+            markDictionary(document, resourcesDictionary->get("ColorSpace"), ColorSpace);
+            markDictionary(document, resourcesDictionary->get("Pattern"), Pattern);
+            markDictionary(document, resourcesDictionary->get("Shading"), Shading);
+            markDictionary(document, resourcesDictionary->get("Font"), Font);
+
+            if (const PDFDictionary* xobjectDictionary = document->getDictionaryFromObject(resourcesDictionary->get("XObject")))
+            {
+                const size_t count = xobjectDictionary->getCount();
+                for (size_t xobjectIndex = 0; xobjectIndex < count; ++xobjectIndex)
+                {
+                    const PDFObject& item = xobjectDictionary->getValue(xobjectIndex);
+                    if (item.isReference() && hasObject(item.getReference()))
+                    {
+                        if (const PDFDictionary* xobjectItemDictionary = document->getDictionaryFromObject(item))
+                        {
+                            QByteArray subtype = loader.readNameFromDictionary(xobjectItemDictionary, "Subtype");
+
+                            if (subtype == "Image")
+                            {
+                                mark(item.getReference(), Image);
+                            }
+                            else if (subtype == "Form")
+                            {
+                                mark(item.getReference(), Form);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    for (Classification& classification : m_classification) // Second, detect actions by their Type entry
+    {
+        if (const PDFDictionary* dictionary = document->getDictionaryFromObject(document->getObjectByReference(classification.reference)))
+        {
+            QByteArray typeName = loader.readNameFromDictionary(dictionary, "Type");
+            if (typeName == "Action")
+            {
+                classification.types.setFlag(Action);
+            }
+        }
+    }
+
+    for (const Classification& classification : m_classification) // Finally, accumulate all types in use
+    {
+        m_allTypesUsed |= classification.types;
+    }
+}
+
+bool PDFObjectClassifier::hasObject(PDFObjectReference reference) const
+{
+    // Reference must be valid, within the table, and equal to the reference stored at its index
+    const bool isInRange = reference.isValid() && reference.objectNumber < PDFInteger(m_classification.size());
+    return isInRange && m_classification[reference.objectNumber].reference == reference;
+}
+
+std::vector<PDFObjectReference> PDFObjectClassifier::getObjectsByType(Type type) const
+{
+    // References are collected in the order of the classification table
+    std::vector<PDFObjectReference> references;
+    for (const Classification& item : m_classification)
+    {
+        if (!item.types.testFlag(type))
+        {
+            continue;
+        }
+        references.push_back(item.reference);
+    }
+    return references;
+}
+
+void PDFObjectClassifier::mark(PDFObjectReference reference, Type type)
+{
+    Q_ASSERT(hasObject(reference)); // Caller is expected to check the reference first
+    m_classification[reference.objectNumber].types |= type;
+}
+
+void PDFObjectClassifier::markDictionary(const PDFDocument* document, PDFObject object, Type type)
+{
+    const PDFDictionary* dictionary = document->getDictionaryFromObject(object);
+    const size_t entryCount = dictionary ? dictionary->getCount() : 0; // Nothing to do without a dictionary
+
+    for (size_t index = 0; index < entryCount; ++index)
+    {
+        const PDFObject& entry = dictionary->getValue(index);
+        if (!entry.isReference() || !hasObject(entry.getReference()))
+        {
+            continue;
+        }
+        mark(entry.getReference(), type);
+    }
+}
+
 }   // namespace pdf
--- a/Pdf4QtLib/sources/pdfobjectutils.h
+++ b/Pdf4QtLib/sources/pdfobjectutils.h
@@ -20,23 +20,30 @@

 #include "pdfobject.h"

+#include <QtCore>
+
 #include <set>
+#include <vector>

 namespace pdf
 {
 class PDFObjectStorage;
+class PDFDocument;

 /// Utilities for manipulation with objects
 class PDFObjectUtils
 {
 public:
-    /// Returns list of references referenced by \p objects. So, all references, which are present
+    /// Returns a list of references referenced by \p objects. So, all references, which are present
    /// in objects, appear in the result set, including objects, which are referenced by referenced
    /// objects (so, transitive closure above reference graph is returned).
    /// \param objects Objects
    /// \param storage Storage
    static std::set<PDFObjectReference> getReferences(const std::vector<PDFObject>& objects, const PDFObjectStorage& storage);

+    /// Returns a set of references directly referenced from object. References themselves are not followed.
+    static std::set<PDFObjectReference> getDirectReferences(const PDFObject& object);
+
    static PDFObject replaceReferences(const PDFObject& object, const std::map<PDFObjectReference, PDFObjectReference>& referenceMapping);

 private:
@@ -97,6 +104,67 @@ private:
    bool m_locked;
 };

+/// Classifies objects according to their type. Some heuristic is used
+/// when object type is missing or document is not well-formed.
+class Pdf4QtLIBSHARED_EXPORT PDFObjectClassifier
+{
+public:
+    /// Constructs an empty classifier; call classify() to fill it
+    inline PDFObjectClassifier() = default;
+
+    /// Performs object classification on a document. Old classification
+    /// is cleared first; if document is null, the classification stays empty.
+    /// \param document Document
+    void classify(const PDFDocument* document);
+
+    enum Type : uint32_t
+    {
+        None            = 0x00000000,   ///< No type assigned
+        Page            = 0x00000001,   ///< Page object
+        ContentStream   = 0x00000002,   ///< Object referenced by page's Contents entry
+        GraphicState    = 0x00000004,   ///< Referenced entry of ExtGState resources
+        ColorSpace      = 0x00000008,   ///< Referenced entry of ColorSpace resources
+        Pattern         = 0x00000010,   ///< Referenced entry of Pattern resources
+        Shading         = 0x00000020,   ///< Referenced entry of Shading resources
+        Image           = 0x00000040,   ///< XObject resource with Subtype Image
+        Form            = 0x00000080,   ///< XObject resource with Subtype Form
+        Font            = 0x00000100,   ///< Referenced entry of Font resources
+        Action          = 0x00000200,   ///< Dictionary with Type set to Action
+        Annotation      = 0x00000400    ///< Annotation listed by a page
+    };
+
+    Q_DECLARE_FLAGS(Types, Type)
+
+    /// Returns true, if object with given reference exists
+    /// and was classified.
+    /// \param reference Reference
+    bool hasObject(PDFObjectReference reference) const;
+
+    /// Returns true, if any object with given type is present in a document
+    /// \param type Object type
+    bool hasType(Type type) const { return m_allTypesUsed.testFlag(type); }
+
+    /// Returns a list of objects with a given type
+    /// \param type Type
+    std::vector<PDFObjectReference> getObjectsByType(Type type) const;
+
+private:
+    struct Classification
+    {
+        PDFObjectReference reference;   ///< Reference of the classified object
+        Types types = None;             ///< Types assigned to the object
+    };
+
+    /// Marks object with a given type; reference is asserted to pass hasObject()
+    void mark(PDFObjectReference reference, Type type);
+
+    /// Marks objects referenced from dictionary entries with a given type
+    void markDictionary(const PDFDocument* document, PDFObject object, Type type);
+
+    std::vector<Classification> m_classification;   ///< Classification table indexed by object number
+    Types m_allTypesUsed;                           ///< Union of types of all classified objects
+};
+
 } // namespace pdf

 #endif // PDFOBJECTUTILS_H