From e734946ae9d1095131545a7da049847246dcd3dc Mon Sep 17 00:00:00 2001 From: Jakub Melka Date: Sun, 26 Jul 2020 18:40:42 +0200 Subject: [PATCH] Catalog update --- PdfForQtLib/sources/pdfcatalog.cpp | 58 +++++++++++++++ PdfForQtLib/sources/pdfcatalog.h | 89 ++++++++++++++++++++++++ PdfForQtLib/sources/pdfstructuretree.cpp | 82 ++++++++++++++++++++-- PdfForQtLib/sources/pdfstructuretree.h | 34 ++++++++- 4 files changed, 256 insertions(+), 7 deletions(-) diff --git a/PdfForQtLib/sources/pdfcatalog.cpp b/PdfForQtLib/sources/pdfcatalog.cpp index dd406ac..9d4188a 100644 --- a/PdfForQtLib/sources/pdfcatalog.cpp +++ b/PdfForQtLib/sources/pdfcatalog.cpp @@ -207,6 +207,11 @@ PDFCatalog PDFCatalog::parse(const PDFObject& catalog, const PDFDocument* docume catalogObject.m_markInfoFlags.setFlag(MarkInfo_Suspects, loader.readBooleanFromDictionary(markInfoDictionary, "Suspects", false)); } + catalogObject.m_structureTreeRoot = catalogDictionary->get("StructTreeRoot"); + catalogObject.m_language = loader.readTextStringFromDictionary(catalogDictionary, "Lang", QString()); + catalogObject.m_webCaptureInfo = PDFWebCaptureInfo::parse(catalogDictionary->get("SpiderInfo"), &document->getStorage()); + catalogObject.m_outputIntents = loader.readObjectList(catalogDictionary->get("OutputIntents")); + return catalogObject; } @@ -766,4 +771,57 @@ PDFArticleThread PDFArticleThread::parse(const PDFObjectStorage* storage, const return result; } +PDFWebCaptureInfo PDFWebCaptureInfo::parse(const PDFObject& object, const PDFObjectStorage* storage) +{ + PDFWebCaptureInfo result; + + if (const PDFDictionary* dictionary = storage->getDictionaryFromObject(object)) + { + PDFDocumentDataLoaderDecorator loader(storage); + result.m_version = loader.readNameFromDictionary(dictionary, "V"); + result.m_commands = loader.readReferenceArrayFromDictionary(dictionary, "C"); + } + + return result; +} + +PDFOutputIntent PDFOutputIntent::parse(const PDFObjectStorage* storage, const PDFObject& object) +{ + 
PDFOutputIntent result; + + if (const PDFDictionary* dictionary = storage->getDictionaryFromObject(object)) + { + PDFDocumentDataLoaderDecorator loader(storage); + result.m_subtype = loader.readNameFromDictionary(dictionary, "S"); + result.m_outputCondition = loader.readTextStringFromDictionary(dictionary, "OutputCondition", QString()); + result.m_outputConditionIdentifier = loader.readTextStringFromDictionary(dictionary, "OutputConditionIdentifier", QString()); + result.m_registryName = loader.readTextStringFromDictionary(dictionary, "RegistryName", QString()); + result.m_info = loader.readTextStringFromDictionary(dictionary, "Info", QString()); + result.m_destOutputProfile = dictionary->get("DestOutputProfile"); + result.m_destOutputProfileRef = PDFOutputIntentICCProfileInfo::parse(dictionary->get("DestOutputProfileRef"), storage); + result.m_mixingHints = dictionary->get("MixingHints"); + result.m_spectralData = dictionary->get("SpectralData"); + } + + return result; +} + +PDFOutputIntentICCProfileInfo PDFOutputIntentICCProfileInfo::parse(const PDFObject& object, const PDFObjectStorage* storage) +{ + PDFOutputIntentICCProfileInfo result; + + if (const PDFDictionary* dictionary = storage->getDictionaryFromObject(object)) + { + PDFDocumentDataLoaderDecorator loader(storage); + result.m_checkSum = loader.readStringFromDictionary(dictionary, "CheckSum"); + result.m_colorants = loader.readNameArrayFromDictionary(dictionary, "ColorantTable"); + result.m_iccVersion = loader.readStringFromDictionary(dictionary, "ICCVersion"); + result.m_signature = loader.readStringFromDictionary(dictionary, "ProfileCS"); + result.m_profileName = loader.readTextStringFromDictionary(dictionary, "ProfileName", QString()); + result.m_urls = dictionary->get("URLs"); + } + + return result; +} + } // namespace pdf diff --git a/PdfForQtLib/sources/pdfcatalog.h b/PdfForQtLib/sources/pdfcatalog.h index 8d12e07..dba25c2 100644 --- a/PdfForQtLib/sources/pdfcatalog.h +++ 
b/PdfForQtLib/sources/pdfcatalog.h @@ -322,6 +322,87 @@ private: Extensions m_extensions; }; +/// Web capture info +class PDFFORQTLIBSHARED_EXPORT PDFWebCaptureInfo +{ +public: + explicit PDFWebCaptureInfo() = default; + + const QByteArray& getVersion() const { return m_version; } + const std::vector& getCommands() const { return m_commands; } + + /// Parses web capture info from catalog dictionary. If object cannot be parsed, or error occurs, + /// then empty object is returned, no exception is thrown. + /// \param object Spider info dictionary + /// \param storage Storage + static PDFWebCaptureInfo parse(const PDFObject& object, const PDFObjectStorage* storage); + +private: + QByteArray m_version; + std::vector m_commands; +}; + +class PDFFORQTLIBSHARED_EXPORT PDFOutputIntentICCProfileInfo +{ +public: + explicit PDFOutputIntentICCProfileInfo() = default; + + const QByteArray& getChecksum() const { return m_checkSum; } + const std::vector& getColorants() const { return m_colorants; } + const QByteArray& getIccVersion() const { return m_iccVersion; } + const QByteArray& getSignature() const { return m_signature; } + const QString& getProfileName() const { return m_profileName; } + const PDFObject& getUrls() const { return m_urls; } + + /// Parses icc profile info from object. If object cannot be parsed, or error occurs, + /// then empty object is returned, no exception is thrown. 
+ /// \param object ICC profile info dictionary (DestOutputProfileRef entry of output intent) + /// \param storage Storage + static PDFOutputIntentICCProfileInfo parse(const PDFObject& object, const PDFObjectStorage* storage); + +private: + QByteArray m_checkSum; + std::vector m_colorants; + QByteArray m_iccVersion; + QByteArray m_signature; + QString m_profileName; + PDFObject m_urls; +}; + +/// Output intent +class PDFFORQTLIBSHARED_EXPORT PDFOutputIntent +{ +public: + explicit PDFOutputIntent() = default; + + const QByteArray& getSubtype() const { return m_subtype; } + const QString& getOutputCondition() const { return m_outputCondition; } + const QString& getOutputConditionIdentifier() const { return m_outputConditionIdentifier; } + const QString& getRegistryName() const { return m_registryName; } + const QString& getInfo() const { return m_info; } + const PDFObject& getOutputProfile() const { return m_destOutputProfile; } + const PDFOutputIntentICCProfileInfo& getOutputProfileInfo() const { return m_destOutputProfileRef; } + const PDFObject& getMixingHints() const { return m_mixingHints; } + const PDFObject& getSpectralData() const { return m_spectralData; } + + /// Parses output intent from object. If object cannot be parsed, or error occurs, + /// then empty object is returned, no exception is thrown. 
+ /// \param object Output intent dictionary + /// \param storage Storage + static PDFOutputIntent parse(const PDFObjectStorage* storage, const PDFObject& object); + +private: + QByteArray m_subtype; + QString m_outputCondition; + QString m_outputConditionIdentifier; + QString m_registryName; + QString m_info; + PDFObject m_destOutputProfile; + PDFOutputIntentICCProfileInfo m_destOutputProfileRef; + PDFObject m_mixingHints; + PDFObject m_spectralData; +}; + class PDFFORQTLIBSHARED_EXPORT PDFCatalog { public: @@ -379,6 +460,10 @@ public: const std::vector& getArticleThreads() const { return m_threads; } const PDFAction* getDocumentAction(DocumentAction action) const { return m_documentActions.at(action).get(); } const PDFObject& getMetadata() const { return m_metadata; } + const PDFObject& getStructureTreeRoot() const { return m_structureTreeRoot; } + const QString& getLanguage() const { return m_language; } + const PDFWebCaptureInfo& getWebCaptureInfo() const { return m_webCaptureInfo; } + const std::vector& getOutputIntents() const { return m_outputIntents; } /// Is document marked to have structure tree conforming to tagged document convention? 
bool isLogicalStructureMarked() const { return m_markInfoFlags.testFlag(MarkInfo_Marked); } @@ -422,11 +507,15 @@ private: PageMode m_pageMode = PageMode::UseNone; QByteArray m_baseURI; PDFObject m_formObject; + PDFObject m_structureTreeRoot; PDFDeveloperExtensions m_extensions; PDFDocumentSecurityStore m_documentSecurityStore; std::vector m_threads; PDFObject m_metadata; MarkInfoFlags m_markInfoFlags = MarkInfo_None; + QString m_language; + PDFWebCaptureInfo m_webCaptureInfo; + std::vector m_outputIntents; // Maps from Names dictionary std::map m_destinations; diff --git a/PdfForQtLib/sources/pdfstructuretree.cpp b/PdfForQtLib/sources/pdfstructuretree.cpp index b77a4d0..d9bfaee 100644 --- a/PdfForQtLib/sources/pdfstructuretree.cpp +++ b/PdfForQtLib/sources/pdfstructuretree.cpp @@ -45,6 +45,12 @@ struct PDFStructureTreeAttributeDefinition /// \param name Attribute name static const PDFStructureTreeAttributeDefinition* getDefinition(const QByteArray& name); + /// Returns attribute definition for given attribute type. This function + /// always returns valid pointer. For unknown attribute, it returns + /// user attribute definition. + /// \param type Attribute type + static const PDFStructureTreeAttributeDefinition* getDefinition(PDFStructureTreeAttribute::Attribute type); + /// Returns owner from string. If owner is not valid, then invalid /// owner is returned. 
/// \param string String @@ -214,6 +220,20 @@ const PDFStructureTreeAttributeDefinition* PDFStructureTreeAttributeDefinition:: return &s_attributeDefinitions.front(); } +const PDFStructureTreeAttributeDefinition* PDFStructureTreeAttributeDefinition::getDefinition(PDFStructureTreeAttribute::Attribute type) +{ + for (const PDFStructureTreeAttributeDefinition& definition : s_attributeDefinitions) + { + if (type == definition.type) + { + return &definition; + } + } + + Q_ASSERT(s_attributeDefinitions.front().type == PDFStructureTreeAttribute::Attribute::User); + return &s_attributeDefinitions.front(); +} + PDFStructureTreeAttribute::Owner PDFStructureTreeAttributeDefinition::getOwnerFromString(const QByteArray& string) { for (const auto& item : s_ownerDefinitions) @@ -512,7 +532,7 @@ PDFStructureTree PDFStructureTree::parse(const PDFObjectStorage* storage, PDFObj return tree; } -PDFStructureItemPointer PDFStructureItem::parse(const PDFObjectStorage* storage, PDFObject object, PDFMarkedObjectsContext* context) +PDFStructureItemPointer PDFStructureItem::parse(const PDFObjectStorage* storage, PDFObject object, PDFMarkedObjectsContext* context, PDFStructureItem* parent) { if (const PDFDictionary* dictionary = storage->getDictionaryFromObject(object)) { @@ -521,15 +541,15 @@ PDFStructureItemPointer PDFStructureItem::parse(const PDFObjectStorage* storage, if (typeName == "MCR") { - return PDFStructureMarkedContentReference::parse(storage, object, context); + return PDFStructureMarkedContentReference::parseMarkedContentReference(storage, object, context, parent, parent->getTree()); } else if (typeName == "OBJR") { - return PDFStructureObjectReference::parse(storage, object, context); + return PDFStructureObjectReference::parseObjectReference(storage, object, context, parent, parent->getTree()); } else { - return PDFStructureElement::parse(storage, object, context); + return PDFStructureElement::parseElement(storage, object, context, parent, parent->getTree()); } } @@ -557,7 
+577,7 @@ void PDFStructureItem::parseKids(const PDFObjectStorage* storage, PDFStructureIt const PDFArray* kidsArray = kids.getArray(); for (const PDFObject& object : *kidsArray) { - PDFStructureItemPointer item = PDFStructureItem::parse(storage, object, context); + PDFStructureItemPointer item = PDFStructureItem::parse(storage, object, context, parentItem); if (item) { parentItem->m_children.emplace_back(qMove(item)); @@ -566,7 +586,7 @@ void PDFStructureItem::parseKids(const PDFObjectStorage* storage, PDFStructureIt } else if (!kids.isNull()) { - PDFStructureItemPointer item = PDFStructureItem::parse(storage, kids, context); + PDFStructureItemPointer item = PDFStructureItem::parse(storage, kids, context, parentItem); if (item) { parentItem->m_children.emplace_back(qMove(item)); @@ -595,6 +615,56 @@ PDFStructureTreeNamespace PDFStructureTreeNamespace::parse(const PDFObjectStorag return result; } +const PDFStructureTreeAttribute* PDFStructureElement::findAttribute(Attribute attribute, + AttributeOwner owner, + RevisionPolicy policy) const +{ + const PDFStructureTreeAttributeDefinition* definition = PDFStructureTreeAttributeDefinition::getDefinition(attribute); + + if (const PDFStructureTreeAttribute* result = findAttributeImpl(attribute, owner, policy, definition)) + { + return result; + } + + if (owner != AttributeOwner::Invalid) + { + return findAttributeImpl(attribute, AttributeOwner::Invalid, policy, definition); + } + + return nullptr; +} + +const PDFStructureTreeAttribute* PDFStructureElement::findAttributeImpl(Attribute attribute, + AttributeOwner owner, + RevisionPolicy policy, + const PDFStructureTreeAttributeDefinition* definition) const +{ + // We do not search for user properties + if (attribute == Attribute::User) + { + return nullptr; + } + + // Try to search for attribute in attribute list + for (const PDFStructureTreeAttribute& attributeObject : m_attributes) + { + if ((attributeObject.getType() == attribute) && + (attributeObject.getOwner() == 
owner || owner == AttributeOwner::Invalid) && + (attributeObject.getRevision() == m_revision || policy == RevisionPolicy::Ignore)) + { + return &attributeObject; + } + } + + // Check, if attribute is inheritable and then search for it in parent + if (definition->inheritable && m_parent && m_parent->asStructureElement()) + { + return m_parent->asStructureElement()->findAttributeImpl(attribute, owner, policy, definition); + } + + return nullptr; +} + PDFStructureItemPointer PDFStructureElement::parseElement(const PDFObjectStorage* storage, PDFObject object, PDFMarkedObjectsContext* context, diff --git a/PdfForQtLib/sources/pdfstructuretree.h b/PdfForQtLib/sources/pdfstructuretree.h index dd2e7c8..c89c012 100644 --- a/PdfForQtLib/sources/pdfstructuretree.h +++ b/PdfForQtLib/sources/pdfstructuretree.h @@ -272,7 +272,9 @@ public: virtual const PDFStructureObjectReference* asStructureObjectReference() const { return nullptr; } const PDFStructureItem* getParent() const { return m_parent; } + PDFStructureItem* getParent() { return m_parent; } const PDFStructureTree* getTree() const { return m_root; } + PDFStructureTree* getTree() { return m_root; } PDFObjectReference getSelfReference() const { return m_selfReference; } std::size_t getChildCount() const { return m_children.size(); } const PDFStructureItem* getChild(size_t i) const { return m_children.at(i).get(); } @@ -282,7 +284,8 @@ public: /// \param storage Storage /// \param object Structure tree item object /// \param context Parsing context - static PDFStructureItemPointer parse(const PDFObjectStorage* storage, PDFObject object, PDFMarkedObjectsContext* context); + /// \param parent Parent item + static PDFStructureItemPointer parse(const PDFObjectStorage* storage, PDFObject object, PDFMarkedObjectsContext* context, PDFStructureItem* parent); /// Get structure tree type from name /// \param name Name @@ -427,6 +430,23 @@ public: const PDFObjectReference& getNamespace() const { return m_namespace; } const QByteArray& 
getPhoneticAlphabet() const { return m_phoneticAlphabet; } + enum class RevisionPolicy + { + Ignore, + Match + }; + + using Attribute = PDFStructureTreeAttribute::Attribute; + using AttributeOwner = PDFStructureTreeAttribute::Owner; + + /// Finds attribute matching given owner and revision policy. If attribute with given + /// owner is not found, then any matching attribute is returned. If none is found, + /// then nullptr is returned. + /// \param attribute Attribute + /// \param owner Owner + /// \param policy Revision number policy (Match requires attribute revision equal to element revision) + const PDFStructureTreeAttribute* findAttribute(Attribute attribute, AttributeOwner owner, RevisionPolicy policy) const; + /// Parses structure element from the object. If error occurs, nullptr is returned. /// \param storage Storage /// \param object Structure element object @@ -440,6 +460,18 @@ public: PDFStructureTree* root); private: + /// Searches attributes of this element for attribute matching given owner and revision policy + /// (invalid owner matches any owner). If none is found and attribute is inheritable, then parent + /// structure elements are searched. User attributes are not searched, nullptr is returned. + /// \param attribute Attribute + /// \param owner Owner + /// \param policy Revision number policy (Match requires attribute revision equal to element revision) + /// \param definition Attribute definition, used to check if attribute is inheritable + const PDFStructureTreeAttribute* findAttributeImpl(Attribute attribute, + AttributeOwner owner, + RevisionPolicy policy, + const PDFStructureTreeAttributeDefinition* definition) const; + QByteArray m_typeName; Type m_standardType; QByteArray m_id;