diff --git a/PdfForQtLib/sources/pdfstructuretree.cpp b/PdfForQtLib/sources/pdfstructuretree.cpp index faa51ec..f18139e 100644 --- a/PdfForQtLib/sources/pdfstructuretree.cpp +++ b/PdfForQtLib/sources/pdfstructuretree.cpp @@ -43,9 +43,12 @@ struct PDFStructureTreeAttributeDefinition /// always returns valid pointer. For uknown attribute, it returns /// user attribute definition. /// \param name Attribute name - const PDFStructureTreeAttributeDefinition* getDefinition(const QByteArray& name); + static const PDFStructureTreeAttributeDefinition* getDefinition(const QByteArray& name); - PDFStructureTreeAttribute::Owner getOwnerFromString(const QByteArray& string); + /// Returns owner from string. If owner is not valid, then + /// Owner::Invalid is returned. + /// \param string Owner name + static PDFStructureTreeAttribute::Owner getOwnerFromString(const QByteArray& string); PDFStructureTreeAttribute::Attribute type = PDFStructureTreeAttribute::Attribute::User; const char* name = nullptr; @@ -136,6 +139,67 @@ static constexpr std::array s_structureTreeItemTypes[] = { + std::make_pair(PDFStructureItem::Document, "Document"), + std::make_pair(PDFStructureItem::DocumentFragment, "DocumentFragment"), + std::make_pair(PDFStructureItem::Part, "Part"), + std::make_pair(PDFStructureItem::Div, "Div"), + std::make_pair(PDFStructureItem::Aside, "Aside"), + std::make_pair(PDFStructureItem::P, "P"), + std::make_pair(PDFStructureItem::H1, "H1"), + std::make_pair(PDFStructureItem::H2, "H2"), + std::make_pair(PDFStructureItem::H3, "H3"), + std::make_pair(PDFStructureItem::H4, "H4"), + std::make_pair(PDFStructureItem::H5, "H5"), + std::make_pair(PDFStructureItem::H6, "H6"), + std::make_pair(PDFStructureItem::H7, "H7"), + std::make_pair(PDFStructureItem::H, "H"), + std::make_pair(PDFStructureItem::Title, "Title"), + std::make_pair(PDFStructureItem::FENote, "FENote"), + std::make_pair(PDFStructureItem::Sub, "Sub"), + std::make_pair(PDFStructureItem::Lbl, "Lbl"), + 
std::make_pair(PDFStructureItem::Span, "Span"), + std::make_pair(PDFStructureItem::Em, "Em"), + std::make_pair(PDFStructureItem::Strong, "Strong"), + std::make_pair(PDFStructureItem::Link, "Link"), + std::make_pair(PDFStructureItem::Annot, "Annot"), + std::make_pair(PDFStructureItem::Form, "Form"), + std::make_pair(PDFStructureItem::Ruby, "Ruby"), + std::make_pair(PDFStructureItem::RB, "RB"), + std::make_pair(PDFStructureItem::RT, "RT"), + std::make_pair(PDFStructureItem::RP, "RP"), + std::make_pair(PDFStructureItem::Warichu, "Warichu"), + std::make_pair(PDFStructureItem::WR, "WR"), + std::make_pair(PDFStructureItem::WP, "WP"), + std::make_pair(PDFStructureItem::L, "L"), + std::make_pair(PDFStructureItem::LI, "LI"), + std::make_pair(PDFStructureItem::LBody, "LBody"), + std::make_pair(PDFStructureItem::Table, "Table"), + std::make_pair(PDFStructureItem::TR, "TR"), + std::make_pair(PDFStructureItem::TH, "TH"), + std::make_pair(PDFStructureItem::TD, "TD"), + std::make_pair(PDFStructureItem::THead, "THead"), + std::make_pair(PDFStructureItem::TBody, "TBody"), + std::make_pair(PDFStructureItem::TFoot, "TFoot"), + std::make_pair(PDFStructureItem::Caption, "Caption"), + std::make_pair(PDFStructureItem::Figure, "Figure"), + std::make_pair(PDFStructureItem::Formula, "Formula"), + std::make_pair(PDFStructureItem::Artifact, "Artifact"), + std::make_pair(PDFStructureItem::Sect, "Sect"), + std::make_pair(PDFStructureItem::Art, "Art"), + std::make_pair(PDFStructureItem::BlockQuote, "BlockQuote"), + std::make_pair(PDFStructureItem::TOC, "TOC"), + std::make_pair(PDFStructureItem::TOCI, "TOCI"), + std::make_pair(PDFStructureItem::Index, "Index"), + std::make_pair(PDFStructureItem::NonStruct, "NonStruct"), + std::make_pair(PDFStructureItem::Private, "Private"), + std::make_pair(PDFStructureItem::Quote, "Quote"), + std::make_pair(PDFStructureItem::Note, "Note"), + std::make_pair(PDFStructureItem::Reference, "Reference"), + std::make_pair(PDFStructureItem::BibEntry, "BibEntry"), + 
std::make_pair(PDFStructureItem::Code, "Code") +}; + const PDFStructureTreeAttributeDefinition* PDFStructureTreeAttributeDefinition::getDefinition(const QByteArray& name) { for (const PDFStructureTreeAttributeDefinition& definition : s_attributeDefinitions) @@ -160,7 +224,7 @@ PDFStructureTreeAttribute::Owner PDFStructureTreeAttributeDefinition::getOwnerFr } } - return PDFStructureTreeAttribute::Owner::User; + return PDFStructureTreeAttribute::Owner::Invalid; } PDFStructureTreeAttribute::PDFStructureTreeAttribute() : @@ -173,6 +237,20 @@ PDFStructureTreeAttribute::PDFStructureTreeAttribute() : } +PDFStructureTreeAttribute::PDFStructureTreeAttribute(const PDFStructureTreeAttributeDefinition* definition, + PDFStructureTreeAttribute::Owner owner, + PDFInteger revision, + PDFObjectReference namespaceReference, + PDFObject value) : + m_definition(definition), + m_owner(owner), + m_revision(revision), + m_namespace(namespaceReference), + m_value(qMove(value)) +{ + +} + PDFStructureTreeAttribute::Attribute PDFStructureTreeAttribute::getType() const { Q_ASSERT(m_definition); @@ -228,6 +306,76 @@ bool PDFStructureTreeAttribute::getUserPropertyIsHidden(const PDFObjectStorage* return false; } +void PDFStructureTreeAttribute::parseAttributes(const PDFObjectStorage* storage, PDFObject object, std::vector& attributes) +{ + object = storage->getObject(object); + if (object.isDictionary()) + { + parseAttributeDictionary(storage, object, attributes); + } + else if (object.isArray()) + { + size_t startIndex = attributes.size(); + + for (PDFObject itemObject : *object.getArray()) + { + itemObject = storage->getObject(itemObject); + if (itemObject.isInt()) + { + // Integer item is a revision number, it is assigned to all attributes appended since the previous revision number + const PDFInteger revision = itemObject.getInteger(); + for (; startIndex < attributes.size(); ++startIndex) + { + attributes[startIndex].setRevision(revision); + } + } + else if (itemObject.isDictionary()) + { + // 
Dictionary item is an attribute object, its attributes are appended to the end + parseAttributeDictionary(storage, itemObject, attributes); + } + } + 
} +} + +void PDFStructureTreeAttribute::parseAttributeDictionary(const PDFObjectStorage* storage, PDFObject object, std::vector& attributes) +{ + Q_ASSERT(object.isDictionary()); + const PDFDictionary* attributeDictionary = object.getDictionary(); + + PDFDocumentDataLoaderDecorator loader(storage); + const QByteArray ownerName = loader.readNameFromDictionary(attributeDictionary, "O"); + const Owner owner = PDFStructureTreeAttributeDefinition::getOwnerFromString(ownerName); + if (owner == Owner::UserProperties) + { + // User properties, each item of array "P" becomes one attribute, which uses the first entry of s_attributeDefinitions + PDFObject userPropertiesArrayObject = storage->getObject(attributeDictionary->get("P")); + if (userPropertiesArrayObject.isArray()) + { + const PDFArray* userPropertiesArray = userPropertiesArrayObject.getArray(); + for (const PDFObject& userPropertyObject : *userPropertiesArray) + { + attributes.emplace_back(&s_attributeDefinitions.front(), owner, 0, PDFObjectReference(), userPropertyObject); + } + } + } + else + { + const PDFObjectReference namespaceReference = loader.readReferenceFromDictionary(attributeDictionary, "NS"); + const size_t count = attributeDictionary->getCount(); + for (size_t i = 0; i < count; ++i) + { + const PDFInplaceOrMemoryString& key = attributeDictionary->getKey(i); + if (key == "O" || key == "NS") + { + continue; + } + + attributes.emplace_back(PDFStructureTreeAttributeDefinition::getDefinition(key.getString()), owner, 0, namespaceReference, attributeDictionary->getValue(i)); + } + } +} + std::vector PDFStructureTree::getParents(PDFInteger id) const { std::vector result; @@ -240,15 +388,39 @@ std::vector PDFStructureTree::getParents(PDFInteger id) cons return result; } +PDFStructureItem::Type PDFStructureTree::getTypeFromRole(const QByteArray& role) const +{ + auto it = m_roleMap.find(role); + if (it != m_roleMap.cend()) + { + return it->second; + } + + return getTypeFromName(role); +} + +const std::vector& 
PDFStructureTree::getClassAttributes(const QByteArray& className) const +{ + auto it = 
m_classMap.find(className); + if (it != m_classMap.cend()) + { + return it->second; + } + + static const std::vector dummy; + return dummy; +} + PDFStructureTree PDFStructureTree::parse(const PDFObjectStorage* storage, PDFObject object) { PDFStructureTree tree; if (const PDFDictionary* dictionary = storage->getDictionaryFromObject(object)) { + PDFDocumentDataLoaderDecorator loader(storage); + PDFMarkedObjectsContext context; PDFObject kids = dictionary->get("K"); - if (kids.isArray()) { const PDFArray* kidsArray = kids.getArray(); @@ -320,9 +492,78 @@ PDFStructureTree PDFStructureTree::parse(const PDFObjectStorage* storage, PDFObj } std::stable_sort(tree.m_parentTreeEntries.begin(), tree.m_parentTreeEntries.end()); } + + tree.m_parentNextKey = loader.readIntegerFromDictionary(dictionary, "ParentTreeNextKey", 0); + + if (const PDFDictionary* roleMapDictionary = storage->getDictionaryFromObject(dictionary->get("RoleMap"))) + { + const size_t size = roleMapDictionary->getCount(); + for (size_t i = 0; i < size; ++i) + { + tree.m_roleMap[roleMapDictionary->getKey(i).getString()] = getTypeFromName(loader.readName(roleMapDictionary->getValue(i))); + } + } + + if (const PDFDictionary* classMapDictionary = storage->getDictionaryFromObject(dictionary->get("ClassMap"))) + { + const size_t size = classMapDictionary->getCount(); + for (size_t i = 0; i < size; ++i) + { + PDFStructureTreeAttribute::parseAttributes(storage, classMapDictionary->getValue(i), tree.m_classMap[classMapDictionary->getKey(i).getString()]); + } + } + + if (dictionary->hasKey("Namespaces")) + { + tree.m_namespaces = loader.readObjectList(dictionary->get("Namespaces")); + } + + if (dictionary->hasKey("PronunciationLexicon")) + { + tree.m_pronunciationLexicons = loader.readObjectList(dictionary->get("PronunciationLexicon")); + } + + if (dictionary->hasKey("AF")) + { + tree.m_associatedFiles = loader.readObjectList(dictionary->get("AF")); + } } return tree; } +PDFStructureItem::Type 
PDFStructureItem::getTypeFromName(const QByteArray& name) +{ + for (const auto& item : s_structureTreeItemTypes) + { + if (name == item.second) + { + return item.first; + } + } + + return Invalid; +} + +PDFStructureTreeNamespace PDFStructureTreeNamespace::parse(const PDFObjectStorage* storage, PDFObject object) +{ + PDFStructureTreeNamespace result; + + if (object.isReference()) + { + result.m_selfReference = object.getReference(); + } + object = storage->getObject(object); + + if (const PDFDictionary* dictionary = storage->getDictionaryFromObject(object)) + { + PDFDocumentDataLoaderDecorator loader(storage); + result.m_namespace = loader.readTextStringFromDictionary(dictionary, "NS", QString()); + result.m_schema = PDFFileSpecification::parse(storage, dictionary->get("Schema")); + result.m_roleMapNS = dictionary->get("RoleMapNS"); + } + + return result; +} + } // namespace pdf diff --git a/PdfForQtLib/sources/pdfstructuretree.h b/PdfForQtLib/sources/pdfstructuretree.h index 94a8e15..1ebeb3e 100644 --- a/PdfForQtLib/sources/pdfstructuretree.h +++ b/PdfForQtLib/sources/pdfstructuretree.h @@ -20,6 +20,7 @@ #include "pdfobject.h" #include "pdfobjectutils.h" +#include "pdffile.h" namespace pdf { @@ -30,14 +31,13 @@ struct PDFStructureTreeAttributeDefinition; class PDFFORQTLIBSHARED_EXPORT PDFStructureTreeAttribute { public: - explicit PDFStructureTreeAttribute(); enum class Owner { Invalid, /// Defined for user owner - User, + UserProperties, /// Defined for NSO (namespace owner) NSO, @@ -62,6 +62,13 @@ public: ARIA_1_1, }; + explicit PDFStructureTreeAttribute(); + explicit PDFStructureTreeAttribute(const PDFStructureTreeAttributeDefinition* definition, + Owner owner, + PDFInteger revision, + PDFObjectReference namespaceReference, + PDFObject value); + enum Attribute { User, @@ -138,6 +145,9 @@ public: /// Returns attribute revision number PDFInteger getRevision() const { return m_revision; } + /// Sets attribute revision number + void setRevision(PDFInteger revision) 
{ m_revision = revision; } + /// Returns namespace for this attribute (or empty reference, if it doesn't exists) PDFObjectReference getNamespace() const { return m_namespace; } @@ -167,7 +177,21 @@ public: /// \param storage Storage (for resolving of indirect objects) bool getUserPropertyIsHidden(const PDFObjectStorage* storage) const; + /// Parses attributes and appends them to the end of \p attributes. The container can + /// be an attribute dictionary, or an array of attribute dictionaries and integer revision + /// numbers. Objects of other types are ignored. + /// \param storage Storage (used to dereference the container and its array items) + /// \param object Container of attributes + /// \param[in,out] attributes Attributes + static void parseAttributes(const PDFObjectStorage* storage, PDFObject object, std::vector& attributes); + private: + /// Parses single attribute dictionary and appends new attributes to the end of the list. + /// \param storage Storage + /// \param object Attribute dictionary (must be a dictionary object) + /// \param[in,out] attributes Attributes + static void parseAttributeDictionary(const PDFObjectStorage* storage, PDFObject object, std::vector& attributes); + const PDFStructureTreeAttributeDefinition* m_definition = nullptr; /// Attribute owner @@ -200,8 +224,38 @@ public: { } + virtual ~PDFStructureItem() = default; + enum Type + { + Invalid, + + // Document level types - chapter 14.8.4.3 of PDF 2.0 specification + Document, DocumentFragment, + + // Grouping types - chapter 14.8.4.4 of PDF 2.0 specification + Part, Div, Aside, + + // Block level structure types - chapter 14.8.4.5 of PDF 2.0 specification + P, H1, H2, H3, H4, H5, H6, H7, H, Title, FENote, + + // Subblock level structure types - chapter 14.8.4.6 of PDF 2.0 specification + Sub, + + // Inline structure types - chapter 14.8.4.7 of PDF 2.0 specification + Lbl, Span, Em, Strong, Link, Annot, Form, Ruby, RB, RT, RP, Warichu, WR, WP, + + // Other structure types - chapter 14.8.4.8 of 
PDF 2.0 specification + L, LI, LBody, Table, TR, TH, TD, THead, TBody, TFoot, Caption, Figure, Formula, Artifact, + + // PDF 1.7 backward 
compatibility types + Sect, Art, BlockQuote, TOC, TOCI, Index, NonStruct, Private, Quote, Note, Reference, BibEntry, Code, + + // Last type identifier + LastType, + }; + virtual PDFStructureTree* asStructureTree() { return nullptr; } virtual const PDFStructureTree* asStructureTree() const { return nullptr; } @@ -217,12 +271,38 @@ public: /// \param context Parsing context static PDFStructureItemPointer parse(const PDFObjectStorage* storage, PDFObject object, PDFMarkedObjectsContext* context); + /// Returns structure item type for given type name. If the name is not one of the known type names, then Invalid is returned. + /// \param name Type name + static Type getTypeFromName(const QByteArray& name); + protected: PDFStructureItem* m_parent; PDFStructureTree* m_root; std::vector m_children; }; +/// Structure tree namespace. Holds reference to the namespace object (if it was indirect), namespace name, schema file specification and the unparsed RoleMapNS object. +class PDFStructureTreeNamespace +{ +public: + explicit inline PDFStructureTreeNamespace() = default; + + const PDFObjectReference& getSelfReference() const { return m_selfReference; } + const QString& getNamespace() const { return m_namespace; } + const PDFFileSpecification& getSchema() const { return m_schema; } + const PDFObject& getRoleMapNS() const { return m_roleMapNS; } + + static PDFStructureTreeNamespace parse(const PDFObjectStorage* storage, PDFObject object); + +private: + PDFObjectReference m_selfReference; + QString m_namespace; + PDFFileSpecification m_schema; + PDFObject m_roleMapNS; +}; + +using PDFStructureTreeNamespaces = std::vector; + /// Structure tree, contains structure element hierarchy class PDFStructureTree : public PDFStructureItem { @@ -237,6 +317,25 @@ public: /// \param id Id std::vector getParents(PDFInteger id) const; + /// Returns type from role. Entry of the RoleMap dictionary is used, if it is present, + /// otherwise the role is resolved as one of the standard type names. 
+ /// \param role Role + Type getTypeFromRole(const QByteArray& role) const; + + /// Returns class attributes for given class. If class is not found, + /// then empty attributes are returned. 
+ /// \param className Class name + const std::vector& getClassAttributes(const QByteArray& className) const; + + /// Returns a list of namespaces + const PDFStructureTreeNamespaces& getNamespaces() const { return m_namespaces; } + + /// Returns a list of pronunciation lexicons + const std::vector& getPronunciationLexicons() const { return m_pronunciationLexicons; } + + /// Returns a list of associated files + const std::vector& getAssociatedFiles() const { return m_associatedFiles; } + /// Parses structure tree from the object. If error occurs, empty structure /// tree is returned. /// \param storage Storage @@ -259,6 +358,12 @@ private: std::map m_idTreeMap; ParentTreeEntries m_parentTreeEntries; + PDFInteger m_parentNextKey = 0; + std::map m_roleMap; + std::map> m_classMap; + PDFStructureTreeNamespaces m_namespaces; + std::vector m_pronunciationLexicons; + std::vector m_associatedFiles; }; } // namespace pdf