From 707f68fa19b94bf86c29af8853ffcca6b40610b9 Mon Sep 17 00:00:00 2001
From: Jakub Melka
Date: Wed, 22 Jul 2020 19:52:23 +0200
Subject: [PATCH] Parsing structure element

---
Notes (ignored by git am, not part of the commit message):

What this patch does
  * pdfobjectutils.h: PDFMarkedObjectsLock now accepts an invalid reference.
    Such a lock reports itself as locked, but neither marks nor unmarks
    anything in the context. A new delegating constructor takes a PDFObject
    and locks on its reference when the object is a reference, otherwise on
    a default-constructed PDFObjectReference.
  * pdfstructuretree.cpp: the inline handling of the "K" entry is moved from
    PDFStructureTree::parse into the new PDFStructureItem::parseKids. The
    non-array branch now parses the "K" value itself (the removed code passed
    'object' there) and is skipped when "K" is null.
    PDFStructureElement::parseElement is added; it fills a new
    PDFStructureElement from the element dictionary and then parses its kids.
  * pdfstructuretree.h: declares PDFStructureElement, adds
    asStructureElement(), getSelfReference() / m_selfReference and the
    protected parseKids() helper to PDFStructureItem.

About this copy of the patch
  * Line breaks, blank lines and indentation were reconstructed from a
    whitespace-collapsed copy. They agree with the hunk headers and the
    diffstat, but the exact indentation of context lines is unverified, so
    applying may require a whitespace-tolerant mode.
  * Template argument lists had been stripped and were restored from usage in
    the patch itself. The element type PDFFileSpecification (m_associatedFiles,
    getAssociatedFiles, readObjectList) is an assumption - TODO confirm
    against the upstream repository.
  * The author's e-mail address is missing from the From: header.

 PdfForQtLib/sources/pdfobjectutils.h     |  12 ++-
 PdfForQtLib/sources/pdfstructuretree.cpp | 131 +++++++++++++++++++----
 PdfForQtLib/sources/pdfstructuretree.h   |  79 ++++++++++++++
 3 files changed, 198 insertions(+), 24 deletions(-)

diff --git a/PdfForQtLib/sources/pdfobjectutils.h b/PdfForQtLib/sources/pdfobjectutils.h
index f3dfe5d..88be50c 100644
--- a/PdfForQtLib/sources/pdfobjectutils.h
+++ b/PdfForQtLib/sources/pdfobjectutils.h
@@ -67,17 +67,23 @@ public:
     explicit inline PDFMarkedObjectsLock(PDFMarkedObjectsContext* context, PDFObjectReference reference) :
         m_context(context),
         m_reference(reference),
-        m_locked(!context->isMarked(reference))
+        m_locked(!reference.isValid() || !context->isMarked(reference))
     {
-        if (m_locked)
+        if (m_locked && reference.isValid())
         {
             context->mark(reference);
         }
     }
 
+    explicit inline PDFMarkedObjectsLock(PDFMarkedObjectsContext* context, const PDFObject& object) :
+        PDFMarkedObjectsLock(context, object.isReference() ? object.getReference() : PDFObjectReference())
+    {
+
+    }
+
     inline ~PDFMarkedObjectsLock()
     {
-        if (m_locked)
+        if (m_locked && m_reference.isValid())
         {
             m_context->unmark(m_reference);
         }
diff --git a/PdfForQtLib/sources/pdfstructuretree.cpp b/PdfForQtLib/sources/pdfstructuretree.cpp
index f18139e..26242e5 100644
--- a/PdfForQtLib/sources/pdfstructuretree.cpp
+++ b/PdfForQtLib/sources/pdfstructuretree.cpp
@@ -420,27 +420,7 @@ PDFStructureTree PDFStructureTree::parse(const PDFObjectStorage* storage, PDFObj
         PDFDocumentDataLoaderDecorator loader(storage);
         PDFMarkedObjectsContext context;
 
-        PDFObject kids = dictionary->get("K");
-        if (kids.isArray())
-        {
-            const PDFArray* kidsArray = kids.getArray();
-            for (const PDFObject& object : *kidsArray)
-            {
-                PDFStructureItemPointer item = PDFStructureItem::parse(storage, object, &context);
-                if (item)
-                {
-                    tree.m_children.emplace_back(qMove(item));
-                }
-            }
-        }
-        else
-        {
-            PDFStructureItemPointer item = PDFStructureItem::parse(storage, object, &context);
-            if (item)
-            {
-                tree.m_children.emplace_back(qMove(item));
-            }
-        }
+        parseKids(storage, &tree, dictionary, &context);
 
         if (dictionary->hasKey("IDTree"))
         {
@@ -545,6 +525,31 @@ PDFStructureItem::Type PDFStructureItem::getTypeFromName(const QByteArray& name)
     return Invalid;
 }
 
+void PDFStructureItem::parseKids(const PDFObjectStorage* storage, PDFStructureItem* parentItem, const PDFDictionary* dictionary, PDFMarkedObjectsContext* context)
+{
+    PDFObject kids = dictionary->get("K");
+    if (kids.isArray())
+    {
+        const PDFArray* kidsArray = kids.getArray();
+        for (const PDFObject& object : *kidsArray)
+        {
+            PDFStructureItemPointer item = PDFStructureItem::parse(storage, object, context);
+            if (item)
+            {
+                parentItem->m_children.emplace_back(qMove(item));
+            }
+        }
+    }
+    else if (!kids.isNull())
+    {
+        PDFStructureItemPointer item = PDFStructureItem::parse(storage, kids, context);
+        if (item)
+        {
+            parentItem->m_children.emplace_back(qMove(item));
+        }
+    }
+}
+
 PDFStructureTreeNamespace PDFStructureTreeNamespace::parse(const PDFObjectStorage* storage, PDFObject object)
 {
     PDFStructureTreeNamespace result;
@@ -566,4 +571,88 @@ PDFStructureTreeNamespace PDFStructureTreeNamespace::parse(const PDFObjectStorag
     return result;
 }
 
+PDFStructureItemPointer PDFStructureElement::parseElement(const PDFObjectStorage* storage,
+                                                          PDFObject object,
+                                                          PDFMarkedObjectsContext* context,
+                                                          PDFStructureItem* parent,
+                                                          PDFStructureTree* root)
+{
+    PDFStructureItemPointer pointer;
+
+    Q_ASSERT(root);
+
+    if (auto lock = PDFMarkedObjectsLock(context, object))
+    {
+        if (const PDFDictionary* dictionary = storage->getDictionaryFromObject(object))
+        {
+            PDFStructureElement* item = new PDFStructureElement(parent, root);
+            pointer.reset(item);
+
+            if (object.isReference())
+            {
+                item->m_selfReference = object.getReference();
+            }
+
+            PDFDocumentDataLoaderDecorator loader(storage);
+            item->m_typeName = loader.readNameFromDictionary(dictionary, "S");
+            item->m_standardType = root->getTypeFromRole(item->m_typeName);
+            item->m_id = loader.readStringFromDictionary(dictionary, "ID");
+            item->m_references = loader.readReferenceArrayFromDictionary(dictionary, "Ref");
+            item->m_pageReference = loader.readReferenceFromDictionary(dictionary, "Pg");
+
+            std::vector<PDFStructureTreeAttribute> attributes;
+            PDFObject classObject = storage->getObject(dictionary->get("C"));
+            if (classObject.isName())
+            {
+                QByteArray name = classObject.getString();
+                const std::vector<PDFStructureTreeAttribute>& classAttributes = root->getClassAttributes(name);
+                attributes.insert(attributes.end(), classAttributes.begin(), classAttributes.end());
+            }
+            else if (classObject.isArray())
+            {
+                size_t startIndex = attributes.size();
+
+                for (PDFObject itemObject : *classObject.getArray())
+                {
+                    itemObject = storage->getObject(itemObject);
+                    if (itemObject.isInt())
+                    {
+                        // It is revision number
+                        const PDFInteger revision = itemObject.getInteger();
+                        for (; startIndex < attributes.size(); ++startIndex)
+                        {
+                            attributes[startIndex].setRevision(revision);
+                        }
+                    }
+                    else if (itemObject.isName())
+                    {
+                        // It is class name
+                        QByteArray name = itemObject.getString();
+                        const std::vector<PDFStructureTreeAttribute>& classAttributes = root->getClassAttributes(name);
+                        attributes.insert(attributes.end(), classAttributes.begin(), classAttributes.end());
+                    }
+                }
+            }
+            PDFStructureTreeAttribute::parseAttributes(storage, dictionary->get("A"), attributes);
+            std::reverse(attributes.begin(), attributes.end());
+            item->m_attributes = qMove(attributes);
+            item->m_revision = loader.readIntegerFromDictionary(dictionary, "R", 0);
+            item->m_texts[Title] = loader.readTextStringFromDictionary(dictionary, "T", QString());
+            item->m_texts[Language] = loader.readTextStringFromDictionary(dictionary, "Lang", QString());
+            item->m_texts[AlternativeDescription] = loader.readTextStringFromDictionary(dictionary, "Alt", QString());
+            item->m_texts[ExpandedForm] = loader.readTextStringFromDictionary(dictionary, "E", QString());
+            item->m_texts[ActualText] = loader.readTextStringFromDictionary(dictionary, "ActualText", QString());
+            item->m_texts[Phoneme] = loader.readTextStringFromDictionary(dictionary, "Phoneme", QString());
+
+            item->m_associatedFiles = loader.readObjectList<PDFFileSpecification>(dictionary->get("AF"));
+            item->m_namespace = loader.readReferenceFromDictionary(dictionary, "NS");
+            item->m_phoneticAlphabet = loader.readNameFromDictionary(dictionary, "PhoneticAlphabet");
+
+            parseKids(storage, item, dictionary, context);
+        }
+    }
+
+    return pointer;
+}
+
 } // namespace pdf
diff --git a/PdfForQtLib/sources/pdfstructuretree.h b/PdfForQtLib/sources/pdfstructuretree.h
index 1ebeb3e..9ecdd08 100644
--- a/PdfForQtLib/sources/pdfstructuretree.h
+++ b/PdfForQtLib/sources/pdfstructuretree.h
@@ -211,6 +211,7 @@ private:
 
 class PDFStructureTree;
 class PDFStructureItem;
+class PDFStructureElement;
 
 using PDFStructureItemPointer = QSharedPointer<PDFStructureItem>;
 
@@ -259,8 +260,12 @@ public:
     virtual PDFStructureTree* asStructureTree() { return nullptr; }
     virtual const PDFStructureTree* asStructureTree() const { return nullptr; }
 
+    virtual PDFStructureElement* asStructureElement() { return nullptr; }
+    virtual const PDFStructureElement* asStructureElement() const { return nullptr; }
+
     const PDFStructureItem* getParent() const { return m_parent; }
     const PDFStructureTree* getTree() const { return m_root; }
+    PDFObjectReference getSelfReference() const { return m_selfReference; }
     std::size_t getChildCount() const { return m_children.size(); }
     const PDFStructureItem* getChild(size_t i) const { return m_children.at(i).get(); }
 
@@ -276,8 +281,20 @@ public:
     static Type getTypeFromName(const QByteArray& name);
 
 protected:
+    /// Parses kids of the item. Invalid items aren't added
+    /// to the kid list.
+    /// \param storage Storage
+    /// \param parentItem Parent item, where children are inserted
+    /// \param dictionary Dictionary
+    /// \param context Context
+    static void parseKids(const PDFObjectStorage* storage,
+                          PDFStructureItem* parentItem,
+                          const PDFDictionary* dictionary,
+                          PDFMarkedObjectsContext* context);
+
     PDFStructureItem* m_parent;
     PDFStructureTree* m_root;
+    PDFObjectReference m_selfReference;
     std::vector<PDFStructureItemPointer> m_children;
 };
 
@@ -366,6 +383,68 @@ private:
     std::vector<PDFFileSpecification> m_associatedFiles;
 };
 
+/// Structure element
+class PDFStructureElement : public PDFStructureItem
+{
+public:
+    explicit inline PDFStructureElement(PDFStructureItem* parent, PDFStructureTree* root) :
+        PDFStructureItem(parent, root)
+    {
+
+    }
+
+    enum StringValue
+    {
+        Title,
+        Language,
+        AlternativeDescription,
+        ExpandedForm,
+        ActualText,
+        Phoneme,
+        LastStringValue
+    };
+
+    virtual PDFStructureElement* asStructureElement() override { return this; }
+    virtual const PDFStructureElement* asStructureElement() const override { return this; }
+
+    const QByteArray& getTypeName() const { return m_typeName; }
+    Type getStandardType() const { return m_standardType; }
+    const QByteArray& getId() const { return m_id; }
+    const std::vector<PDFObjectReference>& getReferences() const { return m_references; }
+    const PDFObjectReference& getPageReference() const { return m_pageReference; }
+    const std::vector<PDFStructureTreeAttribute>& getAttributes() const { return m_attributes; }
+    PDFInteger getRevision() const { return m_revision; }
+    const QString& getText(StringValue stringValue) const { return m_texts.at(stringValue); }
+    const std::vector<PDFFileSpecification>& getAssociatedFiles() const { return m_associatedFiles; }
+    const PDFObjectReference& getNamespace() const { return m_namespace; }
+    const QByteArray& getPhoneticAlphabet() const { return m_phoneticAlphabet; }
+
+    /// Parses structure element from the object. If error occurs, nullptr is returned.
+    /// \param storage Storage
+    /// \param object Structure element object
+    /// \param context Visited elements context
+    /// \param parent Parent structure tree item
+    /// \param root Structure tree root
+    static PDFStructureItemPointer parseElement(const PDFObjectStorage* storage,
+                                                PDFObject object,
+                                                PDFMarkedObjectsContext* context,
+                                                PDFStructureItem* parent,
+                                                PDFStructureTree* root);
+
+private:
+    QByteArray m_typeName;
+    Type m_standardType;
+    QByteArray m_id;
+    std::vector<PDFObjectReference> m_references;
+    PDFObjectReference m_pageReference;
+    std::vector<PDFStructureTreeAttribute> m_attributes;
+    PDFInteger m_revision = 0;
+    std::array<QString, LastStringValue> m_texts;
+    std::vector<PDFFileSpecification> m_associatedFiles;
+    PDFObjectReference m_namespace;
+    QByteArray m_phoneticAlphabet;
+};
+
 } // namespace pdf
 
 #endif // PDFSTRUCTURETREE_H