Structure tree parsing

This commit is contained in:
Jakub Melka 2020-07-19 18:18:24 +02:00
parent 0a62ad618d
commit 2daf7bccf4
2 changed files with 352 additions and 6 deletions

View File

@ -43,9 +43,12 @@ struct PDFStructureTreeAttributeDefinition
/// always returns valid pointer. For unknown attribute, it returns /// always returns valid pointer. For unknown attribute, it returns
/// user attribute definition. /// user attribute definition.
/// \param name Attribute name /// \param name Attribute name
const PDFStructureTreeAttributeDefinition* getDefinition(const QByteArray& name); static const PDFStructureTreeAttributeDefinition* getDefinition(const QByteArray& name);
PDFStructureTreeAttribute::Owner getOwnerFromString(const QByteArray& string); /// Returns owner from string. If owner is not valid, then invalid
/// owner is returned.
/// \param string String
static PDFStructureTreeAttribute::Owner getOwnerFromString(const QByteArray& string);
PDFStructureTreeAttribute::Attribute type = PDFStructureTreeAttribute::Attribute::User; PDFStructureTreeAttribute::Attribute type = PDFStructureTreeAttribute::Attribute::User;
const char* name = nullptr; const char* name = nullptr;
@ -136,6 +139,67 @@ static constexpr std::array<const PDFStructureTreeAttributeDefinition, PDFStruct
PDFStructureTreeAttributeDefinition(PDFStructureTreeAttribute::Attribute::Subtype, "Subtype", false) PDFStructureTreeAttributeDefinition(PDFStructureTreeAttribute::Attribute::Subtype, "Subtype", false)
}; };
/// Lookup table pairing every standard structure item type with its name.
/// It is scanned linearly by PDFStructureItem::getTypeFromName; the type
/// PDFStructureItem::Invalid deliberately has no entry here.
static constexpr std::pair<PDFStructureItem::Type, const char*> s_structureTreeItemTypes[] = {
    // Document level types
    { PDFStructureItem::Document, "Document" },
    { PDFStructureItem::DocumentFragment, "DocumentFragment" },

    // Grouping types
    { PDFStructureItem::Part, "Part" },
    { PDFStructureItem::Div, "Div" },
    { PDFStructureItem::Aside, "Aside" },

    // Block level structure types
    { PDFStructureItem::P, "P" },
    { PDFStructureItem::H1, "H1" },
    { PDFStructureItem::H2, "H2" },
    { PDFStructureItem::H3, "H3" },
    { PDFStructureItem::H4, "H4" },
    { PDFStructureItem::H5, "H5" },
    { PDFStructureItem::H6, "H6" },
    { PDFStructureItem::H7, "H7" },
    { PDFStructureItem::H, "H" },
    { PDFStructureItem::Title, "Title" },
    { PDFStructureItem::FENote, "FENote" },

    // Subblock level structure types
    { PDFStructureItem::Sub, "Sub" },

    // Inline structure types
    { PDFStructureItem::Lbl, "Lbl" },
    { PDFStructureItem::Span, "Span" },
    { PDFStructureItem::Em, "Em" },
    { PDFStructureItem::Strong, "Strong" },
    { PDFStructureItem::Link, "Link" },
    { PDFStructureItem::Annot, "Annot" },
    { PDFStructureItem::Form, "Form" },
    { PDFStructureItem::Ruby, "Ruby" },
    { PDFStructureItem::RB, "RB" },
    { PDFStructureItem::RT, "RT" },
    { PDFStructureItem::RP, "RP" },
    { PDFStructureItem::Warichu, "Warichu" },
    { PDFStructureItem::WR, "WR" },
    { PDFStructureItem::WP, "WP" },

    // Other structure types
    { PDFStructureItem::L, "L" },
    { PDFStructureItem::LI, "LI" },
    { PDFStructureItem::LBody, "LBody" },
    { PDFStructureItem::Table, "Table" },
    { PDFStructureItem::TR, "TR" },
    { PDFStructureItem::TH, "TH" },
    { PDFStructureItem::TD, "TD" },
    { PDFStructureItem::THead, "THead" },
    { PDFStructureItem::TBody, "TBody" },
    { PDFStructureItem::TFoot, "TFoot" },
    { PDFStructureItem::Caption, "Caption" },
    { PDFStructureItem::Figure, "Figure" },
    { PDFStructureItem::Formula, "Formula" },
    { PDFStructureItem::Artifact, "Artifact" },

    // PDF 1.7 backward compatibility types
    { PDFStructureItem::Sect, "Sect" },
    { PDFStructureItem::Art, "Art" },
    { PDFStructureItem::BlockQuote, "BlockQuote" },
    { PDFStructureItem::TOC, "TOC" },
    { PDFStructureItem::TOCI, "TOCI" },
    { PDFStructureItem::Index, "Index" },
    { PDFStructureItem::NonStruct, "NonStruct" },
    { PDFStructureItem::Private, "Private" },
    { PDFStructureItem::Quote, "Quote" },
    { PDFStructureItem::Note, "Note" },
    { PDFStructureItem::Reference, "Reference" },
    { PDFStructureItem::BibEntry, "BibEntry" },
    { PDFStructureItem::Code, "Code" }
};
const PDFStructureTreeAttributeDefinition* PDFStructureTreeAttributeDefinition::getDefinition(const QByteArray& name) const PDFStructureTreeAttributeDefinition* PDFStructureTreeAttributeDefinition::getDefinition(const QByteArray& name)
{ {
for (const PDFStructureTreeAttributeDefinition& definition : s_attributeDefinitions) for (const PDFStructureTreeAttributeDefinition& definition : s_attributeDefinitions)
@ -160,7 +224,7 @@ PDFStructureTreeAttribute::Owner PDFStructureTreeAttributeDefinition::getOwnerFr
} }
} }
return PDFStructureTreeAttribute::Owner::User; return PDFStructureTreeAttribute::Owner::Invalid;
} }
PDFStructureTreeAttribute::PDFStructureTreeAttribute() : PDFStructureTreeAttribute::PDFStructureTreeAttribute() :
@ -173,6 +237,20 @@ PDFStructureTreeAttribute::PDFStructureTreeAttribute() :
} }
/// Constructs an attribute from its individual parts. Every argument is
/// stored directly in the corresponding member; no validation is performed.
/// \param definition Attribute definition (the pointer is stored as passed)
/// \param owner Attribute owner
/// \param revision Attribute revision number
/// \param namespaceReference Reference to the namespace of the attribute
/// \param value Attribute value (taken by value and moved into the attribute)
PDFStructureTreeAttribute::PDFStructureTreeAttribute(const PDFStructureTreeAttributeDefinition* definition,
PDFStructureTreeAttribute::Owner owner,
PDFInteger revision,
PDFObjectReference namespaceReference,
PDFObject value) :
m_definition(definition),
m_owner(owner),
m_revision(revision),
m_namespace(namespaceReference),
m_value(qMove(value))
{
}
PDFStructureTreeAttribute::Attribute PDFStructureTreeAttribute::getType() const PDFStructureTreeAttribute::Attribute PDFStructureTreeAttribute::getType() const
{ {
Q_ASSERT(m_definition); Q_ASSERT(m_definition);
@ -228,6 +306,76 @@ bool PDFStructureTreeAttribute::getUserPropertyIsHidden(const PDFObjectStorage*
return false; return false;
} }
/// Parses attributes stored in \p object and appends them to the end of
/// \p attributes. After dereferencing, the object can be either a single
/// attribute dictionary, or an array whose items are attribute dictionaries,
/// each optionally followed by an integer revision number. Items of any
/// other type are ignored.
///
/// A revision number is applied only to the attributes created from the
/// dictionary directly preceding it. Attributes created from a dictionary
/// which is not followed by a revision number keep the revision they were
/// created with.
/// \param storage Storage (for resolving of indirect objects)
/// \param object Container of attributes
/// \param[in,out] attributes Attributes
void PDFStructureTreeAttribute::parseAttributes(const PDFObjectStorage* storage, PDFObject object, std::vector<PDFStructureTreeAttribute>& attributes)
{
    object = storage->getObject(object);

    if (object.isDictionary())
    {
        parseAttributeDictionary(storage, object, attributes);
    }
    else if (object.isArray())
    {
        // Index of the first attribute created from the most recently parsed
        // dictionary, which still waits for its (optional) revision number.
        size_t pendingRevisionIndex = attributes.size();

        for (PDFObject itemObject : *object.getArray())
        {
            itemObject = storage->getObject(itemObject);

            if (itemObject.isInt())
            {
                // It is revision number of the preceding attribute dictionary.
                // Advancing the index up to the end ensures, that a second
                // consecutive integer has nothing left to modify.
                const PDFInteger revision = itemObject.getInteger();
                for (; pendingRevisionIndex < attributes.size(); ++pendingRevisionIndex)
                {
                    attributes[pendingRevisionIndex].setRevision(revision);
                }
            }
            else if (itemObject.isDictionary())
            {
                // It is attribute. Restart the pending range here, so a revision
                // number found later is not applied to attributes of earlier
                // dictionaries, which had no revision number of their own.
                pendingRevisionIndex = attributes.size();
                parseAttributeDictionary(storage, itemObject, attributes);
            }
        }
    }
}
void PDFStructureTreeAttribute::parseAttributeDictionary(const PDFObjectStorage* storage, PDFObject object, std::vector<PDFStructureTreeAttribute>& attributes)
{
Q_ASSERT(object.isDictionary());
const PDFDictionary* attributeDictionary = object.getDictionary();
PDFDocumentDataLoaderDecorator loader(storage);
const QByteArray ownerName = loader.readNameFromDictionary(attributeDictionary, "O");
const Owner owner = PDFStructureTreeAttributeDefinition::getOwnerFromString(ownerName);
if (owner == Owner::UserProperties)
{
// User properties
PDFObject userPropertiesArrayObject = storage->getObject(attributeDictionary->get("P"));
if (userPropertiesArrayObject.isArray())
{
const PDFArray* userPropertiesArray = userPropertiesArrayObject.getArray();
for (const PDFObject& userPropertyObject : *userPropertiesArray)
{
attributes.emplace_back(&s_attributeDefinitions.front(), owner, 0, PDFObjectReference(), userPropertyObject);
}
}
}
else
{
const PDFObjectReference namespaceReference = loader.readReferenceFromDictionary(attributeDictionary, "NS");
const size_t count = attributeDictionary->getCount();
for (size_t i = 0; i < count; ++i)
{
const PDFInplaceOrMemoryString& key = attributeDictionary->getKey(i);
if (key == "O" || key == "NS")
{
continue;
}
attributes.emplace_back(PDFStructureTreeAttributeDefinition::getDefinition(key.getString()), owner, 0, namespaceReference, attributeDictionary->getValue(i));
}
}
}
std::vector<PDFObjectReference> PDFStructureTree::getParents(PDFInteger id) const std::vector<PDFObjectReference> PDFStructureTree::getParents(PDFInteger id) const
{ {
std::vector<PDFObjectReference> result; std::vector<PDFObjectReference> result;
@ -240,15 +388,39 @@ std::vector<PDFObjectReference> PDFStructureTree::getParents(PDFInteger id) cons
return result; return result;
} }
/// Translates role to the structure item type. Role map of this tree
/// is consulted first; if the role has no entry there, it is resolved
/// as a standard type name.
/// \param role Role
PDFStructureItem::Type PDFStructureTree::getTypeFromRole(const QByteArray& role) const
{
    const auto mapped = m_roleMap.find(role);
    return (mapped != m_roleMap.cend()) ? mapped->second : getTypeFromName(role);
}
const std::vector<PDFStructureTreeAttribute>& PDFStructureTree::getClassAttributes(const QByteArray& className) const
{
auto it = m_classMap.find(className);
if (it != m_classMap.cend())
{
return it->second;
}
static const std::vector<PDFStructureTreeAttribute> dummy;
return dummy;
}
PDFStructureTree PDFStructureTree::parse(const PDFObjectStorage* storage, PDFObject object) PDFStructureTree PDFStructureTree::parse(const PDFObjectStorage* storage, PDFObject object)
{ {
PDFStructureTree tree; PDFStructureTree tree;
if (const PDFDictionary* dictionary = storage->getDictionaryFromObject(object)) if (const PDFDictionary* dictionary = storage->getDictionaryFromObject(object))
{ {
PDFDocumentDataLoaderDecorator loader(storage);
PDFMarkedObjectsContext context; PDFMarkedObjectsContext context;
PDFObject kids = dictionary->get("K"); PDFObject kids = dictionary->get("K");
if (kids.isArray()) if (kids.isArray())
{ {
const PDFArray* kidsArray = kids.getArray(); const PDFArray* kidsArray = kids.getArray();
@ -320,9 +492,78 @@ PDFStructureTree PDFStructureTree::parse(const PDFObjectStorage* storage, PDFObj
} }
std::stable_sort(tree.m_parentTreeEntries.begin(), tree.m_parentTreeEntries.end()); std::stable_sort(tree.m_parentTreeEntries.begin(), tree.m_parentTreeEntries.end());
} }
tree.m_parentNextKey = loader.readIntegerFromDictionary(dictionary, "ParentTreeNextKey", 0);
if (const PDFDictionary* roleMapDictionary = storage->getDictionaryFromObject(dictionary->get("RoleMap")))
{
const size_t size = roleMapDictionary->getCount();
for (size_t i = 0; i < size; ++i)
{
tree.m_roleMap[roleMapDictionary->getKey(i).getString()] = getTypeFromName(loader.readName(roleMapDictionary->getValue(i)));
}
}
if (const PDFDictionary* classMapDictionary = storage->getDictionaryFromObject(dictionary->get("ClassMap")))
{
const size_t size = classMapDictionary->getCount();
for (size_t i = 0; i < size; ++i)
{
PDFStructureTreeAttribute::parseAttributes(storage, classMapDictionary->getValue(i), tree.m_classMap[classMapDictionary->getKey(i).getString()]);
}
}
if (dictionary->hasKey("Namespaces"))
{
tree.m_namespaces = loader.readObjectList<PDFStructureTreeNamespace>(dictionary->get("Namespaces"));
}
if (dictionary->hasKey("PronunciationLexicon"))
{
tree.m_pronunciationLexicons = loader.readObjectList<PDFFileSpecification>(dictionary->get("PronunciationLexicon"));
}
if (dictionary->hasKey("AF"))
{
tree.m_associatedFiles = loader.readObjectList<PDFFileSpecification>(dictionary->get("AF"));
}
} }
return tree; return tree;
} }
/// Finds structure item type for given name by searching the table of
/// standard structure types. If no entry of the table has this name,
/// then Invalid is returned.
/// \param name Name
PDFStructureItem::Type PDFStructureItem::getTypeFromName(const QByteArray& name)
{
    const auto tableEnd = std::end(s_structureTreeItemTypes);
    const auto match = std::find_if(std::begin(s_structureTreeItemTypes), tableEnd,
                                    [&name](const auto& entry) { return name == entry.second; });

    return (match != tableEnd) ? match->first : Invalid;
}
/// Parses structure tree namespace from the object. If the object is
/// a reference, it is remembered as the self reference of the namespace.
/// If the object doesn't resolve to a dictionary, then namespace with
/// default values (apart from the self reference) is returned.
/// \param storage Storage (for resolving of indirect objects)
/// \param object Namespace object (reference or dictionary)
PDFStructureTreeNamespace PDFStructureTreeNamespace::parse(const PDFObjectStorage* storage, PDFObject object)
{
    PDFStructureTreeNamespace parsedNamespace;

    if (object.isReference())
    {
        parsedNamespace.m_selfReference = object.getReference();
    }

    const PDFObject& resolvedObject = storage->getObject(object);
    const PDFDictionary* namespaceDictionary = storage->getDictionaryFromObject(resolvedObject);
    if (!namespaceDictionary)
    {
        return parsedNamespace;
    }

    PDFDocumentDataLoaderDecorator loader(storage);
    parsedNamespace.m_namespace = loader.readTextStringFromDictionary(namespaceDictionary, "NS", QString());
    parsedNamespace.m_schema = PDFFileSpecification::parse(storage, namespaceDictionary->get("Schema"));

    // Role map is stored as found in the dictionary, it is not interpreted here
    parsedNamespace.m_roleMapNS = namespaceDictionary->get("RoleMapNS");

    return parsedNamespace;
}
} // namespace pdf } // namespace pdf

View File

@ -20,6 +20,7 @@
#include "pdfobject.h" #include "pdfobject.h"
#include "pdfobjectutils.h" #include "pdfobjectutils.h"
#include "pdffile.h"
namespace pdf namespace pdf
{ {
@ -30,14 +31,13 @@ struct PDFStructureTreeAttributeDefinition;
class PDFFORQTLIBSHARED_EXPORT PDFStructureTreeAttribute class PDFFORQTLIBSHARED_EXPORT PDFStructureTreeAttribute
{ {
public: public:
explicit PDFStructureTreeAttribute();
enum class Owner enum class Owner
{ {
Invalid, Invalid,
/// Defined for user owner /// Defined for user owner
User, UserProperties,
/// Defined for NSO (namespace owner) /// Defined for NSO (namespace owner)
NSO, NSO,
@ -62,6 +62,13 @@ public:
ARIA_1_1, ARIA_1_1,
}; };
explicit PDFStructureTreeAttribute();
explicit PDFStructureTreeAttribute(const PDFStructureTreeAttributeDefinition* definition,
Owner owner,
PDFInteger revision,
PDFObjectReference namespaceReference,
PDFObject value);
enum Attribute enum Attribute
{ {
User, User,
@ -138,6 +145,9 @@ public:
/// Returns attribute revision number /// Returns attribute revision number
PDFInteger getRevision() const { return m_revision; } PDFInteger getRevision() const { return m_revision; }
/// Sets attribute revision number
void setRevision(PDFInteger revision) { m_revision = revision; }
/// Returns namespace for this attribute (or empty reference, if it doesn't exist) /// Returns namespace for this attribute (or empty reference, if it doesn't exist)
PDFObjectReference getNamespace() const { return m_namespace; } PDFObjectReference getNamespace() const { return m_namespace; }
@ -167,7 +177,21 @@ public:
/// \param storage Storage (for resolving of indirect objects) /// \param storage Storage (for resolving of indirect objects)
bool getUserPropertyIsHidden(const PDFObjectStorage* storage) const; bool getUserPropertyIsHidden(const PDFObjectStorage* storage) const;
/// Parses attributes and adds them into \p attributes array. Invalid
/// attributes are not added. New attributes are appended to the end
/// of the array.
/// \param storage Storage
/// \param object Container of attributes
/// \param[in,out] attributes Attributes
static void parseAttributes(const PDFObjectStorage* storage, PDFObject object, std::vector<PDFStructureTreeAttribute>& attributes);
private: private:
/// Parses single attribute dictionary and appends new attributes to the end of the list.
/// \param storage Storage
/// \param object Container of attributes
/// \param[in,out] attributes Attributes
static void parseAttributeDictionary(const PDFObjectStorage* storage, PDFObject object, std::vector<PDFStructureTreeAttribute>& attributes);
const PDFStructureTreeAttributeDefinition* m_definition = nullptr; const PDFStructureTreeAttributeDefinition* m_definition = nullptr;
/// Attribute owner /// Attribute owner
@ -200,8 +224,38 @@ public:
{ {
} }
virtual ~PDFStructureItem() = default; virtual ~PDFStructureItem() = default;
/// Type of the structure item. Each type except Invalid and LastType
/// corresponds to one structure type name, see getTypeFromName.
enum Type
{
// Returned by getTypeFromName, if the name is not a known structure type
Invalid,
// Document level types - chapter 14.8.4.3 of PDF 2.0 specification
Document, DocumentFragment,
// Grouping types - chapter 14.8.4.4 of PDF 2.0 specification
Part, Div, Aside,
// Block level structure types - chapter 14.8.4.5 of PDF 2.0 specification
P, H1, H2, H3, H4, H5, H6, H7, H, Title, FENote,
// Subblock level structure types - chapter 14.8.4.6 of PDF 2.0 specification
Sub,
// Inline structure types - chapter 14.8.4.7 of PDF 2.0 specification
Lbl, Span, Em, Strong, Link, Annot, Form, Ruby, RB, RT, RP, Warichu, WR, WP,
// Other structure types (lists, tables, caption, figure, formula, artifact) -
// presumably chapter 14.8.4.8 of PDF 2.0 specification; chapter 14.8.4.7 is
// already cited for the inline types above - TODO confirm
L, LI, LBody, Table, TR, TH, TD, THead, TBody, TFoot, Caption, Figure, Formula, Artifact,
// PDF 1.7 backward compatibility types
Sect, Art, BlockQuote, TOC, TOCI, Index, NonStruct, Private, Quote, Note, Reference, BibEntry, Code,
// Last type identifier (not a real structure type, it has no name assigned)
LastType,
};
virtual PDFStructureTree* asStructureTree() { return nullptr; } virtual PDFStructureTree* asStructureTree() { return nullptr; }
virtual const PDFStructureTree* asStructureTree() const { return nullptr; } virtual const PDFStructureTree* asStructureTree() const { return nullptr; }
@ -217,12 +271,38 @@ public:
/// \param context Parsing context /// \param context Parsing context
static PDFStructureItemPointer parse(const PDFObjectStorage* storage, PDFObject object, PDFMarkedObjectsContext* context); static PDFStructureItemPointer parse(const PDFObjectStorage* storage, PDFObject object, PDFMarkedObjectsContext* context);
/// Get structure tree type from name
/// \param name Name
static Type getTypeFromName(const QByteArray& name);
protected: protected:
PDFStructureItem* m_parent; PDFStructureItem* m_parent;
PDFStructureTree* m_root; PDFStructureTree* m_root;
std::vector<PDFStructureItemPointer> m_children; std::vector<PDFStructureItemPointer> m_children;
}; };
/// Structure tree namespace. Holds data read from a single namespace
/// dictionary, see function \p parse.
class PDFStructureTreeNamespace
{
public:
/// Constructs namespace with default (empty) values
explicit inline PDFStructureTreeNamespace() = default;
/// Returns reference to the namespace object itself. It is set only
/// if the namespace was parsed from an indirect reference.
const PDFObjectReference& getSelfReference() const { return m_selfReference; }
/// Returns namespace name (text string read from the "NS" entry)
const QString& getNamespace() const { return m_namespace; }
/// Returns file specification parsed from the "Schema" entry
const PDFFileSpecification& getSchema() const { return m_schema; }
/// Returns the "RoleMapNS" entry, stored exactly as found in the
/// namespace dictionary (it is not dereferenced or interpreted)
const PDFObject& getRoleMapNS() const { return m_roleMapNS; }
/// Parses structure tree namespace from the object. If the object
/// doesn't resolve to a dictionary, then namespace with default values
/// is returned (only self reference is filled in, if available).
/// \param storage Storage (for resolving of indirect objects)
/// \param object Namespace object (reference or dictionary)
static PDFStructureTreeNamespace parse(const PDFObjectStorage* storage, PDFObject object);
private:
PDFObjectReference m_selfReference;
QString m_namespace;
PDFFileSpecification m_schema;
PDFObject m_roleMapNS;
};
using PDFStructureTreeNamespaces = std::vector<PDFStructureTreeNamespace>;
/// Structure tree, contains structure element hierarchy /// Structure tree, contains structure element hierarchy
class PDFStructureTree : public PDFStructureItem class PDFStructureTree : public PDFStructureItem
{ {
@ -237,6 +317,25 @@ public:
/// \param id Id /// \param id Id
std::vector<PDFObjectReference> getParents(PDFInteger id) const; std::vector<PDFObjectReference> getParents(PDFInteger id) const;
/// Returns type from role. Role can be an entry in RoleMap dictionary,
/// or one of the standard roles.
/// \param role Role
Type getTypeFromRole(const QByteArray& role) const;
/// Returns class attributes for given class. If class is not found,
/// then empty attributes are returned.
/// \param className Class name
const std::vector<PDFStructureTreeAttribute>& getClassAttributes(const QByteArray& className) const;
/// Returns a list of namespaces
const PDFStructureTreeNamespaces& getNamespaces() const { return m_namespaces; }
/// Returns a list of pronunciation lexicons
const std::vector<PDFFileSpecification>& getPronunciationLexicons() const { return m_pronunciationLexicons; }
/// Returns a list of associated files
const std::vector<PDFFileSpecification>& getAssociatedFiles() const { return m_associatedFiles; }
/// Parses structure tree from the object. If error occurs, empty structure /// Parses structure tree from the object. If error occurs, empty structure
/// tree is returned. /// tree is returned.
/// \param storage Storage /// \param storage Storage
@ -259,6 +358,12 @@ private:
std::map<QByteArray, PDFObjectReference> m_idTreeMap; std::map<QByteArray, PDFObjectReference> m_idTreeMap;
ParentTreeEntries m_parentTreeEntries; ParentTreeEntries m_parentTreeEntries;
PDFInteger m_parentNextKey = 0;
std::map<QByteArray, Type> m_roleMap;
std::map<QByteArray, std::vector<PDFStructureTreeAttribute>> m_classMap;
PDFStructureTreeNamespaces m_namespaces;
std::vector<PDFFileSpecification> m_pronunciationLexicons;
std::vector<PDFFileSpecification> m_associatedFiles;
}; };
} // namespace pdf } // namespace pdf