Parsing structure element

This commit is contained in:
Jakub Melka
2020-07-22 19:52:23 +02:00
parent 2daf7bccf4
commit 707f68fa19
3 changed files with 198 additions and 24 deletions

View File

@ -67,17 +67,23 @@ public:
explicit inline PDFMarkedObjectsLock(PDFMarkedObjectsContext* context, PDFObjectReference reference) :
m_context(context),
m_reference(reference),
m_locked(!context->isMarked(reference))
m_locked(!reference.isValid() || !context->isMarked(reference))
{
if (m_locked)
if (m_locked && reference.isValid())
{
context->mark(reference);
}
}
explicit inline PDFMarkedObjectsLock(PDFMarkedObjectsContext* context, const PDFObject& object) :
PDFMarkedObjectsLock(context, object.isReference() ? object.getReference() : PDFObjectReference())
{
}
inline ~PDFMarkedObjectsLock()
{
if (m_locked)
if (m_locked && m_reference.isValid())
{
m_context->unmark(m_reference);
}

View File

@ -420,27 +420,7 @@ PDFStructureTree PDFStructureTree::parse(const PDFObjectStorage* storage, PDFObj
PDFDocumentDataLoaderDecorator loader(storage);
PDFMarkedObjectsContext context;
PDFObject kids = dictionary->get("K");
if (kids.isArray())
{
const PDFArray* kidsArray = kids.getArray();
for (const PDFObject& object : *kidsArray)
{
PDFStructureItemPointer item = PDFStructureItem::parse(storage, object, &context);
if (item)
{
tree.m_children.emplace_back(qMove(item));
}
}
}
else
{
PDFStructureItemPointer item = PDFStructureItem::parse(storage, object, &context);
if (item)
{
tree.m_children.emplace_back(qMove(item));
}
}
parseKids(storage, &tree, dictionary, &context);
if (dictionary->hasKey("IDTree"))
{
@ -545,6 +525,31 @@ PDFStructureItem::Type PDFStructureItem::getTypeFromName(const QByteArray& name)
return Invalid;
}
void PDFStructureItem::parseKids(const PDFObjectStorage* storage, PDFStructureItem* parentItem, const PDFDictionary* dictionary, PDFMarkedObjectsContext* context)
{
PDFObject kids = dictionary->get("K");
if (kids.isArray())
{
const PDFArray* kidsArray = kids.getArray();
for (const PDFObject& object : *kidsArray)
{
PDFStructureItemPointer item = PDFStructureItem::parse(storage, object, context);
if (item)
{
parentItem->m_children.emplace_back(qMove(item));
}
}
}
else if (!kids.isNull())
{
PDFStructureItemPointer item = PDFStructureItem::parse(storage, kids, context);
if (item)
{
parentItem->m_children.emplace_back(qMove(item));
}
}
}
PDFStructureTreeNamespace PDFStructureTreeNamespace::parse(const PDFObjectStorage* storage, PDFObject object)
{
PDFStructureTreeNamespace result;
@ -566,4 +571,88 @@ PDFStructureTreeNamespace PDFStructureTreeNamespace::parse(const PDFObjectStorag
return result;
}
/// Creates structure element from the object. Null pointer is returned, if
/// the marked objects lock for \p object can't be acquired, or if \p object
/// can't be resolved to a dictionary. Kids of the element are parsed at the
/// end via parseKids.
/// \param storage Storage
/// \param object Structure element object
/// \param context Visited elements context
/// \param parent Parent structure tree item
/// \param root Structure tree root (must not be null)
PDFStructureItemPointer PDFStructureElement::parseElement(const PDFObjectStorage* storage,
                                                          PDFObject object,
                                                          PDFMarkedObjectsContext* context,
                                                          PDFStructureItem* parent,
                                                          PDFStructureTree* root)
{
    Q_ASSERT(root);

    PDFStructureItemPointer result;

    // Lock is held for the whole parsing of the element (including its kids)
    PDFMarkedObjectsLock guard(context, object);
    if (!guard)
    {
        return result;
    }

    const PDFDictionary* dictionary = storage->getDictionaryFromObject(object);
    if (!dictionary)
    {
        return result;
    }

    PDFStructureElement* element = new PDFStructureElement(parent, root);
    result.reset(element);

    if (object.isReference())
    {
        element->m_selfReference = object.getReference();
    }

    PDFDocumentDataLoaderDecorator loader(storage);
    element->m_typeName = loader.readNameFromDictionary(dictionary, "S");
    element->m_standardType = root->getTypeFromRole(element->m_typeName);
    element->m_id = loader.readStringFromDictionary(dictionary, "ID");
    element->m_references = loader.readReferenceArrayFromDictionary(dictionary, "Ref");
    element->m_pageReference = loader.readReferenceFromDictionary(dictionary, "Pg");

    std::vector<PDFStructureTreeAttribute> attributes;

    // Appends attributes of the class, whose name is stored in the name object
    auto appendClassAttributes = [root, &attributes](const PDFObject& nameObject)
    {
        QByteArray className = nameObject.getString();
        const std::vector<PDFStructureTreeAttribute>& classAttributes = root->getClassAttributes(className);
        attributes.insert(attributes.end(), classAttributes.begin(), classAttributes.end());
    };

    PDFObject classObject = storage->getObject(dictionary->get("C"));
    if (classObject.isName())
    {
        appendClassAttributes(classObject);
    }
    else if (classObject.isArray())
    {
        // Index of the first attribute, which has not been assigned a revision yet
        size_t firstWithoutRevision = attributes.size();

        for (const PDFObject& arrayEntry : *classObject.getArray())
        {
            PDFObject entry = storage->getObject(arrayEntry);

            if (entry.isInt())
            {
                // Integer is a revision number, it is applied to all attributes
                // appended since the previous revision number
                const PDFInteger revision = entry.getInteger();
                while (firstWithoutRevision < attributes.size())
                {
                    attributes[firstWithoutRevision].setRevision(revision);
                    ++firstWithoutRevision;
                }
            }
            else if (entry.isName())
            {
                // Name is a class name
                appendClassAttributes(entry);
            }
        }
    }

    // Attributes from the "A" entry are appended after the class attributes,
    // then the whole list is stored in reversed order
    PDFStructureTreeAttribute::parseAttributes(storage, dictionary->get("A"), attributes);
    std::reverse(attributes.begin(), attributes.end());
    element->m_attributes = qMove(attributes);

    element->m_revision = loader.readIntegerFromDictionary(dictionary, "R", 0);

    element->m_texts[Title] = loader.readTextStringFromDictionary(dictionary, "T", QString());
    element->m_texts[Language] = loader.readTextStringFromDictionary(dictionary, "Lang", QString());
    element->m_texts[AlternativeDescription] = loader.readTextStringFromDictionary(dictionary, "Alt", QString());
    element->m_texts[ExpandedForm] = loader.readTextStringFromDictionary(dictionary, "E", QString());
    element->m_texts[ActualText] = loader.readTextStringFromDictionary(dictionary, "ActualText", QString());
    element->m_texts[Phoneme] = loader.readTextStringFromDictionary(dictionary, "Phoneme", QString());

    element->m_associatedFiles = loader.readObjectList<PDFFileSpecification>(dictionary->get("AF"));
    element->m_namespace = loader.readReferenceFromDictionary(dictionary, "NS");
    element->m_phoneticAlphabet = loader.readNameFromDictionary(dictionary, "PhoneticAlphabet");

    parseKids(storage, element, dictionary, context);

    return result;
}
} // namespace pdf

View File

@ -211,6 +211,7 @@ private:
class PDFStructureTree;
class PDFStructureItem;
class PDFStructureElement;
using PDFStructureItemPointer = QSharedPointer<PDFStructureItem>;
@ -259,8 +260,12 @@ public:
virtual PDFStructureTree* asStructureTree() { return nullptr; }
virtual const PDFStructureTree* asStructureTree() const { return nullptr; }
virtual PDFStructureElement* asStructureElement() { return nullptr; }
virtual const PDFStructureElement* asStructureElement() const { return nullptr; }
const PDFStructureItem* getParent() const { return m_parent; }
const PDFStructureTree* getTree() const { return m_root; }
PDFObjectReference getSelfReference() const { return m_selfReference; }
std::size_t getChildCount() const { return m_children.size(); }
const PDFStructureItem* getChild(size_t i) const { return m_children.at(i).get(); }
@ -276,8 +281,20 @@ public:
static Type getTypeFromName(const QByteArray& name);
protected:
/// Parses kids of the item. Invalid items aren't added
/// to the kid list.
/// \param storage Storage
/// \param parentItem Parent item, where children are inserted
/// \param dictionary Dictionary
/// \param context Context
static void parseKids(const PDFObjectStorage* storage,
PDFStructureItem* parentItem,
const PDFDictionary* dictionary,
PDFMarkedObjectsContext* context);
PDFStructureItem* m_parent;
PDFStructureTree* m_root;
PDFObjectReference m_selfReference;
std::vector<PDFStructureItemPointer> m_children;
};
@ -366,6 +383,68 @@ private:
std::vector<PDFFileSpecification> m_associatedFiles;
};
/// Structure element. Stores data read from the structure element
/// dictionary: structure type, identifier, references, attributes,
/// text strings, associated files, namespace and phonetic alphabet.
/// Instances are created by function parseElement.
class PDFStructureElement : public PDFStructureItem
{
public:
    /// Constructs empty structure element
    /// \param parent Parent structure tree item
    /// \param root Structure tree root
    explicit inline PDFStructureElement(PDFStructureItem* parent, PDFStructureTree* root) :
        PDFStructureItem(parent, root)
    {

    }

    /// Text strings stored in the element. Values are used as indices
    /// for function getText.
    enum StringValue
    {
        Title,                  ///< Read from entry "T"
        Language,               ///< Read from entry "Lang"
        AlternativeDescription, ///< Read from entry "Alt"
        ExpandedForm,           ///< Read from entry "E"
        ActualText,             ///< Read from entry "ActualText"
        Phoneme,                ///< Read from entry "Phoneme"
        LastStringValue         ///< Count of text strings, not a valid index
    };

    virtual PDFStructureElement* asStructureElement() override { return this; }
    virtual const PDFStructureElement* asStructureElement() const override { return this; }

    /// Returns structure type name (entry "S")
    const QByteArray& getTypeName() const { return m_typeName; }

    /// Returns type, to which the structure tree root maps the type name
    Type getStandardType() const { return m_standardType; }

    /// Returns element identifier (entry "ID")
    const QByteArray& getId() const { return m_id; }

    /// Returns references stored in entry "Ref"
    const std::vector<PDFObjectReference>& getReferences() const { return m_references; }

    /// Returns reference stored in entry "Pg"
    const PDFObjectReference& getPageReference() const { return m_pageReference; }

    /// Returns attributes of the element (class attributes and attributes from entry "A")
    const std::vector<PDFStructureTreeAttribute>& getAttributes() const { return m_attributes; }

    /// Returns revision number (entry "R", zero if the entry is missing)
    PDFInteger getRevision() const { return m_revision; }

    /// Returns text string of given kind
    /// \param stringValue Kind of the text, must be less than LastStringValue
    const QString& getText(StringValue stringValue) const { return m_texts.at(stringValue); }

    /// Returns associated files (entry "AF")
    const std::vector<PDFFileSpecification>& getAssociatedFiles() const { return m_associatedFiles; }

    /// Returns reference stored in entry "NS"
    const PDFObjectReference& getNamespace() const { return m_namespace; }

    /// Returns phonetic alphabet name (entry "PhoneticAlphabet")
    const QByteArray& getPhoneticAlphabet() const { return m_phoneticAlphabet; }

    /// Parses structure element from the object. If error occurs, nullptr is returned.
    /// \param storage Storage
    /// \param object Structure element object
    /// \param context Visited elements context
    /// \param parent Parent structure tree item
    /// \param root Structure tree root
    static PDFStructureItemPointer parseElement(const PDFObjectStorage* storage,
                                                PDFObject object,
                                                PDFMarkedObjectsContext* context,
                                                PDFStructureItem* parent,
                                                PDFStructureTree* root);

private:
    QByteArray m_typeName;

    // Initialized explicitly, because the constructor is public and doesn't set
    // the type; without the initializer getStandardType() would read an
    // indeterminate value for elements not created by parseElement.
    Type m_standardType = Invalid;

    QByteArray m_id;
    std::vector<PDFObjectReference> m_references;
    PDFObjectReference m_pageReference;
    std::vector<PDFStructureTreeAttribute> m_attributes;
    PDFInteger m_revision = 0;
    std::array<QString, LastStringValue> m_texts;
    std::vector<PDFFileSpecification> m_associatedFiles;
    PDFObjectReference m_namespace;
    QByteArray m_phoneticAlphabet;
};
} // namespace pdf
#endif // PDFSTRUCTURETREE_H