Catalog update

This commit is contained in:
Jakub Melka 2020-07-26 18:40:42 +02:00
parent 5d0b485d4e
commit e734946ae9
4 changed files with 256 additions and 7 deletions

View File

@ -207,6 +207,11 @@ PDFCatalog PDFCatalog::parse(const PDFObject& catalog, const PDFDocument* docume
catalogObject.m_markInfoFlags.setFlag(MarkInfo_Suspects, loader.readBooleanFromDictionary(markInfoDictionary, "Suspects", false));
}
catalogObject.m_structureTreeRoot = catalogDictionary->get("StructTreeRoot");
catalogObject.m_language = loader.readTextStringFromDictionary(catalogDictionary, "Lang", QString());
catalogObject.m_webCaptureInfo = PDFWebCaptureInfo::parse(catalogDictionary->get("SpiderInfo"), &document->getStorage());
catalogObject.m_outputIntents = loader.readObjectList<PDFOutputIntent>(catalogDictionary->get("OutputIntents"));
return catalogObject;
}
@ -766,4 +771,57 @@ PDFArticleThread PDFArticleThread::parse(const PDFObjectStorage* storage, const
return result;
}
/// Reads web capture information from the given object. If the storage does not
/// provide a dictionary for the object, a default-constructed info is returned.
/// \param object Web capture info (the catalog passes its "SpiderInfo" entry)
/// \param storage Storage used to obtain the dictionary and to read its entries
PDFWebCaptureInfo PDFWebCaptureInfo::parse(const PDFObject& object, const PDFObjectStorage* storage)
{
    PDFWebCaptureInfo info;

    const PDFDictionary* spiderInfo = storage->getDictionaryFromObject(object);
    if (!spiderInfo)
    {
        // No dictionary available, hand back the empty info
        return info;
    }

    PDFDocumentDataLoaderDecorator reader(storage);

    // NOTE(review): "V" is read as a name here; the PDF specification appears to
    // define the web capture version as a number - confirm this is intended.
    info.m_version = reader.readNameFromDictionary(spiderInfo, "V");
    info.m_commands = reader.readReferenceArrayFromDictionary(spiderInfo, "C");

    return info;
}
/// Reads a single output intent from the given object. If the storage does not
/// provide a dictionary for the object, a default-constructed intent is returned.
/// \param storage Storage used to obtain the dictionary and to read its entries
/// \param object Output intent dictionary object
PDFOutputIntent PDFOutputIntent::parse(const PDFObjectStorage* storage, const PDFObject& object)
{
    PDFOutputIntent intent;

    const PDFDictionary* intentDictionary = storage->getDictionaryFromObject(object);
    if (!intentDictionary)
    {
        // No dictionary available, hand back the empty intent
        return intent;
    }

    PDFDocumentDataLoaderDecorator reader(storage);

    // Simple entries - a name and several text strings (missing text strings become empty)
    intent.m_subtype = reader.readNameFromDictionary(intentDictionary, "S");
    intent.m_outputCondition = reader.readTextStringFromDictionary(intentDictionary, "OutputCondition", QString());
    intent.m_outputConditionIdentifier = reader.readTextStringFromDictionary(intentDictionary, "OutputConditionIdentifier", QString());
    intent.m_registryName = reader.readTextStringFromDictionary(intentDictionary, "RegistryName", QString());
    intent.m_info = reader.readTextStringFromDictionary(intentDictionary, "Info", QString());

    // Entries kept as raw objects, except the profile reference, which has its own parser
    intent.m_destOutputProfile = intentDictionary->get("DestOutputProfile");
    intent.m_destOutputProfileRef = PDFOutputIntentICCProfileInfo::parse(intentDictionary->get("DestOutputProfileRef"), storage);
    intent.m_mixingHints = intentDictionary->get("MixingHints");
    intent.m_spectralData = intentDictionary->get("SpectralData");

    return intent;
}
/// Reads ICC profile information from the given object. If the storage does not
/// provide a dictionary for the object, a default-constructed info is returned.
/// \param object Profile info object (the output intent passes its "DestOutputProfileRef" entry)
/// \param storage Storage used to obtain the dictionary and to read its entries
PDFOutputIntentICCProfileInfo PDFOutputIntentICCProfileInfo::parse(const PDFObject& object, const PDFObjectStorage* storage)
{
    PDFOutputIntentICCProfileInfo profileInfo;

    const PDFDictionary* profileDictionary = storage->getDictionaryFromObject(object);
    if (!profileDictionary)
    {
        // No dictionary available, hand back the empty info
        return profileInfo;
    }

    PDFDocumentDataLoaderDecorator reader(storage);

    profileInfo.m_checkSum = reader.readStringFromDictionary(profileDictionary, "CheckSum");
    profileInfo.m_colorants = reader.readNameArrayFromDictionary(profileDictionary, "ColorantTable");
    profileInfo.m_iccVersion = reader.readStringFromDictionary(profileDictionary, "ICCVersion");
    profileInfo.m_signature = reader.readStringFromDictionary(profileDictionary, "ProfileCS");
    profileInfo.m_profileName = reader.readTextStringFromDictionary(profileDictionary, "ProfileName", QString());

    // The "URLs" entry is stored as a raw object, it is not interpreted here
    profileInfo.m_urls = profileDictionary->get("URLs");

    return profileInfo;
}
} // namespace pdf

View File

@ -322,6 +322,87 @@ private:
Extensions m_extensions;
};
/// Web capture info. The catalog fills it from its "SpiderInfo" entry.
class PDFFORQTLIBSHARED_EXPORT PDFWebCaptureInfo
{
public:
explicit PDFWebCaptureInfo() = default;
/// Returns the value read from the "V" entry of the dictionary
const QByteArray& getVersion() const { return m_version; }
/// Returns the references read from the "C" entry of the dictionary
const std::vector<PDFObjectReference>& getCommands() const { return m_commands; }
/// Parses web capture info from catalog dictionary. If object cannot be parsed, or error occurs,
/// then empty object is returned, no exception is thrown.
/// \param object Spider info dictionary
/// \param storage Storage
static PDFWebCaptureInfo parse(const PDFObject& object, const PDFObjectStorage* storage);
private:
// Value of the "V" entry, read as a name
QByteArray m_version;
// Object references from the "C" array
std::vector<PDFObjectReference> m_commands;
};
/// Information about the ICC profile of an output intent. The output intent
/// fills it from its "DestOutputProfileRef" entry.
class PDFFORQTLIBSHARED_EXPORT PDFOutputIntentICCProfileInfo
{
public:
explicit PDFOutputIntentICCProfileInfo() = default;
/// Returns the string read from the "CheckSum" entry
const QByteArray& getChecksum() const { return m_checkSum; }
/// Returns the names read from the "ColorantTable" entry
const std::vector<QByteArray>& getColorants() const { return m_colorants; }
/// Returns the string read from the "ICCVersion" entry
const QByteArray& getIccVersion() const { return m_iccVersion; }
/// Returns the string read from the "ProfileCS" entry
const QByteArray& getSignature() const { return m_signature; }
/// Returns the text string read from the "ProfileName" entry
const QString& getProfileName() const { return m_profileName; }
/// Returns the unprocessed "URLs" entry
const PDFObject& getUrls() const { return m_urls; }
/// Parses icc profile info from object. If object cannot be parsed, or error occurs,
/// then empty object is returned, no exception is thrown.
/// \param object Output profile reference dictionary (the "DestOutputProfileRef" entry of an output intent)
/// \param storage Storage
static PDFOutputIntentICCProfileInfo parse(const PDFObject& object, const PDFObjectStorage* storage);
private:
QByteArray m_checkSum;
std::vector<QByteArray> m_colorants;
QByteArray m_iccVersion;
// Filled from the "ProfileCS" entry
QByteArray m_signature;
QString m_profileName;
// Raw "URLs" entry, stored without interpretation
PDFObject m_urls;
};
/// Output intent. The catalog reads a list of these from its "OutputIntents" entry.
class PDFFORQTLIBSHARED_EXPORT PDFOutputIntent
{
public:
explicit PDFOutputIntent() = default;
/// Returns the name read from the "S" entry
const QByteArray& getSubtype() const { return m_subtype; }
/// Returns the text string read from the "OutputCondition" entry
const QString& getOutputCondition() const { return m_outputCondition; }
/// Returns the text string read from the "OutputConditionIdentifier" entry
const QString& getOutputConditionIdentifier() const { return m_outputConditionIdentifier; }
/// Returns the text string read from the "RegistryName" entry
const QString& getRegistryName() const { return m_registryName; }
/// Returns the text string read from the "Info" entry
const QString& getInfo() const { return m_info; }
/// Returns the unprocessed "DestOutputProfile" entry
const PDFObject& getOutputProfile() const { return m_destOutputProfile; }
/// Returns the profile info parsed from the "DestOutputProfileRef" entry
const PDFOutputIntentICCProfileInfo& getOutputProfileInfo() const { return m_destOutputProfileRef; }
/// Returns the unprocessed "MixingHints" entry
const PDFObject& getMixingHints() const { return m_mixingHints; }
/// Returns the unprocessed "SpectralData" entry
const PDFObject& getSpectralData() const { return m_spectralData; }
/// Parses output intent from object. If object cannot be parsed, or error occurs,
/// then empty object is returned, no exception is thrown.
/// \param storage Storage
/// \param object Output intent dictionary
static PDFOutputIntent parse(const PDFObjectStorage* storage, const PDFObject& object);
private:
QByteArray m_subtype;
QString m_outputCondition;
QString m_outputConditionIdentifier;
QString m_registryName;
QString m_info;
// Raw "DestOutputProfile" entry, stored without interpretation
PDFObject m_destOutputProfile;
PDFOutputIntentICCProfileInfo m_destOutputProfileRef;
// Raw "MixingHints" and "SpectralData" entries, stored without interpretation
PDFObject m_mixingHints;
PDFObject m_spectralData;
};
class PDFFORQTLIBSHARED_EXPORT PDFCatalog
{
public:
@ -379,6 +460,10 @@ public:
const std::vector<PDFArticleThread>& getArticleThreads() const { return m_threads; }
const PDFAction* getDocumentAction(DocumentAction action) const { return m_documentActions.at(action).get(); }
const PDFObject& getMetadata() const { return m_metadata; }
const PDFObject& getStructureTreeRoot() const { return m_structureTreeRoot; }
const QString& getLanguage() const { return m_language; }
const PDFWebCaptureInfo& getWebCaptureInfo() const { return m_webCaptureInfo; }
const std::vector<PDFOutputIntent>& getOutputIntents() const { return m_outputIntents; }
/// Is document marked to have structure tree conforming to tagged document convention?
bool isLogicalStructureMarked() const { return m_markInfoFlags.testFlag(MarkInfo_Marked); }
@ -422,11 +507,15 @@ private:
PageMode m_pageMode = PageMode::UseNone;
QByteArray m_baseURI;
PDFObject m_formObject;
PDFObject m_structureTreeRoot;
PDFDeveloperExtensions m_extensions;
PDFDocumentSecurityStore m_documentSecurityStore;
std::vector<PDFArticleThread> m_threads;
PDFObject m_metadata;
MarkInfoFlags m_markInfoFlags = MarkInfo_None;
QString m_language;
PDFWebCaptureInfo m_webCaptureInfo;
std::vector<PDFOutputIntent> m_outputIntents;
// Maps from Names dictionary
std::map<QByteArray, PDFDestination> m_destinations;

View File

@ -45,6 +45,12 @@ struct PDFStructureTreeAttributeDefinition
/// \param name Attribute name
static const PDFStructureTreeAttributeDefinition* getDefinition(const QByteArray& name);
/// Returns attribute definition for given attribute type. This function
/// always returns valid pointer. For unknown attribute, it returns
/// user attribute definition.
/// \param type Attribute type
static const PDFStructureTreeAttributeDefinition* getDefinition(PDFStructureTreeAttribute::Attribute type);
/// Returns owner from string. If owner is not valid, then invalid
/// owner is returned.
/// \param string String
@ -214,6 +220,20 @@ const PDFStructureTreeAttributeDefinition* PDFStructureTreeAttributeDefinition::
return &s_attributeDefinitions.front();
}
/// Looks up the attribute definition with the given type. If no definition has
/// this type, the first definition of the table is returned instead (asserted
/// to be the user attribute definition), so the result is never nullptr.
/// \param type Attribute type
const PDFStructureTreeAttributeDefinition* PDFStructureTreeAttributeDefinition::getDefinition(PDFStructureTreeAttribute::Attribute type)
{
    const PDFStructureTreeAttributeDefinition* found = nullptr;

    for (const PDFStructureTreeAttributeDefinition& candidate : s_attributeDefinitions)
    {
        if (candidate.type == type)
        {
            found = &candidate;
            break;
        }
    }

    if (!found)
    {
        // Fallback - the table is expected to start with the user attribute definition
        Q_ASSERT(s_attributeDefinitions.front().type == PDFStructureTreeAttribute::Attribute::User);
        found = &s_attributeDefinitions.front();
    }

    return found;
}
PDFStructureTreeAttribute::Owner PDFStructureTreeAttributeDefinition::getOwnerFromString(const QByteArray& string)
{
for (const auto& item : s_ownerDefinitions)
@ -512,7 +532,7 @@ PDFStructureTree PDFStructureTree::parse(const PDFObjectStorage* storage, PDFObj
return tree;
}
PDFStructureItemPointer PDFStructureItem::parse(const PDFObjectStorage* storage, PDFObject object, PDFMarkedObjectsContext* context)
PDFStructureItemPointer PDFStructureItem::parse(const PDFObjectStorage* storage, PDFObject object, PDFMarkedObjectsContext* context, PDFStructureItem* parent)
{
if (const PDFDictionary* dictionary = storage->getDictionaryFromObject(object))
{
@ -521,15 +541,15 @@ PDFStructureItemPointer PDFStructureItem::parse(const PDFObjectStorage* storage,
if (typeName == "MCR")
{
return PDFStructureMarkedContentReference::parse(storage, object, context);
return PDFStructureMarkedContentReference::parseMarkedContentReference(storage, object, context, parent, parent->getTree());
}
else if (typeName == "OBJR")
{
return PDFStructureObjectReference::parse(storage, object, context);
return PDFStructureObjectReference::parseObjectReference(storage, object, context, parent, parent->getTree());
}
else
{
return PDFStructureElement::parse(storage, object, context);
return PDFStructureElement::parseElement(storage, object, context, parent, parent->getTree());
}
}
@ -557,7 +577,7 @@ void PDFStructureItem::parseKids(const PDFObjectStorage* storage, PDFStructureIt
const PDFArray* kidsArray = kids.getArray();
for (const PDFObject& object : *kidsArray)
{
PDFStructureItemPointer item = PDFStructureItem::parse(storage, object, context);
PDFStructureItemPointer item = PDFStructureItem::parse(storage, object, context, parentItem);
if (item)
{
parentItem->m_children.emplace_back(qMove(item));
@ -566,7 +586,7 @@ void PDFStructureItem::parseKids(const PDFObjectStorage* storage, PDFStructureIt
}
else if (!kids.isNull())
{
PDFStructureItemPointer item = PDFStructureItem::parse(storage, kids, context);
PDFStructureItemPointer item = PDFStructureItem::parse(storage, kids, context, parentItem);
if (item)
{
parentItem->m_children.emplace_back(qMove(item));
@ -595,6 +615,56 @@ PDFStructureTreeNamespace PDFStructureTreeNamespace::parse(const PDFObjectStorag
return result;
}
/// Finds an attribute of the given type. The search is first performed with the
/// requested owner; if it yields nothing and a specific owner was requested, the
/// search is repeated with any owner accepted.
/// \param attribute Attribute
/// \param owner Owner (AttributeOwner::Invalid accepts any owner)
/// \param policy Revision number policy
/// \returns Found attribute, or nullptr, if there is none
const PDFStructureTreeAttribute* PDFStructureElement::findAttribute(Attribute attribute,
                                                                    AttributeOwner owner,
                                                                    RevisionPolicy policy) const
{
    const PDFStructureTreeAttributeDefinition* definition = PDFStructureTreeAttributeDefinition::getDefinition(attribute);

    const PDFStructureTreeAttribute* found = findAttributeImpl(attribute, owner, policy, definition);
    if (!found && owner != AttributeOwner::Invalid)
    {
        // Second pass - owner is no longer restricted
        found = findAttributeImpl(attribute, AttributeOwner::Invalid, policy, definition);
    }

    return found;
}
/// Searches the attributes of this element for an attribute of the given type,
/// which matches the owner (any owner, if owner is AttributeOwner::Invalid) and
/// the revision policy. If nothing is found and the definition marks the attribute
/// as inheritable, the search continues in the parent structure element.
/// User attributes are never searched.
/// \param attribute Attribute
/// \param owner Owner
/// \param policy Revision number policy
/// \param definition Definition of the attribute
/// \returns Found attribute, or nullptr, if there is none
const PDFStructureTreeAttribute* PDFStructureElement::findAttributeImpl(Attribute attribute,
                                                                        AttributeOwner owner,
                                                                        RevisionPolicy policy,
                                                                        const PDFStructureTreeAttributeDefinition* definition) const
{
    if (attribute == Attribute::User)
    {
        // User properties are not searched
        return nullptr;
    }

    // Look into the attribute list of this element first
    for (const PDFStructureTreeAttribute& candidate : m_attributes)
    {
        if (candidate.getType() != attribute)
        {
            continue;
        }

        const bool isOwnerAccepted = candidate.getOwner() == owner || owner == AttributeOwner::Invalid;
        if (!isOwnerAccepted)
        {
            continue;
        }

        const bool isRevisionAccepted = candidate.getRevision() == m_revision || policy == RevisionPolicy::Ignore;
        if (isRevisionAccepted)
        {
            return &candidate;
        }
    }

    // Only inheritable attributes are looked up in the parent
    if (!definition->inheritable || !m_parent)
    {
        return nullptr;
    }

    const auto* parentElement = m_parent->asStructureElement();
    if (!parentElement)
    {
        return nullptr;
    }

    return parentElement->findAttributeImpl(attribute, owner, policy, definition);
}
PDFStructureItemPointer PDFStructureElement::parseElement(const PDFObjectStorage* storage,
PDFObject object,
PDFMarkedObjectsContext* context,

View File

@ -272,7 +272,9 @@ public:
virtual const PDFStructureObjectReference* asStructureObjectReference() const { return nullptr; }
const PDFStructureItem* getParent() const { return m_parent; }
PDFStructureItem* getParent() { return m_parent; }
const PDFStructureTree* getTree() const { return m_root; }
PDFStructureTree* getTree() { return m_root; }
PDFObjectReference getSelfReference() const { return m_selfReference; }
std::size_t getChildCount() const { return m_children.size(); }
const PDFStructureItem* getChild(size_t i) const { return m_children.at(i).get(); }
@ -282,7 +284,8 @@ public:
/// \param storage Storage
/// \param object Structure tree item object
/// \param context Parsing context
static PDFStructureItemPointer parse(const PDFObjectStorage* storage, PDFObject object, PDFMarkedObjectsContext* context);
/// \param parent Parent item
static PDFStructureItemPointer parse(const PDFObjectStorage* storage, PDFObject object, PDFMarkedObjectsContext* context, PDFStructureItem* parent);
/// Get structure tree type from name
/// \param name Name
@ -427,6 +430,23 @@ public:
const PDFObjectReference& getNamespace() const { return m_namespace; }
const QByteArray& getPhoneticAlphabet() const { return m_phoneticAlphabet; }
/// Controls how the attribute revision number is treated by findAttribute
enum class RevisionPolicy
{
Ignore, ///< Revision number of the attribute is not compared
Match ///< Revision number of the attribute must be equal to the revision of the searched element
};
using Attribute = PDFStructureTreeAttribute::Attribute;
using AttributeOwner = PDFStructureTreeAttribute::Owner;
/// Finds attribute matching given owner and revision policy. If attribute with given
/// owner is not found, then any matching attribute is returned. If none is found,
/// then nullptr is returned.
/// \param attribute Attribute
/// \param owner Owner
/// \param policy Revision number policy
const PDFStructureTreeAttribute* findAttribute(Attribute attribute, AttributeOwner owner, RevisionPolicy policy) const;
/// Parses structure element from the object. If error occurs, nullptr is returned.
/// \param storage Storage
/// \param object Structure element object
@ -440,6 +460,18 @@ public:
PDFStructureTree* root);
private:
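/// Searches attributes of this element for attribute matching given owner (any owner,
/// if owner is invalid) and revision policy. If attribute is not found and it is
/// inheritable, then parent structure element is searched. User attributes are not
/// searched. If none is found, then nullptr is returned.
/// \param attribute Attribute
/// \param owner Owner
/// \param policy Revision number policy
/// \param definition Attribute definition (used to check, if attribute is inheritable)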
const PDFStructureTreeAttribute* findAttributeImpl(Attribute attribute,
AttributeOwner owner,
RevisionPolicy policy,
const PDFStructureTreeAttributeDefinition* definition) const;
QByteArray m_typeName;
Type m_standardType;
QByteArray m_id;