// Copyright (C) 2020-2022 Jakub Melka // // This file is part of PDF4QT. // // PDF4QT is free software: you can redistribute it and/or modify // it under the terms of the GNU Lesser General Public License as published by // the Free Software Foundation, either version 3 of the License, or // with the written consent of the copyright owner, any later version. // // PDF4QT is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public License // along with PDF4QT. If not, see . #include "pdfobjectutils.h" #include "pdfvisitor.h" #include "pdfexecutionpolicy.h" #include "pdfdocumentwriter.h" #include "pdfdbgheap.h" namespace pdf { class PDFCollectReferencesVisitor : public PDFAbstractVisitor { public: explicit PDFCollectReferencesVisitor(std::set& references) : m_references(references) { } virtual void visitArray(const PDFArray* array) override; virtual void visitDictionary(const PDFDictionary* dictionary) override; virtual void visitStream(const PDFStream* stream) override; virtual void visitReference(const PDFObjectReference reference) override; private: std::set& m_references; }; void PDFCollectReferencesVisitor::visitArray(const PDFArray* array) { acceptArray(array); } void PDFCollectReferencesVisitor::visitDictionary(const PDFDictionary* dictionary) { acceptDictionary(dictionary); } void PDFCollectReferencesVisitor::visitStream(const PDFStream* stream) { acceptStream(stream); } void PDFCollectReferencesVisitor::visitReference(const PDFObjectReference reference) { m_references.insert(reference); } class PDFReplaceReferencesVisitor : public PDFAbstractVisitor { public: explicit PDFReplaceReferencesVisitor(const std::map& replacements) : m_replacements(replacements) { m_objectStack.reserve(32); } virtual void visitNull() override; virtual void visitBool(bool value) override; virtual void visitInt(PDFInteger value) override; virtual void visitReal(PDFReal value) override; virtual void visitString(PDFStringRef string) override; virtual void visitName(PDFStringRef name) override; virtual void visitArray(const PDFArray* array) override; virtual void visitDictionary(const PDFDictionary* dictionary) override; virtual void visitStream(const PDFStream* stream) override; virtual void visitReference(const PDFObjectReference reference) override; PDFObject getObject(); private: const std::map& m_replacements; std::vector m_objectStack; }; void PDFReplaceReferencesVisitor::visitNull() { m_objectStack.push_back(PDFObject::createNull()); } void PDFReplaceReferencesVisitor::visitBool(bool value) { m_objectStack.push_back(PDFObject::createBool(value)); } void PDFReplaceReferencesVisitor::visitInt(PDFInteger value) { m_objectStack.push_back(PDFObject::createInteger(value)); } void PDFReplaceReferencesVisitor::visitReal(PDFReal value) { m_objectStack.push_back(PDFObject::createReal(value)); } void PDFReplaceReferencesVisitor::visitString(PDFStringRef string) { m_objectStack.push_back(PDFObject::createString(string)); } void PDFReplaceReferencesVisitor::visitName(PDFStringRef name) { m_objectStack.push_back(PDFObject::createName(name)); } void PDFReplaceReferencesVisitor::visitArray(const PDFArray* array) { acceptArray(array); // We have all objects on the stack Q_ASSERT(array->getCount() <= m_objectStack.size()); auto it = std::next(m_objectStack.cbegin(), m_objectStack.size() - array->getCount()); std::vector objects(it, m_objectStack.cend()); PDFObject object = PDFObject::createArray(std::make_shared(qMove(objects))); m_objectStack.erase(it, m_objectStack.cend()); m_objectStack.push_back(object); } void PDFReplaceReferencesVisitor::visitDictionary(const PDFDictionary* dictionary) { Q_ASSERT(dictionary); std::vector entries; entries.reserve(dictionary->getCount()); for (size_t i = 0, count = dictionary->getCount(); i < count; ++i) { dictionary->getValue(i).accept(this); entries.emplace_back(dictionary->getKey(i), m_objectStack.back()); m_objectStack.pop_back(); } m_objectStack.push_back(PDFObject::createDictionary(std::make_shared(qMove(entries)))); } void PDFReplaceReferencesVisitor::visitStream(const PDFStream* stream) { // Replace references in the dictionary visitDictionary(stream->getDictionary()); PDFObject dictionaryObject = m_objectStack.back(); m_objectStack.pop_back(); m_objectStack.push_back(PDFObject::createStream(std::make_shared(PDFDictionary(*dictionaryObject.getDictionary()), QByteArray(*stream->getContent())))); } void PDFReplaceReferencesVisitor::visitReference(const PDFObjectReference reference) { auto it = m_replacements.find(reference); if (it != m_replacements.cend()) { // Replace the reference m_objectStack.push_back(PDFObject::createReference(it->second)); } else { // Preserve old reference m_objectStack.push_back(PDFObject::createReference(reference)); } } PDFObject PDFReplaceReferencesVisitor::getObject() { Q_ASSERT(m_objectStack.size() == 1); return qMove(m_objectStack.back()); } std::set PDFObjectUtils::getReferences(const std::vector& objects, const PDFObjectStorage& storage) { std::set references; { PDFCollectReferencesVisitor collectReferencesVisitor(references); for (const PDFObject& object : objects) { object.accept(&collectReferencesVisitor); } } // Iterative algorihm, which adds additional references from referenced objects. // If new reference is added, then we must also check, that all referenced objects // from this object are added. std::set workSet = references; while (!workSet.empty()) { std::set addedReferences; PDFCollectReferencesVisitor collectReferencesVisitor(addedReferences); for (const PDFObjectReference& objectReference : workSet) { storage.getObject(objectReference).accept(&collectReferencesVisitor); } workSet.clear(); std::set_difference(addedReferences.cbegin(), addedReferences.cend(), references.cbegin(), references.cend(), std::inserter(workSet, workSet.cend())); references.merge(addedReferences); } return references; } std::set PDFObjectUtils::getDirectReferences(const PDFObject& object) { std::set references; PDFCollectReferencesVisitor collectReferencesVisitor(references); object.accept(&collectReferencesVisitor); return references; } PDFObject PDFObjectUtils::replaceReferences(const PDFObject& object, const std::map& referenceMapping) { PDFReplaceReferencesVisitor replaceReferencesVisitor(referenceMapping); object.accept(&replaceReferencesVisitor); return replaceReferencesVisitor.getObject(); } QString PDFObjectUtils::getObjectTypeName(PDFObject::Type type) { switch (type) { case pdf::PDFObject::Type::Null: return PDFTranslationContext::tr("Null"); case pdf::PDFObject::Type::Bool: return PDFTranslationContext::tr("Boolean"); case pdf::PDFObject::Type::Int: return PDFTranslationContext::tr("Integer"); case pdf::PDFObject::Type::Real: return PDFTranslationContext::tr("Real"); case pdf::PDFObject::Type::String: return PDFTranslationContext::tr("String"); case pdf::PDFObject::Type::Name: return PDFTranslationContext::tr("Name"); case pdf::PDFObject::Type::Array: return PDFTranslationContext::tr("Array"); case pdf::PDFObject::Type::Dictionary: return PDFTranslationContext::tr("Dictionary"); case pdf::PDFObject::Type::Stream: return PDFTranslationContext::tr("Stream"); case pdf::PDFObject::Type::Reference: return PDFTranslationContext::tr("Reference"); default: Q_ASSERT(false); break; } return QString(); } void PDFObjectClassifier::classify(const PDFDocument* document) { // Clear old classification, if it exist m_classification.clear(); m_allTypesUsed = None; if (!document) { return; } PDFDocumentDataLoaderDecorator loader(document); const PDFObjectStorage& storage = document->getStorage(); const PDFObjectStorage::PDFObjects& objects = storage.getObjects(); m_classification.resize(objects.size(), Classification()); for (size_t i = 0; i < objects.size(); ++i) { PDFObjectReference reference(i, objects[i].generation); m_classification[i].reference = reference; } // First, iterate trough pages of the document const PDFCatalog* catalog = document->getCatalog(); const size_t pageCount = catalog->getPageCount(); for (size_t i = 0; i < pageCount; ++i) { const PDFPage* page = catalog->getPage(i); if (!page) { continue; } // Handle page itself if (hasObject(page->getPageReference())) { mark(page->getPageReference(), Page); } // Handle annotations for (const PDFObjectReference& reference : page->getAnnotations()) { if (hasObject(reference)) { mark(reference, Annotation); } } // Handle contents PDFObject pageObject = document->getObjectByReference(page->getPageReference()); Q_ASSERT(pageObject.isDictionary()); const PDFDictionary* dictionary = pageObject.getDictionary(); const PDFObject& contentsObject = dictionary->get("Contents"); if (contentsObject.isReference()) { mark(contentsObject.getReference(), ContentStream); } // Handle resources if (const PDFDictionary* resourcesDictionary = document->getDictionaryFromObject(dictionary->get("Resources"))) { markDictionary(document, resourcesDictionary->get("ExtGState"), GraphicState); markDictionary(document, resourcesDictionary->get("ColorSpace"), ColorSpace); markDictionary(document, resourcesDictionary->get("Pattern"), Pattern); markDictionary(document, resourcesDictionary->get("Shading"), Shading); markDictionary(document, resourcesDictionary->get("Font"), Font); if (const PDFDictionary* xobjectDictionary = document->getDictionaryFromObject(resourcesDictionary->get("XObject"))) { const size_t count = xobjectDictionary->getCount(); for (size_t i = 0; i < count; ++i) { const PDFObject& item = xobjectDictionary->getValue(i); if (item.isReference() && hasObject(item.getReference())) { if (const PDFDictionary* xobjectItemDictionary = document->getDictionaryFromObject(item)) { QByteArray subtype = loader.readNameFromDictionary(xobjectItemDictionary, "Subtype"); if (subtype == "Image") { mark(item.getReference(), Image); } else if (subtype == "Form") { mark(item.getReference(), Form); } } } } } } } for (Classification& classification : m_classification) { if (const PDFDictionary* dictionary = document->getDictionaryFromObject(document->getObjectByReference(classification.reference))) { QByteArray typeName = loader.readNameFromDictionary(dictionary, "Type"); if (typeName == "Action") { classification.types.setFlag(Action); } } } for (const Classification& classification : m_classification) { m_allTypesUsed |= classification.types; } } bool PDFObjectClassifier::hasObject(PDFObjectReference reference) const { return reference.isValid() && reference.objectNumber < PDFInteger(m_classification.size()) && m_classification[reference.objectNumber].reference == reference; } std::vector PDFObjectClassifier::getObjectsByType(Type type) const { std::vector result; for (const Classification& classification : m_classification) { if (classification.types.testFlag(type)) { result.push_back(classification.reference); } } return result; } PDFObjectClassifier::Statistics PDFObjectClassifier::calculateStatistics(const PDFDocument* document) const { Statistics result; // Jakub Melka: prepare statistics map result.statistics[None]; for (uint i = 0; i < 32; ++i) { uint32_t mask = 1 << i; if (m_allTypesUsed & mask) { result.statistics[Type(mask)]; } } auto processEntry = [document, &result](const Classification& entry) { const PDFObject& object = document->getObjectByReference(entry.reference); if (object.isNull()) { return; } Type type = Type(uint32_t(entry.types)); if (!result.statistics.count(type)) { type = None; } Q_ASSERT(result.statistics.count(type)); const qint64 objectSize = PDFDocumentWriter::getObjectSize(document, entry.reference); StatisticsItem& statisticsItem = result.statistics.at(type); statisticsItem.count.fetch_add(1); statisticsItem.bytes.fetch_add(objectSize); }; PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Unknown, m_classification.cbegin(), m_classification.cend(), processEntry); PDFStatisticsCollector collector; PDFApplyVisitor(*document, &collector); for (PDFObject::Type objectType : PDFObject::getTypes()) { result.objectCountByType[size_t(objectType)] = collector.getObjectCount(objectType); } return result; } void PDFObjectClassifier::mark(PDFObjectReference reference, Type type) { Q_ASSERT(hasObject(reference)); m_classification[reference.objectNumber].types.setFlag(type, true); } void PDFObjectClassifier::markDictionary(const PDFDocument* document, PDFObject object, Type type) { if (const PDFDictionary* dictionary = document->getDictionaryFromObject(object)) { const size_t count = dictionary->getCount(); for (size_t i = 0; i < count; ++i) { const PDFObject& item = dictionary->getValue(i); if (item.isReference() && hasObject(item.getReference())) { mark(item.getReference(), type); } } } } } // namespace pdf