mirror of https://github.com/JakubMelka/PDF4QT.git
486 lines
15 KiB
C++
486 lines
15 KiB
C++
// Copyright (C) 2020-2022 Jakub Melka
|
|
//
|
|
// This file is part of PDF4QT.
|
|
//
|
|
// PDF4QT is free software: you can redistribute it and/or modify
|
|
// it under the terms of the GNU Lesser General Public License as published by
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
// with the written consent of the copyright owner, any later version.
|
|
//
|
|
// PDF4QT is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU Lesser General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU Lesser General Public License
|
|
// along with PDF4QT. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
#include "pdfobjectutils.h"
|
|
#include "pdfvisitor.h"
|
|
#include "pdfexecutionpolicy.h"
|
|
#include "pdfdocumentwriter.h"
|
|
#include "pdfdbgheap.h"
|
|
|
|
namespace pdf
|
|
{
|
|
|
|
class PDFCollectReferencesVisitor : public PDFAbstractVisitor
|
|
{
|
|
public:
|
|
explicit PDFCollectReferencesVisitor(std::set<PDFObjectReference>& references) :
|
|
m_references(references)
|
|
{
|
|
|
|
}
|
|
|
|
virtual void visitArray(const PDFArray* array) override;
|
|
virtual void visitDictionary(const PDFDictionary* dictionary) override;
|
|
virtual void visitStream(const PDFStream* stream) override;
|
|
virtual void visitReference(const PDFObjectReference reference) override;
|
|
|
|
private:
|
|
std::set<PDFObjectReference>& m_references;
|
|
};
|
|
|
|
void PDFCollectReferencesVisitor::visitArray(const PDFArray* array)
|
|
{
|
|
acceptArray(array);
|
|
}
|
|
|
|
void PDFCollectReferencesVisitor::visitDictionary(const PDFDictionary* dictionary)
|
|
{
|
|
acceptDictionary(dictionary);
|
|
}
|
|
|
|
void PDFCollectReferencesVisitor::visitStream(const PDFStream* stream)
|
|
{
|
|
acceptStream(stream);
|
|
}
|
|
|
|
void PDFCollectReferencesVisitor::visitReference(const PDFObjectReference reference)
|
|
{
|
|
m_references.insert(reference);
|
|
}
|
|
|
|
class PDFReplaceReferencesVisitor : public PDFAbstractVisitor
|
|
{
|
|
public:
|
|
explicit PDFReplaceReferencesVisitor(const std::map<PDFObjectReference, PDFObjectReference>& replacements) :
|
|
m_replacements(replacements)
|
|
{
|
|
m_objectStack.reserve(32);
|
|
}
|
|
|
|
virtual void visitNull() override;
|
|
virtual void visitBool(bool value) override;
|
|
virtual void visitInt(PDFInteger value) override;
|
|
virtual void visitReal(PDFReal value) override;
|
|
virtual void visitString(PDFStringRef string) override;
|
|
virtual void visitName(PDFStringRef name) override;
|
|
virtual void visitArray(const PDFArray* array) override;
|
|
virtual void visitDictionary(const PDFDictionary* dictionary) override;
|
|
virtual void visitStream(const PDFStream* stream) override;
|
|
virtual void visitReference(const PDFObjectReference reference) override;
|
|
|
|
PDFObject getObject();
|
|
|
|
private:
|
|
const std::map<PDFObjectReference, PDFObjectReference>& m_replacements;
|
|
std::vector<PDFObject> m_objectStack;
|
|
};
|
|
|
|
void PDFReplaceReferencesVisitor::visitNull()
|
|
{
|
|
m_objectStack.push_back(PDFObject::createNull());
|
|
}
|
|
|
|
void PDFReplaceReferencesVisitor::visitBool(bool value)
|
|
{
|
|
m_objectStack.push_back(PDFObject::createBool(value));
|
|
}
|
|
|
|
/// Pushes an integer object holding \p value on the working stack.
void PDFReplaceReferencesVisitor::visitInt(PDFInteger value)
{
    m_objectStack.emplace_back(PDFObject::createInteger(value));
}
|
|
|
|
/// Pushes a real number object holding \p value on the working stack.
void PDFReplaceReferencesVisitor::visitReal(PDFReal value)
{
    m_objectStack.emplace_back(PDFObject::createReal(value));
}
|
|
|
|
/// Pushes a string object created from \p string on the working stack.
void PDFReplaceReferencesVisitor::visitString(PDFStringRef string)
{
    m_objectStack.emplace_back(PDFObject::createString(string));
}
|
|
|
|
/// Pushes a name object created from \p name on the working stack.
void PDFReplaceReferencesVisitor::visitName(PDFStringRef name)
{
    m_objectStack.emplace_back(PDFObject::createName(name));
}
|
|
|
|
void PDFReplaceReferencesVisitor::visitArray(const PDFArray* array)
|
|
{
|
|
acceptArray(array);
|
|
|
|
// We have all objects on the stack
|
|
Q_ASSERT(array->getCount() <= m_objectStack.size());
|
|
|
|
auto it = std::next(m_objectStack.cbegin(), m_objectStack.size() - array->getCount());
|
|
std::vector<PDFObject> objects(it, m_objectStack.cend());
|
|
PDFObject object = PDFObject::createArray(std::make_shared<PDFArray>(qMove(objects)));
|
|
m_objectStack.erase(it, m_objectStack.cend());
|
|
m_objectStack.push_back(object);
|
|
}
|
|
|
|
void PDFReplaceReferencesVisitor::visitDictionary(const PDFDictionary* dictionary)
|
|
{
|
|
Q_ASSERT(dictionary);
|
|
|
|
std::vector<PDFDictionary::DictionaryEntry> entries;
|
|
entries.reserve(dictionary->getCount());
|
|
|
|
for (size_t i = 0, count = dictionary->getCount(); i < count; ++i)
|
|
{
|
|
dictionary->getValue(i).accept(this);
|
|
entries.emplace_back(dictionary->getKey(i), m_objectStack.back());
|
|
m_objectStack.pop_back();
|
|
}
|
|
|
|
m_objectStack.push_back(PDFObject::createDictionary(std::make_shared<PDFDictionary>(qMove(entries))));
|
|
}
|
|
|
|
void PDFReplaceReferencesVisitor::visitStream(const PDFStream* stream)
|
|
{
|
|
// Replace references in the dictionary
|
|
visitDictionary(stream->getDictionary());
|
|
PDFObject dictionaryObject = m_objectStack.back();
|
|
m_objectStack.pop_back();
|
|
m_objectStack.push_back(PDFObject::createStream(std::make_shared<PDFStream>(PDFDictionary(*dictionaryObject.getDictionary()), QByteArray(*stream->getContent()))));
|
|
}
|
|
|
|
void PDFReplaceReferencesVisitor::visitReference(const PDFObjectReference reference)
|
|
{
|
|
auto it = m_replacements.find(reference);
|
|
if (it != m_replacements.cend())
|
|
{
|
|
// Replace the reference
|
|
m_objectStack.push_back(PDFObject::createReference(it->second));
|
|
}
|
|
else
|
|
{
|
|
// Preserve old reference
|
|
m_objectStack.push_back(PDFObject::createReference(reference));
|
|
}
|
|
}
|
|
|
|
/// Returns the rebuilt object. The object is moved out of the working
/// stack, which is expected to hold exactly one item at this point.
PDFObject PDFReplaceReferencesVisitor::getObject()
{
    Q_ASSERT(m_objectStack.size() == 1);

    PDFObject& rebuilt = m_objectStack.back();
    return qMove(rebuilt);
}
|
|
|
|
std::set<PDFObjectReference> PDFObjectUtils::getReferences(const std::vector<PDFObject>& objects, const PDFObjectStorage& storage)
|
|
{
|
|
std::set<PDFObjectReference> references;
|
|
{
|
|
PDFCollectReferencesVisitor collectReferencesVisitor(references);
|
|
for (const PDFObject& object : objects)
|
|
{
|
|
object.accept(&collectReferencesVisitor);
|
|
}
|
|
}
|
|
|
|
// Iterative algorihm, which adds additional references from referenced objects.
|
|
// If new reference is added, then we must also check, that all referenced objects
|
|
// from this object are added.
|
|
std::set<PDFObjectReference> workSet = references;
|
|
while (!workSet.empty())
|
|
{
|
|
std::set<PDFObjectReference> addedReferences;
|
|
PDFCollectReferencesVisitor collectReferencesVisitor(addedReferences);
|
|
for (const PDFObjectReference& objectReference : workSet)
|
|
{
|
|
storage.getObject(objectReference).accept(&collectReferencesVisitor);
|
|
}
|
|
|
|
workSet.clear();
|
|
std::set_difference(addedReferences.cbegin(), addedReferences.cend(), references.cbegin(), references.cend(), std::inserter(workSet, workSet.cend()));
|
|
references.merge(addedReferences);
|
|
}
|
|
|
|
return references;
|
|
}
|
|
|
|
std::set<PDFObjectReference> PDFObjectUtils::getDirectReferences(const PDFObject& object)
|
|
{
|
|
std::set<PDFObjectReference> references;
|
|
|
|
PDFCollectReferencesVisitor collectReferencesVisitor(references);
|
|
object.accept(&collectReferencesVisitor);
|
|
|
|
return references;
|
|
}
|
|
|
|
/// Returns a rebuilt copy of \p object in which every reference having an
/// entry in \p referenceMapping is replaced by the mapped reference; other
/// references are kept.
PDFObject PDFObjectUtils::replaceReferences(const PDFObject& object, const std::map<PDFObjectReference, PDFObjectReference>& referenceMapping)
{
    PDFReplaceReferencesVisitor visitor(referenceMapping);
    object.accept(&visitor);

    return visitor.getObject();
}
|
|
|
|
/// Returns translated, human readable name of the object type. An
/// unexpected type value asserts (in builds with assertions enabled) and
/// yields an empty string.
QString PDFObjectUtils::getObjectTypeName(PDFObject::Type type)
{
    using ObjectType = PDFObject::Type;

    switch (type)
    {
        case ObjectType::Null:
            return PDFTranslationContext::tr("Null");
        case ObjectType::Bool:
            return PDFTranslationContext::tr("Boolean");
        case ObjectType::Int:
            return PDFTranslationContext::tr("Integer");
        case ObjectType::Real:
            return PDFTranslationContext::tr("Real");
        case ObjectType::String:
            return PDFTranslationContext::tr("String");
        case ObjectType::Name:
            return PDFTranslationContext::tr("Name");
        case ObjectType::Array:
            return PDFTranslationContext::tr("Array");
        case ObjectType::Dictionary:
            return PDFTranslationContext::tr("Dictionary");
        case ObjectType::Stream:
            return PDFTranslationContext::tr("Stream");
        case ObjectType::Reference:
            return PDFTranslationContext::tr("Reference");
        default:
            break;
    }

    // Only an unhandled type value can get here
    Q_ASSERT(false);
    return QString();
}
|
|
|
|
void PDFObjectClassifier::classify(const PDFDocument* document)
|
|
{
|
|
// Clear old classification, if it exist
|
|
m_classification.clear();
|
|
m_allTypesUsed = None;
|
|
|
|
if (!document)
|
|
{
|
|
return;
|
|
}
|
|
|
|
PDFDocumentDataLoaderDecorator loader(document);
|
|
const PDFObjectStorage& storage = document->getStorage();
|
|
const PDFObjectStorage::PDFObjects& objects = storage.getObjects();
|
|
|
|
m_classification.resize(objects.size(), Classification());
|
|
for (size_t i = 0; i < objects.size(); ++i)
|
|
{
|
|
PDFObjectReference reference(i, objects[i].generation);
|
|
m_classification[i].reference = reference;
|
|
}
|
|
|
|
// First, iterate trough pages of the document
|
|
const PDFCatalog* catalog = document->getCatalog();
|
|
const size_t pageCount = catalog->getPageCount();
|
|
for (size_t i = 0; i < pageCount; ++i)
|
|
{
|
|
const PDFPage* page = catalog->getPage(i);
|
|
|
|
if (!page)
|
|
{
|
|
continue;
|
|
}
|
|
|
|
// Handle page itself
|
|
if (hasObject(page->getPageReference()))
|
|
{
|
|
mark(page->getPageReference(), Page);
|
|
}
|
|
|
|
// Handle annotations
|
|
for (const PDFObjectReference& reference : page->getAnnotations())
|
|
{
|
|
if (hasObject(reference))
|
|
{
|
|
mark(reference, Annotation);
|
|
}
|
|
}
|
|
|
|
// Handle contents
|
|
PDFObject pageObject = document->getObjectByReference(page->getPageReference());
|
|
Q_ASSERT(pageObject.isDictionary());
|
|
|
|
const PDFDictionary* dictionary = pageObject.getDictionary();
|
|
const PDFObject& contentsObject = dictionary->get("Contents");
|
|
if (contentsObject.isReference())
|
|
{
|
|
mark(contentsObject.getReference(), ContentStream);
|
|
}
|
|
|
|
// Handle resources
|
|
if (const PDFDictionary* resourcesDictionary = document->getDictionaryFromObject(dictionary->get("Resources")))
|
|
{
|
|
markDictionary(document, resourcesDictionary->get("ExtGState"), GraphicState);
|
|
markDictionary(document, resourcesDictionary->get("ColorSpace"), ColorSpace);
|
|
markDictionary(document, resourcesDictionary->get("Pattern"), Pattern);
|
|
markDictionary(document, resourcesDictionary->get("Shading"), Shading);
|
|
markDictionary(document, resourcesDictionary->get("Font"), Font);
|
|
|
|
if (const PDFDictionary* xobjectDictionary = document->getDictionaryFromObject(resourcesDictionary->get("XObject")))
|
|
{
|
|
const size_t count = xobjectDictionary->getCount();
|
|
for (size_t i = 0; i < count; ++i)
|
|
{
|
|
const PDFObject& item = xobjectDictionary->getValue(i);
|
|
if (item.isReference() && hasObject(item.getReference()))
|
|
{
|
|
if (const PDFDictionary* xobjectItemDictionary = document->getDictionaryFromObject(item))
|
|
{
|
|
QByteArray subtype = loader.readNameFromDictionary(xobjectItemDictionary, "Subtype");
|
|
|
|
if (subtype == "Image")
|
|
{
|
|
mark(item.getReference(), Image);
|
|
}
|
|
else if (subtype == "Form")
|
|
{
|
|
mark(item.getReference(), Form);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for (Classification& classification : m_classification)
|
|
{
|
|
if (const PDFDictionary* dictionary = document->getDictionaryFromObject(document->getObjectByReference(classification.reference)))
|
|
{
|
|
QByteArray typeName = loader.readNameFromDictionary(dictionary, "Type");
|
|
if (typeName == "Action")
|
|
{
|
|
classification.types.setFlag(Action);
|
|
}
|
|
}
|
|
}
|
|
|
|
for (const Classification& classification : m_classification)
|
|
{
|
|
m_allTypesUsed |= classification.types;
|
|
}
|
|
}
|
|
|
|
/// Returns true, if \p reference is valid, its object number lies within
/// the classification table, and the table item at that position carries
/// the very same reference.
bool PDFObjectClassifier::hasObject(PDFObjectReference reference) const
{
    if (!reference.isValid())
    {
        return false;
    }

    // Object number must be an index into the classification table
    if (reference.objectNumber >= PDFInteger(m_classification.size()))
    {
        return false;
    }

    return m_classification[reference.objectNumber].reference == reference;
}
|
|
|
|
/// Returns references of all classified objects which have the flag
/// \p type set, in the order of the classification table.
std::vector<PDFObjectReference> PDFObjectClassifier::getObjectsByType(Type type) const
{
    std::vector<PDFObjectReference> matching;

    for (const Classification& item : m_classification)
    {
        if (!item.types.testFlag(type))
        {
            continue;
        }

        matching.push_back(item.reference);
    }

    return matching;
}
|
|
|
|
/// Calculates statistics for the current classification: object count and
/// byte size (as reported by PDFDocumentWriter::getObjectSize) per
/// classification type, and object counts per basic object type.
///
/// Objects, whose exact combination of type flags has no item of its own,
/// are accounted under None. Null objects are skipped.
///
/// \param document Document the classification was created from; it is
///        dereferenced, so it must not be nullptr
PDFObjectClassifier::Statistics PDFObjectClassifier::calculateStatistics(const PDFDocument* document) const
{
    Statistics result;

    // Prepare all items of the statistics map in advance, so the map itself
    // is not modified while entries are being processed (only the counters
    // are, via fetch_add).
    result.statistics[None];

    for (uint32_t bit = 0; bit < 32; ++bit)
    {
        // Unsigned operand: a signed 1 << 31 would shift into the sign bit
        const uint32_t mask = uint32_t(1) << bit;
        if (m_allTypesUsed & mask)
        {
            result.statistics[Type(mask)];
        }
    }

    auto processEntry = [document, &result](const Classification& entry)
    {
        const PDFObject& object = document->getObjectByReference(entry.reference);

        if (object.isNull())
        {
            return;
        }

        // Single lookup; fall back to None, if this combination of flags
        // has no item prepared.
        auto it = result.statistics.find(Type(uint32_t(entry.types)));
        if (it == result.statistics.end())
        {
            it = result.statistics.find(None);
        }

        Q_ASSERT(it != result.statistics.end());

        const qint64 objectSize = PDFDocumentWriter::getObjectSize(document, entry.reference);

        StatisticsItem& statisticsItem = it->second;
        statisticsItem.count.fetch_add(1);
        statisticsItem.bytes.fetch_add(objectSize);
    };

    PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Unknown, m_classification.cbegin(), m_classification.cend(), processEntry);

    // Count objects by their basic type
    PDFStatisticsCollector collector;
    PDFApplyVisitor(*document, &collector);

    for (PDFObject::Type objectType : PDFObject::getTypes())
    {
        result.objectCountByType[size_t(objectType)] = collector.getObjectCount(objectType);
    }

    return result;
}
|
|
|
|
/// Sets the flag \p type on the classification item of \p reference. The
/// reference is expected to pass hasObject(); this is checked by an
/// assertion only.
void PDFObjectClassifier::mark(PDFObjectReference reference, Type type)
{
    Q_ASSERT(hasObject(reference));

    Classification& item = m_classification[reference.objectNumber];
    item.types.setFlag(type, true);
}
|
|
|
|
void PDFObjectClassifier::markDictionary(const PDFDocument* document, PDFObject object, Type type)
|
|
{
|
|
if (const PDFDictionary* dictionary = document->getDictionaryFromObject(object))
|
|
{
|
|
const size_t count = dictionary->getCount();
|
|
for (size_t i = 0; i < count; ++i)
|
|
{
|
|
const PDFObject& item = dictionary->getValue(i);
|
|
if (item.isReference() && hasObject(item.getReference()))
|
|
{
|
|
mark(item.getReference(), type);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
} // namespace pdf
|