PDF4QT/PdfForQtLib/sources/pdfoptimizer.cpp

371 lines
12 KiB
C++
Raw Normal View History

2020-05-31 18:31:59 +02:00
// Copyright (C) 2020 Jakub Melka
//
// This file is part of PdfForQt.
//
// PdfForQt is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// PdfForQt is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with PDFForQt. If not, see <https://www.gnu.org/licenses/>.
#include "pdfoptimizer.h"
2020-05-31 19:55:24 +02:00
#include "pdfvisitor.h"
#include "pdfexecutionpolicy.h"
2020-06-02 19:45:28 +02:00
#include "pdfobjectutils.h"
#include "pdfutils.h"
2020-05-31 18:31:59 +02:00
namespace pdf
{
2020-06-02 19:45:28 +02:00
class PDFUpdateObjectVisitor : public PDFAbstractVisitor
2020-05-31 19:55:24 +02:00
{
public:
2020-06-02 19:45:28 +02:00
explicit inline PDFUpdateObjectVisitor(const PDFObjectStorage* storage) :
m_storage(storage)
2020-05-31 19:55:24 +02:00
{
m_objectStack.reserve(32);
}
virtual void visitNull() override;
virtual void visitBool(bool value) override;
virtual void visitInt(PDFInteger value) override;
virtual void visitReal(PDFReal value) override;
virtual void visitString(PDFStringRef string) override;
virtual void visitName(PDFStringRef name) override;
virtual void visitArray(const PDFArray* array) override;
virtual void visitDictionary(const PDFDictionary* dictionary) override;
virtual void visitStream(const PDFStream* stream) override;
virtual void visitReference(const PDFObjectReference reference) override;
PDFObject getObject();
2020-06-02 19:45:28 +02:00
protected:
2020-05-31 19:55:24 +02:00
const PDFObjectStorage* m_storage;
std::vector<PDFObject> m_objectStack;
};
2020-06-02 19:45:28 +02:00
void PDFUpdateObjectVisitor::visitNull()
2020-05-31 19:55:24 +02:00
{
m_objectStack.push_back(PDFObject::createNull());
}
2020-06-02 19:45:28 +02:00
void PDFUpdateObjectVisitor::visitBool(bool value)
2020-05-31 19:55:24 +02:00
{
m_objectStack.push_back(PDFObject::createBool(value));
}
2020-06-02 19:45:28 +02:00
/// Pushes an integer object holding \p value onto the object stack.
void PDFUpdateObjectVisitor::visitInt(PDFInteger value)
{
    m_objectStack.emplace_back(PDFObject::createInteger(value));
}
2020-06-02 19:45:28 +02:00
/// Pushes a real number object holding \p value onto the object stack.
void PDFUpdateObjectVisitor::visitReal(PDFReal value)
{
    m_objectStack.emplace_back(PDFObject::createReal(value));
}
2020-06-02 19:45:28 +02:00
/// Pushes a string object created from \p string onto the object stack.
void PDFUpdateObjectVisitor::visitString(PDFStringRef string)
{
    m_objectStack.emplace_back(PDFObject::createString(string));
}
2020-06-02 19:45:28 +02:00
/// Pushes a name object created from \p name onto the object stack.
void PDFUpdateObjectVisitor::visitName(PDFStringRef name)
{
    m_objectStack.emplace_back(PDFObject::createName(name));
}
2020-06-02 19:45:28 +02:00
void PDFUpdateObjectVisitor::visitArray(const PDFArray* array)
2020-05-31 19:55:24 +02:00
{
acceptArray(array);
// We have all objects on the stack
Q_ASSERT(array->getCount() <= m_objectStack.size());
auto it = std::next(m_objectStack.cbegin(), m_objectStack.size() - array->getCount());
std::vector<PDFObject> objects(it, m_objectStack.cend());
PDFObject object = PDFObject::createArray(std::make_shared<PDFArray>(qMove(objects)));
m_objectStack.erase(it, m_objectStack.cend());
m_objectStack.push_back(object);
}
2020-06-02 19:45:28 +02:00
void PDFUpdateObjectVisitor::visitDictionary(const PDFDictionary* dictionary)
2020-05-31 19:55:24 +02:00
{
Q_ASSERT(dictionary);
std::vector<PDFDictionary::DictionaryEntry> entries;
entries.reserve(dictionary->getCount());
for (size_t i = 0, count = dictionary->getCount(); i < count; ++i)
{
dictionary->getValue(i).accept(this);
2020-06-02 19:45:28 +02:00
Q_ASSERT(!m_objectStack.empty());
2020-05-31 19:55:24 +02:00
entries.emplace_back(dictionary->getKey(i), m_objectStack.back());
m_objectStack.pop_back();
}
m_objectStack.push_back(PDFObject::createDictionary(std::make_shared<PDFDictionary>(qMove(entries))));
}
2020-06-02 19:45:28 +02:00
void PDFUpdateObjectVisitor::visitStream(const PDFStream* stream)
2020-05-31 19:55:24 +02:00
{
const PDFDictionary* dictionary = stream->getDictionary();
visitDictionary(dictionary);
2020-06-02 19:45:28 +02:00
Q_ASSERT(!m_objectStack.empty());
2020-05-31 19:55:24 +02:00
PDFObject dictionaryObject = m_objectStack.back();
m_objectStack.pop_back();
2020-06-02 19:45:28 +02:00
PDFDictionary newDictionary(*dictionaryObject.getDictionary());
m_objectStack.push_back(PDFObject::createStream(std::make_shared<PDFStream>(qMove(newDictionary), QByteArray(*stream->getContent()))));
2020-05-31 19:55:24 +02:00
}
2020-06-02 19:45:28 +02:00
/// Pushes a reference object for \p reference onto the object stack.
/// The referenced object itself is not looked up here.
void PDFUpdateObjectVisitor::visitReference(const PDFObjectReference reference)
{
    m_objectStack.emplace_back(PDFObject::createReference(reference));
}
/// Returns the rebuilt object. The single object remaining on the stack
/// is moved out; the (moved-from) element itself stays on the stack.
PDFObject PDFUpdateObjectVisitor::getObject()
{
    Q_ASSERT(m_objectStack.size() == 1);

    PDFObject& rebuiltObject = m_objectStack.back();
    return qMove(rebuiltObject);
}
class PDFRemoveSimpleObjectsVisitor : public PDFUpdateObjectVisitor
{
public:
explicit inline PDFRemoveSimpleObjectsVisitor(const PDFObjectStorage* storage, std::atomic<PDFInteger>* counter) :
PDFUpdateObjectVisitor(storage),
m_counter(counter)
{
}
virtual void visitReference(const PDFObjectReference reference) override;
private:
std::atomic<PDFInteger>* m_counter;
};
2020-05-31 19:55:24 +02:00
void PDFRemoveSimpleObjectsVisitor::visitReference(const PDFObjectReference reference)
{
PDFObject object = m_storage->getObjectByReference(reference);
switch (object.getType())
{
case PDFObject::Type::Null:
case PDFObject::Type::Bool:
case PDFObject::Type::Int:
case PDFObject::Type::Real:
case PDFObject::Type::String:
case PDFObject::Type::Name:
++*m_counter;
m_objectStack.push_back(qMove(object));
break;
default:
m_objectStack.push_back(PDFObject::createReference(reference));
break;
}
}
2020-06-02 19:45:28 +02:00
class PDFRemoveNullDictionaryEntriesVisitor : public PDFUpdateObjectVisitor
2020-05-31 19:55:24 +02:00
{
2020-06-02 19:45:28 +02:00
public:
explicit PDFRemoveNullDictionaryEntriesVisitor(const PDFObjectStorage* storage, std::atomic<PDFInteger>* counter) :
PDFUpdateObjectVisitor(storage),
m_counter(counter)
{
}
virtual void visitDictionary(const PDFDictionary* dictionary) override;
private:
std::atomic<PDFInteger>* m_counter;
};
void PDFRemoveNullDictionaryEntriesVisitor::visitDictionary(const PDFDictionary* dictionary)
{
Q_ASSERT(dictionary);
std::vector<PDFDictionary::DictionaryEntry> entries;
entries.reserve(dictionary->getCount());
for (size_t i = 0, count = dictionary->getCount(); i < count; ++i)
{
dictionary->getValue(i).accept(this);
Q_ASSERT(!m_objectStack.empty());
if (!m_objectStack.back().isNull())
{
entries.emplace_back(dictionary->getKey(i), m_objectStack.back());
}
else
{
++*m_counter;
}
m_objectStack.pop_back();
}
m_objectStack.push_back(PDFObject::createDictionary(std::make_shared<PDFDictionary>(qMove(entries))));
2020-05-31 19:55:24 +02:00
}
2020-05-31 18:31:59 +02:00
/// Constructs the optimizer.
/// \param flags Optimization steps which should be performed
/// \param parent Parent object, passed to QObject
PDFOptimizer::PDFOptimizer(OptimizationFlags flags, QObject* parent) :
    QObject(parent),
    m_flags(flags)
{

}
void PDFOptimizer::optimize()
{
// Jakub Melka: We divide optimization into stages, each
// stage can consist from multiple passes.
constexpr auto stages = { OptimizationFlags(DereferenceSimpleObjects),
OptimizationFlags(RemoveNullObjects),
OptimizationFlags(RemoveUnusedObjects | MergeIdenticalObjects),
OptimizationFlags(ShrinkObjectStorage),
OptimizationFlags(RecompressFlateStreams) };
int stage = 1;
emit optimizationStarted();
for (OptimizationFlags flags : stages)
{
emit optimizationProgress(tr("Stage %1").arg(stage++));
OptimizationFlags currentSteps = flags & m_flags;
int passIndex = 1;
bool pass = true;
while (pass)
{
emit optimizationProgress(tr("Pass %1").arg(passIndex++));
pass = false;
if (currentSteps.testFlag(DereferenceSimpleObjects))
{
pass = performDereferenceSimpleObjects() || pass;
}
if (currentSteps.testFlag(RemoveNullObjects))
{
pass = performRemoveNullObjects() || pass;
}
if (currentSteps.testFlag(RemoveUnusedObjects))
{
pass = performRemoveUnusedObjects() || pass;
}
if (currentSteps.testFlag(MergeIdenticalObjects))
{
pass = performMergeIdenticalObjects() || pass;
}
if (currentSteps.testFlag(ShrinkObjectStorage))
{
pass = performShrinkObjectStorage() || pass;
}
if (currentSteps.testFlag(RecompressFlateStreams))
{
pass = performRecompressFlateStreams() || pass;
}
}
}
emit optimizationFinished();
}
/// Returns the currently set optimization flags.
PDFOptimizer::OptimizationFlags PDFOptimizer::getFlags() const
{
    return m_flags;
}
/// Sets the optimization flags, replacing the previous ones.
/// \param flags New optimization flags
void PDFOptimizer::setFlags(OptimizationFlags flags)
{
    m_flags = flags;
}
bool PDFOptimizer::performDereferenceSimpleObjects()
{
2020-05-31 19:55:24 +02:00
std::atomic<PDFInteger> counter = 0;
2020-06-02 19:45:28 +02:00
PDFObjectStorage::PDFObjects objects = m_storage.getObjects();
2020-05-31 19:55:24 +02:00
auto processEntry = [this, &counter](PDFObjectStorage::Entry& entry)
{
PDFRemoveSimpleObjectsVisitor visitor(&m_storage, &counter);
entry.object.accept(&visitor);
entry.object = visitor.getObject();
};
PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Unknown, objects.begin(), objects.end(), processEntry);
2020-06-02 19:45:28 +02:00
m_storage.setObjects(qMove(objects));
2020-05-31 19:55:24 +02:00
emit optimizationProgress(tr("Simple objects dereferenced and embedded: %1").arg(counter));
2020-05-31 18:31:59 +02:00
return false;
}
bool PDFOptimizer::performRemoveNullObjects()
{
2020-06-02 19:45:28 +02:00
std::atomic<PDFInteger> counter = 0;
PDFObjectStorage::PDFObjects objects = m_storage.getObjects();
auto processEntry = [this, &counter](PDFObjectStorage::Entry& entry)
{
PDFRemoveNullDictionaryEntriesVisitor visitor(&m_storage, &counter);
entry.object.accept(&visitor);
entry.object = visitor.getObject();
};
PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Unknown, objects.begin(), objects.end(), processEntry);
m_storage.setObjects(qMove(objects));
emit optimizationProgress(tr("Null objects entries from dictionaries removed: %1").arg(counter));
2020-05-31 18:31:59 +02:00
return false;
}
bool PDFOptimizer::performRemoveUnusedObjects()
{
2020-06-02 19:45:28 +02:00
std::atomic<PDFInteger> counter = 0;
PDFObjectStorage::PDFObjects objects = m_storage.getObjects();
std::set<PDFObjectReference> references = PDFObjectUtils::getReferences({ m_storage.getTrailerDictionary() }, m_storage);
PDFIntegerRange<size_t> range(0, objects.size());
auto processEntry = [this, &counter, &objects, &references](size_t index)
{
PDFObjectStorage::Entry& entry = objects[index];
PDFObjectReference reference(PDFInteger(index), entry.generation);
if (!references.count(reference) && !entry.object.isNull())
{
entry.object = PDFObject();
++counter;
}
};
PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Unknown, range.begin(), range.end(), processEntry);
m_storage.setObjects(qMove(objects));
emit optimizationProgress(tr("Unused objects removed: %1").arg(counter));
2020-05-31 18:31:59 +02:00
return false;
}
/// Step for the MergeIdenticalObjects flag. Currently performs no work.
/// \returns Always false, so this step never requests another pass
bool PDFOptimizer::performMergeIdenticalObjects()
{
    return false;
}
/// Step for the ShrinkObjectStorage flag. Currently performs no work.
/// \returns Always false, so this step never requests another pass
bool PDFOptimizer::performShrinkObjectStorage()
{
    return false;
}
/// Step for the RecompressFlateStreams flag. Currently performs no work.
/// \returns Always false, so this step never requests another pass
bool PDFOptimizer::performRecompressFlateStreams()
{
    return false;
}
} // namespace pdf