DocDiff application: LCS algorithm

This commit is contained in:
Jakub Melka 2021-09-09 18:28:55 +02:00
parent b2a9342047
commit 69c988c756
6 changed files with 303 additions and 26 deletions

View File

@ -110,6 +110,7 @@ SOURCES += \
HEADERS += \
sources/pdfaction.h \
sources/pdfadvancedtools.h \
sources/pdfalgorithmlcs.h \
sources/pdfannotation.h \
sources/pdfblendfunction.h \
sources/pdfccittfaxdecoder.h \

View File

@ -0,0 +1,126 @@
// Copyright (C) 2021 Jakub Melka
//
// This file is part of PDF4QT.
//
// PDF4QT is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// with the written consent of the copyright owner, any later version.
//
// PDF4QT is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with PDF4QT. If not, see <https://www.gnu.org/licenses/>.
#ifndef PDFALGORITHMLCS_H
#define PDFALGORITHMLCS_H
#include "pdfglobal.h"
namespace pdf
{
/// Algorithm for computing the longest common subsequence of two sequences.
/// Elements of the sequences are not compared by operator "==" directly;
/// they are compared exclusively by the user supplied \p Comparator, which
/// is invoked as comparator(*it1, *it2), where it1 points into the first
/// sequence and it2 points into the second sequence. The result must be
/// usable in a boolean context (true means the elements match).
///
/// Iterators are required to be bidirectional. Note, that the code in this
/// header only measures the ranges with std::distance and walks them forward
/// with pre-increment, and each range is traversed more than once.
///
/// \tparam Iterator Iterator type used for both sequences
/// \tparam Comparator Binary predicate deciding, whether two elements match
template<typename Iterator, typename Comparator>
class PDFAlgorithmLongestCommonSubsequence
{
public:
    /// Stores both ranges and the comparator, and computes the dimensions
    /// of the dynamic programming matrix. No comparison is performed here.
    /// \param it1 Begin of the first sequence (matrix columns)
    /// \param it1End End of the first sequence
    /// \param it2 Begin of the second sequence (matrix rows)
    /// \param it2End End of the second sequence
    /// \param comparator Predicate called as comparator(*it1, *it2)
    PDFAlgorithmLongestCommonSubsequence(Iterator it1,
                                         Iterator it1End,
                                         Iterator it2,
                                         Iterator it2End,
                                         Comparator comparator);

    /// Runs the dynamic programming pass over both sequences and fills
    /// the internal backtrack table. The comparator is called exactly once
    /// for each pair (element of first sequence, element of second sequence).
    /// The function can be called repeatedly, each call starts with a clean
    /// backtrack table.
    void perform();

private:
    Iterator m_it1;
    Iterator m_it1End;
    Iterator m_it2;
    Iterator m_it2End;

    size_t m_size1;      ///< Number of matrix columns (length of the first sequence + 1)
    size_t m_size2;      ///< Number of matrix rows (length of the second sequence + 1)
    size_t m_matrixSize; ///< Number of matrix cells (m_size1 * m_size2)

    Comparator m_comparator;

    /// One flag per matrix cell, stored row by row (index = row * m_size1 + column).
    /// For cells, where the elements do not match, the flag is true, if the left
    /// neighbour holds strictly bigger value than the upper neighbour. Cells with
    /// matching elements, and cells of row/column zero, keep the value false.
    std::vector<bool> m_backtrackData;
};

template<typename Iterator, typename Comparator>
PDFAlgorithmLongestCommonSubsequence<Iterator, Comparator>::PDFAlgorithmLongestCommonSubsequence(Iterator it1,
                                                                                                 Iterator it1End,
                                                                                                 Iterator it2,
                                                                                                 Iterator it2End,
                                                                                                 Comparator comparator) :
    m_it1(std::move(it1)),
    m_it1End(std::move(it1End)),
    m_it2(std::move(it2)),
    m_it2End(std::move(it2End)),
    m_size1(0),
    m_size2(0),
    m_matrixSize(0),
    m_comparator(std::move(comparator))
{
    // The matrix has one extra row and one extra column (index zero), which
    // represent the empty prefix of the respective sequence. std::distance
    // returns a signed type, so the conversion to size_t is made explicit.
    m_size1 = static_cast<size_t>(std::distance(m_it1, m_it1End)) + 1;
    m_size2 = static_cast<size_t>(std::distance(m_it2, m_it2End)) + 1;
    m_matrixSize = m_size1 * m_size2;
}

template<typename Iterator, typename Comparator>
void PDFAlgorithmLongestCommonSubsequence<Iterator, Comparator>::perform()
{
    // Always start with a cleared table. Using resize() here would keep flags
    // written by a previous call, and cells with matching elements are never
    // written below, so they could retain a stale value.
    m_backtrackData.assign(m_matrixSize, false);

    // Only two rows of subsequence lengths are needed at any time. Index zero
    // of both rows stays zero (empty prefix of the first sequence).
    std::vector<size_t> rowTop(m_size1, size_t());
    std::vector<size_t> rowBottom(m_size1, size_t());

    // Jakub Melka: we will have columns consisting of it1...it1End
    // and rows consisting of it2...it2End. We iterate through rows,
    // and for each row, we update longest common subsequence data.

    auto it2 = m_it2;
    for (size_t i2 = 1; i2 < m_size2; ++i2, ++it2)
    {
        const size_t rowOffset = i2 * m_size1;

        auto it1 = m_it1;
        for (size_t i1 = 1; i1 < m_size1; ++i1, ++it1)
        {
            if (m_comparator(*it1, *it2))
            {
                // We have match, extend the subsequence of the diagonal cell
                rowBottom[i1] = rowTop[i1 - 1] + 1;
            }
            else
            {
                const size_t leftCellValue = rowBottom[i1 - 1];
                const size_t upperCellValue = rowTop[i1];
                const bool isLeftBigger = leftCellValue > upperCellValue;

                // Take the better of both neighbours (upper one wins a tie)
                // and remember, which one it was.
                rowBottom[i1] = isLeftBigger ? leftCellValue : upperCellValue;
                m_backtrackData[rowOffset + i1] = isLeftBigger;
            }
        }

        // Bottom row will become top row
        std::swap(rowTop, rowBottom);
    }
}
} // namespace pdf
#endif // PDFALGORITHMLCS_H

View File

@ -23,6 +23,7 @@
#include "pdfcms.h"
#include "pdfcompiler.h"
#include "pdfconstants.h"
#include "pdfalgorithmlcs.h"
#include <QtConcurrent/QtConcurrent>
@ -34,7 +35,7 @@ PDFDiff::PDFDiff(QObject* parent) :
m_progress(nullptr),
m_leftDocument(nullptr),
m_rightDocument(nullptr),
m_options(Asynchronous),
m_options(Asynchronous | PC_Text | PC_VectorGraphics | PC_Images),
m_epsilon(0.0001),
m_cancelled(false)
{
@ -173,6 +174,7 @@ void PDFDiff::stepProgress()
/// Working data of a single page, which is being compared. Instances are
/// filled page by page while the document content is being extracted.
struct PDFDiffPageContext
{
/// Index of the page; it is passed to the renderer when the page is compiled
/// and used to look up the page in the document catalog.
PDFInteger pageIndex = 0;
/// Hash of the whole page (64 bytes). It is written by PDFDiff::finalizeGraphicsPieces
/// from a SHA-512 digest and compared for equality when pages are matched.
std::array<uint8_t, 64> pageHash = { };
/// Graphic pieces of the page, as returned by PDFPrecompiledPage::calculateGraphicPieceInfos.
PDFPrecompiledPage::GraphicPieceInfos graphicPieces;
};
@ -208,8 +210,11 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std::
PDFRenderer renderer(m_leftDocument, &fontCache, cms.data(), &optionalContentActivity, features, pdf::PDFMeshQualitySettings());
renderer.compile(&compiledPage, context.pageIndex);
PDFReal epsilon = calculateEpsilonForPage(m_leftDocument->getCatalog()->getPage(context.pageIndex));
context.graphicPieces = compiledPage.calculateGraphicPieceInfos(epsilon);
auto page = m_leftDocument->getCatalog()->getPage(context.pageIndex);
PDFReal epsilon = calculateEpsilonForPage(page);
context.graphicPieces = compiledPage.calculateGraphicPieceInfos(page->getMediaBox(), epsilon);
finalizeGraphicsPieces(context);
};
PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, leftPreparedPages.begin(), leftPreparedPages.end(), fillPageContext);
stepProgress();
@ -233,14 +238,33 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std::
PDFRenderer renderer(m_rightDocument, &fontCache, cms.data(), &optionalContentActivity, features, pdf::PDFMeshQualitySettings());
renderer.compile(&compiledPage, context.pageIndex);
PDFReal epsilon = calculateEpsilonForPage(m_leftDocument->getCatalog()->getPage(context.pageIndex));
context.graphicPieces = compiledPage.calculateGraphicPieceInfos(epsilon);
const PDFPage* page = m_leftDocument->getCatalog()->getPage(context.pageIndex);
PDFReal epsilon = calculateEpsilonForPage(page);
context.graphicPieces = compiledPage.calculateGraphicPieceInfos(page->getMediaBox(), epsilon);
finalizeGraphicsPieces(context);
};
PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, rightPreparedPages.begin(), rightPreparedPages.end(), fillPageContext);
stepProgress();
}
// StepMatchPages
if (!m_cancelled)
{
// Match pages
auto comparePages = [](const PDFDiffPageContext& left, const PDFDiffPageContext& right)
{
return left.pageHash == right.pageHash;
};
PDFAlgorithmLongestCommonSubsequence algorithm(leftPreparedPages.cbegin(), leftPreparedPages.cend(),
rightPreparedPages.cbegin(), rightPreparedPages.cend(),
comparePages);
algorithm.perform();
stepProgress();
}
// StepExtractTextLeftDocument
if (!m_cancelled)
{
@ -266,6 +290,43 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std::
}
}
/// Sorts graphic pieces of the page and computes the page hash from hashes
/// of those pieces, which are enabled by the active page comparison options.
/// \param context Page context; its graphic pieces are reordered and its page hash is overwritten
void PDFDiff::finalizeGraphicsPieces(PDFDiffPageContext& context)
{
    auto& pieces = context.graphicPieces;

    // Pieces are sorted first, hashes are then fed to the hasher in this sorted order
    std::sort(pieces.begin(), pieces.end());

    // Returns true, if the piece has a type, which is switched off in the options
    auto isExcludedByOptions = [this](const PDFPrecompiledPage::GraphicPieceInfo& piece) -> bool
    {
        return (piece.isText() && !m_options.testFlag(PC_Text)) ||
               (piece.isVectorGraphics() && !m_options.testFlag(PC_VectorGraphics)) ||
               (piece.isImage() && !m_options.testFlag(PC_Images)) ||
               (piece.isShading() && !m_options.testFlag(PC_Mesh));
    };

    // Compute page hash using active settings
    QCryptographicHash pageHasher(QCryptographicHash::Sha512);
    pageHasher.reset();

    for (const PDFPrecompiledPage::GraphicPieceInfo& piece : pieces)
    {
        if (isExcludedByOptions(piece))
        {
            continue;
        }

        const char* pieceHashBytes = reinterpret_cast<const char*>(piece.hash.data());
        pageHasher.addData(pieceHashBytes, int(piece.hash.size()));
    }

    QByteArray digest = pageHasher.result();
    Q_ASSERT(QCryptographicHash::hashLength(QCryptographicHash::Sha512) == 64);

    // Never copy more bytes, than the digest provides or the page hash can hold
    const size_t copiedBytes = qMin<size_t>(digest.length(), context.pageHash.size());
    const char* digestBegin = digest.data();
    std::copy(digestBegin, digestBegin + copiedBytes, context.pageHash.data());
}
void PDFDiff::onComparationPerformed()
{
m_cancelled = false;

View File

@ -31,6 +31,8 @@
namespace pdf
{
struct PDFDiffPageContext;
class PDFDiffResult
{
public:
@ -57,8 +59,12 @@ public:
enum Option
{
None = 0x0000,
Asynchronous = 0x0001, ///< Compare document asynchronously
None = 0x0000,
Asynchronous = 0x0001, ///< Compare document asynchronously
PC_Text = 0x0002, ///< Use text to compare pages (determine, which pages correspond to each other)
PC_VectorGraphics = 0x0004, ///< Use vector graphics to compare pages (determine, which pages correspond to each other)
PC_Images = 0x0008, ///< Use images to compare pages (determine, which pages correspond to each other)
PC_Mesh = 0x0010, ///< Use mesh to compare pages (determine, which pages correspond to each other)
};
Q_DECLARE_FLAGS(Options, Option)
@ -109,6 +115,7 @@ private:
{
StepExtractContentLeftDocument,
StepExtractContentRightDocument,
StepMatchPages,
StepExtractTextLeftDocument,
StepExtractTextRightDocument,
StepCompare,
@ -119,6 +126,7 @@ private:
void stepProgress();
void performSteps(const std::vector<PDFInteger>& leftPages,
const std::vector<PDFInteger>& rightPages);
void finalizeGraphicsPieces(PDFDiffPageContext& context);
void onComparationPerformed();

View File

@ -832,7 +832,8 @@ void PDFPrecompiledPage::finalize(qint64 compilingTimeNS, QList<PDFRenderError>
}
}
PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceInfos(PDFReal epsilon) const
PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceInfos(QRectF mediaBox,
PDFReal epsilon) const
{
GraphicPieceInfos infos;
@ -850,6 +851,8 @@ PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceI
}
PDFReal factor = 1.0 / epsilon;
QImage shadingTestImage;
// Process all instructions
for (const Instruction& instruction : m_instructions)
{
@ -903,33 +906,98 @@ PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceI
case InstructionType::DrawImage:
{
/*const ImageData& data = m_images[instruction.dataIndex];
const ImageData& data = m_images[instruction.dataIndex];
const QImage& image = data.image;
painter->save();
GraphicPieceInfo info;
QByteArray serializedPath;
QMatrix imageTransform(1.0 / image.width(), 0, 0, 1.0 / image.height(), 0, 0);
QMatrix worldMatrix = imageTransform * painter->worldMatrix();
// Serialize data
if (true)
{
QDataStream stream(&serializedPath, QIODevice::WriteOnly);
// Jakub Melka: Because Qt uses opposite axis direction than PDF, then we must transform the y-axis
// to the opposite (so the image is then unchanged)
worldMatrix.translate(0, image.height());
worldMatrix.scale(1, -1);
// Jakub Melka: serialize image position
QMatrix worldMatrix = stateStack.top().matrix;
painter->setWorldMatrix(worldMatrix);
painter->drawImage(0, 0, image);
painter->restore();*/
QPainterPath pagePath;
pagePath.addRect(0, 0, 1, 1);
pagePath = worldMatrix.map(pagePath);
info.type = GraphicPieceInfo::Type::Image;
info.boundingRect = pagePath.controlPointRect();
const int elementCount = pagePath.elementCount();
for (int i = 0; i < elementCount; ++i)
{
QPainterPath::Element element = pagePath.elementAt(i);
PDFReal roundedX = qRound(element.x * factor);
PDFReal roundedY = qRound(element.y * factor);
stream << roundedX;
stream << roundedY;
stream << element.type;
}
// serialize image data
stream.writeBytes(reinterpret_cast<const char*>(image.bits()), image.sizeInBytes());
}
QByteArray hash = QCryptographicHash::hash(serializedPath, QCryptographicHash::Sha512);
Q_ASSERT(QCryptographicHash::hashLength(QCryptographicHash::Sha512) == 64);
size_t size = qMin<size_t>(hash.length(), info.hash.size());
std::copy(hash.data(), hash.data() + size, info.hash.data());
infos.emplace_back(std::move(info));
break;
}
case InstructionType::DrawMesh:
{
/*const MeshPaintData& data = m_meshes[instruction.dataIndex];
const MeshPaintData& data = m_meshes[instruction.dataIndex];
painter->save();
painter->setWorldMatrix(pagePointToDevicePointMatrix);
data.mesh.paint(painter, data.alpha);
painter->restore();*/
if (shadingTestImage.isNull())
{
QSizeF mediaBoxSize = mediaBox.size();
mediaBoxSize = mediaBoxSize.scaled(256, 256, Qt::KeepAspectRatio);
QSize imageSize = mediaBoxSize.toSize();
shadingTestImage = QImage(imageSize, QImage::Format_ARGB32);
}
shadingTestImage.fill(Qt::transparent);
QMatrix pagePointToDevicePointMatrix;
pagePointToDevicePointMatrix.scale(shadingTestImage.width() / mediaBox.width(), -shadingTestImage.height() / mediaBox.height());
{
QPainter painter(&shadingTestImage);
painter.setWorldMatrix(pagePointToDevicePointMatrix);
data.mesh.paint(&painter, data.alpha);
}
GraphicPieceInfo info;
QByteArray serializedMesh;
// Serialize data
if (true)
{
QDataStream stream(&serializedMesh, QIODevice::WriteOnly);
// serialize image data
stream.writeBytes(reinterpret_cast<const char*>(shadingTestImage.bits()), shadingTestImage.sizeInBytes());
}
QByteArray hash = QCryptographicHash::hash(serializedMesh, QCryptographicHash::Sha512);
Q_ASSERT(QCryptographicHash::hashLength(QCryptographicHash::Sha512) == 64);
size_t size = qMin<size_t>(hash.length(), info.hash.size());
std::copy(hash.data(), hash.data() + size, info.hash.data());
info.boundingRect = QRectF();
info.type = GraphicPieceInfo::Type::Shading;
infos.emplace_back(std::move(info));
break;
}

View File

@ -241,9 +241,20 @@ public:
Unknown,
Text,
VectorGraphics,
Image
Image,
Shading
};
/// Orders graphic pieces lexicographically, first by type, then by hash.
/// Bounding rectangle does not take part in the ordering.
bool operator<(const GraphicPieceInfo& other) const
{
    const auto thisKey = std::tie(type, hash);
    const auto otherKey = std::tie(other.type, other.hash);
    return thisKey < otherKey;
}
/// Returns true, if this piece has type Type::Text
bool isText() const
{
    return type == Type::Text;
}

/// Returns true, if this piece has type Type::VectorGraphics
bool isVectorGraphics() const
{
    return type == Type::VectorGraphics;
}

/// Returns true, if this piece has type Type::Image
bool isImage() const
{
    return type == Type::Image;
}

/// Returns true, if this piece has type Type::Shading
bool isShading() const
{
    return type == Type::Shading;
}
Type type = Type::Unknown;
QRectF boundingRect;
std::array<uint8_t, 64> hash = { };
@ -255,8 +266,10 @@ public:
/// for example, for comparation reasons. Parameter \p epsilon
/// is for numerical precision - values under epsilon are considered
/// as equal.
/// \param mediaBox Page's media box
/// \param epsilon Epsilon
GraphicPieceInfos calculateGraphicPieceInfos(PDFReal epsilon) const;
GraphicPieceInfos calculateGraphicPieceInfos(QRectF mediaBox,
PDFReal epsilon) const;
private:
struct PathPaintData