mirror of
https://github.com/JakubMelka/PDF4QT.git
synced 2024-12-26 16:22:50 +01:00
DocDiff application: LCS algorithm
This commit is contained in:
parent
b2a9342047
commit
69c988c756
@ -110,6 +110,7 @@ SOURCES += \
|
||||
HEADERS += \
|
||||
sources/pdfaction.h \
|
||||
sources/pdfadvancedtools.h \
|
||||
sources/pdfalgorithmlcs.h \
|
||||
sources/pdfannotation.h \
|
||||
sources/pdfblendfunction.h \
|
||||
sources/pdfccittfaxdecoder.h \
|
||||
|
126
Pdf4QtLib/sources/pdfalgorithmlcs.h
Normal file
126
Pdf4QtLib/sources/pdfalgorithmlcs.h
Normal file
@ -0,0 +1,126 @@
|
||||
// Copyright (C) 2021 Jakub Melka
|
||||
//
|
||||
// This file is part of PDF4QT.
|
||||
//
|
||||
// PDF4QT is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Lesser General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// with the written consent of the copyright owner, any later version.
|
||||
//
|
||||
// PDF4QT is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Lesser General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Lesser General Public License
|
||||
// along with PDF4QT. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
#ifndef PDFALGORITHMLCS_H
|
||||
#define PDFALGORITHMLCS_H
|
||||
|
||||
#include "pdfglobal.h"
|
||||
|
||||
namespace pdf
|
||||
{
|
||||
|
||||
/// Longest common subsequence (LCS) algorithm working on two sequences
/// given as iterator ranges. Elements are not compared directly; the
/// supplied comparator is called with an element of the first sequence
/// and an element of the second sequence and decides, whether they match.
/// Stored iterators are copied, measured by std::distance and advanced by
/// pre-increment, so both ranges must support repeated traversal.
template<typename Iterator, typename Comparator>
class PDFAlgorithmLongestCommonSubsequence
{
public:
    /// Stores both ranges and the comparator, and precomputes dimensions of the matrix
    /// \param it1 Begin of the first sequence
    /// \param it1End End of the first sequence
    /// \param it2 Begin of the second sequence
    /// \param it2End End of the second sequence
    /// \param comparator Callable used as comparator(item1, item2), result is used as bool
    PDFAlgorithmLongestCommonSubsequence(Iterator it1, Iterator it1End, Iterator it2, Iterator it2End, Comparator comparator);

    /// Runs the algorithm and fills internal backtrack data
    void perform();

private:
    // First sequence (columns of the matrix)
    Iterator m_it1;
    Iterator m_it1End;

    // Second sequence (rows of the matrix)
    Iterator m_it2;
    Iterator m_it2End;

    // Matrix dimensions - length of each sequence plus one
    size_t m_size1;
    size_t m_size2;

    // Number of matrix cells (m_size1 * m_size2)
    size_t m_matrixSize;

    // Decides, if two items are equal
    Comparator m_comparator;

    // One flag per matrix cell, indexed as row * m_size1 + column;
    // true means, that the value was taken from the left cell
    std::vector<bool> m_backtrackData;
};
|
||||
|
||||
template<typename Iterator, typename Comparator>
|
||||
PDFAlgorithmLongestCommonSubsequence<Iterator, Comparator>::PDFAlgorithmLongestCommonSubsequence(Iterator it1,
|
||||
Iterator it1End,
|
||||
Iterator it2,
|
||||
Iterator it2End,
|
||||
Comparator comparator) :
|
||||
m_it1(std::move(it1)),
|
||||
m_it1End(std::move(it1End)),
|
||||
m_it2(std::move(it2)),
|
||||
m_it2End(std::move(it2End)),
|
||||
m_size1(0),
|
||||
m_size2(0),
|
||||
m_matrixSize(0),
|
||||
m_comparator(std::move(comparator))
|
||||
{
|
||||
m_size1 = std::distance(m_it1, m_it1End) + 1;
|
||||
m_size2 = std::distance(m_it2, m_it2End) + 1;
|
||||
m_matrixSize = m_size1 * m_size2;
|
||||
}
|
||||
|
||||
template<typename Iterator, typename Comparator>
|
||||
void PDFAlgorithmLongestCommonSubsequence<Iterator, Comparator>::perform()
|
||||
{
|
||||
m_backtrackData.resize(m_matrixSize);
|
||||
|
||||
std::vector<size_t> rowTop(m_size1, size_t());
|
||||
std::vector<size_t> rowBottom(m_size1, size_t());
|
||||
|
||||
// Jakub Melka: we will have columns consisting of it1...it1End
|
||||
// and rows consisting of it2...it2End. We iterate trough rows,
|
||||
// and for each row, we update longest common subsequence data.
|
||||
|
||||
auto it2 = m_it2;
|
||||
for (size_t i2 = 1; i2 < m_size2; ++i2, ++it2)
|
||||
{
|
||||
auto it1 = m_it1;
|
||||
for (size_t i1 = 1; i1 < m_size1; ++i1, ++it1)
|
||||
{
|
||||
if (m_comparator(*it1, *it2))
|
||||
{
|
||||
// We have match
|
||||
rowBottom[i1] = rowTop[i1 - 1] + 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
const size_t leftCellValue = rowBottom[i1 - 1];
|
||||
const size_t upperCellValue = rowTop[i1];
|
||||
bool isLeftBigger = leftCellValue > upperCellValue;
|
||||
|
||||
if (isLeftBigger)
|
||||
{
|
||||
rowBottom[i1] = leftCellValue;
|
||||
m_backtrackData[i2 * m_size1 + i1] = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
rowBottom[i1] = upperCellValue;
|
||||
m_backtrackData[i2 * m_size1 + i1] = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Bottom row will become top row
|
||||
std::swap(rowTop, rowBottom);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace pdf
|
||||
|
||||
#endif // PDFALGORITHMLCS_H
|
@ -23,6 +23,7 @@
|
||||
#include "pdfcms.h"
|
||||
#include "pdfcompiler.h"
|
||||
#include "pdfconstants.h"
|
||||
#include "pdfalgorithmlcs.h"
|
||||
|
||||
#include <QtConcurrent/QtConcurrent>
|
||||
|
||||
@ -34,7 +35,7 @@ PDFDiff::PDFDiff(QObject* parent) :
|
||||
m_progress(nullptr),
|
||||
m_leftDocument(nullptr),
|
||||
m_rightDocument(nullptr),
|
||||
m_options(Asynchronous),
|
||||
m_options(Asynchronous | PC_Text | PC_VectorGraphics | PC_Images),
|
||||
m_epsilon(0.0001),
|
||||
m_cancelled(false)
|
||||
{
|
||||
@ -173,6 +174,7 @@ void PDFDiff::stepProgress()
|
||||
struct PDFDiffPageContext
|
||||
{
|
||||
PDFInteger pageIndex = 0;
|
||||
std::array<uint8_t, 64> pageHash = { };
|
||||
PDFPrecompiledPage::GraphicPieceInfos graphicPieces;
|
||||
};
|
||||
|
||||
@ -208,8 +210,11 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std::
|
||||
PDFRenderer renderer(m_leftDocument, &fontCache, cms.data(), &optionalContentActivity, features, pdf::PDFMeshQualitySettings());
|
||||
renderer.compile(&compiledPage, context.pageIndex);
|
||||
|
||||
PDFReal epsilon = calculateEpsilonForPage(m_leftDocument->getCatalog()->getPage(context.pageIndex));
|
||||
context.graphicPieces = compiledPage.calculateGraphicPieceInfos(epsilon);
|
||||
auto page = m_leftDocument->getCatalog()->getPage(context.pageIndex);
|
||||
PDFReal epsilon = calculateEpsilonForPage(page);
|
||||
context.graphicPieces = compiledPage.calculateGraphicPieceInfos(page->getMediaBox(), epsilon);
|
||||
|
||||
finalizeGraphicsPieces(context);
|
||||
};
|
||||
PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, leftPreparedPages.begin(), leftPreparedPages.end(), fillPageContext);
|
||||
stepProgress();
|
||||
@ -233,14 +238,33 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std::
|
||||
PDFRenderer renderer(m_rightDocument, &fontCache, cms.data(), &optionalContentActivity, features, pdf::PDFMeshQualitySettings());
|
||||
renderer.compile(&compiledPage, context.pageIndex);
|
||||
|
||||
PDFReal epsilon = calculateEpsilonForPage(m_leftDocument->getCatalog()->getPage(context.pageIndex));
|
||||
context.graphicPieces = compiledPage.calculateGraphicPieceInfos(epsilon);
|
||||
const PDFPage* page = m_leftDocument->getCatalog()->getPage(context.pageIndex);
|
||||
PDFReal epsilon = calculateEpsilonForPage(page);
|
||||
context.graphicPieces = compiledPage.calculateGraphicPieceInfos(page->getMediaBox(), epsilon);
|
||||
|
||||
finalizeGraphicsPieces(context);
|
||||
};
|
||||
|
||||
PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, rightPreparedPages.begin(), rightPreparedPages.end(), fillPageContext);
|
||||
stepProgress();
|
||||
}
|
||||
|
||||
// StepMatchPages
|
||||
if (!m_cancelled)
|
||||
{
|
||||
// Match pages
|
||||
auto comparePages = [](const PDFDiffPageContext& left, const PDFDiffPageContext& right)
|
||||
{
|
||||
return left.pageHash == right.pageHash;
|
||||
};
|
||||
PDFAlgorithmLongestCommonSubsequence algorithm(leftPreparedPages.cbegin(), leftPreparedPages.cend(),
|
||||
rightPreparedPages.cbegin(), rightPreparedPages.cend(),
|
||||
comparePages);
|
||||
algorithm.perform();
|
||||
|
||||
stepProgress();
|
||||
}
|
||||
|
||||
// StepExtractTextLeftDocument
|
||||
if (!m_cancelled)
|
||||
{
|
||||
@ -266,6 +290,43 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std::
|
||||
}
|
||||
}
|
||||
|
||||
/// Sorts graphic pieces of the page and computes the page hash (SHA-512)
/// from hashes of those pieces, whose type is enabled in current options.
/// Pieces of a type without a matching option check are always included.
/// \param context Page context; graphic pieces are sorted, page hash is written
void PDFDiff::finalizeGraphicsPieces(PDFDiffPageContext& context)
{
    auto& pieces = context.graphicPieces;
    std::sort(pieces.begin(), pieces.end());

    // Returns true, if the piece is of a type, which is switched off in the options
    const auto isIgnored = [this](const PDFPrecompiledPage::GraphicPieceInfo& piece)
    {
        return (piece.isText() && !m_options.testFlag(PC_Text)) ||
               (piece.isVectorGraphics() && !m_options.testFlag(PC_VectorGraphics)) ||
               (piece.isImage() && !m_options.testFlag(PC_Images)) ||
               (piece.isShading() && !m_options.testFlag(PC_Mesh));
    };

    // Page hash is a hash of (sorted) hashes of the pieces
    QCryptographicHash hasher(QCryptographicHash::Sha512);
    hasher.reset();

    for (const PDFPrecompiledPage::GraphicPieceInfo& piece : pieces)
    {
        if (!isIgnored(piece))
        {
            hasher.addData(reinterpret_cast<const char*>(piece.hash.data()), int(piece.hash.size()));
        }
    }

    const QByteArray digest = hasher.result();
    Q_ASSERT(QCryptographicHash::hashLength(QCryptographicHash::Sha512) == 64);

    // Never copy more bytes, than the target array can hold
    const size_t bytesToCopy = qMin<size_t>(digest.length(), context.pageHash.size());
    std::copy(digest.data(), digest.data() + bytesToCopy, context.pageHash.data());
}
|
||||
|
||||
void PDFDiff::onComparationPerformed()
|
||||
{
|
||||
m_cancelled = false;
|
||||
|
@ -31,6 +31,8 @@
|
||||
namespace pdf
|
||||
{
|
||||
|
||||
struct PDFDiffPageContext;
|
||||
|
||||
class PDFDiffResult
|
||||
{
|
||||
public:
|
||||
@ -57,8 +59,12 @@ public:
|
||||
|
||||
enum Option
|
||||
{
|
||||
None = 0x0000,
|
||||
Asynchronous = 0x0001, ///< Compare document asynchronously
|
||||
None = 0x0000,
|
||||
Asynchronous = 0x0001, ///< Compare document asynchronously
|
||||
PC_Text = 0x0002, ///< Use text to compare pages (determine, which pages correspond to each other)
|
||||
PC_VectorGraphics = 0x0004, ///< Use vector graphics to compare pages (determine, which pages correspond to each other)
|
||||
PC_Images = 0x0008, ///< Use images to compare pages (determine, which pages correspond to each other)
|
||||
PC_Mesh = 0x0010, ///< Use mesh to compare pages (determine, which pages correspond to each other)
|
||||
};
|
||||
Q_DECLARE_FLAGS(Options, Option)
|
||||
|
||||
@ -109,6 +115,7 @@ private:
|
||||
{
|
||||
StepExtractContentLeftDocument,
|
||||
StepExtractContentRightDocument,
|
||||
StepMatchPages,
|
||||
StepExtractTextLeftDocument,
|
||||
StepExtractTextRightDocument,
|
||||
StepCompare,
|
||||
@ -119,6 +126,7 @@ private:
|
||||
void stepProgress();
|
||||
void performSteps(const std::vector<PDFInteger>& leftPages,
|
||||
const std::vector<PDFInteger>& rightPages);
|
||||
void finalizeGraphicsPieces(PDFDiffPageContext& context);
|
||||
|
||||
void onComparationPerformed();
|
||||
|
||||
|
@ -832,7 +832,8 @@ void PDFPrecompiledPage::finalize(qint64 compilingTimeNS, QList<PDFRenderError>
|
||||
}
|
||||
}
|
||||
|
||||
PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceInfos(PDFReal epsilon) const
|
||||
PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceInfos(QRectF mediaBox,
|
||||
PDFReal epsilon) const
|
||||
{
|
||||
GraphicPieceInfos infos;
|
||||
|
||||
@ -850,6 +851,8 @@ PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceI
|
||||
}
|
||||
PDFReal factor = 1.0 / epsilon;
|
||||
|
||||
QImage shadingTestImage;
|
||||
|
||||
// Process all instructions
|
||||
for (const Instruction& instruction : m_instructions)
|
||||
{
|
||||
@ -903,33 +906,98 @@ PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceI
|
||||
|
||||
case InstructionType::DrawImage:
|
||||
{
|
||||
/*const ImageData& data = m_images[instruction.dataIndex];
|
||||
const ImageData& data = m_images[instruction.dataIndex];
|
||||
const QImage& image = data.image;
|
||||
|
||||
painter->save();
|
||||
GraphicPieceInfo info;
|
||||
QByteArray serializedPath;
|
||||
|
||||
QMatrix imageTransform(1.0 / image.width(), 0, 0, 1.0 / image.height(), 0, 0);
|
||||
QMatrix worldMatrix = imageTransform * painter->worldMatrix();
|
||||
// Serialize data
|
||||
if (true)
|
||||
{
|
||||
QDataStream stream(&serializedPath, QIODevice::WriteOnly);
|
||||
|
||||
// Jakub Melka: Because Qt uses opposite axis direction than PDF, then we must transform the y-axis
|
||||
// to the opposite (so the image is then unchanged)
|
||||
worldMatrix.translate(0, image.height());
|
||||
worldMatrix.scale(1, -1);
|
||||
// Jakub Melka: serialize image position
|
||||
QMatrix worldMatrix = stateStack.top().matrix;
|
||||
|
||||
painter->setWorldMatrix(worldMatrix);
|
||||
painter->drawImage(0, 0, image);
|
||||
painter->restore();*/
|
||||
QPainterPath pagePath;
|
||||
pagePath.addRect(0, 0, 1, 1);
|
||||
pagePath = worldMatrix.map(pagePath);
|
||||
|
||||
info.type = GraphicPieceInfo::Type::Image;
|
||||
info.boundingRect = pagePath.controlPointRect();
|
||||
|
||||
const int elementCount = pagePath.elementCount();
|
||||
for (int i = 0; i < elementCount; ++i)
|
||||
{
|
||||
QPainterPath::Element element = pagePath.elementAt(i);
|
||||
|
||||
PDFReal roundedX = qRound(element.x * factor);
|
||||
PDFReal roundedY = qRound(element.y * factor);
|
||||
|
||||
stream << roundedX;
|
||||
stream << roundedY;
|
||||
stream << element.type;
|
||||
}
|
||||
|
||||
// serialize image data
|
||||
stream.writeBytes(reinterpret_cast<const char*>(image.bits()), image.sizeInBytes());
|
||||
}
|
||||
|
||||
QByteArray hash = QCryptographicHash::hash(serializedPath, QCryptographicHash::Sha512);
|
||||
Q_ASSERT(QCryptographicHash::hashLength(QCryptographicHash::Sha512) == 64);
|
||||
|
||||
size_t size = qMin<size_t>(hash.length(), info.hash.size());
|
||||
std::copy(hash.data(), hash.data() + size, info.hash.data());
|
||||
|
||||
infos.emplace_back(std::move(info));
|
||||
break;
|
||||
}
|
||||
|
||||
case InstructionType::DrawMesh:
|
||||
{
|
||||
/*const MeshPaintData& data = m_meshes[instruction.dataIndex];
|
||||
const MeshPaintData& data = m_meshes[instruction.dataIndex];
|
||||
|
||||
painter->save();
|
||||
painter->setWorldMatrix(pagePointToDevicePointMatrix);
|
||||
data.mesh.paint(painter, data.alpha);
|
||||
painter->restore();*/
|
||||
if (shadingTestImage.isNull())
|
||||
{
|
||||
QSizeF mediaBoxSize = mediaBox.size();
|
||||
mediaBoxSize = mediaBoxSize.scaled(256, 256, Qt::KeepAspectRatio);
|
||||
QSize imageSize = mediaBoxSize.toSize();
|
||||
shadingTestImage = QImage(imageSize, QImage::Format_ARGB32);
|
||||
}
|
||||
|
||||
shadingTestImage.fill(Qt::transparent);
|
||||
|
||||
QMatrix pagePointToDevicePointMatrix;
|
||||
pagePointToDevicePointMatrix.scale(shadingTestImage.width() / mediaBox.width(), -shadingTestImage.height() / mediaBox.height());
|
||||
|
||||
{
|
||||
QPainter painter(&shadingTestImage);
|
||||
painter.setWorldMatrix(pagePointToDevicePointMatrix);
|
||||
data.mesh.paint(&painter, data.alpha);
|
||||
}
|
||||
|
||||
GraphicPieceInfo info;
|
||||
QByteArray serializedMesh;
|
||||
|
||||
// Serialize data
|
||||
if (true)
|
||||
{
|
||||
QDataStream stream(&serializedMesh, QIODevice::WriteOnly);
|
||||
|
||||
// serialize image data
|
||||
stream.writeBytes(reinterpret_cast<const char*>(shadingTestImage.bits()), shadingTestImage.sizeInBytes());
|
||||
}
|
||||
|
||||
QByteArray hash = QCryptographicHash::hash(serializedMesh, QCryptographicHash::Sha512);
|
||||
Q_ASSERT(QCryptographicHash::hashLength(QCryptographicHash::Sha512) == 64);
|
||||
|
||||
size_t size = qMin<size_t>(hash.length(), info.hash.size());
|
||||
std::copy(hash.data(), hash.data() + size, info.hash.data());
|
||||
|
||||
info.boundingRect = QRectF();
|
||||
info.type = GraphicPieceInfo::Type::Shading;
|
||||
infos.emplace_back(std::move(info));
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -241,9 +241,20 @@ public:
|
||||
Unknown,
|
||||
Text,
|
||||
VectorGraphics,
|
||||
Image
|
||||
Image,
|
||||
Shading
|
||||
};
|
||||
|
||||
bool operator<(const GraphicPieceInfo& other) const
|
||||
{
|
||||
return std::tie(type, hash) < std::tie(other.type, other.hash);
|
||||
}
|
||||
|
||||
bool isText() const { return type == Type::Text; }
|
||||
bool isVectorGraphics() const { return type == Type::VectorGraphics; }
|
||||
bool isImage() const { return type == Type::Image; }
|
||||
bool isShading() const { return type == Type::Shading; }
|
||||
|
||||
Type type = Type::Unknown;
|
||||
QRectF boundingRect;
|
||||
std::array<uint8_t, 64> hash = { };
|
||||
@ -255,8 +266,10 @@ public:
|
||||
/// for example, for comparison purposes. Parameter \p epsilon
|
||||
/// is for numerical precision - values under epsilon are considered
|
||||
/// as equal.
|
||||
/// \param mediaBox Page's media box
|
||||
/// \param epsilon Epsilon
|
||||
GraphicPieceInfos calculateGraphicPieceInfos(PDFReal epsilon) const;
|
||||
GraphicPieceInfos calculateGraphicPieceInfos(QRectF mediaBox,
|
||||
PDFReal epsilon) const;
|
||||
|
||||
private:
|
||||
struct PathPaintData
|
||||
|
Loading…
Reference in New Issue
Block a user