DocDiff application: page matching

This commit is contained in:
Jakub Melka 2021-09-11 18:17:38 +02:00
parent 7de0fdad3c
commit 050ba869f3
5 changed files with 304 additions and 25 deletions

View File

@ -23,12 +23,29 @@
namespace pdf namespace pdf
{ {
/// Non-template base of the longest common subsequence algorithm. It holds
/// the result types, so they can be used without knowing the iterator
/// and comparator types of the algorithm itself.
class PDFAlgorithmLongestCommonSubsequenceBase
{
public:
    /// Single item of the resulting sequence. It refers to an item of the first
    /// (left) sequence, to an item of the second (right) sequence, or to both.
    /// Index equal to maximal value of size_t means, that the side is not set.
    struct SequenceItem
    {
        size_t index1 = std::numeric_limits<size_t>::max();
        size_t index2 = std::numeric_limits<size_t>::max();

        /// Returns true, if index to the left sequence is set
        bool isLeftValid() const
        {
            return std::numeric_limits<size_t>::max() != index1;
        }

        /// Returns true, if index to the right sequence is set
        bool isRightValid() const
        {
            return std::numeric_limits<size_t>::max() != index2;
        }

        /// Returns true, if only index to the left sequence is set
        bool isLeft() const
        {
            if (isRightValid())
            {
                return false;
            }

            return isLeftValid();
        }

        /// Returns true, if only index to the right sequence is set
        bool isRight() const
        {
            if (isLeftValid())
            {
                return false;
            }

            return isRightValid();
        }

        /// Returns true, if both indices are set
        bool isMatch() const
        {
            if (!isLeftValid())
            {
                return false;
            }

            return isRightValid();
        }
    };

    using Sequence = std::vector<SequenceItem>;
};
/// Algorithm for computing longest common subsequence, on two sequences /// Algorithm for computing longest common subsequence, on two sequences
/// of objects, which are implementing operator "==" (equal operator). /// of objects, which are implementing operator "==" (equal operator).
/// Constructor takes bidirectional iterators to the sequence. So, iterators /// Constructor takes bidirectional iterators to the sequence. So, iterators
/// are required to be bidirectional. /// are required to be bidirectional.
template<typename Iterator, typename Comparator> template<typename Iterator, typename Comparator>
class PDFAlgorithmLongestCommonSubsequence class PDFAlgorithmLongestCommonSubsequence : public PDFAlgorithmLongestCommonSubsequenceBase
{ {
public: public:
PDFAlgorithmLongestCommonSubsequence(Iterator it1, PDFAlgorithmLongestCommonSubsequence(Iterator it1,
@ -37,16 +54,6 @@ public:
Iterator it2End, Iterator it2End,
Comparator comparator); Comparator comparator);
/// Single item of the resulting sequence. It refers to an item of the first
/// (left) sequence, to an item of the second (right) sequence, or to both.
/// Index equal to maximal value of size_t means, that the side is not set.
struct SequenceItem
{
    size_t index1 = std::numeric_limits<size_t>::max();
    size_t index2 = std::numeric_limits<size_t>::max();

    // An index is valid when it differs from the "not set" sentinel. Comparing
    // with "==" here would report default constructed items as matches.

    /// Returns true, if index to the left sequence is set
    bool isLeftValid() const { return index1 != std::numeric_limits<size_t>::max(); }

    /// Returns true, if index to the right sequence is set
    bool isRightValid() const { return index2 != std::numeric_limits<size_t>::max(); }

    /// Returns true, if only index to the left sequence is set
    bool isLeft() const { return isLeftValid() && !isRightValid(); }

    /// Returns true, if only index to the right sequence is set
    bool isRight() const { return isRightValid() && !isLeftValid(); }

    /// Returns true, if both indices are set
    bool isMatch() const { return isLeftValid() && isRightValid(); }
};

using Sequence = std::vector<SequenceItem>;
void perform(); void perform();
@ -92,6 +99,7 @@ template<typename Iterator, typename Comparator>
void PDFAlgorithmLongestCommonSubsequence<Iterator, Comparator>::perform() void PDFAlgorithmLongestCommonSubsequence<Iterator, Comparator>::perform()
{ {
m_backtrackData.resize(m_matrixSize); m_backtrackData.resize(m_matrixSize);
m_sequence.clear();
std::vector<size_t> rowTop(m_size1, size_t()); std::vector<size_t> rowTop(m_size1, size_t());
std::vector<size_t> rowBottom(m_size1, size_t()); std::vector<size_t> rowBottom(m_size1, size_t());

View File

@ -30,13 +30,35 @@
namespace pdf namespace pdf
{ {
/// Collection of static helper functions used when two documents are compared.
class PDFDiffHelper
{
public:
    using GraphicPieceInfo = PDFPrecompiledPage::GraphicPieceInfo;
    using GraphicPieceInfos = PDFPrecompiledPage::GraphicPieceInfos;
    using PageSequence = PDFAlgorithmLongestCommonSubsequenceBase::Sequence;

    /// Graphic pieces, which have no counterpart on the other side
    struct Differences
    {
        GraphicPieceInfos left;
        GraphicPieceInfos right;

        /// Returns true, if neither side contains any graphic piece
        bool isEmpty() const
        {
            if (!left.empty())
            {
                return false;
            }

            return right.empty();
        }
    };

    /// Calculates graphic pieces of \p left missing in \p right and vice versa.
    /// Pieces are compared exactly at first, remaining pieces are then compared
    /// approximately using \p epsilon as distance tolerance of path elements.
    static Differences calculateDifferences(const GraphicPieceInfos& left,
                                            const GraphicPieceInfos& right,
                                            PDFReal epsilon);

    /// Returns left indices of sequence items, which have only left index set
    static std::vector<size_t> getLeftUnmatched(const PageSequence& sequence);

    /// Returns right indices of sequence items, which have only right index set
    static std::vector<size_t> getRightUnmatched(const PageSequence& sequence);

    /// Turns left-only item with index \p leftPage into a match with \p rightPage
    /// and removes right-only items with index \p rightPage from the sequence.
    static void matchPage(PageSequence& sequence,
                          size_t leftPage,
                          size_t rightPage);
};
PDFDiff::PDFDiff(QObject* parent) : PDFDiff::PDFDiff(QObject* parent) :
BaseClass(parent), BaseClass(parent),
m_progress(nullptr), m_progress(nullptr),
m_leftDocument(nullptr), m_leftDocument(nullptr),
m_rightDocument(nullptr), m_rightDocument(nullptr),
m_options(Asynchronous | PC_Text | PC_VectorGraphics | PC_Images), m_options(Asynchronous | PC_Text | PC_VectorGraphics | PC_Images),
m_epsilon(0.0001), m_epsilon(0.001),
m_cancelled(false) m_cancelled(false)
{ {
@ -178,11 +200,107 @@ struct PDFDiffPageContext
PDFPrecompiledPage::GraphicPieceInfos graphicPieces; PDFPrecompiledPage::GraphicPieceInfos graphicPieces;
}; };
/// Matches prepared pages of the left document to prepared pages of the right
/// document. Result is stored into \p pageSequence.
/// \param leftPreparedPages Prepared pages of the left document
/// \param rightPreparedPages Prepared pages of the right document
/// \param[out] pageSequence Resulting sequence of matched/unmatched pages
void PDFDiff::performPageMatching(const std::vector<PDFDiffPageContext>& leftPreparedPages,
                                  const std::vector<PDFDiffPageContext>& rightPreparedPages,
                                  PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence)
{
    // Match pages. We will use following algorithm: exact solution can fail, because
    // we are using hashes and due to numerical instability, hashes can be different
    // even for exactly the same page. But if hashes are the same, the page must be the same.
    // So, we use longest common subsequence algorithm to detect same page ranges,
    // and then we match the rest. We assume the number of failing pages is relatively small.

    // Additional matches (left page index -> right page index) found by the
    // approximate comparison. Comparator below reads this map by reference,
    // so matches added later are visible in the second run of the algorithm.
    std::map<size_t, size_t> pageMatches;

    auto comparePages = [&](const PDFDiffPageContext& left, const PDFDiffPageContext& right)
    {
        if (left.pageHash == right.pageHash)
        {
            return true;
        }

        auto it = pageMatches.find(left.pageIndex);
        if (it != pageMatches.cend())
        {
            return it->second == right.pageIndex;
        }

        return false;
    };

    PDFAlgorithmLongestCommonSubsequence algorithm(leftPreparedPages.cbegin(), leftPreparedPages.cend(),
                                                   rightPreparedPages.cbegin(), rightPreparedPages.cend(),
                                                   comparePages);
    algorithm.perform();
    pageSequence = algorithm.getSequence();

    std::vector<size_t> leftUnmatched = PDFDiffHelper::getLeftUnmatched(pageSequence);
    std::vector<size_t> rightUnmatched = PDFDiffHelper::getRightUnmatched(pageSequence);

    // We are matching left pages to the right ones. All keys are created here,
    // before the matching callback is executed, so the callback never changes
    // the structure of the map.
    std::map<size_t, std::vector<size_t>> matchedPages;
    for (const size_t index : leftUnmatched)
    {
        matchedPages[index] = std::vector<size_t>();
    }

    auto matchLeftPage = [&, this](size_t leftIndex)
    {
        const PDFDiffPageContext& leftPageContext = leftPreparedPages[leftIndex];

        auto page = m_leftDocument->getCatalog()->getPage(leftPageContext.pageIndex);
        PDFReal epsilon = calculateEpsilonForPage(page);

        // NOTE(review): the callback is presumably executed concurrently for
        // different left pages - confirm against PDFExecutionPolicy. Function
        // at() is used instead of operator[], because operator[] of std::map
        // is not treated as a const access for the purpose of data races.
        std::vector<size_t>& candidates = matchedPages.at(leftIndex);

        for (const size_t rightIndex : rightUnmatched)
        {
            const PDFDiffPageContext& rightPageContext = rightPreparedPages[rightIndex];
            if (leftPageContext.graphicPieces.size() != rightPageContext.graphicPieces.size())
            {
                // Match cannot exist, graphic pieces have different size
                continue;
            }

            PDFDiffHelper::Differences differences = PDFDiffHelper::calculateDifferences(leftPageContext.graphicPieces, rightPageContext.graphicPieces, epsilon);

            if (differences.isEmpty())
            {
                // Jakub Melka: we have a match
                candidates.push_back(rightIndex);
            }
        }
    };

    PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, leftUnmatched.begin(), leftUnmatched.end(), matchLeftPage);

    // Assign each left page to the first candidate right page, which is not
    // assigned yet. Each left page takes at most one right page - without
    // stopping after the first assignment, single left page would consume
    // all its candidates and other left pages could stay unmatched.
    std::set<size_t> matchedRightPages;
    for (const auto& matchedPage : matchedPages)
    {
        for (const size_t rightContextIndex : matchedPage.second)
        {
            if (matchedRightPages.insert(rightContextIndex).second)
            {
                const PDFDiffPageContext& leftPageContext = leftPreparedPages[matchedPage.first];
                const PDFDiffPageContext& rightPageContext = rightPreparedPages[rightContextIndex];
                pageMatches[leftPageContext.pageIndex] = rightPageContext.pageIndex;
                break;
            }
        }
    }

    // Run the algorithm again, if some additional matches were found
    if (!pageMatches.empty())
    {
        algorithm.perform();
        pageSequence = algorithm.getSequence();
    }
}
void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std::vector<PDFInteger>& rightPages) void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std::vector<PDFInteger>& rightPages)
{ {
std::vector<PDFDiffPageContext> leftPreparedPages; std::vector<PDFDiffPageContext> leftPreparedPages;
std::vector<PDFDiffPageContext> rightPreparedPages; std::vector<PDFDiffPageContext> rightPreparedPages;
PDFDiffHelper::PageSequence pageSequence;
auto createDiffPageContext = [](auto pageIndex) auto createDiffPageContext = [](auto pageIndex)
{ {
PDFDiffPageContext context; PDFDiffPageContext context;
@ -252,16 +370,7 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std::
// StepMatchPages // StepMatchPages
if (!m_cancelled) if (!m_cancelled)
{ {
// Match pages performPageMatching(leftPreparedPages, rightPreparedPages, pageSequence);
auto comparePages = [](const PDFDiffPageContext& left, const PDFDiffPageContext& right)
{
return left.pageHash == right.pageHash;
};
PDFAlgorithmLongestCommonSubsequence algorithm(leftPreparedPages.cbegin(), leftPreparedPages.cend(),
rightPreparedPages.cbegin(), rightPreparedPages.cend(),
comparePages);
algorithm.perform();
stepProgress(); stepProgress();
} }
@ -353,4 +462,150 @@ PDFDiffResult::PDFDiffResult() :
} }
/// Calculates differences between two sets of graphic pieces. Both inputs
/// must be sorted (this is asserted). Pieces are compared exactly at first;
/// pieces without exact counterpart are then compared approximately.
/// \param left Sorted graphic pieces of the left page
/// \param right Sorted graphic pieces of the right page
/// \param epsilon Maximal distance of corresponding path elements, for which
///        are two pieces still considered to be the same
/// \returns Graphic pieces, which have no counterpart on the other side
PDFDiffHelper::Differences PDFDiffHelper::calculateDifferences(const GraphicPieceInfos& left,
                                                               const GraphicPieceInfos& right,
                                                               PDFReal epsilon)
{
    Differences differences;

    Q_ASSERT(std::is_sorted(left.cbegin(), left.cend()));
    Q_ASSERT(std::is_sorted(right.cbegin(), right.cend()));

    // Exact comparison: collect pieces, which are missing on the other side
    for (const GraphicPieceInfo& info : left)
    {
        if (!std::binary_search(right.cbegin(), right.cend(), info))
        {
            differences.left.push_back(info);
        }
    }

    for (const GraphicPieceInfo& info : right)
    {
        if (!std::binary_search(left.cbegin(), left.cend(), info))
        {
            differences.right.push_back(info);
        }
    }

    const PDFReal epsilonSquared = epsilon * epsilon;

    // Heuristically compares two pieces: they must have the same type, intersecting
    // bounding rectangles, the same number of path elements, the same image hash
    // (images only), and corresponding path elements must have the same type and
    // must be closer to each other than epsilon.
    auto isSimilar = [epsilonSquared](const GraphicPieceInfo& leftInfo, const GraphicPieceInfo& rightInfo)
    {
        if (leftInfo.type != rightInfo.type || !leftInfo.boundingRect.intersects(rightInfo.boundingRect))
        {
            return false;
        }

        const int elementCount = leftInfo.pagePath.elementCount();
        if (elementCount != rightInfo.pagePath.elementCount())
        {
            return false;
        }

        if (leftInfo.type == GraphicPieceInfo::Type::Image && !(leftInfo.imageHash == rightInfo.imageHash))
        {
            return false;
        }

        for (int i = 0; i < elementCount; ++i)
        {
            const QPainterPath::Element leftElement = leftInfo.pagePath.elementAt(i);
            const QPainterPath::Element rightElement = rightInfo.pagePath.elementAt(i);

            const PDFReal diffX = leftElement.x - rightElement.x;
            const PDFReal diffY = leftElement.y - rightElement.y;
            const PDFReal squaredDistance = diffX * diffX + diffY * diffY;

            if (leftElement.type != rightElement.type || !(squaredDistance < epsilonSquared))
            {
                return false;
            }
        }

        return true;
    };

    // If exact match fails, then try to use match with epsilon. For each
    // item in left, we try to find matching item in right.
    for (auto it = differences.left.begin(); it != differences.left.end();)
    {
        bool hasMatch = false;

        for (auto it2 = differences.right.begin(); it2 != differences.right.end(); ++it2)
        {
            if (isSimilar(*it, *it2))
            {
                // Pieces are matched one to one, so we stop at the first match.
                // Continuing the search could remove more right pieces for single
                // left piece, or forget the match found, when later candidate fails.
                differences.right.erase(it2);
                hasMatch = true;
                break;
            }
        }

        if (hasMatch)
        {
            it = differences.left.erase(it);
        }
        else
        {
            ++it;
        }
    }

    return differences;
}
std::vector<size_t> PDFDiffHelper::getLeftUnmatched(const PageSequence& sequence)
{
std::vector<size_t> result;
for (const auto& item : sequence)
{
if (item.isLeft())
{
result.push_back(item.index1);
}
}
return result;
}
std::vector<size_t> PDFDiffHelper::getRightUnmatched(const PageSequence& sequence)
{
std::vector<size_t> result;
for (const auto& item : sequence)
{
if (item.isRight())
{
result.push_back(item.index2);
}
}
return result;
}
/// Marks left page \p leftPage as matched to the right page \p rightPage.
/// Items, which have only left index equal to \p leftPage, get right index
/// \p rightPage. Items, which have only right index equal to \p rightPage,
/// are removed from the sequence.
void PDFDiffHelper::matchPage(PageSequence& sequence,
                              size_t leftPage,
                              size_t rightPage)
{
    // First pass: turn left-only items of the given page into matches. Such
    // items have left index set, so the second pass never removes them.
    for (auto& entry : sequence)
    {
        if (entry.isLeft() && entry.index1 == leftPage)
        {
            entry.index2 = rightPage;
        }
    }

    // Second pass: remove right-only items of the given page
    auto isObsolete = [rightPage](const auto& entry)
    {
        return entry.isRight() && entry.index2 == rightPage;
    };
    sequence.erase(std::remove_if(sequence.begin(), sequence.end(), isObsolete), sequence.end());
}
} // namespace pdf } // namespace pdf

View File

@ -21,6 +21,7 @@
#include "pdfdocument.h" #include "pdfdocument.h"
#include "pdfprogress.h" #include "pdfprogress.h"
#include "pdfutils.h" #include "pdfutils.h"
#include "pdfalgorithmlcs.h"
#include <QObject> #include <QObject>
#include <QFuture> #include <QFuture>
@ -126,6 +127,9 @@ private:
void stepProgress(); void stepProgress();
void performSteps(const std::vector<PDFInteger>& leftPages, void performSteps(const std::vector<PDFInteger>& leftPages,
const std::vector<PDFInteger>& rightPages); const std::vector<PDFInteger>& rightPages);
/// Matches prepared pages of the left document to prepared pages of the
/// right document and stores the resulting sequence into \p pageSequence.
/// \param leftPreparedPages Prepared pages of the left document
/// \param rightPreparedPages Prepared pages of the right document
/// \param[out] pageSequence Resulting sequence of page items
void performPageMatching(const std::vector<PDFDiffPageContext>& leftPreparedPages,
const std::vector<PDFDiffPageContext>& rightPreparedPages,
PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence);
void finalizeGraphicsPieces(PDFDiffPageContext& context); void finalizeGraphicsPieces(PDFDiffPageContext& context);
void onComparationPerformed(); void onComparationPerformed();

View File

@ -879,14 +879,15 @@ PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceI
info.type = data.isText ? GraphicPieceInfo::Type::Text : GraphicPieceInfo::Type::VectorGraphics; info.type = data.isText ? GraphicPieceInfo::Type::Text : GraphicPieceInfo::Type::VectorGraphics;
info.boundingRect = pagePath.controlPointRect(); info.boundingRect = pagePath.controlPointRect();
info.pagePath = pagePath;
const int elementCount = pagePath.elementCount(); const int elementCount = pagePath.elementCount();
for (int i = 0; i < elementCount; ++i) for (int i = 0; i < elementCount; ++i)
{ {
QPainterPath::Element element = pagePath.elementAt(i); QPainterPath::Element element = pagePath.elementAt(i);
PDFReal roundedX = qRound(element.x * factor); PDFReal roundedX = qFloor(element.x * factor);
PDFReal roundedY = qRound(element.y * factor); PDFReal roundedY = qFloor(element.y * factor);
stream << roundedX; stream << roundedX;
stream << roundedY; stream << roundedY;
@ -911,11 +912,13 @@ PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceI
GraphicPieceInfo info; GraphicPieceInfo info;
QByteArray serializedPath; QByteArray serializedPath;
QByteArray serializedImage;
// Serialize data // Serialize data
if (true) if (true)
{ {
QDataStream stream(&serializedPath, QIODevice::WriteOnly); QDataStream stream(&serializedPath, QIODevice::WriteOnly);
QDataStream streamImage(&serializedImage, QIODevice::WriteOnly);
// Jakub Melka: serialize image position // Jakub Melka: serialize image position
QMatrix worldMatrix = stateStack.top().matrix; QMatrix worldMatrix = stateStack.top().matrix;
@ -926,6 +929,7 @@ PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceI
info.type = GraphicPieceInfo::Type::Image; info.type = GraphicPieceInfo::Type::Image;
info.boundingRect = pagePath.controlPointRect(); info.boundingRect = pagePath.controlPointRect();
info.pagePath = pagePath;
const int elementCount = pagePath.elementCount(); const int elementCount = pagePath.elementCount();
for (int i = 0; i < elementCount; ++i) for (int i = 0; i < elementCount; ++i)
@ -942,14 +946,20 @@ PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceI
// serialize image data // serialize image data
stream.writeBytes(reinterpret_cast<const char*>(image.bits()), image.sizeInBytes()); stream.writeBytes(reinterpret_cast<const char*>(image.bits()), image.sizeInBytes());
streamImage.writeBytes(reinterpret_cast<const char*>(image.bits()), image.sizeInBytes());
} }
QByteArray hash = QCryptographicHash::hash(serializedPath, QCryptographicHash::Sha512); QByteArray hash = QCryptographicHash::hash(serializedPath, QCryptographicHash::Sha512);
Q_ASSERT(QCryptographicHash::hashLength(QCryptographicHash::Sha512) == 64); Q_ASSERT(QCryptographicHash::hashLength(QCryptographicHash::Sha512) == 64);
QByteArray imageHash = QCryptographicHash::hash(serializedImage, QCryptographicHash::Sha512);
size_t size = qMin<size_t>(hash.length(), info.hash.size()); size_t size = qMin<size_t>(hash.length(), info.hash.size());
std::copy(hash.data(), hash.data() + size, info.hash.data()); std::copy(hash.data(), hash.data() + size, info.hash.data());
size_t sizeImage = qMin<size_t>(imageHash.length(), info.imageHash.size());
std::copy(imageHash.data(), imageHash.data() + sizeImage, info.imageHash.data());
infos.emplace_back(std::move(info)); infos.emplace_back(std::move(info));
break; break;
} }

View File

@ -257,7 +257,9 @@ public:
Type type = Type::Unknown; Type type = Type::Unknown;
QRectF boundingRect; QRectF boundingRect;
std::array<uint8_t, 64> hash = { }; std::array<uint8_t, 64> hash = { }; ///< Hash of all data
std::array<uint8_t, 64> imageHash = { }; ///< Hash of the image only
QPainterPath pagePath;
}; };
using GraphicPieceInfos = std::vector<GraphicPieceInfo>; using GraphicPieceInfos = std::vector<GraphicPieceInfo>;