mirror of https://github.com/JakubMelka/PDF4QT.git
DocDiff application: page matching
This commit is contained in:
parent
7de0fdad3c
commit
050ba869f3
|
@ -23,12 +23,29 @@
|
|||
namespace pdf
|
||||
{
|
||||
|
||||
/// Non-template base of the longest common subsequence algorithm. It only
/// declares the result types, so the resulting sequence can be passed around
/// without knowing the iterator and comparator types of the algorithm.
class PDFAlgorithmLongestCommonSubsequenceBase
{
public:
    /// Single item of the resulting sequence. An index equal to the maximal
    /// value of size_t marks the corresponding side as not set.
    struct SequenceItem
    {
        size_t index1 = std::numeric_limits<size_t>::max();
        size_t index2 = std::numeric_limits<size_t>::max();

        /// Returns true, if the left index is set
        bool isLeftValid() const
        {
            return !(index1 == std::numeric_limits<size_t>::max());
        }

        /// Returns true, if the right index is set
        bool isRightValid() const
        {
            return !(index2 == std::numeric_limits<size_t>::max());
        }

        /// Returns true, if only the left index is set
        bool isLeft() const
        {
            return isRightValid() ? false : isLeftValid();
        }

        /// Returns true, if only the right index is set
        bool isRight() const
        {
            return isLeftValid() ? false : isRightValid();
        }

        /// Returns true, if both indices are set
        bool isMatch() const
        {
            return isLeftValid() ? isRightValid() : false;
        }
    };

    using Sequence = std::vector<SequenceItem>;
};
|
||||
|
||||
/// Algorithm for computing longest common subsequence, on two sequences
|
||||
/// of objects, which are implementing operator "==" (equal operator).
|
||||
/// Constructor takes bidirectional iterators to the sequence. So, iterators
|
||||
/// are required to be bidirectional.
|
||||
template<typename Iterator, typename Comparator>
|
||||
class PDFAlgorithmLongestCommonSubsequence
|
||||
class PDFAlgorithmLongestCommonSubsequence : public PDFAlgorithmLongestCommonSubsequenceBase
|
||||
{
|
||||
public:
|
||||
PDFAlgorithmLongestCommonSubsequence(Iterator it1,
|
||||
|
@ -37,16 +54,6 @@ public:
|
|||
Iterator it2End,
|
||||
Comparator comparator);
|
||||
|
||||
/// Single item of the resulting sequence. An index equal to the maximal
/// value of size_t marks the corresponding side as not set.
struct SequenceItem
{
    size_t index1 = std::numeric_limits<size_t>::max();
    size_t index2 = std::numeric_limits<size_t>::max();

    /// Returns true, if the left index is set. The sentinel value means
    /// "not set", so the index is valid when it differs from the sentinel.
    bool isLeftValid() const { return index1 != std::numeric_limits<size_t>::max(); }

    /// Returns true, if the right index is set (see isLeftValid)
    bool isRightValid() const { return index2 != std::numeric_limits<size_t>::max(); }

    /// Returns true, if only the left index is set
    bool isLeft() const { return isLeftValid() && !isRightValid(); }

    /// Returns true, if only the right index is set
    bool isRight() const { return isRightValid() && !isLeftValid(); }

    /// Returns true, if both indices are set
    bool isMatch() const { return isLeftValid() && isRightValid(); }
};
|
||||
using Sequence = std::vector<SequenceItem>;
|
||||
|
||||
void perform();
|
||||
|
||||
|
@ -92,6 +99,7 @@ template<typename Iterator, typename Comparator>
|
|||
void PDFAlgorithmLongestCommonSubsequence<Iterator, Comparator>::perform()
|
||||
{
|
||||
m_backtrackData.resize(m_matrixSize);
|
||||
m_sequence.clear();
|
||||
|
||||
std::vector<size_t> rowTop(m_size1, size_t());
|
||||
std::vector<size_t> rowBottom(m_size1, size_t());
|
||||
|
|
|
@ -30,13 +30,35 @@
|
|||
namespace pdf
|
||||
{
|
||||
|
||||
class PDFDiffHelper
|
||||
{
|
||||
public:
|
||||
using GraphicPieceInfo = PDFPrecompiledPage::GraphicPieceInfo;
|
||||
using GraphicPieceInfos = PDFPrecompiledPage::GraphicPieceInfos;
|
||||
using PageSequence = PDFAlgorithmLongestCommonSubsequenceBase::Sequence;
|
||||
|
||||
|
||||
struct Differences
|
||||
{
|
||||
GraphicPieceInfos left;
|
||||
GraphicPieceInfos right;
|
||||
|
||||
bool isEmpty() const { return left.empty() && right.empty(); }
|
||||
};
|
||||
|
||||
static Differences calculateDifferences(const GraphicPieceInfos& left, const GraphicPieceInfos& right, PDFReal epsilon);
|
||||
static std::vector<size_t> getLeftUnmatched(const PageSequence& sequence);
|
||||
static std::vector<size_t> getRightUnmatched(const PageSequence& sequence);
|
||||
static void matchPage(PageSequence& sequence, size_t leftPage, size_t rightPage);
|
||||
};
|
||||
|
||||
PDFDiff::PDFDiff(QObject* parent) :
|
||||
BaseClass(parent),
|
||||
m_progress(nullptr),
|
||||
m_leftDocument(nullptr),
|
||||
m_rightDocument(nullptr),
|
||||
m_options(Asynchronous | PC_Text | PC_VectorGraphics | PC_Images),
|
||||
m_epsilon(0.0001),
|
||||
m_epsilon(0.001),
|
||||
m_cancelled(false)
|
||||
{
|
||||
|
||||
|
@ -178,11 +200,107 @@ struct PDFDiffPageContext
|
|||
PDFPrecompiledPage::GraphicPieceInfos graphicPieces;
|
||||
};
|
||||
|
||||
void PDFDiff::performPageMatching(const std::vector<PDFDiffPageContext>& leftPreparedPages,
|
||||
const std::vector<PDFDiffPageContext>& rightPreparedPages,
|
||||
PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence)
|
||||
{
|
||||
// Match pages. We will use following algorithm: exact solution can fail, because
|
||||
// we are using hashes and due to numerical instability, hashes can be different
|
||||
// even for exactly the same page. But if hashes are the same, the page must be the same.
|
||||
// So, we use longest common subsequence algorithm to detect same page ranges,
|
||||
// and then we match the rest. We assume the number of failing pages is relatively small.
|
||||
|
||||
std::map<size_t, size_t> pageMatches;
|
||||
auto comparePages = [&](const PDFDiffPageContext& left, const PDFDiffPageContext& right)
|
||||
{
|
||||
if (left.pageHash == right.pageHash)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
auto it = pageMatches.find(left.pageIndex);
|
||||
if (it != pageMatches.cend())
|
||||
{
|
||||
return it->second == right.pageIndex;
|
||||
}
|
||||
|
||||
return false;
|
||||
};
|
||||
PDFAlgorithmLongestCommonSubsequence algorithm(leftPreparedPages.cbegin(), leftPreparedPages.cend(),
|
||||
rightPreparedPages.cbegin(), rightPreparedPages.cend(),
|
||||
comparePages);
|
||||
algorithm.perform();
|
||||
pageSequence = algorithm.getSequence();
|
||||
|
||||
std::vector<size_t> leftUnmatched = PDFDiffHelper::getLeftUnmatched(pageSequence);
|
||||
std::vector<size_t> rightUnmatched = PDFDiffHelper::getRightUnmatched(pageSequence);
|
||||
|
||||
// We are matching left pages to the right ones
|
||||
std::map<size_t, std::vector<size_t>> matchedPages;
|
||||
|
||||
for (const size_t index : leftUnmatched)
|
||||
{
|
||||
matchedPages[index] = std::vector<size_t>();
|
||||
}
|
||||
|
||||
auto matchLeftPage = [&, this](size_t leftIndex)
|
||||
{
|
||||
const PDFDiffPageContext& leftPageContext = leftPreparedPages[leftIndex];
|
||||
|
||||
auto page = m_leftDocument->getCatalog()->getPage(leftPageContext.pageIndex);
|
||||
PDFReal epsilon = calculateEpsilonForPage(page);
|
||||
|
||||
for (const size_t rightIndex : rightUnmatched)
|
||||
{
|
||||
const PDFDiffPageContext& rightPageContext = rightPreparedPages[rightIndex];
|
||||
if (leftPageContext.graphicPieces.size() != rightPageContext.graphicPieces.size())
|
||||
{
|
||||
// Match cannot exist, graphic pieces have different size
|
||||
continue;
|
||||
}
|
||||
|
||||
PDFDiffHelper::Differences differences = PDFDiffHelper::calculateDifferences(leftPageContext.graphicPieces, rightPageContext.graphicPieces, epsilon);
|
||||
|
||||
if (differences.isEmpty())
|
||||
{
|
||||
// Jakub Melka: we have a match
|
||||
matchedPages[leftIndex].push_back(rightIndex);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, leftUnmatched.begin(), leftUnmatched.end(), matchLeftPage);
|
||||
|
||||
std::set<size_t> matchedRightPages;
|
||||
for (const auto& matchedPage : matchedPages)
|
||||
{
|
||||
for (size_t rightContextIndex : matchedPage.second)
|
||||
{
|
||||
if (!matchedRightPages.count(rightContextIndex))
|
||||
{
|
||||
matchedRightPages.insert(rightContextIndex);
|
||||
const PDFDiffPageContext& leftPageContext = leftPreparedPages[matchedPage.first];
|
||||
const PDFDiffPageContext& rightPageContext = rightPreparedPages[rightContextIndex];
|
||||
|
||||
pageMatches[leftPageContext.pageIndex] = rightPageContext.pageIndex;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!pageMatches.empty())
|
||||
{
|
||||
algorithm.perform();
|
||||
pageSequence = algorithm.getSequence();
|
||||
}
|
||||
}
|
||||
|
||||
void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std::vector<PDFInteger>& rightPages)
|
||||
{
|
||||
std::vector<PDFDiffPageContext> leftPreparedPages;
|
||||
std::vector<PDFDiffPageContext> rightPreparedPages;
|
||||
|
||||
PDFDiffHelper::PageSequence pageSequence;
|
||||
|
||||
auto createDiffPageContext = [](auto pageIndex)
|
||||
{
|
||||
PDFDiffPageContext context;
|
||||
|
@ -252,16 +370,7 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std::
|
|||
// StepMatchPages
|
||||
if (!m_cancelled)
|
||||
{
|
||||
// Match pages
|
||||
auto comparePages = [](const PDFDiffPageContext& left, const PDFDiffPageContext& right)
|
||||
{
|
||||
return left.pageHash == right.pageHash;
|
||||
};
|
||||
PDFAlgorithmLongestCommonSubsequence algorithm(leftPreparedPages.cbegin(), leftPreparedPages.cend(),
|
||||
rightPreparedPages.cbegin(), rightPreparedPages.cend(),
|
||||
comparePages);
|
||||
algorithm.perform();
|
||||
|
||||
performPageMatching(leftPreparedPages, rightPreparedPages, pageSequence);
|
||||
stepProgress();
|
||||
}
|
||||
|
||||
|
@ -353,4 +462,150 @@ PDFDiffResult::PDFDiffResult() :
|
|||
|
||||
}
|
||||
|
||||
/// Calculates differences between graphic pieces of two pages. Both inputs
/// must be sorted. First, pieces without an exactly equal counterpart
/// on the other page are collected. Then each such left piece is paired
/// with at most one remaining right piece of the same type, with intersecting
/// bounding rectangle, equal number of path elements, the same element types
/// and element points closer than \p epsilon (images must also have equal
/// image hash). Paired pieces are removed from the result.
/// \param left Sorted graphic pieces of the left page
/// \param right Sorted graphic pieces of the right page
/// \param epsilon Maximal distance of corresponding path points
/// \returns Pieces of both pages, for which no counterpart was found
PDFDiffHelper::Differences PDFDiffHelper::calculateDifferences(const GraphicPieceInfos& left,
                                                               const GraphicPieceInfos& right,
                                                               PDFReal epsilon)
{
    Differences differences;

    // Binary search below requires sorted ranges
    Q_ASSERT(std::is_sorted(left.cbegin(), left.cend()));
    Q_ASSERT(std::is_sorted(right.cbegin(), right.cend()));

    for (const GraphicPieceInfo& info : left)
    {
        if (!std::binary_search(right.cbegin(), right.cend(), info))
        {
            differences.left.push_back(info);
        }
    }

    for (const GraphicPieceInfo& info : right)
    {
        if (!std::binary_search(left.cbegin(), left.cend(), info))
        {
            differences.right.push_back(info);
        }
    }

    // Squared distances are compared, so no square root is needed
    const PDFReal epsilonSquared = epsilon * epsilon;

    // If exact match fails, then try to use match with epsilon. For each
    // item in left, we try to find matching item in right.
    for (auto it = differences.left.begin(); it != differences.left.end();)
    {
        bool hasMatch = false;

        const GraphicPieceInfo& leftInfo = *it;
        const int leftElementCount = leftInfo.pagePath.elementCount();

        for (auto it2 = differences.right.begin(); it2 != differences.right.end(); ++it2)
        {
            // Heuristically compare these items

            const GraphicPieceInfo& rightInfo = *it2;
            if (leftInfo.type != rightInfo.type || !leftInfo.boundingRect.intersects(rightInfo.boundingRect))
            {
                continue;
            }

            if (leftElementCount != rightInfo.pagePath.elementCount())
            {
                continue;
            }

            // Result of this candidate only. It is kept separate from hasMatch,
            // so a failing candidate cannot discard a match found earlier.
            bool isCandidateMatch = (leftInfo.type != GraphicPieceInfo::Type::Image) || (leftInfo.imageHash == rightInfo.imageHash);
            for (int i = 0; i < leftElementCount && isCandidateMatch; ++i)
            {
                QPainterPath::Element leftElement = leftInfo.pagePath.elementAt(i);
                QPainterPath::Element rightElement = rightInfo.pagePath.elementAt(i);

                PDFReal diffX = leftElement.x - rightElement.x;
                PDFReal diffY = leftElement.y - rightElement.y;
                PDFReal squaredDistance = diffX * diffX + diffY * diffY;

                isCandidateMatch = (leftElement.type == rightElement.type) &&
                                   (squaredDistance < epsilonSquared);
            }

            if (isCandidateMatch)
            {
                // Pair the left piece with exactly one right piece and stop,
                // remaining right pieces stay available for other left pieces.
                differences.right.erase(it2);
                hasMatch = true;
                break;
            }
        }

        if (hasMatch)
        {
            it = differences.left.erase(it);
        }
        else
        {
            ++it;
        }
    }

    return differences;
}
|
||||
|
||||
std::vector<size_t> PDFDiffHelper::getLeftUnmatched(const PageSequence& sequence)
|
||||
{
|
||||
std::vector<size_t> result;
|
||||
|
||||
for (const auto& item : sequence)
|
||||
{
|
||||
if (item.isLeft())
|
||||
{
|
||||
result.push_back(item.index1);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
std::vector<size_t> PDFDiffHelper::getRightUnmatched(const PageSequence& sequence)
|
||||
{
|
||||
std::vector<size_t> result;
|
||||
|
||||
for (const auto& item : sequence)
|
||||
{
|
||||
if (item.isRight())
|
||||
{
|
||||
result.push_back(item.index2);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/// Marks the left page and the right page as matched in the sequence. The item,
/// which has only the left index set to \p leftPage, gets its right index
/// set to \p rightPage, and the item, which has only the right index set
/// to \p rightPage, is removed from the sequence.
/// \param sequence Sequence to be modified
/// \param leftPage Left index of the matched page
/// \param rightPage Right index of the matched page
void PDFDiffHelper::matchPage(PageSequence& sequence,
                              size_t leftPage,
                              size_t rightPage)
{
    // Phase 1: turn the left-only item into a match. Such item is no longer
    // right-only afterwards, so it can't be affected by the removal below.
    for (auto& entry : sequence)
    {
        if (entry.isLeft() && entry.index1 == leftPage)
        {
            entry.index2 = rightPage;
        }
    }

    // Phase 2: drop the right-only item of the matched right page
    auto isObsolete = [rightPage](const auto& entry)
    {
        return entry.isRight() && entry.index2 == rightPage;
    };
    sequence.erase(std::remove_if(sequence.begin(), sequence.end(), isObsolete), sequence.end());
}
|
||||
|
||||
} // namespace pdf
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
#include "pdfdocument.h"
|
||||
#include "pdfprogress.h"
|
||||
#include "pdfutils.h"
|
||||
#include "pdfalgorithmlcs.h"
|
||||
|
||||
#include <QObject>
|
||||
#include <QFuture>
|
||||
|
@ -126,6 +127,9 @@ private:
|
|||
void stepProgress();
|
||||
void performSteps(const std::vector<PDFInteger>& leftPages,
|
||||
const std::vector<PDFInteger>& rightPages);
|
||||
void performPageMatching(const std::vector<PDFDiffPageContext>& leftPreparedPages,
|
||||
const std::vector<PDFDiffPageContext>& rightPreparedPages,
|
||||
PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence);
|
||||
void finalizeGraphicsPieces(PDFDiffPageContext& context);
|
||||
|
||||
void onComparationPerformed();
|
||||
|
|
|
@ -879,14 +879,15 @@ PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceI
|
|||
|
||||
info.type = data.isText ? GraphicPieceInfo::Type::Text : GraphicPieceInfo::Type::VectorGraphics;
|
||||
info.boundingRect = pagePath.controlPointRect();
|
||||
info.pagePath = pagePath;
|
||||
|
||||
const int elementCount = pagePath.elementCount();
|
||||
for (int i = 0; i < elementCount; ++i)
|
||||
{
|
||||
QPainterPath::Element element = pagePath.elementAt(i);
|
||||
|
||||
PDFReal roundedX = qRound(element.x * factor);
|
||||
PDFReal roundedY = qRound(element.y * factor);
|
||||
PDFReal roundedX = qFloor(element.x * factor);
|
||||
PDFReal roundedY = qFloor(element.y * factor);
|
||||
|
||||
stream << roundedX;
|
||||
stream << roundedY;
|
||||
|
@ -911,11 +912,13 @@ PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceI
|
|||
|
||||
GraphicPieceInfo info;
|
||||
QByteArray serializedPath;
|
||||
QByteArray serializedImage;
|
||||
|
||||
// Serialize data
|
||||
if (true)
|
||||
{
|
||||
QDataStream stream(&serializedPath, QIODevice::WriteOnly);
|
||||
QDataStream streamImage(&serializedImage, QIODevice::WriteOnly);
|
||||
|
||||
// Jakub Melka: serialize image position
|
||||
QMatrix worldMatrix = stateStack.top().matrix;
|
||||
|
@ -926,6 +929,7 @@ PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceI
|
|||
|
||||
info.type = GraphicPieceInfo::Type::Image;
|
||||
info.boundingRect = pagePath.controlPointRect();
|
||||
info.pagePath = pagePath;
|
||||
|
||||
const int elementCount = pagePath.elementCount();
|
||||
for (int i = 0; i < elementCount; ++i)
|
||||
|
@ -942,14 +946,20 @@ PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceI
|
|||
|
||||
// serialize image data
|
||||
stream.writeBytes(reinterpret_cast<const char*>(image.bits()), image.sizeInBytes());
|
||||
streamImage.writeBytes(reinterpret_cast<const char*>(image.bits()), image.sizeInBytes());
|
||||
}
|
||||
|
||||
QByteArray hash = QCryptographicHash::hash(serializedPath, QCryptographicHash::Sha512);
|
||||
Q_ASSERT(QCryptographicHash::hashLength(QCryptographicHash::Sha512) == 64);
|
||||
|
||||
QByteArray imageHash = QCryptographicHash::hash(serializedImage, QCryptographicHash::Sha512);
|
||||
|
||||
size_t size = qMin<size_t>(hash.length(), info.hash.size());
|
||||
std::copy(hash.data(), hash.data() + size, info.hash.data());
|
||||
|
||||
size_t sizeImage = qMin<size_t>(imageHash.length(), info.imageHash.size());
|
||||
std::copy(imageHash.data(), imageHash.data() + sizeImage, info.imageHash.data());
|
||||
|
||||
infos.emplace_back(std::move(info));
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -257,7 +257,9 @@ public:
|
|||
|
||||
Type type = Type::Unknown;
|
||||
QRectF boundingRect;
|
||||
std::array<uint8_t, 64> hash = { };
|
||||
std::array<uint8_t, 64> hash = { }; ///< Hash of all data
|
||||
std::array<uint8_t, 64> imageHash = { }; ///< Hash of the image only
|
||||
QPainterPath pagePath;
|
||||
};
|
||||
|
||||
using GraphicPieceInfos = std::vector<GraphicPieceInfo>;
|
||||
|
|
Loading…
Reference in New Issue