From 050ba869f3a030e1c23dc7f32e67e9646dfeda6d Mon Sep 17 00:00:00 2001 From: Jakub Melka Date: Sat, 11 Sep 2021 18:17:38 +0200 Subject: [PATCH] DocDiff application: page matching --- Pdf4QtLib/sources/pdfalgorithmlcs.h | 30 +-- Pdf4QtLib/sources/pdfdiff.cpp | 277 ++++++++++++++++++++++++++-- Pdf4QtLib/sources/pdfdiff.h | 4 + Pdf4QtLib/sources/pdfpainter.cpp | 14 +- Pdf4QtLib/sources/pdfpainter.h | 4 +- 5 files changed, 304 insertions(+), 25 deletions(-) diff --git a/Pdf4QtLib/sources/pdfalgorithmlcs.h b/Pdf4QtLib/sources/pdfalgorithmlcs.h index 1e55925..fcc0faf 100644 --- a/Pdf4QtLib/sources/pdfalgorithmlcs.h +++ b/Pdf4QtLib/sources/pdfalgorithmlcs.h @@ -23,12 +23,29 @@ namespace pdf { +class PDFAlgorithmLongestCommonSubsequenceBase +{ +public: + struct SequenceItem + { + size_t index1 = std::numeric_limits::max(); + size_t index2 = std::numeric_limits::max(); + + bool isLeftValid() const { return index1 != std::numeric_limits::max(); } + bool isRightValid() const { return index2 != std::numeric_limits::max(); } + bool isLeft() const { return isLeftValid() && !isRightValid(); } + bool isRight() const { return isRightValid() && !isLeftValid(); } + bool isMatch() const { return isLeftValid() && isRightValid(); } + }; + using Sequence = std::vector; +}; + /// Algorithm for computing longest common subsequence, on two sequences /// of objects, which are implementing operator "==" (equal operator). /// Constructor takes bidirectional iterators to the sequence. So, iterators /// are requred to be bidirectional. template -class PDFAlgorithmLongestCommonSubsequence +class PDFAlgorithmLongestCommonSubsequence : public PDFAlgorithmLongestCommonSubsequenceBase { public: PDFAlgorithmLongestCommonSubsequence(Iterator it1, @@ -37,16 +54,6 @@ public: Iterator it2End, Comparator comparator); - struct SequenceItem - { - size_t index1 = std::numeric_limits::max(); - size_t index2 = std::numeric_limits::max(); - - bool isLeftValid() const { return index1 == std::numeric_limits::max(); } - bool isRightValid() const { return index2 == std::numeric_limits::max(); } - bool isMatch() const { return isLeftValid() && isRightValid(); } - }; - using Sequence = std::vector; void perform(); @@ -92,6 +99,7 @@ template void PDFAlgorithmLongestCommonSubsequence::perform() { m_backtrackData.resize(m_matrixSize); + m_sequence.clear(); std::vector rowTop(m_size1, size_t()); std::vector rowBottom(m_size1, size_t()); diff --git a/Pdf4QtLib/sources/pdfdiff.cpp b/Pdf4QtLib/sources/pdfdiff.cpp index ad7266d..f5d46a6 100644 --- a/Pdf4QtLib/sources/pdfdiff.cpp +++ b/Pdf4QtLib/sources/pdfdiff.cpp @@ -30,13 +30,35 @@ namespace pdf { +class PDFDiffHelper +{ +public: + using GraphicPieceInfo = PDFPrecompiledPage::GraphicPieceInfo; + using GraphicPieceInfos = PDFPrecompiledPage::GraphicPieceInfos; + using PageSequence = PDFAlgorithmLongestCommonSubsequenceBase::Sequence; + + + struct Differences + { + GraphicPieceInfos left; + GraphicPieceInfos right; + + bool isEmpty() const { return left.empty() && right.empty(); } + }; + + static Differences calculateDifferences(const GraphicPieceInfos& left, const GraphicPieceInfos& right, PDFReal epsilon); + static std::vector getLeftUnmatched(const PageSequence& sequence); + static std::vector getRightUnmatched(const PageSequence& sequence); + static void matchPage(PageSequence& sequence, size_t leftPage, size_t rightPage); +}; + PDFDiff::PDFDiff(QObject* parent) : BaseClass(parent), m_progress(nullptr), m_leftDocument(nullptr), m_rightDocument(nullptr), m_options(Asynchronous | PC_Text | PC_VectorGraphics | PC_Images), - m_epsilon(0.0001), + m_epsilon(0.001), m_cancelled(false) { @@ -178,11 +200,107 @@ struct PDFDiffPageContext PDFPrecompiledPage::GraphicPieceInfos graphicPieces; }; +void PDFDiff::performPageMatching(const std::vector& leftPreparedPages, + const std::vector& rightPreparedPages, + PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence) +{ + // Match pages. We will use following algorithm: exact solution can fail, because + // we are using hashes and due to numerical instability, hashes can be different + // even for exactly the same page. But if hashes are the same, the page must be the same. + // So, we use longest common subsequence algorithm to detect same page ranges, + // and then we match the rest. We assume the number of failing pages is relatively small. + + std::map pageMatches; + auto comparePages = [&](const PDFDiffPageContext& left, const PDFDiffPageContext& right) + { + if (left.pageHash == right.pageHash) + { + return true; + } + + auto it = pageMatches.find(left.pageIndex); + if (it != pageMatches.cend()) + { + return it->second == right.pageIndex; + } + + return false; + }; + PDFAlgorithmLongestCommonSubsequence algorithm(leftPreparedPages.cbegin(), leftPreparedPages.cend(), + rightPreparedPages.cbegin(), rightPreparedPages.cend(), + comparePages); + algorithm.perform(); + pageSequence = algorithm.getSequence(); + + std::vector leftUnmatched = PDFDiffHelper::getLeftUnmatched(pageSequence); + std::vector rightUnmatched = PDFDiffHelper::getRightUnmatched(pageSequence); + + // We are matching left pages to the right ones + std::map> matchedPages; + + for (const size_t index : leftUnmatched) + { + matchedPages[index] = std::vector(); + } + + auto matchLeftPage = [&, this](size_t leftIndex) + { + const PDFDiffPageContext& leftPageContext = leftPreparedPages[leftIndex]; + + auto page = m_leftDocument->getCatalog()->getPage(leftPageContext.pageIndex); + PDFReal epsilon = calculateEpsilonForPage(page); + + for (const size_t rightIndex : rightUnmatched) + { + const PDFDiffPageContext& rightPageContext = rightPreparedPages[rightIndex]; + if (leftPageContext.graphicPieces.size() != rightPageContext.graphicPieces.size()) + { + // Match cannot exist, graphic pieces have different size + continue; + } + + PDFDiffHelper::Differences differences = PDFDiffHelper::calculateDifferences(leftPageContext.graphicPieces, rightPageContext.graphicPieces, epsilon); + + if (differences.isEmpty()) + { + // Jakub Melka: we have a match + matchedPages[leftIndex].push_back(rightIndex); + } + } + }; + + PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, leftUnmatched.begin(), leftUnmatched.end(), matchLeftPage); + + std::set matchedRightPages; + for (const auto& matchedPage : matchedPages) + { + for (size_t rightContextIndex : matchedPage.second) + { + if (!matchedRightPages.count(rightContextIndex)) + { + matchedRightPages.insert(rightContextIndex); + const PDFDiffPageContext& leftPageContext = leftPreparedPages[matchedPage.first]; + const PDFDiffPageContext& rightPageContext = rightPreparedPages[rightContextIndex]; + + pageMatches[leftPageContext.pageIndex] = rightPageContext.pageIndex; + } + } + } + + if (!pageMatches.empty()) + { + algorithm.perform(); + pageSequence = algorithm.getSequence(); + } +} + void PDFDiff::performSteps(const std::vector& leftPages, const std::vector& rightPages) { std::vector leftPreparedPages; std::vector rightPreparedPages; + PDFDiffHelper::PageSequence pageSequence; + auto createDiffPageContext = [](auto pageIndex) { PDFDiffPageContext context; @@ -252,16 +370,7 @@ void PDFDiff::performSteps(const std::vector& leftPages, const std:: // StepMatchPages if (!m_cancelled) { - // Match pages - auto comparePages = [](const PDFDiffPageContext& left, const PDFDiffPageContext& right) - { - return left.pageHash == right.pageHash; - }; - PDFAlgorithmLongestCommonSubsequence algorithm(leftPreparedPages.cbegin(), leftPreparedPages.cend(), - rightPreparedPages.cbegin(), rightPreparedPages.cend(), - comparePages); - algorithm.perform(); - + performPageMatching(leftPreparedPages, rightPreparedPages, pageSequence); stepProgress(); } @@ -353,4 +462,150 @@ PDFDiffResult::PDFDiffResult() : } +PDFDiffHelper::Differences PDFDiffHelper::calculateDifferences(const GraphicPieceInfos& left, + const GraphicPieceInfos& right, + PDFReal epsilon) +{ + Differences differences; + + Q_ASSERT(std::is_sorted(left.cbegin(), left.cend())); + Q_ASSERT(std::is_sorted(right.cbegin(), right.cend())); + + for (const GraphicPieceInfo& info : left) + { + if (!std::binary_search(right.cbegin(), right.cend(), info)) + { + differences.left.push_back(info); + } + } + + for (const GraphicPieceInfo& info : right) + { + if (!std::binary_search(left.cbegin(), left.cend(), info)) + { + differences.right.push_back(info); + } + } + + const PDFReal epsilonSquared = epsilon * epsilon; + + // If exact match fails, then try to use match with epsilon. For each + // item in left, we try to find matching item in right. + for (auto it = differences.left.begin(); it != differences.left.end();) + { + bool hasMatch = false; + + const GraphicPieceInfo& leftInfo = *it; + for (auto it2 = differences.right.begin(); it2 != differences.right.end();) + { + // Heuristically compare these items + + const GraphicPieceInfo& rightInfo = *it2; + if (leftInfo.type != rightInfo.type || !leftInfo.boundingRect.intersects(rightInfo.boundingRect)) + { + ++it2; + continue; + } + + const int elementCountPath1 = leftInfo.pagePath.elementCount(); + const int elementCountPath2 = rightInfo.pagePath.elementCount(); + + if (elementCountPath1 != elementCountPath2) + { + ++it2; + continue; + } + + hasMatch = (leftInfo.type != GraphicPieceInfo::Type::Image) || (leftInfo.imageHash == rightInfo.imageHash); + const int elementCount = leftInfo.pagePath.elementCount(); + for (int i = 0; i < elementCount && hasMatch; ++i) + { + QPainterPath::Element leftElement = leftInfo.pagePath.elementAt(i); + QPainterPath::Element rightElement = rightInfo.pagePath.elementAt(i); + + PDFReal diffX = leftElement.x - rightElement.x; + PDFReal diffY = leftElement.y - rightElement.y; + PDFReal squaredDistance = diffX * diffX + diffY * diffY; + + hasMatch = (leftElement.type == rightElement.type) && + (squaredDistance < epsilonSquared); + } + + if (hasMatch) + { + it2 = differences.right.erase(it2); + } + else + { + ++it2; + } + } + + if (hasMatch) + { + it = differences.left.erase(it); + } + else + { + ++it; + } + } + + return differences; +} + +std::vector PDFDiffHelper::getLeftUnmatched(const PageSequence& sequence) +{ + std::vector result; + + for (const auto& item : sequence) + { + if (item.isLeft()) + { + result.push_back(item.index1); + } + } + + return result; +} + +std::vector PDFDiffHelper::getRightUnmatched(const PageSequence& sequence) +{ + std::vector result; + + for (const auto& item : sequence) + { + if (item.isRight()) + { + result.push_back(item.index2); + } + } + + return result; +} + +void PDFDiffHelper::matchPage(PageSequence& sequence, + size_t leftPage, + size_t rightPage) +{ + for (auto it = sequence.begin(); it != sequence.end();) + { + auto& item = *it; + + if (item.isLeft() && item.index1 == leftPage) + { + item.index2 = rightPage; + } + + if (item.isRight() && item.index2 == rightPage) + { + it = sequence.erase(it); + } + else + { + ++it; + } + } +} + } // namespace pdf diff --git a/Pdf4QtLib/sources/pdfdiff.h b/Pdf4QtLib/sources/pdfdiff.h index 9256c4d..dd62cb7 100644 --- a/Pdf4QtLib/sources/pdfdiff.h +++ b/Pdf4QtLib/sources/pdfdiff.h @@ -21,6 +21,7 @@ #include "pdfdocument.h" #include "pdfprogress.h" #include "pdfutils.h" +#include "pdfalgorithmlcs.h" #include #include @@ -126,6 +127,9 @@ private: void stepProgress(); void performSteps(const std::vector& leftPages, const std::vector& rightPages); + void performPageMatching(const std::vector& leftPreparedPages, + const std::vector& rightPreparedPages, + PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence); void finalizeGraphicsPieces(PDFDiffPageContext& context); void onComparationPerformed(); diff --git a/Pdf4QtLib/sources/pdfpainter.cpp b/Pdf4QtLib/sources/pdfpainter.cpp index e7fea70..410377a 100644 --- a/Pdf4QtLib/sources/pdfpainter.cpp +++ b/Pdf4QtLib/sources/pdfpainter.cpp @@ -879,14 +879,15 @@ PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceI info.type = data.isText ? GraphicPieceInfo::Type::Text : GraphicPieceInfo::Type::VectorGraphics; info.boundingRect = pagePath.controlPointRect(); + info.pagePath = pagePath; const int elementCount = pagePath.elementCount(); for (int i = 0; i < elementCount; ++i) { QPainterPath::Element element = pagePath.elementAt(i); - PDFReal roundedX = qRound(element.x * factor); - PDFReal roundedY = qRound(element.y * factor); + PDFReal roundedX = qFloor(element.x * factor); + PDFReal roundedY = qFloor(element.y * factor); stream << roundedX; stream << roundedY; @@ -911,11 +912,13 @@ PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceI GraphicPieceInfo info; QByteArray serializedPath; + QByteArray serializedImage; // Serialize data if (true) { QDataStream stream(&serializedPath, QIODevice::WriteOnly); + QDataStream streamImage(&serializedImage, QIODevice::WriteOnly); // Jakub Melka: serialize image position QMatrix worldMatrix = stateStack.top().matrix; @@ -926,6 +929,7 @@ PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceI info.type = GraphicPieceInfo::Type::Image; info.boundingRect = pagePath.controlPointRect(); + info.pagePath = pagePath; const int elementCount = pagePath.elementCount(); for (int i = 0; i < elementCount; ++i) @@ -942,14 +946,20 @@ PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceI // serialize image data stream.writeBytes(reinterpret_cast(image.bits()), image.sizeInBytes()); + streamImage.writeBytes(reinterpret_cast(image.bits()), image.sizeInBytes()); } QByteArray hash = QCryptographicHash::hash(serializedPath, QCryptographicHash::Sha512); Q_ASSERT(QCryptographicHash::hashLength(QCryptographicHash::Sha512) == 64); + QByteArray imageHash = QCryptographicHash::hash(serializedImage, QCryptographicHash::Sha512); + size_t size = qMin(hash.length(), info.hash.size()); std::copy(hash.data(), hash.data() + size, info.hash.data()); + size_t sizeImage = qMin(imageHash.length(), info.imageHash.size()); + std::copy(imageHash.data(), imageHash.data() + sizeImage, info.imageHash.data()); + infos.emplace_back(std::move(info)); break; } diff --git a/Pdf4QtLib/sources/pdfpainter.h b/Pdf4QtLib/sources/pdfpainter.h index 26f85df..ec89813 100644 --- a/Pdf4QtLib/sources/pdfpainter.h +++ b/Pdf4QtLib/sources/pdfpainter.h @@ -257,7 +257,9 @@ public: Type type = Type::Unknown; QRectF boundingRect; - std::array hash = { }; + std::array hash = { }; ///< Hash of all data + std::array imageHash = { }; ///< Hash of the image only + QPainterPath pagePath; }; using GraphicPieceInfos = std::vector;