mirror of
				https://github.com/JakubMelka/PDF4QT.git
				synced 2025-06-05 21:59:17 +02:00 
			
		
		
		
	DocDiff application: page matching
This commit is contained in:
		| @@ -23,12 +23,29 @@ | ||||
| namespace pdf | ||||
| { | ||||
|  | ||||
/// Non-template base of the longest common subsequence algorithm. It holds
/// the result types, so that code which only consumes a computed sequence
/// does not depend on the iterator/comparator template parameters.
class PDFAlgorithmLongestCommonSubsequenceBase
{
public:
    /// One item of the resulting sequence. An index equal to the maximal
    /// value of size_t means "no element from that side".
    struct SequenceItem
    {
        size_t index1 = std::numeric_limits<size_t>::max();
        size_t index2 = std::numeric_limits<size_t>::max();

        /// Returns true, if the item refers to an element of the first (left) sequence
        bool isLeftValid() const
        {
            return index1 < std::numeric_limits<size_t>::max();
        }

        /// Returns true, if the item refers to an element of the second (right) sequence
        bool isRightValid() const
        {
            return index2 < std::numeric_limits<size_t>::max();
        }

        /// Returns true, if the item refers to the left sequence only
        bool isLeft() const
        {
            if (!isLeftValid())
            {
                return false;
            }

            return !isRightValid();
        }

        /// Returns true, if the item refers to the right sequence only
        bool isRight() const
        {
            if (!isRightValid())
            {
                return false;
            }

            return !isLeftValid();
        }

        /// Returns true, if the item refers to both sequences
        bool isMatch() const
        {
            return isLeftValid() ? isRightValid() : false;
        }
    };

    using Sequence = std::vector<SequenceItem>;
};
|  | ||||
| /// Algorithm for computing the longest common subsequence of two sequences | ||||
| /// of objects which implement operator "==" (the equality operator). | ||||
| /// The constructor takes iterators to the sequences; the iterators | ||||
| /// are required to be bidirectional. | ||||
| template<typename Iterator, typename Comparator> | ||||
| class PDFAlgorithmLongestCommonSubsequence | ||||
| class PDFAlgorithmLongestCommonSubsequence : public PDFAlgorithmLongestCommonSubsequenceBase | ||||
| { | ||||
| public: | ||||
|     PDFAlgorithmLongestCommonSubsequence(Iterator it1, | ||||
| @@ -37,16 +54,6 @@ public: | ||||
|                                          Iterator it2End, | ||||
|                                          Comparator comparator); | ||||
|  | ||||
|     struct SequenceItem | ||||
|     { | ||||
|         size_t index1 = std::numeric_limits<size_t>::max(); | ||||
|         size_t index2 = std::numeric_limits<size_t>::max(); | ||||
|  | ||||
|         bool isLeftValid() const { return index1 == std::numeric_limits<size_t>::max(); } | ||||
|         bool isRightValid() const { return index2 == std::numeric_limits<size_t>::max(); } | ||||
|         bool isMatch() const { return isLeftValid() && isRightValid(); } | ||||
|     }; | ||||
|     using Sequence = std::vector<SequenceItem>; | ||||
|  | ||||
|     void perform(); | ||||
|  | ||||
| @@ -92,6 +99,7 @@ template<typename Iterator, typename Comparator> | ||||
| void PDFAlgorithmLongestCommonSubsequence<Iterator, Comparator>::perform() | ||||
| { | ||||
|     m_backtrackData.resize(m_matrixSize); | ||||
|     m_sequence.clear(); | ||||
|  | ||||
|     std::vector<size_t> rowTop(m_size1, size_t()); | ||||
|     std::vector<size_t> rowBottom(m_size1, size_t()); | ||||
|   | ||||
| @@ -30,13 +30,35 @@ | ||||
| namespace pdf | ||||
| { | ||||
|  | ||||
| class PDFDiffHelper | ||||
| { | ||||
| public: | ||||
|     using GraphicPieceInfo = PDFPrecompiledPage::GraphicPieceInfo; | ||||
|     using GraphicPieceInfos = PDFPrecompiledPage::GraphicPieceInfos; | ||||
|     using PageSequence = PDFAlgorithmLongestCommonSubsequenceBase::Sequence; | ||||
|  | ||||
|  | ||||
|     struct Differences | ||||
|     { | ||||
|         GraphicPieceInfos left; | ||||
|         GraphicPieceInfos right; | ||||
|  | ||||
|         bool isEmpty() const { return left.empty() && right.empty(); } | ||||
|     }; | ||||
|  | ||||
|     static Differences calculateDifferences(const GraphicPieceInfos& left, const GraphicPieceInfos& right, PDFReal epsilon); | ||||
|     static std::vector<size_t> getLeftUnmatched(const PageSequence& sequence); | ||||
|     static std::vector<size_t> getRightUnmatched(const PageSequence& sequence); | ||||
|     static void matchPage(PageSequence& sequence, size_t leftPage, size_t rightPage); | ||||
| }; | ||||
|  | ||||
| PDFDiff::PDFDiff(QObject* parent) : | ||||
|     BaseClass(parent), | ||||
|     m_progress(nullptr), | ||||
|     m_leftDocument(nullptr), | ||||
|     m_rightDocument(nullptr), | ||||
|     m_options(Asynchronous | PC_Text | PC_VectorGraphics | PC_Images), | ||||
|     m_epsilon(0.0001), | ||||
|     m_epsilon(0.001), | ||||
|     m_cancelled(false) | ||||
| { | ||||
|  | ||||
| @@ -178,11 +200,107 @@ struct PDFDiffPageContext | ||||
|     PDFPrecompiledPage::GraphicPieceInfos graphicPieces; | ||||
| }; | ||||
|  | ||||
| void PDFDiff::performPageMatching(const std::vector<PDFDiffPageContext>& leftPreparedPages, | ||||
|                                   const std::vector<PDFDiffPageContext>& rightPreparedPages, | ||||
|                                   PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence) | ||||
| { | ||||
|     // Match pages. We will use following algorithm: exact solution can fail, because | ||||
|     // we are using hashes and due to numerical instability, hashes can be different | ||||
|     // even for exactly the same page. But if hashes are the same, the page must be the same. | ||||
|     // So, we use longest common subsequence algorithm to detect same page ranges, | ||||
|     // and then we match the rest. We assume the number of failing pages is relatively small. | ||||
|  | ||||
|     std::map<size_t, size_t> pageMatches; | ||||
|     auto comparePages = [&](const PDFDiffPageContext& left, const PDFDiffPageContext& right) | ||||
|     { | ||||
|         if (left.pageHash == right.pageHash) | ||||
|         { | ||||
|             return true; | ||||
|         } | ||||
|  | ||||
|         auto it = pageMatches.find(left.pageIndex); | ||||
|         if (it != pageMatches.cend()) | ||||
|         { | ||||
|             return it->second == right.pageIndex; | ||||
|         } | ||||
|  | ||||
|         return false; | ||||
|     }; | ||||
|     PDFAlgorithmLongestCommonSubsequence algorithm(leftPreparedPages.cbegin(), leftPreparedPages.cend(), | ||||
|                                                    rightPreparedPages.cbegin(), rightPreparedPages.cend(), | ||||
|                                                    comparePages); | ||||
|     algorithm.perform(); | ||||
|     pageSequence = algorithm.getSequence(); | ||||
|  | ||||
|     std::vector<size_t> leftUnmatched = PDFDiffHelper::getLeftUnmatched(pageSequence); | ||||
|     std::vector<size_t> rightUnmatched = PDFDiffHelper::getRightUnmatched(pageSequence); | ||||
|  | ||||
|     // We are matching left pages to the right ones | ||||
|     std::map<size_t, std::vector<size_t>> matchedPages; | ||||
|  | ||||
|     for (const size_t index : leftUnmatched) | ||||
|     { | ||||
|         matchedPages[index] = std::vector<size_t>(); | ||||
|     } | ||||
|  | ||||
|     auto matchLeftPage = [&, this](size_t leftIndex) | ||||
|     { | ||||
|         const PDFDiffPageContext& leftPageContext = leftPreparedPages[leftIndex]; | ||||
|  | ||||
|         auto page = m_leftDocument->getCatalog()->getPage(leftPageContext.pageIndex); | ||||
|         PDFReal epsilon = calculateEpsilonForPage(page); | ||||
|  | ||||
|         for (const size_t rightIndex : rightUnmatched) | ||||
|         { | ||||
|             const PDFDiffPageContext& rightPageContext = rightPreparedPages[rightIndex]; | ||||
|             if (leftPageContext.graphicPieces.size() != rightPageContext.graphicPieces.size()) | ||||
|             { | ||||
|                 // Match cannot exist, graphic pieces have different size | ||||
|                 continue; | ||||
|             } | ||||
|  | ||||
|             PDFDiffHelper::Differences differences = PDFDiffHelper::calculateDifferences(leftPageContext.graphicPieces, rightPageContext.graphicPieces, epsilon); | ||||
|  | ||||
|             if (differences.isEmpty()) | ||||
|             { | ||||
|                 // Jakub Melka: we have a match | ||||
|                 matchedPages[leftIndex].push_back(rightIndex); | ||||
|             } | ||||
|         } | ||||
|     }; | ||||
|  | ||||
|     PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, leftUnmatched.begin(), leftUnmatched.end(), matchLeftPage); | ||||
|  | ||||
|     std::set<size_t> matchedRightPages; | ||||
|     for (const auto& matchedPage : matchedPages) | ||||
|     { | ||||
|         for (size_t rightContextIndex : matchedPage.second) | ||||
|         { | ||||
|             if (!matchedRightPages.count(rightContextIndex)) | ||||
|             { | ||||
|                 matchedRightPages.insert(rightContextIndex); | ||||
|                 const PDFDiffPageContext& leftPageContext = leftPreparedPages[matchedPage.first]; | ||||
|                 const PDFDiffPageContext& rightPageContext = rightPreparedPages[rightContextIndex]; | ||||
|  | ||||
|                 pageMatches[leftPageContext.pageIndex] = rightPageContext.pageIndex; | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     if (!pageMatches.empty()) | ||||
|     { | ||||
|         algorithm.perform(); | ||||
|         pageSequence = algorithm.getSequence(); | ||||
|     } | ||||
| } | ||||
|  | ||||
| void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std::vector<PDFInteger>& rightPages) | ||||
| { | ||||
|     std::vector<PDFDiffPageContext> leftPreparedPages; | ||||
|     std::vector<PDFDiffPageContext> rightPreparedPages; | ||||
|  | ||||
|     PDFDiffHelper::PageSequence pageSequence; | ||||
|  | ||||
|     auto createDiffPageContext = [](auto pageIndex) | ||||
|     { | ||||
|        PDFDiffPageContext context; | ||||
| @@ -252,16 +370,7 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std:: | ||||
|     // StepMatchPages | ||||
|     if (!m_cancelled) | ||||
|     { | ||||
|         // Match pages | ||||
|         auto comparePages = [](const PDFDiffPageContext& left, const PDFDiffPageContext& right) | ||||
|         { | ||||
|             return left.pageHash == right.pageHash; | ||||
|         }; | ||||
|         PDFAlgorithmLongestCommonSubsequence algorithm(leftPreparedPages.cbegin(), leftPreparedPages.cend(), | ||||
|                                                        rightPreparedPages.cbegin(), rightPreparedPages.cend(), | ||||
|                                                        comparePages); | ||||
|         algorithm.perform(); | ||||
|  | ||||
|         performPageMatching(leftPreparedPages, rightPreparedPages, pageSequence); | ||||
|         stepProgress(); | ||||
|     } | ||||
|  | ||||
| @@ -353,4 +462,150 @@ PDFDiffResult::PDFDiffResult() : | ||||
|  | ||||
| } | ||||
|  | ||||
| PDFDiffHelper::Differences PDFDiffHelper::calculateDifferences(const GraphicPieceInfos& left, | ||||
|                                                                const GraphicPieceInfos& right, | ||||
|                                                                PDFReal epsilon) | ||||
| { | ||||
|     Differences differences; | ||||
|  | ||||
|     Q_ASSERT(std::is_sorted(left.cbegin(), left.cend())); | ||||
|     Q_ASSERT(std::is_sorted(right.cbegin(), right.cend())); | ||||
|  | ||||
|     for (const GraphicPieceInfo& info : left) | ||||
|     { | ||||
|         if (!std::binary_search(right.cbegin(), right.cend(), info)) | ||||
|         { | ||||
|             differences.left.push_back(info); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     for (const GraphicPieceInfo& info : right) | ||||
|     { | ||||
|         if (!std::binary_search(left.cbegin(), left.cend(), info)) | ||||
|         { | ||||
|             differences.right.push_back(info); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     const PDFReal epsilonSquared = epsilon * epsilon; | ||||
|  | ||||
|     // If exact match fails, then try to use match with epsilon. For each | ||||
|     // item in left, we try to find matching item in right. | ||||
|     for (auto it = differences.left.begin(); it != differences.left.end();) | ||||
|     { | ||||
|         bool hasMatch = false; | ||||
|  | ||||
|         const GraphicPieceInfo& leftInfo = *it; | ||||
|         for (auto it2 = differences.right.begin(); it2 != differences.right.end();) | ||||
|         { | ||||
|             // Heuristically compare these items | ||||
|  | ||||
|             const GraphicPieceInfo& rightInfo = *it2; | ||||
|             if (leftInfo.type != rightInfo.type || !leftInfo.boundingRect.intersects(rightInfo.boundingRect)) | ||||
|             { | ||||
|                 ++it2; | ||||
|                 continue; | ||||
|             } | ||||
|  | ||||
|             const int elementCountPath1 = leftInfo.pagePath.elementCount(); | ||||
|             const int elementCountPath2 = rightInfo.pagePath.elementCount(); | ||||
|  | ||||
|             if (elementCountPath1 != elementCountPath2) | ||||
|             { | ||||
|                 ++it2; | ||||
|                 continue; | ||||
|             } | ||||
|  | ||||
|             hasMatch = (leftInfo.type != GraphicPieceInfo::Type::Image) || (leftInfo.imageHash == rightInfo.imageHash); | ||||
|             const int elementCount = leftInfo.pagePath.elementCount(); | ||||
|             for (int i = 0; i < elementCount && hasMatch; ++i) | ||||
|             { | ||||
|                 QPainterPath::Element leftElement = leftInfo.pagePath.elementAt(i); | ||||
|                 QPainterPath::Element rightElement = rightInfo.pagePath.elementAt(i); | ||||
|  | ||||
|                 PDFReal diffX = leftElement.x - rightElement.x; | ||||
|                 PDFReal diffY = leftElement.y - rightElement.y; | ||||
|                 PDFReal squaredDistance = diffX * diffX + diffY * diffY; | ||||
|  | ||||
|                 hasMatch = (leftElement.type == rightElement.type) && | ||||
|                            (squaredDistance < epsilonSquared); | ||||
|             } | ||||
|  | ||||
|             if (hasMatch) | ||||
|             { | ||||
|                 it2 = differences.right.erase(it2); | ||||
|             } | ||||
|             else | ||||
|             { | ||||
|                 ++it2; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         if (hasMatch) | ||||
|         { | ||||
|             it = differences.left.erase(it); | ||||
|         } | ||||
|         else | ||||
|         { | ||||
|             ++it; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     return differences; | ||||
| } | ||||
|  | ||||
| std::vector<size_t> PDFDiffHelper::getLeftUnmatched(const PageSequence& sequence) | ||||
| { | ||||
|     std::vector<size_t> result; | ||||
|  | ||||
|     for (const auto& item : sequence) | ||||
|     { | ||||
|         if (item.isLeft()) | ||||
|         { | ||||
|             result.push_back(item.index1); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     return result; | ||||
| } | ||||
|  | ||||
| std::vector<size_t> PDFDiffHelper::getRightUnmatched(const PageSequence& sequence) | ||||
| { | ||||
|     std::vector<size_t> result; | ||||
|  | ||||
|     for (const auto& item : sequence) | ||||
|     { | ||||
|         if (item.isRight()) | ||||
|         { | ||||
|             result.push_back(item.index2); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     return result; | ||||
| } | ||||
|  | ||||
| void PDFDiffHelper::matchPage(PageSequence& sequence, | ||||
|                               size_t leftPage, | ||||
|                               size_t rightPage) | ||||
| { | ||||
|     for (auto it = sequence.begin(); it != sequence.end();) | ||||
|     { | ||||
|         auto& item = *it; | ||||
|  | ||||
|         if (item.isLeft() && item.index1 == leftPage) | ||||
|         { | ||||
|             item.index2 = rightPage; | ||||
|         } | ||||
|  | ||||
|         if (item.isRight() && item.index2 == rightPage) | ||||
|         { | ||||
|             it = sequence.erase(it); | ||||
|         } | ||||
|         else | ||||
|         { | ||||
|             ++it; | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| }   // namespace pdf | ||||
|   | ||||
| @@ -21,6 +21,7 @@ | ||||
| #include "pdfdocument.h" | ||||
| #include "pdfprogress.h" | ||||
| #include "pdfutils.h" | ||||
| #include "pdfalgorithmlcs.h" | ||||
|  | ||||
| #include <QObject> | ||||
| #include <QFuture> | ||||
| @@ -126,6 +127,9 @@ private: | ||||
|     void stepProgress(); | ||||
|     void performSteps(const std::vector<PDFInteger>& leftPages, | ||||
|                       const std::vector<PDFInteger>& rightPages); | ||||
|     void performPageMatching(const std::vector<PDFDiffPageContext>& leftPreparedPages, | ||||
|                              const std::vector<PDFDiffPageContext>& rightPreparedPages, | ||||
|                              PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence); | ||||
|     void finalizeGraphicsPieces(PDFDiffPageContext& context); | ||||
|  | ||||
|     void onComparationPerformed(); | ||||
|   | ||||
| @@ -879,14 +879,15 @@ PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceI | ||||
|  | ||||
|                     info.type = data.isText ? GraphicPieceInfo::Type::Text : GraphicPieceInfo::Type::VectorGraphics; | ||||
|                     info.boundingRect = pagePath.controlPointRect(); | ||||
|                     info.pagePath = pagePath; | ||||
|  | ||||
|                     const int elementCount = pagePath.elementCount(); | ||||
|                     for (int i = 0; i < elementCount; ++i) | ||||
|                     { | ||||
|                         QPainterPath::Element element = pagePath.elementAt(i); | ||||
|  | ||||
|                         PDFReal roundedX = qRound(element.x * factor); | ||||
|                         PDFReal roundedY = qRound(element.y * factor); | ||||
|                         PDFReal roundedX = qFloor(element.x * factor); | ||||
|                         PDFReal roundedY = qFloor(element.y * factor); | ||||
|  | ||||
|                         stream << roundedX; | ||||
|                         stream << roundedY; | ||||
| @@ -911,11 +912,13 @@ PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceI | ||||
|  | ||||
|                 GraphicPieceInfo info; | ||||
|                 QByteArray serializedPath; | ||||
|                 QByteArray serializedImage; | ||||
|  | ||||
|                 // Serialize data | ||||
|                 if (true) | ||||
|                 { | ||||
|                     QDataStream stream(&serializedPath, QIODevice::WriteOnly); | ||||
|                     QDataStream streamImage(&serializedImage, QIODevice::WriteOnly); | ||||
|  | ||||
|                     // Jakub Melka: serialize image position | ||||
|                     QMatrix worldMatrix = stateStack.top().matrix; | ||||
| @@ -926,6 +929,7 @@ PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceI | ||||
|  | ||||
|                     info.type = GraphicPieceInfo::Type::Image; | ||||
|                     info.boundingRect = pagePath.controlPointRect(); | ||||
|                     info.pagePath = pagePath; | ||||
|  | ||||
|                     const int elementCount = pagePath.elementCount(); | ||||
|                     for (int i = 0; i < elementCount; ++i) | ||||
| @@ -942,14 +946,20 @@ PDFPrecompiledPage::GraphicPieceInfos PDFPrecompiledPage::calculateGraphicPieceI | ||||
|  | ||||
|                     // serialize image data | ||||
|                     stream.writeBytes(reinterpret_cast<const char*>(image.bits()), image.sizeInBytes()); | ||||
|                     streamImage.writeBytes(reinterpret_cast<const char*>(image.bits()), image.sizeInBytes()); | ||||
|                 } | ||||
|  | ||||
|                 QByteArray hash = QCryptographicHash::hash(serializedPath, QCryptographicHash::Sha512); | ||||
|                 Q_ASSERT(QCryptographicHash::hashLength(QCryptographicHash::Sha512) == 64); | ||||
|  | ||||
|                 QByteArray imageHash = QCryptographicHash::hash(serializedImage, QCryptographicHash::Sha512); | ||||
|  | ||||
|                 size_t size = qMin<size_t>(hash.length(), info.hash.size()); | ||||
|                 std::copy(hash.data(), hash.data() + size, info.hash.data()); | ||||
|  | ||||
|                 size_t sizeImage = qMin<size_t>(imageHash.length(), info.imageHash.size()); | ||||
|                 std::copy(imageHash.data(), imageHash.data() + sizeImage, info.imageHash.data()); | ||||
|  | ||||
|                 infos.emplace_back(std::move(info)); | ||||
|                 break; | ||||
|             } | ||||
|   | ||||
| @@ -257,7 +257,9 @@ public: | ||||
|  | ||||
|         Type type = Type::Unknown; | ||||
|         QRectF boundingRect; | ||||
|         std::array<uint8_t, 64> hash = { }; | ||||
|         std::array<uint8_t, 64> hash = { }; ///< Hash of all data | ||||
|         std::array<uint8_t, 64> imageHash = { }; ///< Hash of the image only | ||||
|         QPainterPath pagePath; | ||||
|     }; | ||||
|  | ||||
|     using GraphicPieceInfos = std::vector<GraphicPieceInfo>; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user