mirror of
				https://github.com/JakubMelka/PDF4QT.git
				synced 2025-06-05 21:59:17 +02:00 
			
		
		
		
	DocDiff application: detect moved pages
This commit is contained in:
		| @@ -121,7 +121,57 @@ void PDFAlgorithmLongestCommonSubsequenceBase::markSequence(Sequence& sequence, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     for (SequenceItem& item : updatedSequence) | ||||
|     { | ||||
|         if (item.isMatch() && !item.isRemoved() && !item.isReplaced() && !item.isAdded() && item.index1 != item.index2) | ||||
|         { | ||||
|             item.markMoved(); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     sequence = qMove(updatedSequence); | ||||
| } | ||||
|  | ||||
| PDFAlgorithmLongestCommonSubsequenceBase::SequenceItemRanges PDFAlgorithmLongestCommonSubsequenceBase::getModifiedRanges(Sequence& sequence) | ||||
| { | ||||
|     SequenceItemRanges result; | ||||
|  | ||||
|     for (auto it = sequence.begin(); it != sequence.end();) | ||||
|     { | ||||
|         const SequenceItem& item = *it; | ||||
|         if (!item.isModified()) | ||||
|         { | ||||
|             ++it; | ||||
|             continue; | ||||
|         } | ||||
|  | ||||
|         // Jakub Melka: now, we have iterator pointing on item, | ||||
|         // which has been modified. We will search for modification | ||||
|         // range. | ||||
|  | ||||
|         auto itEnd = it; | ||||
|         while (itEnd != sequence.end() && itEnd->isModified()) | ||||
|         { | ||||
|             ++itEnd; | ||||
|         } | ||||
|  | ||||
|         result.emplace_back(it, itEnd); | ||||
|         it = itEnd; | ||||
|     } | ||||
|  | ||||
|     return result; | ||||
| } | ||||
|  | ||||
| PDFAlgorithmLongestCommonSubsequenceBase::SequenceItemFlags PDFAlgorithmLongestCommonSubsequenceBase::collectFlags(const SequenceItemRange& range) | ||||
| { | ||||
|     SequenceItemFlags flags = 0; | ||||
|  | ||||
|     for (auto it = range.first; it != range.second; ++it) | ||||
|     { | ||||
|         flags |= it->flags; | ||||
|     } | ||||
|  | ||||
|     return flags; | ||||
| } | ||||
|  | ||||
| }   // namespace pdf | ||||
|   | ||||
| @@ -30,7 +30,7 @@ public: | ||||
|     enum SequenceItemFlag | ||||
|     { | ||||
|         None            = 0x0000, | ||||
|         MovedLeft       = 0x0001,   ///< Item has been moved from this position (is present in sequence no. 1) | ||||
|         MovedLeft       = 0x0001,   ///< Item has been moved from this position (is present in a sequence no. 1) | ||||
|         MovedRight      = 0x0002,   ///< Item has been moved to this position (is present in a sequence no. 2) | ||||
|         Moved           = 0x0004,   ///< Index of item has been changed | ||||
|         Added           = 0x0008,   ///< Item has been added to a sequence no. 2 | ||||
| @@ -56,6 +56,7 @@ public: | ||||
|         bool isAdded() const { return flags.testFlag(Added); } | ||||
|         bool isRemoved() const { return flags.testFlag(Removed); } | ||||
|         bool isReplaced() const { return flags.testFlag(Replaced); } | ||||
|         bool isModified() const { return isAdded() || isRemoved() || isReplaced(); } | ||||
|  | ||||
|         void markMovedLeft() { flags.setFlag(MovedLeft); } | ||||
|         void markMovedRight() { flags.setFlag(MovedRight); } | ||||
| @@ -64,7 +65,11 @@ public: | ||||
|         void markRemoved() { flags.setFlag(Removed); } | ||||
|         void markReplaced() { flags.setFlag(Replaced); } | ||||
|     }; | ||||
|     using Sequence = std::vector<SequenceItem>; | ||||
|  | ||||
|     using Sequence = typename std::vector<SequenceItem>; | ||||
|     using SequenceIterator = typename Sequence::iterator; | ||||
|     using SequenceItemRange = typename std::pair<SequenceIterator, SequenceIterator>; | ||||
|     using SequenceItemRanges = typename std::vector<SequenceItemRange>; | ||||
|  | ||||
|     /// Marks a sequence with set of flags representing added/removed/replaced/moved | ||||
|     /// items. Moved items sequences must be sorted. | ||||
| @@ -74,6 +79,15 @@ public: | ||||
|     static void markSequence(Sequence& sequence, | ||||
|                              const std::vector<size_t>& movedItemsLeft, | ||||
|                              const std::vector<size_t>& movedItemsRight); | ||||
|  | ||||
|     /// Returns item ranges, which should be checked - for example, | ||||
|     /// for text modification. | ||||
|     /// \param sequence Sequence | ||||
|     static SequenceItemRanges getModifiedRanges(Sequence& sequence); | ||||
|  | ||||
|     /// Collect flags from given item range | ||||
|     /// \param range Range | ||||
|     static SequenceItemFlags collectFlags(const SequenceItemRange& range); | ||||
| }; | ||||
|  | ||||
| /// Algorithm for computing longest common subsequence, on two sequences | ||||
|   | ||||
| @@ -198,11 +198,13 @@ struct PDFDiffPageContext | ||||
|     PDFInteger pageIndex = 0; | ||||
|     std::array<uint8_t, 64> pageHash = { }; | ||||
|     PDFPrecompiledPage::GraphicPieceInfos graphicPieces; | ||||
|     PDFDocumentTextFlow text; | ||||
| }; | ||||
|  | ||||
| void PDFDiff::performPageMatching(const std::vector<PDFDiffPageContext>& leftPreparedPages, | ||||
|                                   const std::vector<PDFDiffPageContext>& rightPreparedPages, | ||||
|                                   PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence) | ||||
|                                   PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence, | ||||
|                                   std::map<size_t, size_t>& pageMatches) | ||||
| { | ||||
|     // Match pages. We will use following algorithm: exact solution can fail, because | ||||
|     // we are using hashes and due to numerical instability, hashes can be different | ||||
| @@ -210,7 +212,6 @@ void PDFDiff::performPageMatching(const std::vector<PDFDiffPageContext>& leftPre | ||||
|     // So, we use longest common subsequence algorithm to detect same page ranges, | ||||
|     // and then we match the rest. We assume the number of failing pages is relatively small. | ||||
|  | ||||
|     std::map<size_t, size_t> pageMatches; | ||||
|     auto comparePages = [&](const PDFDiffPageContext& left, const PDFDiffPageContext& right) | ||||
|     { | ||||
|         if (left.pageHash == right.pageHash) | ||||
| @@ -311,6 +312,7 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std:: | ||||
|     std::vector<PDFDiffPageContext> rightPreparedPages; | ||||
|  | ||||
|     PDFDiffHelper::PageSequence pageSequence; | ||||
|     std::map<size_t, size_t> pageMatches; // Indices are real page indices, not indices to page contexts | ||||
|  | ||||
|     auto createDiffPageContext = [](auto pageIndex) | ||||
|     { | ||||
| @@ -381,7 +383,7 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std:: | ||||
|     // StepMatchPages | ||||
|     if (!m_cancelled) | ||||
|     { | ||||
|         performPageMatching(leftPreparedPages, rightPreparedPages, pageSequence); | ||||
|         performPageMatching(leftPreparedPages, rightPreparedPages, pageSequence, pageMatches); | ||||
|         stepProgress(); | ||||
|     } | ||||
|  | ||||
| @@ -391,6 +393,16 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std:: | ||||
|         pdf::PDFDocumentTextFlowFactory factoryLeftDocumentTextFlow; | ||||
|         factoryLeftDocumentTextFlow.setCalculateBoundingBoxes(true); | ||||
|         PDFDocumentTextFlow leftTextFlow = factoryLeftDocumentTextFlow.create(m_leftDocument, leftPages, PDFDocumentTextFlowFactory::Algorithm::Auto); | ||||
|         std::map<PDFInteger, PDFDocumentTextFlow> splittedText = leftTextFlow.split(PDFDocumentTextFlow::Text); | ||||
|         for (PDFDiffPageContext& leftContext : leftPreparedPages) | ||||
|         { | ||||
|             auto it = splittedText.find(leftContext.pageIndex); | ||||
|             if (it != splittedText.cend()) | ||||
|             { | ||||
|                 leftContext.text = std::move(it->second); | ||||
|                 splittedText.erase(it); | ||||
|             } | ||||
|         } | ||||
|         stepProgress(); | ||||
|     } | ||||
|  | ||||
| @@ -400,16 +412,61 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std:: | ||||
|         pdf::PDFDocumentTextFlowFactory factoryRightDocumentTextFlow; | ||||
|         factoryRightDocumentTextFlow.setCalculateBoundingBoxes(true); | ||||
|         PDFDocumentTextFlow rightTextFlow = factoryRightDocumentTextFlow.create(m_rightDocument, rightPages, PDFDocumentTextFlowFactory::Algorithm::Auto); | ||||
|         std::map<PDFInteger, PDFDocumentTextFlow> splittedText = rightTextFlow.split(PDFDocumentTextFlow::Text); | ||||
|         for (PDFDiffPageContext& rightContext : rightPreparedPages) | ||||
|         { | ||||
|             auto it = splittedText.find(rightContext.pageIndex); | ||||
|             if (it != splittedText.cend()) | ||||
|             { | ||||
|                 rightContext.text = std::move(it->second); | ||||
|                 splittedText.erase(it); | ||||
|             } | ||||
|         } | ||||
|         stepProgress(); | ||||
|     } | ||||
|  | ||||
|     // StepCompare | ||||
|     if (!m_cancelled) | ||||
|     { | ||||
|         performCompare(leftPreparedPages, rightPreparedPages, pageSequence, pageMatches); | ||||
|         stepProgress(); | ||||
|     } | ||||
| } | ||||
|  | ||||
| void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPreparedPages, | ||||
|                              const std::vector<PDFDiffPageContext>& rightPreparedPages, | ||||
|                              PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence, | ||||
|                              const std::map<size_t, size_t>& pageMatches) | ||||
| { | ||||
|     using AlgorithmLCS = PDFAlgorithmLongestCommonSubsequenceBase; | ||||
|  | ||||
|     auto modifiedRanges = AlgorithmLCS::getModifiedRanges(pageSequence); | ||||
|  | ||||
|     // First find all moved pages | ||||
|     for (const AlgorithmLCS::SequenceItem& item : pageSequence) | ||||
|     { | ||||
|         if (item.isMovedLeft()) | ||||
|         { | ||||
|             Q_ASSERT(pageMatches.contains(leftPreparedPages.at(item.index1).pageIndex)); | ||||
|             const PDFInteger leftIndex = leftPreparedPages[item.index1].pageIndex; | ||||
|             const PDFInteger rightIndex = pageMatches.at(leftIndex); | ||||
|             m_result.addPageMoved(leftIndex, rightIndex); | ||||
|         } | ||||
|         if (item.isMoved()) | ||||
|         { | ||||
|             m_result.addPageMoved(leftPreparedPages[item.index1].pageIndex, rightPreparedPages[item.index2].pageIndex); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     for (const auto& range : modifiedRanges) | ||||
|     { | ||||
|         AlgorithmLCS::SequenceItemFlags flags = AlgorithmLCS::collectFlags(range); | ||||
|  | ||||
|         const bool isAdded = flags.testFlag(AlgorithmLCS::Added); | ||||
|         const bool isRemoved = flags.testFlag(AlgorithmLCS::Removed); | ||||
|     } | ||||
| } | ||||
|  | ||||
| void PDFDiff::finalizeGraphicsPieces(PDFDiffPageContext& context) | ||||
| { | ||||
|     std::sort(context.graphicPieces.begin(), context.graphicPieces.end()); | ||||
| @@ -473,6 +530,18 @@ PDFDiffResult::PDFDiffResult() : | ||||
|  | ||||
| } | ||||
|  | ||||
| void PDFDiffResult::addPageMoved(PDFInteger pageIndex1, PDFInteger pageIndex2) | ||||
| { | ||||
|     Difference difference; | ||||
|  | ||||
|     difference.type = Type::PageMoved; | ||||
|     difference.pageIndex1 = pageIndex1; | ||||
|     difference.pageIndex2 = pageIndex2; | ||||
|     difference.message = PDFDiff::tr("Page no. %1 from old document has been moved to a new document at page no. %2.").arg(pageIndex1 + 1).arg(pageIndex2 + 1); | ||||
|  | ||||
|     m_differences.emplace_back(std::move(difference)); | ||||
| } | ||||
|  | ||||
| PDFDiffHelper::Differences PDFDiffHelper::calculateDifferences(const GraphicPieceInfos& left, | ||||
|                                                                const GraphicPieceInfos& right, | ||||
|                                                                PDFReal epsilon) | ||||
|   | ||||
| @@ -39,10 +39,29 @@ class PDFDiffResult | ||||
| public: | ||||
|     explicit PDFDiffResult(); | ||||
|  | ||||
|     enum class Type | ||||
|     { | ||||
|         Invalid, | ||||
|         PageMoved | ||||
|     }; | ||||
|  | ||||
|     struct Difference | ||||
|     { | ||||
|         Type type = Type::Invalid; | ||||
|         PDFInteger pageIndex1 = -1; | ||||
|         PDFInteger pageIndex2 = -1; | ||||
|         QString message; | ||||
|     }; | ||||
|  | ||||
|     using Differences = std::vector<Difference>; | ||||
|  | ||||
|     void setResult(PDFOperationResult result) { m_result = std::move(result); } | ||||
|     const PDFOperationResult& getResult() const { return m_result; } | ||||
|  | ||||
|     void addPageMoved(PDFInteger pageIndex1, PDFInteger pageIndex2); | ||||
|  | ||||
| private: | ||||
|     Differences m_differences; | ||||
|     PDFOperationResult m_result; | ||||
| }; | ||||
|  | ||||
| @@ -129,7 +148,12 @@ private: | ||||
|                       const std::vector<PDFInteger>& rightPages); | ||||
|     void performPageMatching(const std::vector<PDFDiffPageContext>& leftPreparedPages, | ||||
|                              const std::vector<PDFDiffPageContext>& rightPreparedPages, | ||||
|                              PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence); | ||||
|                              PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence, | ||||
|                              std::map<size_t, size_t>& pageMatches); | ||||
|     void performCompare(const std::vector<PDFDiffPageContext>& leftPreparedPages, | ||||
|                         const std::vector<PDFDiffPageContext>& rightPreparedPages, | ||||
|                         PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence, | ||||
|                         const std::map<size_t, size_t>& pageMatches); | ||||
|     void finalizeGraphicsPieces(PDFDiffPageContext& context); | ||||
|  | ||||
|     void onComparationPerformed(); | ||||
|   | ||||
| @@ -1040,4 +1040,19 @@ void PDFDocumentTextFlowEditor::updateModifiedFlag(size_t index) | ||||
|     item->editedItemFlags.setFlag(Modified, isModified); | ||||
| } | ||||
|  | ||||
| std::map<PDFInteger, PDFDocumentTextFlow> PDFDocumentTextFlow::split(Flags mask) const | ||||
| { | ||||
|     std::map<PDFInteger, PDFDocumentTextFlow> result; | ||||
|  | ||||
|     for (const Item& item : m_items) | ||||
|     { | ||||
|         if (item.flags & mask) | ||||
|         { | ||||
|             result[item.pageIndex].addItem(item); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     return result; | ||||
| } | ||||
|  | ||||
| }   // namespace pdf | ||||
|   | ||||
| @@ -71,6 +71,9 @@ public: | ||||
|  | ||||
|     } | ||||
|  | ||||
|     /// Add text item | ||||
|     void addItem(Item item) { m_items.emplace_back(std::move(item)); } | ||||
|  | ||||
|     const Items& getItems() const { return m_items; } | ||||
|  | ||||
|     /// Returns item at a given index | ||||
| @@ -83,6 +86,11 @@ public: | ||||
|     /// Returns true, if text flow is empty | ||||
|     bool isEmpty() const { return m_items.empty(); } | ||||
|  | ||||
|     /// Split text flow to pages using given mask. Items, which | ||||
|     /// are masked out, are not added. | ||||
|     /// \param mask Mask | ||||
|     std::map<PDFInteger, PDFDocumentTextFlow> split(Flags mask) const; | ||||
|  | ||||
| private: | ||||
|     Items m_items; | ||||
| }; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user