diff --git a/Pdf4QtLib/sources/pdfalgorithmlcs.cpp b/Pdf4QtLib/sources/pdfalgorithmlcs.cpp index 41f5d1d..5a3ea10 100644 --- a/Pdf4QtLib/sources/pdfalgorithmlcs.cpp +++ b/Pdf4QtLib/sources/pdfalgorithmlcs.cpp @@ -121,7 +121,57 @@ void PDFAlgorithmLongestCommonSubsequenceBase::markSequence(Sequence& sequence, } } + for (SequenceItem& item : updatedSequence) + { + if (item.isMatch() && !item.isRemoved() && !item.isReplaced() && !item.isAdded() && item.index1 != item.index2) + { + item.markMoved(); + } + } + sequence = qMove(updatedSequence); } +PDFAlgorithmLongestCommonSubsequenceBase::SequenceItemRanges PDFAlgorithmLongestCommonSubsequenceBase::getModifiedRanges(Sequence& sequence) +{ + SequenceItemRanges result; + + for (auto it = sequence.begin(); it != sequence.end();) + { + const SequenceItem& item = *it; + if (!item.isModified()) + { + ++it; + continue; + } + + // Jakub Melka: now, we have iterator pointing on item, + // which has been modified. We will search for modification + // range. + + auto itEnd = it; + while (itEnd != sequence.end() && itEnd->isModified()) + { + ++itEnd; + } + + result.emplace_back(it, itEnd); + it = itEnd; + } + + return result; +} + +PDFAlgorithmLongestCommonSubsequenceBase::SequenceItemFlags PDFAlgorithmLongestCommonSubsequenceBase::collectFlags(const SequenceItemRange& range) +{ + SequenceItemFlags flags = 0; + + for (auto it = range.first; it != range.second; ++it) + { + flags |= it->flags; + } + + return flags; +} + } // namespace pdf diff --git a/Pdf4QtLib/sources/pdfalgorithmlcs.h b/Pdf4QtLib/sources/pdfalgorithmlcs.h index 882ab62..781662e 100644 --- a/Pdf4QtLib/sources/pdfalgorithmlcs.h +++ b/Pdf4QtLib/sources/pdfalgorithmlcs.h @@ -30,7 +30,7 @@ public: enum SequenceItemFlag { None = 0x0000, - MovedLeft = 0x0001, ///< Item has been moved from this position (is present in sequence no. 1) + MovedLeft = 0x0001, ///< Item has been moved from this position (is present in a sequence no. 1) MovedRight = 0x0002, ///< Item has been moved to this position (is present in a sequence no. 2) Moved = 0x0004, ///< Index of item has been changed Added = 0x0008, ///< Item has been added to a sequence no. 2 @@ -56,6 +56,7 @@ public: bool isAdded() const { return flags.testFlag(Added); } bool isRemoved() const { return flags.testFlag(Removed); } bool isReplaced() const { return flags.testFlag(Replaced); } + bool isModified() const { return isAdded() || isRemoved() || isReplaced(); } void markMovedLeft() { flags.setFlag(MovedLeft); } void markMovedRight() { flags.setFlag(MovedRight); } @@ -64,7 +65,11 @@ public: void markRemoved() { flags.setFlag(Removed); } void markReplaced() { flags.setFlag(Replaced); } }; - using Sequence = std::vector; + + using Sequence = typename std::vector; + using SequenceIterator = typename Sequence::iterator; + using SequenceItemRange = typename std::pair; + using SequenceItemRanges = typename std::vector; /// Marks a sequence with set of flags representing added/removed/replaced/moved /// items. Moved items sequences must be sorted. @@ -74,6 +79,15 @@ public: static void markSequence(Sequence& sequence, const std::vector& movedItemsLeft, const std::vector& movedItemsRight); + + /// Returns item ranges, which should be checked - for example, + /// for text modification. + /// \param sequence Sequence + static SequenceItemRanges getModifiedRanges(Sequence& sequence); + + /// Collect flags from given item range + /// \param range Range + static SequenceItemFlags collectFlags(const SequenceItemRange& range); }; /// Algorithm for computing longest common subsequence, on two sequences diff --git a/Pdf4QtLib/sources/pdfdiff.cpp b/Pdf4QtLib/sources/pdfdiff.cpp index cc8f06a..9ba2f18 100644 --- a/Pdf4QtLib/sources/pdfdiff.cpp +++ b/Pdf4QtLib/sources/pdfdiff.cpp @@ -198,11 +198,13 @@ struct PDFDiffPageContext PDFInteger pageIndex = 0; std::array pageHash = { }; PDFPrecompiledPage::GraphicPieceInfos graphicPieces; + PDFDocumentTextFlow text; }; void PDFDiff::performPageMatching(const std::vector& leftPreparedPages, const std::vector& rightPreparedPages, - PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence) + PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence, + std::map& pageMatches) { // Match pages. We will use following algorithm: exact solution can fail, because // we are using hashes and due to numerical instability, hashes can be different @@ -210,7 +212,6 @@ void PDFDiff::performPageMatching(const std::vector& leftPre // So, we use longest common subsequence algorithm to detect same page ranges, // and then we match the rest. We assume the number of failing pages is relatively small. - std::map pageMatches; auto comparePages = [&](const PDFDiffPageContext& left, const PDFDiffPageContext& right) { if (left.pageHash == right.pageHash) @@ -311,6 +312,7 @@ void PDFDiff::performSteps(const std::vector& leftPages, const std:: std::vector rightPreparedPages; PDFDiffHelper::PageSequence pageSequence; + std::map pageMatches; // Indices are real page indices, not indices to page contexts auto createDiffPageContext = [](auto pageIndex) { @@ -381,7 +383,7 @@ void PDFDiff::performSteps(const std::vector& leftPages, const std:: // StepMatchPages if (!m_cancelled) { - performPageMatching(leftPreparedPages, rightPreparedPages, pageSequence); + performPageMatching(leftPreparedPages, rightPreparedPages, pageSequence, pageMatches); stepProgress(); } @@ -391,6 +393,16 @@ void PDFDiff::performSteps(const std::vector& leftPages, const std:: pdf::PDFDocumentTextFlowFactory factoryLeftDocumentTextFlow; factoryLeftDocumentTextFlow.setCalculateBoundingBoxes(true); PDFDocumentTextFlow leftTextFlow = factoryLeftDocumentTextFlow.create(m_leftDocument, leftPages, PDFDocumentTextFlowFactory::Algorithm::Auto); + std::map splittedText = leftTextFlow.split(PDFDocumentTextFlow::Text); + for (PDFDiffPageContext& leftContext : leftPreparedPages) + { + auto it = splittedText.find(leftContext.pageIndex); + if (it != splittedText.cend()) + { + leftContext.text = std::move(it->second); + splittedText.erase(it); + } + } stepProgress(); } @@ -400,16 +412,61 @@ void PDFDiff::performSteps(const std::vector& leftPages, const std:: pdf::PDFDocumentTextFlowFactory factoryRightDocumentTextFlow; factoryRightDocumentTextFlow.setCalculateBoundingBoxes(true); PDFDocumentTextFlow rightTextFlow = factoryRightDocumentTextFlow.create(m_rightDocument, rightPages, PDFDocumentTextFlowFactory::Algorithm::Auto); + std::map splittedText = rightTextFlow.split(PDFDocumentTextFlow::Text); + for (PDFDiffPageContext& rightContext : rightPreparedPages) + { + auto it = splittedText.find(rightContext.pageIndex); + if (it != splittedText.cend()) + { + rightContext.text = std::move(it->second); + splittedText.erase(it); + } + } stepProgress(); } // StepCompare if (!m_cancelled) { + performCompare(leftPreparedPages, rightPreparedPages, pageSequence, pageMatches); stepProgress(); } } +void PDFDiff::performCompare(const std::vector& leftPreparedPages, + const std::vector& rightPreparedPages, + PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence, + const std::map& pageMatches) +{ + using AlgorithmLCS = PDFAlgorithmLongestCommonSubsequenceBase; + + auto modifiedRanges = AlgorithmLCS::getModifiedRanges(pageSequence); + + // First find all moved pages + for (const AlgorithmLCS::SequenceItem& item : pageSequence) + { + if (item.isMovedLeft()) + { + Q_ASSERT(pageMatches.contains(leftPreparedPages.at(item.index1).pageIndex)); + const PDFInteger leftIndex = leftPreparedPages[item.index1].pageIndex; + const PDFInteger rightIndex = pageMatches.at(leftIndex); + m_result.addPageMoved(leftIndex, rightIndex); + } + if (item.isMoved()) + { + m_result.addPageMoved(leftPreparedPages[item.index1].pageIndex, rightPreparedPages[item.index2].pageIndex); + } + } + + for (const auto& range : modifiedRanges) + { + AlgorithmLCS::SequenceItemFlags flags = AlgorithmLCS::collectFlags(range); + + const bool isAdded = flags.testFlag(AlgorithmLCS::Added); + const bool isRemoved = flags.testFlag(AlgorithmLCS::Removed); + } +} + void PDFDiff::finalizeGraphicsPieces(PDFDiffPageContext& context) { std::sort(context.graphicPieces.begin(), context.graphicPieces.end()); @@ -473,6 +530,18 @@ PDFDiffResult::PDFDiffResult() : } +void PDFDiffResult::addPageMoved(PDFInteger pageIndex1, PDFInteger pageIndex2) +{ + Difference difference; + + difference.type = Type::PageMoved; + difference.pageIndex1 = pageIndex1; + difference.pageIndex2 = pageIndex2; + difference.message = PDFDiff::tr("Page no. %1 from old document has been moved to a new document at page no. %2.").arg(pageIndex1 + 1).arg(pageIndex2 + 1); + + m_differences.emplace_back(std::move(difference)); +} + PDFDiffHelper::Differences PDFDiffHelper::calculateDifferences(const GraphicPieceInfos& left, const GraphicPieceInfos& right, PDFReal epsilon) diff --git a/Pdf4QtLib/sources/pdfdiff.h b/Pdf4QtLib/sources/pdfdiff.h index dd62cb7..1e3d1a0 100644 --- a/Pdf4QtLib/sources/pdfdiff.h +++ b/Pdf4QtLib/sources/pdfdiff.h @@ -39,10 +39,29 @@ class PDFDiffResult public: explicit PDFDiffResult(); + enum class Type + { + Invalid, + PageMoved + }; + + struct Difference + { + Type type = Type::Invalid; + PDFInteger pageIndex1 = -1; + PDFInteger pageIndex2 = -1; + QString message; + }; + + using Differences = std::vector; + void setResult(PDFOperationResult result) { m_result = std::move(result); } const PDFOperationResult& getResult() const { return m_result; } + void addPageMoved(PDFInteger pageIndex1, PDFInteger pageIndex2); + private: + Differences m_differences; PDFOperationResult m_result; }; @@ -129,7 +148,12 @@ private: const std::vector& rightPages); void performPageMatching(const std::vector& leftPreparedPages, const std::vector& rightPreparedPages, - PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence); + PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence, + std::map& pageMatches); + void performCompare(const std::vector& leftPreparedPages, + const std::vector& rightPreparedPages, + PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence, + const std::map& pageMatches); void finalizeGraphicsPieces(PDFDiffPageContext& context); void onComparationPerformed(); diff --git a/Pdf4QtLib/sources/pdfdocumenttextflow.cpp b/Pdf4QtLib/sources/pdfdocumenttextflow.cpp index 995f0d9..83b7515 100644 --- a/Pdf4QtLib/sources/pdfdocumenttextflow.cpp +++ b/Pdf4QtLib/sources/pdfdocumenttextflow.cpp @@ -1040,4 +1040,19 @@ void PDFDocumentTextFlowEditor::updateModifiedFlag(size_t index) item->editedItemFlags.setFlag(Modified, isModified); } +std::map PDFDocumentTextFlow::split(Flags mask) const +{ + std::map result; + + for (const Item& item : m_items) + { + if (item.flags & mask) + { + result[item.pageIndex].addItem(item); + } + } + + return result; +} + } // namespace pdf diff --git a/Pdf4QtLib/sources/pdfdocumenttextflow.h b/Pdf4QtLib/sources/pdfdocumenttextflow.h index 8e82332..f247606 100644 --- a/Pdf4QtLib/sources/pdfdocumenttextflow.h +++ b/Pdf4QtLib/sources/pdfdocumenttextflow.h @@ -71,6 +71,9 @@ public: } + /// Add text item + void addItem(Item item) { m_items.emplace_back(std::move(item)); } + const Items& getItems() const { return m_items; } /// Returns item at a given index @@ -83,6 +86,11 @@ public: /// Returns true, if text flow is empty bool isEmpty() const { return m_items.empty(); } + /// Split text flow to pages using given mask. Items, which + /// are masked out, are not added. + /// \param mask Mask + std::map split(Flags mask) const; + private: Items m_items; };