From 3e327f8201fca6b3ee7ba3a9588fe21330a4644d Mon Sep 17 00:00:00 2001 From: Jakub Melka Date: Sun, 12 Sep 2021 16:02:25 +0200 Subject: [PATCH] DocDiff application: finish page matching algorithm --- Pdf4QtLib/Pdf4QtLib.pro | 1 + Pdf4QtLib/sources/pdfalgorithmlcs.cpp | 127 ++++++++++++++++++++++++++ Pdf4QtLib/sources/pdfalgorithmlcs.h | 36 ++++++++ Pdf4QtLib/sources/pdfdiff.cpp | 13 ++- 4 files changed, 176 insertions(+), 1 deletion(-) create mode 100644 Pdf4QtLib/sources/pdfalgorithmlcs.cpp diff --git a/Pdf4QtLib/Pdf4QtLib.pro b/Pdf4QtLib/Pdf4QtLib.pro index 72743f3..77618f4 100644 --- a/Pdf4QtLib/Pdf4QtLib.pro +++ b/Pdf4QtLib/Pdf4QtLib.pro @@ -44,6 +44,7 @@ DESTDIR = $$OUT_PWD/.. SOURCES += \ sources/pdfaction.cpp \ sources/pdfadvancedtools.cpp \ + sources/pdfalgorithmlcs.cpp \ sources/pdfannotation.cpp \ sources/pdfblendfunction.cpp \ sources/pdfccittfaxdecoder.cpp \ diff --git a/Pdf4QtLib/sources/pdfalgorithmlcs.cpp b/Pdf4QtLib/sources/pdfalgorithmlcs.cpp new file mode 100644 index 0000000..41f5d1d --- /dev/null +++ b/Pdf4QtLib/sources/pdfalgorithmlcs.cpp @@ -0,0 +1,127 @@ +// Copyright (C) 2021 Jakub Melka +// +// This file is part of PDF4QT. +// +// PDF4QT is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// with the written consent of the copyright owner, any later version. +// +// PDF4QT is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with PDF4QT. If not, see <https://www.gnu.org/licenses/>.
+ +#include "pdfalgorithmlcs.h" + +namespace pdf +{ + +void PDFAlgorithmLongestCommonSubsequenceBase::markSequence(Sequence& sequence, + const std::vector& movedItemsLeft, + const std::vector& movedItemsRight) +{ + Sequence updatedSequence; + + Q_ASSERT(std::is_sorted(movedItemsLeft.cbegin(), movedItemsLeft.cend())); + Q_ASSERT(std::is_sorted(movedItemsRight.cbegin(), movedItemsRight.cend())); + + for (auto it = sequence.cbegin(); it != sequence.cend();) + { + if (it->isMatch()) + { + updatedSequence.push_back(*it); + ++it; + continue; + } + + Sequence leftItems; + Sequence rightItems; + + for (; it != sequence.cend() && !it->isMatch(); ++it) + { + const SequenceItem& currentItem = *it; + Q_ASSERT(currentItem.isLeft() || currentItem.isRight()); + + if (currentItem.isLeft()) + { + if (std::binary_search(movedItemsLeft.cbegin(), movedItemsLeft.cend(), currentItem.index1)) + { + SequenceItem item = *it; + item.markMovedLeft(); + updatedSequence.push_back(item); + } + else + { + leftItems.push_back(currentItem); + } + } + + if (currentItem.isRight()) + { + if (std::binary_search(movedItemsRight.cbegin(), movedItemsRight.cend(), currentItem.index2)) + { + SequenceItem item = *it; + item.markMovedRight(); + updatedSequence.push_back(item); + } + else + { + rightItems.push_back(currentItem); + } + } + } + + std::reverse(leftItems.begin(), leftItems.end()); + std::reverse(rightItems.begin(), rightItems.end()); + + bool isReplaced = !leftItems.empty() && !rightItems.empty(); + + while (!leftItems.empty() && !rightItems.empty()) + { + SequenceItem item; + item.index1 = leftItems.back().index1; + item.index2 = rightItems.back().index2; + item.markReplaced(); + updatedSequence.push_back(item); + + leftItems.pop_back(); + rightItems.pop_back(); + } + + while (!leftItems.empty()) + { + SequenceItem item = leftItems.back(); + item.markRemoved(); + + if (isReplaced) + { + item.markReplaced(); + } + + updatedSequence.push_back(item); + leftItems.pop_back(); + } + + while 
(!rightItems.empty()) + { + SequenceItem item = rightItems.back(); + item.markAdded(); + + if (isReplaced) + { + item.markReplaced(); + } + + updatedSequence.push_back(item); + rightItems.pop_back(); + } + } + + sequence = qMove(updatedSequence); +} + +} // namespace pdf diff --git a/Pdf4QtLib/sources/pdfalgorithmlcs.h b/Pdf4QtLib/sources/pdfalgorithmlcs.h index fcc0faf..882ab62 100644 --- a/Pdf4QtLib/sources/pdfalgorithmlcs.h +++ b/Pdf4QtLib/sources/pdfalgorithmlcs.h @@ -26,18 +26,54 @@ namespace pdf class PDFAlgorithmLongestCommonSubsequenceBase { public: + + enum SequenceItemFlag + { + None = 0x0000, + MovedLeft = 0x0001, ///< Item has been moved from this position (is present in sequence no. 1) + MovedRight = 0x0002, ///< Item has been moved to this position (is present in a sequence no. 2) + Moved = 0x0004, ///< Index of item has been changed + Added = 0x0008, ///< Item has been added to a sequence no. 2 + Removed = 0x0010, ///< Item has been removed from a sequence no. 1 + Replaced = 0x0020, ///< Item has been replaced (or sequence of items has been replaced) + }; + Q_DECLARE_FLAGS(SequenceItemFlags, SequenceItemFlag) + struct SequenceItem { size_t index1 = std::numeric_limits::max(); size_t index2 = std::numeric_limits::max(); + SequenceItemFlags flags = None; bool isLeftValid() const { return index1 != std::numeric_limits::max(); } bool isRightValid() const { return index2 != std::numeric_limits::max(); } bool isLeft() const { return isLeftValid() && !isRightValid(); } bool isRight() const { return isRightValid() && !isLeftValid(); } bool isMatch() const { return isLeftValid() && isRightValid(); } + bool isMovedLeft() const { return flags.testFlag(MovedLeft); } + bool isMovedRight() const { return flags.testFlag(MovedRight); } + bool isMoved() const { return flags.testFlag(Moved); } + bool isAdded() const { return flags.testFlag(Added); } + bool isRemoved() const { return flags.testFlag(Removed); } + bool isReplaced() const { return 
flags.testFlag(Replaced); } + + void markMovedLeft() { flags.setFlag(MovedLeft); } + void markMovedRight() { flags.setFlag(MovedRight); } + void markMoved() { flags.setFlag(Moved); } + void markAdded() { flags.setFlag(Added); } + void markRemoved() { flags.setFlag(Removed); } + void markReplaced() { flags.setFlag(Replaced); } }; using Sequence = std::vector; + + /// Marks a sequence with set of flags representing added/removed/replaced/moved + /// items. Moved items sequences must be sorted. + /// \param sequence Sequence to be marked + /// \param movedItemsLeft Sorted sequence of left indices, which have been moved + /// \param movedItemsRight sorted sequence of right indices, which have been moved + static void markSequence(Sequence& sequence, + const std::vector& movedItemsLeft, + const std::vector& movedItemsRight); }; /// Algorithm for computing longest common subsequence, on two sequences diff --git a/Pdf4QtLib/sources/pdfdiff.cpp b/Pdf4QtLib/sources/pdfdiff.cpp index f5d46a6..cc8f06a 100644 --- a/Pdf4QtLib/sources/pdfdiff.cpp +++ b/Pdf4QtLib/sources/pdfdiff.cpp @@ -271,6 +271,9 @@ void PDFDiff::performPageMatching(const std::vector& leftPre PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, leftUnmatched.begin(), leftUnmatched.end(), matchLeftPage); + std::vector leftPagesMoved; + std::vector rightPagesMoved; + std::set matchedRightPages; for (const auto& matchedPage : matchedPages) { @@ -282,6 +285,9 @@ void PDFDiff::performPageMatching(const std::vector& leftPre const PDFDiffPageContext& leftPageContext = leftPreparedPages[matchedPage.first]; const PDFDiffPageContext& rightPageContext = rightPreparedPages[rightContextIndex]; + leftPagesMoved.push_back(leftPageContext.pageIndex); + rightPagesMoved.push_back(rightPageContext.pageIndex); + pageMatches[leftPageContext.pageIndex] = rightPageContext.pageIndex; } } @@ -292,6 +298,11 @@ void PDFDiff::performPageMatching(const std::vector& leftPre algorithm.perform(); pageSequence = 
algorithm.getSequence(); } + + std::sort(leftPagesMoved.begin(), leftPagesMoved.end()); + std::sort(rightPagesMoved.begin(), rightPagesMoved.end()); + + PDFAlgorithmLongestCommonSubsequenceBase::markSequence(pageSequence, leftPagesMoved, rightPagesMoved); } void PDFDiff::performSteps(const std::vector& leftPages, const std::vector& rightPages) @@ -356,7 +367,7 @@ void PDFDiff::performSteps(const std::vector& leftPages, const std:: PDFRenderer renderer(m_rightDocument, &fontCache, cms.data(), &optionalContentActivity, features, pdf::PDFMeshQualitySettings()); renderer.compile(&compiledPage, context.pageIndex); - const PDFPage* page = m_leftDocument->getCatalog()->getPage(context.pageIndex); + const PDFPage* page = m_rightDocument->getCatalog()->getPage(context.pageIndex); PDFReal epsilon = calculateEpsilonForPage(page); context.graphicPieces = compiledPage.calculateGraphicPieceInfos(page->getMediaBox(), epsilon);