DocDiff application: detect moved pages

This commit is contained in:
Jakub Melka 2021-09-19 19:46:02 +02:00
parent 3e327f8201
commit c00939f536
6 changed files with 186 additions and 6 deletions

View File

@ -121,7 +121,57 @@ void PDFAlgorithmLongestCommonSubsequenceBase::markSequence(Sequence& sequence,
}
}
for (SequenceItem& item : updatedSequence)
{
if (item.isMatch() && !item.isRemoved() && !item.isReplaced() && !item.isAdded() && item.index1 != item.index2)
{
item.markMoved();
}
}
sequence = qMove(updatedSequence);
}
/// Collects all maximal runs of consecutive modified items (items for which
/// isModified() returns true) found in the given sequence.
/// \param sequence Sequence to be scanned; it is not altered by this function,
///        but the returned iterators point into it
/// \returns Iterator ranges (first item of the run, one past the last item
///          of the run), in the order in which the runs appear in the sequence
PDFAlgorithmLongestCommonSubsequenceBase::SequenceItemRanges PDFAlgorithmLongestCommonSubsequenceBase::getModifiedRanges(Sequence& sequence)
{
    SequenceItemRanges ranges;

    const auto sequenceEnd = sequence.end();
    auto cursor = sequence.begin();

    while (cursor != sequenceEnd)
    {
        if (cursor->isModified())
        {
            // The cursor sits on the first item of a modified run. Advance
            // a second iterator up to the first item, which is not modified
            // anymore (or up to the end of the sequence).
            auto runEnd = cursor;
            do
            {
                ++runEnd;
            }
            while (runEnd != sequenceEnd && runEnd->isModified());

            ranges.emplace_back(cursor, runEnd);

            // Continue scanning just behind the run
            cursor = runEnd;
        }
        else
        {
            ++cursor;
        }
    }

    return ranges;
}
/// Combines (using bitwise OR) the flags of all items in the given range.
/// \param range Item range, stored as a pair of iterators (first item,
///        one past the last item)
/// \returns Union of flags of all items in the range; for an empty range,
///          no flag is set
PDFAlgorithmLongestCommonSubsequenceBase::SequenceItemFlags PDFAlgorithmLongestCommonSubsequenceBase::collectFlags(const SequenceItemRange& range)
{
    // Start from an empty flag set. Value-initialization is used instead of
    // assigning the literal 0 - SequenceItemFlags is presumably a QFlags type
    // (it offers testFlag/setFlag), and constructing QFlags from a literal
    // zero is deprecated in newer Qt versions.
    SequenceItemFlags flags{};

    for (auto it = range.first; it != range.second; ++it)
    {
        flags |= it->flags;
    }

    return flags;
}
} // namespace pdf

View File

@ -30,7 +30,7 @@ public:
enum SequenceItemFlag
{
None = 0x0000,
MovedLeft = 0x0001, ///< Item has been moved from this position (is present in sequence no. 1)
MovedLeft = 0x0001, ///< Item has been moved from this position (is present in a sequence no. 1)
MovedRight = 0x0002, ///< Item has been moved to this position (is present in a sequence no. 2)
Moved = 0x0004, ///< Index of item has been changed
Added = 0x0008, ///< Item has been added to a sequence no. 2
@ -56,6 +56,7 @@ public:
/// Returns true, if the Added flag is set on this item
bool isAdded() const { return flags.testFlag(Added); }
/// Returns true, if the Removed flag is set on this item
bool isRemoved() const { return flags.testFlag(Removed); }
/// Returns true, if the Replaced flag is set on this item
bool isReplaced() const { return flags.testFlag(Replaced); }
/// Returns true, if the item is added, removed or replaced. The moved
/// flags are not taken into account here.
bool isModified() const { return isAdded() || isRemoved() || isReplaced(); }
/// Sets the MovedLeft flag on this item
void markMovedLeft() { flags.setFlag(MovedLeft); }
/// Sets the MovedRight flag on this item
void markMovedRight() { flags.setFlag(MovedRight); }
@ -64,7 +65,11 @@ public:
/// Sets the Removed flag on this item
void markRemoved() { flags.setFlag(Removed); }
/// Sets the Replaced flag on this item
void markReplaced() { flags.setFlag(Replaced); }
};
using Sequence = std::vector<SequenceItem>;
/// Sequence of items
using Sequence = typename std::vector<SequenceItem>;
/// Mutable iterator pointing into a sequence
using SequenceIterator = typename Sequence::iterator;
/// Range of sequence items, stored as a pair of iterators
/// (first item, one past the last item)
using SequenceItemRange = typename std::pair<SequenceIterator, SequenceIterator>;
/// List of item ranges
using SequenceItemRanges = typename std::vector<SequenceItemRange>;
/// Marks a sequence with set of flags representing added/removed/replaced/moved
/// items. Moved items sequences must be sorted.
@ -74,6 +79,15 @@ public:
static void markSequence(Sequence& sequence,
const std::vector<size_t>& movedItemsLeft,
const std::vector<size_t>& movedItemsRight);
/// Returns item ranges, which should be checked - for example,
/// for text modification. Each range covers a maximal run of
/// consecutive items, which are added, removed or replaced.
/// \param sequence Sequence
/// \returns Iterator ranges pointing into the given sequence
static SequenceItemRanges getModifiedRanges(Sequence& sequence);
/// Collect flags from given item range. Flags of all items
/// in the range are combined together using bitwise OR.
/// \param range Range
/// \returns Combined flags of all items in the range
static SequenceItemFlags collectFlags(const SequenceItemRange& range);
};
/// Algorithm for computing longest common subsequence, on two sequences

View File

@ -198,11 +198,13 @@ struct PDFDiffPageContext
PDFInteger pageIndex = 0;
std::array<uint8_t, 64> pageHash = { };
PDFPrecompiledPage::GraphicPieceInfos graphicPieces;
PDFDocumentTextFlow text;
};
void PDFDiff::performPageMatching(const std::vector<PDFDiffPageContext>& leftPreparedPages,
const std::vector<PDFDiffPageContext>& rightPreparedPages,
PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence)
PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence,
std::map<size_t, size_t>& pageMatches)
{
// Match pages. We will use following algorithm: exact solution can fail, because
// we are using hashes and due to numerical instability, hashes can be different
@ -210,7 +212,6 @@ void PDFDiff::performPageMatching(const std::vector<PDFDiffPageContext>& leftPre
// So, we use longest common subsequence algorithm to detect same page ranges,
// and then we match the rest. We assume the number of failing pages is relatively small.
std::map<size_t, size_t> pageMatches;
auto comparePages = [&](const PDFDiffPageContext& left, const PDFDiffPageContext& right)
{
if (left.pageHash == right.pageHash)
@ -311,6 +312,7 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std::
std::vector<PDFDiffPageContext> rightPreparedPages;
PDFDiffHelper::PageSequence pageSequence;
std::map<size_t, size_t> pageMatches; // Indices are real page indices, not indices to page contexts
auto createDiffPageContext = [](auto pageIndex)
{
@ -381,7 +383,7 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std::
// StepMatchPages
if (!m_cancelled)
{
performPageMatching(leftPreparedPages, rightPreparedPages, pageSequence);
performPageMatching(leftPreparedPages, rightPreparedPages, pageSequence, pageMatches);
stepProgress();
}
@ -391,6 +393,16 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std::
pdf::PDFDocumentTextFlowFactory factoryLeftDocumentTextFlow;
factoryLeftDocumentTextFlow.setCalculateBoundingBoxes(true);
PDFDocumentTextFlow leftTextFlow = factoryLeftDocumentTextFlow.create(m_leftDocument, leftPages, PDFDocumentTextFlowFactory::Algorithm::Auto);
std::map<PDFInteger, PDFDocumentTextFlow> splittedText = leftTextFlow.split(PDFDocumentTextFlow::Text);
for (PDFDiffPageContext& leftContext : leftPreparedPages)
{
auto it = splittedText.find(leftContext.pageIndex);
if (it != splittedText.cend())
{
leftContext.text = std::move(it->second);
splittedText.erase(it);
}
}
stepProgress();
}
@ -400,16 +412,61 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std::
pdf::PDFDocumentTextFlowFactory factoryRightDocumentTextFlow;
factoryRightDocumentTextFlow.setCalculateBoundingBoxes(true);
PDFDocumentTextFlow rightTextFlow = factoryRightDocumentTextFlow.create(m_rightDocument, rightPages, PDFDocumentTextFlowFactory::Algorithm::Auto);
std::map<PDFInteger, PDFDocumentTextFlow> splittedText = rightTextFlow.split(PDFDocumentTextFlow::Text);
for (PDFDiffPageContext& rightContext : rightPreparedPages)
{
auto it = splittedText.find(rightContext.pageIndex);
if (it != splittedText.cend())
{
rightContext.text = std::move(it->second);
splittedText.erase(it);
}
}
stepProgress();
}
// StepCompare
if (!m_cancelled)
{
performCompare(leftPreparedPages, rightPreparedPages, pageSequence, pageMatches);
stepProgress();
}
}
/// Evaluates the marked page sequence and stores detected differences
/// into the result. So far, only moved pages are reported.
/// \param leftPreparedPages Prepared page contexts of the left document
/// \param rightPreparedPages Prepared page contexts of the right document
/// \param pageSequence Page sequence with marked (moved/added/removed/replaced) items
/// \param pageMatches Mapping from page index of the left document to the
///        page index of the matched page in the right document
void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPreparedPages,
                             const std::vector<PDFDiffPageContext>& rightPreparedPages,
                             PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence,
                             const std::map<size_t, size_t>& pageMatches)
{
    using AlgorithmLCS = PDFAlgorithmLongestCommonSubsequenceBase;

    auto modifiedRanges = AlgorithmLCS::getModifiedRanges(pageSequence);

    // First find all moved pages
    for (const AlgorithmLCS::SequenceItem& item : pageSequence)
    {
        if (item.isMovedLeft())
        {
            Q_ASSERT(item.index1 < leftPreparedPages.size());
            const PDFInteger leftIndex = leftPreparedPages[item.index1].pageIndex;

            // Single lookup: a missing match is still reported by the assert
            // in debug builds, but it does not throw std::out_of_range in
            // release builds (where the assert is compiled out).
            const auto matchIt = pageMatches.find(leftIndex);
            Q_ASSERT(matchIt != pageMatches.cend());

            if (matchIt != pageMatches.cend())
            {
                const PDFInteger rightIndex = matchIt->second;
                m_result.addPageMoved(leftIndex, rightIndex);
            }
        }

        if (item.isMoved())
        {
            m_result.addPageMoved(leftPreparedPages[item.index1].pageIndex, rightPreparedPages[item.index2].pageIndex);
        }
    }

    // Flags of each modified range are collected, but they are not evaluated
    // yet - the values below are placeholders, so they are marked as possibly
    // unused to avoid compiler warnings.
    for (const auto& range : modifiedRanges)
    {
        const AlgorithmLCS::SequenceItemFlags flags = AlgorithmLCS::collectFlags(range);

        [[maybe_unused]] const bool isAdded = flags.testFlag(AlgorithmLCS::Added);
        [[maybe_unused]] const bool isRemoved = flags.testFlag(AlgorithmLCS::Removed);
    }
}
void PDFDiff::finalizeGraphicsPieces(PDFDiffPageContext& context)
{
std::sort(context.graphicPieces.begin(), context.graphicPieces.end());
@ -473,6 +530,18 @@ PDFDiffResult::PDFDiffResult() :
}
/// Appends a "page moved" difference to the list of differences.
/// \param pageIndex1 Index of the page in the old document
/// \param pageIndex2 Index of the page in the new document
void PDFDiffResult::addPageMoved(PDFInteger pageIndex1, PDFInteger pageIndex2)
{
    // Page numbers displayed in the message are the page indices incremented by one
    QString description = PDFDiff::tr("Page no. %1 from old document has been moved to a new document at page no. %2.").arg(pageIndex1 + 1).arg(pageIndex2 + 1);

    Difference movedPage;
    movedPage.type = Type::PageMoved;
    movedPage.pageIndex1 = pageIndex1;
    movedPage.pageIndex2 = pageIndex2;
    movedPage.message = std::move(description);

    m_differences.push_back(std::move(movedPage));
}
PDFDiffHelper::Differences PDFDiffHelper::calculateDifferences(const GraphicPieceInfos& left,
const GraphicPieceInfos& right,
PDFReal epsilon)

View File

@ -39,10 +39,29 @@ class PDFDiffResult
public:
explicit PDFDiffResult();
/// Type of a difference
enum class Type
{
Invalid, ///< Default type of a difference, which has not been filled in
PageMoved ///< Page has been moved (see addPageMoved)
};
/// Single difference found between the compared documents
struct Difference
{
Type type = Type::Invalid; ///< Type of the difference
PDFInteger pageIndex1 = -1; ///< Page index in the first (old) document, -1 by default
PDFInteger pageIndex2 = -1; ///< Page index in the second (new) document, -1 by default
QString message; ///< Text describing the difference
};
using Differences = std::vector<Difference>;
void setResult(PDFOperationResult result) { m_result = std::move(result); }
const PDFOperationResult& getResult() const { return m_result; }
/// Adds a difference of type PageMoved to the list of differences
/// \param pageIndex1 Index of the page in the old document
/// \param pageIndex2 Index of the page in the new document
void addPageMoved(PDFInteger pageIndex1, PDFInteger pageIndex2);
private:
Differences m_differences;
PDFOperationResult m_result;
};
@ -129,7 +148,12 @@ private:
const std::vector<PDFInteger>& rightPages);
void performPageMatching(const std::vector<PDFDiffPageContext>& leftPreparedPages,
const std::vector<PDFDiffPageContext>& rightPreparedPages,
PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence);
PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence,
std::map<size_t, size_t>& pageMatches);
/// Evaluates the marked page sequence and adds detected differences
/// (such as moved pages) to the result.
/// \param leftPreparedPages Prepared page contexts of the left document
/// \param rightPreparedPages Prepared page contexts of the right document
/// \param pageSequence Page sequence with marked items
/// \param pageMatches Mapping from page index of the left document to the
///        page index of the matched page in the right document
void performCompare(const std::vector<PDFDiffPageContext>& leftPreparedPages,
const std::vector<PDFDiffPageContext>& rightPreparedPages,
PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence,
const std::map<size_t, size_t>& pageMatches);
void finalizeGraphicsPieces(PDFDiffPageContext& context);
void onComparationPerformed();

View File

@ -1040,4 +1040,19 @@ void PDFDocumentTextFlowEditor::updateModifiedFlag(size_t index)
item->editedItemFlags.setFlag(Modified, isModified);
}
/// Distributes items of this text flow into separate text flows, one for
/// each page index. Only items having at least one flag from the mask
/// are copied, the other items are left out.
/// \param mask Mask of item flags
/// \returns Map from page index to the text flow containing items of that page
std::map<PDFInteger, PDFDocumentTextFlow> PDFDocumentTextFlow::split(Flags mask) const
{
    std::map<PDFInteger, PDFDocumentTextFlow> flowsByPage;

    for (const Item& currentItem : m_items)
    {
        if (currentItem.flags & mask)
        {
            // Text flow of the page is created on the first access
            PDFDocumentTextFlow& pageFlow = flowsByPage[currentItem.pageIndex];
            pageFlow.addItem(currentItem);
        }
    }

    return flowsByPage;
}
} // namespace pdf

View File

@ -71,6 +71,9 @@ public:
}
/// Add text item
void addItem(Item item) { m_items.emplace_back(std::move(item)); }
const Items& getItems() const { return m_items; }
/// Returns item at a given index
@ -83,6 +86,11 @@ public:
/// Returns true, if text flow is empty
bool isEmpty() const { return m_items.empty(); }
/// Split text flow to pages using given mask. Items, which
/// are masked out, are not added.
/// \param mask Mask
/// \returns Map from page index to the text flow with items of that page
std::map<PDFInteger, PDFDocumentTextFlow> split(Flags mask) const;
private:
Items m_items;
};