mirror of https://github.com/JakubMelka/PDF4QT.git
DocDiff application: detect moved pages
This commit is contained in:
parent
3e327f8201
commit
c00939f536
|
@ -121,7 +121,57 @@ void PDFAlgorithmLongestCommonSubsequenceBase::markSequence(Sequence& sequence,
|
|||
}
|
||||
}
|
||||
|
||||
for (SequenceItem& item : updatedSequence)
|
||||
{
|
||||
if (item.isMatch() && !item.isRemoved() && !item.isReplaced() && !item.isAdded() && item.index1 != item.index2)
|
||||
{
|
||||
item.markMoved();
|
||||
}
|
||||
}
|
||||
|
||||
sequence = qMove(updatedSequence);
|
||||
}
|
||||
|
||||
/// Finds all maximal runs of consecutive modified items in a sequence.
/// An item belongs to a run when its isModified() function returns true.
/// \param sequence Sequence to be scanned; returned iterators point into it
/// \returns Half-open iterator ranges [first, second), one for each run
///          of modified items, in the order in which they occur
PDFAlgorithmLongestCommonSubsequenceBase::SequenceItemRanges PDFAlgorithmLongestCommonSubsequenceBase::getModifiedRanges(Sequence& sequence)
{
    SequenceItemRanges ranges;

    auto runBegin = sequence.begin();
    while (runBegin != sequence.end())
    {
        if (runBegin->isModified())
        {
            // The run starts here. Advance the end iterator past
            // all directly following modified items.
            auto runEnd = runBegin;
            do
            {
                ++runEnd;
            } while (runEnd != sequence.end() && runEnd->isModified());

            ranges.emplace_back(runBegin, runEnd);

            // Continue scanning behind the run just stored
            runBegin = runEnd;
        }
        else
        {
            // Unmodified item, nothing to report
            ++runBegin;
        }
    }

    return ranges;
}
|
||||
|
||||
/// Combines the flags of all items in a range using bitwise OR.
/// \param range Half-open iterator range [first, second) of sequence items
/// \returns Union of flags of all items in the range; no flag is set
///          if the range is empty
PDFAlgorithmLongestCommonSubsequenceBase::SequenceItemFlags PDFAlgorithmLongestCommonSubsequenceBase::collectFlags(const SequenceItemRange& range)
{
    SequenceItemFlags collected = 0;

    auto current = range.first;
    while (current != range.second)
    {
        collected |= current->flags;
        ++current;
    }

    return collected;
}
|
||||
|
||||
} // namespace pdf
|
||||
|
|
|
@ -30,7 +30,7 @@ public:
|
|||
enum SequenceItemFlag
|
||||
{
|
||||
None = 0x0000,
|
||||
MovedLeft = 0x0001, ///< Item has been moved from this position (is present in sequence no. 1)
|
||||
MovedLeft = 0x0001, ///< Item has been moved from this position (is present in a sequence no. 1)
|
||||
MovedRight = 0x0002, ///< Item has been moved to this position (is present in a sequence no. 2)
|
||||
Moved = 0x0004, ///< Index of item has been changed
|
||||
Added = 0x0008, ///< Item has been added to a sequence no. 2
|
||||
|
@ -56,6 +56,7 @@ public:
|
|||
bool isAdded() const { return flags.testFlag(Added); }
|
||||
bool isRemoved() const { return flags.testFlag(Removed); }
|
||||
bool isReplaced() const { return flags.testFlag(Replaced); }
|
||||
/// Returns true, if the item is flagged as added, removed or replaced
bool isModified() const { return isAdded() || isRemoved() || isReplaced(); }
|
||||
|
||||
void markMovedLeft() { flags.setFlag(MovedLeft); }
|
||||
void markMovedRight() { flags.setFlag(MovedRight); }
|
||||
|
@ -64,7 +65,11 @@ public:
|
|||
void markRemoved() { flags.setFlag(Removed); }
|
||||
void markReplaced() { flags.setFlag(Replaced); }
|
||||
};
|
||||
using Sequence = std::vector<SequenceItem>;
|
||||
|
||||
using Sequence = typename std::vector<SequenceItem>;
|
||||
using SequenceIterator = typename Sequence::iterator;
|
||||
using SequenceItemRange = typename std::pair<SequenceIterator, SequenceIterator>;
|
||||
using SequenceItemRanges = typename std::vector<SequenceItemRange>;
|
||||
|
||||
/// Marks a sequence with set of flags representing added/removed/replaced/moved
|
||||
/// items. Moved items sequences must be sorted.
|
||||
|
@ -74,6 +79,15 @@ public:
|
|||
static void markSequence(Sequence& sequence,
|
||||
const std::vector<size_t>& movedItemsLeft,
|
||||
const std::vector<size_t>& movedItemsRight);
|
||||
|
||||
/// Returns item ranges, which should be checked - for example,
|
||||
/// for text modification.
|
||||
/// \param sequence Sequence
|
||||
static SequenceItemRanges getModifiedRanges(Sequence& sequence);
|
||||
|
||||
/// Collect flags from given item range
|
||||
/// \param range Range
|
||||
static SequenceItemFlags collectFlags(const SequenceItemRange& range);
|
||||
};
|
||||
|
||||
/// Algorithm for computing longest common subsequence, on two sequences
|
||||
|
|
|
@ -198,11 +198,13 @@ struct PDFDiffPageContext
|
|||
PDFInteger pageIndex = 0;
|
||||
std::array<uint8_t, 64> pageHash = { };
|
||||
PDFPrecompiledPage::GraphicPieceInfos graphicPieces;
|
||||
PDFDocumentTextFlow text;
|
||||
};
|
||||
|
||||
void PDFDiff::performPageMatching(const std::vector<PDFDiffPageContext>& leftPreparedPages,
|
||||
const std::vector<PDFDiffPageContext>& rightPreparedPages,
|
||||
PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence)
|
||||
PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence,
|
||||
std::map<size_t, size_t>& pageMatches)
|
||||
{
|
||||
// Match pages. We will use following algorithm: exact solution can fail, because
|
||||
// we are using hashes and due to numerical instability, hashes can be different
|
||||
|
@ -210,7 +212,6 @@ void PDFDiff::performPageMatching(const std::vector<PDFDiffPageContext>& leftPre
|
|||
// So, we use longest common subsequence algorithm to detect same page ranges,
|
||||
// and then we match the rest. We assume the number of failing pages is relatively small.
|
||||
|
||||
std::map<size_t, size_t> pageMatches;
|
||||
auto comparePages = [&](const PDFDiffPageContext& left, const PDFDiffPageContext& right)
|
||||
{
|
||||
if (left.pageHash == right.pageHash)
|
||||
|
@ -311,6 +312,7 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std::
|
|||
std::vector<PDFDiffPageContext> rightPreparedPages;
|
||||
|
||||
PDFDiffHelper::PageSequence pageSequence;
|
||||
std::map<size_t, size_t> pageMatches; // Indices are real page indices, not indices to page contexts
|
||||
|
||||
auto createDiffPageContext = [](auto pageIndex)
|
||||
{
|
||||
|
@ -381,7 +383,7 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std::
|
|||
// StepMatchPages
|
||||
if (!m_cancelled)
|
||||
{
|
||||
performPageMatching(leftPreparedPages, rightPreparedPages, pageSequence);
|
||||
performPageMatching(leftPreparedPages, rightPreparedPages, pageSequence, pageMatches);
|
||||
stepProgress();
|
||||
}
|
||||
|
||||
|
@ -391,6 +393,16 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std::
|
|||
pdf::PDFDocumentTextFlowFactory factoryLeftDocumentTextFlow;
|
||||
factoryLeftDocumentTextFlow.setCalculateBoundingBoxes(true);
|
||||
PDFDocumentTextFlow leftTextFlow = factoryLeftDocumentTextFlow.create(m_leftDocument, leftPages, PDFDocumentTextFlowFactory::Algorithm::Auto);
|
||||
std::map<PDFInteger, PDFDocumentTextFlow> splittedText = leftTextFlow.split(PDFDocumentTextFlow::Text);
|
||||
for (PDFDiffPageContext& leftContext : leftPreparedPages)
|
||||
{
|
||||
auto it = splittedText.find(leftContext.pageIndex);
|
||||
if (it != splittedText.cend())
|
||||
{
|
||||
leftContext.text = std::move(it->second);
|
||||
splittedText.erase(it);
|
||||
}
|
||||
}
|
||||
stepProgress();
|
||||
}
|
||||
|
||||
|
@ -400,16 +412,61 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std::
|
|||
pdf::PDFDocumentTextFlowFactory factoryRightDocumentTextFlow;
|
||||
factoryRightDocumentTextFlow.setCalculateBoundingBoxes(true);
|
||||
PDFDocumentTextFlow rightTextFlow = factoryRightDocumentTextFlow.create(m_rightDocument, rightPages, PDFDocumentTextFlowFactory::Algorithm::Auto);
|
||||
std::map<PDFInteger, PDFDocumentTextFlow> splittedText = rightTextFlow.split(PDFDocumentTextFlow::Text);
|
||||
for (PDFDiffPageContext& rightContext : rightPreparedPages)
|
||||
{
|
||||
auto it = splittedText.find(rightContext.pageIndex);
|
||||
if (it != splittedText.cend())
|
||||
{
|
||||
rightContext.text = std::move(it->second);
|
||||
splittedText.erase(it);
|
||||
}
|
||||
}
|
||||
stepProgress();
|
||||
}
|
||||
|
||||
// StepCompare
|
||||
if (!m_cancelled)
|
||||
{
|
||||
performCompare(leftPreparedPages, rightPreparedPages, pageSequence, pageMatches);
|
||||
stepProgress();
|
||||
}
|
||||
}
|
||||
|
||||
void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPreparedPages,
|
||||
const std::vector<PDFDiffPageContext>& rightPreparedPages,
|
||||
PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence,
|
||||
const std::map<size_t, size_t>& pageMatches)
|
||||
{
|
||||
using AlgorithmLCS = PDFAlgorithmLongestCommonSubsequenceBase;
|
||||
|
||||
auto modifiedRanges = AlgorithmLCS::getModifiedRanges(pageSequence);
|
||||
|
||||
// First find all moved pages
|
||||
for (const AlgorithmLCS::SequenceItem& item : pageSequence)
|
||||
{
|
||||
if (item.isMovedLeft())
|
||||
{
|
||||
Q_ASSERT(pageMatches.contains(leftPreparedPages.at(item.index1).pageIndex));
|
||||
const PDFInteger leftIndex = leftPreparedPages[item.index1].pageIndex;
|
||||
const PDFInteger rightIndex = pageMatches.at(leftIndex);
|
||||
m_result.addPageMoved(leftIndex, rightIndex);
|
||||
}
|
||||
if (item.isMoved())
|
||||
{
|
||||
m_result.addPageMoved(leftPreparedPages[item.index1].pageIndex, rightPreparedPages[item.index2].pageIndex);
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto& range : modifiedRanges)
|
||||
{
|
||||
AlgorithmLCS::SequenceItemFlags flags = AlgorithmLCS::collectFlags(range);
|
||||
|
||||
const bool isAdded = flags.testFlag(AlgorithmLCS::Added);
|
||||
const bool isRemoved = flags.testFlag(AlgorithmLCS::Removed);
|
||||
}
|
||||
}
|
||||
|
||||
void PDFDiff::finalizeGraphicsPieces(PDFDiffPageContext& context)
|
||||
{
|
||||
std::sort(context.graphicPieces.begin(), context.graphicPieces.end());
|
||||
|
@ -473,6 +530,18 @@ PDFDiffResult::PDFDiffResult() :
|
|||
|
||||
}
|
||||
|
||||
/// Appends a difference of type PageMoved to the list of differences.
/// Page numbers in the message are one-based (index + 1).
/// \param pageIndex1 Page index in the old (left) document
/// \param pageIndex2 Page index in the new (right) document
void PDFDiffResult::addPageMoved(PDFInteger pageIndex1, PDFInteger pageIndex2)
{
    const QString messageTemplate = PDFDiff::tr("Page no. %1 from old document has been moved to a new document at page no. %2.");

    Difference movedPage;
    movedPage.type = Type::PageMoved;
    movedPage.pageIndex1 = pageIndex1;
    movedPage.pageIndex2 = pageIndex2;
    movedPage.message = messageTemplate.arg(pageIndex1 + 1).arg(pageIndex2 + 1);

    m_differences.push_back(std::move(movedPage));
}
|
||||
|
||||
PDFDiffHelper::Differences PDFDiffHelper::calculateDifferences(const GraphicPieceInfos& left,
|
||||
const GraphicPieceInfos& right,
|
||||
PDFReal epsilon)
|
||||
|
|
|
@ -39,10 +39,29 @@ class PDFDiffResult
|
|||
public:
|
||||
explicit PDFDiffResult();
|
||||
|
||||
/// Type of a difference stored in the result
enum class Type
|
||||
{
|
||||
Invalid, ///< Default type of a newly created Difference
|
||||
PageMoved ///< Page has been moved, see addPageMoved
|
||||
};
|
||||
|
||||
/// Single difference found between the compared documents
struct Difference
|
||||
{
|
||||
Type type = Type::Invalid; ///< Type of the difference
|
||||
PDFInteger pageIndex1 = -1; ///< First page index (page in the old document for PageMoved), -1 by default
|
||||
PDFInteger pageIndex2 = -1; ///< Second page index (page in the new document for PageMoved), -1 by default
|
||||
QString message; ///< Text describing the difference
|
||||
};
|
||||
|
||||
using Differences = std::vector<Difference>;
|
||||
|
||||
void setResult(PDFOperationResult result) { m_result = std::move(result); }
|
||||
const PDFOperationResult& getResult() const { return m_result; }
|
||||
|
||||
/// Adds a difference of type PageMoved
/// \param pageIndex1 Page index in the old document
/// \param pageIndex2 Page index in the new document
void addPageMoved(PDFInteger pageIndex1, PDFInteger pageIndex2);
|
||||
|
||||
private:
|
||||
Differences m_differences;
|
||||
PDFOperationResult m_result;
|
||||
};
|
||||
|
||||
|
@ -129,7 +148,12 @@ private:
|
|||
const std::vector<PDFInteger>& rightPages);
|
||||
void performPageMatching(const std::vector<PDFDiffPageContext>& leftPreparedPages,
|
||||
const std::vector<PDFDiffPageContext>& rightPreparedPages,
|
||||
PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence);
|
||||
PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence,
|
||||
std::map<size_t, size_t>& pageMatches);
|
||||
void performCompare(const std::vector<PDFDiffPageContext>& leftPreparedPages,
|
||||
const std::vector<PDFDiffPageContext>& rightPreparedPages,
|
||||
PDFAlgorithmLongestCommonSubsequenceBase::Sequence& pageSequence,
|
||||
const std::map<size_t, size_t>& pageMatches);
|
||||
void finalizeGraphicsPieces(PDFDiffPageContext& context);
|
||||
|
||||
void onComparationPerformed();
|
||||
|
|
|
@ -1040,4 +1040,19 @@ void PDFDocumentTextFlowEditor::updateModifiedFlag(size_t index)
|
|||
item->editedItemFlags.setFlag(Modified, isModified);
|
||||
}
|
||||
|
||||
/// Distributes items of this text flow into separate text flows, one per
/// page index. Only items having at least one flag from the mask are copied;
/// pages without any such item have no entry in the result.
/// \param mask Flags selecting items to be included
/// \returns Map from page index to text flow with selected items of that page
std::map<PDFInteger, PDFDocumentTextFlow> PDFDocumentTextFlow::split(Flags mask) const
{
    std::map<PDFInteger, PDFDocumentTextFlow> pageFlows;

    for (const Item& currentItem : m_items)
    {
        if (currentItem.flags & mask)
        {
            // operator[] creates an empty text flow on first use of the page index
            PDFDocumentTextFlow& pageFlow = pageFlows[currentItem.pageIndex];
            pageFlow.addItem(currentItem);
        }
    }

    return pageFlows;
}
|
||||
|
||||
} // namespace pdf
|
||||
|
|
|
@ -71,6 +71,9 @@ public:
|
|||
|
||||
}
|
||||
|
||||
/// Add text item
|
||||
void addItem(Item item) { m_items.emplace_back(std::move(item)); }
|
||||
|
||||
const Items& getItems() const { return m_items; }
|
||||
|
||||
/// Returns item at a given index
|
||||
|
@ -83,6 +86,11 @@ public:
|
|||
/// Returns true, if text flow is empty
|
||||
bool isEmpty() const { return m_items.empty(); }
|
||||
|
||||
/// Split text flow to pages using given mask. Items, which
|
||||
/// are masked out, are not added.
|
||||
/// \param mask Mask
|
||||
std::map<PDFInteger, PDFDocumentTextFlow> split(Flags mask) const;
|
||||
|
||||
private:
|
||||
Items m_items;
|
||||
};
|
||||
|
|
Loading…
Reference in New Issue