From 9bc29da83c5710f70c9e25deca91e8f998c7d2bf Mon Sep 17 00:00:00 2001 From: Jakub Melka Date: Sat, 25 Sep 2021 17:05:08 +0200 Subject: [PATCH] DocDiff application: text comparation --- Pdf4QtLib/sources/pdfdiff.cpp | 138 +++++++++++++++++++++- Pdf4QtLib/sources/pdfdiff.h | 5 + Pdf4QtLib/sources/pdfdocumenttextflow.cpp | 17 +++ Pdf4QtLib/sources/pdfdocumenttextflow.h | 7 ++ 4 files changed, 163 insertions(+), 4 deletions(-) diff --git a/Pdf4QtLib/sources/pdfdiff.cpp b/Pdf4QtLib/sources/pdfdiff.cpp index 13c764c..e71a248 100644 --- a/Pdf4QtLib/sources/pdfdiff.cpp +++ b/Pdf4QtLib/sources/pdfdiff.cpp @@ -46,6 +46,14 @@ public: bool isEmpty() const { return left.empty() && right.empty(); } }; + struct TextFlowDifferences + { + PDFDocumentTextFlow leftTextFlow; + PDFDocumentTextFlow rightTextFlow; + QString leftText; + QString rightText; + }; + static Differences calculateDifferences(const GraphicPieceInfos& left, const GraphicPieceInfos& right, PDFReal epsilon); static std::vector getLeftUnmatched(const PageSequence& sequence); static std::vector getRightUnmatched(const PageSequence& sequence); @@ -59,7 +67,8 @@ PDFDiff::PDFDiff(QObject* parent) : m_rightDocument(nullptr), m_options(Asynchronous | PC_Text | PC_VectorGraphics | PC_Images), m_epsilon(0.001), - m_cancelled(false) + m_cancelled(false), + m_textAnalysisAlgorithm(PDFDocumentTextFlowFactory::Algorithm::Layout) { } @@ -392,7 +401,7 @@ void PDFDiff::performSteps(const std::vector& leftPages, const std:: { pdf::PDFDocumentTextFlowFactory factoryLeftDocumentTextFlow; factoryLeftDocumentTextFlow.setCalculateBoundingBoxes(true); - PDFDocumentTextFlow leftTextFlow = factoryLeftDocumentTextFlow.create(m_leftDocument, leftPages, PDFDocumentTextFlowFactory::Algorithm::Auto); + PDFDocumentTextFlow leftTextFlow = factoryLeftDocumentTextFlow.create(m_leftDocument, leftPages, m_textAnalysisAlgorithm); std::map splittedText = leftTextFlow.split(PDFDocumentTextFlow::Text); for (PDFDiffPageContext& leftContext : 
leftPreparedPages) { @@ -411,7 +420,7 @@ void PDFDiff::performSteps(const std::vector& leftPages, const std:: { pdf::PDFDocumentTextFlowFactory factoryRightDocumentTextFlow; factoryRightDocumentTextFlow.setCalculateBoundingBoxes(true); - PDFDocumentTextFlow rightTextFlow = factoryRightDocumentTextFlow.create(m_rightDocument, rightPages, PDFDocumentTextFlowFactory::Algorithm::Auto); + PDFDocumentTextFlow rightTextFlow = factoryRightDocumentTextFlow.create(m_rightDocument, rightPages, m_textAnalysisAlgorithm); std::map splittedText = rightTextFlow.split(PDFDocumentTextFlow::Text); for (PDFDiffPageContext& rightContext : rightPreparedPages) { @@ -458,6 +467,8 @@ void PDFDiff::performCompare(const std::vector& leftPrepared } } + std::vector textFlowDifferences; + for (const auto& range : modifiedRanges) { AlgorithmLCS::SequenceItemFlags flags = AlgorithmLCS::collectFlags(range); @@ -472,15 +483,25 @@ void PDFDiff::performCompare(const std::vector& leftPrepared // page range was added, or page range was removed. 
if (isReplaced) { + PDFDocumentTextFlow leftTextFlow; + PDFDocumentTextFlow rightTextFlow; + + const bool isTextComparedAsVectorGraphics = m_options.testFlag(CompareTextsAsVector); + for (auto it = range.first; it != range.second; ++it) { const AlgorithmLCS::SequenceItem& item = *it; if (item.isReplaced()) { - const bool isTextComparedAsVectorGraphics = m_options.testFlag(CompareTextsAsVector); const PDFDiffPageContext& leftPageContext = leftPreparedPages[item.index1]; const PDFDiffPageContext& rightPageContext = rightPreparedPages[item.index2]; + if (!isTextComparedAsVectorGraphics) + { + leftTextFlow.append(leftPageContext.text); + rightTextFlow.append(rightPageContext.text); + } + auto pageLeft = m_leftDocument->getCatalog()->getPage(leftPageContext.pageIndex); auto pageRight = m_rightDocument->getCatalog()->getPage(rightPageContext.pageIndex); PDFReal epsilon = (calculateEpsilonForPage(pageLeft) + calculateEpsilonForPage(pageRight)) * 0.5; @@ -549,14 +570,42 @@ void PDFDiff::performCompare(const std::vector& leftPrepared if (item.isAdded()) { const PDFDiffPageContext& rightPageContext = rightPreparedPages[item.index2]; + + if (!isTextComparedAsVectorGraphics) + { + rightTextFlow.append(rightPageContext.text); + } + m_result.addPageAdded(rightPageContext.pageIndex); } if (item.isRemoved()) { const PDFDiffPageContext& leftPageContext = leftPreparedPages[item.index1]; + + if (!isTextComparedAsVectorGraphics) + { + leftTextFlow.append(leftPageContext.text); + } + m_result.addPageRemoved(leftPageContext.pageIndex); } } + + textFlowDifferences.emplace_back(); + PDFDiffHelper::TextFlowDifferences& addedDifferences = textFlowDifferences.back(); + addedDifferences.leftText = leftTextFlow.getText(); + addedDifferences.rightText = rightTextFlow.getText(); + + if (addedDifferences.leftText == addedDifferences.rightText) + { + // Text is the same, no difference is found + textFlowDifferences.pop_back(); + } + else + { + addedDifferences.leftTextFlow = 
std::move(leftTextFlow); + addedDifferences.rightTextFlow = std::move(rightTextFlow); + } } else { @@ -576,6 +625,77 @@ void PDFDiff::performCompare(const std::vector& leftPrepared } } } + + // Jakub Melka: try to compare text differences + auto compareTexts = [this](PDFDiffHelper::TextFlowDifferences& context) + { + struct CompareItem + { + size_t index = 0; + int charIndex = 0; + bool left = false; + }; + + std::vector leftItems; + std::vector rightItems; + + const size_t leftCount = context.leftTextFlow.getSize(); + for (size_t i = 0; i < leftCount; ++i) + { + CompareItem item; + item.index = i; + item.left = true; + + const PDFDocumentTextFlow::Item* textFlowItem = context.leftTextFlow.getItem(i); + for (int j = 0; j < textFlowItem->text.size(); ++j) + { + item.charIndex = j; + leftItems.push_back(item); + } + } + + const size_t rightCount = context.rightTextFlow.getSize(); + for (size_t i = 0; i < rightCount; ++i) + { + CompareItem item; + item.index = i; + item.left = false; + + const PDFDocumentTextFlow::Item* textFlowItem = context.rightTextFlow.getItem(i); + for (int j = 0; j < textFlowItem->text.size(); ++j) + { + item.charIndex = j; + rightItems.push_back(item); + } + } + + auto compareCharacters = [&](const CompareItem& a, const CompareItem& b) + { + + + const auto& aItem = a.left ? context.leftTextFlow : context.rightTextFlow; + const auto& bItem = b.left ? 
context.leftTextFlow : context.rightTextFlow; + + QChar aChar = aItem.getItem(a.index)->text[a.charIndex]; + QChar bChar = bItem.getItem(b.index)->text[b.charIndex]; + + return aChar == bChar; + }; + PDFAlgorithmLongestCommonSubsequence algorithm(leftItems.cbegin(), leftItems.cend(), + rightItems.cbegin(), rightItems.cend(), + compareCharacters); + algorithm.perform(); + PDFAlgorithmLongestCommonSubsequenceBase::Sequence sequence = algorithm.getSequence(); + PDFAlgorithmLongestCommonSubsequenceBase::markSequence(sequence, { }, { }); + PDFAlgorithmLongestCommonSubsequenceBase::SequenceItemRanges modifiedRanges = PDFAlgorithmLongestCommonSubsequenceBase::getModifiedRanges(sequence); + + for (const auto& range : modifiedRanges) + { + + } + }; + + PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, textFlowDifferences.begin(), textFlowDifferences.end(), compareTexts); } void PDFDiff::finalizeGraphicsPieces(PDFDiffPageContext& context) @@ -635,6 +755,16 @@ PDFReal PDFDiff::calculateEpsilonForPage(const PDFPage* page) const return factor * m_epsilon; } +PDFDocumentTextFlowFactory::Algorithm PDFDiff::getTextAnalysisAlgorithm() const +{ + return m_textAnalysisAlgorithm; +} + +void PDFDiff::setTextAnalysisAlgorithm(PDFDocumentTextFlowFactory::Algorithm textAnalysisAlgorithm) +{ + m_textAnalysisAlgorithm = textAnalysisAlgorithm; +} + PDFDiffResult::PDFDiffResult() : m_result(true) { diff --git a/Pdf4QtLib/sources/pdfdiff.h b/Pdf4QtLib/sources/pdfdiff.h index 100deed..8a905e6 100644 --- a/Pdf4QtLib/sources/pdfdiff.h +++ b/Pdf4QtLib/sources/pdfdiff.h @@ -22,6 +22,7 @@ #include "pdfprogress.h" #include "pdfutils.h" #include "pdfalgorithmlcs.h" +#include "pdfdocumenttextflow.h" #include #include @@ -160,6 +161,9 @@ public: /// Returns result of a comparation process const PDFDiffResult& getResult() const { return m_result; } + PDFDocumentTextFlowFactory::Algorithm getTextAnalysisAlgorithm() const; + void setTextAnalysisAlgorithm(PDFDocumentTextFlowFactory::Algorithm 
textAnalysisAlgorithm); + signals: void comparationFinished(); @@ -207,6 +211,7 @@ private: PDFReal m_epsilon; std::atomic_bool m_cancelled; PDFDiffResult m_result; + PDFDocumentTextFlowFactory::Algorithm m_textAnalysisAlgorithm; QFuture m_future; std::optional> m_futureWatcher; diff --git a/Pdf4QtLib/sources/pdfdocumenttextflow.cpp b/Pdf4QtLib/sources/pdfdocumenttextflow.cpp index 83b7515..3481ed7 100644 --- a/Pdf4QtLib/sources/pdfdocumenttextflow.cpp +++ b/Pdf4QtLib/sources/pdfdocumenttextflow.cpp @@ -1055,4 +1055,21 @@ std::map PDFDocumentTextFlow::split(Flags mask) return result; } +void PDFDocumentTextFlow::append(const PDFDocumentTextFlow& textFlow) +{ + m_items.insert(m_items.end(), textFlow.m_items.cbegin(), textFlow.m_items.cend()); +} + +QString PDFDocumentTextFlow::getText() const +{ + QStringList texts; + + for (const auto& item : m_items) + { + texts << item.text.trimmed(); + } + + return texts.join(" "); +} + } // namespace pdf diff --git a/Pdf4QtLib/sources/pdfdocumenttextflow.h b/Pdf4QtLib/sources/pdfdocumenttextflow.h index f247606..2d9d748 100644 --- a/Pdf4QtLib/sources/pdfdocumenttextflow.h +++ b/Pdf4QtLib/sources/pdfdocumenttextflow.h @@ -91,6 +91,13 @@ public: /// \param mask Mask std::map split(Flags mask) const; + /// Appends document text flow to this one + /// \param textFlow Text flow + void append(const PDFDocumentTextFlow& textFlow); + + /// Returns text concatenated from all items + QString getText() const; + private: Items m_items; };