mirror of
				https://github.com/JakubMelka/PDF4QT.git
				synced 2025-06-05 21:59:17 +02:00 
			
		
		
		
	DocDiff application: text comparison
This commit is contained in:
		| @@ -46,6 +46,14 @@ public: | |||||||
|         bool isEmpty() const { return left.empty() && right.empty(); } |         bool isEmpty() const { return left.empty() && right.empty(); } | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|  |     struct TextFlowDifferences | ||||||
|  |     { | ||||||
|  |         PDFDocumentTextFlow leftTextFlow; | ||||||
|  |         PDFDocumentTextFlow rightTextFlow; | ||||||
|  |         QString leftText; | ||||||
|  |         QString rightText; | ||||||
|  |     }; | ||||||
|  |  | ||||||
|     static Differences calculateDifferences(const GraphicPieceInfos& left, const GraphicPieceInfos& right, PDFReal epsilon); |     static Differences calculateDifferences(const GraphicPieceInfos& left, const GraphicPieceInfos& right, PDFReal epsilon); | ||||||
|     static std::vector<size_t> getLeftUnmatched(const PageSequence& sequence); |     static std::vector<size_t> getLeftUnmatched(const PageSequence& sequence); | ||||||
|     static std::vector<size_t> getRightUnmatched(const PageSequence& sequence); |     static std::vector<size_t> getRightUnmatched(const PageSequence& sequence); | ||||||
| @@ -59,7 +67,8 @@ PDFDiff::PDFDiff(QObject* parent) : | |||||||
|     m_rightDocument(nullptr), |     m_rightDocument(nullptr), | ||||||
|     m_options(Asynchronous | PC_Text | PC_VectorGraphics | PC_Images), |     m_options(Asynchronous | PC_Text | PC_VectorGraphics | PC_Images), | ||||||
|     m_epsilon(0.001), |     m_epsilon(0.001), | ||||||
|     m_cancelled(false) |     m_cancelled(false), | ||||||
|  |     m_textAnalysisAlgorithm(PDFDocumentTextFlowFactory::Algorithm::Layout) | ||||||
| { | { | ||||||
|  |  | ||||||
| } | } | ||||||
| @@ -392,7 +401,7 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std:: | |||||||
|     { |     { | ||||||
|         pdf::PDFDocumentTextFlowFactory factoryLeftDocumentTextFlow; |         pdf::PDFDocumentTextFlowFactory factoryLeftDocumentTextFlow; | ||||||
|         factoryLeftDocumentTextFlow.setCalculateBoundingBoxes(true); |         factoryLeftDocumentTextFlow.setCalculateBoundingBoxes(true); | ||||||
|         PDFDocumentTextFlow leftTextFlow = factoryLeftDocumentTextFlow.create(m_leftDocument, leftPages, PDFDocumentTextFlowFactory::Algorithm::Auto); |         PDFDocumentTextFlow leftTextFlow = factoryLeftDocumentTextFlow.create(m_leftDocument, leftPages, m_textAnalysisAlgorithm); | ||||||
|         std::map<PDFInteger, PDFDocumentTextFlow> splittedText = leftTextFlow.split(PDFDocumentTextFlow::Text); |         std::map<PDFInteger, PDFDocumentTextFlow> splittedText = leftTextFlow.split(PDFDocumentTextFlow::Text); | ||||||
|         for (PDFDiffPageContext& leftContext : leftPreparedPages) |         for (PDFDiffPageContext& leftContext : leftPreparedPages) | ||||||
|         { |         { | ||||||
| @@ -411,7 +420,7 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std:: | |||||||
|     { |     { | ||||||
|         pdf::PDFDocumentTextFlowFactory factoryRightDocumentTextFlow; |         pdf::PDFDocumentTextFlowFactory factoryRightDocumentTextFlow; | ||||||
|         factoryRightDocumentTextFlow.setCalculateBoundingBoxes(true); |         factoryRightDocumentTextFlow.setCalculateBoundingBoxes(true); | ||||||
|         PDFDocumentTextFlow rightTextFlow = factoryRightDocumentTextFlow.create(m_rightDocument, rightPages, PDFDocumentTextFlowFactory::Algorithm::Auto); |         PDFDocumentTextFlow rightTextFlow = factoryRightDocumentTextFlow.create(m_rightDocument, rightPages, m_textAnalysisAlgorithm); | ||||||
|         std::map<PDFInteger, PDFDocumentTextFlow> splittedText = rightTextFlow.split(PDFDocumentTextFlow::Text); |         std::map<PDFInteger, PDFDocumentTextFlow> splittedText = rightTextFlow.split(PDFDocumentTextFlow::Text); | ||||||
|         for (PDFDiffPageContext& rightContext : rightPreparedPages) |         for (PDFDiffPageContext& rightContext : rightPreparedPages) | ||||||
|         { |         { | ||||||
| @@ -458,6 +467,8 @@ void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPrepared | |||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     std::vector<PDFDiffHelper::TextFlowDifferences> textFlowDifferences; | ||||||
|  |  | ||||||
|     for (const auto& range : modifiedRanges) |     for (const auto& range : modifiedRanges) | ||||||
|     { |     { | ||||||
|         AlgorithmLCS::SequenceItemFlags flags = AlgorithmLCS::collectFlags(range); |         AlgorithmLCS::SequenceItemFlags flags = AlgorithmLCS::collectFlags(range); | ||||||
| @@ -472,15 +483,25 @@ void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPrepared | |||||||
|         // page range was added, or page range was removed. |         // page range was added, or page range was removed. | ||||||
|         if (isReplaced) |         if (isReplaced) | ||||||
|         { |         { | ||||||
|  |             PDFDocumentTextFlow leftTextFlow; | ||||||
|  |             PDFDocumentTextFlow rightTextFlow; | ||||||
|  |  | ||||||
|  |             const bool isTextComparedAsVectorGraphics = m_options.testFlag(CompareTextsAsVector); | ||||||
|  |  | ||||||
|             for (auto it = range.first; it != range.second; ++it) |             for (auto it = range.first; it != range.second; ++it) | ||||||
|             { |             { | ||||||
|                 const AlgorithmLCS::SequenceItem& item = *it; |                 const AlgorithmLCS::SequenceItem& item = *it; | ||||||
|                 if (item.isReplaced()) |                 if (item.isReplaced()) | ||||||
|                 { |                 { | ||||||
|                     const bool isTextComparedAsVectorGraphics = m_options.testFlag(CompareTextsAsVector); |  | ||||||
|                     const PDFDiffPageContext& leftPageContext = leftPreparedPages[item.index1]; |                     const PDFDiffPageContext& leftPageContext = leftPreparedPages[item.index1]; | ||||||
|                     const PDFDiffPageContext& rightPageContext = rightPreparedPages[item.index2]; |                     const PDFDiffPageContext& rightPageContext = rightPreparedPages[item.index2]; | ||||||
|  |  | ||||||
|  |                     if (!isTextComparedAsVectorGraphics) | ||||||
|  |                     { | ||||||
|  |                         leftTextFlow.append(leftPageContext.text); | ||||||
|  |                         rightTextFlow.append(rightPageContext.text); | ||||||
|  |                     } | ||||||
|  |  | ||||||
|                     auto pageLeft = m_leftDocument->getCatalog()->getPage(leftPageContext.pageIndex); |                     auto pageLeft = m_leftDocument->getCatalog()->getPage(leftPageContext.pageIndex); | ||||||
|                     auto pageRight = m_rightDocument->getCatalog()->getPage(rightPageContext.pageIndex); |                     auto pageRight = m_rightDocument->getCatalog()->getPage(rightPageContext.pageIndex); | ||||||
|                     PDFReal epsilon = (calculateEpsilonForPage(pageLeft) + calculateEpsilonForPage(pageRight)) * 0.5; |                     PDFReal epsilon = (calculateEpsilonForPage(pageLeft) + calculateEpsilonForPage(pageRight)) * 0.5; | ||||||
| @@ -549,14 +570,42 @@ void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPrepared | |||||||
|                 if (item.isAdded()) |                 if (item.isAdded()) | ||||||
|                 { |                 { | ||||||
|                     const PDFDiffPageContext& rightPageContext = rightPreparedPages[item.index2]; |                     const PDFDiffPageContext& rightPageContext = rightPreparedPages[item.index2]; | ||||||
|  |  | ||||||
|  |                     if (!isTextComparedAsVectorGraphics) | ||||||
|  |                     { | ||||||
|  |                         rightTextFlow.append(rightPageContext.text); | ||||||
|  |                     } | ||||||
|  |  | ||||||
|                     m_result.addPageAdded(rightPageContext.pageIndex); |                     m_result.addPageAdded(rightPageContext.pageIndex); | ||||||
|                 } |                 } | ||||||
|                 if (item.isRemoved()) |                 if (item.isRemoved()) | ||||||
|                 { |                 { | ||||||
|                     const PDFDiffPageContext& leftPageContext = leftPreparedPages[item.index1]; |                     const PDFDiffPageContext& leftPageContext = leftPreparedPages[item.index1]; | ||||||
|  |  | ||||||
|  |                     if (!isTextComparedAsVectorGraphics) | ||||||
|  |                     { | ||||||
|  |                         leftTextFlow.append(leftPageContext.text); | ||||||
|  |                     } | ||||||
|  |  | ||||||
|                     m_result.addPageRemoved(leftPageContext.pageIndex); |                     m_result.addPageRemoved(leftPageContext.pageIndex); | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|  |  | ||||||
|  |             textFlowDifferences.emplace_back(); | ||||||
|  |             PDFDiffHelper::TextFlowDifferences& addedDifferences = textFlowDifferences.back(); | ||||||
|  |             addedDifferences.leftText = leftTextFlow.getText(); | ||||||
|  |             addedDifferences.rightText = rightTextFlow.getText(); | ||||||
|  |  | ||||||
|  |             if (addedDifferences.leftText == addedDifferences.rightText) | ||||||
|  |             { | ||||||
|  |                 // Text is the same, no difference is found | ||||||
|  |                 textFlowDifferences.pop_back(); | ||||||
|  |             } | ||||||
|  |             else | ||||||
|  |             { | ||||||
|  |                 addedDifferences.leftTextFlow = std::move(leftTextFlow); | ||||||
|  |                 addedDifferences.rightTextFlow = std::move(rightTextFlow); | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|         else |         else | ||||||
|         { |         { | ||||||
| @@ -576,6 +625,77 @@ void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPrepared | |||||||
|             } |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     // Jakub Melka: try to compare text differences | ||||||
|  |     auto compareTexts = [this](PDFDiffHelper::TextFlowDifferences& context) | ||||||
|  |     { | ||||||
|  |         struct CompareItem | ||||||
|  |         { | ||||||
|  |             size_t index = 0; | ||||||
|  |             int charIndex = 0; | ||||||
|  |             bool left = false; | ||||||
|  |         }; | ||||||
|  |  | ||||||
|  |         std::vector<CompareItem> leftItems; | ||||||
|  |         std::vector<CompareItem> rightItems; | ||||||
|  |  | ||||||
|  |         const size_t leftCount = context.leftTextFlow.getSize(); | ||||||
|  |         for (size_t i = 0; i < leftCount; ++i) | ||||||
|  |         { | ||||||
|  |             CompareItem item; | ||||||
|  |             item.index = i; | ||||||
|  |             item.left = true; | ||||||
|  |  | ||||||
|  |             const PDFDocumentTextFlow::Item* textFlowItem = context.leftTextFlow.getItem(i); | ||||||
|  |             for (int j = 0; j < textFlowItem->text.size(); ++j) | ||||||
|  |             { | ||||||
|  |                 item.charIndex = j; | ||||||
|  |                 leftItems.push_back(item); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         const size_t rightCount = context.rightTextFlow.getSize(); | ||||||
|  |         for (size_t i = 0; i < rightCount; ++i) | ||||||
|  |         { | ||||||
|  |             CompareItem item; | ||||||
|  |             item.index = i; | ||||||
|  |             item.left = false; | ||||||
|  |  | ||||||
|  |             const PDFDocumentTextFlow::Item* textFlowItem = context.rightTextFlow.getItem(i); | ||||||
|  |             for (int j = 0; j < textFlowItem->text.size(); ++j) | ||||||
|  |             { | ||||||
|  |                 item.charIndex = j; | ||||||
|  |                 rightItems.push_back(item); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         auto compareCharacters = [&](const CompareItem& a, const CompareItem& b) | ||||||
|  |         { | ||||||
|  |  | ||||||
|  |  | ||||||
|  |             const auto& aItem = a.left ? context.leftTextFlow : context.rightTextFlow; | ||||||
|  |             const auto& bItem = b.left ? context.leftTextFlow : context.rightTextFlow; | ||||||
|  |  | ||||||
|  |             QChar aChar = aItem.getItem(a.index)->text[a.charIndex]; | ||||||
|  |             QChar bChar = bItem.getItem(b.index)->text[b.charIndex]; | ||||||
|  |  | ||||||
|  |             return aChar == bChar; | ||||||
|  |         }; | ||||||
|  |         PDFAlgorithmLongestCommonSubsequence algorithm(leftItems.cbegin(), leftItems.cend(), | ||||||
|  |                                                        rightItems.cbegin(), rightItems.cend(), | ||||||
|  |                                                        compareCharacters); | ||||||
|  |         algorithm.perform(); | ||||||
|  |         PDFAlgorithmLongestCommonSubsequenceBase::Sequence sequence = algorithm.getSequence(); | ||||||
|  |         PDFAlgorithmLongestCommonSubsequenceBase::markSequence(sequence, { }, { }); | ||||||
|  |         PDFAlgorithmLongestCommonSubsequenceBase::SequenceItemRanges modifiedRanges = PDFAlgorithmLongestCommonSubsequenceBase::getModifiedRanges(sequence); | ||||||
|  |  | ||||||
|  |         for (const auto& range : modifiedRanges) | ||||||
|  |         { | ||||||
|  |  | ||||||
|  |         } | ||||||
|  |     }; | ||||||
|  |  | ||||||
|  |     PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, textFlowDifferences.begin(), textFlowDifferences.end(), compareTexts); | ||||||
| } | } | ||||||
|  |  | ||||||
| void PDFDiff::finalizeGraphicsPieces(PDFDiffPageContext& context) | void PDFDiff::finalizeGraphicsPieces(PDFDiffPageContext& context) | ||||||
| @@ -635,6 +755,16 @@ PDFReal PDFDiff::calculateEpsilonForPage(const PDFPage* page) const | |||||||
|     return factor * m_epsilon; |     return factor * m_epsilon; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | PDFDocumentTextFlowFactory::Algorithm PDFDiff::getTextAnalysisAlgorithm() const | ||||||
|  | { | ||||||
|  |     return m_textAnalysisAlgorithm; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void PDFDiff::setTextAnalysisAlgorithm(PDFDocumentTextFlowFactory::Algorithm textAnalysisAlgorithm) | ||||||
|  | { | ||||||
|  |     m_textAnalysisAlgorithm = textAnalysisAlgorithm; | ||||||
|  | } | ||||||
|  |  | ||||||
| PDFDiffResult::PDFDiffResult() : | PDFDiffResult::PDFDiffResult() : | ||||||
|     m_result(true) |     m_result(true) | ||||||
| { | { | ||||||
|   | |||||||
| @@ -22,6 +22,7 @@ | |||||||
| #include "pdfprogress.h" | #include "pdfprogress.h" | ||||||
| #include "pdfutils.h" | #include "pdfutils.h" | ||||||
| #include "pdfalgorithmlcs.h" | #include "pdfalgorithmlcs.h" | ||||||
|  | #include "pdfdocumenttextflow.h" | ||||||
|  |  | ||||||
| #include <QObject> | #include <QObject> | ||||||
| #include <QFuture> | #include <QFuture> | ||||||
| @@ -160,6 +161,9 @@ public: | |||||||
|     /// Returns result of a comparison process |     /// Returns result of a comparison process | ||||||
|     const PDFDiffResult& getResult() const { return m_result; } |     const PDFDiffResult& getResult() const { return m_result; } | ||||||
|  |  | ||||||
|  |     PDFDocumentTextFlowFactory::Algorithm getTextAnalysisAlgorithm() const; | ||||||
|  |     void setTextAnalysisAlgorithm(PDFDocumentTextFlowFactory::Algorithm textAnalysisAlgorithm); | ||||||
|  |  | ||||||
| signals: | signals: | ||||||
|     void comparationFinished(); |     void comparationFinished(); | ||||||
|  |  | ||||||
| @@ -207,6 +211,7 @@ private: | |||||||
|     PDFReal m_epsilon; |     PDFReal m_epsilon; | ||||||
|     std::atomic_bool m_cancelled; |     std::atomic_bool m_cancelled; | ||||||
|     PDFDiffResult m_result; |     PDFDiffResult m_result; | ||||||
|  |     PDFDocumentTextFlowFactory::Algorithm m_textAnalysisAlgorithm; | ||||||
|  |  | ||||||
|     QFuture<PDFDiffResult> m_future; |     QFuture<PDFDiffResult> m_future; | ||||||
|     std::optional<QFutureWatcher<PDFDiffResult>> m_futureWatcher; |     std::optional<QFutureWatcher<PDFDiffResult>> m_futureWatcher; | ||||||
|   | |||||||
| @@ -1055,4 +1055,21 @@ std::map<PDFInteger, PDFDocumentTextFlow> PDFDocumentTextFlow::split(Flags mask) | |||||||
|     return result; |     return result; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | void PDFDocumentTextFlow::append(const PDFDocumentTextFlow& textFlow) | ||||||
|  | { | ||||||
|  |     m_items.insert(m_items.end(), textFlow.m_items.cbegin(), textFlow.m_items.cend()); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | QString PDFDocumentTextFlow::getText() const | ||||||
|  | { | ||||||
|  |     QStringList texts; | ||||||
|  |  | ||||||
|  |     for (const auto& item : m_items) | ||||||
|  |     { | ||||||
|  |         texts << item.text.trimmed(); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     return texts.join(" "); | ||||||
|  | } | ||||||
|  |  | ||||||
| }   // namespace pdf | }   // namespace pdf | ||||||
|   | |||||||
| @@ -91,6 +91,13 @@ public: | |||||||
|     /// \param mask Mask |     /// \param mask Mask | ||||||
|     std::map<PDFInteger, PDFDocumentTextFlow> split(Flags mask) const; |     std::map<PDFInteger, PDFDocumentTextFlow> split(Flags mask) const; | ||||||
|  |  | ||||||
|  |     /// Appends document text flow to this one | ||||||
|  |     /// \param textFlow Text flow | ||||||
|  |     void append(const PDFDocumentTextFlow& textFlow); | ||||||
|  |  | ||||||
|  |     /// Returns text concatenated from all items | ||||||
|  |     QString getText() const; | ||||||
|  |  | ||||||
| private: | private: | ||||||
|     Items m_items; |     Items m_items; | ||||||
| }; | }; | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user