mirror of
				https://github.com/JakubMelka/PDF4QT.git
				synced 2025-06-05 21:59:17 +02:00 
			
		
		
		
	DocDiff application: text comparison
This commit is contained in:
		| @@ -46,6 +46,14 @@ public: | ||||
|         bool isEmpty() const { return left.empty() && right.empty(); } | ||||
|     }; | ||||
|  | ||||
|     struct TextFlowDifferences | ||||
|     { | ||||
|         PDFDocumentTextFlow leftTextFlow; | ||||
|         PDFDocumentTextFlow rightTextFlow; | ||||
|         QString leftText; | ||||
|         QString rightText; | ||||
|     }; | ||||
|  | ||||
|     static Differences calculateDifferences(const GraphicPieceInfos& left, const GraphicPieceInfos& right, PDFReal epsilon); | ||||
|     static std::vector<size_t> getLeftUnmatched(const PageSequence& sequence); | ||||
|     static std::vector<size_t> getRightUnmatched(const PageSequence& sequence); | ||||
| @@ -59,7 +67,8 @@ PDFDiff::PDFDiff(QObject* parent) : | ||||
|     m_rightDocument(nullptr), | ||||
|     m_options(Asynchronous | PC_Text | PC_VectorGraphics | PC_Images), | ||||
|     m_epsilon(0.001), | ||||
|     m_cancelled(false) | ||||
|     m_cancelled(false), | ||||
|     m_textAnalysisAlgorithm(PDFDocumentTextFlowFactory::Algorithm::Layout) | ||||
| { | ||||
|  | ||||
| } | ||||
| @@ -392,7 +401,7 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std:: | ||||
|     { | ||||
|         pdf::PDFDocumentTextFlowFactory factoryLeftDocumentTextFlow; | ||||
|         factoryLeftDocumentTextFlow.setCalculateBoundingBoxes(true); | ||||
|         PDFDocumentTextFlow leftTextFlow = factoryLeftDocumentTextFlow.create(m_leftDocument, leftPages, PDFDocumentTextFlowFactory::Algorithm::Auto); | ||||
|         PDFDocumentTextFlow leftTextFlow = factoryLeftDocumentTextFlow.create(m_leftDocument, leftPages, m_textAnalysisAlgorithm); | ||||
|         std::map<PDFInteger, PDFDocumentTextFlow> splittedText = leftTextFlow.split(PDFDocumentTextFlow::Text); | ||||
|         for (PDFDiffPageContext& leftContext : leftPreparedPages) | ||||
|         { | ||||
| @@ -411,7 +420,7 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std:: | ||||
|     { | ||||
|         pdf::PDFDocumentTextFlowFactory factoryRightDocumentTextFlow; | ||||
|         factoryRightDocumentTextFlow.setCalculateBoundingBoxes(true); | ||||
|         PDFDocumentTextFlow rightTextFlow = factoryRightDocumentTextFlow.create(m_rightDocument, rightPages, PDFDocumentTextFlowFactory::Algorithm::Auto); | ||||
|         PDFDocumentTextFlow rightTextFlow = factoryRightDocumentTextFlow.create(m_rightDocument, rightPages, m_textAnalysisAlgorithm); | ||||
|         std::map<PDFInteger, PDFDocumentTextFlow> splittedText = rightTextFlow.split(PDFDocumentTextFlow::Text); | ||||
|         for (PDFDiffPageContext& rightContext : rightPreparedPages) | ||||
|         { | ||||
| @@ -458,6 +467,8 @@ void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPrepared | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     std::vector<PDFDiffHelper::TextFlowDifferences> textFlowDifferences; | ||||
|  | ||||
|     for (const auto& range : modifiedRanges) | ||||
|     { | ||||
|         AlgorithmLCS::SequenceItemFlags flags = AlgorithmLCS::collectFlags(range); | ||||
| @@ -472,15 +483,25 @@ void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPrepared | ||||
|         // page range was added, or page range was removed. | ||||
|         if (isReplaced) | ||||
|         { | ||||
|             PDFDocumentTextFlow leftTextFlow; | ||||
|             PDFDocumentTextFlow rightTextFlow; | ||||
|  | ||||
|             const bool isTextComparedAsVectorGraphics = m_options.testFlag(CompareTextsAsVector); | ||||
|  | ||||
|             for (auto it = range.first; it != range.second; ++it) | ||||
|             { | ||||
|                 const AlgorithmLCS::SequenceItem& item = *it; | ||||
|                 if (item.isReplaced()) | ||||
|                 { | ||||
|                     const bool isTextComparedAsVectorGraphics = m_options.testFlag(CompareTextsAsVector); | ||||
|                     const PDFDiffPageContext& leftPageContext = leftPreparedPages[item.index1]; | ||||
|                     const PDFDiffPageContext& rightPageContext = rightPreparedPages[item.index2]; | ||||
|  | ||||
|                     if (!isTextComparedAsVectorGraphics) | ||||
|                     { | ||||
|                         leftTextFlow.append(leftPageContext.text); | ||||
|                         rightTextFlow.append(rightPageContext.text); | ||||
|                     } | ||||
|  | ||||
|                     auto pageLeft = m_leftDocument->getCatalog()->getPage(leftPageContext.pageIndex); | ||||
|                     auto pageRight = m_rightDocument->getCatalog()->getPage(rightPageContext.pageIndex); | ||||
|                     PDFReal epsilon = (calculateEpsilonForPage(pageLeft) + calculateEpsilonForPage(pageRight)) * 0.5; | ||||
| @@ -549,14 +570,42 @@ void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPrepared | ||||
|                 if (item.isAdded()) | ||||
|                 { | ||||
|                     const PDFDiffPageContext& rightPageContext = rightPreparedPages[item.index2]; | ||||
|  | ||||
|                     if (!isTextComparedAsVectorGraphics) | ||||
|                     { | ||||
|                         rightTextFlow.append(rightPageContext.text); | ||||
|                     } | ||||
|  | ||||
|                     m_result.addPageAdded(rightPageContext.pageIndex); | ||||
|                 } | ||||
|                 if (item.isRemoved()) | ||||
|                 { | ||||
|                     const PDFDiffPageContext& leftPageContext = leftPreparedPages[item.index1]; | ||||
|  | ||||
|                     if (!isTextComparedAsVectorGraphics) | ||||
|                     { | ||||
|                         leftTextFlow.append(leftPageContext.text); | ||||
|                     } | ||||
|  | ||||
|                     m_result.addPageRemoved(leftPageContext.pageIndex); | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             textFlowDifferences.emplace_back(); | ||||
|             PDFDiffHelper::TextFlowDifferences& addedDifferences = textFlowDifferences.back(); | ||||
|             addedDifferences.leftText = leftTextFlow.getText(); | ||||
|             addedDifferences.rightText = rightTextFlow.getText(); | ||||
|  | ||||
|             if (addedDifferences.leftText == addedDifferences.rightText) | ||||
|             { | ||||
|                 // Text is the same, no difference is found | ||||
|                 textFlowDifferences.pop_back(); | ||||
|             } | ||||
|             else | ||||
|             { | ||||
|                 addedDifferences.leftTextFlow = std::move(leftTextFlow); | ||||
|                 addedDifferences.rightTextFlow = std::move(rightTextFlow); | ||||
|             } | ||||
|         } | ||||
|         else | ||||
|         { | ||||
| @@ -576,6 +625,77 @@ void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPrepared | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // Jakub Melka: try to compare text differences | ||||
|     auto compareTexts = [this](PDFDiffHelper::TextFlowDifferences& context) | ||||
|     { | ||||
|         struct CompareItem | ||||
|         { | ||||
|             size_t index = 0; | ||||
|             int charIndex = 0; | ||||
|             bool left = false; | ||||
|         }; | ||||
|  | ||||
|         std::vector<CompareItem> leftItems; | ||||
|         std::vector<CompareItem> rightItems; | ||||
|  | ||||
|         const size_t leftCount = context.leftTextFlow.getSize(); | ||||
|         for (size_t i = 0; i < leftCount; ++i) | ||||
|         { | ||||
|             CompareItem item; | ||||
|             item.index = i; | ||||
|             item.left = true; | ||||
|  | ||||
|             const PDFDocumentTextFlow::Item* textFlowItem = context.leftTextFlow.getItem(i); | ||||
|             for (int j = 0; j < textFlowItem->text.size(); ++j) | ||||
|             { | ||||
|                 item.charIndex = j; | ||||
|                 leftItems.push_back(item); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         const size_t rightCount = context.rightTextFlow.getSize(); | ||||
|         for (size_t i = 0; i < rightCount; ++i) | ||||
|         { | ||||
|             CompareItem item; | ||||
|             item.index = i; | ||||
|             item.left = false; | ||||
|  | ||||
|             const PDFDocumentTextFlow::Item* textFlowItem = context.rightTextFlow.getItem(i); | ||||
|             for (int j = 0; j < textFlowItem->text.size(); ++j) | ||||
|             { | ||||
|                 item.charIndex = j; | ||||
|                 rightItems.push_back(item); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         auto compareCharacters = [&](const CompareItem& a, const CompareItem& b) | ||||
|         { | ||||
|  | ||||
|  | ||||
|             const auto& aItem = a.left ? context.leftTextFlow : context.rightTextFlow; | ||||
|             const auto& bItem = b.left ? context.leftTextFlow : context.rightTextFlow; | ||||
|  | ||||
|             QChar aChar = aItem.getItem(a.index)->text[a.charIndex]; | ||||
|             QChar bChar = bItem.getItem(b.index)->text[b.charIndex]; | ||||
|  | ||||
|             return aChar == bChar; | ||||
|         }; | ||||
|         PDFAlgorithmLongestCommonSubsequence algorithm(leftItems.cbegin(), leftItems.cend(), | ||||
|                                                        rightItems.cbegin(), rightItems.cend(), | ||||
|                                                        compareCharacters); | ||||
|         algorithm.perform(); | ||||
|         PDFAlgorithmLongestCommonSubsequenceBase::Sequence sequence = algorithm.getSequence(); | ||||
|         PDFAlgorithmLongestCommonSubsequenceBase::markSequence(sequence, { }, { }); | ||||
|         PDFAlgorithmLongestCommonSubsequenceBase::SequenceItemRanges modifiedRanges = PDFAlgorithmLongestCommonSubsequenceBase::getModifiedRanges(sequence); | ||||
|  | ||||
|         for (const auto& range : modifiedRanges) | ||||
|         { | ||||
|  | ||||
|         } | ||||
|     }; | ||||
|  | ||||
|     PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, textFlowDifferences.begin(), textFlowDifferences.end(), compareTexts); | ||||
| } | ||||
|  | ||||
| void PDFDiff::finalizeGraphicsPieces(PDFDiffPageContext& context) | ||||
| @@ -635,6 +755,16 @@ PDFReal PDFDiff::calculateEpsilonForPage(const PDFPage* page) const | ||||
|     return factor * m_epsilon; | ||||
| } | ||||
|  | ||||
| PDFDocumentTextFlowFactory::Algorithm PDFDiff::getTextAnalysisAlgorithm() const | ||||
| { | ||||
|     return m_textAnalysisAlgorithm; | ||||
| } | ||||
|  | ||||
| void PDFDiff::setTextAnalysisAlgorithm(PDFDocumentTextFlowFactory::Algorithm textAnalysisAlgorithm) | ||||
| { | ||||
|     m_textAnalysisAlgorithm = textAnalysisAlgorithm; | ||||
| } | ||||
|  | ||||
| PDFDiffResult::PDFDiffResult() : | ||||
|     m_result(true) | ||||
| { | ||||
|   | ||||
| @@ -22,6 +22,7 @@ | ||||
| #include "pdfprogress.h" | ||||
| #include "pdfutils.h" | ||||
| #include "pdfalgorithmlcs.h" | ||||
| #include "pdfdocumenttextflow.h" | ||||
|  | ||||
| #include <QObject> | ||||
| #include <QFuture> | ||||
| @@ -160,6 +161,9 @@ public: | ||||
|     /// Returns result of a comparison process | ||||
|     const PDFDiffResult& getResult() const { return m_result; } | ||||
|  | ||||
|     PDFDocumentTextFlowFactory::Algorithm getTextAnalysisAlgorithm() const; | ||||
|     void setTextAnalysisAlgorithm(PDFDocumentTextFlowFactory::Algorithm textAnalysisAlgorithm); | ||||
|  | ||||
| signals: | ||||
|     void comparationFinished(); | ||||
|  | ||||
| @@ -207,6 +211,7 @@ private: | ||||
|     PDFReal m_epsilon; | ||||
|     std::atomic_bool m_cancelled; | ||||
|     PDFDiffResult m_result; | ||||
|     PDFDocumentTextFlowFactory::Algorithm m_textAnalysisAlgorithm; | ||||
|  | ||||
|     QFuture<PDFDiffResult> m_future; | ||||
|     std::optional<QFutureWatcher<PDFDiffResult>> m_futureWatcher; | ||||
|   | ||||
| @@ -1055,4 +1055,21 @@ std::map<PDFInteger, PDFDocumentTextFlow> PDFDocumentTextFlow::split(Flags mask) | ||||
|     return result; | ||||
| } | ||||
|  | ||||
| void PDFDocumentTextFlow::append(const PDFDocumentTextFlow& textFlow) | ||||
| { | ||||
|     m_items.insert(m_items.end(), textFlow.m_items.cbegin(), textFlow.m_items.cend()); | ||||
| } | ||||
|  | ||||
| QString PDFDocumentTextFlow::getText() const | ||||
| { | ||||
|     QStringList texts; | ||||
|  | ||||
|     for (const auto& item : m_items) | ||||
|     { | ||||
|         texts << item.text.trimmed(); | ||||
|     } | ||||
|  | ||||
|     return texts.join(" "); | ||||
| } | ||||
|  | ||||
| }   // namespace pdf | ||||
|   | ||||
| @@ -91,6 +91,13 @@ public: | ||||
|     /// \param mask Mask | ||||
|     std::map<PDFInteger, PDFDocumentTextFlow> split(Flags mask) const; | ||||
|  | ||||
|     /// Appends document text flow to this one | ||||
|     /// \param textFlow Text flow | ||||
|     void append(const PDFDocumentTextFlow& textFlow); | ||||
|  | ||||
|     /// Returns text concatenated from all items | ||||
|     QString getText() const; | ||||
|  | ||||
| private: | ||||
|     Items m_items; | ||||
| }; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user