mirror of https://github.com/JakubMelka/PDF4QT.git
DocDiff application: text comparation
This commit is contained in:
parent
e00863c6bc
commit
9bc29da83c
|
@ -46,6 +46,14 @@ public:
|
||||||
bool isEmpty() const { return left.empty() && right.empty(); }
|
bool isEmpty() const { return left.empty() && right.empty(); }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// Text collected from one modified page range of both compared documents.
/// An entry is kept only when the concatenated left and right texts differ;
/// the text flows themselves are moved in after that check.
struct TextFlowDifferences
|
||||||
|
{
|
||||||
|
/// Text flow appended from the left document's pages in the range
PDFDocumentTextFlow leftTextFlow;
|
||||||
|
/// Text flow appended from the right document's pages in the range
PDFDocumentTextFlow rightTextFlow;
|
||||||
|
/// Result of getText() on the left text flow
QString leftText;
|
||||||
|
/// Result of getText() on the right text flow
QString rightText;
|
||||||
|
};
|
||||||
|
|
||||||
static Differences calculateDifferences(const GraphicPieceInfos& left, const GraphicPieceInfos& right, PDFReal epsilon);
|
static Differences calculateDifferences(const GraphicPieceInfos& left, const GraphicPieceInfos& right, PDFReal epsilon);
|
||||||
static std::vector<size_t> getLeftUnmatched(const PageSequence& sequence);
|
static std::vector<size_t> getLeftUnmatched(const PageSequence& sequence);
|
||||||
static std::vector<size_t> getRightUnmatched(const PageSequence& sequence);
|
static std::vector<size_t> getRightUnmatched(const PageSequence& sequence);
|
||||||
|
@ -59,7 +67,8 @@ PDFDiff::PDFDiff(QObject* parent) :
|
||||||
m_rightDocument(nullptr),
|
m_rightDocument(nullptr),
|
||||||
m_options(Asynchronous | PC_Text | PC_VectorGraphics | PC_Images),
|
m_options(Asynchronous | PC_Text | PC_VectorGraphics | PC_Images),
|
||||||
m_epsilon(0.001),
|
m_epsilon(0.001),
|
||||||
m_cancelled(false)
|
m_cancelled(false),
|
||||||
|
m_textAnalysisAlgorithm(PDFDocumentTextFlowFactory::Algorithm::Layout)
|
||||||
{
|
{
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -392,7 +401,7 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std::
|
||||||
{
|
{
|
||||||
pdf::PDFDocumentTextFlowFactory factoryLeftDocumentTextFlow;
|
pdf::PDFDocumentTextFlowFactory factoryLeftDocumentTextFlow;
|
||||||
factoryLeftDocumentTextFlow.setCalculateBoundingBoxes(true);
|
factoryLeftDocumentTextFlow.setCalculateBoundingBoxes(true);
|
||||||
PDFDocumentTextFlow leftTextFlow = factoryLeftDocumentTextFlow.create(m_leftDocument, leftPages, PDFDocumentTextFlowFactory::Algorithm::Auto);
|
PDFDocumentTextFlow leftTextFlow = factoryLeftDocumentTextFlow.create(m_leftDocument, leftPages, m_textAnalysisAlgorithm);
|
||||||
std::map<PDFInteger, PDFDocumentTextFlow> splittedText = leftTextFlow.split(PDFDocumentTextFlow::Text);
|
std::map<PDFInteger, PDFDocumentTextFlow> splittedText = leftTextFlow.split(PDFDocumentTextFlow::Text);
|
||||||
for (PDFDiffPageContext& leftContext : leftPreparedPages)
|
for (PDFDiffPageContext& leftContext : leftPreparedPages)
|
||||||
{
|
{
|
||||||
|
@ -411,7 +420,7 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std::
|
||||||
{
|
{
|
||||||
pdf::PDFDocumentTextFlowFactory factoryRightDocumentTextFlow;
|
pdf::PDFDocumentTextFlowFactory factoryRightDocumentTextFlow;
|
||||||
factoryRightDocumentTextFlow.setCalculateBoundingBoxes(true);
|
factoryRightDocumentTextFlow.setCalculateBoundingBoxes(true);
|
||||||
PDFDocumentTextFlow rightTextFlow = factoryRightDocumentTextFlow.create(m_rightDocument, rightPages, PDFDocumentTextFlowFactory::Algorithm::Auto);
|
PDFDocumentTextFlow rightTextFlow = factoryRightDocumentTextFlow.create(m_rightDocument, rightPages, m_textAnalysisAlgorithm);
|
||||||
std::map<PDFInteger, PDFDocumentTextFlow> splittedText = rightTextFlow.split(PDFDocumentTextFlow::Text);
|
std::map<PDFInteger, PDFDocumentTextFlow> splittedText = rightTextFlow.split(PDFDocumentTextFlow::Text);
|
||||||
for (PDFDiffPageContext& rightContext : rightPreparedPages)
|
for (PDFDiffPageContext& rightContext : rightPreparedPages)
|
||||||
{
|
{
|
||||||
|
@ -458,6 +467,8 @@ void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPrepared
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::vector<PDFDiffHelper::TextFlowDifferences> textFlowDifferences;
|
||||||
|
|
||||||
for (const auto& range : modifiedRanges)
|
for (const auto& range : modifiedRanges)
|
||||||
{
|
{
|
||||||
AlgorithmLCS::SequenceItemFlags flags = AlgorithmLCS::collectFlags(range);
|
AlgorithmLCS::SequenceItemFlags flags = AlgorithmLCS::collectFlags(range);
|
||||||
|
@ -472,15 +483,25 @@ void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPrepared
|
||||||
// page range was added, or page range was removed.
|
// page range was added, or page range was removed.
|
||||||
if (isReplaced)
|
if (isReplaced)
|
||||||
{
|
{
|
||||||
|
PDFDocumentTextFlow leftTextFlow;
|
||||||
|
PDFDocumentTextFlow rightTextFlow;
|
||||||
|
|
||||||
|
const bool isTextComparedAsVectorGraphics = m_options.testFlag(CompareTextsAsVector);
|
||||||
|
|
||||||
for (auto it = range.first; it != range.second; ++it)
|
for (auto it = range.first; it != range.second; ++it)
|
||||||
{
|
{
|
||||||
const AlgorithmLCS::SequenceItem& item = *it;
|
const AlgorithmLCS::SequenceItem& item = *it;
|
||||||
if (item.isReplaced())
|
if (item.isReplaced())
|
||||||
{
|
{
|
||||||
const bool isTextComparedAsVectorGraphics = m_options.testFlag(CompareTextsAsVector);
|
|
||||||
const PDFDiffPageContext& leftPageContext = leftPreparedPages[item.index1];
|
const PDFDiffPageContext& leftPageContext = leftPreparedPages[item.index1];
|
||||||
const PDFDiffPageContext& rightPageContext = rightPreparedPages[item.index2];
|
const PDFDiffPageContext& rightPageContext = rightPreparedPages[item.index2];
|
||||||
|
|
||||||
|
if (!isTextComparedAsVectorGraphics)
|
||||||
|
{
|
||||||
|
leftTextFlow.append(leftPageContext.text);
|
||||||
|
rightTextFlow.append(rightPageContext.text);
|
||||||
|
}
|
||||||
|
|
||||||
auto pageLeft = m_leftDocument->getCatalog()->getPage(leftPageContext.pageIndex);
|
auto pageLeft = m_leftDocument->getCatalog()->getPage(leftPageContext.pageIndex);
|
||||||
auto pageRight = m_rightDocument->getCatalog()->getPage(rightPageContext.pageIndex);
|
auto pageRight = m_rightDocument->getCatalog()->getPage(rightPageContext.pageIndex);
|
||||||
PDFReal epsilon = (calculateEpsilonForPage(pageLeft) + calculateEpsilonForPage(pageRight)) * 0.5;
|
PDFReal epsilon = (calculateEpsilonForPage(pageLeft) + calculateEpsilonForPage(pageRight)) * 0.5;
|
||||||
|
@ -549,14 +570,42 @@ void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPrepared
|
||||||
if (item.isAdded())
|
if (item.isAdded())
|
||||||
{
|
{
|
||||||
const PDFDiffPageContext& rightPageContext = rightPreparedPages[item.index2];
|
const PDFDiffPageContext& rightPageContext = rightPreparedPages[item.index2];
|
||||||
|
|
||||||
|
if (!isTextComparedAsVectorGraphics)
|
||||||
|
{
|
||||||
|
rightTextFlow.append(rightPageContext.text);
|
||||||
|
}
|
||||||
|
|
||||||
m_result.addPageAdded(rightPageContext.pageIndex);
|
m_result.addPageAdded(rightPageContext.pageIndex);
|
||||||
}
|
}
|
||||||
if (item.isRemoved())
|
if (item.isRemoved())
|
||||||
{
|
{
|
||||||
const PDFDiffPageContext& leftPageContext = leftPreparedPages[item.index1];
|
const PDFDiffPageContext& leftPageContext = leftPreparedPages[item.index1];
|
||||||
|
|
||||||
|
if (!isTextComparedAsVectorGraphics)
|
||||||
|
{
|
||||||
|
leftTextFlow.append(leftPageContext.text);
|
||||||
|
}
|
||||||
|
|
||||||
m_result.addPageRemoved(leftPageContext.pageIndex);
|
m_result.addPageRemoved(leftPageContext.pageIndex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
textFlowDifferences.emplace_back();
|
||||||
|
PDFDiffHelper::TextFlowDifferences& addedDifferences = textFlowDifferences.back();
|
||||||
|
addedDifferences.leftText = leftTextFlow.getText();
|
||||||
|
addedDifferences.rightText = rightTextFlow.getText();
|
||||||
|
|
||||||
|
if (addedDifferences.leftText == addedDifferences.rightText)
|
||||||
|
{
|
||||||
|
// Text is the same, no difference is found
|
||||||
|
textFlowDifferences.pop_back();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
addedDifferences.leftTextFlow = std::move(leftTextFlow);
|
||||||
|
addedDifferences.rightTextFlow = std::move(rightTextFlow);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -576,6 +625,77 @@ void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPrepared
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Jakub Melka: try to compare text differences
|
||||||
|
auto compareTexts = [this](PDFDiffHelper::TextFlowDifferences& context)
|
||||||
|
{
|
||||||
|
struct CompareItem
|
||||||
|
{
|
||||||
|
size_t index = 0;
|
||||||
|
int charIndex = 0;
|
||||||
|
bool left = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
std::vector<CompareItem> leftItems;
|
||||||
|
std::vector<CompareItem> rightItems;
|
||||||
|
|
||||||
|
const size_t leftCount = context.leftTextFlow.getSize();
|
||||||
|
for (size_t i = 0; i < leftCount; ++i)
|
||||||
|
{
|
||||||
|
CompareItem item;
|
||||||
|
item.index = i;
|
||||||
|
item.left = true;
|
||||||
|
|
||||||
|
const PDFDocumentTextFlow::Item* textFlowItem = context.leftTextFlow.getItem(i);
|
||||||
|
for (int j = 0; j < textFlowItem->text.size(); ++j)
|
||||||
|
{
|
||||||
|
item.charIndex = j;
|
||||||
|
leftItems.push_back(item);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const size_t rightCount = context.rightTextFlow.getSize();
|
||||||
|
for (size_t i = 0; i < rightCount; ++i)
|
||||||
|
{
|
||||||
|
CompareItem item;
|
||||||
|
item.index = i;
|
||||||
|
item.left = false;
|
||||||
|
|
||||||
|
const PDFDocumentTextFlow::Item* textFlowItem = context.rightTextFlow.getItem(i);
|
||||||
|
for (int j = 0; j < textFlowItem->text.size(); ++j)
|
||||||
|
{
|
||||||
|
item.charIndex = j;
|
||||||
|
rightItems.push_back(item);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
auto compareCharacters = [&](const CompareItem& a, const CompareItem& b)
|
||||||
|
{
|
||||||
|
|
||||||
|
|
||||||
|
const auto& aItem = a.left ? context.leftTextFlow : context.rightTextFlow;
|
||||||
|
const auto& bItem = b.left ? context.leftTextFlow : context.rightTextFlow;
|
||||||
|
|
||||||
|
QChar aChar = aItem.getItem(a.index)->text[a.charIndex];
|
||||||
|
QChar bChar = bItem.getItem(b.index)->text[b.charIndex];
|
||||||
|
|
||||||
|
return aChar == bChar;
|
||||||
|
};
|
||||||
|
PDFAlgorithmLongestCommonSubsequence algorithm(leftItems.cbegin(), leftItems.cend(),
|
||||||
|
rightItems.cbegin(), rightItems.cend(),
|
||||||
|
compareCharacters);
|
||||||
|
algorithm.perform();
|
||||||
|
PDFAlgorithmLongestCommonSubsequenceBase::Sequence sequence = algorithm.getSequence();
|
||||||
|
PDFAlgorithmLongestCommonSubsequenceBase::markSequence(sequence, { }, { });
|
||||||
|
PDFAlgorithmLongestCommonSubsequenceBase::SequenceItemRanges modifiedRanges = PDFAlgorithmLongestCommonSubsequenceBase::getModifiedRanges(sequence);
|
||||||
|
|
||||||
|
for (const auto& range : modifiedRanges)
|
||||||
|
{
|
||||||
|
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, textFlowDifferences.begin(), textFlowDifferences.end(), compareTexts);
|
||||||
}
|
}
|
||||||
|
|
||||||
void PDFDiff::finalizeGraphicsPieces(PDFDiffPageContext& context)
|
void PDFDiff::finalizeGraphicsPieces(PDFDiffPageContext& context)
|
||||||
|
@ -635,6 +755,16 @@ PDFReal PDFDiff::calculateEpsilonForPage(const PDFPage* page) const
|
||||||
return factor * m_epsilon;
|
return factor * m_epsilon;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the algorithm that is passed to the text flow factory when the
/// text flows of the left and right documents are created.
PDFDocumentTextFlowFactory::Algorithm PDFDiff::getTextAnalysisAlgorithm() const
|
||||||
|
{
|
||||||
|
return m_textAnalysisAlgorithm;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Sets the algorithm that is passed to the text flow factory when the
/// text flows of the left and right documents are created.
/// \param textAnalysisAlgorithm Algorithm to be stored
// NOTE(review): the value is stored without any other action here; whether it
// may be changed while a comparison is running is not visible - confirm.
void PDFDiff::setTextAnalysisAlgorithm(PDFDocumentTextFlowFactory::Algorithm textAnalysisAlgorithm)
|
||||||
|
{
|
||||||
|
m_textAnalysisAlgorithm = textAnalysisAlgorithm;
|
||||||
|
}
|
||||||
|
|
||||||
PDFDiffResult::PDFDiffResult() :
|
PDFDiffResult::PDFDiffResult() :
|
||||||
m_result(true)
|
m_result(true)
|
||||||
{
|
{
|
||||||
|
|
|
@ -22,6 +22,7 @@
|
||||||
#include "pdfprogress.h"
|
#include "pdfprogress.h"
|
||||||
#include "pdfutils.h"
|
#include "pdfutils.h"
|
||||||
#include "pdfalgorithmlcs.h"
|
#include "pdfalgorithmlcs.h"
|
||||||
|
#include "pdfdocumenttextflow.h"
|
||||||
|
|
||||||
#include <QObject>
|
#include <QObject>
|
||||||
#include <QFuture>
|
#include <QFuture>
|
||||||
|
@ -160,6 +161,9 @@ public:
|
||||||
/// Returns result of a comparation process
|
/// Returns result of a comparation process
|
||||||
const PDFDiffResult& getResult() const { return m_result; }
|
const PDFDiffResult& getResult() const { return m_result; }
|
||||||
|
|
||||||
|
PDFDocumentTextFlowFactory::Algorithm getTextAnalysisAlgorithm() const;
|
||||||
|
void setTextAnalysisAlgorithm(PDFDocumentTextFlowFactory::Algorithm textAnalysisAlgorithm);
|
||||||
|
|
||||||
signals:
|
signals:
|
||||||
void comparationFinished();
|
void comparationFinished();
|
||||||
|
|
||||||
|
@ -207,6 +211,7 @@ private:
|
||||||
PDFReal m_epsilon;
|
PDFReal m_epsilon;
|
||||||
std::atomic_bool m_cancelled;
|
std::atomic_bool m_cancelled;
|
||||||
PDFDiffResult m_result;
|
PDFDiffResult m_result;
|
||||||
|
PDFDocumentTextFlowFactory::Algorithm m_textAnalysisAlgorithm;
|
||||||
|
|
||||||
QFuture<PDFDiffResult> m_future;
|
QFuture<PDFDiffResult> m_future;
|
||||||
std::optional<QFutureWatcher<PDFDiffResult>> m_futureWatcher;
|
std::optional<QFutureWatcher<PDFDiffResult>> m_futureWatcher;
|
||||||
|
|
|
@ -1055,4 +1055,21 @@ std::map<PDFInteger, PDFDocumentTextFlow> PDFDocumentTextFlow::split(Flags mask)
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Copies all items of \p textFlow to the end of this text flow, keeping
/// their order. The source text flow is not modified.
/// \param textFlow Text flow whose items are appended
// NOTE(review): if textFlow is this very object, the iterators passed to
// insert() refer to the container being modified; the type of m_items is
// declared elsewhere, so confirm whether self-append needs to be supported.
void PDFDocumentTextFlow::append(const PDFDocumentTextFlow& textFlow)
|
||||||
|
{
|
||||||
|
m_items.insert(m_items.end(), textFlow.m_items.cbegin(), textFlow.m_items.cend());
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Builds a single string from all items of the text flow: the text of each
/// item is trimmed and the pieces are joined with one space, in item order.
/// \returns Joined text of all items
QString PDFDocumentTextFlow::getText() const
|
||||||
|
{
|
||||||
|
QStringList texts;
|
||||||
|
|
||||||
|
for (const auto& item : m_items)
|
||||||
|
{
|
||||||
|
// Every item contributes an entry, including items whose trimmed text is empty
texts << item.text.trimmed();
|
||||||
|
}
|
||||||
|
|
||||||
|
return texts.join(" ");
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace pdf
|
} // namespace pdf
|
||||||
|
|
|
@ -91,6 +91,13 @@ public:
|
||||||
/// \param mask Mask
|
/// \param mask Mask
|
||||||
std::map<PDFInteger, PDFDocumentTextFlow> split(Flags mask) const;
|
std::map<PDFInteger, PDFDocumentTextFlow> split(Flags mask) const;
|
||||||
|
|
||||||
|
/// Appends document text flow to this one
|
||||||
|
/// \param textFlow Text flow
|
||||||
|
void append(const PDFDocumentTextFlow& textFlow);
|
||||||
|
|
||||||
|
/// Returns text concatenated from all items
|
||||||
|
QString getText() const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
Items m_items;
|
Items m_items;
|
||||||
};
|
};
|
||||||
|
|
Loading…
Reference in New Issue