DocDiff application: text comparation

This commit is contained in:
Jakub Melka 2021-09-25 17:05:08 +02:00
parent e00863c6bc
commit 9bc29da83c
4 changed files with 163 additions and 4 deletions

View File

@ -46,6 +46,14 @@ public:
bool isEmpty() const { return left.empty() && right.empty(); }
};
struct TextFlowDifferences
{
PDFDocumentTextFlow leftTextFlow;
PDFDocumentTextFlow rightTextFlow;
QString leftText;
QString rightText;
};
static Differences calculateDifferences(const GraphicPieceInfos& left, const GraphicPieceInfos& right, PDFReal epsilon);
static std::vector<size_t> getLeftUnmatched(const PageSequence& sequence);
static std::vector<size_t> getRightUnmatched(const PageSequence& sequence);
@ -59,7 +67,8 @@ PDFDiff::PDFDiff(QObject* parent) :
m_rightDocument(nullptr),
m_options(Asynchronous | PC_Text | PC_VectorGraphics | PC_Images),
m_epsilon(0.001),
m_cancelled(false)
m_cancelled(false),
m_textAnalysisAlgorithm(PDFDocumentTextFlowFactory::Algorithm::Layout)
{
}
@ -392,7 +401,7 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std::
{
pdf::PDFDocumentTextFlowFactory factoryLeftDocumentTextFlow;
factoryLeftDocumentTextFlow.setCalculateBoundingBoxes(true);
PDFDocumentTextFlow leftTextFlow = factoryLeftDocumentTextFlow.create(m_leftDocument, leftPages, PDFDocumentTextFlowFactory::Algorithm::Auto);
PDFDocumentTextFlow leftTextFlow = factoryLeftDocumentTextFlow.create(m_leftDocument, leftPages, m_textAnalysisAlgorithm);
std::map<PDFInteger, PDFDocumentTextFlow> splittedText = leftTextFlow.split(PDFDocumentTextFlow::Text);
for (PDFDiffPageContext& leftContext : leftPreparedPages)
{
@ -411,7 +420,7 @@ void PDFDiff::performSteps(const std::vector<PDFInteger>& leftPages, const std::
{
pdf::PDFDocumentTextFlowFactory factoryRightDocumentTextFlow;
factoryRightDocumentTextFlow.setCalculateBoundingBoxes(true);
PDFDocumentTextFlow rightTextFlow = factoryRightDocumentTextFlow.create(m_rightDocument, rightPages, PDFDocumentTextFlowFactory::Algorithm::Auto);
PDFDocumentTextFlow rightTextFlow = factoryRightDocumentTextFlow.create(m_rightDocument, rightPages, m_textAnalysisAlgorithm);
std::map<PDFInteger, PDFDocumentTextFlow> splittedText = rightTextFlow.split(PDFDocumentTextFlow::Text);
for (PDFDiffPageContext& rightContext : rightPreparedPages)
{
@ -458,6 +467,8 @@ void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPrepared
}
}
std::vector<PDFDiffHelper::TextFlowDifferences> textFlowDifferences;
for (const auto& range : modifiedRanges)
{
AlgorithmLCS::SequenceItemFlags flags = AlgorithmLCS::collectFlags(range);
@ -472,15 +483,25 @@ void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPrepared
// page range was added, or page range was removed.
if (isReplaced)
{
PDFDocumentTextFlow leftTextFlow;
PDFDocumentTextFlow rightTextFlow;
const bool isTextComparedAsVectorGraphics = m_options.testFlag(CompareTextsAsVector);
for (auto it = range.first; it != range.second; ++it)
{
const AlgorithmLCS::SequenceItem& item = *it;
if (item.isReplaced())
{
const bool isTextComparedAsVectorGraphics = m_options.testFlag(CompareTextsAsVector);
const PDFDiffPageContext& leftPageContext = leftPreparedPages[item.index1];
const PDFDiffPageContext& rightPageContext = rightPreparedPages[item.index2];
if (!isTextComparedAsVectorGraphics)
{
leftTextFlow.append(leftPageContext.text);
rightTextFlow.append(rightPageContext.text);
}
auto pageLeft = m_leftDocument->getCatalog()->getPage(leftPageContext.pageIndex);
auto pageRight = m_rightDocument->getCatalog()->getPage(rightPageContext.pageIndex);
PDFReal epsilon = (calculateEpsilonForPage(pageLeft) + calculateEpsilonForPage(pageRight)) * 0.5;
@ -549,14 +570,42 @@ void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPrepared
if (item.isAdded())
{
const PDFDiffPageContext& rightPageContext = rightPreparedPages[item.index2];
if (!isTextComparedAsVectorGraphics)
{
rightTextFlow.append(rightPageContext.text);
}
m_result.addPageAdded(rightPageContext.pageIndex);
}
if (item.isRemoved())
{
const PDFDiffPageContext& leftPageContext = leftPreparedPages[item.index1];
if (!isTextComparedAsVectorGraphics)
{
leftTextFlow.append(leftPageContext.text);
}
m_result.addPageRemoved(leftPageContext.pageIndex);
}
}
textFlowDifferences.emplace_back();
PDFDiffHelper::TextFlowDifferences& addedDifferences = textFlowDifferences.back();
addedDifferences.leftText = leftTextFlow.getText();
addedDifferences.rightText = rightTextFlow.getText();
if (addedDifferences.leftText == addedDifferences.rightText)
{
// Text is the same, no difference is found
textFlowDifferences.pop_back();
}
else
{
addedDifferences.leftTextFlow = std::move(leftTextFlow);
addedDifferences.rightTextFlow = std::move(rightTextFlow);
}
}
else
{
@ -576,6 +625,77 @@ void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPrepared
}
}
}
// Jakub Melka: try to compare text differences
auto compareTexts = [this](PDFDiffHelper::TextFlowDifferences& context)
{
struct CompareItem
{
size_t index = 0;
int charIndex = 0;
bool left = false;
};
std::vector<CompareItem> leftItems;
std::vector<CompareItem> rightItems;
const size_t leftCount = context.leftTextFlow.getSize();
for (size_t i = 0; i < leftCount; ++i)
{
CompareItem item;
item.index = i;
item.left = true;
const PDFDocumentTextFlow::Item* textFlowItem = context.leftTextFlow.getItem(i);
for (int j = 0; j < textFlowItem->text.size(); ++j)
{
item.charIndex = j;
leftItems.push_back(item);
}
}
const size_t rightCount = context.rightTextFlow.getSize();
for (size_t i = 0; i < rightCount; ++i)
{
CompareItem item;
item.index = i;
item.left = false;
const PDFDocumentTextFlow::Item* textFlowItem = context.rightTextFlow.getItem(i);
for (int j = 0; j < textFlowItem->text.size(); ++j)
{
item.charIndex = j;
rightItems.push_back(item);
}
}
auto compareCharacters = [&](const CompareItem& a, const CompareItem& b)
{
const auto& aItem = a.left ? context.leftTextFlow : context.rightTextFlow;
const auto& bItem = b.left ? context.leftTextFlow : context.rightTextFlow;
QChar aChar = aItem.getItem(a.index)->text[a.charIndex];
QChar bChar = bItem.getItem(b.index)->text[b.charIndex];
return aChar == bChar;
};
PDFAlgorithmLongestCommonSubsequence algorithm(leftItems.cbegin(), leftItems.cend(),
rightItems.cbegin(), rightItems.cend(),
compareCharacters);
algorithm.perform();
PDFAlgorithmLongestCommonSubsequenceBase::Sequence sequence = algorithm.getSequence();
PDFAlgorithmLongestCommonSubsequenceBase::markSequence(sequence, { }, { });
PDFAlgorithmLongestCommonSubsequenceBase::SequenceItemRanges modifiedRanges = PDFAlgorithmLongestCommonSubsequenceBase::getModifiedRanges(sequence);
for (const auto& range : modifiedRanges)
{
}
};
PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, textFlowDifferences.begin(), textFlowDifferences.end(), compareTexts);
}
void PDFDiff::finalizeGraphicsPieces(PDFDiffPageContext& context)
@ -635,6 +755,16 @@ PDFReal PDFDiff::calculateEpsilonForPage(const PDFPage* page) const
return factor * m_epsilon;
}
PDFDocumentTextFlowFactory::Algorithm PDFDiff::getTextAnalysisAlgorithm() const
{
return m_textAnalysisAlgorithm;
}
void PDFDiff::setTextAnalysisAlgorithm(PDFDocumentTextFlowFactory::Algorithm textAnalysisAlgorithm)
{
m_textAnalysisAlgorithm = textAnalysisAlgorithm;
}
PDFDiffResult::PDFDiffResult() :
m_result(true)
{

View File

@ -22,6 +22,7 @@
#include "pdfprogress.h"
#include "pdfutils.h"
#include "pdfalgorithmlcs.h"
#include "pdfdocumenttextflow.h"
#include <QObject>
#include <QFuture>
@ -160,6 +161,9 @@ public:
/// Returns result of a comparation process
const PDFDiffResult& getResult() const { return m_result; }
PDFDocumentTextFlowFactory::Algorithm getTextAnalysisAlgorithm() const;
void setTextAnalysisAlgorithm(PDFDocumentTextFlowFactory::Algorithm textAnalysisAlgorithm);
signals:
void comparationFinished();
@ -207,6 +211,7 @@ private:
PDFReal m_epsilon;
std::atomic_bool m_cancelled;
PDFDiffResult m_result;
PDFDocumentTextFlowFactory::Algorithm m_textAnalysisAlgorithm;
QFuture<PDFDiffResult> m_future;
std::optional<QFutureWatcher<PDFDiffResult>> m_futureWatcher;

View File

@ -1055,4 +1055,21 @@ std::map<PDFInteger, PDFDocumentTextFlow> PDFDocumentTextFlow::split(Flags mask)
return result;
}
void PDFDocumentTextFlow::append(const PDFDocumentTextFlow& textFlow)
{
m_items.insert(m_items.end(), textFlow.m_items.cbegin(), textFlow.m_items.cend());
}
QString PDFDocumentTextFlow::getText() const
{
QStringList texts;
for (const auto& item : m_items)
{
texts << item.text.trimmed();
}
return texts.join(" ");
}
} // namespace pdf

View File

@ -91,6 +91,13 @@ public:
/// \param mask Mask
std::map<PDFInteger, PDFDocumentTextFlow> split(Flags mask) const;
/// Appends document text flow to this one
/// \param textFlow Text flow
void append(const PDFDocumentTextFlow& textFlow);
/// Returns text concantecated from all items
QString getText() const;
private:
Items m_items;
};