DocDiff application: Text compare

This commit is contained in:
Jakub Melka 2021-09-26 14:46:23 +02:00
parent 9bc29da83c
commit 7f748295c0
3 changed files with 169 additions and 48 deletions

View File

@ -54,10 +54,21 @@ public:
QString rightText;
};
struct TextCompareItem
{
size_t index = 0;
int charIndex = 0;
int charCount = 0;
bool left = false;
};
static Differences calculateDifferences(const GraphicPieceInfos& left, const GraphicPieceInfos& right, PDFReal epsilon);
static std::vector<size_t> getLeftUnmatched(const PageSequence& sequence);
static std::vector<size_t> getRightUnmatched(const PageSequence& sequence);
static void matchPage(PageSequence& sequence, size_t leftPage, size_t rightPage);
static std::vector<TextCompareItem> prepareTextCompareItems(const PDFDocumentTextFlow& textFlow,
bool isWordsComparingMode,
bool isLeft);
};
PDFDiff::PDFDiff(QObject* parent) :
@ -65,7 +76,7 @@ PDFDiff::PDFDiff(QObject* parent) :
m_progress(nullptr),
m_leftDocument(nullptr),
m_rightDocument(nullptr),
m_options(Asynchronous | PC_Text | PC_VectorGraphics | PC_Images),
m_options(Asynchronous | PC_Text | PC_VectorGraphics | PC_Images | CompareWords),
m_epsilon(0.001),
m_cancelled(false),
m_textAnalysisAlgorithm(PDFDocumentTextFlowFactory::Algorithm::Layout)
@ -629,57 +640,24 @@ void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPrepared
// Jakub Melka: try to compare text differences
auto compareTexts = [this](PDFDiffHelper::TextFlowDifferences& context)
{
struct CompareItem
using TextCompareItem = PDFDiffHelper::TextCompareItem;
const bool isWordsComparingMode = m_options.testFlag(CompareWords);
std::vector<TextCompareItem> leftItems;
std::vector<TextCompareItem> rightItems;
leftItems = PDFDiffHelper::prepareTextCompareItems(context.leftTextFlow, isWordsComparingMode, true);
rightItems = PDFDiffHelper::prepareTextCompareItems(context.rightTextFlow, isWordsComparingMode, false);
auto compareCharacters = [&](const TextCompareItem& a, const TextCompareItem& b)
{
size_t index = 0;
int charIndex = 0;
bool left = false;
};
std::vector<CompareItem> leftItems;
std::vector<CompareItem> rightItems;
const size_t leftCount = context.leftTextFlow.getSize();
for (size_t i = 0; i < leftCount; ++i)
{
CompareItem item;
item.index = i;
item.left = true;
const PDFDocumentTextFlow::Item* textFlowItem = context.leftTextFlow.getItem(i);
for (int j = 0; j < textFlowItem->text.size(); ++j)
{
item.charIndex = j;
leftItems.push_back(item);
}
}
const size_t rightCount = context.rightTextFlow.getSize();
for (size_t i = 0; i < rightCount; ++i)
{
CompareItem item;
item.index = i;
item.left = false;
const PDFDocumentTextFlow::Item* textFlowItem = context.rightTextFlow.getItem(i);
for (int j = 0; j < textFlowItem->text.size(); ++j)
{
item.charIndex = j;
rightItems.push_back(item);
}
}
auto compareCharacters = [&](const CompareItem& a, const CompareItem& b)
{
const auto& aItem = a.left ? context.leftTextFlow : context.rightTextFlow;
const auto& bItem = b.left ? context.leftTextFlow : context.rightTextFlow;
QChar aChar = aItem.getItem(a.index)->text[a.charIndex];
QChar bChar = bItem.getItem(b.index)->text[b.charIndex];
QStringRef aText(&aItem.getItem(a.index)->text, a.charIndex, a.charCount);
QStringRef bText(&bItem.getItem(b.index)->text, b.charIndex, b.charCount);
return aChar == bChar;
return aText == bText;
};
PDFAlgorithmLongestCommonSubsequence algorithm(leftItems.cbegin(), leftItems.cend(),
rightItems.cbegin(), rightItems.cend(),
@ -689,13 +667,100 @@ void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPrepared
PDFAlgorithmLongestCommonSubsequenceBase::markSequence(sequence, { }, { });
PDFAlgorithmLongestCommonSubsequenceBase::SequenceItemRanges modifiedRanges = PDFAlgorithmLongestCommonSubsequenceBase::getModifiedRanges(sequence);
// Merge modified sequences separated by just space
if (!isWordsComparingMode && !modifiedRanges.empty())
{
auto itPrev = sequence.end();
for (const auto& range : modifiedRanges)
{
if (itPrev != sequence.end())
{
auto itNext = range.first;
bool isReplaced = true;
for (auto it = itPrev; it != itNext && isReplaced; ++it)
{
const PDFAlgorithmLongestCommonSubsequenceBase::SequenceItem& item = *it;
// If we doesn't have a match, then it is not a whitespace
if (!item.isMatch())
{
isReplaced = false;
break;
}
const TextCompareItem& compareItem = leftItems[item.index1];
const auto& flowItem = compareItem.left ? context.leftTextFlow : context.rightTextFlow;
QChar character = flowItem.getItem(compareItem.index)->text.at(compareItem.charIndex);
isReplaced = !character.isSpace();
}
if (isReplaced)
{
for (auto it = itPrev; it != itNext; ++it)
{
PDFAlgorithmLongestCommonSubsequenceBase::SequenceItem& item = *it;
item.markReplaced();
}
}
}
itPrev = range.second;
}
modifiedRanges = PDFAlgorithmLongestCommonSubsequenceBase::getModifiedRanges(sequence);
}
for (const auto& range : modifiedRanges)
{
auto it = range.first;
auto itEnd = range.second;
QStringList leftStrings;
QStringList rightStrings;
for (; it != itEnd; ++it)
{
const PDFAlgorithmLongestCommonSubsequenceBase::SequenceItem& item = *it;
if (item.isLeftValid())
{
const TextCompareItem& textCompareItem = leftItems[item.index1];
const auto& textFlow = textCompareItem.left ? context.leftTextFlow : context.rightTextFlow;
QStringRef text(&textFlow.getItem(textCompareItem.index)->text, textCompareItem.charIndex, textCompareItem.charCount);
leftStrings << text.toString();
}
if (item.isRightValid())
{
const TextCompareItem& textCompareItem = rightItems[item.index2];
const auto& textFlow = textCompareItem.left ? context.leftTextFlow : context.rightTextFlow;
QStringRef text(&textFlow.getItem(textCompareItem.index)->text, textCompareItem.charIndex, textCompareItem.charCount);
rightStrings << text.toString();
}
}
QString leftString;
QString rightString;
if (isWordsComparingMode)
{
leftString = leftStrings.join(QChar::Space);
rightString = rightStrings.join(QChar::Space);
}
else
{
leftString = leftStrings.join(QString());
rightString = rightStrings.join(QString());
}
}
};
PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, textFlowDifferences.begin(), textFlowDifferences.end(), compareTexts);
//std::for_each(textFlowDifferences.begin(), textFlowDifferences.end(), compareTexts);
}
void PDFDiff::finalizeGraphicsPieces(PDFDiffPageContext& context)
@ -1075,4 +1140,59 @@ void PDFDiffHelper::matchPage(PageSequence& sequence,
}
}
std::vector<PDFDiffHelper::TextCompareItem> PDFDiffHelper::prepareTextCompareItems(const PDFDocumentTextFlow& textFlow,
bool isWordsComparingMode,
bool isLeft)
{
std::vector<TextCompareItem> items;
const size_t leftCount = textFlow.getSize();
for (size_t i = 0; i < leftCount; ++i)
{
PDFDiffHelper::TextCompareItem item;
item.index = i;
item.left = isLeft;
item.charCount = 0;
const PDFDocumentTextFlow::Item* textFlowItem = textFlow.getItem(i);
for (int j = 0; j < textFlowItem->text.size(); ++j)
{
if (isWordsComparingMode)
{
if (textFlowItem->text[j].isSpace())
{
// Flush buffer
if (item.charCount > 0)
{
items.push_back(item);
item.charCount = 0;
}
}
else
{
if (item.charCount == 0)
{
item.charIndex = j;
}
++item.charCount;
}
}
else
{
item.charIndex = j;
item.charCount = 1;
items.push_back(item);
}
}
if (isWordsComparingMode && item.charCount > 0)
{
items.push_back(item);
item.charCount = 0;
}
}
return items;
}
} // namespace pdf

View File

@ -120,6 +120,7 @@ public:
PC_Images = 0x0008, ///< Use images to compare pages (determine, which pages correspond to each other)
PC_Mesh = 0x0010, ///< Use mesh to compare pages (determine, which pages correspond to each other)
CompareTextsAsVector = 0x0020, ///< Compare texts as vector graphics
CompareWords = 0x0040, ///< Compare words, not just characters
};
Q_DECLARE_FLAGS(Options, Option)

View File

@ -107,7 +107,7 @@ public:
// into buckets of appropriate size.
if (scope != Scope::Page)
{
const int buckets = 32 * QThread::idealThreadCount();
const int buckets = 8 * QThread::idealThreadCount();
bucketSize = qMax(1, count / buckets);
}