mirror of
https://github.com/JakubMelka/PDF4QT.git
synced 2025-06-05 21:59:17 +02:00
DocDiff application: Text compare
This commit is contained in:
@ -54,10 +54,21 @@ public:
|
|||||||
QString rightText;
|
QString rightText;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// One comparable unit of text: a slice of the text of a single text flow item.
/// The slice is addressed as (charIndex, charCount) inside the item's text, and
/// `left` tells which of the two compared text flows the item belongs to.
struct TextCompareItem
{
    size_t index{0};    ///< Index of the item inside its text flow (passed to getItem())
    int charIndex{0};   ///< Position of the first character of the slice in the item's text
    int charCount{0};   ///< Number of characters in the slice
    bool left{false};   ///< True, if the slice refers to the left text flow, false for the right one
};
|
||||||
|
|
||||||
static Differences calculateDifferences(const GraphicPieceInfos& left, const GraphicPieceInfos& right, PDFReal epsilon);
|
static Differences calculateDifferences(const GraphicPieceInfos& left, const GraphicPieceInfos& right, PDFReal epsilon);
|
||||||
static std::vector<size_t> getLeftUnmatched(const PageSequence& sequence);
|
static std::vector<size_t> getLeftUnmatched(const PageSequence& sequence);
|
||||||
static std::vector<size_t> getRightUnmatched(const PageSequence& sequence);
|
static std::vector<size_t> getRightUnmatched(const PageSequence& sequence);
|
||||||
static void matchPage(PageSequence& sequence, size_t leftPage, size_t rightPage);
|
static void matchPage(PageSequence& sequence, size_t leftPage, size_t rightPage);
|
||||||
|
/// Splits the text of every item of \p textFlow into compare items.
/// \param textFlow Text flow, whose items are split
/// \param isWordsComparingMode If true, each compare item is a run of consecutive
///        non-whitespace characters inside one text flow item; if false, each
///        compare item is a single character
/// \param isLeft Value stored to the 'left' member of every created compare item
/// \returns Compare items in the order of the text flow items and of the characters inside them
static std::vector<TextCompareItem> prepareTextCompareItems(const PDFDocumentTextFlow& textFlow,
|
||||||
|
bool isWordsComparingMode,
|
||||||
|
bool isLeft);
|
||||||
};
|
};
|
||||||
|
|
||||||
PDFDiff::PDFDiff(QObject* parent) :
|
PDFDiff::PDFDiff(QObject* parent) :
|
||||||
@ -65,7 +76,7 @@ PDFDiff::PDFDiff(QObject* parent) :
|
|||||||
m_progress(nullptr),
|
m_progress(nullptr),
|
||||||
m_leftDocument(nullptr),
|
m_leftDocument(nullptr),
|
||||||
m_rightDocument(nullptr),
|
m_rightDocument(nullptr),
|
||||||
m_options(Asynchronous | PC_Text | PC_VectorGraphics | PC_Images),
|
m_options(Asynchronous | PC_Text | PC_VectorGraphics | PC_Images | CompareWords),
|
||||||
m_epsilon(0.001),
|
m_epsilon(0.001),
|
||||||
m_cancelled(false),
|
m_cancelled(false),
|
||||||
m_textAnalysisAlgorithm(PDFDocumentTextFlowFactory::Algorithm::Layout)
|
m_textAnalysisAlgorithm(PDFDocumentTextFlowFactory::Algorithm::Layout)
|
||||||
@ -629,57 +640,24 @@ void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPrepared
|
|||||||
// Jakub Melka: try to compare text differences
|
// Jakub Melka: try to compare text differences
|
||||||
auto compareTexts = [this](PDFDiffHelper::TextFlowDifferences& context)
|
auto compareTexts = [this](PDFDiffHelper::TextFlowDifferences& context)
|
||||||
{
|
{
|
||||||
struct CompareItem
|
using TextCompareItem = PDFDiffHelper::TextCompareItem;
|
||||||
|
const bool isWordsComparingMode = m_options.testFlag(CompareWords);
|
||||||
|
|
||||||
|
std::vector<TextCompareItem> leftItems;
|
||||||
|
std::vector<TextCompareItem> rightItems;
|
||||||
|
|
||||||
|
leftItems = PDFDiffHelper::prepareTextCompareItems(context.leftTextFlow, isWordsComparingMode, true);
|
||||||
|
rightItems = PDFDiffHelper::prepareTextCompareItems(context.rightTextFlow, isWordsComparingMode, false);
|
||||||
|
|
||||||
|
auto compareCharacters = [&](const TextCompareItem& a, const TextCompareItem& b)
|
||||||
{
|
{
|
||||||
size_t index = 0;
|
|
||||||
int charIndex = 0;
|
|
||||||
bool left = false;
|
|
||||||
};
|
|
||||||
|
|
||||||
std::vector<CompareItem> leftItems;
|
|
||||||
std::vector<CompareItem> rightItems;
|
|
||||||
|
|
||||||
const size_t leftCount = context.leftTextFlow.getSize();
|
|
||||||
for (size_t i = 0; i < leftCount; ++i)
|
|
||||||
{
|
|
||||||
CompareItem item;
|
|
||||||
item.index = i;
|
|
||||||
item.left = true;
|
|
||||||
|
|
||||||
const PDFDocumentTextFlow::Item* textFlowItem = context.leftTextFlow.getItem(i);
|
|
||||||
for (int j = 0; j < textFlowItem->text.size(); ++j)
|
|
||||||
{
|
|
||||||
item.charIndex = j;
|
|
||||||
leftItems.push_back(item);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const size_t rightCount = context.rightTextFlow.getSize();
|
|
||||||
for (size_t i = 0; i < rightCount; ++i)
|
|
||||||
{
|
|
||||||
CompareItem item;
|
|
||||||
item.index = i;
|
|
||||||
item.left = false;
|
|
||||||
|
|
||||||
const PDFDocumentTextFlow::Item* textFlowItem = context.rightTextFlow.getItem(i);
|
|
||||||
for (int j = 0; j < textFlowItem->text.size(); ++j)
|
|
||||||
{
|
|
||||||
item.charIndex = j;
|
|
||||||
rightItems.push_back(item);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
auto compareCharacters = [&](const CompareItem& a, const CompareItem& b)
|
|
||||||
{
|
|
||||||
|
|
||||||
|
|
||||||
const auto& aItem = a.left ? context.leftTextFlow : context.rightTextFlow;
|
const auto& aItem = a.left ? context.leftTextFlow : context.rightTextFlow;
|
||||||
const auto& bItem = b.left ? context.leftTextFlow : context.rightTextFlow;
|
const auto& bItem = b.left ? context.leftTextFlow : context.rightTextFlow;
|
||||||
|
|
||||||
QChar aChar = aItem.getItem(a.index)->text[a.charIndex];
|
QStringRef aText(&aItem.getItem(a.index)->text, a.charIndex, a.charCount);
|
||||||
QChar bChar = bItem.getItem(b.index)->text[b.charIndex];
|
QStringRef bText(&bItem.getItem(b.index)->text, b.charIndex, b.charCount);
|
||||||
|
|
||||||
return aChar == bChar;
|
return aText == bText;
|
||||||
};
|
};
|
||||||
PDFAlgorithmLongestCommonSubsequence algorithm(leftItems.cbegin(), leftItems.cend(),
|
PDFAlgorithmLongestCommonSubsequence algorithm(leftItems.cbegin(), leftItems.cend(),
|
||||||
rightItems.cbegin(), rightItems.cend(),
|
rightItems.cbegin(), rightItems.cend(),
|
||||||
@ -689,13 +667,100 @@ void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPrepared
|
|||||||
PDFAlgorithmLongestCommonSubsequenceBase::markSequence(sequence, { }, { });
|
PDFAlgorithmLongestCommonSubsequenceBase::markSequence(sequence, { }, { });
|
||||||
PDFAlgorithmLongestCommonSubsequenceBase::SequenceItemRanges modifiedRanges = PDFAlgorithmLongestCommonSubsequenceBase::getModifiedRanges(sequence);
|
PDFAlgorithmLongestCommonSubsequenceBase::SequenceItemRanges modifiedRanges = PDFAlgorithmLongestCommonSubsequenceBase::getModifiedRanges(sequence);
|
||||||
|
|
||||||
|
// Merge modified sequences separated by just space
|
||||||
|
if (!isWordsComparingMode && !modifiedRanges.empty())
|
||||||
|
{
|
||||||
|
auto itPrev = sequence.end();
|
||||||
for (const auto& range : modifiedRanges)
|
for (const auto& range : modifiedRanges)
|
||||||
{
|
{
|
||||||
|
if (itPrev != sequence.end())
|
||||||
|
{
|
||||||
|
auto itNext = range.first;
|
||||||
|
|
||||||
|
bool isReplaced = true;
|
||||||
|
for (auto it = itPrev; it != itNext && isReplaced; ++it)
|
||||||
|
{
|
||||||
|
const PDFAlgorithmLongestCommonSubsequenceBase::SequenceItem& item = *it;
|
||||||
|
|
||||||
|
// If we don't have a match, then it is not a whitespace
|
||||||
|
if (!item.isMatch())
|
||||||
|
{
|
||||||
|
isReplaced = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
const TextCompareItem& compareItem = leftItems[item.index1];
|
||||||
|
const auto& flowItem = compareItem.left ? context.leftTextFlow : context.rightTextFlow;
|
||||||
|
QChar character = flowItem.getItem(compareItem.index)->text.at(compareItem.charIndex);
|
||||||
|
|
||||||
|
isReplaced = !character.isSpace();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isReplaced)
|
||||||
|
{
|
||||||
|
for (auto it = itPrev; it != itNext; ++it)
|
||||||
|
{
|
||||||
|
PDFAlgorithmLongestCommonSubsequenceBase::SequenceItem& item = *it;
|
||||||
|
item.markReplaced();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
itPrev = range.second;
|
||||||
|
}
|
||||||
|
|
||||||
|
modifiedRanges = PDFAlgorithmLongestCommonSubsequenceBase::getModifiedRanges(sequence);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const auto& range : modifiedRanges)
|
||||||
|
{
|
||||||
|
auto it = range.first;
|
||||||
|
auto itEnd = range.second;
|
||||||
|
|
||||||
|
QStringList leftStrings;
|
||||||
|
QStringList rightStrings;
|
||||||
|
|
||||||
|
for (; it != itEnd; ++it)
|
||||||
|
{
|
||||||
|
const PDFAlgorithmLongestCommonSubsequenceBase::SequenceItem& item = *it;
|
||||||
|
|
||||||
|
if (item.isLeftValid())
|
||||||
|
{
|
||||||
|
const TextCompareItem& textCompareItem = leftItems[item.index1];
|
||||||
|
const auto& textFlow = textCompareItem.left ? context.leftTextFlow : context.rightTextFlow;
|
||||||
|
QStringRef text(&textFlow.getItem(textCompareItem.index)->text, textCompareItem.charIndex, textCompareItem.charCount);
|
||||||
|
leftStrings << text.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (item.isRightValid())
|
||||||
|
{
|
||||||
|
const TextCompareItem& textCompareItem = rightItems[item.index2];
|
||||||
|
const auto& textFlow = textCompareItem.left ? context.leftTextFlow : context.rightTextFlow;
|
||||||
|
QStringRef text(&textFlow.getItem(textCompareItem.index)->text, textCompareItem.charIndex, textCompareItem.charCount);
|
||||||
|
rightStrings << text.toString();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
QString leftString;
|
||||||
|
QString rightString;
|
||||||
|
|
||||||
|
if (isWordsComparingMode)
|
||||||
|
{
|
||||||
|
leftString = leftStrings.join(QChar::Space);
|
||||||
|
rightString = rightStrings.join(QChar::Space);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
leftString = leftStrings.join(QString());
|
||||||
|
rightString = rightStrings.join(QString());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, textFlowDifferences.begin(), textFlowDifferences.end(), compareTexts);
|
PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, textFlowDifferences.begin(), textFlowDifferences.end(), compareTexts);
|
||||||
|
//std::for_each(textFlowDifferences.begin(), textFlowDifferences.end(), compareTexts);
|
||||||
}
|
}
|
||||||
|
|
||||||
void PDFDiff::finalizeGraphicsPieces(PDFDiffPageContext& context)
|
void PDFDiff::finalizeGraphicsPieces(PDFDiffPageContext& context)
|
||||||
@ -1075,4 +1140,59 @@ void PDFDiffHelper::matchPage(PageSequence& sequence,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::vector<PDFDiffHelper::TextCompareItem> PDFDiffHelper::prepareTextCompareItems(const PDFDocumentTextFlow& textFlow,
|
||||||
|
bool isWordsComparingMode,
|
||||||
|
bool isLeft)
|
||||||
|
{
|
||||||
|
std::vector<TextCompareItem> items;
|
||||||
|
|
||||||
|
const size_t leftCount = textFlow.getSize();
|
||||||
|
for (size_t i = 0; i < leftCount; ++i)
|
||||||
|
{
|
||||||
|
PDFDiffHelper::TextCompareItem item;
|
||||||
|
item.index = i;
|
||||||
|
item.left = isLeft;
|
||||||
|
item.charCount = 0;
|
||||||
|
|
||||||
|
const PDFDocumentTextFlow::Item* textFlowItem = textFlow.getItem(i);
|
||||||
|
for (int j = 0; j < textFlowItem->text.size(); ++j)
|
||||||
|
{
|
||||||
|
if (isWordsComparingMode)
|
||||||
|
{
|
||||||
|
if (textFlowItem->text[j].isSpace())
|
||||||
|
{
|
||||||
|
// Flush buffer
|
||||||
|
if (item.charCount > 0)
|
||||||
|
{
|
||||||
|
items.push_back(item);
|
||||||
|
item.charCount = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (item.charCount == 0)
|
||||||
|
{
|
||||||
|
item.charIndex = j;
|
||||||
|
}
|
||||||
|
++item.charCount;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
item.charIndex = j;
|
||||||
|
item.charCount = 1;
|
||||||
|
items.push_back(item);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isWordsComparingMode && item.charCount > 0)
|
||||||
|
{
|
||||||
|
items.push_back(item);
|
||||||
|
item.charCount = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return items;
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace pdf
|
} // namespace pdf
|
||||||
|
@ -120,6 +120,7 @@ public:
|
|||||||
PC_Images = 0x0008, ///< Use images to compare pages (determine, which pages correspond to each other)
|
PC_Images = 0x0008, ///< Use images to compare pages (determine, which pages correspond to each other)
|
||||||
PC_Mesh = 0x0010, ///< Use mesh to compare pages (determine, which pages correspond to each other)
|
PC_Mesh = 0x0010, ///< Use mesh to compare pages (determine, which pages correspond to each other)
|
||||||
CompareTextsAsVector = 0x0020, ///< Compare texts as vector graphics
|
CompareTextsAsVector = 0x0020, ///< Compare texts as vector graphics
|
||||||
|
CompareWords = 0x0040, ///< Compare words, not just characters
|
||||||
};
|
};
|
||||||
Q_DECLARE_FLAGS(Options, Option)
|
Q_DECLARE_FLAGS(Options, Option)
|
||||||
|
|
||||||
|
@ -107,7 +107,7 @@ public:
|
|||||||
// into buckets of appropriate size.
|
// into buckets of appropriate size.
|
||||||
if (scope != Scope::Page)
|
if (scope != Scope::Page)
|
||||||
{
|
{
|
||||||
const int buckets = 32 * QThread::idealThreadCount();
|
const int buckets = 8 * QThread::idealThreadCount();
|
||||||
bucketSize = qMax(1, count / buckets);
|
bucketSize = qMax(1, count / buckets);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user