mirror of https://github.com/JakubMelka/PDF4QT.git
DocDiff application: Diffing text finished
This commit is contained in:
parent
5cdb6cdab6
commit
8508fe9ef2
|
@ -69,6 +69,7 @@ public:
|
||||||
static std::vector<TextCompareItem> prepareTextCompareItems(const PDFDocumentTextFlow& textFlow,
|
static std::vector<TextCompareItem> prepareTextCompareItems(const PDFDocumentTextFlow& textFlow,
|
||||||
bool isWordsComparingMode,
|
bool isWordsComparingMode,
|
||||||
bool isLeft);
|
bool isLeft);
|
||||||
|
static void refineTextRectangles(PDFDiffResult::RectInfos& items);
|
||||||
};
|
};
|
||||||
|
|
||||||
PDFDiff::PDFDiff(QObject* parent) :
|
PDFDiff::PDFDiff(QObject* parent) :
|
||||||
|
@ -637,8 +638,10 @@ void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPrepared
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
QMutex mutex;
|
||||||
|
|
||||||
// Jakub Melka: try to compare text differences
|
// Jakub Melka: try to compare text differences
|
||||||
auto compareTexts = [this](PDFDiffHelper::TextFlowDifferences& context)
|
auto compareTexts = [this, &mutex](PDFDiffHelper::TextFlowDifferences& context)
|
||||||
{
|
{
|
||||||
using TextCompareItem = PDFDiffHelper::TextCompareItem;
|
using TextCompareItem = PDFDiffHelper::TextCompareItem;
|
||||||
const bool isWordsComparingMode = m_options.testFlag(CompareWords);
|
const bool isWordsComparingMode = m_options.testFlag(CompareWords);
|
||||||
|
@ -720,6 +723,12 @@ void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPrepared
|
||||||
QStringList leftStrings;
|
QStringList leftStrings;
|
||||||
QStringList rightStrings;
|
QStringList rightStrings;
|
||||||
|
|
||||||
|
PDFDiffResult::RectInfos leftRectInfos;
|
||||||
|
PDFDiffResult::RectInfos rightRectInfos;
|
||||||
|
|
||||||
|
PDFInteger pageIndex1 = -1;
|
||||||
|
PDFInteger pageIndex2 = -1;
|
||||||
|
|
||||||
for (; it != itEnd; ++it)
|
for (; it != itEnd; ++it)
|
||||||
{
|
{
|
||||||
const PDFAlgorithmLongestCommonSubsequenceBase::SequenceItem& item = *it;
|
const PDFAlgorithmLongestCommonSubsequenceBase::SequenceItem& item = *it;
|
||||||
|
@ -728,16 +737,50 @@ void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPrepared
|
||||||
{
|
{
|
||||||
const TextCompareItem& textCompareItem = leftItems[item.index1];
|
const TextCompareItem& textCompareItem = leftItems[item.index1];
|
||||||
const auto& textFlow = textCompareItem.left ? context.leftTextFlow : context.rightTextFlow;
|
const auto& textFlow = textCompareItem.left ? context.leftTextFlow : context.rightTextFlow;
|
||||||
QStringRef text(&textFlow.getItem(textCompareItem.index)->text, textCompareItem.charIndex, textCompareItem.charCount);
|
const PDFDocumentTextFlow::Item* textItem = textFlow.getItem(textCompareItem.index);
|
||||||
|
QStringRef text(&textItem->text, textCompareItem.charIndex, textCompareItem.charCount);
|
||||||
leftStrings << text.toString();
|
leftStrings << text.toString();
|
||||||
|
|
||||||
|
if (pageIndex1 == -1)
|
||||||
|
{
|
||||||
|
pageIndex1 = textItem->pageIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (textCompareItem.charIndex + textCompareItem.charCount <= textItem->characterBoundingRects.size())
|
||||||
|
{
|
||||||
|
const size_t startIndex = textCompareItem.charIndex;
|
||||||
|
const size_t endIndex = startIndex + textCompareItem.charCount;
|
||||||
|
|
||||||
|
for (size_t i = startIndex; i < endIndex; ++i)
|
||||||
|
{
|
||||||
|
leftRectInfos.emplace_back(textItem->pageIndex, textItem->characterBoundingRects[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (item.isRightValid())
|
if (item.isRightValid())
|
||||||
{
|
{
|
||||||
const TextCompareItem& textCompareItem = rightItems[item.index2];
|
const TextCompareItem& textCompareItem = rightItems[item.index2];
|
||||||
const auto& textFlow = textCompareItem.left ? context.leftTextFlow : context.rightTextFlow;
|
const auto& textFlow = textCompareItem.left ? context.leftTextFlow : context.rightTextFlow;
|
||||||
QStringRef text(&textFlow.getItem(textCompareItem.index)->text, textCompareItem.charIndex, textCompareItem.charCount);
|
const PDFDocumentTextFlow::Item* textItem = textFlow.getItem(textCompareItem.index);
|
||||||
|
QStringRef text(&textItem->text, textCompareItem.charIndex, textCompareItem.charCount);
|
||||||
rightStrings << text.toString();
|
rightStrings << text.toString();
|
||||||
|
|
||||||
|
if (pageIndex2 == -1)
|
||||||
|
{
|
||||||
|
pageIndex2 = textItem->pageIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (textCompareItem.charIndex + textCompareItem.charCount <= textItem->characterBoundingRects.size())
|
||||||
|
{
|
||||||
|
const size_t startIndex = textCompareItem.charIndex;
|
||||||
|
const size_t endIndex = startIndex + textCompareItem.charCount;
|
||||||
|
|
||||||
|
for (size_t i = startIndex; i < endIndex; ++i)
|
||||||
|
{
|
||||||
|
rightRectInfos.emplace_back(textItem->pageIndex, textItem->characterBoundingRects[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -755,7 +798,26 @@ void PDFDiff::performCompare(const std::vector<PDFDiffPageContext>& leftPrepared
|
||||||
rightString = rightStrings.join(QString());
|
rightString = rightStrings.join(QString());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PDFDiffHelper::refineTextRectangles(leftRectInfos);
|
||||||
|
PDFDiffHelper::refineTextRectangles(rightRectInfos);
|
||||||
|
|
||||||
|
QMutexLocker locker(&mutex);
|
||||||
|
if (!leftString.isEmpty() && !rightString.isEmpty())
|
||||||
|
{
|
||||||
|
m_result.addTextReplaced(pageIndex1, pageIndex2, leftString, rightString, leftRectInfos, rightRectInfos);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (!leftString.isEmpty())
|
||||||
|
{
|
||||||
|
m_result.addTextRemoved(pageIndex1, leftString, leftRectInfos);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!rightString.isEmpty())
|
||||||
|
{
|
||||||
|
m_result.addTextAdded(pageIndex2, rightString, rightRectInfos);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -929,6 +991,68 @@ void PDFDiffResult::addAddedShadingContent(PDFInteger pageIndex, QRectF rect)
|
||||||
addRightItem(Type::AddedShadingContent, pageIndex, rect);
|
addRightItem(Type::AddedShadingContent, pageIndex, rect);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Registers a difference of type Type::TextAdded.
/// \param pageIndex Page index stored as the second (right) page of the difference
/// \param text Added text, it is appended to the internal string list
/// \param rectInfos Rectangles (with page indices) appended to the internal rectangle array
void PDFDiffResult::addTextAdded(PDFInteger pageIndex,
                                 QString text,
                                 const RectInfos& rectInfos)
{
    Difference addedText;
    addedText.type = Type::TextAdded;
    addedText.pageIndex2 = pageIndex;

    // Rectangles live in one shared array, the difference only
    // remembers where its slice starts and how long it is.
    addedText.rightRectIndex = m_rects.size();
    addedText.rightRectCount = rectInfos.size();
    m_rects.insert(m_rects.end(), rectInfos.begin(), rectInfos.end());

    // Text lives in a shared string list, the difference remembers the index.
    addedText.textAddedIndex = m_strings.size();
    m_strings.append(text);

    m_differences.push_back(std::move(addedText));
}
|
||||||
|
|
||||||
|
/// Registers a difference of type Type::TextRemoved.
/// \param pageIndex Page index stored as the first (left) page of the difference
/// \param text Removed text, it is appended to the internal string list
/// \param rectInfos Rectangles (with page indices) appended to the internal rectangle array
void PDFDiffResult::addTextRemoved(PDFInteger pageIndex,
                                   QString text,
                                   const RectInfos& rectInfos)
{
    Difference removedText;
    removedText.type = Type::TextRemoved;
    removedText.pageIndex1 = pageIndex;

    // Rectangles live in one shared array, the difference only
    // remembers where its slice starts and how long it is.
    removedText.leftRectIndex = m_rects.size();
    removedText.leftRectCount = rectInfos.size();
    m_rects.insert(m_rects.end(), rectInfos.begin(), rectInfos.end());

    // Text lives in a shared string list, the difference remembers the index.
    removedText.textRemovedIndex = m_strings.size();
    m_strings.append(text);

    m_differences.push_back(std::move(removedText));
}
|
||||||
|
|
||||||
|
/// Registers a difference of type Type::TextReplaced.
/// \param pageIndex1 Page index stored as the first (left) page of the difference
/// \param pageIndex2 Page index stored as the second (right) page of the difference
/// \param textRemoved Replaced text, stored first in the internal string list
/// \param textAdded Replacing text, stored right after the removed text
/// \param rectInfos1 Rectangles (with page indices) stored as the left slice
/// \param rectInfos2 Rectangles (with page indices) stored as the right slice
void PDFDiffResult::addTextReplaced(PDFInteger pageIndex1,
                                    PDFInteger pageIndex2,
                                    QString textRemoved,
                                    QString textAdded,
                                    const RectInfos& rectInfos1,
                                    const RectInfos& rectInfos2)
{
    Difference replacedText;
    replacedText.type = Type::TextReplaced;
    replacedText.pageIndex1 = pageIndex1;
    replacedText.pageIndex2 = pageIndex2;

    // Left rectangles go first, right rectangles follow immediately after them.
    replacedText.leftRectIndex = m_rects.size();
    replacedText.leftRectCount = rectInfos1.size();
    m_rects.insert(m_rects.end(), rectInfos1.begin(), rectInfos1.end());

    replacedText.rightRectIndex = m_rects.size();
    replacedText.rightRectCount = rectInfos2.size();
    m_rects.insert(m_rects.end(), rectInfos2.begin(), rectInfos2.end());

    // Removed text is stored before the added text in the string list.
    replacedText.textRemovedIndex = m_strings.size();
    m_strings.append(textRemoved);

    replacedText.textAddedIndex = m_strings.size();
    m_strings.append(textAdded);

    m_differences.push_back(std::move(replacedText));
}
|
||||||
|
|
||||||
QString PDFDiffResult::getMessage(size_t index) const
|
QString PDFDiffResult::getMessage(size_t index) const
|
||||||
{
|
{
|
||||||
if (index >= m_differences.size())
|
if (index >= m_differences.size())
|
||||||
|
@ -972,6 +1096,15 @@ QString PDFDiffResult::getMessage(size_t index) const
|
||||||
case Type::AddedShadingContent:
|
case Type::AddedShadingContent:
|
||||||
return PDFDiff::tr("Added shading from page %1.").arg(difference.pageIndex2 + 1);
|
return PDFDiff::tr("Added shading from page %1.").arg(difference.pageIndex2 + 1);
|
||||||
|
|
||||||
|
case Type::TextAdded:
|
||||||
|
return PDFDiff::tr("Text '%1' has been added to page %2.").arg(m_strings[difference.textAddedIndex]).arg(difference.pageIndex2 + 1);
|
||||||
|
|
||||||
|
case Type::TextRemoved:
|
||||||
|
return PDFDiff::tr("Text '%1' has been removed from page %2.").arg(m_strings[difference.textRemovedIndex]).arg(difference.pageIndex1 + 1);
|
||||||
|
|
||||||
|
case Type::TextReplaced:
|
||||||
|
return PDFDiff::tr("Text '%1' on page %2 has been replaced by text '%3' on page %4.").arg(m_strings[difference.textRemovedIndex]).arg(difference.pageIndex1 + 1).arg(m_strings[difference.textAddedIndex]).arg(difference.pageIndex2 + 1);
|
||||||
|
|
||||||
default:
|
default:
|
||||||
Q_ASSERT(false);
|
Q_ASSERT(false);
|
||||||
break;
|
break;
|
||||||
|
@ -984,14 +1117,14 @@ void PDFDiffResult::addRectLeft(Difference& difference, QRectF rect)
|
||||||
{
|
{
|
||||||
difference.leftRectIndex = m_rects.size();
|
difference.leftRectIndex = m_rects.size();
|
||||||
difference.leftRectCount = 1;
|
difference.leftRectCount = 1;
|
||||||
m_rects.emplace_back(rect);
|
m_rects.emplace_back(difference.pageIndex1, rect);
|
||||||
}
|
}
|
||||||
|
|
||||||
void PDFDiffResult::addRectRight(Difference& difference, QRectF rect)
|
void PDFDiffResult::addRectRight(Difference& difference, QRectF rect)
|
||||||
{
|
{
|
||||||
difference.rightRectIndex = m_rects.size();
|
difference.rightRectIndex = m_rects.size();
|
||||||
difference.rightRectCount = 1;
|
difference.rightRectCount = 1;
|
||||||
m_rects.emplace_back(rect);
|
m_rects.emplace_back(difference.pageIndex2, rect);
|
||||||
}
|
}
|
||||||
|
|
||||||
PDFDiffHelper::Differences PDFDiffHelper::calculateDifferences(const GraphicPieceInfos& left,
|
PDFDiffHelper::Differences PDFDiffHelper::calculateDifferences(const GraphicPieceInfos& left,
|
||||||
|
@ -1195,4 +1328,52 @@ std::vector<PDFDiffHelper::TextCompareItem> PDFDiffHelper::prepareTextCompareIte
|
||||||
return items;
|
return items;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Merges runs of consecutive rectangles into single rectangles. A run
/// is ended when the page index changes, or when the x coordinate of
/// the center of the preceding rectangle is greater than or equal to
/// the x coordinate of the center of the following rectangle. Each run
/// is replaced by one item, which has page index of the first item
/// of the run and rectangle united from all rectangles of the run.
/// \param items Rectangles with page indices, replaced by the merged sequence
void PDFDiffHelper::refineTextRectangles(PDFDiffResult::RectInfos& items)
{
    // Decides, if the run must be cut between two neighbouring items
    auto isRunTerminated = [](const std::pair<PDFInteger, QRectF>& previous,
                              const std::pair<PDFInteger, QRectF>& following)
    {
        return previous.first != following.first ||
               previous.second.center().x() >= following.second.center().x();
    };

    PDFDiffResult::RectInfos mergedItems;

    const size_t itemCount = items.size();
    for (size_t runStart = 0; runStart < itemCount; )
    {
        // Unite rectangles while the run [runStart, runEnd) is being extended
        QRectF mergedRect;
        size_t runEnd = runStart;
        do
        {
            mergedRect = mergedRect.united(items[runEnd].second);
            ++runEnd;
        }
        while (runEnd < itemCount && !isRunTerminated(items[runEnd - 1], items[runEnd]));

        mergedItems.emplace_back(items[runStart].first, mergedRect);
        runStart = runEnd;
    }

    items = std::move(mergedItems);
}
|
||||||
|
|
||||||
} // namespace pdf
|
} // namespace pdf
|
||||||
|
|
|
@ -54,24 +54,32 @@ public:
|
||||||
AddedVectorGraphicContent,
|
AddedVectorGraphicContent,
|
||||||
AddedImageContent,
|
AddedImageContent,
|
||||||
AddedShadingContent,
|
AddedShadingContent,
|
||||||
|
TextReplaced,
|
||||||
|
TextAdded,
|
||||||
|
TextRemoved,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct Difference
|
using RectInfos = std::vector<std::pair<PDFInteger, QRectF>>;
|
||||||
{
|
|
||||||
Type type = Type::Invalid;
|
|
||||||
PDFInteger pageIndex1 = -1;
|
|
||||||
PDFInteger pageIndex2 = -1;
|
|
||||||
size_t leftRectIndex = 0;
|
|
||||||
size_t leftRectCount = 0;
|
|
||||||
size_t rightRectIndex = 0;
|
|
||||||
size_t rightRectCount = 0;
|
|
||||||
};
|
|
||||||
|
|
||||||
using Differences = std::vector<Difference>;
|
|
||||||
|
|
||||||
void setResult(PDFOperationResult result) { m_result = std::move(result); }
|
void setResult(PDFOperationResult result) { m_result = std::move(result); }
|
||||||
const PDFOperationResult& getResult() const { return m_result; }
|
const PDFOperationResult& getResult() const { return m_result; }
|
||||||
|
|
||||||
|
/// Returns true, if some difference was found
|
||||||
|
bool isChanged() const { return getDifferencesCount() > 0; }
|
||||||
|
|
||||||
|
/// Returns true, if no difference was found
|
||||||
|
bool isSame() const { return !isChanged(); }
|
||||||
|
|
||||||
|
/// Returns number of detected changes
|
||||||
|
size_t getDifferencesCount() const { return m_differences.size(); }
|
||||||
|
|
||||||
|
/// Returns message describing difference in a page content
|
||||||
|
/// \param index Index
|
||||||
|
QString getMessage(size_t index) const;
|
||||||
|
|
||||||
|
private:
|
||||||
|
friend class PDFDiff;
|
||||||
|
|
||||||
void addPageMoved(PDFInteger pageIndex1, PDFInteger pageIndex2);
|
void addPageMoved(PDFInteger pageIndex1, PDFInteger pageIndex2);
|
||||||
void addPageAdded(PDFInteger pageIndex);
|
void addPageAdded(PDFInteger pageIndex);
|
||||||
void addPageRemoved(PDFInteger pageIndex);
|
void addPageRemoved(PDFInteger pageIndex);
|
||||||
|
@ -85,9 +93,34 @@ public:
|
||||||
void addAddedImageContent(PDFInteger pageIndex, QRectF rect);
|
void addAddedImageContent(PDFInteger pageIndex, QRectF rect);
|
||||||
void addAddedShadingContent(PDFInteger pageIndex, QRectF rect);
|
void addAddedShadingContent(PDFInteger pageIndex, QRectF rect);
|
||||||
|
|
||||||
QString getMessage(size_t index) const;
|
void addTextAdded(PDFInteger pageIndex, QString text, const RectInfos& rectInfos);
|
||||||
|
void addTextRemoved(PDFInteger pageIndex, QString text, const RectInfos& rectInfos);
|
||||||
|
|
||||||
|
void addTextReplaced(PDFInteger pageIndex1,
|
||||||
|
PDFInteger pageIndex2,
|
||||||
|
QString textRemoved,
|
||||||
|
QString textAdded,
|
||||||
|
const RectInfos& rectInfos1,
|
||||||
|
const RectInfos& rectInfos2);
|
||||||
|
|
||||||
|
/// Single content difference descriptor. It describes type
|
||||||
|
/// of difference (such as graphics, image, text change) on a page
|
||||||
|
/// or on a list of multiple pages.
|
||||||
|
struct Difference
|
||||||
|
{
|
||||||
|
/// Type of the difference, Type::Invalid until it is set
Type type = Type::Invalid;
|
||||||
|
/// Page index on the left side (used for rectangles added by addRectLeft), -1 by default
PDFInteger pageIndex1 = -1;
|
||||||
|
/// Page index on the right side (used for rectangles added by addRectRight), -1 by default
PDFInteger pageIndex2 = -1;
|
||||||
|
/// Index of the first left rectangle in the array m_rects
size_t leftRectIndex = 0;
|
||||||
|
/// Number of left rectangles stored from leftRectIndex
size_t leftRectCount = 0;
|
||||||
|
/// Index of the first right rectangle in the array m_rects
size_t rightRectIndex = 0;
|
||||||
|
/// Number of right rectangles stored from rightRectIndex
size_t rightRectCount = 0;
|
||||||
|
/// Index of the added text in m_strings, -1 by default (presumably meaning no added text)
int textAddedIndex = -1;
|
||||||
|
/// Index of the removed text in m_strings, -1 by default (presumably meaning no removed text)
int textRemovedIndex = -1;
|
||||||
|
};
|
||||||
|
|
||||||
|
using Differences = std::vector<Difference>;
|
||||||
|
|
||||||
private:
|
|
||||||
void addLeftItem(Type type, PDFInteger pageIndex, QRectF rect);
|
void addLeftItem(Type type, PDFInteger pageIndex, QRectF rect);
|
||||||
void addRightItem(Type type, PDFInteger pageIndex, QRectF rect);
|
void addRightItem(Type type, PDFInteger pageIndex, QRectF rect);
|
||||||
|
|
||||||
|
@ -95,8 +128,9 @@ private:
|
||||||
void addRectRight(Difference& difference, QRectF rect);
|
void addRectRight(Difference& difference, QRectF rect);
|
||||||
|
|
||||||
Differences m_differences;
|
Differences m_differences;
|
||||||
std::vector<QRectF> m_rects;
|
RectInfos m_rects; ///< Rectangles with page indices
|
||||||
PDFOperationResult m_result;
|
PDFOperationResult m_result;
|
||||||
|
QStringList m_strings;
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Diff engine for comparing two pdf documents.
|
/// Diff engine for comparing two pdf documents.
|
||||||
|
|
|
@ -206,7 +206,6 @@ protected:
|
||||||
virtual void performOutputCharacter(const PDFTextCharacterInfo& info) override;
|
virtual void performOutputCharacter(const PDFTextCharacterInfo& info) override;
|
||||||
virtual void performMarkedContentBegin(const QByteArray& tag, const PDFObject& properties) override;
|
virtual void performMarkedContentBegin(const QByteArray& tag, const PDFObject& properties) override;
|
||||||
virtual void performMarkedContentEnd() override;
|
virtual void performMarkedContentEnd() override;
|
||||||
virtual void performPathPainting(const QPainterPath& path, bool stroke, bool fill, bool text, Qt::FillRule fillRule) override;
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
const PDFStructureItem* getStructureTreeItemFromMCID(PDFInteger mcid) const;
|
const PDFStructureItem* getStructureTreeItemFromMCID(PDFInteger mcid) const;
|
||||||
|
@ -237,32 +236,22 @@ private:
|
||||||
std::vector<QRectF> m_characterBoundingRects;
|
std::vector<QRectF> m_characterBoundingRects;
|
||||||
};
|
};
|
||||||
|
|
||||||
void PDFStructureTreeTextContentProcessor::performPathPainting(const QPainterPath& path, bool stroke, bool fill, bool text, Qt::FillRule fillRule)
|
|
||||||
{
|
|
||||||
if (!text)
|
|
||||||
{
|
|
||||||
// Jakub Melka: This should not occur
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!m_extractorOptions.testFlag(PDFStructureTreeTextExtractor::BoundingBoxes))
|
|
||||||
{
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
Q_UNUSED(stroke);
|
|
||||||
Q_UNUSED(fill);
|
|
||||||
Q_UNUSED(fillRule);
|
|
||||||
|
|
||||||
QMatrix matrix = getCurrentWorldMatrix();
|
|
||||||
QPainterPath worldPath = matrix.map(path);
|
|
||||||
m_currentBoundingBox = m_currentBoundingBox.united(worldPath.controlPointRect());
|
|
||||||
m_characterBoundingRects.push_back(worldPath.controlPointRect());
|
|
||||||
}
|
|
||||||
|
|
||||||
void PDFStructureTreeTextContentProcessor::finishText()
|
void PDFStructureTreeTextContentProcessor::finishText()
|
||||||
{
|
{
|
||||||
m_currentText = m_currentText.trimmed();
|
QString trimmedText = m_currentText.trimmed();
|
||||||
|
const int index = m_currentText.indexOf(trimmedText);
|
||||||
|
Q_ASSERT(index != -1);
|
||||||
|
if (trimmedText.size() < m_currentText.size())
|
||||||
|
{
|
||||||
|
// Fix character bounding boxes...
|
||||||
|
if (m_characterBoundingRects.size() == m_currentText.size())
|
||||||
|
{
|
||||||
|
std::vector<QRectF> boundingRects(std::next(m_characterBoundingRects.cbegin(), index), std::next(m_characterBoundingRects.cbegin(), index + trimmedText.length()));
|
||||||
|
m_characterBoundingRects = std::move(boundingRects);
|
||||||
|
}
|
||||||
|
m_currentText = std::move(trimmedText);
|
||||||
|
}
|
||||||
|
|
||||||
if (!m_currentText.isEmpty() && (!m_extractorOptions.testFlag(PDFStructureTreeTextExtractor::SkipArtifact) || !isArtifact()))
|
if (!m_currentText.isEmpty() && (!m_extractorOptions.testFlag(PDFStructureTreeTextExtractor::SkipArtifact) || !isArtifact()))
|
||||||
{
|
{
|
||||||
if (m_extractorOptions.testFlag(PDFStructureTreeTextExtractor::AdjustReversedText) && isReversedText())
|
if (m_extractorOptions.testFlag(PDFStructureTreeTextExtractor::AdjustReversedText) && isReversedText())
|
||||||
|
@ -276,6 +265,7 @@ void PDFStructureTreeTextContentProcessor::finishText()
|
||||||
m_currentText = qMove(reversed);
|
m_currentText = qMove(reversed);
|
||||||
std::reverse(m_characterBoundingRects.begin(), m_characterBoundingRects.end());
|
std::reverse(m_characterBoundingRects.begin(), m_characterBoundingRects.end());
|
||||||
}
|
}
|
||||||
|
Q_ASSERT(m_currentText.size() == m_characterBoundingRects.size() || m_characterBoundingRects.empty());
|
||||||
m_textSequence.emplace_back(PDFStructureTreeTextItem::createText(std::move(m_currentText), m_pageIndex, m_currentBoundingBox, std::move(m_characterBoundingRects)));
|
m_textSequence.emplace_back(PDFStructureTreeTextItem::createText(std::move(m_currentText), m_pageIndex, m_currentBoundingBox, std::move(m_characterBoundingRects)));
|
||||||
}
|
}
|
||||||
m_currentText = QString();
|
m_currentText = QString();
|
||||||
|
@ -381,8 +371,6 @@ bool PDFStructureTreeTextContentProcessor::isContentKindSuppressed(ContentKind k
|
||||||
switch (kind)
|
switch (kind)
|
||||||
{
|
{
|
||||||
case ContentKind::Text:
|
case ContentKind::Text:
|
||||||
return !m_extractorOptions.testFlag(PDFStructureTreeTextExtractor::BoundingBoxes);
|
|
||||||
|
|
||||||
case ContentKind::Shapes:
|
case ContentKind::Shapes:
|
||||||
case ContentKind::Images:
|
case ContentKind::Images:
|
||||||
case ContentKind::Shading:
|
case ContentKind::Shading:
|
||||||
|
@ -408,6 +396,18 @@ void PDFStructureTreeTextContentProcessor::performOutputCharacter(const PDFTextC
|
||||||
if (!info.character.isNull() && info.character != QChar(QChar::SoftHyphen))
|
if (!info.character.isNull() && info.character != QChar(QChar::SoftHyphen))
|
||||||
{
|
{
|
||||||
m_currentText.push_back(info.character);
|
m_currentText.push_back(info.character);
|
||||||
|
|
||||||
|
QPainterPath worldPath = info.matrix.map(info.outline);
|
||||||
|
if (!worldPath.isEmpty())
|
||||||
|
{
|
||||||
|
QRectF boundingRect = worldPath.controlPointRect();
|
||||||
|
m_currentBoundingBox = m_currentBoundingBox.united(boundingRect);
|
||||||
|
m_characterBoundingRects.push_back(boundingRect);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
m_characterBoundingRects.push_back(QRectF());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue