DocDiff application: text bounding boxes

This commit is contained in:
Jakub Melka
2021-09-27 11:15:43 +02:00
parent 7f748295c0
commit 3b6784b8e4
4 changed files with 36 additions and 10 deletions

View File

@@ -89,21 +89,22 @@ struct PDFStructureTreeTextItem
}; };
PDFStructureTreeTextItem() = default; PDFStructureTreeTextItem() = default;
PDFStructureTreeTextItem(Type type, const PDFStructureItem* item, QString text, PDFInteger pageIndex, QRectF boundingRect) : PDFStructureTreeTextItem(Type type, const PDFStructureItem* item, QString text, PDFInteger pageIndex, QRectF boundingRect, std::vector<QRectF> characterBoundingRects) :
type(type), item(item), text(qMove(text)), pageIndex(pageIndex), boundingRect(boundingRect) type(type), item(item), text(qMove(text)), pageIndex(pageIndex), boundingRect(boundingRect), characterBoundingRects(std::move(characterBoundingRects))
{ {
} }
static PDFStructureTreeTextItem createText(QString text, PDFInteger pageIndex, QRectF boundingRect) { return PDFStructureTreeTextItem(Type::Text, nullptr, qMove(text), pageIndex, boundingRect); } static PDFStructureTreeTextItem createText(QString text, PDFInteger pageIndex, QRectF boundingRect, std::vector<QRectF> characterBoundingRects) { return PDFStructureTreeTextItem(Type::Text, nullptr, qMove(text), pageIndex, boundingRect, std::move(characterBoundingRects)); }
static PDFStructureTreeTextItem createStartTag(const PDFStructureItem* item) { return PDFStructureTreeTextItem(Type::StartTag, item, QString(), -1, QRectF()); } static PDFStructureTreeTextItem createStartTag(const PDFStructureItem* item) { return PDFStructureTreeTextItem(Type::StartTag, item, QString(), -1, QRectF(), { }); }
static PDFStructureTreeTextItem createEndTag(const PDFStructureItem* item) { return PDFStructureTreeTextItem(Type::EndTag, item, QString(), -1, QRectF()); } static PDFStructureTreeTextItem createEndTag(const PDFStructureItem* item) { return PDFStructureTreeTextItem(Type::EndTag, item, QString(), -1, QRectF(), { }); }
Type type = Type::Text; Type type = Type::Text;
const PDFStructureItem* item = nullptr; const PDFStructureItem* item = nullptr;
QString text; QString text;
PDFInteger pageIndex = -1; PDFInteger pageIndex = -1;
QRectF boundingRect; QRectF boundingRect;
std::vector<QRectF> characterBoundingRects;
}; };
using PDFStructureTreeTextSequence = std::vector<PDFStructureTreeTextItem>; using PDFStructureTreeTextSequence = std::vector<PDFStructureTreeTextItem>;
@@ -147,6 +148,7 @@ public:
QRectF boundingRect; QRectF boundingRect;
PDFInteger pageIndex = -1; PDFInteger pageIndex = -1;
QString text; QString text;
std::vector<QRectF> characterBoundingRects;
}; };
using TextItems = std::vector<TextItem>; using TextItems = std::vector<TextItem>;
@@ -232,6 +234,7 @@ private:
QStringList m_unmatchedText; QStringList m_unmatchedText;
PDFStructureTreeTextExtractor::Options m_extractorOptions; PDFStructureTreeTextExtractor::Options m_extractorOptions;
PDFInteger m_pageIndex; PDFInteger m_pageIndex;
std::vector<QRectF> m_characterBoundingRects;
}; };
void PDFStructureTreeTextContentProcessor::performPathPainting(const QPainterPath& path, bool stroke, bool fill, bool text, Qt::FillRule fillRule) void PDFStructureTreeTextContentProcessor::performPathPainting(const QPainterPath& path, bool stroke, bool fill, bool text, Qt::FillRule fillRule)
@@ -254,6 +257,7 @@ void PDFStructureTreeTextContentProcessor::performPathPainting(const QPainterPat
QMatrix matrix = getCurrentWorldMatrix(); QMatrix matrix = getCurrentWorldMatrix();
QPainterPath worldPath = matrix.map(path); QPainterPath worldPath = matrix.map(path);
m_currentBoundingBox = m_currentBoundingBox.united(worldPath.controlPointRect()); m_currentBoundingBox = m_currentBoundingBox.united(worldPath.controlPointRect());
m_characterBoundingRects.push_back(worldPath.controlPointRect());
} }
void PDFStructureTreeTextContentProcessor::finishText() void PDFStructureTreeTextContentProcessor::finishText()
@@ -270,11 +274,13 @@ void PDFStructureTreeTextContentProcessor::finishText()
reversed.push_back(*it); reversed.push_back(*it);
} }
m_currentText = qMove(reversed); m_currentText = qMove(reversed);
std::reverse(m_characterBoundingRects.begin(), m_characterBoundingRects.end());
} }
m_textSequence.emplace_back(PDFStructureTreeTextItem::createText(qMove(m_currentText), m_pageIndex, m_currentBoundingBox)); m_textSequence.emplace_back(PDFStructureTreeTextItem::createText(std::move(m_currentText), m_pageIndex, m_currentBoundingBox, std::move(m_characterBoundingRects)));
} }
m_currentText = QString(); m_currentText = QString();
m_currentBoundingBox = QRectF(); m_currentBoundingBox = QRectF();
m_characterBoundingRects.clear();
} }
bool PDFStructureTreeTextContentProcessor::isArtifact() const bool PDFStructureTreeTextContentProcessor::isArtifact() const
@@ -346,6 +352,7 @@ void PDFStructureTreeTextContentProcessor::performMarkedContentEnd()
m_unmatchedText << qMove(m_currentText); m_unmatchedText << qMove(m_currentText);
} }
m_currentBoundingBox = QRectF(); m_currentBoundingBox = QRectF();
m_characterBoundingRects.clear();
} }
} }
@@ -464,17 +471,26 @@ void PDFStructureTreeTextExtractor::perform(const std::vector<PDFInteger>& pageI
switch (sequenceItem.type) switch (sequenceItem.type)
{ {
case PDFStructureTreeTextItem::Type::StartTag: case PDFStructureTreeTextItem::Type::StartTag:
{
stack.push(sequenceItem.item); stack.push(sequenceItem.item);
break; break;
}
case PDFStructureTreeTextItem::Type::EndTag: case PDFStructureTreeTextItem::Type::EndTag:
{
stack.pop(); stack.pop();
break; break;
}
case PDFStructureTreeTextItem::Type::Text: case PDFStructureTreeTextItem::Type::Text:
{
if (!stack.empty()) if (!stack.empty())
{ {
m_textForItems[stack.top()].emplace_back(TextItem{ sequenceItem.boundingRect, sequenceItem.pageIndex, sequenceItem.text }); m_textForItems[stack.top()].emplace_back(TextItem{ sequenceItem.boundingRect, sequenceItem.pageIndex, sequenceItem.text, sequenceItem.characterBoundingRects });
} }
break; break;
}
default:
break;
} }
} }
} }
@@ -598,7 +614,7 @@ void PDFStructureTreeTextFlowCollector::visitStructureElement(const PDFStructure
for (const auto& textItem : m_extractor->getText(structureElement)) for (const auto& textItem : m_extractor->getText(structureElement))
{ {
markHasContent(); markHasContent();
m_items->push_back(PDFDocumentTextFlow::Item{ textItem.boundingRect, textItem.pageIndex, textItem.text, PDFDocumentTextFlow::Text }); m_items->push_back(PDFDocumentTextFlow::Item{ textItem.boundingRect, textItem.pageIndex, textItem.text, PDFDocumentTextFlow::Text, textItem.characterBoundingRects });
} }
acceptChildren(structureElement); acceptChildren(structureElement);
@@ -688,7 +704,7 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume
flowItems.emplace_back(PDFDocumentTextFlow::Item{ QRectF(), pageIndex, PDFTranslationContext::tr("Page %1").arg(pageIndex + 1), PDFDocumentTextFlow::PageStart }); flowItems.emplace_back(PDFDocumentTextFlow::Item{ QRectF(), pageIndex, PDFTranslationContext::tr("Page %1").arg(pageIndex + 1), PDFDocumentTextFlow::PageStart });
for (const PDFTextFlow& textFlow : textFlows) for (const PDFTextFlow& textFlow : textFlows)
{ {
flowItems.emplace_back(PDFDocumentTextFlow::Item{ textFlow.getBoundingBox(), pageIndex, textFlow.getText(), PDFDocumentTextFlow::Text }); flowItems.emplace_back(PDFDocumentTextFlow::Item{ textFlow.getBoundingBox(), pageIndex, textFlow.getText(), PDFDocumentTextFlow::Text, textFlow.getBoundingBoxes() });
} }
flowItems.emplace_back(PDFDocumentTextFlow::Item{ QRectF(), pageIndex, QString(), PDFDocumentTextFlow::PageEnd }); flowItems.emplace_back(PDFDocumentTextFlow::Item{ QRectF(), pageIndex, QString(), PDFDocumentTextFlow::PageEnd });
@@ -748,7 +764,7 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume
{ {
if (sequenceItem.type == PDFStructureTreeTextItem::Type::Text) if (sequenceItem.type == PDFStructureTreeTextItem::Type::Text)
{ {
flowItems.emplace_back(PDFDocumentTextFlow::Item{ sequenceItem.boundingRect, pageIndex, sequenceItem.text, PDFDocumentTextFlow::Text }); flowItems.emplace_back(PDFDocumentTextFlow::Item{ sequenceItem.boundingRect, pageIndex, sequenceItem.text, PDFDocumentTextFlow::Text, sequenceItem.characterBoundingRects });
} }
} }
flowItems.emplace_back(PDFDocumentTextFlow::Item{ QRectF(), pageIndex, QString(), PDFDocumentTextFlow::PageEnd }); flowItems.emplace_back(PDFDocumentTextFlow::Item{ QRectF(), pageIndex, QString(), PDFDocumentTextFlow::PageEnd });

View File

@@ -56,6 +56,7 @@ public:
PDFInteger pageIndex = 0; PDFInteger pageIndex = 0;
QString text; QString text;
Flags flags = None; Flags flags = None;
std::vector<QRectF> characterBoundingRects;
bool isText() const { return flags.testFlag(Text); } bool isText() const { return flags.testFlag(Text); }
bool isSpecial() const { return !isText(); } bool isSpecial() const { return !isText(); }

View File

@@ -1176,6 +1176,7 @@ void PDFTextFlow::merge(const PDFTextFlow& next)
m_text += next.m_text; m_text += next.m_text;
m_boundingBox = m_boundingBox.united(next.m_boundingBox); m_boundingBox = m_boundingBox.united(next.m_boundingBox);
m_characterPointers.insert(m_characterPointers.end(), next.m_characterPointers.cbegin(), next.m_characterPointers.cend()); m_characterPointers.insert(m_characterPointers.end(), next.m_characterPointers.cbegin(), next.m_characterPointers.cend());
m_characterBoundingBoxes.insert(m_characterBoundingBoxes.end(), next.m_characterBoundingBoxes.cbegin(), next.m_characterBoundingBoxes.cend());
} }
PDFTextFlows PDFTextFlow::createTextFlows(const PDFTextLayout& layout, FlowFlags flags, PDFInteger pageIndex) PDFTextFlows PDFTextFlow::createTextFlows(const PDFTextLayout& layout, FlowFlags flags, PDFInteger pageIndex)
@@ -1222,6 +1223,7 @@ PDFTextFlows PDFTextFlow::createTextFlows(const PDFTextLayout& layout, FlowFlags
{ {
currentFlow.m_text += QChar(' '); currentFlow.m_text += QChar(' ');
currentFlow.m_characterPointers.emplace_back(); currentFlow.m_characterPointers.emplace_back();
currentFlow.m_characterBoundingBoxes.emplace_back();
} }
} }
@@ -1233,6 +1235,7 @@ PDFTextFlows PDFTextFlow::createTextFlows(const PDFTextLayout& layout, FlowFlags
pointer.lineIndex = textLineIndex; pointer.lineIndex = textLineIndex;
pointer.characterIndex = i; pointer.characterIndex = i;
currentFlow.m_characterPointers.emplace_back(qMove(pointer)); currentFlow.m_characterPointers.emplace_back(qMove(pointer));
currentFlow.m_characterBoundingBoxes.emplace_back(currentCharacter.boundingBox.controlPointRect());
} }
// Remove soft hyphen, if it is enabled // Remove soft hyphen, if it is enabled
@@ -1240,6 +1243,7 @@ PDFTextFlows PDFTextFlow::createTextFlows(const PDFTextLayout& layout, FlowFlags
{ {
currentFlow.m_text.chop(1); currentFlow.m_text.chop(1);
currentFlow.m_characterPointers.pop_back(); currentFlow.m_characterPointers.pop_back();
currentFlow.m_characterBoundingBoxes.pop_back();
if (!flags.testFlag(AddLineBreaks)) if (!flags.testFlag(AddLineBreaks))
{ {
@@ -1252,6 +1256,7 @@ PDFTextFlows PDFTextFlow::createTextFlows(const PDFTextLayout& layout, FlowFlags
// Add line break // Add line break
currentFlow.m_text += lineBreak; currentFlow.m_text += lineBreak;
currentFlow.m_characterPointers.insert(currentFlow.m_characterPointers.end(), lineBreak.length(), PDFCharacterPointer()); currentFlow.m_characterPointers.insert(currentFlow.m_characterPointers.end(), lineBreak.length(), PDFCharacterPointer());
currentFlow.m_characterBoundingBoxes.insert(currentFlow.m_characterBoundingBoxes.end(), lineBreak.length(), QRectF());
++textLineIndex; ++textLineIndex;
} }

View File

@@ -297,6 +297,9 @@ public:
/// Returns whole text for this text flow /// Returns whole text for this text flow
QString getText() const { return m_text; } QString getText() const { return m_text; }
/// Returns character bounding boxes
std::vector<QRectF> getBoundingBoxes() const { return m_characterBoundingBoxes; }
/// Returns text from character pointers /// Returns text from character pointers
/// \param begin Begin character /// \param begin Begin character
/// \param end End character /// \param end End character
@@ -330,6 +333,7 @@ private:
QString m_text; QString m_text;
QRectF m_boundingBox; QRectF m_boundingBox;
std::vector<PDFCharacterPointer> m_characterPointers; std::vector<PDFCharacterPointer> m_characterPointers;
std::vector<QRectF> m_characterBoundingBoxes;
}; };
/// Text layout of single page. Can handle various fonts, various angles of lines /// Text layout of single page. Can handle various fonts, various angles of lines