mirror of
https://github.com/JakubMelka/PDF4QT.git
synced 2025-06-05 21:59:17 +02:00
DocDiff application: text bounding boxes
This commit is contained in:
@ -89,21 +89,22 @@ struct PDFStructureTreeTextItem
|
|||||||
};
|
};
|
||||||
|
|
||||||
PDFStructureTreeTextItem() = default;
|
PDFStructureTreeTextItem() = default;
|
||||||
PDFStructureTreeTextItem(Type type, const PDFStructureItem* item, QString text, PDFInteger pageIndex, QRectF boundingRect) :
|
PDFStructureTreeTextItem(Type type, const PDFStructureItem* item, QString text, PDFInteger pageIndex, QRectF boundingRect, std::vector<QRectF> characterBoundingRects) :
|
||||||
type(type), item(item), text(qMove(text)), pageIndex(pageIndex), boundingRect(boundingRect)
|
type(type), item(item), text(qMove(text)), pageIndex(pageIndex), boundingRect(boundingRect), characterBoundingRects(std::move(characterBoundingRects))
|
||||||
{
|
{
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static PDFStructureTreeTextItem createText(QString text, PDFInteger pageIndex, QRectF boundingRect) { return PDFStructureTreeTextItem(Type::Text, nullptr, qMove(text), pageIndex, boundingRect); }
|
static PDFStructureTreeTextItem createText(QString text, PDFInteger pageIndex, QRectF boundingRect, std::vector<QRectF> characterBoundingRects) { return PDFStructureTreeTextItem(Type::Text, nullptr, qMove(text), pageIndex, boundingRect, std::move(characterBoundingRects)); }
|
||||||
static PDFStructureTreeTextItem createStartTag(const PDFStructureItem* item) { return PDFStructureTreeTextItem(Type::StartTag, item, QString(), -1, QRectF()); }
|
static PDFStructureTreeTextItem createStartTag(const PDFStructureItem* item) { return PDFStructureTreeTextItem(Type::StartTag, item, QString(), -1, QRectF(), { }); }
|
||||||
static PDFStructureTreeTextItem createEndTag(const PDFStructureItem* item) { return PDFStructureTreeTextItem(Type::EndTag, item, QString(), -1, QRectF()); }
|
static PDFStructureTreeTextItem createEndTag(const PDFStructureItem* item) { return PDFStructureTreeTextItem(Type::EndTag, item, QString(), -1, QRectF(), { }); }
|
||||||
|
|
||||||
Type type = Type::Text;
|
Type type = Type::Text;
|
||||||
const PDFStructureItem* item = nullptr;
|
const PDFStructureItem* item = nullptr;
|
||||||
QString text;
|
QString text;
|
||||||
PDFInteger pageIndex = -1;
|
PDFInteger pageIndex = -1;
|
||||||
QRectF boundingRect;
|
QRectF boundingRect;
|
||||||
|
std::vector<QRectF> characterBoundingRects;
|
||||||
};
|
};
|
||||||
|
|
||||||
using PDFStructureTreeTextSequence = std::vector<PDFStructureTreeTextItem>;
|
using PDFStructureTreeTextSequence = std::vector<PDFStructureTreeTextItem>;
|
||||||
@ -147,6 +148,7 @@ public:
|
|||||||
QRectF boundingRect;
|
QRectF boundingRect;
|
||||||
PDFInteger pageIndex = -1;
|
PDFInteger pageIndex = -1;
|
||||||
QString text;
|
QString text;
|
||||||
|
std::vector<QRectF> characterBoundingRects;
|
||||||
};
|
};
|
||||||
|
|
||||||
using TextItems = std::vector<TextItem>;
|
using TextItems = std::vector<TextItem>;
|
||||||
@ -232,6 +234,7 @@ private:
|
|||||||
QStringList m_unmatchedText;
|
QStringList m_unmatchedText;
|
||||||
PDFStructureTreeTextExtractor::Options m_extractorOptions;
|
PDFStructureTreeTextExtractor::Options m_extractorOptions;
|
||||||
PDFInteger m_pageIndex;
|
PDFInteger m_pageIndex;
|
||||||
|
std::vector<QRectF> m_characterBoundingRects;
|
||||||
};
|
};
|
||||||
|
|
||||||
void PDFStructureTreeTextContentProcessor::performPathPainting(const QPainterPath& path, bool stroke, bool fill, bool text, Qt::FillRule fillRule)
|
void PDFStructureTreeTextContentProcessor::performPathPainting(const QPainterPath& path, bool stroke, bool fill, bool text, Qt::FillRule fillRule)
|
||||||
@ -254,6 +257,7 @@ void PDFStructureTreeTextContentProcessor::performPathPainting(const QPainterPat
|
|||||||
QMatrix matrix = getCurrentWorldMatrix();
|
QMatrix matrix = getCurrentWorldMatrix();
|
||||||
QPainterPath worldPath = matrix.map(path);
|
QPainterPath worldPath = matrix.map(path);
|
||||||
m_currentBoundingBox = m_currentBoundingBox.united(worldPath.controlPointRect());
|
m_currentBoundingBox = m_currentBoundingBox.united(worldPath.controlPointRect());
|
||||||
|
m_characterBoundingRects.push_back(worldPath.controlPointRect());
|
||||||
}
|
}
|
||||||
|
|
||||||
void PDFStructureTreeTextContentProcessor::finishText()
|
void PDFStructureTreeTextContentProcessor::finishText()
|
||||||
@ -270,11 +274,13 @@ void PDFStructureTreeTextContentProcessor::finishText()
|
|||||||
reversed.push_back(*it);
|
reversed.push_back(*it);
|
||||||
}
|
}
|
||||||
m_currentText = qMove(reversed);
|
m_currentText = qMove(reversed);
|
||||||
|
std::reverse(m_characterBoundingRects.begin(), m_characterBoundingRects.end());
|
||||||
}
|
}
|
||||||
m_textSequence.emplace_back(PDFStructureTreeTextItem::createText(qMove(m_currentText), m_pageIndex, m_currentBoundingBox));
|
m_textSequence.emplace_back(PDFStructureTreeTextItem::createText(std::move(m_currentText), m_pageIndex, m_currentBoundingBox, std::move(m_characterBoundingRects)));
|
||||||
}
|
}
|
||||||
m_currentText = QString();
|
m_currentText = QString();
|
||||||
m_currentBoundingBox = QRectF();
|
m_currentBoundingBox = QRectF();
|
||||||
|
m_characterBoundingRects.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool PDFStructureTreeTextContentProcessor::isArtifact() const
|
bool PDFStructureTreeTextContentProcessor::isArtifact() const
|
||||||
@ -346,6 +352,7 @@ void PDFStructureTreeTextContentProcessor::performMarkedContentEnd()
|
|||||||
m_unmatchedText << qMove(m_currentText);
|
m_unmatchedText << qMove(m_currentText);
|
||||||
}
|
}
|
||||||
m_currentBoundingBox = QRectF();
|
m_currentBoundingBox = QRectF();
|
||||||
|
m_characterBoundingRects.clear();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -464,17 +471,26 @@ void PDFStructureTreeTextExtractor::perform(const std::vector<PDFInteger>& pageI
|
|||||||
switch (sequenceItem.type)
|
switch (sequenceItem.type)
|
||||||
{
|
{
|
||||||
case PDFStructureTreeTextItem::Type::StartTag:
|
case PDFStructureTreeTextItem::Type::StartTag:
|
||||||
|
{
|
||||||
stack.push(sequenceItem.item);
|
stack.push(sequenceItem.item);
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
case PDFStructureTreeTextItem::Type::EndTag:
|
case PDFStructureTreeTextItem::Type::EndTag:
|
||||||
|
{
|
||||||
stack.pop();
|
stack.pop();
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
case PDFStructureTreeTextItem::Type::Text:
|
case PDFStructureTreeTextItem::Type::Text:
|
||||||
|
{
|
||||||
if (!stack.empty())
|
if (!stack.empty())
|
||||||
{
|
{
|
||||||
m_textForItems[stack.top()].emplace_back(TextItem{ sequenceItem.boundingRect, sequenceItem.pageIndex, sequenceItem.text });
|
m_textForItems[stack.top()].emplace_back(TextItem{ sequenceItem.boundingRect, sequenceItem.pageIndex, sequenceItem.text, sequenceItem.characterBoundingRects });
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
default:
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -598,7 +614,7 @@ void PDFStructureTreeTextFlowCollector::visitStructureElement(const PDFStructure
|
|||||||
for (const auto& textItem : m_extractor->getText(structureElement))
|
for (const auto& textItem : m_extractor->getText(structureElement))
|
||||||
{
|
{
|
||||||
markHasContent();
|
markHasContent();
|
||||||
m_items->push_back(PDFDocumentTextFlow::Item{ textItem.boundingRect, textItem.pageIndex, textItem.text, PDFDocumentTextFlow::Text });
|
m_items->push_back(PDFDocumentTextFlow::Item{ textItem.boundingRect, textItem.pageIndex, textItem.text, PDFDocumentTextFlow::Text, textItem.characterBoundingRects });
|
||||||
}
|
}
|
||||||
|
|
||||||
acceptChildren(structureElement);
|
acceptChildren(structureElement);
|
||||||
@ -688,7 +704,7 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume
|
|||||||
flowItems.emplace_back(PDFDocumentTextFlow::Item{ QRectF(), pageIndex, PDFTranslationContext::tr("Page %1").arg(pageIndex + 1), PDFDocumentTextFlow::PageStart });
|
flowItems.emplace_back(PDFDocumentTextFlow::Item{ QRectF(), pageIndex, PDFTranslationContext::tr("Page %1").arg(pageIndex + 1), PDFDocumentTextFlow::PageStart });
|
||||||
for (const PDFTextFlow& textFlow : textFlows)
|
for (const PDFTextFlow& textFlow : textFlows)
|
||||||
{
|
{
|
||||||
flowItems.emplace_back(PDFDocumentTextFlow::Item{ textFlow.getBoundingBox(), pageIndex, textFlow.getText(), PDFDocumentTextFlow::Text });
|
flowItems.emplace_back(PDFDocumentTextFlow::Item{ textFlow.getBoundingBox(), pageIndex, textFlow.getText(), PDFDocumentTextFlow::Text, textFlow.getBoundingBoxes() });
|
||||||
}
|
}
|
||||||
flowItems.emplace_back(PDFDocumentTextFlow::Item{ QRectF(), pageIndex, QString(), PDFDocumentTextFlow::PageEnd });
|
flowItems.emplace_back(PDFDocumentTextFlow::Item{ QRectF(), pageIndex, QString(), PDFDocumentTextFlow::PageEnd });
|
||||||
|
|
||||||
@ -748,7 +764,7 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume
|
|||||||
{
|
{
|
||||||
if (sequenceItem.type == PDFStructureTreeTextItem::Type::Text)
|
if (sequenceItem.type == PDFStructureTreeTextItem::Type::Text)
|
||||||
{
|
{
|
||||||
flowItems.emplace_back(PDFDocumentTextFlow::Item{ sequenceItem.boundingRect, pageIndex, sequenceItem.text, PDFDocumentTextFlow::Text });
|
flowItems.emplace_back(PDFDocumentTextFlow::Item{ sequenceItem.boundingRect, pageIndex, sequenceItem.text, PDFDocumentTextFlow::Text, sequenceItem.characterBoundingRects });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
flowItems.emplace_back(PDFDocumentTextFlow::Item{ QRectF(), pageIndex, QString(), PDFDocumentTextFlow::PageEnd });
|
flowItems.emplace_back(PDFDocumentTextFlow::Item{ QRectF(), pageIndex, QString(), PDFDocumentTextFlow::PageEnd });
|
||||||
|
@ -56,6 +56,7 @@ public:
|
|||||||
PDFInteger pageIndex = 0;
|
PDFInteger pageIndex = 0;
|
||||||
QString text;
|
QString text;
|
||||||
Flags flags = None;
|
Flags flags = None;
|
||||||
|
std::vector<QRectF> characterBoundingRects;
|
||||||
|
|
||||||
bool isText() const { return flags.testFlag(Text); }
|
bool isText() const { return flags.testFlag(Text); }
|
||||||
bool isSpecial() const { return !isText(); }
|
bool isSpecial() const { return !isText(); }
|
||||||
|
@ -1176,6 +1176,7 @@ void PDFTextFlow::merge(const PDFTextFlow& next)
|
|||||||
m_text += next.m_text;
|
m_text += next.m_text;
|
||||||
m_boundingBox = m_boundingBox.united(next.m_boundingBox);
|
m_boundingBox = m_boundingBox.united(next.m_boundingBox);
|
||||||
m_characterPointers.insert(m_characterPointers.end(), next.m_characterPointers.cbegin(), next.m_characterPointers.cend());
|
m_characterPointers.insert(m_characterPointers.end(), next.m_characterPointers.cbegin(), next.m_characterPointers.cend());
|
||||||
|
m_characterBoundingBoxes.insert(m_characterBoundingBoxes.end(), next.m_characterBoundingBoxes.cbegin(), next.m_characterBoundingBoxes.cend());
|
||||||
}
|
}
|
||||||
|
|
||||||
PDFTextFlows PDFTextFlow::createTextFlows(const PDFTextLayout& layout, FlowFlags flags, PDFInteger pageIndex)
|
PDFTextFlows PDFTextFlow::createTextFlows(const PDFTextLayout& layout, FlowFlags flags, PDFInteger pageIndex)
|
||||||
@ -1222,6 +1223,7 @@ PDFTextFlows PDFTextFlow::createTextFlows(const PDFTextLayout& layout, FlowFlags
|
|||||||
{
|
{
|
||||||
currentFlow.m_text += QChar(' ');
|
currentFlow.m_text += QChar(' ');
|
||||||
currentFlow.m_characterPointers.emplace_back();
|
currentFlow.m_characterPointers.emplace_back();
|
||||||
|
currentFlow.m_characterBoundingBoxes.emplace_back();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1233,6 +1235,7 @@ PDFTextFlows PDFTextFlow::createTextFlows(const PDFTextLayout& layout, FlowFlags
|
|||||||
pointer.lineIndex = textLineIndex;
|
pointer.lineIndex = textLineIndex;
|
||||||
pointer.characterIndex = i;
|
pointer.characterIndex = i;
|
||||||
currentFlow.m_characterPointers.emplace_back(qMove(pointer));
|
currentFlow.m_characterPointers.emplace_back(qMove(pointer));
|
||||||
|
currentFlow.m_characterBoundingBoxes.emplace_back(currentCharacter.boundingBox.controlPointRect());
|
||||||
}
|
}
|
||||||
|
|
||||||
// Remove soft hyphen, if it is enabled
|
// Remove soft hyphen, if it is enabled
|
||||||
@ -1240,6 +1243,7 @@ PDFTextFlows PDFTextFlow::createTextFlows(const PDFTextLayout& layout, FlowFlags
|
|||||||
{
|
{
|
||||||
currentFlow.m_text.chop(1);
|
currentFlow.m_text.chop(1);
|
||||||
currentFlow.m_characterPointers.pop_back();
|
currentFlow.m_characterPointers.pop_back();
|
||||||
|
currentFlow.m_characterBoundingBoxes.pop_back();
|
||||||
|
|
||||||
if (!flags.testFlag(AddLineBreaks))
|
if (!flags.testFlag(AddLineBreaks))
|
||||||
{
|
{
|
||||||
@ -1252,6 +1256,7 @@ PDFTextFlows PDFTextFlow::createTextFlows(const PDFTextLayout& layout, FlowFlags
|
|||||||
// Add line break
|
// Add line break
|
||||||
currentFlow.m_text += lineBreak;
|
currentFlow.m_text += lineBreak;
|
||||||
currentFlow.m_characterPointers.insert(currentFlow.m_characterPointers.end(), lineBreak.length(), PDFCharacterPointer());
|
currentFlow.m_characterPointers.insert(currentFlow.m_characterPointers.end(), lineBreak.length(), PDFCharacterPointer());
|
||||||
|
currentFlow.m_characterBoundingBoxes.insert(currentFlow.m_characterBoundingBoxes.end(), lineBreak.length(), QRectF());
|
||||||
|
|
||||||
++textLineIndex;
|
++textLineIndex;
|
||||||
}
|
}
|
||||||
|
@ -297,6 +297,9 @@ public:
|
|||||||
/// Returns whole text for this text flow
|
/// Returns whole text for this text flow
|
||||||
QString getText() const { return m_text; }
|
QString getText() const { return m_text; }
|
||||||
|
|
||||||
|
/// Returns character bounding boxes
|
||||||
|
std::vector<QRectF> getBoundingBoxes() const { return m_characterBoundingBoxes; }
|
||||||
|
|
||||||
/// Returns text from character pointers
|
/// Returns text from character pointers
|
||||||
/// \param begin Begin character
|
/// \param begin Begin character
|
||||||
/// \param end End character
|
/// \param end End character
|
||||||
@ -330,6 +333,7 @@ private:
|
|||||||
QString m_text;
|
QString m_text;
|
||||||
QRectF m_boundingBox;
|
QRectF m_boundingBox;
|
||||||
std::vector<PDFCharacterPointer> m_characterPointers;
|
std::vector<PDFCharacterPointer> m_characterPointers;
|
||||||
|
std::vector<QRectF> m_characterBoundingBoxes;
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Text layout of single page. Can handle various fonts, various angles of lines
|
/// Text layout of single page. Can handle various fonts, various angles of lines
|
||||||
|
Reference in New Issue
Block a user