diff --git a/Pdf4QtLib/sources/pdfdocumenttextflow.cpp b/Pdf4QtLib/sources/pdfdocumenttextflow.cpp index 3481ed7..1d64f4a 100644 --- a/Pdf4QtLib/sources/pdfdocumenttextflow.cpp +++ b/Pdf4QtLib/sources/pdfdocumenttextflow.cpp @@ -89,21 +89,22 @@ struct PDFStructureTreeTextItem }; PDFStructureTreeTextItem() = default; - PDFStructureTreeTextItem(Type type, const PDFStructureItem* item, QString text, PDFInteger pageIndex, QRectF boundingRect) : - type(type), item(item), text(qMove(text)), pageIndex(pageIndex), boundingRect(boundingRect) + PDFStructureTreeTextItem(Type type, const PDFStructureItem* item, QString text, PDFInteger pageIndex, QRectF boundingRect, std::vector characterBoundingRects) : + type(type), item(item), text(qMove(text)), pageIndex(pageIndex), boundingRect(boundingRect), characterBoundingRects(std::move(characterBoundingRects)) { } - static PDFStructureTreeTextItem createText(QString text, PDFInteger pageIndex, QRectF boundingRect) { return PDFStructureTreeTextItem(Type::Text, nullptr, qMove(text), pageIndex, boundingRect); } - static PDFStructureTreeTextItem createStartTag(const PDFStructureItem* item) { return PDFStructureTreeTextItem(Type::StartTag, item, QString(), -1, QRectF()); } - static PDFStructureTreeTextItem createEndTag(const PDFStructureItem* item) { return PDFStructureTreeTextItem(Type::EndTag, item, QString(), -1, QRectF()); } + static PDFStructureTreeTextItem createText(QString text, PDFInteger pageIndex, QRectF boundingRect, std::vector characterBoundingRects) { return PDFStructureTreeTextItem(Type::Text, nullptr, qMove(text), pageIndex, boundingRect, std::move(characterBoundingRects)); } + static PDFStructureTreeTextItem createStartTag(const PDFStructureItem* item) { return PDFStructureTreeTextItem(Type::StartTag, item, QString(), -1, QRectF(), { }); } + static PDFStructureTreeTextItem createEndTag(const PDFStructureItem* item) { return PDFStructureTreeTextItem(Type::EndTag, item, QString(), -1, QRectF(), { }); } Type type = Type::Text; const PDFStructureItem* item = nullptr; QString text; PDFInteger pageIndex = -1; QRectF boundingRect; + std::vector characterBoundingRects; }; using PDFStructureTreeTextSequence = std::vector; @@ -147,6 +148,7 @@ public: QRectF boundingRect; PDFInteger pageIndex = -1; QString text; + std::vector characterBoundingRects; }; using TextItems = std::vector; @@ -232,6 +234,7 @@ private: QStringList m_unmatchedText; PDFStructureTreeTextExtractor::Options m_extractorOptions; PDFInteger m_pageIndex; + std::vector m_characterBoundingRects; }; void PDFStructureTreeTextContentProcessor::performPathPainting(const QPainterPath& path, bool stroke, bool fill, bool text, Qt::FillRule fillRule) @@ -254,6 +257,7 @@ void PDFStructureTreeTextContentProcessor::performPathPainting(const QPainterPat QMatrix matrix = getCurrentWorldMatrix(); QPainterPath worldPath = matrix.map(path); m_currentBoundingBox = m_currentBoundingBox.united(worldPath.controlPointRect()); + m_characterBoundingRects.push_back(worldPath.controlPointRect()); } void PDFStructureTreeTextContentProcessor::finishText() @@ -270,11 +274,13 @@ void PDFStructureTreeTextContentProcessor::finishText() reversed.push_back(*it); } m_currentText = qMove(reversed); + std::reverse(m_characterBoundingRects.begin(), m_characterBoundingRects.end()); } - m_textSequence.emplace_back(PDFStructureTreeTextItem::createText(qMove(m_currentText), m_pageIndex, m_currentBoundingBox)); + m_textSequence.emplace_back(PDFStructureTreeTextItem::createText(std::move(m_currentText), m_pageIndex, m_currentBoundingBox, std::move(m_characterBoundingRects))); } m_currentText = QString(); m_currentBoundingBox = QRectF(); + m_characterBoundingRects.clear(); } bool PDFStructureTreeTextContentProcessor::isArtifact() const @@ -346,6 +352,7 @@ void PDFStructureTreeTextContentProcessor::performMarkedContentEnd() m_unmatchedText << qMove(m_currentText); } m_currentBoundingBox = QRectF(); + m_characterBoundingRects.clear(); } } @@ -464,17 +471,26 @@ void PDFStructureTreeTextExtractor::perform(const std::vector& pageI switch (sequenceItem.type) { case PDFStructureTreeTextItem::Type::StartTag: + { stack.push(sequenceItem.item); break; + } case PDFStructureTreeTextItem::Type::EndTag: + { stack.pop(); break; + } case PDFStructureTreeTextItem::Type::Text: + { if (!stack.empty()) { - m_textForItems[stack.top()].emplace_back(TextItem{ sequenceItem.boundingRect, sequenceItem.pageIndex, sequenceItem.text }); + m_textForItems[stack.top()].emplace_back(TextItem{ sequenceItem.boundingRect, sequenceItem.pageIndex, sequenceItem.text, sequenceItem.characterBoundingRects }); } break; + } + + default: + break; } } } @@ -598,7 +614,7 @@ void PDFStructureTreeTextFlowCollector::visitStructureElement(const PDFStructure for (const auto& textItem : m_extractor->getText(structureElement)) { markHasContent(); - m_items->push_back(PDFDocumentTextFlow::Item{ textItem.boundingRect, textItem.pageIndex, textItem.text, PDFDocumentTextFlow::Text }); + m_items->push_back(PDFDocumentTextFlow::Item{ textItem.boundingRect, textItem.pageIndex, textItem.text, PDFDocumentTextFlow::Text, textItem.characterBoundingRects }); } acceptChildren(structureElement); @@ -688,7 +704,7 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume flowItems.emplace_back(PDFDocumentTextFlow::Item{ QRectF(), pageIndex, PDFTranslationContext::tr("Page %1").arg(pageIndex + 1), PDFDocumentTextFlow::PageStart }); for (const PDFTextFlow& textFlow : textFlows) { - flowItems.emplace_back(PDFDocumentTextFlow::Item{ textFlow.getBoundingBox(), pageIndex, textFlow.getText(), PDFDocumentTextFlow::Text }); + flowItems.emplace_back(PDFDocumentTextFlow::Item{ textFlow.getBoundingBox(), pageIndex, textFlow.getText(), PDFDocumentTextFlow::Text, textFlow.getBoundingBoxes() }); } flowItems.emplace_back(PDFDocumentTextFlow::Item{ QRectF(), pageIndex, QString(), PDFDocumentTextFlow::PageEnd }); @@ -748,7 +764,7 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume { if (sequenceItem.type == PDFStructureTreeTextItem::Type::Text) { - flowItems.emplace_back(PDFDocumentTextFlow::Item{ sequenceItem.boundingRect, pageIndex, sequenceItem.text, PDFDocumentTextFlow::Text }); + flowItems.emplace_back(PDFDocumentTextFlow::Item{ sequenceItem.boundingRect, pageIndex, sequenceItem.text, PDFDocumentTextFlow::Text, sequenceItem.characterBoundingRects }); } } flowItems.emplace_back(PDFDocumentTextFlow::Item{ QRectF(), pageIndex, QString(), PDFDocumentTextFlow::PageEnd }); diff --git a/Pdf4QtLib/sources/pdfdocumenttextflow.h b/Pdf4QtLib/sources/pdfdocumenttextflow.h index 2d9d748..a39a150 100644 --- a/Pdf4QtLib/sources/pdfdocumenttextflow.h +++ b/Pdf4QtLib/sources/pdfdocumenttextflow.h @@ -56,6 +56,7 @@ public: PDFInteger pageIndex = 0; QString text; Flags flags = None; + std::vector characterBoundingRects; bool isText() const { return flags.testFlag(Text); } bool isSpecial() const { return !isText(); } diff --git a/Pdf4QtLib/sources/pdftextlayout.cpp b/Pdf4QtLib/sources/pdftextlayout.cpp index 60ece09..708adce 100644 --- a/Pdf4QtLib/sources/pdftextlayout.cpp +++ b/Pdf4QtLib/sources/pdftextlayout.cpp @@ -1176,6 +1176,7 @@ void PDFTextFlow::merge(const PDFTextFlow& next) m_text += next.m_text; m_boundingBox = m_boundingBox.united(next.m_boundingBox); m_characterPointers.insert(m_characterPointers.end(), next.m_characterPointers.cbegin(), next.m_characterPointers.cend()); + m_characterBoundingBoxes.insert(m_characterBoundingBoxes.end(), next.m_characterBoundingBoxes.cbegin(), next.m_characterBoundingBoxes.cend()); } PDFTextFlows PDFTextFlow::createTextFlows(const PDFTextLayout& layout, FlowFlags flags, PDFInteger pageIndex) @@ -1222,6 +1223,7 @@ PDFTextFlows PDFTextFlow::createTextFlows(const PDFTextLayout& layout, FlowFlags { currentFlow.m_text += QChar(' '); currentFlow.m_characterPointers.emplace_back(); + currentFlow.m_characterBoundingBoxes.emplace_back(); } } @@ -1233,6 +1235,7 @@ PDFTextFlows PDFTextFlow::createTextFlows(const PDFTextLayout& layout, FlowFlags pointer.lineIndex = textLineIndex; pointer.characterIndex = i; currentFlow.m_characterPointers.emplace_back(qMove(pointer)); + currentFlow.m_characterBoundingBoxes.emplace_back(currentCharacter.boundingBox.controlPointRect()); } // Remove soft hyphen, if it is enabled @@ -1240,6 +1243,7 @@ PDFTextFlows PDFTextFlow::createTextFlows(const PDFTextLayout& layout, FlowFlags { currentFlow.m_text.chop(1); currentFlow.m_characterPointers.pop_back(); + currentFlow.m_characterBoundingBoxes.pop_back(); if (!flags.testFlag(AddLineBreaks)) { @@ -1252,6 +1256,7 @@ PDFTextFlows PDFTextFlow::createTextFlows(const PDFTextLayout& layout, FlowFlags // Add line break currentFlow.m_text += lineBreak; currentFlow.m_characterPointers.insert(currentFlow.m_characterPointers.end(), lineBreak.length(), PDFCharacterPointer()); + currentFlow.m_characterBoundingBoxes.insert(currentFlow.m_characterBoundingBoxes.end(), lineBreak.length(), QRectF()); ++textLineIndex; } diff --git a/Pdf4QtLib/sources/pdftextlayout.h b/Pdf4QtLib/sources/pdftextlayout.h index a2aadd8..aba3a30 100644 --- a/Pdf4QtLib/sources/pdftextlayout.h +++ b/Pdf4QtLib/sources/pdftextlayout.h @@ -297,6 +297,9 @@ public: /// Returns whole text for this text flow QString getText() const { return m_text; } + /// Returns character bounding boxes + std::vector getBoundingBoxes() const { return m_characterBoundingBoxes; } + /// Returns text form character pointers /// \param begin Begin character /// \param end End character @@ -330,6 +333,7 @@ private: QString m_text; QRectF m_boundingBox; std::vector m_characterPointers; + std::vector m_characterBoundingBoxes; }; /// Text layout of single page. Can handle various fonts, various angles of lines