mirror of
				https://github.com/JakubMelka/PDF4QT.git
				synced 2025-06-05 21:59:17 +02:00 
			
		
		
		
	DocDiff application: text bounding boxes
This commit is contained in:
		| @@ -89,21 +89,22 @@ struct PDFStructureTreeTextItem | |||||||
|     }; |     }; | ||||||
|  |  | ||||||
|     PDFStructureTreeTextItem() = default; |     PDFStructureTreeTextItem() = default; | ||||||
|     PDFStructureTreeTextItem(Type type, const PDFStructureItem* item, QString text, PDFInteger pageIndex, QRectF boundingRect) : |     PDFStructureTreeTextItem(Type type, const PDFStructureItem* item, QString text, PDFInteger pageIndex, QRectF boundingRect, std::vector<QRectF> characterBoundingRects) : | ||||||
|         type(type), item(item), text(qMove(text)), pageIndex(pageIndex), boundingRect(boundingRect) |         type(type), item(item), text(qMove(text)), pageIndex(pageIndex), boundingRect(boundingRect), characterBoundingRects(std::move(characterBoundingRects)) | ||||||
|     { |     { | ||||||
|  |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     static PDFStructureTreeTextItem createText(QString text, PDFInteger pageIndex, QRectF boundingRect) { return PDFStructureTreeTextItem(Type::Text, nullptr, qMove(text), pageIndex, boundingRect); } |     static PDFStructureTreeTextItem createText(QString text, PDFInteger pageIndex, QRectF boundingRect, std::vector<QRectF> characterBoundingRects) { return PDFStructureTreeTextItem(Type::Text, nullptr, qMove(text), pageIndex, boundingRect, std::move(characterBoundingRects)); } | ||||||
|     static PDFStructureTreeTextItem createStartTag(const PDFStructureItem* item) { return PDFStructureTreeTextItem(Type::StartTag, item, QString(), -1, QRectF()); } |     static PDFStructureTreeTextItem createStartTag(const PDFStructureItem* item) { return PDFStructureTreeTextItem(Type::StartTag, item, QString(), -1, QRectF(), { }); } | ||||||
|     static PDFStructureTreeTextItem createEndTag(const PDFStructureItem* item) { return PDFStructureTreeTextItem(Type::EndTag, item, QString(), -1, QRectF()); } |     static PDFStructureTreeTextItem createEndTag(const PDFStructureItem* item) { return PDFStructureTreeTextItem(Type::EndTag, item, QString(), -1, QRectF(), { }); } | ||||||
|  |  | ||||||
|     Type type = Type::Text; |     Type type = Type::Text; | ||||||
|     const PDFStructureItem* item = nullptr; |     const PDFStructureItem* item = nullptr; | ||||||
|     QString text; |     QString text; | ||||||
|     PDFInteger pageIndex = -1; |     PDFInteger pageIndex = -1; | ||||||
|     QRectF boundingRect; |     QRectF boundingRect; | ||||||
|  |     std::vector<QRectF> characterBoundingRects; | ||||||
| }; | }; | ||||||
|  |  | ||||||
| using PDFStructureTreeTextSequence = std::vector<PDFStructureTreeTextItem>; | using PDFStructureTreeTextSequence = std::vector<PDFStructureTreeTextItem>; | ||||||
| @@ -147,6 +148,7 @@ public: | |||||||
|         QRectF boundingRect; |         QRectF boundingRect; | ||||||
|         PDFInteger pageIndex = -1; |         PDFInteger pageIndex = -1; | ||||||
|         QString text; |         QString text; | ||||||
|  |         std::vector<QRectF> characterBoundingRects; | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|     using TextItems = std::vector<TextItem>; |     using TextItems = std::vector<TextItem>; | ||||||
| @@ -232,6 +234,7 @@ private: | |||||||
|     QStringList m_unmatchedText; |     QStringList m_unmatchedText; | ||||||
|     PDFStructureTreeTextExtractor::Options m_extractorOptions; |     PDFStructureTreeTextExtractor::Options m_extractorOptions; | ||||||
|     PDFInteger m_pageIndex; |     PDFInteger m_pageIndex; | ||||||
|  |     std::vector<QRectF> m_characterBoundingRects; | ||||||
| }; | }; | ||||||
|  |  | ||||||
| void PDFStructureTreeTextContentProcessor::performPathPainting(const QPainterPath& path, bool stroke, bool fill, bool text, Qt::FillRule fillRule) | void PDFStructureTreeTextContentProcessor::performPathPainting(const QPainterPath& path, bool stroke, bool fill, bool text, Qt::FillRule fillRule) | ||||||
| @@ -254,6 +257,7 @@ void PDFStructureTreeTextContentProcessor::performPathPainting(const QPainterPat | |||||||
|     QMatrix matrix = getCurrentWorldMatrix(); |     QMatrix matrix = getCurrentWorldMatrix(); | ||||||
|     QPainterPath worldPath = matrix.map(path); |     QPainterPath worldPath = matrix.map(path); | ||||||
|     m_currentBoundingBox = m_currentBoundingBox.united(worldPath.controlPointRect()); |     m_currentBoundingBox = m_currentBoundingBox.united(worldPath.controlPointRect()); | ||||||
|  |     m_characterBoundingRects.push_back(worldPath.controlPointRect()); | ||||||
| } | } | ||||||
|  |  | ||||||
| void PDFStructureTreeTextContentProcessor::finishText() | void PDFStructureTreeTextContentProcessor::finishText() | ||||||
| @@ -270,11 +274,13 @@ void PDFStructureTreeTextContentProcessor::finishText() | |||||||
|                 reversed.push_back(*it); |                 reversed.push_back(*it); | ||||||
|             } |             } | ||||||
|             m_currentText = qMove(reversed); |             m_currentText = qMove(reversed); | ||||||
|  |             std::reverse(m_characterBoundingRects.begin(), m_characterBoundingRects.end()); | ||||||
|         } |         } | ||||||
|         m_textSequence.emplace_back(PDFStructureTreeTextItem::createText(qMove(m_currentText), m_pageIndex, m_currentBoundingBox)); |         m_textSequence.emplace_back(PDFStructureTreeTextItem::createText(std::move(m_currentText), m_pageIndex, m_currentBoundingBox, std::move(m_characterBoundingRects))); | ||||||
|     } |     } | ||||||
|     m_currentText = QString(); |     m_currentText = QString(); | ||||||
|     m_currentBoundingBox = QRectF(); |     m_currentBoundingBox = QRectF(); | ||||||
|  |     m_characterBoundingRects.clear(); | ||||||
| } | } | ||||||
|  |  | ||||||
| bool PDFStructureTreeTextContentProcessor::isArtifact() const | bool PDFStructureTreeTextContentProcessor::isArtifact() const | ||||||
| @@ -346,6 +352,7 @@ void PDFStructureTreeTextContentProcessor::performMarkedContentEnd() | |||||||
|             m_unmatchedText << qMove(m_currentText); |             m_unmatchedText << qMove(m_currentText); | ||||||
|         } |         } | ||||||
|         m_currentBoundingBox = QRectF(); |         m_currentBoundingBox = QRectF(); | ||||||
|  |         m_characterBoundingRects.clear(); | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -464,17 +471,26 @@ void PDFStructureTreeTextExtractor::perform(const std::vector<PDFInteger>& pageI | |||||||
|                 switch (sequenceItem.type) |                 switch (sequenceItem.type) | ||||||
|                 { |                 { | ||||||
|                     case PDFStructureTreeTextItem::Type::StartTag: |                     case PDFStructureTreeTextItem::Type::StartTag: | ||||||
|  |                     { | ||||||
|                         stack.push(sequenceItem.item); |                         stack.push(sequenceItem.item); | ||||||
|                         break; |                         break; | ||||||
|  |                     } | ||||||
|                     case PDFStructureTreeTextItem::Type::EndTag: |                     case PDFStructureTreeTextItem::Type::EndTag: | ||||||
|  |                     { | ||||||
|                         stack.pop(); |                         stack.pop(); | ||||||
|                         break; |                         break; | ||||||
|  |                     } | ||||||
|                     case PDFStructureTreeTextItem::Type::Text: |                     case PDFStructureTreeTextItem::Type::Text: | ||||||
|  |                     { | ||||||
|                         if (!stack.empty()) |                         if (!stack.empty()) | ||||||
|                         { |                         { | ||||||
|                             m_textForItems[stack.top()].emplace_back(TextItem{ sequenceItem.boundingRect, sequenceItem.pageIndex, sequenceItem.text }); |                             m_textForItems[stack.top()].emplace_back(TextItem{ sequenceItem.boundingRect, sequenceItem.pageIndex, sequenceItem.text, sequenceItem.characterBoundingRects }); | ||||||
|                         } |                         } | ||||||
|                         break; |                         break; | ||||||
|  |                     } | ||||||
|  |  | ||||||
|  |                     default: | ||||||
|  |                         break; | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
| @@ -598,7 +614,7 @@ void PDFStructureTreeTextFlowCollector::visitStructureElement(const PDFStructure | |||||||
|     for (const auto& textItem : m_extractor->getText(structureElement)) |     for (const auto& textItem : m_extractor->getText(structureElement)) | ||||||
|     { |     { | ||||||
|         markHasContent(); |         markHasContent(); | ||||||
|         m_items->push_back(PDFDocumentTextFlow::Item{ textItem.boundingRect, textItem.pageIndex, textItem.text, PDFDocumentTextFlow::Text }); |         m_items->push_back(PDFDocumentTextFlow::Item{ textItem.boundingRect, textItem.pageIndex, textItem.text, PDFDocumentTextFlow::Text, textItem.characterBoundingRects }); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     acceptChildren(structureElement); |     acceptChildren(structureElement); | ||||||
| @@ -688,7 +704,7 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume | |||||||
|                 flowItems.emplace_back(PDFDocumentTextFlow::Item{ QRectF(), pageIndex, PDFTranslationContext::tr("Page %1").arg(pageIndex + 1), PDFDocumentTextFlow::PageStart }); |                 flowItems.emplace_back(PDFDocumentTextFlow::Item{ QRectF(), pageIndex, PDFTranslationContext::tr("Page %1").arg(pageIndex + 1), PDFDocumentTextFlow::PageStart }); | ||||||
|                 for (const PDFTextFlow& textFlow : textFlows) |                 for (const PDFTextFlow& textFlow : textFlows) | ||||||
|                 { |                 { | ||||||
|                     flowItems.emplace_back(PDFDocumentTextFlow::Item{ textFlow.getBoundingBox(), pageIndex, textFlow.getText(), PDFDocumentTextFlow::Text }); |                     flowItems.emplace_back(PDFDocumentTextFlow::Item{ textFlow.getBoundingBox(), pageIndex, textFlow.getText(), PDFDocumentTextFlow::Text, textFlow.getBoundingBoxes() }); | ||||||
|                 } |                 } | ||||||
|                 flowItems.emplace_back(PDFDocumentTextFlow::Item{ QRectF(), pageIndex, QString(), PDFDocumentTextFlow::PageEnd }); |                 flowItems.emplace_back(PDFDocumentTextFlow::Item{ QRectF(), pageIndex, QString(), PDFDocumentTextFlow::PageEnd }); | ||||||
|  |  | ||||||
| @@ -748,7 +764,7 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume | |||||||
|                 { |                 { | ||||||
|                     if (sequenceItem.type == PDFStructureTreeTextItem::Type::Text) |                     if (sequenceItem.type == PDFStructureTreeTextItem::Type::Text) | ||||||
|                     { |                     { | ||||||
|                         flowItems.emplace_back(PDFDocumentTextFlow::Item{ sequenceItem.boundingRect, pageIndex, sequenceItem.text, PDFDocumentTextFlow::Text }); |                         flowItems.emplace_back(PDFDocumentTextFlow::Item{ sequenceItem.boundingRect, pageIndex, sequenceItem.text, PDFDocumentTextFlow::Text, sequenceItem.characterBoundingRects }); | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|                 flowItems.emplace_back(PDFDocumentTextFlow::Item{ QRectF(), pageIndex, QString(), PDFDocumentTextFlow::PageEnd }); |                 flowItems.emplace_back(PDFDocumentTextFlow::Item{ QRectF(), pageIndex, QString(), PDFDocumentTextFlow::PageEnd }); | ||||||
|   | |||||||
| @@ -56,6 +56,7 @@ public: | |||||||
|         PDFInteger pageIndex = 0; |         PDFInteger pageIndex = 0; | ||||||
|         QString text; |         QString text; | ||||||
|         Flags flags = None; |         Flags flags = None; | ||||||
|  |         std::vector<QRectF> characterBoundingRects; | ||||||
|  |  | ||||||
|         bool isText() const { return flags.testFlag(Text); } |         bool isText() const { return flags.testFlag(Text); } | ||||||
|         bool isSpecial() const { return !isText(); } |         bool isSpecial() const { return !isText(); } | ||||||
|   | |||||||
| @@ -1176,6 +1176,7 @@ void PDFTextFlow::merge(const PDFTextFlow& next) | |||||||
|     m_text += next.m_text; |     m_text += next.m_text; | ||||||
|     m_boundingBox = m_boundingBox.united(next.m_boundingBox); |     m_boundingBox = m_boundingBox.united(next.m_boundingBox); | ||||||
|     m_characterPointers.insert(m_characterPointers.end(), next.m_characterPointers.cbegin(), next.m_characterPointers.cend()); |     m_characterPointers.insert(m_characterPointers.end(), next.m_characterPointers.cbegin(), next.m_characterPointers.cend()); | ||||||
|  |     m_characterBoundingBoxes.insert(m_characterBoundingBoxes.end(), next.m_characterBoundingBoxes.cbegin(), next.m_characterBoundingBoxes.cend()); | ||||||
| } | } | ||||||
|  |  | ||||||
| PDFTextFlows PDFTextFlow::createTextFlows(const PDFTextLayout& layout, FlowFlags flags, PDFInteger pageIndex) | PDFTextFlows PDFTextFlow::createTextFlows(const PDFTextLayout& layout, FlowFlags flags, PDFInteger pageIndex) | ||||||
| @@ -1222,6 +1223,7 @@ PDFTextFlows PDFTextFlow::createTextFlows(const PDFTextLayout& layout, FlowFlags | |||||||
|                     { |                     { | ||||||
|                         currentFlow.m_text += QChar(' '); |                         currentFlow.m_text += QChar(' '); | ||||||
|                         currentFlow.m_characterPointers.emplace_back(); |                         currentFlow.m_characterPointers.emplace_back(); | ||||||
|  |                         currentFlow.m_characterBoundingBoxes.emplace_back(); | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
| @@ -1233,6 +1235,7 @@ PDFTextFlows PDFTextFlow::createTextFlows(const PDFTextLayout& layout, FlowFlags | |||||||
|                 pointer.lineIndex = textLineIndex; |                 pointer.lineIndex = textLineIndex; | ||||||
|                 pointer.characterIndex = i; |                 pointer.characterIndex = i; | ||||||
|                 currentFlow.m_characterPointers.emplace_back(qMove(pointer)); |                 currentFlow.m_characterPointers.emplace_back(qMove(pointer)); | ||||||
|  |                 currentFlow.m_characterBoundingBoxes.emplace_back(currentCharacter.boundingBox.controlPointRect()); | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             // Remove soft hyphen, if it is enabled |             // Remove soft hyphen, if it is enabled | ||||||
| @@ -1240,6 +1243,7 @@ PDFTextFlows PDFTextFlow::createTextFlows(const PDFTextLayout& layout, FlowFlags | |||||||
|             { |             { | ||||||
|                 currentFlow.m_text.chop(1); |                 currentFlow.m_text.chop(1); | ||||||
|                 currentFlow.m_characterPointers.pop_back(); |                 currentFlow.m_characterPointers.pop_back(); | ||||||
|  |                 currentFlow.m_characterBoundingBoxes.pop_back(); | ||||||
|  |  | ||||||
|                 if (!flags.testFlag(AddLineBreaks)) |                 if (!flags.testFlag(AddLineBreaks)) | ||||||
|                 { |                 { | ||||||
| @@ -1252,6 +1256,7 @@ PDFTextFlows PDFTextFlow::createTextFlows(const PDFTextLayout& layout, FlowFlags | |||||||
|             // Add line break |             // Add line break | ||||||
|             currentFlow.m_text += lineBreak; |             currentFlow.m_text += lineBreak; | ||||||
|             currentFlow.m_characterPointers.insert(currentFlow.m_characterPointers.end(), lineBreak.length(), PDFCharacterPointer()); |             currentFlow.m_characterPointers.insert(currentFlow.m_characterPointers.end(), lineBreak.length(), PDFCharacterPointer()); | ||||||
|  |             currentFlow.m_characterBoundingBoxes.insert(currentFlow.m_characterBoundingBoxes.end(), lineBreak.length(), QRectF()); | ||||||
|  |  | ||||||
|             ++textLineIndex; |             ++textLineIndex; | ||||||
|         } |         } | ||||||
|   | |||||||
| @@ -297,6 +297,9 @@ public: | |||||||
|     /// Returns whole text for this text flow |     /// Returns whole text for this text flow | ||||||
|     QString getText() const { return m_text; } |     QString getText() const { return m_text; } | ||||||
|  |  | ||||||
|  |     /// Returns character bounding boxes | ||||||
|  |     std::vector<QRectF> getBoundingBoxes() const { return m_characterBoundingBoxes; } | ||||||
|  |  | ||||||
|     /// Returns text form character pointers |     /// Returns text form character pointers | ||||||
|     /// \param begin Begin character |     /// \param begin Begin character | ||||||
|     /// \param end End character |     /// \param end End character | ||||||
| @@ -330,6 +333,7 @@ private: | |||||||
|     QString m_text; |     QString m_text; | ||||||
|     QRectF m_boundingBox; |     QRectF m_boundingBox; | ||||||
|     std::vector<PDFCharacterPointer> m_characterPointers; |     std::vector<PDFCharacterPointer> m_characterPointers; | ||||||
|  |     std::vector<QRectF> m_characterBoundingBoxes; | ||||||
| }; | }; | ||||||
|  |  | ||||||
| /// Text layout of single page. Can handle various fonts, various angles of lines | /// Text layout of single page. Can handle various fonts, various angles of lines | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user