Structure tree text extractor (basics)

2025-06-05 21:59:17 +02:00 · 2020-10-13 19:14:05 +02:00
parent 08b38ce813
commit b0f8e1f1e3
3 changed files with 274 additions and 1 deletions
--- a/PdfForQtLib/sources/pdfdocumenttextflow.cpp
+++ b/PdfForQtLib/sources/pdfdocumenttextflow.cpp
@@ -113,6 +113,20 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume
            break;
        }

+        case Algorithm::Structure:
+        {
+            if (!structureTree.isValid())
+            {
+                m_errors << PDFRenderError(RenderErrorType::Error, PDFTranslationContext::tr("Valid tagged document required."));
+                break;
+            }
+
+            PDFStructureTreeTextExtractor extractor(document, &structureTree);
+            extractor.perform(pageIndices);
+
+            break;
+        }
+
        default:
            Q_ASSERT(false);
            break;
--- a/PdfForQtLib/sources/pdfstructuretree.cpp
+++ b/PdfForQtLib/sources/pdfstructuretree.cpp
@@ -19,6 +19,10 @@
 #include "pdfdocument.h"
 #include "pdfnametreeloader.h"
 #include "pdfnumbertreeloader.h"
+#include "pdfpagecontentprocessor.h"
+#include "pdfcms.h"
+#include "pdfexecutionpolicy.h"
+#include "pdfconstants.h"

 #include <array>

@@ -935,4 +939,237 @@ void PDFStructureTreeAbstractVisitor::acceptChildren(const PDFStructureItem* ite
    }
 }

+class PDFStructureTreeReferenceCollector : public PDFStructureTreeAbstractVisitor
+{
+public:
+    explicit inline PDFStructureTreeReferenceCollector(std::map<PDFObjectReference, const PDFStructureItem*>* mapping) :
+        m_mapping(mapping)
+    {
+
+    }
+
+    virtual void visitStructureTree(const PDFStructureTree* structureTree) override;
+    virtual void visitStructureElement(const PDFStructureElement* structureElement) override;
+    virtual void visitStructureMarkedContentReference(const PDFStructureMarkedContentReference* structureMarkedContentReference) override;
+    virtual void visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference) override;
+
+private:
+    void addReference(const PDFStructureItem* structureObjectReference);
+
+    std::map<PDFObjectReference, const PDFStructureItem*>* m_mapping;
+};
+
+void PDFStructureTreeReferenceCollector::visitStructureTree(const PDFStructureTree* structureTree)
+{
+    addReference(structureTree);
+    acceptChildren(structureTree);
+}
+
+void PDFStructureTreeReferenceCollector::visitStructureElement(const PDFStructureElement* structureElement)
+{
+    addReference(structureElement);
+    acceptChildren(structureElement);
+}
+
+void PDFStructureTreeReferenceCollector::visitStructureMarkedContentReference(const PDFStructureMarkedContentReference* structureMarkedContentReference)
+{
+    addReference(structureMarkedContentReference);
+    acceptChildren(structureMarkedContentReference);
+}
+
+void PDFStructureTreeReferenceCollector::visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference)
+{
+    addReference(structureObjectReference);
+    acceptChildren(structureObjectReference);
+}
+
+void PDFStructureTreeReferenceCollector::addReference(const PDFStructureItem* structureItem)
+{
+    if (structureItem->getSelfReference().isValid())
+    {
+        (*m_mapping)[structureItem->getSelfReference()] = structureItem;
+    }
+}
+
+class PDFStructureTreeTextContentProcessor : public PDFPageContentProcessor
+{
+    using BaseClass = PDFPageContentProcessor;
+
+public:
+    explicit PDFStructureTreeTextContentProcessor(PDFRenderer::Features features,
+                                                  const PDFPage* page,
+                                                  const PDFDocument* document,
+                                                  const PDFFontCache* fontCache,
+                                                  const PDFCMS* cms,
+                                                  const PDFOptionalContentActivity* optionalContentActivity,
+                                                  QMatrix pagePointToDevicePointMatrix,
+                                                  const PDFMeshQualitySettings& meshQualitySettings) :
+        BaseClass(page, document, fontCache, cms, optionalContentActivity, pagePointToDevicePointMatrix, meshQualitySettings),
+        m_features(features)
+    {
+
+    }
+
+    std::map<PDFInteger, QStringList>& takeTexts() { return m_text; }
+    QStringList& takeUnmatchedTexts() { return m_unmatchedText; }
+
+protected:
+    virtual bool isContentSuppressedByOC(PDFObjectReference ocgOrOcmd) override;
+    virtual bool isContentKindSuppressed(ContentKind kind) const override;
+    virtual void performOutputCharacter(const PDFTextCharacterInfo& info) override;
+    virtual void performMarkedContentBegin(const QByteArray& tag, const PDFObject& properties) override;
+    virtual void performMarkedContentEnd() override;
+
+private:
+    struct MarkedContentInfo
+    {
+        QByteArray tag;
+        PDFInteger mcid = -1;
+    };
+
+    PDFRenderer::Features m_features;
+    std::vector<MarkedContentInfo> m_markedContentInfoStack;
+    QString m_currentText;
+    std::map<PDFInteger, QStringList> m_text;
+    QStringList m_unmatchedText;
+};
+
+void PDFStructureTreeTextContentProcessor::performMarkedContentBegin(const QByteArray& tag, const PDFObject& properties)
+{
+    MarkedContentInfo info;
+    info.tag = tag;
+
+    if (properties.isDictionary())
+    {
+        const PDFDictionary* dictionary = properties.getDictionary();
+        PDFObject mcid = dictionary->get("MCID");
+        if (mcid.isInt())
+        {
+            info.mcid = mcid.getInteger();
+        }
+    }
+
+    m_markedContentInfoStack.emplace_back(qMove(info));
+}
+
+void PDFStructureTreeTextContentProcessor::performMarkedContentEnd()
+{
+    MarkedContentInfo info = qMove(m_markedContentInfoStack.back());
+    m_markedContentInfoStack.pop_back();
+
+    if (info.mcid != -1)
+    {
+        if (!m_currentText.isEmpty())
+        {
+            m_text[info.mcid].push_back(qMove(m_currentText));
+        }
+        m_currentText = QString();
+    }
+
+    if (m_markedContentInfoStack.empty() && !m_currentText.isEmpty())
+    {
+        m_unmatchedText << qMove(m_currentText);
+        m_currentText = QString();
+    }
+}
+
+bool PDFStructureTreeTextContentProcessor::isContentSuppressedByOC(PDFObjectReference ocgOrOcmd)
+{
+    if (m_features.testFlag(PDFRenderer::IgnoreOptionalContent))
+    {
+        return false;
+    }
+
+    return PDFPageContentProcessor::isContentSuppressedByOC(ocgOrOcmd);
+}
+
+bool PDFStructureTreeTextContentProcessor::isContentKindSuppressed(ContentKind kind) const
+{
+    switch (kind)
+    {
+        case ContentKind::Shapes:
+        case ContentKind::Text:
+        case ContentKind::Images:
+        case ContentKind::Shading:
+            return true;
+
+        case ContentKind::Tiling:
+            return false; // Tiling can have text
+
+        default:
+        {
+            Q_ASSERT(false);
+            break;
+        }
+    }
+
+    return false;
+}
+
+void PDFStructureTreeTextContentProcessor::performOutputCharacter(const PDFTextCharacterInfo& info)
+{
+    if (!isContentSuppressed())
+    {
+        if (!info.character.isNull())
+        {
+            m_currentText.push_back(info.character);
+        }
+    }
+}
+
+PDFStructureTreeTextExtractor::PDFStructureTreeTextExtractor(const PDFDocument* document, const PDFStructureTree* tree) :
+    m_document(document),
+    m_tree(tree)
+{
+
+}
+
+void PDFStructureTreeTextExtractor::perform(const std::vector<PDFInteger>& pageIndices)
+{
+    std::map<PDFObjectReference, const PDFStructureItem*> mapping;
+    PDFStructureTreeReferenceCollector referenceCollector(&mapping);
+    m_tree->accept(&referenceCollector);
+
+    PDFFontCache fontCache(DEFAULT_FONT_CACHE_LIMIT, DEFAULT_REALIZED_FONT_CACHE_LIMIT);
+
+    // Jakub Melka: maps text to structure tree items. Key is pair of (page index, mcid)
+    std::map<std::pair<PDFInteger, PDFInteger>, QStringList> extractedText;
+
+    QMutex mutex;
+    PDFCMSGeneric cms;
+    PDFMeshQualitySettings mqs;
+    PDFOptionalContentActivity oca(m_document, OCUsage::Export, nullptr);
+    pdf::PDFModifiedDocument md(const_cast<PDFDocument*>(m_document), &oca);
+    fontCache.setDocument(md);
+    fontCache.setCacheShrinkEnabled(nullptr, false);
+
+    auto generateTextLayout = [this, &mutex, &extractedText, &fontCache, &cms, &mqs, &oca](PDFInteger pageIndex)
+    {
+        const PDFCatalog* catalog = m_document->getCatalog();
+        if (!catalog->getPage(pageIndex))
+        {
+            // Invalid page index
+            return;
+        }
+
+        const PDFPage* page = catalog->getPage(pageIndex);
+        Q_ASSERT(page);
+
+        PDFStructureTreeTextContentProcessor processor(PDFRenderer::IgnoreOptionalContent, page, m_document, &fontCache, &cms, &oca, QMatrix(), mqs);
+        QList<PDFRenderError> errors = processor.processContents();
+
+        QMutexLocker lock(&mutex);
+        for (auto& item : processor.takeTexts())
+        {
+            extractedText[std::make_pair(pageIndex, item.first)].append(qMove(item.second));
+        }
+        m_unmatchedText << qMove(processor.takeUnmatchedTexts());
+        m_errors.append(qMove(errors));
+    };
+
+    PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, pageIndices.begin(), pageIndices.end(), generateTextLayout);
+
+    fontCache.setCacheShrinkEnabled(nullptr, true);
+}
+
 }   // namespace pdf
--- a/PdfForQtLib/sources/pdfstructuretree.h
+++ b/PdfForQtLib/sources/pdfstructuretree.h
@@ -21,10 +21,11 @@
 #include "pdfobject.h"
 #include "pdfobjectutils.h"
 #include "pdffile.h"
+#include "pdfexception.h"

 namespace pdf
 {
-
+class PDFDocument;
 class PDFObjectStorage;
 struct PDFStructureTreeAttributeDefinition;

@@ -597,6 +598,27 @@ private:
    PDFObjectReference m_objectReference;
 };

+/// Text extractor for structure tree. Can extract text to fill structure tree contents.
+class PDFFORQTLIBSHARED_EXPORT PDFStructureTreeTextExtractor
+{
+public:
+    explicit PDFStructureTreeTextExtractor(const PDFDocument* document, const PDFStructureTree* tree);
+
+    /// Performs text extracting algorithm. Only \p pageIndices
+    /// pages are processed for text extraction.
+    /// \param pageIndices Page indices
+    void perform(const std::vector<PDFInteger>& pageIndices);
+
+    /// Returns a list of errors/warnings
+    const QList<PDFRenderError>& getErrors() const { return m_errors; }
+
+private:
+    QList<PDFRenderError> m_errors;
+    const PDFDocument* m_document;
+    const PDFStructureTree* m_tree;
+    QStringList m_unmatchedText;
+};
+
 }   // namespace pdf

 #endif // PDFSTRUCTURETREE_H