Structure tree text extractor (basics)

2025-06-05 21:59:17 +02:00 · 2020-10-13 19:14:05 +02:00
parent 08b38ce813
commit b0f8e1f1e3
3 changed files with 274 additions and 1 deletions
--- a/PdfForQtLib/sources/pdfstructuretree.h
+++ b/PdfForQtLib/sources/pdfstructuretree.h
@ -21,10 +21,11 @@
 #include "pdfobject.h"
 #include "pdfobjectutils.h"
 #include "pdffile.h"
+#include "pdfexception.h"

 namespace pdf
 {
-
+class PDFDocument;
 class PDFObjectStorage;
 struct PDFStructureTreeAttributeDefinition;

@ -597,6 +598,27 @@ private:
    PDFObjectReference m_objectReference;
 };

+/// Text extractor for a structure tree. Can extract text to fill the structure tree's contents.
+class PDFFORQTLIBSHARED_EXPORT PDFStructureTreeTextExtractor
+{
+public:
+    explicit PDFStructureTreeTextExtractor(const PDFDocument* document, const PDFStructureTree* tree); ///< NOTE(review): presumably stores both pointers in m_document / m_tree - confirm in the .cpp
+
+    /// Performs the text extraction algorithm. Only the pages listed
+    /// in \p pageIndices are processed for text extraction.
+    /// \param pageIndices Indices of the pages to process
+    void perform(const std::vector<PDFInteger>& pageIndices);
+
+    /// Returns the list of errors/warnings (a const reference to m_errors)
+    const QList<PDFRenderError>& getErrors() const { return m_errors; }
+
+private:
+    QList<PDFRenderError> m_errors; ///< Errors/warnings exposed through getErrors()
+    const PDFDocument* m_document; ///< Document pointer; NOTE(review): presumably the constructor's \p document, not owned - confirm in the .cpp
+    const PDFStructureTree* m_tree; ///< Structure tree pointer; NOTE(review): presumably the constructor's \p tree, not owned - confirm in the .cpp
+    QStringList m_unmatchedText; ///< NOTE(review): presumably text that could not be matched to a structure tree item - confirm in the .cpp
+};
+
 }   // namespace pdf

 #endif // PDFSTRUCTURETREE_H