Structure tree text extractor (basics)

This commit is contained in:
Jakub Melka
2020-10-13 19:14:05 +02:00
parent 08b38ce813
commit b0f8e1f1e3
3 changed files with 274 additions and 1 deletions

View File

@@ -21,10 +21,11 @@
#include "pdfobject.h"
#include "pdfobjectutils.h"
#include "pdffile.h"
#include "pdfexception.h"
namespace pdf
{
class PDFDocument;
class PDFObjectStorage;
struct PDFStructureTreeAttributeDefinition;
@@ -597,6 +598,27 @@ private:
PDFObjectReference m_objectReference;
};
/// Text extractor for structure tree. Can extract text to fill structure tree contents.
///
/// The extractor is given a document and a structure tree at construction,
/// is run for a chosen set of pages via perform(), and exposes a list of
/// errors/warnings via getErrors().
/// NOTE(review): the document and tree are held as raw const pointers;
/// presumably they are non-owning and must outlive the extractor - confirm
/// against the implementation.
class PDFFORQTLIBSHARED_EXPORT PDFStructureTreeTextExtractor
{
public:
/// Constructs the extractor for the given document and structure tree.
/// \param document Document (presumably stored in m_document - confirm in the implementation)
/// \param tree Structure tree (presumably stored in m_tree - confirm in the implementation)
explicit PDFStructureTreeTextExtractor(const PDFDocument* document, const PDFStructureTree* tree);
/// Performs the text extraction algorithm. Only \p pageIndices
/// pages are processed for text extraction.
/// \param pageIndices Page indices
/// NOTE(review): whether the indices are zero-based, and whether a repeated
/// call resets previously collected results, is not visible here - confirm.
void perform(const std::vector<PDFInteger>& pageIndices);
/// Returns a list of errors/warnings. The result is a reference to the
/// internal list, so it is only valid while this object is alive.
const QList<PDFRenderError>& getErrors() const { return m_errors; }
private:
/// Errors/warnings returned by getErrors()
QList<PDFRenderError> m_errors;
/// Document passed to the constructor (assumed non-owning - TODO confirm)
const PDFDocument* m_document;
/// Structure tree passed to the constructor (assumed non-owning - TODO confirm)
const PDFStructureTree* m_tree;
/// Presumably text which could not be matched to a structure tree item;
/// verify against the implementation of perform().
QStringList m_unmatchedText;
};
} // namespace pdf
#endif // PDFSTRUCTURETREE_H