diff --git a/PdfForQtLib/PdfForQtLib.pro b/PdfForQtLib/PdfForQtLib.pro index f69e404..28fbedd 100644 --- a/PdfForQtLib/PdfForQtLib.pro +++ b/PdfForQtLib/PdfForQtLib.pro @@ -49,6 +49,7 @@ SOURCES += \ sources/pdfcms.cpp \ sources/pdfcompiler.cpp \ sources/pdfdocumentbuilder.cpp \ + sources/pdfdocumenttextflow.cpp \ sources/pdfdocumentwriter.cpp \ sources/pdfexecutionpolicy.cpp \ sources/pdffile.cpp \ @@ -104,6 +105,7 @@ HEADERS += \ sources/pdfcompiler.h \ sources/pdfdocumentbuilder.h \ sources/pdfdocumentdrawinterface.h \ + sources/pdfdocumenttextflow.h \ sources/pdfdocumentwriter.h \ sources/pdfexecutionpolicy.h \ sources/pdffile.h \ diff --git a/PdfForQtLib/sources/pdfcompiler.cpp b/PdfForQtLib/sources/pdfcompiler.cpp index c2f9fe9..98ca615 100644 --- a/PdfForQtLib/sources/pdfcompiler.cpp +++ b/PdfForQtLib/sources/pdfcompiler.cpp @@ -185,38 +185,6 @@ void PDFAsynchronousPageCompiler::onPageCompiled() } } -class PDFTextLayoutGenerator : public PDFPageContentProcessor -{ - using BaseClass = PDFPageContentProcessor; - -public: - explicit PDFTextLayoutGenerator(PDFRenderer::Features features, - const PDFPage* page, - const PDFDocument* document, - const PDFFontCache* fontCache, - const PDFCMS* cms, - const PDFOptionalContentActivity* optionalContentActivity, - QMatrix pagePointToDevicePointMatrix, - const PDFMeshQualitySettings& meshQualitySettings) : - BaseClass(page, document, fontCache, cms, optionalContentActivity, pagePointToDevicePointMatrix, meshQualitySettings), - m_features(features) - { - - } - - /// Creates text layout from the text - PDFTextLayout createTextLayout(); - -protected: - virtual bool isContentSuppressedByOC(PDFObjectReference ocgOrOcmd) override; - virtual bool isContentKindSuppressed(ContentKind kind) const override; - virtual void performOutputCharacter(const PDFTextCharacterInfo& info) override; - -private: - PDFRenderer::Features m_features; - PDFTextLayout m_textLayout; -}; - PDFTextLayout PDFTextLayoutGenerator::createTextLayout() { 
m_textLayout.perform();
diff --git a/PdfForQtLib/sources/pdfcompiler.h b/PdfForQtLib/sources/pdfcompiler.h
index 02808e6..4b16d5d 100644
--- a/PdfForQtLib/sources/pdfcompiler.h
+++ b/PdfForQtLib/sources/pdfcompiler.h
@@ -168,6 +168,41 @@ private:
     QFutureWatcher m_textLayoutCompileFutureWatcher;
 };
 
+/// Page content processor which produces a \p PDFTextLayout for a single page.
+/// It is declared in this header so that it can also be used outside of
+/// pdfcompiler.cpp, for example by the document text flow factory.
+class PDFTextLayoutGenerator : public PDFPageContentProcessor
+{
+    using BaseClass = PDFPageContentProcessor;
+
+public:
+    explicit PDFTextLayoutGenerator(PDFRenderer::Features features,
+                                    const PDFPage* page,
+                                    const PDFDocument* document,
+                                    const PDFFontCache* fontCache,
+                                    const PDFCMS* cms,
+                                    const PDFOptionalContentActivity* optionalContentActivity,
+                                    QMatrix pagePointToDevicePointMatrix,
+                                    const PDFMeshQualitySettings& meshQualitySettings) :
+        BaseClass(page, document, fontCache, cms, optionalContentActivity, pagePointToDevicePointMatrix, meshQualitySettings),
+        m_features(features)
+    {
+
+    }
+
+    /// Creates text layout from the text
+    PDFTextLayout createTextLayout();
+
+protected:
+    virtual bool isContentSuppressedByOC(PDFObjectReference ocgOrOcmd) override;
+    virtual bool isContentKindSuppressed(ContentKind kind) const override;
+    virtual void performOutputCharacter(const PDFTextCharacterInfo& info) override;
+
+private:
+    PDFRenderer::Features m_features; ///< Renderer features passed to the constructor
+    PDFTextLayout m_textLayout; ///< Text layout processed by createTextLayout()
+};
+
 } // namespace pdf
 
 #endif // PDFCOMPILER_H
diff --git a/PdfForQtLib/sources/pdfdocumenttextflow.cpp b/PdfForQtLib/sources/pdfdocumenttextflow.cpp
new file mode 100644
index 0000000..e411dac
--- /dev/null
+++ b/PdfForQtLib/sources/pdfdocumenttextflow.cpp
@@ -0,0 +1,122 @@
+// Copyright (C) 2020 Jakub Melka
+//
+// This file is part of PdfForQt.
+//
+// PdfForQt is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// PdfForQt is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with PDFForQt. If not, see <https://www.gnu.org/licenses/>.
+
+#include "pdfdocumenttextflow.h"
+#include "pdfdocument.h"
+#include "pdfstructuretree.h"
+#include "pdfcompiler.h"
+#include "pdfexecutionpolicy.h"
+#include "pdfconstants.h"
+#include "pdfcms.h"
+
+namespace pdf
+{
+
+PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* document, const std::vector<PDFInteger>& pageIndices, Algorithm algorithm)
+{
+    PDFDocumentTextFlow result;
+    PDFStructureTree structureTree;
+
+    const PDFCatalog* catalog = document->getCatalog();
+    if (algorithm == Algorithm::Auto || algorithm == Algorithm::Structure)
+    {
+        structureTree = PDFStructureTree::parse(&document->getStorage(), catalog->getStructureTreeRoot());
+    }
+
+    if (algorithm == Algorithm::Auto)
+    {
+        // Use the structure algorithm only if the catalog reports marked logical structure and the parsed tree is valid
+        if (catalog->isLogicalStructureMarked() && structureTree.isValid())
+        {
+            algorithm = Algorithm::Structure;
+        }
+        else
+        {
+            algorithm = Algorithm::Layout;
+        }
+    }
+
+    Q_ASSERT(algorithm != Algorithm::Auto);
+
+    QMutex mutex;
+
+    // Perform algorithm to retrieve document text
+    switch (algorithm)
+    {
+        case Algorithm::Layout:
+        {
+            PDFFontCache fontCache(DEFAULT_FONT_CACHE_LIMIT, DEFAULT_REALIZED_FONT_CACHE_LIMIT);
+            fontCache.setCacheShrinkEnabled(nullptr, false);
+
+            std::map<PDFInteger, PDFDocumentTextFlow::Items> items;
+
+            PDFCMSGeneric cms;
+            PDFMeshQualitySettings mqs;
+            PDFOptionalContentActivity oca(document, OCUsage::Export, nullptr);
+
+            auto generateTextLayout = [this, &items, &mutex, &fontCache, &cms, &mqs, &oca, document, catalog](PDFInteger pageIndex)
+            {
+                const PDFPage* page = catalog->getPage(pageIndex);
+                if (!page)
+                {
+                    // getPage() returned null - treat the page index as invalid and skip it
+                    return;
+                }
+
+                PDFTextLayoutGenerator generator(PDFRenderer::IgnoreOptionalContent, page, document, &fontCache, &cms, &oca, QMatrix(), mqs);
+                QList<PDFRenderError> errors = generator.processContents();
+                PDFTextLayout textLayout = generator.createTextLayout();
+                PDFTextFlows textFlows = PDFTextFlow::createTextFlows(textLayout, PDFTextFlow::FlowFlags(PDFTextFlow::SeparateBlocks) | PDFTextFlow::RemoveSoftHyphen, pageIndex);
+
+                PDFDocumentTextFlow::Items flowItems;
+                flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::PageStart, pageIndex, QString() });
+                for (const PDFTextFlow& textFlow : textFlows)
+                {
+                    flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::Text, pageIndex, textFlow.getText() });
+                }
+                flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::PageEnd, pageIndex, QString() });
+
+                // Guard the shared per-page result map and the error list with the mutex
+                QMutexLocker lock(&mutex);
+                items[pageIndex] = qMove(flowItems);
+                m_errors.append(qMove(errors));
+            };
+
+            PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, pageIndices.begin(), pageIndices.end(), generateTextLayout);
+
+            fontCache.setCacheShrinkEnabled(nullptr, true);
+
+            PDFDocumentTextFlow::Items flowItems;
+            for (const auto& item : items)
+            {
+                flowItems.insert(flowItems.end(), std::make_move_iterator(item.second.begin()), std::make_move_iterator(item.second.end()));
+            }
+
+            result = PDFDocumentTextFlow(qMove(flowItems));
+            break;
+        }
+
+        default:
+            // Only Algorithm::Layout is handled by this switch; any other algorithm ends up here
+            Q_ASSERT(false);
+            break;
+    }
+
+    return result;
+}
+
+} // namespace pdf
diff --git a/PdfForQtLib/sources/pdfdocumenttextflow.h b/PdfForQtLib/sources/pdfdocumenttextflow.h
new file mode 100644
index 0000000..332c787
--- /dev/null
+++ b/PdfForQtLib/sources/pdfdocumenttextflow.h
@@ -0,0 +1,109 @@
+// Copyright (C) 2020 Jakub Melka
+//
+// This file is part of PdfForQt.
+//
+// PdfForQt is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// PdfForQt is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with PDFForQt. If not, see <https://www.gnu.org/licenses/>.
+
+#ifndef PDFDOCUMENTTEXTFLOW_H
+#define PDFDOCUMENTTEXTFLOW_H
+
+#include "pdfglobal.h"
+#include "pdfexception.h"
+
+namespace pdf
+{
+class PDFDocument;
+
+/// Text flow extracted from document. Text flow can be created by \p PDFDocumentTextFlowFactory.
+/// Flow can contain various items, not just text ones. Also, some manipulation functions
+/// are available, they can modify text flow by various content.
+class PDFFORQTLIBSHARED_EXPORT PDFDocumentTextFlow
+{
+public:
+    enum Flag
+    {
+        None                            = 0x0000,   ///< No text flag
+        Text                            = 0x0001,   ///< Ordinary text
+        PageStart                       = 0x0002,   ///< Page start marker
+        PageEnd                         = 0x0004,   ///< Page end marker
+        StructureTitle                  = 0x0008,   ///< Structure tree item title
+        StructureLanguage               = 0x0010,   ///< Structure tree item language
+        StructureAlternativeDescription = 0x0020,   ///< Structure tree item alternative description
+        StructureExpandedForm           = 0x0040,   ///< Structure tree item expanded form of text
+        StructureActualText             = 0x0080,   ///< Structure tree item actual text
+        StructurePhoneme                = 0x0100,   ///< Structure tree item phoneme
+        StructureItemStart              = 0x0200,   ///< Start of structure tree item
+        StructureItemEnd                = 0x0400,   ///< End of structure tree item
+        StructureEmpty                  = 0x0800,   ///< Structure tree item doesn't contain any text
+    };
+    Q_DECLARE_FLAGS(Flags, Flag)
+
+    struct Item
+    {
+        Flags flags = None;
+        PDFInteger pageIndex = 0;
+        QString text;
+    };
+    using Items = std::vector<Item>;
+
+    explicit PDFDocumentTextFlow() = default;
+    explicit PDFDocumentTextFlow(Items&& items) :
+        m_items(qMove(items))
+    {
+
+    }
+
+    const Items& getItems() const { return m_items; }
+
+private:
+    Items m_items;
+};
+
+/// This factory creates text flow for whole document
+class PDFFORQTLIBSHARED_EXPORT PDFDocumentTextFlowFactory
+{
+public:
+    explicit PDFDocumentTextFlowFactory() = default;
+
+    enum class Algorithm
+    {
+        Auto,       ///< Determine best text layout algorithm automatically
+        Layout,     ///< Use text layout recognition using docstrum algorithm
+        Content,    ///< Use content-stream text layout recognition (usually unreliable), but fast
+        Structure,  ///< Use structure oriented text layout recognition (requires tagged document)
+    };
+
+    /// Performs document text flow analysis using given algorithm. Text flow
+    /// can be performed only for given subset of pages, if required.
+    /// \param document Document
+    /// \param pageIndices Analyzed page indices
+    /// \param algorithm Algorithm
+    /// \return Text flow with the items collected for the analyzed pages
+    PDFDocumentTextFlow create(const PDFDocument* document,
+                               const std::vector<PDFInteger>& pageIndices,
+                               Algorithm algorithm);
+
+    /// Has some error/warning occurred during text layout creation?
+    bool hasError() const { return !m_errors.isEmpty(); }
+
+    /// Returns a list of errors/warnings
+    const QList<PDFRenderError>& getErrors() const { return m_errors; }
+
+private:
+    QList<PDFRenderError> m_errors;
+};
+
+} // namespace pdf
+
+#endif // PDFDOCUMENTTEXTFLOW_H