Document text flow (first part)

This commit is contained in:
Jakub Melka 2020-10-10 18:38:30 +02:00
parent b0831a84a7
commit a656e9857f
5 changed files with 265 additions and 32 deletions

View File

@ -49,6 +49,7 @@ SOURCES += \
sources/pdfcms.cpp \ sources/pdfcms.cpp \
sources/pdfcompiler.cpp \ sources/pdfcompiler.cpp \
sources/pdfdocumentbuilder.cpp \ sources/pdfdocumentbuilder.cpp \
sources/pdfdocumenttextflow.cpp \
sources/pdfdocumentwriter.cpp \ sources/pdfdocumentwriter.cpp \
sources/pdfexecutionpolicy.cpp \ sources/pdfexecutionpolicy.cpp \
sources/pdffile.cpp \ sources/pdffile.cpp \
@ -104,6 +105,7 @@ HEADERS += \
sources/pdfcompiler.h \ sources/pdfcompiler.h \
sources/pdfdocumentbuilder.h \ sources/pdfdocumentbuilder.h \
sources/pdfdocumentdrawinterface.h \ sources/pdfdocumentdrawinterface.h \
sources/pdfdocumenttextflow.h \
sources/pdfdocumentwriter.h \ sources/pdfdocumentwriter.h \
sources/pdfexecutionpolicy.h \ sources/pdfexecutionpolicy.h \
sources/pdffile.h \ sources/pdffile.h \

View File

@ -185,38 +185,6 @@ void PDFAsynchronousPageCompiler::onPageCompiled()
} }
} }
class PDFTextLayoutGenerator : public PDFPageContentProcessor
{
using BaseClass = PDFPageContentProcessor;
public:
explicit PDFTextLayoutGenerator(PDFRenderer::Features features,
const PDFPage* page,
const PDFDocument* document,
const PDFFontCache* fontCache,
const PDFCMS* cms,
const PDFOptionalContentActivity* optionalContentActivity,
QMatrix pagePointToDevicePointMatrix,
const PDFMeshQualitySettings& meshQualitySettings) :
BaseClass(page, document, fontCache, cms, optionalContentActivity, pagePointToDevicePointMatrix, meshQualitySettings),
m_features(features)
{
}
/// Creates text layout from the text
PDFTextLayout createTextLayout();
protected:
virtual bool isContentSuppressedByOC(PDFObjectReference ocgOrOcmd) override;
virtual bool isContentKindSuppressed(ContentKind kind) const override;
virtual void performOutputCharacter(const PDFTextCharacterInfo& info) override;
private:
PDFRenderer::Features m_features;
PDFTextLayout m_textLayout;
};
PDFTextLayout PDFTextLayoutGenerator::createTextLayout() PDFTextLayout PDFTextLayoutGenerator::createTextLayout()
{ {
m_textLayout.perform(); m_textLayout.perform();

View File

@ -168,6 +168,38 @@ private:
QFutureWatcher<PDFTextLayoutStorage> m_textLayoutCompileFutureWatcher; QFutureWatcher<PDFTextLayoutStorage> m_textLayoutCompileFutureWatcher;
}; };
class PDFTextLayoutGenerator : public PDFPageContentProcessor
{
using BaseClass = PDFPageContentProcessor;
public:
explicit PDFTextLayoutGenerator(PDFRenderer::Features features,
const PDFPage* page,
const PDFDocument* document,
const PDFFontCache* fontCache,
const PDFCMS* cms,
const PDFOptionalContentActivity* optionalContentActivity,
QMatrix pagePointToDevicePointMatrix,
const PDFMeshQualitySettings& meshQualitySettings) :
BaseClass(page, document, fontCache, cms, optionalContentActivity, pagePointToDevicePointMatrix, meshQualitySettings),
m_features(features)
{
}
/// Creates text layout from the text
PDFTextLayout createTextLayout();
protected:
virtual bool isContentSuppressedByOC(PDFObjectReference ocgOrOcmd) override;
virtual bool isContentKindSuppressed(ContentKind kind) const override;
virtual void performOutputCharacter(const PDFTextCharacterInfo& info) override;
private:
PDFRenderer::Features m_features;
PDFTextLayout m_textLayout;
};
} // namespace pdf } // namespace pdf
#endif // PDFCOMPILER_H #endif // PDFCOMPILER_H

View File

@ -0,0 +1,122 @@
// Copyright (C) 2020 Jakub Melka
//
// This file is part of PdfForQt.
//
// PdfForQt is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// PdfForQt is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with PdfForQt. If not, see <https://www.gnu.org/licenses/>.
#include "pdfdocumenttextflow.h"
#include "pdfdocument.h"
#include "pdfstructuretree.h"
#include "pdfcompiler.h"
#include "pdfexecutionpolicy.h"
#include "pdfconstants.h"
#include "pdfcms.h"
namespace pdf
{
/// Extracts the text flow of the requested pages of \p document.
///
/// If \p algorithm is Algorithm::Auto, the structure tree is parsed and
/// Algorithm::Structure is selected when the catalog reports the logical
/// structure as marked and the parsed tree is valid; otherwise
/// Algorithm::Layout is selected.
///
/// Only Algorithm::Layout is handled by this function. Any other resolved
/// algorithm reaches the default branch, which triggers Q_ASSERT(false) and
/// (where the assertion is inactive) returns an empty text flow.
///
/// Render errors reported while page contents are processed are appended
/// to m_errors; errors from earlier calls are not cleared here.
///
/// \param document Document (dereferenced without a null check)
/// \param pageIndices Indices of analyzed pages; the output is ordered by
///        ascending page index, independent of the order given here
/// \param algorithm Algorithm
/// \returns Text flow consisting of a PageStart item, the text items and
///          a PageEnd item for each processed page
PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* document, const std::vector<PDFInteger>& pageIndices, Algorithm algorithm)
{
    PDFDocumentTextFlow result;
    PDFStructureTree structureTree;

    const PDFCatalog* catalog = document->getCatalog();
    if (algorithm == Algorithm::Auto || algorithm == Algorithm::Structure)
    {
        structureTree = PDFStructureTree::parse(&document->getStorage(), catalog->getStructureTreeRoot());
    }

    if (algorithm == Algorithm::Auto)
    {
        // Determine algorithm
        if (catalog->isLogicalStructureMarked() && structureTree.isValid())
        {
            algorithm = Algorithm::Structure;
        }
        else
        {
            algorithm = Algorithm::Layout;
        }
    }

    Q_ASSERT(algorithm != Algorithm::Auto);

    // Guards 'items' and 'm_errors', which are written from the per-page functor
    QMutex mutex;

    // Perform algorithm to retrieve document text
    switch (algorithm)
    {
        case Algorithm::Layout:
        {
            PDFFontCache fontCache(DEFAULT_FONT_CACHE_LIMIT, DEFAULT_REALIZED_FONT_CACHE_LIMIT);
            fontCache.setCacheShrinkEnabled(nullptr, false);

            // Keyed by page index, so the final merge visits pages in ascending order
            std::map<PDFInteger, PDFDocumentTextFlow::Items> items;

            PDFCMSGeneric cms;
            PDFMeshQualitySettings mqs;
            PDFOptionalContentActivity oca(document, OCUsage::Export, nullptr);

            auto generateTextLayout = [this, &items, &mutex, &fontCache, &cms, &mqs, &oca, document, catalog](PDFInteger pageIndex)
            {
                // Single lookup; the pointer is reused below
                const PDFPage* page = catalog->getPage(pageIndex);
                if (!page)
                {
                    // Invalid page index
                    return;
                }

                PDFTextLayoutGenerator generator(PDFRenderer::IgnoreOptionalContent, page, document, &fontCache, &cms, &oca, QMatrix(), mqs);
                QList<PDFRenderError> errors = generator.processContents();
                PDFTextLayout textLayout = generator.createTextLayout();
                PDFTextFlows textFlows = PDFTextFlow::createTextFlows(textLayout, PDFTextFlow::FlowFlags(PDFTextFlow::SeparateBlocks) | PDFTextFlow::RemoveSoftHyphen, pageIndex);

                PDFDocumentTextFlow::Items flowItems;
                flowItems.reserve(textFlows.size() + 2);
                flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::PageStart, pageIndex, QString() });
                for (const PDFTextFlow& textFlow : textFlows)
                {
                    flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::Text, pageIndex, textFlow.getText() });
                }
                flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::PageEnd, pageIndex, QString() });

                // Publish the page result; only the shared containers are touched under the lock
                QMutexLocker lock(&mutex);
                items[pageIndex] = qMove(flowItems);
                m_errors.append(qMove(errors));
            };

            PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, pageIndices.begin(), pageIndices.end(), generateTextLayout);

            fontCache.setCacheShrinkEnabled(nullptr, true);

            size_t totalItemCount = 0;
            for (const auto& item : items)
            {
                totalItemCount += item.second.size();
            }

            PDFDocumentTextFlow::Items flowItems;
            flowItems.reserve(totalItemCount);

            // Iterate by non-const reference: move iterators over const elements
            // would silently fall back to copying each item.
            for (auto& item : items)
            {
                flowItems.insert(flowItems.end(), std::make_move_iterator(item.second.begin()), std::make_move_iterator(item.second.end()));
            }

            result = PDFDocumentTextFlow(qMove(flowItems));
            break;
        }

        default:
            // Algorithm::Content and Algorithm::Structure are not handled here
            Q_ASSERT(false);
            break;
    }

    return result;
}
} // namespace pdf

View File

@ -0,0 +1,109 @@
// Copyright (C) 2020 Jakub Melka
//
// This file is part of PdfForQt.
//
// PdfForQt is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// PdfForQt is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with PdfForQt. If not, see <https://www.gnu.org/licenses/>.
#ifndef PDFDOCUMENTTEXTFLOW_H
#define PDFDOCUMENTTEXTFLOW_H
#include "pdfglobal.h"
#include "pdfexception.h"
namespace pdf
{
class PDFDocument;
/// Text flow extracted from document. Text flow can be created by \p PDFDocumentTextFlowFactory.
/// The flow is a sequence of items; besides ordinary text items it can also hold
/// marker items (such as page start/end) and structure tree related items.
class PDFFORQTLIBSHARED_EXPORT PDFDocumentTextFlow
{
public:
    /// Kind of a text flow item. Each enumerator occupies a single bit,
    /// so the values can be combined in \p Flags.
    enum Flag
    {
        None                            = 0,        ///< No text flag
        Text                            = 1 << 0,   ///< Ordinary text
        PageStart                       = 1 << 1,   ///< Page start marker
        PageEnd                         = 1 << 2,   ///< Page end marker
        StructureTitle                  = 1 << 3,   ///< Structure tree item title
        StructureLanguage               = 1 << 4,   ///< Structure tree item language
        StructureAlternativeDescription = 1 << 5,   ///< Structure tree item alternative description
        StructureExpandedForm           = 1 << 6,   ///< Structure tree item expanded form of text
        StructureActualText             = 1 << 7,   ///< Structure tree item actual text
        StructurePhoneme                = 1 << 8,   ///< Structure tree item phoneme
        StructureItemStart              = 1 << 9,   ///< Start of structure tree item
        StructureItemEnd                = 1 << 10,  ///< End of structure tree item
        StructureEmpty                  = 1 << 11,  ///< Structure tree item doesn't contain any text
    };
    Q_DECLARE_FLAGS(Flags, Flag)

    /// Single item of the text flow
    struct Item
    {
        Flags flags = None;         ///< Kind of the item
        PDFInteger pageIndex = 0;   ///< Index of the page the item belongs to
        QString text;               ///< Text of the item (empty for the page start/end markers)
    };
    using Items = std::vector<Item>;

    /// Constructs an empty text flow
    explicit PDFDocumentTextFlow() = default;

    /// Constructs a text flow which takes over the given items
    /// \param items Items, moved into this object
    explicit PDFDocumentTextFlow(Items&& items) : m_items(qMove(items)) { }

    /// Returns the items of this text flow
    const Items& getItems() const
    {
        return m_items;
    }

private:
    Items m_items;
};
/// This factory creates text flow for whole document. Errors and warnings
/// reported during creation are kept in the factory and can be queried
/// by hasError() / getErrors().
// NOTE(review): unlike PDFDocumentTextFlow, this class is not marked with
// PDFFORQTLIBSHARED_EXPORT - confirm whether it has to be usable outside the library.
class PDFDocumentTextFlowFactory
{
public:
    /// Text layout recognition algorithm
    enum class Algorithm
    {
        Auto,       ///< Determine best text layout algorithm automatically
        Layout,     ///< Use text layout recognition using docstrum algorithm
        Content,    ///< Use content-stream text layout recognition (usually unreliable), but fast
        Structure,  ///< Use structure oriented text layout recognition (requires tagged document)
    };

    explicit PDFDocumentTextFlowFactory() = default;

    /// Performs document text flow analysis using given algorithm. Text flow
    /// can be performed only for given subset of pages, if required.
    /// \param document Document
    /// \param pageIndices Analyzed page indices
    /// \param algorithm Algorithm
    /// \returns Text flow of the analyzed pages
    PDFDocumentTextFlow create(const PDFDocument* document,
                               const std::vector<PDFInteger>& pageIndices,
                               Algorithm algorithm);

    /// Has some error/warning occurred during text layout creation?
    bool hasError() const
    {
        return !m_errors.isEmpty();
    }

    /// Returns a list of errors/warnings
    const QList<PDFRenderError>& getErrors() const
    {
        return m_errors;
    }

private:
    QList<PDFRenderError> m_errors;     ///< Errors/warnings collected by create()
};
} // namespace pdf
#endif // PDFDOCUMENTTEXTFLOW_H