mirror of https://github.com/JakubMelka/PDF4QT.git
Document text flow (first part)
This commit is contained in:
parent
b0831a84a7
commit
a656e9857f
|
@ -49,6 +49,7 @@ SOURCES += \
|
||||||
sources/pdfcms.cpp \
|
sources/pdfcms.cpp \
|
||||||
sources/pdfcompiler.cpp \
|
sources/pdfcompiler.cpp \
|
||||||
sources/pdfdocumentbuilder.cpp \
|
sources/pdfdocumentbuilder.cpp \
|
||||||
|
sources/pdfdocumenttextflow.cpp \
|
||||||
sources/pdfdocumentwriter.cpp \
|
sources/pdfdocumentwriter.cpp \
|
||||||
sources/pdfexecutionpolicy.cpp \
|
sources/pdfexecutionpolicy.cpp \
|
||||||
sources/pdffile.cpp \
|
sources/pdffile.cpp \
|
||||||
|
@ -104,6 +105,7 @@ HEADERS += \
|
||||||
sources/pdfcompiler.h \
|
sources/pdfcompiler.h \
|
||||||
sources/pdfdocumentbuilder.h \
|
sources/pdfdocumentbuilder.h \
|
||||||
sources/pdfdocumentdrawinterface.h \
|
sources/pdfdocumentdrawinterface.h \
|
||||||
|
sources/pdfdocumenttextflow.h \
|
||||||
sources/pdfdocumentwriter.h \
|
sources/pdfdocumentwriter.h \
|
||||||
sources/pdfexecutionpolicy.h \
|
sources/pdfexecutionpolicy.h \
|
||||||
sources/pdffile.h \
|
sources/pdffile.h \
|
||||||
|
|
|
@ -185,38 +185,6 @@ void PDFAsynchronousPageCompiler::onPageCompiled()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
class PDFTextLayoutGenerator : public PDFPageContentProcessor
|
|
||||||
{
|
|
||||||
using BaseClass = PDFPageContentProcessor;
|
|
||||||
|
|
||||||
public:
|
|
||||||
explicit PDFTextLayoutGenerator(PDFRenderer::Features features,
|
|
||||||
const PDFPage* page,
|
|
||||||
const PDFDocument* document,
|
|
||||||
const PDFFontCache* fontCache,
|
|
||||||
const PDFCMS* cms,
|
|
||||||
const PDFOptionalContentActivity* optionalContentActivity,
|
|
||||||
QMatrix pagePointToDevicePointMatrix,
|
|
||||||
const PDFMeshQualitySettings& meshQualitySettings) :
|
|
||||||
BaseClass(page, document, fontCache, cms, optionalContentActivity, pagePointToDevicePointMatrix, meshQualitySettings),
|
|
||||||
m_features(features)
|
|
||||||
{
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Creates text layout from the text
|
|
||||||
PDFTextLayout createTextLayout();
|
|
||||||
|
|
||||||
protected:
|
|
||||||
virtual bool isContentSuppressedByOC(PDFObjectReference ocgOrOcmd) override;
|
|
||||||
virtual bool isContentKindSuppressed(ContentKind kind) const override;
|
|
||||||
virtual void performOutputCharacter(const PDFTextCharacterInfo& info) override;
|
|
||||||
|
|
||||||
private:
|
|
||||||
PDFRenderer::Features m_features;
|
|
||||||
PDFTextLayout m_textLayout;
|
|
||||||
};
|
|
||||||
|
|
||||||
PDFTextLayout PDFTextLayoutGenerator::createTextLayout()
|
PDFTextLayout PDFTextLayoutGenerator::createTextLayout()
|
||||||
{
|
{
|
||||||
m_textLayout.perform();
|
m_textLayout.perform();
|
||||||
|
|
|
@ -168,6 +168,38 @@ private:
|
||||||
QFutureWatcher<PDFTextLayoutStorage> m_textLayoutCompileFutureWatcher;
|
QFutureWatcher<PDFTextLayoutStorage> m_textLayoutCompileFutureWatcher;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
class PDFTextLayoutGenerator : public PDFPageContentProcessor
|
||||||
|
{
|
||||||
|
using BaseClass = PDFPageContentProcessor;
|
||||||
|
|
||||||
|
public:
|
||||||
|
explicit PDFTextLayoutGenerator(PDFRenderer::Features features,
|
||||||
|
const PDFPage* page,
|
||||||
|
const PDFDocument* document,
|
||||||
|
const PDFFontCache* fontCache,
|
||||||
|
const PDFCMS* cms,
|
||||||
|
const PDFOptionalContentActivity* optionalContentActivity,
|
||||||
|
QMatrix pagePointToDevicePointMatrix,
|
||||||
|
const PDFMeshQualitySettings& meshQualitySettings) :
|
||||||
|
BaseClass(page, document, fontCache, cms, optionalContentActivity, pagePointToDevicePointMatrix, meshQualitySettings),
|
||||||
|
m_features(features)
|
||||||
|
{
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates text layout from the text
|
||||||
|
PDFTextLayout createTextLayout();
|
||||||
|
|
||||||
|
protected:
|
||||||
|
virtual bool isContentSuppressedByOC(PDFObjectReference ocgOrOcmd) override;
|
||||||
|
virtual bool isContentKindSuppressed(ContentKind kind) const override;
|
||||||
|
virtual void performOutputCharacter(const PDFTextCharacterInfo& info) override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
PDFRenderer::Features m_features;
|
||||||
|
PDFTextLayout m_textLayout;
|
||||||
|
};
|
||||||
|
|
||||||
} // namespace pdf
|
} // namespace pdf
|
||||||
|
|
||||||
#endif // PDFCOMPILER_H
|
#endif // PDFCOMPILER_H
|
||||||
|
|
|
@ -0,0 +1,122 @@
|
||||||
|
// Copyright (C) 2020 Jakub Melka
|
||||||
|
//
|
||||||
|
// This file is part of PdfForQt.
|
||||||
|
//
|
||||||
|
// PdfForQt is free software: you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU Lesser General Public License as published by
|
||||||
|
// the Free Software Foundation, either version 3 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
//
|
||||||
|
// PdfForQt is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU Lesser General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU Lesser General Public License
|
||||||
|
// along with PDFForQt. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
#include "pdfdocumenttextflow.h"
|
||||||
|
#include "pdfdocument.h"
|
||||||
|
#include "pdfstructuretree.h"
|
||||||
|
#include "pdfcompiler.h"
|
||||||
|
#include "pdfexecutionpolicy.h"
|
||||||
|
#include "pdfconstants.h"
|
||||||
|
#include "pdfcms.h"
|
||||||
|
|
||||||
|
namespace pdf
|
||||||
|
{
|
||||||
|
|
||||||
|
/// Builds the text flow of \p document for the pages listed in \p pageIndices.
/// With Algorithm::Auto, the structure-based algorithm is selected when the
/// catalog reports a marked logical structure and the structure tree parses
/// as valid; otherwise the layout algorithm is used. Only Algorithm::Layout
/// is implemented here, any other resolved algorithm hits the assertion in
/// the default branch and yields an empty text flow.
/// Render errors reported while processing pages are appended to m_errors.
/// NOTE(review): m_errors is never cleared, so errors accumulate across
/// repeated calls on the same factory - confirm this is intended.
/// \param document Document
/// \param pageIndices Analyzed page indices (invalid indices are ignored)
/// \param algorithm Algorithm
PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* document, const std::vector<PDFInteger>& pageIndices, Algorithm algorithm)
{
    PDFDocumentTextFlow result;
    PDFStructureTree structureTree;

    const PDFCatalog* catalog = document->getCatalog();
    if (algorithm == Algorithm::Auto || algorithm == Algorithm::Structure)
    {
        // The structure tree is needed both for the automatic decision and
        // for the structure oriented algorithm itself.
        structureTree = PDFStructureTree::parse(&document->getStorage(), catalog->getStructureTreeRoot());
    }

    if (algorithm == Algorithm::Auto)
    {
        // Determine algorithm
        if (catalog->isLogicalStructureMarked() && structureTree.isValid())
        {
            algorithm = Algorithm::Structure;
        }
        else
        {
            algorithm = Algorithm::Layout;
        }
    }

    Q_ASSERT(algorithm != Algorithm::Auto);

    // Guards 'items' and 'm_errors', which are written from the per-page functor
    QMutex mutex;

    // Perform algorithm to retrieve document text
    switch (algorithm)
    {
        case Algorithm::Layout:
        {
            PDFFontCache fontCache(DEFAULT_FONT_CACHE_LIMIT, DEFAULT_REALIZED_FONT_CACHE_LIMIT);

            // Cache shrinking is switched off while the pages are processed
            // and switched on again afterwards.
            fontCache.setCacheShrinkEnabled(nullptr, false);

            // Per-page results keyed by page index; std::map keeps them ordered
            // by page index no matter in which order the pages get finished.
            std::map<PDFInteger, PDFDocumentTextFlow::Items> items;

            PDFCMSGeneric cms;
            PDFMeshQualitySettings mqs;
            PDFOptionalContentActivity oca(document, OCUsage::Export, nullptr);

            auto generateTextLayout = [this, &items, &mutex, &fontCache, &cms, &mqs, &oca, document, catalog](PDFInteger pageIndex)
            {
                // Single lookup; a null page means the page index is invalid
                const PDFPage* page = catalog->getPage(pageIndex);
                if (!page)
                {
                    return;
                }

                PDFTextLayoutGenerator generator(PDFRenderer::IgnoreOptionalContent, page, document, &fontCache, &cms, &oca, QMatrix(), mqs);
                QList<PDFRenderError> errors = generator.processContents();
                PDFTextLayout textLayout = generator.createTextLayout();
                PDFTextFlows textFlows = PDFTextFlow::createTextFlows(textLayout, PDFTextFlow::FlowFlags(PDFTextFlow::SeparateBlocks) | PDFTextFlow::RemoveSoftHyphen, pageIndex);

                // Each page contributes: PageStart, one Text item per flow, PageEnd
                PDFDocumentTextFlow::Items flowItems;
                flowItems.reserve(textFlows.size() + 2);
                flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::PageStart, pageIndex, QString() });
                for (const PDFTextFlow& textFlow : textFlows)
                {
                    flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::Text, pageIndex, textFlow.getText() });
                }
                flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::PageEnd, pageIndex, QString() });

                // Only the publication of the results is done under the lock
                QMutexLocker lock(&mutex);
                items[pageIndex] = qMove(flowItems);
                m_errors.append(qMove(errors));
            };

            // Presumably runs the functor for every page index, possibly
            // concurrently (hence the mutex above) - see PDFExecutionPolicy.
            PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, pageIndices.begin(), pageIndices.end(), generateTextLayout);

            fontCache.setCacheShrinkEnabled(nullptr, true);

            // Concatenate per-page items in page index order
            PDFDocumentTextFlow::Items::size_type totalItemCount = 0;
            for (const auto& item : items)
            {
                totalItemCount += item.second.size();
            }

            PDFDocumentTextFlow::Items flowItems;
            flowItems.reserve(totalItemCount);

            // Iterate by non-const reference: move iterators obtained through
            // a const reference would silently copy every item instead.
            for (auto& item : items)
            {
                flowItems.insert(flowItems.end(), std::make_move_iterator(item.second.begin()), std::make_move_iterator(item.second.end()));
            }

            result = PDFDocumentTextFlow(qMove(flowItems));
            break;
        }

        default:
            // Content and Structure algorithms are not implemented here
            Q_ASSERT(false);
            break;
    }

    return result;
}
|
||||||
|
|
||||||
|
} // namespace pdf
|
|
@ -0,0 +1,109 @@
|
||||||
|
// Copyright (C) 2020 Jakub Melka
|
||||||
|
//
|
||||||
|
// This file is part of PdfForQt.
|
||||||
|
//
|
||||||
|
// PdfForQt is free software: you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU Lesser General Public License as published by
|
||||||
|
// the Free Software Foundation, either version 3 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
//
|
||||||
|
// PdfForQt is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU Lesser General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU Lesser General Public License
|
||||||
|
// along with PDFForQt. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
#ifndef PDFDOCUMENTTEXTFLOW_H
|
||||||
|
#define PDFDOCUMENTTEXTFLOW_H
|
||||||
|
|
||||||
|
#include "pdfglobal.h"
|
||||||
|
#include "pdfexception.h"
|
||||||
|
|
||||||
|
namespace pdf
|
||||||
|
{
|
||||||
|
class PDFDocument;
|
||||||
|
|
||||||
|
/// Text flow extracted from document. Text flow can be created \p PDFDocumentTextFlowFactory.
|
||||||
|
/// Flow can contain various items, not just text ones. Also, some manipulation functions
|
||||||
|
/// are available, they can modify text flow by various content.
|
||||||
|
class PDFFORQTLIBSHARED_EXPORT PDFDocumentTextFlow
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
|
||||||
|
enum Flag
|
||||||
|
{
|
||||||
|
None = 0x0000, ///< No text flag
|
||||||
|
Text = 0x0001, ///< Ordinary text
|
||||||
|
PageStart = 0x0002, ///< Page start marker
|
||||||
|
PageEnd = 0x0004, ///< Page end marker
|
||||||
|
StructureTitle = 0x0008, ///< Structure tree item title
|
||||||
|
StructureLanguage = 0x0010, ///< Structure tree item language
|
||||||
|
StructureAlternativeDescription = 0x0020, ///< Structure tree item alternative description
|
||||||
|
StructureExpandedForm = 0x0040, ///< Structure tree item expanded form of text
|
||||||
|
StructureActualText = 0x0080, ///< Structure tree item actual text
|
||||||
|
StructurePhoneme = 0x0100, ///< Structure tree item phoneme
|
||||||
|
StructureItemStart = 0x0200, ///< Start of structure tree item
|
||||||
|
StructureItemEnd = 0x0400, ///< End of structure tree item
|
||||||
|
StructureEmpty = 0x0800, ///< Structure tree item doesn't contain any text
|
||||||
|
};
|
||||||
|
Q_DECLARE_FLAGS(Flags, Flag)
|
||||||
|
|
||||||
|
struct Item
|
||||||
|
{
|
||||||
|
Flags flags = None;
|
||||||
|
PDFInteger pageIndex = 0;
|
||||||
|
QString text;
|
||||||
|
};
|
||||||
|
using Items = std::vector<Item>;
|
||||||
|
|
||||||
|
explicit PDFDocumentTextFlow() = default;
|
||||||
|
explicit PDFDocumentTextFlow(Items&& items) :
|
||||||
|
m_items(qMove(items))
|
||||||
|
{
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
const Items& getItems() const { return m_items; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
Items m_items;
|
||||||
|
};
|
||||||
|
|
||||||
|
/// This factory creates text flow for whole document
|
||||||
|
class PDFDocumentTextFlowFactory
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
explicit PDFDocumentTextFlowFactory() = default;
|
||||||
|
|
||||||
|
enum class Algorithm
|
||||||
|
{
|
||||||
|
Auto, ///< Determine best text layout algorithm automatically
|
||||||
|
Layout, ///< Use text layout recognition using docstrum algorithm
|
||||||
|
Content, ///< Use content-stream text layout recognition (usually unreliable), but fast
|
||||||
|
Structure, ///< Use structure oriented text layout recognition (requires tagged document)
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Performs document text flow analysis using given algorithm. Text flow
|
||||||
|
/// can be performed only for given subset of pages, if required.
|
||||||
|
/// \param document Document
|
||||||
|
/// \param pageIndices Analyzed page indices
|
||||||
|
/// \param algorithm Algorithm
|
||||||
|
PDFDocumentTextFlow create(const PDFDocument* document,
|
||||||
|
const std::vector<PDFInteger>& pageIndices,
|
||||||
|
Algorithm algorithm);
|
||||||
|
|
||||||
|
/// Has some error/warning occured during text layout creation?
|
||||||
|
bool hasError() const { return !m_errors.isEmpty(); }
|
||||||
|
|
||||||
|
/// Returns a list of errors/warnings
|
||||||
|
const QList<PDFRenderError>& getErrors() const { return m_errors; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
QList<PDFRenderError> m_errors;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace pdf
|
||||||
|
|
||||||
|
#endif // PDFDOCUMENTTEXTFLOW_H
|
Loading…
Reference in New Issue