mirror of
https://github.com/JakubMelka/PDF4QT.git
synced 2025-04-03 13:11:03 +02:00
Document content flow basics
This commit is contained in:
parent
b0f8e1f1e3
commit
0ccdb1e46f
@ -1362,6 +1362,7 @@ void PDFAnnotationManager::drawAnnotationUsingAppearanceStream(const PageAnnotat
|
|||||||
QByteArray content = m_document->getDecodedStream(formStream);
|
QByteArray content = m_document->getDecodedStream(formStream);
|
||||||
PDFObject resources = m_document->getObject(formDictionary->get("Resources"));
|
PDFObject resources = m_document->getObject(formDictionary->get("Resources"));
|
||||||
PDFObject transparencyGroup = m_document->getObject(formDictionary->get("Group"));
|
PDFObject transparencyGroup = m_document->getObject(formDictionary->get("Group"));
|
||||||
|
const PDFInteger formStructuralParentKey = loader.readIntegerFromDictionary(formDictionary, "StructParent", page->getStructureParentKey());
|
||||||
|
|
||||||
if (formBoundingBox.isEmpty() || annotationRectangle.isEmpty())
|
if (formBoundingBox.isEmpty() || annotationRectangle.isEmpty())
|
||||||
{
|
{
|
||||||
@ -1409,7 +1410,7 @@ void PDFAnnotationManager::drawAnnotationUsingAppearanceStream(const PageAnnotat
|
|||||||
|
|
||||||
if (isContentVisible)
|
if (isContentVisible)
|
||||||
{
|
{
|
||||||
pdfPainter.processForm(AA, formBoundingBox, resources, transparencyGroup, content);
|
pdfPainter.processForm(AA, formBoundingBox, resources, transparencyGroup, content, formStructuralParentKey);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -227,7 +227,7 @@ bool PDFTextLayoutGenerator::isContentKindSuppressed(ContentKind kind) const
|
|||||||
|
|
||||||
void PDFTextLayoutGenerator::performOutputCharacter(const PDFTextCharacterInfo& info)
|
void PDFTextLayoutGenerator::performOutputCharacter(const PDFTextCharacterInfo& info)
|
||||||
{
|
{
|
||||||
if (!isContentSuppressed())
|
if (!isContentSuppressed() && !info.character.isSpace())
|
||||||
{
|
{
|
||||||
m_textLayout.addCharacter(info);
|
m_textLayout.addCharacter(info);
|
||||||
}
|
}
|
||||||
|
@ -26,6 +26,483 @@
|
|||||||
namespace pdf
|
namespace pdf
|
||||||
{
|
{
|
||||||
|
|
||||||
|
|
||||||
|
class PDFStructureTreeReferenceCollector : public PDFStructureTreeAbstractVisitor
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
explicit inline PDFStructureTreeReferenceCollector(std::map<PDFObjectReference, const PDFStructureItem*>* mapping) :
|
||||||
|
m_mapping(mapping)
|
||||||
|
{
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual void visitStructureTree(const PDFStructureTree* structureTree) override;
|
||||||
|
virtual void visitStructureElement(const PDFStructureElement* structureElement) override;
|
||||||
|
virtual void visitStructureMarkedContentReference(const PDFStructureMarkedContentReference* structureMarkedContentReference) override;
|
||||||
|
virtual void visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference) override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
void addReference(const PDFStructureItem* structureObjectReference);
|
||||||
|
|
||||||
|
std::map<PDFObjectReference, const PDFStructureItem*>* m_mapping;
|
||||||
|
};
|
||||||
|
|
||||||
|
void PDFStructureTreeReferenceCollector::visitStructureTree(const PDFStructureTree* structureTree)
|
||||||
|
{
|
||||||
|
addReference(structureTree);
|
||||||
|
acceptChildren(structureTree);
|
||||||
|
}
|
||||||
|
|
||||||
|
void PDFStructureTreeReferenceCollector::visitStructureElement(const PDFStructureElement* structureElement)
|
||||||
|
{
|
||||||
|
addReference(structureElement);
|
||||||
|
acceptChildren(structureElement);
|
||||||
|
}
|
||||||
|
|
||||||
|
void PDFStructureTreeReferenceCollector::visitStructureMarkedContentReference(const PDFStructureMarkedContentReference* structureMarkedContentReference)
|
||||||
|
{
|
||||||
|
addReference(structureMarkedContentReference);
|
||||||
|
acceptChildren(structureMarkedContentReference);
|
||||||
|
}
|
||||||
|
|
||||||
|
void PDFStructureTreeReferenceCollector::visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference)
|
||||||
|
{
|
||||||
|
addReference(structureObjectReference);
|
||||||
|
acceptChildren(structureObjectReference);
|
||||||
|
}
|
||||||
|
|
||||||
|
void PDFStructureTreeReferenceCollector::addReference(const PDFStructureItem* structureItem)
|
||||||
|
{
|
||||||
|
if (structureItem->getSelfReference().isValid())
|
||||||
|
{
|
||||||
|
(*m_mapping)[structureItem->getSelfReference()] = structureItem;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct PDFStructureTreeTextItem
|
||||||
|
{
|
||||||
|
enum class Type
|
||||||
|
{
|
||||||
|
StartTag,
|
||||||
|
EndTag,
|
||||||
|
Text
|
||||||
|
};
|
||||||
|
|
||||||
|
PDFStructureTreeTextItem() = default;
|
||||||
|
PDFStructureTreeTextItem(Type type, const PDFStructureItem* item, QString text) :
|
||||||
|
type(type), item(item), text(qMove(text))
|
||||||
|
{
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
static PDFStructureTreeTextItem createText(QString text) { return PDFStructureTreeTextItem(Type::Text, nullptr, qMove(text)); }
|
||||||
|
static PDFStructureTreeTextItem createStartTag(const PDFStructureItem* item) { return PDFStructureTreeTextItem(Type::StartTag, item, QString()); }
|
||||||
|
static PDFStructureTreeTextItem createEndTag(const PDFStructureItem* item) { return PDFStructureTreeTextItem(Type::EndTag, item, QString()); }
|
||||||
|
|
||||||
|
Type type = Type::Text;
|
||||||
|
const PDFStructureItem* item = nullptr;
|
||||||
|
QString text;
|
||||||
|
};
|
||||||
|
|
||||||
|
using PDFStructureTreeTextSequence = std::vector<PDFStructureTreeTextItem>;
|
||||||
|
|
||||||
|
/// Text extractor for structure tree. Extracts sequences of structure items,
|
||||||
|
/// page sequences are stored in \p textSequences. They can be accessed using
|
||||||
|
/// getters.
|
||||||
|
class PDFStructureTreeTextExtractor
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
enum Option
|
||||||
|
{
|
||||||
|
None = 0x0000,
|
||||||
|
SkipArtifact = 0x0001, ///< Skip content marked as 'Artifact'
|
||||||
|
AdjustReversedText = 0x0002, ///< Adjust reversed text
|
||||||
|
CreateTreeMapping = 0x0004, ///< Create text mapping to structure tree item
|
||||||
|
};
|
||||||
|
Q_DECLARE_FLAGS(Options, Option)
|
||||||
|
|
||||||
|
explicit PDFStructureTreeTextExtractor(const PDFDocument* document, const PDFStructureTree* tree, Options options);
|
||||||
|
|
||||||
|
/// Performs text extracting algorithm. Only \p pageIndices
|
||||||
|
/// pages are processed for text extraction.
|
||||||
|
/// \param pageIndices Page indices
|
||||||
|
void perform(const std::vector<PDFInteger>& pageIndices);
|
||||||
|
|
||||||
|
/// Returns a list of errors/warnings
|
||||||
|
const QList<PDFRenderError>& getErrors() const { return m_errors; }
|
||||||
|
|
||||||
|
/// Returns a list of unmatched text
|
||||||
|
const QStringList& getUnmatchedText() const { return m_unmatchedText; }
|
||||||
|
|
||||||
|
/// Returns text sequence for given page. If page number is invalid,
|
||||||
|
/// then empty text sequence is returned.
|
||||||
|
/// \param pageNumber Page number
|
||||||
|
const PDFStructureTreeTextSequence& getTextSequence(PDFInteger pageNumber) const;
|
||||||
|
|
||||||
|
/// Returns text for given structure tree item. If structure tree item
|
||||||
|
/// is not found, then empty list is returned. This functionality
|
||||||
|
/// requires, that \p CreateTreeMapping flag is being set.
|
||||||
|
/// \param item Item
|
||||||
|
const QStringList& getText(const PDFStructureItem* item) const;
|
||||||
|
|
||||||
|
private:
|
||||||
|
QList<PDFRenderError> m_errors;
|
||||||
|
const PDFDocument* m_document;
|
||||||
|
const PDFStructureTree* m_tree;
|
||||||
|
QStringList m_unmatchedText;
|
||||||
|
std::map<PDFInteger, PDFStructureTreeTextSequence> m_textSequences;
|
||||||
|
std::map<const PDFStructureItem*, QStringList> m_textForItems;
|
||||||
|
Options m_options;
|
||||||
|
};
|
||||||
|
|
||||||
|
Q_DECLARE_OPERATORS_FOR_FLAGS(PDFStructureTreeTextExtractor::Options)
|
||||||
|
|
||||||
|
class PDFStructureTreeTextContentProcessor : public PDFPageContentProcessor
|
||||||
|
{
|
||||||
|
using BaseClass = PDFPageContentProcessor;
|
||||||
|
|
||||||
|
public:
|
||||||
|
explicit PDFStructureTreeTextContentProcessor(PDFRenderer::Features features,
|
||||||
|
const PDFPage* page,
|
||||||
|
const PDFDocument* document,
|
||||||
|
const PDFFontCache* fontCache,
|
||||||
|
const PDFCMS* cms,
|
||||||
|
const PDFOptionalContentActivity* optionalContentActivity,
|
||||||
|
QMatrix pagePointToDevicePointMatrix,
|
||||||
|
const PDFMeshQualitySettings& meshQualitySettings,
|
||||||
|
const PDFStructureTree* tree,
|
||||||
|
const std::map<PDFObjectReference, const PDFStructureItem*>* mapping) :
|
||||||
|
BaseClass(page, document, fontCache, cms, optionalContentActivity, pagePointToDevicePointMatrix, meshQualitySettings),
|
||||||
|
m_features(features),
|
||||||
|
m_tree(tree),
|
||||||
|
m_mapping(mapping)
|
||||||
|
{
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
PDFStructureTreeTextSequence& takeSequence() { return m_textSequence; }
|
||||||
|
QStringList& takeUnmatchedTexts() { return m_unmatchedText; }
|
||||||
|
|
||||||
|
protected:
|
||||||
|
virtual bool isContentSuppressedByOC(PDFObjectReference ocgOrOcmd) override;
|
||||||
|
virtual bool isContentKindSuppressed(ContentKind kind) const override;
|
||||||
|
virtual void performOutputCharacter(const PDFTextCharacterInfo& info) override;
|
||||||
|
virtual void performMarkedContentBegin(const QByteArray& tag, const PDFObject& properties) override;
|
||||||
|
virtual void performMarkedContentEnd() override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
const PDFStructureItem* getStructureTreeItemFromMCID(PDFInteger mcid) const;
|
||||||
|
void finishText();
|
||||||
|
|
||||||
|
struct MarkedContentInfo
|
||||||
|
{
|
||||||
|
QByteArray tag;
|
||||||
|
PDFInteger mcid = -1;
|
||||||
|
const PDFStructureItem* structureTreeItem = nullptr;
|
||||||
|
};
|
||||||
|
|
||||||
|
PDFRenderer::Features m_features;
|
||||||
|
const PDFStructureTree* m_tree;
|
||||||
|
const std::map<PDFObjectReference, const PDFStructureItem*>* m_mapping;
|
||||||
|
std::vector<MarkedContentInfo> m_markedContentInfoStack;
|
||||||
|
QString m_currentText;
|
||||||
|
PDFStructureTreeTextSequence m_textSequence;
|
||||||
|
QStringList m_unmatchedText;
|
||||||
|
};
|
||||||
|
|
||||||
|
void PDFStructureTreeTextContentProcessor::finishText()
|
||||||
|
{
|
||||||
|
m_currentText = m_currentText.trimmed();
|
||||||
|
if (!m_currentText.isEmpty())
|
||||||
|
{
|
||||||
|
m_textSequence.emplace_back(PDFStructureTreeTextItem::createText(qMove(m_currentText)));
|
||||||
|
}
|
||||||
|
m_currentText = QString();
|
||||||
|
}
|
||||||
|
|
||||||
|
void PDFStructureTreeTextContentProcessor::performMarkedContentBegin(const QByteArray& tag, const PDFObject& properties)
|
||||||
|
{
|
||||||
|
MarkedContentInfo info;
|
||||||
|
info.tag = tag;
|
||||||
|
|
||||||
|
if (properties.isDictionary())
|
||||||
|
{
|
||||||
|
const PDFDictionary* dictionary = properties.getDictionary();
|
||||||
|
PDFObject mcid = dictionary->get("MCID");
|
||||||
|
if (mcid.isInt())
|
||||||
|
{
|
||||||
|
// We must finish text, because we can have a sequence of text,
|
||||||
|
// then subitem, then text, and followed by another subitem. They
|
||||||
|
// can be interleaved.
|
||||||
|
finishText();
|
||||||
|
|
||||||
|
info.mcid = mcid.getInteger();
|
||||||
|
info.structureTreeItem = getStructureTreeItemFromMCID(info.mcid);
|
||||||
|
|
||||||
|
if (!info.structureTreeItem)
|
||||||
|
{
|
||||||
|
reportRenderError(RenderErrorType::Error, PDFTranslationContext::tr("Structure tree item for MCID %1 not found.").arg(info.mcid));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (info.structureTreeItem)
|
||||||
|
{
|
||||||
|
m_textSequence.emplace_back(PDFStructureTreeTextItem::createStartTag(info.structureTreeItem));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
m_markedContentInfoStack.emplace_back(qMove(info));
|
||||||
|
}
|
||||||
|
|
||||||
|
void PDFStructureTreeTextContentProcessor::performMarkedContentEnd()
|
||||||
|
{
|
||||||
|
MarkedContentInfo info = qMove(m_markedContentInfoStack.back());
|
||||||
|
m_markedContentInfoStack.pop_back();
|
||||||
|
|
||||||
|
if (info.mcid != -1)
|
||||||
|
{
|
||||||
|
finishText();
|
||||||
|
if (info.structureTreeItem)
|
||||||
|
{
|
||||||
|
m_textSequence.emplace_back(PDFStructureTreeTextItem::createEndTag(info.structureTreeItem));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for text, which doesn't belong to any structure tree item
|
||||||
|
if (m_markedContentInfoStack.empty())
|
||||||
|
{
|
||||||
|
m_currentText = m_currentText.trimmed();
|
||||||
|
if (!m_currentText.isEmpty())
|
||||||
|
{
|
||||||
|
m_unmatchedText << qMove(m_currentText);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Finds structure tree item for marked content with given MCID. Parent
/// reference is obtained from the structure tree using current structural
/// parent key and \p mcid, and then it is searched in the lookup table.
/// \param mcid Marked content identifier
/// \returns Structure tree item, or nullptr, if reference is not in the lookup table
const PDFStructureItem* PDFStructureTreeTextContentProcessor::getStructureTreeItemFromMCID(PDFInteger mcid) const
{
    const auto parentReference = m_tree->getParent(getStructuralParentKey(), mcid);
    const auto foundIt = m_mapping->find(parentReference);
    return (foundIt != m_mapping->cend()) ? foundIt->second : nullptr;
}
|
||||||
|
|
||||||
|
/// Returns true, if content is suppressed by optional content. When feature
/// IgnoreOptionalContent is set, content is never suppressed, otherwise
/// decision is delegated to the base class.
/// \param ocgOrOcmd Reference to optional content group or membership dictionary
bool PDFStructureTreeTextContentProcessor::isContentSuppressedByOC(PDFObjectReference ocgOrOcmd)
{
    const bool isOptionalContentIgnored = m_features.testFlag(PDFRenderer::IgnoreOptionalContent);
    return !isOptionalContentIgnored && PDFPageContentProcessor::isContentSuppressedByOC(ocgOrOcmd);
}
|
||||||
|
|
||||||
|
/// Returns true, if given kind of content is suppressed. Everything except
/// tiling is suppressed. Unknown kind triggers an assertion and is reported
/// as not suppressed.
/// \param kind Content kind
bool PDFStructureTreeTextContentProcessor::isContentKindSuppressed(ContentKind kind) const
{
    switch (kind)
    {
        case ContentKind::Tiling:
            // Tiling can have text
            return false;

        case ContentKind::Shapes:
        case ContentKind::Text:
        case ContentKind::Images:
        case ContentKind::Shading:
            return true;

        default:
        {
            Q_ASSERT(false);
            break;
        }
    }

    return false;
}
|
||||||
|
|
||||||
|
void PDFStructureTreeTextContentProcessor::performOutputCharacter(const PDFTextCharacterInfo& info)
|
||||||
|
{
|
||||||
|
if (!isContentSuppressed())
|
||||||
|
{
|
||||||
|
if (!info.character.isNull())
|
||||||
|
{
|
||||||
|
m_currentText.push_back(info.character);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Constructs the extractor. Arguments are only stored, extraction itself
/// is done in function \p perform.
/// \param document Document
/// \param tree Structure tree
/// \param options Extraction options
PDFStructureTreeTextExtractor::PDFStructureTreeTextExtractor(const PDFDocument* document,
                                                             const PDFStructureTree* tree,
                                                             Options options) :
    m_document(document),
    m_tree(tree),
    m_options(options)
{
    // Nothing to do, all members are initialized above
}
|
||||||
|
|
||||||
|
void PDFStructureTreeTextExtractor::perform(const std::vector<PDFInteger>& pageIndices)
|
||||||
|
{
|
||||||
|
std::map<PDFObjectReference, const PDFStructureItem*> mapping;
|
||||||
|
PDFStructureTreeReferenceCollector referenceCollector(&mapping);
|
||||||
|
m_tree->accept(&referenceCollector);
|
||||||
|
|
||||||
|
PDFFontCache fontCache(DEFAULT_FONT_CACHE_LIMIT, DEFAULT_REALIZED_FONT_CACHE_LIMIT);
|
||||||
|
|
||||||
|
QMutex mutex;
|
||||||
|
PDFCMSGeneric cms;
|
||||||
|
PDFMeshQualitySettings mqs;
|
||||||
|
PDFOptionalContentActivity oca(m_document, OCUsage::Export, nullptr);
|
||||||
|
pdf::PDFModifiedDocument md(const_cast<PDFDocument*>(m_document), &oca);
|
||||||
|
fontCache.setDocument(md);
|
||||||
|
fontCache.setCacheShrinkEnabled(nullptr, false);
|
||||||
|
|
||||||
|
auto generateTextLayout = [&, this](PDFInteger pageIndex)
|
||||||
|
{
|
||||||
|
const PDFCatalog* catalog = m_document->getCatalog();
|
||||||
|
if (!catalog->getPage(pageIndex))
|
||||||
|
{
|
||||||
|
// Invalid page index
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const PDFPage* page = catalog->getPage(pageIndex);
|
||||||
|
Q_ASSERT(page);
|
||||||
|
|
||||||
|
PDFStructureTreeTextContentProcessor processor(PDFRenderer::IgnoreOptionalContent, page, m_document, &fontCache, &cms, &oca, QMatrix(), mqs, m_tree, &mapping);
|
||||||
|
QList<PDFRenderError> errors = processor.processContents();
|
||||||
|
|
||||||
|
QMutexLocker lock(&mutex);
|
||||||
|
m_textSequences[pageIndex] = qMove(processor.takeSequence());
|
||||||
|
m_unmatchedText << qMove(processor.takeUnmatchedTexts());
|
||||||
|
m_errors.append(qMove(errors));
|
||||||
|
};
|
||||||
|
|
||||||
|
PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, pageIndices.begin(), pageIndices.end(), generateTextLayout);
|
||||||
|
|
||||||
|
fontCache.setCacheShrinkEnabled(nullptr, true);
|
||||||
|
|
||||||
|
if (m_options.testFlag(CreateTreeMapping))
|
||||||
|
{
|
||||||
|
for (const auto& sequence : m_textSequences)
|
||||||
|
{
|
||||||
|
std::stack<const PDFStructureItem*> stack;
|
||||||
|
for (const PDFStructureTreeTextItem& sequenceItem : sequence.second)
|
||||||
|
{
|
||||||
|
switch (sequenceItem.type)
|
||||||
|
{
|
||||||
|
case PDFStructureTreeTextItem::Type::StartTag:
|
||||||
|
stack.push(sequenceItem.item);
|
||||||
|
break;
|
||||||
|
case PDFStructureTreeTextItem::Type::EndTag:
|
||||||
|
stack.pop();
|
||||||
|
break;
|
||||||
|
case PDFStructureTreeTextItem::Type::Text:
|
||||||
|
if (!stack.empty())
|
||||||
|
{
|
||||||
|
m_textForItems[stack.top()] << sequenceItem.text;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns text sequence stored for given page index. If no sequence is
/// stored for the page index, then reference to a shared empty sequence
/// is returned.
/// \param pageIndex Page index
const PDFStructureTreeTextSequence& PDFStructureTreeTextExtractor::getTextSequence(PDFInteger pageIndex) const
{
    // Shared empty result for unknown pages
    static const PDFStructureTreeTextSequence emptySequence;

    const auto foundIt = m_textSequences.find(pageIndex);
    return (foundIt != m_textSequences.cend()) ? foundIt->second : emptySequence;
}
|
||||||
|
|
||||||
|
/// Returns texts stored for given structure tree item. If nothing is stored
/// for the item, then reference to a shared empty list is returned.
/// \param item Structure tree item
const QStringList& PDFStructureTreeTextExtractor::getText(const PDFStructureItem* item) const
{
    // Shared empty result for unknown items
    static const QStringList emptyTexts;

    const auto foundIt = m_textForItems.find(item);
    return (foundIt != m_textForItems.cend()) ? foundIt->second : emptyTexts;
}
|
||||||
|
|
||||||
|
|
||||||
|
class PDFStructureTreeTextFlowCollector : public PDFStructureTreeAbstractVisitor
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
explicit PDFStructureTreeTextFlowCollector(PDFDocumentTextFlow::Items* items, const PDFStructureTreeTextExtractor* extractor) :
|
||||||
|
m_items(items),
|
||||||
|
m_extractor(extractor)
|
||||||
|
{
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual void visitStructureTree(const PDFStructureTree* structureTree) override;
|
||||||
|
virtual void visitStructureElement(const PDFStructureElement* structureElement) override;
|
||||||
|
virtual void visitStructureMarkedContentReference(const PDFStructureMarkedContentReference* structureMarkedContentReference) override;
|
||||||
|
virtual void visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference) override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
PDFDocumentTextFlow::Items* m_items;
|
||||||
|
const PDFStructureTreeTextExtractor* m_extractor;
|
||||||
|
std::vector<bool> m_hasContentStack;
|
||||||
|
};
|
||||||
|
|
||||||
|
void PDFStructureTreeTextFlowCollector::visitStructureTree(const PDFStructureTree* structureTree)
|
||||||
|
{
|
||||||
|
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureItemStart, -1, QString()});
|
||||||
|
acceptChildren(structureTree);
|
||||||
|
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureItemEnd, -1, QString()});
|
||||||
|
}
|
||||||
|
|
||||||
|
void PDFStructureTreeTextFlowCollector::visitStructureElement(const PDFStructureElement* structureElement)
|
||||||
|
{
|
||||||
|
size_t index = m_items->size();
|
||||||
|
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureItemStart, -1, QString()});
|
||||||
|
|
||||||
|
// Mark stack so we can delete unused items
|
||||||
|
m_hasContentStack.push_back(false);
|
||||||
|
|
||||||
|
for (const QString& string : m_extractor->getText(structureElement))
|
||||||
|
{
|
||||||
|
for (size_t i = 0; i < m_hasContentStack.size(); ++i)
|
||||||
|
{
|
||||||
|
m_hasContentStack[i] = true;
|
||||||
|
}
|
||||||
|
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::Text, -1, string});
|
||||||
|
}
|
||||||
|
|
||||||
|
acceptChildren(structureElement);
|
||||||
|
|
||||||
|
const bool hasContent = m_hasContentStack.back();
|
||||||
|
m_hasContentStack.pop_back();
|
||||||
|
|
||||||
|
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureItemEnd, -1, QString()});
|
||||||
|
|
||||||
|
if (!hasContent)
|
||||||
|
{
|
||||||
|
// Delete unused content
|
||||||
|
m_items->erase(std::next(m_items->begin(), index), m_items->end());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void PDFStructureTreeTextFlowCollector::visitStructureMarkedContentReference(const PDFStructureMarkedContentReference* structureMarkedContentReference)
|
||||||
|
{
|
||||||
|
acceptChildren(structureMarkedContentReference);
|
||||||
|
}
|
||||||
|
|
||||||
|
void PDFStructureTreeTextFlowCollector::visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference)
|
||||||
|
{
|
||||||
|
acceptChildren(structureObjectReference);
|
||||||
|
}
|
||||||
|
|
||||||
PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* document, const std::vector<PDFInteger>& pageIndices, Algorithm algorithm)
|
PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* document, const std::vector<PDFInteger>& pageIndices, Algorithm algorithm)
|
||||||
{
|
{
|
||||||
PDFDocumentTextFlow result;
|
PDFDocumentTextFlow result;
|
||||||
@ -121,9 +598,39 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
PDFStructureTreeTextExtractor extractor(document, &structureTree);
|
PDFStructureTreeTextExtractor extractor(document, &structureTree, PDFStructureTreeTextExtractor::SkipArtifact | PDFStructureTreeTextExtractor::AdjustReversedText | PDFStructureTreeTextExtractor::CreateTreeMapping);
|
||||||
extractor.perform(pageIndices);
|
extractor.perform(pageIndices);
|
||||||
|
|
||||||
|
PDFDocumentTextFlow::Items flowItems;
|
||||||
|
PDFStructureTreeTextFlowCollector collector(&flowItems, &extractor);
|
||||||
|
structureTree.accept(&collector);
|
||||||
|
|
||||||
|
result = PDFDocumentTextFlow(qMove(flowItems));
|
||||||
|
m_errors.append(extractor.getErrors());
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case Algorithm::Content:
|
||||||
|
{
|
||||||
|
PDFStructureTreeTextExtractor extractor(document, &structureTree, PDFStructureTreeTextExtractor::None);
|
||||||
|
extractor.perform(pageIndices);
|
||||||
|
|
||||||
|
PDFDocumentTextFlow::Items flowItems;
|
||||||
|
for (PDFInteger pageIndex : pageIndices)
|
||||||
|
{
|
||||||
|
flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::PageStart, pageIndex, PDFTranslationContext::tr("Page %1").arg(pageIndex + 1) });
|
||||||
|
for (const PDFStructureTreeTextItem& sequenceItem : extractor.getTextSequence(pageIndex))
|
||||||
|
{
|
||||||
|
if (sequenceItem.type == PDFStructureTreeTextItem::Type::Text)
|
||||||
|
{
|
||||||
|
flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::Text, pageIndex, sequenceItem.text });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::PageEnd, pageIndex, QString() });
|
||||||
|
}
|
||||||
|
|
||||||
|
result = PDFDocumentTextFlow(qMove(flowItems));
|
||||||
|
m_errors.append(extractor.getErrors());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -244,11 +244,14 @@ PDFPageContentProcessor::PDFPageContentProcessor(const PDFPage* page,
|
|||||||
m_drawingUncoloredTilingPatternState(0),
|
m_drawingUncoloredTilingPatternState(0),
|
||||||
m_patternBaseMatrix(pagePointToDevicePointMatrix),
|
m_patternBaseMatrix(pagePointToDevicePointMatrix),
|
||||||
m_pagePointToDevicePointMatrix(pagePointToDevicePointMatrix),
|
m_pagePointToDevicePointMatrix(pagePointToDevicePointMatrix),
|
||||||
m_meshQualitySettings(meshQualitySettings)
|
m_meshQualitySettings(meshQualitySettings),
|
||||||
|
m_structuralParentKey(0)
|
||||||
{
|
{
|
||||||
Q_ASSERT(page);
|
Q_ASSERT(page);
|
||||||
Q_ASSERT(document);
|
Q_ASSERT(document);
|
||||||
|
|
||||||
|
m_structuralParentKey = page->getStructureParentKey();
|
||||||
|
|
||||||
PDFExecutionPolicy::startProcessingContentStream();
|
PDFExecutionPolicy::startProcessingContentStream();
|
||||||
|
|
||||||
QPainterPath pageRectPath;
|
QPainterPath pageRectPath;
|
||||||
@ -338,6 +341,7 @@ QList<PDFRenderError> PDFPageContentProcessor::processContents()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
finishMarkedContent();
|
||||||
return m_errorList;
|
return m_errorList;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -659,9 +663,11 @@ void PDFPageContentProcessor::processForm(const QMatrix& matrix,
|
|||||||
const QRectF& boundingBox,
|
const QRectF& boundingBox,
|
||||||
const PDFObject& resources,
|
const PDFObject& resources,
|
||||||
const PDFObject& transparencyGroup,
|
const PDFObject& transparencyGroup,
|
||||||
const QByteArray& content)
|
const QByteArray& content,
|
||||||
|
PDFInteger formStructuralParent)
|
||||||
{
|
{
|
||||||
PDFPageContentProcessorStateGuard guard(this);
|
PDFPageContentProcessorStateGuard guard(this);
|
||||||
|
PDFTemporaryValueChange structuralParentChangeGuard(&m_structuralParentKey, formStructuralParent);
|
||||||
|
|
||||||
std::unique_ptr<PDFTransparencyGroupGuard> guard2;
|
std::unique_ptr<PDFTransparencyGroupGuard> guard2;
|
||||||
if (transparencyGroup.isDictionary())
|
if (transparencyGroup.isDictionary())
|
||||||
@ -1738,6 +1744,19 @@ void PDFPageContentProcessor::setRenderingIntentByName(QByteArray renderingInten
|
|||||||
m_graphicState.setRenderingIntentName(renderingIntentName);
|
m_graphicState.setRenderingIntentName(renderingIntentName);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void PDFPageContentProcessor::finishMarkedContent()
|
||||||
|
{
|
||||||
|
if (!m_markedContentStack.empty())
|
||||||
|
{
|
||||||
|
m_errorList.append(PDFRenderError(RenderErrorType::Error, PDFTranslationContext::tr("Marked content is not well formed (not enough EMC operators).")));
|
||||||
|
}
|
||||||
|
|
||||||
|
while (!m_markedContentStack.empty())
|
||||||
|
{
|
||||||
|
operatorMarkedContentEnd();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void PDFPageContentProcessor::reportRenderErrorOnce(RenderErrorType type, QString message)
|
void PDFPageContentProcessor::reportRenderErrorOnce(RenderErrorType type, QString message)
|
||||||
{
|
{
|
||||||
if (!m_onceReportedErrors.count(message))
|
if (!m_onceReportedErrors.count(message))
|
||||||
@ -2936,7 +2955,10 @@ void PDFPageContentProcessor::operatorPaintXObject(PDFPageContentProcessor::PDFO
|
|||||||
// Transparency group
|
// Transparency group
|
||||||
PDFObject transparencyGroup = m_document->getObject(streamDictionary->get("Group"));
|
PDFObject transparencyGroup = m_document->getObject(streamDictionary->get("Group"));
|
||||||
|
|
||||||
processForm(transformationMatrix, boundingBox, resources, transparencyGroup, content);
|
// Form structural parent key
|
||||||
|
const PDFInteger formStructuralParentKey = loader.readIntegerFromDictionary(streamDictionary, "StructParent", m_structuralParentKey);
|
||||||
|
|
||||||
|
processForm(transformationMatrix, boundingBox, resources, transparencyGroup, content, formStructuralParentKey);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -3075,14 +3097,23 @@ void PDFPageContentProcessor::drawText(const TextSequence& textSequence)
|
|||||||
if (item.glyph)
|
if (item.glyph)
|
||||||
{
|
{
|
||||||
const QPainterPath& glyphPath = *item.glyph;
|
const QPainterPath& glyphPath = *item.glyph;
|
||||||
if (!glyphPath.isEmpty())
|
|
||||||
{
|
|
||||||
QMatrix textRenderingMatrix = adjustMatrix * textMatrix;
|
QMatrix textRenderingMatrix = adjustMatrix * textMatrix;
|
||||||
QMatrix toDeviceSpaceTransform = textRenderingMatrix * m_graphicState.getCurrentTransformationMatrix();
|
QMatrix toDeviceSpaceTransform = textRenderingMatrix * m_graphicState.getCurrentTransformationMatrix();
|
||||||
|
|
||||||
|
if (!glyphPath.isEmpty())
|
||||||
|
{
|
||||||
QPainterPath transformedGlyph = textRenderingMatrix.map(glyphPath);
|
QPainterPath transformedGlyph = textRenderingMatrix.map(glyphPath);
|
||||||
processPathPainting(transformedGlyph, stroke, fill, true, transformedGlyph.fillRule());
|
processPathPainting(transformedGlyph, stroke, fill, true, transformedGlyph.fillRule());
|
||||||
|
|
||||||
if (!item.character.isNull() && !item.character.isSpace())
|
if (clipped)
|
||||||
|
{
|
||||||
|
// Clipping is enabled, we must transform to the device coordinates
|
||||||
|
m_textClippingPath = m_textClippingPath.united(toDeviceSpaceTransform.map(glyphPath));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!item.character.isNull())
|
||||||
{
|
{
|
||||||
// Output character
|
// Output character
|
||||||
PDFTextCharacterInfo info;
|
PDFTextCharacterInfo info;
|
||||||
@ -3094,13 +3125,6 @@ void PDFPageContentProcessor::drawText(const TextSequence& textSequence)
|
|||||||
info.matrix = toDeviceSpaceTransform;
|
info.matrix = toDeviceSpaceTransform;
|
||||||
performOutputCharacter(info);
|
performOutputCharacter(info);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (clipped)
|
|
||||||
{
|
|
||||||
// Clipping is enabled, we must transform to the device coordinates
|
|
||||||
m_textClippingPath = m_textClippingPath.united(toDeviceSpaceTransform.map(glyphPath));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
displacementX = advance.x();
|
displacementX = advance.x();
|
||||||
@ -3170,7 +3194,7 @@ void PDFPageContentProcessor::drawText(const TextSequence& textSequence)
|
|||||||
|
|
||||||
processContent(*item.characterContentStream);
|
processContent(*item.characterContentStream);
|
||||||
|
|
||||||
if (!item.character.isNull() && !item.character.isSpace())
|
if (!item.character.isNull())
|
||||||
{
|
{
|
||||||
// Output character
|
// Output character
|
||||||
PDFTextCharacterInfo info;
|
PDFTextCharacterInfo info;
|
||||||
|
@ -219,7 +219,13 @@ public:
|
|||||||
/// \param resources Resources, assigned to the form
|
/// \param resources Resources, assigned to the form
|
||||||
/// \param transparencyGroup Transparency group object
|
/// \param transparencyGroup Transparency group object
|
||||||
/// \param content Content stream of the form
|
/// \param content Content stream of the form
|
||||||
void processForm(const QMatrix& matrix, const QRectF& boundingBox, const PDFObject& resources, const PDFObject& transparencyGroup, const QByteArray& content);
|
/// \param formStructuralParent Structural parent key for form
|
||||||
|
void processForm(const QMatrix& matrix,
|
||||||
|
const QRectF& boundingBox,
|
||||||
|
const PDFObject& resources,
|
||||||
|
const PDFObject& transparencyGroup,
|
||||||
|
const QByteArray& content,
|
||||||
|
PDFInteger formStructuralParent);
|
||||||
|
|
||||||
/// Initialize stream processor for processing content streams. For example,
|
/// Initialize stream processor for processing content streams. For example,
|
||||||
/// graphic state is initialized to default, and default color spaces are initialized.
|
/// graphic state is initialized to default, and default color spaces are initialized.
|
||||||
@ -572,6 +578,9 @@ protected:
|
|||||||
/// shading, images, ...)
|
/// shading, images, ...)
|
||||||
virtual bool isContentKindSuppressed(ContentKind kind) const;
|
virtual bool isContentKindSuppressed(ContentKind kind) const;
|
||||||
|
|
||||||
|
/// Returns current structural parent key
|
||||||
|
PDFInteger getStructuralParentKey() const { return m_structuralParentKey; }
|
||||||
|
|
||||||
/// Returns current graphic state
|
/// Returns current graphic state
|
||||||
const PDFPageContentProcessorState* getGraphicState() const { return &m_graphicState; }
|
const PDFPageContentProcessorState* getGraphicState() const { return &m_graphicState; }
|
||||||
|
|
||||||
@ -918,6 +927,9 @@ private:
|
|||||||
/// Set rendering intent by name
|
/// Set rendering intent by name
|
||||||
void setRenderingIntentByName(QByteArray renderingIntentName);
|
void setRenderingIntentByName(QByteArray renderingIntentName);
|
||||||
|
|
||||||
|
/// Finishes marked content (if end of marked content is missing)
|
||||||
|
void finishMarkedContent();
|
||||||
|
|
||||||
const PDFPage* m_page;
|
const PDFPage* m_page;
|
||||||
const PDFDocument* m_document;
|
const PDFDocument* m_document;
|
||||||
const PDFFontCache* m_fontCache;
|
const PDFFontCache* m_fontCache;
|
||||||
@ -990,6 +1002,9 @@ private:
|
|||||||
|
|
||||||
/// Set with rendering errors, which were reported (and should be reported once)
|
/// Set with rendering errors, which were reported (and should be reported once)
|
||||||
std::set<QString> m_onceReportedErrors;
|
std::set<QString> m_onceReportedErrors;
|
||||||
|
|
||||||
|
/// Active structural parent key
|
||||||
|
PDFInteger m_structuralParentKey;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace pdf
|
} // namespace pdf
|
||||||
|
@ -505,6 +505,19 @@ std::vector<PDFObjectReference> PDFStructureTree::getParents(PDFInteger id) cons
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PDFObjectReference PDFStructureTree::getParent(PDFInteger id, PDFInteger index) const
|
||||||
|
{
|
||||||
|
Q_ASSERT(std::is_sorted(m_parentTreeEntries.cbegin(), m_parentTreeEntries.cend()));
|
||||||
|
ParentTreeEntry entry{ id, PDFObjectReference() };
|
||||||
|
auto [it, itEnd] = std::equal_range(m_parentTreeEntries.cbegin(), m_parentTreeEntries.cend(), entry);
|
||||||
|
const PDFInteger count = std::distance(it, itEnd);
|
||||||
|
if (index >= 0 && index < count)
|
||||||
|
{
|
||||||
|
return (*std::next(it, index)).reference;
|
||||||
|
}
|
||||||
|
return PDFObjectReference();
|
||||||
|
}
|
||||||
|
|
||||||
PDFStructureItem::Type PDFStructureTree::getTypeFromRole(const QByteArray& role) const
|
PDFStructureItem::Type PDFStructureTree::getTypeFromRole(const QByteArray& role) const
|
||||||
{
|
{
|
||||||
auto it = m_roleMap.find(role);
|
auto it = m_roleMap.find(role);
|
||||||
@ -556,16 +569,14 @@ PDFStructureTree PDFStructureTree::parse(const PDFObjectStorage* storage, PDFObj
|
|||||||
return id < other.id;
|
return id < other.id;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ParentTreeParseEntry parse(PDFInteger id, const PDFObjectStorage*, const PDFObject& object)
|
static ParentTreeParseEntry parse(PDFInteger id, const PDFObjectStorage* storage, const PDFObject& object)
|
||||||
{
|
{
|
||||||
if (object.isReference())
|
const PDFObject& dereferencedObject = storage->getObject(object);
|
||||||
{
|
|
||||||
return ParentTreeParseEntry{ id, { object.getReference() } };
|
if (dereferencedObject.isArray())
|
||||||
}
|
|
||||||
else if (object.isArray())
|
|
||||||
{
|
{
|
||||||
std::vector<PDFObjectReference> references;
|
std::vector<PDFObjectReference> references;
|
||||||
for (const PDFObject& object : *object.getArray())
|
for (const PDFObject& object : *dereferencedObject.getArray())
|
||||||
{
|
{
|
||||||
if (object.isReference())
|
if (object.isReference())
|
||||||
{
|
{
|
||||||
@ -575,6 +586,10 @@ PDFStructureTree PDFStructureTree::parse(const PDFObjectStorage* storage, PDFObj
|
|||||||
|
|
||||||
return ParentTreeParseEntry{ id, qMove(references) };
|
return ParentTreeParseEntry{ id, qMove(references) };
|
||||||
}
|
}
|
||||||
|
else if (object.isReference())
|
||||||
|
{
|
||||||
|
return ParentTreeParseEntry{ id, { object.getReference() } };
|
||||||
|
}
|
||||||
|
|
||||||
return ParentTreeParseEntry{ id, { } };
|
return ParentTreeParseEntry{ id, { } };
|
||||||
}
|
}
|
||||||
@ -629,6 +644,16 @@ PDFStructureTree PDFStructureTree::parse(const PDFObjectStorage* storage, PDFObj
|
|||||||
return tree;
|
return tree;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PDFStructureTree::ParentTreeEntry PDFStructureTree::getParentTreeEntry(PDFInteger index) const
|
||||||
|
{
|
||||||
|
if (index >= 0 && index < PDFInteger(m_parentTreeEntries.size()))
|
||||||
|
{
|
||||||
|
return m_parentTreeEntries[index];
|
||||||
|
}
|
||||||
|
|
||||||
|
return ParentTreeEntry();
|
||||||
|
}
|
||||||
|
|
||||||
PDFStructureItemPointer PDFStructureItem::parse(const PDFObjectStorage* storage, PDFObject object, PDFMarkedObjectsContext* context, PDFStructureItem* parent)
|
PDFStructureItemPointer PDFStructureItem::parse(const PDFObjectStorage* storage, PDFObject object, PDFMarkedObjectsContext* context, PDFStructureItem* parent)
|
||||||
{
|
{
|
||||||
if (const PDFDictionary* dictionary = storage->getDictionaryFromObject(object))
|
if (const PDFDictionary* dictionary = storage->getDictionaryFromObject(object))
|
||||||
@ -939,237 +964,4 @@ void PDFStructureTreeAbstractVisitor::acceptChildren(const PDFStructureItem* ite
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
class PDFStructureTreeReferenceCollector : public PDFStructureTreeAbstractVisitor
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
explicit inline PDFStructureTreeReferenceCollector(std::map<PDFObjectReference, const PDFStructureItem*>* mapping) :
|
|
||||||
m_mapping(mapping)
|
|
||||||
{
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
virtual void visitStructureTree(const PDFStructureTree* structureTree) override;
|
|
||||||
virtual void visitStructureElement(const PDFStructureElement* structureElement) override;
|
|
||||||
virtual void visitStructureMarkedContentReference(const PDFStructureMarkedContentReference* structureMarkedContentReference) override;
|
|
||||||
virtual void visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference) override;
|
|
||||||
|
|
||||||
private:
|
|
||||||
void addReference(const PDFStructureItem* structureObjectReference);
|
|
||||||
|
|
||||||
std::map<PDFObjectReference, const PDFStructureItem*>* m_mapping;
|
|
||||||
};
|
|
||||||
|
|
||||||
void PDFStructureTreeReferenceCollector::visitStructureTree(const PDFStructureTree* structureTree)
|
|
||||||
{
|
|
||||||
addReference(structureTree);
|
|
||||||
acceptChildren(structureTree);
|
|
||||||
}
|
|
||||||
|
|
||||||
void PDFStructureTreeReferenceCollector::visitStructureElement(const PDFStructureElement* structureElement)
|
|
||||||
{
|
|
||||||
addReference(structureElement);
|
|
||||||
acceptChildren(structureElement);
|
|
||||||
}
|
|
||||||
|
|
||||||
void PDFStructureTreeReferenceCollector::visitStructureMarkedContentReference(const PDFStructureMarkedContentReference* structureMarkedContentReference)
|
|
||||||
{
|
|
||||||
addReference(structureMarkedContentReference);
|
|
||||||
acceptChildren(structureMarkedContentReference);
|
|
||||||
}
|
|
||||||
|
|
||||||
void PDFStructureTreeReferenceCollector::visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference)
|
|
||||||
{
|
|
||||||
addReference(structureObjectReference);
|
|
||||||
acceptChildren(structureObjectReference);
|
|
||||||
}
|
|
||||||
|
|
||||||
void PDFStructureTreeReferenceCollector::addReference(const PDFStructureItem* structureItem)
|
|
||||||
{
|
|
||||||
if (structureItem->getSelfReference().isValid())
|
|
||||||
{
|
|
||||||
(*m_mapping)[structureItem->getSelfReference()] = structureItem;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
class PDFStructureTreeTextContentProcessor : public PDFPageContentProcessor
|
|
||||||
{
|
|
||||||
using BaseClass = PDFPageContentProcessor;
|
|
||||||
|
|
||||||
public:
|
|
||||||
explicit PDFStructureTreeTextContentProcessor(PDFRenderer::Features features,
|
|
||||||
const PDFPage* page,
|
|
||||||
const PDFDocument* document,
|
|
||||||
const PDFFontCache* fontCache,
|
|
||||||
const PDFCMS* cms,
|
|
||||||
const PDFOptionalContentActivity* optionalContentActivity,
|
|
||||||
QMatrix pagePointToDevicePointMatrix,
|
|
||||||
const PDFMeshQualitySettings& meshQualitySettings) :
|
|
||||||
BaseClass(page, document, fontCache, cms, optionalContentActivity, pagePointToDevicePointMatrix, meshQualitySettings),
|
|
||||||
m_features(features)
|
|
||||||
{
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
std::map<PDFInteger, QStringList>& takeTexts() { return m_text; }
|
|
||||||
QStringList& takeUnmatchedTexts() { return m_unmatchedText; }
|
|
||||||
|
|
||||||
protected:
|
|
||||||
virtual bool isContentSuppressedByOC(PDFObjectReference ocgOrOcmd) override;
|
|
||||||
virtual bool isContentKindSuppressed(ContentKind kind) const override;
|
|
||||||
virtual void performOutputCharacter(const PDFTextCharacterInfo& info) override;
|
|
||||||
virtual void performMarkedContentBegin(const QByteArray& tag, const PDFObject& properties) override;
|
|
||||||
virtual void performMarkedContentEnd() override;
|
|
||||||
|
|
||||||
private:
|
|
||||||
struct MarkedContentInfo
|
|
||||||
{
|
|
||||||
QByteArray tag;
|
|
||||||
PDFInteger mcid = -1;
|
|
||||||
};
|
|
||||||
|
|
||||||
PDFRenderer::Features m_features;
|
|
||||||
std::vector<MarkedContentInfo> m_markedContentInfoStack;
|
|
||||||
QString m_currentText;
|
|
||||||
std::map<PDFInteger, QStringList> m_text;
|
|
||||||
QStringList m_unmatchedText;
|
|
||||||
};
|
|
||||||
|
|
||||||
void PDFStructureTreeTextContentProcessor::performMarkedContentBegin(const QByteArray& tag, const PDFObject& properties)
|
|
||||||
{
|
|
||||||
MarkedContentInfo info;
|
|
||||||
info.tag = tag;
|
|
||||||
|
|
||||||
if (properties.isDictionary())
|
|
||||||
{
|
|
||||||
const PDFDictionary* dictionary = properties.getDictionary();
|
|
||||||
PDFObject mcid = dictionary->get("MCID");
|
|
||||||
if (mcid.isInt())
|
|
||||||
{
|
|
||||||
info.mcid = mcid.getInteger();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
m_markedContentInfoStack.emplace_back(qMove(info));
|
|
||||||
}
|
|
||||||
|
|
||||||
void PDFStructureTreeTextContentProcessor::performMarkedContentEnd()
|
|
||||||
{
|
|
||||||
MarkedContentInfo info = qMove(m_markedContentInfoStack.back());
|
|
||||||
m_markedContentInfoStack.pop_back();
|
|
||||||
|
|
||||||
if (info.mcid != -1)
|
|
||||||
{
|
|
||||||
if (!m_currentText.isEmpty())
|
|
||||||
{
|
|
||||||
m_text[info.mcid].push_back(qMove(m_currentText));
|
|
||||||
}
|
|
||||||
m_currentText = QString();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (m_markedContentInfoStack.empty() && !m_currentText.isEmpty())
|
|
||||||
{
|
|
||||||
m_unmatchedText << qMove(m_currentText);
|
|
||||||
m_currentText = QString();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
bool PDFStructureTreeTextContentProcessor::isContentSuppressedByOC(PDFObjectReference ocgOrOcmd)
|
|
||||||
{
|
|
||||||
if (m_features.testFlag(PDFRenderer::IgnoreOptionalContent))
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
return PDFPageContentProcessor::isContentSuppressedByOC(ocgOrOcmd);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool PDFStructureTreeTextContentProcessor::isContentKindSuppressed(ContentKind kind) const
|
|
||||||
{
|
|
||||||
switch (kind)
|
|
||||||
{
|
|
||||||
case ContentKind::Shapes:
|
|
||||||
case ContentKind::Text:
|
|
||||||
case ContentKind::Images:
|
|
||||||
case ContentKind::Shading:
|
|
||||||
return true;
|
|
||||||
|
|
||||||
case ContentKind::Tiling:
|
|
||||||
return false; // Tiling can have text
|
|
||||||
|
|
||||||
default:
|
|
||||||
{
|
|
||||||
Q_ASSERT(false);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
void PDFStructureTreeTextContentProcessor::performOutputCharacter(const PDFTextCharacterInfo& info)
|
|
||||||
{
|
|
||||||
if (!isContentSuppressed())
|
|
||||||
{
|
|
||||||
if (!info.character.isNull())
|
|
||||||
{
|
|
||||||
m_currentText.push_back(info.character);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
PDFStructureTreeTextExtractor::PDFStructureTreeTextExtractor(const PDFDocument* document, const PDFStructureTree* tree) :
|
|
||||||
m_document(document),
|
|
||||||
m_tree(tree)
|
|
||||||
{
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
void PDFStructureTreeTextExtractor::perform(const std::vector<PDFInteger>& pageIndices)
|
|
||||||
{
|
|
||||||
std::map<PDFObjectReference, const PDFStructureItem*> mapping;
|
|
||||||
PDFStructureTreeReferenceCollector referenceCollector(&mapping);
|
|
||||||
m_tree->accept(&referenceCollector);
|
|
||||||
|
|
||||||
PDFFontCache fontCache(DEFAULT_FONT_CACHE_LIMIT, DEFAULT_REALIZED_FONT_CACHE_LIMIT);
|
|
||||||
|
|
||||||
// Jakub Melka: maps text to structure tree items. Key is pair of (page index, mcid)
|
|
||||||
std::map<std::pair<PDFInteger, PDFInteger>, QStringList> extractedText;
|
|
||||||
|
|
||||||
QMutex mutex;
|
|
||||||
PDFCMSGeneric cms;
|
|
||||||
PDFMeshQualitySettings mqs;
|
|
||||||
PDFOptionalContentActivity oca(m_document, OCUsage::Export, nullptr);
|
|
||||||
pdf::PDFModifiedDocument md(const_cast<PDFDocument*>(m_document), &oca);
|
|
||||||
fontCache.setDocument(md);
|
|
||||||
fontCache.setCacheShrinkEnabled(nullptr, false);
|
|
||||||
|
|
||||||
auto generateTextLayout = [this, &mutex, &extractedText, &fontCache, &cms, &mqs, &oca](PDFInteger pageIndex)
|
|
||||||
{
|
|
||||||
const PDFCatalog* catalog = m_document->getCatalog();
|
|
||||||
if (!catalog->getPage(pageIndex))
|
|
||||||
{
|
|
||||||
// Invalid page index
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const PDFPage* page = catalog->getPage(pageIndex);
|
|
||||||
Q_ASSERT(page);
|
|
||||||
|
|
||||||
PDFStructureTreeTextContentProcessor processor(PDFRenderer::IgnoreOptionalContent, page, m_document, &fontCache, &cms, &oca, QMatrix(), mqs);
|
|
||||||
QList<PDFRenderError> errors = processor.processContents();
|
|
||||||
|
|
||||||
QMutexLocker lock(&mutex);
|
|
||||||
for (auto& item : processor.takeTexts())
|
|
||||||
{
|
|
||||||
extractedText[std::make_pair(pageIndex, item.first)].append(qMove(item.second));
|
|
||||||
}
|
|
||||||
m_unmatchedText << qMove(processor.takeUnmatchedTexts());
|
|
||||||
m_errors.append(qMove(errors));
|
|
||||||
};
|
|
||||||
|
|
||||||
PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, pageIndices.begin(), pageIndices.end(), generateTextLayout);
|
|
||||||
|
|
||||||
fontCache.setCacheShrinkEnabled(nullptr, true);
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace pdf
|
} // namespace pdf
|
||||||
|
@ -382,6 +382,13 @@ public:
|
|||||||
/// \param id Id
|
/// \param id Id
|
||||||
std::vector<PDFObjectReference> getParents(PDFInteger id) const;
|
std::vector<PDFObjectReference> getParents(PDFInteger id) const;
|
||||||
|
|
||||||
|
/// Returns parent key for structural entry with given id,
|
||||||
|
/// and index. Id is, typically, structural tree parent key in page,
|
||||||
|
/// index is index into the marked content references array.
|
||||||
|
/// \param id Structural tree parent id
|
||||||
|
/// \param index Index into the subarray
|
||||||
|
PDFObjectReference getParent(PDFInteger id, PDFInteger index) const;
|
||||||
|
|
||||||
/// Returns type from role. Role can be an entry in RoleMap dictionary,
|
/// Returns type from role. Role can be an entry in RoleMap dictionary,
|
||||||
/// or one of the standard roles.
|
/// or one of the standard roles.
|
||||||
/// \param role Role
|
/// \param role Role
|
||||||
@ -410,8 +417,6 @@ public:
|
|||||||
/// \param object Structure tree root object
|
/// \param object Structure tree root object
|
||||||
static PDFStructureTree parse(const PDFObjectStorage* storage, PDFObject object);
|
static PDFStructureTree parse(const PDFObjectStorage* storage, PDFObject object);
|
||||||
|
|
||||||
private:
|
|
||||||
|
|
||||||
struct ParentTreeEntry
|
struct ParentTreeEntry
|
||||||
{
|
{
|
||||||
PDFInteger id = 0;
|
PDFInteger id = 0;
|
||||||
@ -422,6 +427,13 @@ private:
|
|||||||
return id < other.id;
|
return id < other.id;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// Returns given page tree entry. If index is invalid,
|
||||||
|
/// empty parent tree entry is returned.
|
||||||
|
/// \param index Index
|
||||||
|
ParentTreeEntry getParentTreeEntry(PDFInteger index) const;
|
||||||
|
|
||||||
|
private:
|
||||||
using ParentTreeEntries = std::vector<ParentTreeEntry>;
|
using ParentTreeEntries = std::vector<ParentTreeEntry>;
|
||||||
|
|
||||||
std::map<QByteArray, PDFObjectReference> m_idTreeMap;
|
std::map<QByteArray, PDFObjectReference> m_idTreeMap;
|
||||||
@ -598,27 +610,6 @@ private:
|
|||||||
PDFObjectReference m_objectReference;
|
PDFObjectReference m_objectReference;
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Text extractor for structure tree. Can extract text to fill structure tree contents.
|
|
||||||
class PDFFORQTLIBSHARED_EXPORT PDFStructureTreeTextExtractor
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
explicit PDFStructureTreeTextExtractor(const PDFDocument* document, const PDFStructureTree* tree);
|
|
||||||
|
|
||||||
/// Performs text extracting algorithm. Only \p pageIndices
|
|
||||||
/// pages are processed for text extraction.
|
|
||||||
/// \param pageIndices Page indices
|
|
||||||
void perform(const std::vector<PDFInteger>& pageIndices);
|
|
||||||
|
|
||||||
/// Returns a list of errors/warnings
|
|
||||||
const QList<PDFRenderError>& getErrors() const { return m_errors; }
|
|
||||||
|
|
||||||
private:
|
|
||||||
QList<PDFRenderError> m_errors;
|
|
||||||
const PDFDocument* m_document;
|
|
||||||
const PDFStructureTree* m_tree;
|
|
||||||
QStringList m_unmatchedText;
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace pdf
|
} // namespace pdf
|
||||||
|
|
||||||
#endif // PDFSTRUCTURETREE_H
|
#endif // PDFSTRUCTURETREE_H
|
||||||
|
@ -649,7 +649,14 @@ void PDFConsole::writeText(QString text, QString codecName)
|
|||||||
{
|
{
|
||||||
// Write console failed. This can happen only, if outputHandle is not handle
|
// Write console failed. This can happen only, if outputHandle is not handle
|
||||||
// to console screen buffer, but, for example a file or a pipe.
|
// to console screen buffer, but, for example a file or a pipe.
|
||||||
if (QTextCodec* codec = QTextCodec::codecForName(codecName.toLatin1()))
|
QTextCodec* codec = QTextCodec::codecForName(codecName.toLatin1());
|
||||||
|
if (!codec)
|
||||||
|
{
|
||||||
|
codec = QTextCodec::codecForName("UTF-8");
|
||||||
|
writeError(QString("No codec found for '%1'. Defaulting to text codec '%2'.").arg(codecName, QString::fromLatin1(codec->name())), codecName);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (codec)
|
||||||
{
|
{
|
||||||
QByteArray encodedData = codec->fromUnicode(text);
|
QByteArray encodedData = codec->fromUnicode(text);
|
||||||
WriteFile(outputHandle, encodedData.constData(), encodedData.size(), nullptr, nullptr);
|
WriteFile(outputHandle, encodedData.constData(), encodedData.size(), nullptr, nullptr);
|
||||||
@ -675,7 +682,13 @@ void PDFConsole::writeError(QString text, QString codecName)
|
|||||||
{
|
{
|
||||||
// Write console failed. This can happen only, if outputHandle is not handle
|
// Write console failed. This can happen only, if outputHandle is not handle
|
||||||
// to console screen buffer, but, for example a file or a pipe.
|
// to console screen buffer, but, for example a file or a pipe.
|
||||||
if (QTextCodec* codec = QTextCodec::codecForName(codecName.toLatin1()))
|
QTextCodec* codec = QTextCodec::codecForName(codecName.toLatin1());
|
||||||
|
if (!codec)
|
||||||
|
{
|
||||||
|
codec = QTextCodec::codecForName("UTF-8");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (codec)
|
||||||
{
|
{
|
||||||
QByteArray encodedData = codec->fromUnicode(text);
|
QByteArray encodedData = codec->fromUnicode(text);
|
||||||
WriteFile(outputHandle, encodedData.constData(), encodedData.size(), nullptr, nullptr);
|
WriteFile(outputHandle, encodedData.constData(), encodedData.size(), nullptr, nullptr);
|
||||||
|
@ -43,7 +43,7 @@ struct PDFToolOptions
|
|||||||
{
|
{
|
||||||
// For option 'ConsoleFormat'
|
// For option 'ConsoleFormat'
|
||||||
PDFOutputFormatter::Style outputStyle = PDFOutputFormatter::Style::Text;
|
PDFOutputFormatter::Style outputStyle = PDFOutputFormatter::Style::Text;
|
||||||
QString outputCodec;
|
QString outputCodec = "UTF-8";
|
||||||
|
|
||||||
// For option 'DateFormat'
|
// For option 'DateFormat'
|
||||||
Qt::DateFormat outputDateFormat = Qt::DefaultLocaleShortDate;
|
Qt::DateFormat outputDateFormat = Qt::DefaultLocaleShortDate;
|
||||||
|
@ -65,17 +65,48 @@ int PDFToolFetchTextApplication::execute(const PDFToolOptions& options)
|
|||||||
pdf::PDFDocumentTextFlowFactory factory;
|
pdf::PDFDocumentTextFlowFactory factory;
|
||||||
pdf::PDFDocumentTextFlow documentTextFlow = factory.create(&document, pages, options.textAnalysisAlgorithm);
|
pdf::PDFDocumentTextFlow documentTextFlow = factory.create(&document, pages, options.textAnalysisAlgorithm);
|
||||||
|
|
||||||
|
PDFOutputFormatter formatter(options.outputStyle, options.outputCodec);
|
||||||
|
formatter.beginDocument("text-extraction", QString());
|
||||||
|
formatter.endl();
|
||||||
|
|
||||||
|
for (const pdf::PDFDocumentTextFlow::Item& item : documentTextFlow.getItems())
|
||||||
|
{
|
||||||
|
if (item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureItemStart))
|
||||||
|
{
|
||||||
|
formatter.beginHeader("item", item.text);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!item.text.isEmpty())
|
||||||
|
{
|
||||||
|
formatter.writeText("text", item.text);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureItemEnd))
|
||||||
|
{
|
||||||
|
formatter.endHeader();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (item.flags.testFlag(pdf::PDFDocumentTextFlow::PageEnd))
|
||||||
|
{
|
||||||
|
formatter.endl();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
formatter.endDocument();
|
||||||
|
|
||||||
for (const pdf::PDFRenderError& error : factory.getErrors())
|
for (const pdf::PDFRenderError& error : factory.getErrors())
|
||||||
{
|
{
|
||||||
PDFConsole::writeError(error.message, options.outputCodec);
|
PDFConsole::writeError(error.message, options.outputCodec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PDFConsole::writeText(formatter.getString(), options.outputCodec);
|
||||||
|
|
||||||
return ExitSuccess;
|
return ExitSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
PDFToolAbstractApplication::Options PDFToolFetchTextApplication::getOptionsFlags() const
|
PDFToolAbstractApplication::Options PDFToolFetchTextApplication::getOptionsFlags() const
|
||||||
{
|
{
|
||||||
return ConsoleFormat | OpenDocument | TextAnalysis;
|
return ConsoleFormat | OpenDocument | PageSelector | TextAnalysis;
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace pdftool
|
} // namespace pdftool
|
||||||
|
Loading…
x
Reference in New Issue
Block a user