Document content flow basics

This commit is contained in:
Jakub Melka 2020-10-17 16:56:39 +02:00
parent b0f8e1f1e3
commit 0ccdb1e46f
10 changed files with 664 additions and 290 deletions

View File

@ -1362,6 +1362,7 @@ void PDFAnnotationManager::drawAnnotationUsingAppearanceStream(const PageAnnotat
QByteArray content = m_document->getDecodedStream(formStream);
PDFObject resources = m_document->getObject(formDictionary->get("Resources"));
PDFObject transparencyGroup = m_document->getObject(formDictionary->get("Group"));
const PDFInteger formStructuralParentKey = loader.readIntegerFromDictionary(formDictionary, "StructParent", page->getStructureParentKey());
if (formBoundingBox.isEmpty() || annotationRectangle.isEmpty())
{
@ -1409,7 +1410,7 @@ void PDFAnnotationManager::drawAnnotationUsingAppearanceStream(const PageAnnotat
if (isContentVisible)
{
pdfPainter.processForm(AA, formBoundingBox, resources, transparencyGroup, content);
pdfPainter.processForm(AA, formBoundingBox, resources, transparencyGroup, content, formStructuralParentKey);
}
}

View File

@ -227,7 +227,7 @@ bool PDFTextLayoutGenerator::isContentKindSuppressed(ContentKind kind) const
void PDFTextLayoutGenerator::performOutputCharacter(const PDFTextCharacterInfo& info)
{
if (!isContentSuppressed())
if (!isContentSuppressed() && !info.character.isSpace())
{
m_textLayout.addCharacter(info);
}

View File

@ -26,6 +26,483 @@
namespace pdf
{
class PDFStructureTreeReferenceCollector : public PDFStructureTreeAbstractVisitor
{
public:
explicit inline PDFStructureTreeReferenceCollector(std::map<PDFObjectReference, const PDFStructureItem*>* mapping) :
m_mapping(mapping)
{
}
virtual void visitStructureTree(const PDFStructureTree* structureTree) override;
virtual void visitStructureElement(const PDFStructureElement* structureElement) override;
virtual void visitStructureMarkedContentReference(const PDFStructureMarkedContentReference* structureMarkedContentReference) override;
virtual void visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference) override;
private:
void addReference(const PDFStructureItem* structureObjectReference);
std::map<PDFObjectReference, const PDFStructureItem*>* m_mapping;
};
/// Registers the structure tree root (addReference() skips items without
/// a valid self reference) and continues the traversal via acceptChildren().
void PDFStructureTreeReferenceCollector::visitStructureTree(const PDFStructureTree* structureTree)
{
// The item itself is registered before its children are visited
addReference(structureTree);
acceptChildren(structureTree);
}
/// Registers the structure element (addReference() skips items without
/// a valid self reference) and continues the traversal via acceptChildren().
void PDFStructureTreeReferenceCollector::visitStructureElement(const PDFStructureElement* structureElement)
{
// The item itself is registered before its children are visited
addReference(structureElement);
acceptChildren(structureElement);
}
/// Registers the marked content reference item (addReference() skips items
/// without a valid self reference) and continues the traversal via acceptChildren().
void PDFStructureTreeReferenceCollector::visitStructureMarkedContentReference(const PDFStructureMarkedContentReference* structureMarkedContentReference)
{
// The item itself is registered before its children are visited
addReference(structureMarkedContentReference);
acceptChildren(structureMarkedContentReference);
}
/// Registers the object reference item (addReference() skips items without
/// a valid self reference) and continues the traversal via acceptChildren().
void PDFStructureTreeReferenceCollector::visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference)
{
// The item itself is registered before its children are visited
addReference(structureObjectReference);
acceptChildren(structureObjectReference);
}
/// Stores \p structureItem into the mapping under its self reference.
/// Items whose self reference is not valid are not stored.
void PDFStructureTreeReferenceCollector::addReference(const PDFStructureItem* structureItem)
{
    const auto selfReference = structureItem->getSelfReference();
    if (!selfReference.isValid())
    {
        // Nothing to use as a key
        return;
    }

    (*m_mapping)[selfReference] = structureItem;
}
/// One entry of an extracted text sequence. It is either a start tag or an end
/// tag carrying a structure tree item, or a piece of text (item is then nullptr
/// when created by createText()).
struct PDFStructureTreeTextItem
{
    /// Kind of the sequence entry
    enum class Type
    {
        StartTag,
        EndTag,
        Text
    };

    PDFStructureTreeTextItem() = default;

    PDFStructureTreeTextItem(Type type, const PDFStructureItem* item, QString text) :
        type(type),
        item(item),
        text(qMove(text))
    {

    }

    /// Creates a text entry, which is not bound to any structure tree item
    static PDFStructureTreeTextItem createText(QString text)
    {
        return PDFStructureTreeTextItem(Type::Text, nullptr, qMove(text));
    }

    /// Creates a start tag entry for \p item (with empty text)
    static PDFStructureTreeTextItem createStartTag(const PDFStructureItem* item)
    {
        return PDFStructureTreeTextItem(Type::StartTag, item, QString());
    }

    /// Creates an end tag entry for \p item (with empty text)
    static PDFStructureTreeTextItem createEndTag(const PDFStructureItem* item)
    {
        return PDFStructureTreeTextItem(Type::EndTag, item, QString());
    }

    Type type = Type::Text;
    const PDFStructureItem* item = nullptr;
    QString text;
};
using PDFStructureTreeTextSequence = std::vector<PDFStructureTreeTextItem>;
/// Text extractor for structure tree. For each processed page it produces
/// a sequence of start tags, end tags and texts; the sequences are stored
/// per page and can be accessed using the getters below.
class PDFStructureTreeTextExtractor
{
public:
    /// Extraction options (bit flags).
    /// NOTE(review): perform() as visible in this file only tests CreateTreeMapping.
    enum Option
    {
        None                = 0,
        SkipArtifact        = 1 << 0,   ///< Skip content marked as 'Artifact'
        AdjustReversedText  = 1 << 1,   ///< Adjust reversed text
        CreateTreeMapping   = 1 << 2,   ///< Create text mapping to structure tree item
    };
    Q_DECLARE_FLAGS(Options, Option)

    explicit PDFStructureTreeTextExtractor(const PDFDocument* document, const PDFStructureTree* tree, Options options);

    /// Runs the text extraction. Only pages listed in \p pageIndices
    /// are processed.
    /// \param pageIndices Page indices
    void perform(const std::vector<PDFInteger>& pageIndices);

    /// Returns a list of errors/warnings collected during extraction
    const QList<PDFRenderError>& getErrors() const { return m_errors; }

    /// Returns a list of texts, which were not matched to any item
    const QStringList& getUnmatchedText() const { return m_unmatchedText; }

    /// Returns text sequence of the given page. If no sequence is stored
    /// for the page, then empty text sequence is returned.
    /// \param pageNumber Page index, as it was passed to perform()
    const PDFStructureTreeTextSequence& getTextSequence(PDFInteger pageNumber) const;

    /// Returns texts of the given structure tree item. If the item is not
    /// found, then empty list is returned. The data are available only if
    /// the \p CreateTreeMapping flag is set.
    /// \param item Item
    const QStringList& getText(const PDFStructureItem* item) const;

private:
    QList<PDFRenderError> m_errors;
    const PDFDocument* m_document;
    const PDFStructureTree* m_tree;
    QStringList m_unmatchedText;
    std::map<PDFInteger, PDFStructureTreeTextSequence> m_textSequences;
    std::map<const PDFStructureItem*, QStringList> m_textForItems;
    Options m_options;
};
Q_DECLARE_OPERATORS_FOR_FLAGS(PDFStructureTreeTextExtractor::Options)
class PDFStructureTreeTextContentProcessor : public PDFPageContentProcessor
{
using BaseClass = PDFPageContentProcessor;
public:
explicit PDFStructureTreeTextContentProcessor(PDFRenderer::Features features,
const PDFPage* page,
const PDFDocument* document,
const PDFFontCache* fontCache,
const PDFCMS* cms,
const PDFOptionalContentActivity* optionalContentActivity,
QMatrix pagePointToDevicePointMatrix,
const PDFMeshQualitySettings& meshQualitySettings,
const PDFStructureTree* tree,
const std::map<PDFObjectReference, const PDFStructureItem*>* mapping) :
BaseClass(page, document, fontCache, cms, optionalContentActivity, pagePointToDevicePointMatrix, meshQualitySettings),
m_features(features),
m_tree(tree),
m_mapping(mapping)
{
}
PDFStructureTreeTextSequence& takeSequence() { return m_textSequence; }
QStringList& takeUnmatchedTexts() { return m_unmatchedText; }
protected:
virtual bool isContentSuppressedByOC(PDFObjectReference ocgOrOcmd) override;
virtual bool isContentKindSuppressed(ContentKind kind) const override;
virtual void performOutputCharacter(const PDFTextCharacterInfo& info) override;
virtual void performMarkedContentBegin(const QByteArray& tag, const PDFObject& properties) override;
virtual void performMarkedContentEnd() override;
private:
const PDFStructureItem* getStructureTreeItemFromMCID(PDFInteger mcid) const;
void finishText();
struct MarkedContentInfo
{
QByteArray tag;
PDFInteger mcid = -1;
const PDFStructureItem* structureTreeItem = nullptr;
};
PDFRenderer::Features m_features;
const PDFStructureTree* m_tree;
const std::map<PDFObjectReference, const PDFStructureItem*>* m_mapping;
std::vector<MarkedContentInfo> m_markedContentInfoStack;
QString m_currentText;
PDFStructureTreeTextSequence m_textSequence;
QStringList m_unmatchedText;
};
void PDFStructureTreeTextContentProcessor::finishText()
{
m_currentText = m_currentText.trimmed();
if (!m_currentText.isEmpty())
{
m_textSequence.emplace_back(PDFStructureTreeTextItem::createText(qMove(m_currentText)));
}
m_currentText = QString();
}
void PDFStructureTreeTextContentProcessor::performMarkedContentBegin(const QByteArray& tag, const PDFObject& properties)
{
MarkedContentInfo info;
info.tag = tag;
if (properties.isDictionary())
{
const PDFDictionary* dictionary = properties.getDictionary();
PDFObject mcid = dictionary->get("MCID");
if (mcid.isInt())
{
// We must finish text, because we can have a sequence of text,
// then subitem, then text, and followed by another subitem. They
// can be interleaved.
finishText();
info.mcid = mcid.getInteger();
info.structureTreeItem = getStructureTreeItemFromMCID(info.mcid);
if (!info.structureTreeItem)
{
reportRenderError(RenderErrorType::Error, PDFTranslationContext::tr("Structure tree item for MCID %1 not found.").arg(info.mcid));
}
if (info.structureTreeItem)
{
m_textSequence.emplace_back(PDFStructureTreeTextItem::createStartTag(info.structureTreeItem));
}
}
}
m_markedContentInfoStack.emplace_back(qMove(info));
}
void PDFStructureTreeTextContentProcessor::performMarkedContentEnd()
{
MarkedContentInfo info = qMove(m_markedContentInfoStack.back());
m_markedContentInfoStack.pop_back();
if (info.mcid != -1)
{
finishText();
if (info.structureTreeItem)
{
m_textSequence.emplace_back(PDFStructureTreeTextItem::createEndTag(info.structureTreeItem));
}
}
// Check for text, which doesn't belong to any structure tree item
if (m_markedContentInfoStack.empty())
{
m_currentText = m_currentText.trimmed();
if (!m_currentText.isEmpty())
{
m_unmatchedText << qMove(m_currentText);
}
}
}
/// Resolves structure tree item for the marked content identifier. The parent
/// reference is looked up in the structure tree using the current structural
/// parent key and \p mcid, then it is translated to an item using the mapping.
/// \param mcid Marked content identifier
/// \returns Structure tree item, or nullptr, if the reference is not in the mapping
const PDFStructureItem* PDFStructureTreeTextContentProcessor::getStructureTreeItemFromMCID(PDFInteger mcid) const
{
    const auto parentReference = m_tree->getParent(getStructuralParentKey(), mcid);
    const auto found = m_mapping->find(parentReference);
    return (found != m_mapping->cend()) ? found->second : nullptr;
}
/// Optional content never suppresses anything, when the IgnoreOptionalContent
/// feature is set; otherwise the decision is left to the base class.
bool PDFStructureTreeTextContentProcessor::isContentSuppressedByOC(PDFObjectReference ocgOrOcmd)
{
    const bool ignoreOptionalContent = m_features.testFlag(PDFRenderer::IgnoreOptionalContent);
    return !ignoreOptionalContent && BaseClass::isContentSuppressedByOC(ocgOrOcmd);
}
/// Returns true for shapes, text, images and shading; returns false for
/// tiling, because tiling can have text. Unknown kinds assert in debug
/// builds and are reported as not suppressed.
bool PDFStructureTreeTextContentProcessor::isContentKindSuppressed(ContentKind kind) const
{
    switch (kind)
    {
        case ContentKind::Tiling:
            return false; // Tiling can have text

        case ContentKind::Shapes:
        case ContentKind::Text:
        case ContentKind::Images:
        case ContentKind::Shading:
            return true;

        default:
            break;
    }

    // Unexpected content kind
    Q_ASSERT(false);
    return false;
}
/// Appends the output character to the accumulated text, unless the content
/// is currently suppressed, or the character is null.
void PDFStructureTreeTextContentProcessor::performOutputCharacter(const PDFTextCharacterInfo& info)
{
    if (isContentSuppressed() || info.character.isNull())
    {
        return;
    }

    m_currentText.push_back(info.character);
}
/// Constructs the extractor. The constructor only stores its arguments,
/// no extraction is performed here.
/// \param document Document
/// \param tree Structure tree
/// \param options Extraction options
PDFStructureTreeTextExtractor::PDFStructureTreeTextExtractor(const PDFDocument* document, const PDFStructureTree* tree, Options options) :
m_document(document),
m_tree(tree),
m_options(options)
{
}
void PDFStructureTreeTextExtractor::perform(const std::vector<PDFInteger>& pageIndices)
{
std::map<PDFObjectReference, const PDFStructureItem*> mapping;
PDFStructureTreeReferenceCollector referenceCollector(&mapping);
m_tree->accept(&referenceCollector);
PDFFontCache fontCache(DEFAULT_FONT_CACHE_LIMIT, DEFAULT_REALIZED_FONT_CACHE_LIMIT);
QMutex mutex;
PDFCMSGeneric cms;
PDFMeshQualitySettings mqs;
PDFOptionalContentActivity oca(m_document, OCUsage::Export, nullptr);
pdf::PDFModifiedDocument md(const_cast<PDFDocument*>(m_document), &oca);
fontCache.setDocument(md);
fontCache.setCacheShrinkEnabled(nullptr, false);
auto generateTextLayout = [&, this](PDFInteger pageIndex)
{
const PDFCatalog* catalog = m_document->getCatalog();
if (!catalog->getPage(pageIndex))
{
// Invalid page index
return;
}
const PDFPage* page = catalog->getPage(pageIndex);
Q_ASSERT(page);
PDFStructureTreeTextContentProcessor processor(PDFRenderer::IgnoreOptionalContent, page, m_document, &fontCache, &cms, &oca, QMatrix(), mqs, m_tree, &mapping);
QList<PDFRenderError> errors = processor.processContents();
QMutexLocker lock(&mutex);
m_textSequences[pageIndex] = qMove(processor.takeSequence());
m_unmatchedText << qMove(processor.takeUnmatchedTexts());
m_errors.append(qMove(errors));
};
PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, pageIndices.begin(), pageIndices.end(), generateTextLayout);
fontCache.setCacheShrinkEnabled(nullptr, true);
if (m_options.testFlag(CreateTreeMapping))
{
for (const auto& sequence : m_textSequences)
{
std::stack<const PDFStructureItem*> stack;
for (const PDFStructureTreeTextItem& sequenceItem : sequence.second)
{
switch (sequenceItem.type)
{
case PDFStructureTreeTextItem::Type::StartTag:
stack.push(sequenceItem.item);
break;
case PDFStructureTreeTextItem::Type::EndTag:
stack.pop();
break;
case PDFStructureTreeTextItem::Type::Text:
if (!stack.empty())
{
m_textForItems[stack.top()] << sequenceItem.text;
}
break;
}
}
}
}
}
/// Returns text sequence stored for the given page. If no sequence is stored
/// for \p pageIndex, then a shared empty text sequence is returned.
/// \param pageIndex Page index
const PDFStructureTreeTextSequence& PDFStructureTreeTextExtractor::getTextSequence(PDFInteger pageIndex) const
{
    auto it = m_textSequences.find(pageIndex);
    if (it != m_textSequences.cend())
    {
        return it->second;
    }

    // Immutable shared fallback (const, consistently with getText())
    static const PDFStructureTreeTextSequence dummy;
    return dummy;
}
/// Returns texts stored for the given structure tree item. If nothing is
/// stored for \p item, then a shared empty list is returned.
/// \param item Item
const QStringList& PDFStructureTreeTextExtractor::getText(const PDFStructureItem* item) const
{
    const auto found = m_textForItems.find(item);
    if (found == m_textForItems.cend())
    {
        // Unknown item - hand out an immutable empty list
        static const QStringList emptyList;
        return emptyList;
    }

    return found->second;
}
class PDFStructureTreeTextFlowCollector : public PDFStructureTreeAbstractVisitor
{
public:
explicit PDFStructureTreeTextFlowCollector(PDFDocumentTextFlow::Items* items, const PDFStructureTreeTextExtractor* extractor) :
m_items(items),
m_extractor(extractor)
{
}
virtual void visitStructureTree(const PDFStructureTree* structureTree) override;
virtual void visitStructureElement(const PDFStructureElement* structureElement) override;
virtual void visitStructureMarkedContentReference(const PDFStructureMarkedContentReference* structureMarkedContentReference) override;
virtual void visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference) override;
private:
PDFDocumentTextFlow::Items* m_items;
const PDFStructureTreeTextExtractor* m_extractor;
std::vector<bool> m_hasContentStack;
};
/// Wraps the output of the whole structure tree by a pair of structure item
/// start/end flow items (page index -1, empty text); the children are
/// processed in between via acceptChildren().
void PDFStructureTreeTextFlowCollector::visitStructureTree(const PDFStructureTree* structureTree)
{
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureItemStart, -1, QString()});
acceptChildren(structureTree);
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureItemEnd, -1, QString()});
}
void PDFStructureTreeTextFlowCollector::visitStructureElement(const PDFStructureElement* structureElement)
{
size_t index = m_items->size();
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureItemStart, -1, QString()});
// Mark stack so we can delete unused items
m_hasContentStack.push_back(false);
for (const QString& string : m_extractor->getText(structureElement))
{
for (size_t i = 0; i < m_hasContentStack.size(); ++i)
{
m_hasContentStack[i] = true;
}
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::Text, -1, string});
}
acceptChildren(structureElement);
const bool hasContent = m_hasContentStack.back();
m_hasContentStack.pop_back();
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureItemEnd, -1, QString()});
if (!hasContent)
{
// Delete unused content
m_items->erase(std::next(m_items->begin(), index), m_items->end());
}
}
/// Marked content reference produces no flow items by itself, the traversal
/// just continues via acceptChildren().
void PDFStructureTreeTextFlowCollector::visitStructureMarkedContentReference(const PDFStructureMarkedContentReference* structureMarkedContentReference)
{
acceptChildren(structureMarkedContentReference);
}
/// Object reference produces no flow items by itself, the traversal
/// just continues via acceptChildren().
void PDFStructureTreeTextFlowCollector::visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference)
{
acceptChildren(structureObjectReference);
}
PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* document, const std::vector<PDFInteger>& pageIndices, Algorithm algorithm)
{
PDFDocumentTextFlow result;
@ -121,9 +598,39 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume
break;
}
PDFStructureTreeTextExtractor extractor(document, &structureTree);
PDFStructureTreeTextExtractor extractor(document, &structureTree, PDFStructureTreeTextExtractor::SkipArtifact | PDFStructureTreeTextExtractor::AdjustReversedText | PDFStructureTreeTextExtractor::CreateTreeMapping);
extractor.perform(pageIndices);
PDFDocumentTextFlow::Items flowItems;
PDFStructureTreeTextFlowCollector collector(&flowItems, &extractor);
structureTree.accept(&collector);
result = PDFDocumentTextFlow(qMove(flowItems));
m_errors.append(extractor.getErrors());
break;
}
case Algorithm::Content:
{
PDFStructureTreeTextExtractor extractor(document, &structureTree, PDFStructureTreeTextExtractor::None);
extractor.perform(pageIndices);
PDFDocumentTextFlow::Items flowItems;
for (PDFInteger pageIndex : pageIndices)
{
flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::PageStart, pageIndex, PDFTranslationContext::tr("Page %1").arg(pageIndex + 1) });
for (const PDFStructureTreeTextItem& sequenceItem : extractor.getTextSequence(pageIndex))
{
if (sequenceItem.type == PDFStructureTreeTextItem::Type::Text)
{
flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::Text, pageIndex, sequenceItem.text });
}
}
flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::PageEnd, pageIndex, QString() });
}
result = PDFDocumentTextFlow(qMove(flowItems));
m_errors.append(extractor.getErrors());
break;
}

View File

@ -244,11 +244,14 @@ PDFPageContentProcessor::PDFPageContentProcessor(const PDFPage* page,
m_drawingUncoloredTilingPatternState(0),
m_patternBaseMatrix(pagePointToDevicePointMatrix),
m_pagePointToDevicePointMatrix(pagePointToDevicePointMatrix),
m_meshQualitySettings(meshQualitySettings)
m_meshQualitySettings(meshQualitySettings),
m_structuralParentKey(0)
{
Q_ASSERT(page);
Q_ASSERT(document);
m_structuralParentKey = page->getStructureParentKey();
PDFExecutionPolicy::startProcessingContentStream();
QPainterPath pageRectPath;
@ -338,6 +341,7 @@ QList<PDFRenderError> PDFPageContentProcessor::processContents()
}
}
finishMarkedContent();
return m_errorList;
}
@ -659,9 +663,11 @@ void PDFPageContentProcessor::processForm(const QMatrix& matrix,
const QRectF& boundingBox,
const PDFObject& resources,
const PDFObject& transparencyGroup,
const QByteArray& content)
const QByteArray& content,
PDFInteger formStructuralParent)
{
PDFPageContentProcessorStateGuard guard(this);
PDFTemporaryValueChange structuralParentChangeGuard(&m_structuralParentKey, formStructuralParent);
std::unique_ptr<PDFTransparencyGroupGuard> guard2;
if (transparencyGroup.isDictionary())
@ -1738,6 +1744,19 @@ void PDFPageContentProcessor::setRenderingIntentByName(QByteArray renderingInten
m_graphicState.setRenderingIntentName(renderingIntentName);
}
/// Finishes marked content, if some end of marked content is missing. In such
/// a case, single error is added to the error list and the end operator is
/// invoked repeatedly, until the marked content stack is empty.
void PDFPageContentProcessor::finishMarkedContent()
{
    if (m_markedContentStack.empty())
    {
        // Marked content is well formed
        return;
    }

    m_errorList.append(PDFRenderError(RenderErrorType::Error, PDFTranslationContext::tr("Marked content is not well formed (not enough EMC operators).")));

    // Stack is not empty here, so at least one end is always issued
    do
    {
        operatorMarkedContentEnd();
    } while (!m_markedContentStack.empty());
}
void PDFPageContentProcessor::reportRenderErrorOnce(RenderErrorType type, QString message)
{
if (!m_onceReportedErrors.count(message))
@ -2936,7 +2955,10 @@ void PDFPageContentProcessor::operatorPaintXObject(PDFPageContentProcessor::PDFO
// Transparency group
PDFObject transparencyGroup = m_document->getObject(streamDictionary->get("Group"));
processForm(transformationMatrix, boundingBox, resources, transparencyGroup, content);
// Form structural parent key
const PDFInteger formStructuralParentKey = loader.readIntegerFromDictionary(streamDictionary, "StructParent", m_structuralParentKey);
processForm(transformationMatrix, boundingBox, resources, transparencyGroup, content, formStructuralParentKey);
}
else
{
@ -3075,14 +3097,23 @@ void PDFPageContentProcessor::drawText(const TextSequence& textSequence)
if (item.glyph)
{
const QPainterPath& glyphPath = *item.glyph;
if (!glyphPath.isEmpty())
{
QMatrix textRenderingMatrix = adjustMatrix * textMatrix;
QMatrix toDeviceSpaceTransform = textRenderingMatrix * m_graphicState.getCurrentTransformationMatrix();
if (!glyphPath.isEmpty())
{
QPainterPath transformedGlyph = textRenderingMatrix.map(glyphPath);
processPathPainting(transformedGlyph, stroke, fill, true, transformedGlyph.fillRule());
if (!item.character.isNull() && !item.character.isSpace())
if (clipped)
{
// Clipping is enabled, we must transform to the device coordinates
m_textClippingPath = m_textClippingPath.united(toDeviceSpaceTransform.map(glyphPath));
}
}
if (!item.character.isNull())
{
// Output character
PDFTextCharacterInfo info;
@ -3094,13 +3125,6 @@ void PDFPageContentProcessor::drawText(const TextSequence& textSequence)
info.matrix = toDeviceSpaceTransform;
performOutputCharacter(info);
}
if (clipped)
{
// Clipping is enabled, we must transform to the device coordinates
m_textClippingPath = m_textClippingPath.united(toDeviceSpaceTransform.map(glyphPath));
}
}
}
displacementX = advance.x();
@ -3170,7 +3194,7 @@ void PDFPageContentProcessor::drawText(const TextSequence& textSequence)
processContent(*item.characterContentStream);
if (!item.character.isNull() && !item.character.isSpace())
if (!item.character.isNull())
{
// Output character
PDFTextCharacterInfo info;

View File

@ -219,7 +219,13 @@ public:
/// \param resources Resources, assigned to the form
/// \param transparencyGroup Transparency group object
/// \param content Content stream of the form
void processForm(const QMatrix& matrix, const QRectF& boundingBox, const PDFObject& resources, const PDFObject& transparencyGroup, const QByteArray& content);
/// \param formStructuralParent Structural parent key for form
void processForm(const QMatrix& matrix,
const QRectF& boundingBox,
const PDFObject& resources,
const PDFObject& transparencyGroup,
const QByteArray& content,
PDFInteger formStructuralParent);
/// Initialize stream processor for processing content streams. For example,
/// graphic state is initialized to default, and default color spaces are initialized.
@ -572,6 +578,9 @@ protected:
/// shading, images, ...)
virtual bool isContentKindSuppressed(ContentKind kind) const;
/// Returns current structural parent key
PDFInteger getStructuralParentKey() const { return m_structuralParentKey; }
/// Returns current graphic state
const PDFPageContentProcessorState* getGraphicState() const { return &m_graphicState; }
@ -918,6 +927,9 @@ private:
/// Set rendering intent by name
void setRenderingIntentByName(QByteArray renderingIntentName);
/// Finishes marked content (if end of marked content is missing)
void finishMarkedContent();
const PDFPage* m_page;
const PDFDocument* m_document;
const PDFFontCache* m_fontCache;
@ -990,6 +1002,9 @@ private:
/// Set with rendering errors, which were reported (and should be reported once)
std::set<QString> m_onceReportedErrors;
/// Active structural parent key
PDFInteger m_structuralParentKey;
};
} // namespace pdf

View File

@ -505,6 +505,19 @@ std::vector<PDFObjectReference> PDFStructureTree::getParents(PDFInteger id) cons
return result;
}
/// Returns reference stored in the parent tree for given id and index. All
/// parent tree entries with the given \p id are found (entries are expected
/// to be sorted) and the \p index -th of them is returned.
/// \param id Structural tree parent id
/// \param index Index among the entries with the same id
/// \returns Reference, or default-constructed reference, if index is out of range
PDFObjectReference PDFStructureTree::getParent(PDFInteger id, PDFInteger index) const
{
    Q_ASSERT(std::is_sorted(m_parentTreeEntries.cbegin(), m_parentTreeEntries.cend()));

    const ParentTreeEntry searchedEntry{ id, PDFObjectReference() };
    const auto range = std::equal_range(m_parentTreeEntries.cbegin(), m_parentTreeEntries.cend(), searchedEntry);
    const PDFInteger count = std::distance(range.first, range.second);

    if (index < 0 || index >= count)
    {
        // No such entry
        return PDFObjectReference();
    }

    return std::next(range.first, index)->reference;
}
PDFStructureItem::Type PDFStructureTree::getTypeFromRole(const QByteArray& role) const
{
auto it = m_roleMap.find(role);
@ -556,16 +569,14 @@ PDFStructureTree PDFStructureTree::parse(const PDFObjectStorage* storage, PDFObj
return id < other.id;
}
static ParentTreeParseEntry parse(PDFInteger id, const PDFObjectStorage*, const PDFObject& object)
static ParentTreeParseEntry parse(PDFInteger id, const PDFObjectStorage* storage, const PDFObject& object)
{
if (object.isReference())
{
return ParentTreeParseEntry{ id, { object.getReference() } };
}
else if (object.isArray())
const PDFObject& dereferencedObject = storage->getObject(object);
if (dereferencedObject.isArray())
{
std::vector<PDFObjectReference> references;
for (const PDFObject& object : *object.getArray())
for (const PDFObject& object : *dereferencedObject.getArray())
{
if (object.isReference())
{
@ -575,6 +586,10 @@ PDFStructureTree PDFStructureTree::parse(const PDFObjectStorage* storage, PDFObj
return ParentTreeParseEntry{ id, qMove(references) };
}
else if (object.isReference())
{
return ParentTreeParseEntry{ id, { object.getReference() } };
}
return ParentTreeParseEntry{ id, { } };
}
@ -629,6 +644,16 @@ PDFStructureTree PDFStructureTree::parse(const PDFObjectStorage* storage, PDFObj
return tree;
}
/// Returns parent tree entry at given position. If \p index is out of range,
/// then default-constructed parent tree entry is returned.
/// \param index Index
PDFStructureTree::ParentTreeEntry PDFStructureTree::getParentTreeEntry(PDFInteger index) const
{
    const PDFInteger entryCount = PDFInteger(m_parentTreeEntries.size());
    if (index < 0 || index >= entryCount)
    {
        return ParentTreeEntry();
    }

    return m_parentTreeEntries[index];
}
PDFStructureItemPointer PDFStructureItem::parse(const PDFObjectStorage* storage, PDFObject object, PDFMarkedObjectsContext* context, PDFStructureItem* parent)
{
if (const PDFDictionary* dictionary = storage->getDictionaryFromObject(object))
@ -939,237 +964,4 @@ void PDFStructureTreeAbstractVisitor::acceptChildren(const PDFStructureItem* ite
}
}
class PDFStructureTreeReferenceCollector : public PDFStructureTreeAbstractVisitor
{
public:
explicit inline PDFStructureTreeReferenceCollector(std::map<PDFObjectReference, const PDFStructureItem*>* mapping) :
m_mapping(mapping)
{
}
virtual void visitStructureTree(const PDFStructureTree* structureTree) override;
virtual void visitStructureElement(const PDFStructureElement* structureElement) override;
virtual void visitStructureMarkedContentReference(const PDFStructureMarkedContentReference* structureMarkedContentReference) override;
virtual void visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference) override;
private:
void addReference(const PDFStructureItem* structureObjectReference);
std::map<PDFObjectReference, const PDFStructureItem*>* m_mapping;
};
/// Registers the structure tree root (addReference() skips items without
/// a valid self reference) and continues the traversal via acceptChildren().
void PDFStructureTreeReferenceCollector::visitStructureTree(const PDFStructureTree* structureTree)
{
// The item itself is registered before its children are visited
addReference(structureTree);
acceptChildren(structureTree);
}
/// Registers the structure element (addReference() skips items without
/// a valid self reference) and continues the traversal via acceptChildren().
void PDFStructureTreeReferenceCollector::visitStructureElement(const PDFStructureElement* structureElement)
{
// The item itself is registered before its children are visited
addReference(structureElement);
acceptChildren(structureElement);
}
/// Registers the marked content reference item (addReference() skips items
/// without a valid self reference) and continues the traversal via acceptChildren().
void PDFStructureTreeReferenceCollector::visitStructureMarkedContentReference(const PDFStructureMarkedContentReference* structureMarkedContentReference)
{
// The item itself is registered before its children are visited
addReference(structureMarkedContentReference);
acceptChildren(structureMarkedContentReference);
}
/// Registers the object reference item (addReference() skips items without
/// a valid self reference) and continues the traversal via acceptChildren().
void PDFStructureTreeReferenceCollector::visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference)
{
// The item itself is registered before its children are visited
addReference(structureObjectReference);
acceptChildren(structureObjectReference);
}
/// Stores \p structureItem into the mapping under its self reference.
/// Items whose self reference is not valid are not stored.
void PDFStructureTreeReferenceCollector::addReference(const PDFStructureItem* structureItem)
{
    const auto selfReference = structureItem->getSelfReference();
    if (!selfReference.isValid())
    {
        // Nothing to use as a key
        return;
    }

    (*m_mapping)[selfReference] = structureItem;
}
class PDFStructureTreeTextContentProcessor : public PDFPageContentProcessor
{
using BaseClass = PDFPageContentProcessor;
public:
explicit PDFStructureTreeTextContentProcessor(PDFRenderer::Features features,
const PDFPage* page,
const PDFDocument* document,
const PDFFontCache* fontCache,
const PDFCMS* cms,
const PDFOptionalContentActivity* optionalContentActivity,
QMatrix pagePointToDevicePointMatrix,
const PDFMeshQualitySettings& meshQualitySettings) :
BaseClass(page, document, fontCache, cms, optionalContentActivity, pagePointToDevicePointMatrix, meshQualitySettings),
m_features(features)
{
}
std::map<PDFInteger, QStringList>& takeTexts() { return m_text; }
QStringList& takeUnmatchedTexts() { return m_unmatchedText; }
protected:
virtual bool isContentSuppressedByOC(PDFObjectReference ocgOrOcmd) override;
virtual bool isContentKindSuppressed(ContentKind kind) const override;
virtual void performOutputCharacter(const PDFTextCharacterInfo& info) override;
virtual void performMarkedContentBegin(const QByteArray& tag, const PDFObject& properties) override;
virtual void performMarkedContentEnd() override;
private:
struct MarkedContentInfo
{
QByteArray tag;
PDFInteger mcid = -1;
};
PDFRenderer::Features m_features;
std::vector<MarkedContentInfo> m_markedContentInfoStack;
QString m_currentText;
std::map<PDFInteger, QStringList> m_text;
QStringList m_unmatchedText;
};
void PDFStructureTreeTextContentProcessor::performMarkedContentBegin(const QByteArray& tag, const PDFObject& properties)
{
MarkedContentInfo info;
info.tag = tag;
if (properties.isDictionary())
{
const PDFDictionary* dictionary = properties.getDictionary();
PDFObject mcid = dictionary->get("MCID");
if (mcid.isInt())
{
info.mcid = mcid.getInteger();
}
}
m_markedContentInfoStack.emplace_back(qMove(info));
}
void PDFStructureTreeTextContentProcessor::performMarkedContentEnd()
{
MarkedContentInfo info = qMove(m_markedContentInfoStack.back());
m_markedContentInfoStack.pop_back();
if (info.mcid != -1)
{
if (!m_currentText.isEmpty())
{
m_text[info.mcid].push_back(qMove(m_currentText));
}
m_currentText = QString();
}
if (m_markedContentInfoStack.empty() && !m_currentText.isEmpty())
{
m_unmatchedText << qMove(m_currentText);
m_currentText = QString();
}
}
/// Optional content never suppresses anything, when the IgnoreOptionalContent
/// feature is set; otherwise the decision is left to the base class.
bool PDFStructureTreeTextContentProcessor::isContentSuppressedByOC(PDFObjectReference ocgOrOcmd)
{
    const bool ignoreOptionalContent = m_features.testFlag(PDFRenderer::IgnoreOptionalContent);
    return !ignoreOptionalContent && BaseClass::isContentSuppressedByOC(ocgOrOcmd);
}
/// Returns true for shapes, text, images and shading; returns false for
/// tiling, because tiling can have text. Unknown kinds assert in debug
/// builds and are reported as not suppressed.
bool PDFStructureTreeTextContentProcessor::isContentKindSuppressed(ContentKind kind) const
{
    switch (kind)
    {
        case ContentKind::Tiling:
            return false; // Tiling can have text

        case ContentKind::Shapes:
        case ContentKind::Text:
        case ContentKind::Images:
        case ContentKind::Shading:
            return true;

        default:
            break;
    }

    // Unexpected content kind
    Q_ASSERT(false);
    return false;
}
/// Appends the output character to the accumulated text, unless the content
/// is currently suppressed, or the character is null.
void PDFStructureTreeTextContentProcessor::performOutputCharacter(const PDFTextCharacterInfo& info)
{
    if (isContentSuppressed() || info.character.isNull())
    {
        return;
    }

    m_currentText.push_back(info.character);
}
/// Constructs the extractor. The constructor only stores its arguments,
/// no extraction is performed here.
/// \param document Document
/// \param tree Structure tree
PDFStructureTreeTextExtractor::PDFStructureTreeTextExtractor(const PDFDocument* document, const PDFStructureTree* tree) :
m_document(document),
m_tree(tree)
{
}
void PDFStructureTreeTextExtractor::perform(const std::vector<PDFInteger>& pageIndices)
{
std::map<PDFObjectReference, const PDFStructureItem*> mapping;
PDFStructureTreeReferenceCollector referenceCollector(&mapping);
m_tree->accept(&referenceCollector);
PDFFontCache fontCache(DEFAULT_FONT_CACHE_LIMIT, DEFAULT_REALIZED_FONT_CACHE_LIMIT);
// Jakub Melka: maps text to structure tree items. Key is pair of (page index, mcid)
std::map<std::pair<PDFInteger, PDFInteger>, QStringList> extractedText;
QMutex mutex;
PDFCMSGeneric cms;
PDFMeshQualitySettings mqs;
PDFOptionalContentActivity oca(m_document, OCUsage::Export, nullptr);
pdf::PDFModifiedDocument md(const_cast<PDFDocument*>(m_document), &oca);
fontCache.setDocument(md);
fontCache.setCacheShrinkEnabled(nullptr, false);
auto generateTextLayout = [this, &mutex, &extractedText, &fontCache, &cms, &mqs, &oca](PDFInteger pageIndex)
{
const PDFCatalog* catalog = m_document->getCatalog();
if (!catalog->getPage(pageIndex))
{
// Invalid page index
return;
}
const PDFPage* page = catalog->getPage(pageIndex);
Q_ASSERT(page);
PDFStructureTreeTextContentProcessor processor(PDFRenderer::IgnoreOptionalContent, page, m_document, &fontCache, &cms, &oca, QMatrix(), mqs);
QList<PDFRenderError> errors = processor.processContents();
QMutexLocker lock(&mutex);
for (auto& item : processor.takeTexts())
{
extractedText[std::make_pair(pageIndex, item.first)].append(qMove(item.second));
}
m_unmatchedText << qMove(processor.takeUnmatchedTexts());
m_errors.append(qMove(errors));
};
PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, pageIndices.begin(), pageIndices.end(), generateTextLayout);
fontCache.setCacheShrinkEnabled(nullptr, true);
}
} // namespace pdf

View File

@ -382,6 +382,13 @@ public:
/// \param id Id
std::vector<PDFObjectReference> getParents(PDFInteger id) const;
/// Returns the parent (as an object reference) of the structural entry
/// identified by the given id and index. The id is typically the structure
/// tree parent key of a page; the index selects an entry in the marked
/// content references array stored for that id.
/// \param id Structural tree parent id
/// \param index Index into the subarray
PDFObjectReference getParent(PDFInteger id, PDFInteger index) const;
/// Returns type from role. Role can be an entry in RoleMap dictionary,
/// or one of the standard roles.
/// \param role Role
@ -410,8 +417,6 @@ public:
/// \param object Structure tree root object
static PDFStructureTree parse(const PDFObjectStorage* storage, PDFObject object);
private:
struct ParentTreeEntry
{
PDFInteger id = 0;
@ -422,6 +427,13 @@ private:
return id < other.id;
}
};
/// Returns the parent tree entry at the given index. If the index
/// is invalid, an empty parent tree entry is returned.
/// \param index Index
ParentTreeEntry getParentTreeEntry(PDFInteger index) const;
private:
using ParentTreeEntries = std::vector<ParentTreeEntry>;
std::map<QByteArray, PDFObjectReference> m_idTreeMap;
@ -598,27 +610,6 @@ private:
PDFObjectReference m_objectReference;
};
/// Text extractor for structure tree. Can extract text to fill structure tree contents.
class PDFFORQTLIBSHARED_EXPORT PDFStructureTreeTextExtractor
{
public:
/// Constructs the extractor. Both pointers are stored as passed in.
/// \param document Document whose pages are processed
/// \param tree Structure tree of the document
explicit PDFStructureTreeTextExtractor(const PDFDocument* document, const PDFStructureTree* tree);
/// Performs text extracting algorithm. Only \p pageIndices
/// pages are processed for text extraction.
/// \param pageIndices Page indices
void perform(const std::vector<PDFInteger>& pageIndices);
/// Returns a list of errors/warnings
const QList<PDFRenderError>& getErrors() const { return m_errors; }
private:
/// Errors/warnings returned by getErrors()
QList<PDFRenderError> m_errors;
/// Document passed to the constructor
const PDFDocument* m_document;
/// Structure tree passed to the constructor
const PDFStructureTree* m_tree;
/// Unmatched texts; there is no accessor for them in this class
QStringList m_unmatchedText;
};
} // namespace pdf
#endif // PDFSTRUCTURETREE_H

View File

@ -649,7 +649,14 @@ void PDFConsole::writeText(QString text, QString codecName)
{
// Write console failed. This can happen only, if outputHandle is not handle
// to console screen buffer, but, for example a file or a pipe.
if (QTextCodec* codec = QTextCodec::codecForName(codecName.toLatin1()))
QTextCodec* codec = QTextCodec::codecForName(codecName.toLatin1());
if (!codec)
{
codec = QTextCodec::codecForName("UTF-8");
writeError(QString("No codec found for '%1'. Defaulting to text codec '%2'.").arg(codecName, QString::fromLatin1(codec->name())), codecName);
}
if (codec)
{
QByteArray encodedData = codec->fromUnicode(text);
WriteFile(outputHandle, encodedData.constData(), encodedData.size(), nullptr, nullptr);
@ -675,7 +682,13 @@ void PDFConsole::writeError(QString text, QString codecName)
{
// Write console failed. This can happen only, if outputHandle is not handle
// to console screen buffer, but, for example a file or a pipe.
if (QTextCodec* codec = QTextCodec::codecForName(codecName.toLatin1()))
QTextCodec* codec = QTextCodec::codecForName(codecName.toLatin1());
if (!codec)
{
codec = QTextCodec::codecForName("UTF-8");
}
if (codec)
{
QByteArray encodedData = codec->fromUnicode(text);
WriteFile(outputHandle, encodedData.constData(), encodedData.size(), nullptr, nullptr);

View File

@ -43,7 +43,7 @@ struct PDFToolOptions
{
// For option 'ConsoleFormat'
PDFOutputFormatter::Style outputStyle = PDFOutputFormatter::Style::Text;
QString outputCodec;
QString outputCodec = "UTF-8";
// For option 'DateFormat'
Qt::DateFormat outputDateFormat = Qt::DefaultLocaleShortDate;

View File

@ -65,17 +65,48 @@ int PDFToolFetchTextApplication::execute(const PDFToolOptions& options)
pdf::PDFDocumentTextFlowFactory factory;
pdf::PDFDocumentTextFlow documentTextFlow = factory.create(&document, pages, options.textAnalysisAlgorithm);
PDFOutputFormatter formatter(options.outputStyle, options.outputCodec);
formatter.beginDocument("text-extraction", QString());
formatter.endl();
for (const pdf::PDFDocumentTextFlow::Item& item : documentTextFlow.getItems())
{
if (item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureItemStart))
{
formatter.beginHeader("item", item.text);
}
if (!item.text.isEmpty())
{
formatter.writeText("text", item.text);
}
if (item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureItemEnd))
{
formatter.endHeader();
}
if (item.flags.testFlag(pdf::PDFDocumentTextFlow::PageEnd))
{
formatter.endl();
}
}
formatter.endDocument();
for (const pdf::PDFRenderError& error : factory.getErrors())
{
PDFConsole::writeError(error.message, options.outputCodec);
}
PDFConsole::writeText(formatter.getString(), options.outputCodec);
return ExitSuccess;
}
/// Returns the set of option groups accepted by the text fetching tool:
/// console formatting, document opening, page selection and text analysis.
/// A stale first return statement without PageSelector made the complete
/// flag set unreachable; only the complete set is returned now.
/// NOTE(review): PageSelector presumably feeds the page list that execute()
/// passes to the text flow factory - confirm against the option parser.
PDFToolAbstractApplication::Options PDFToolFetchTextApplication::getOptionsFlags() const
{
    return ConsoleFormat | OpenDocument | PageSelector | TextAnalysis;
}
} // namespace pdftool