From 0ccdb1e46f592a457d225c9ae4993bbfd7dea0c4 Mon Sep 17 00:00:00 2001 From: Jakub Melka Date: Sat, 17 Oct 2020 16:56:39 +0200 Subject: [PATCH] Document content flow basics --- PdfForQtLib/sources/pdfannotation.cpp | 3 +- PdfForQtLib/sources/pdfcompiler.cpp | 2 +- PdfForQtLib/sources/pdfdocumenttextflow.cpp | 509 +++++++++++++++++- .../sources/pdfpagecontentprocessor.cpp | 62 ++- PdfForQtLib/sources/pdfpagecontentprocessor.h | 17 +- PdfForQtLib/sources/pdfstructuretree.cpp | 272 ++-------- PdfForQtLib/sources/pdfstructuretree.h | 37 +- PdfTool/pdfoutputformatter.cpp | 17 +- PdfTool/pdftoolabstractapplication.h | 2 +- PdfTool/pdftoolfetchtext.cpp | 33 +- 10 files changed, 664 insertions(+), 290 deletions(-) diff --git a/PdfForQtLib/sources/pdfannotation.cpp b/PdfForQtLib/sources/pdfannotation.cpp index 1e4f3fe..c5bafce 100644 --- a/PdfForQtLib/sources/pdfannotation.cpp +++ b/PdfForQtLib/sources/pdfannotation.cpp @@ -1362,6 +1362,7 @@ void PDFAnnotationManager::drawAnnotationUsingAppearanceStream(const PageAnnotat QByteArray content = m_document->getDecodedStream(formStream); PDFObject resources = m_document->getObject(formDictionary->get("Resources")); PDFObject transparencyGroup = m_document->getObject(formDictionary->get("Group")); + const PDFInteger formStructuralParentKey = loader.readIntegerFromDictionary(formDictionary, "StructParent", page->getStructureParentKey()); if (formBoundingBox.isEmpty() || annotationRectangle.isEmpty()) { @@ -1409,7 +1410,7 @@ void PDFAnnotationManager::drawAnnotationUsingAppearanceStream(const PageAnnotat if (isContentVisible) { - pdfPainter.processForm(AA, formBoundingBox, resources, transparencyGroup, content); + pdfPainter.processForm(AA, formBoundingBox, resources, transparencyGroup, content, formStructuralParentKey); } } diff --git a/PdfForQtLib/sources/pdfcompiler.cpp b/PdfForQtLib/sources/pdfcompiler.cpp index 98ca615..a31b1a9 100644 --- a/PdfForQtLib/sources/pdfcompiler.cpp +++ b/PdfForQtLib/sources/pdfcompiler.cpp @@ -227,7 +227,7 @@ bool PDFTextLayoutGenerator::isContentKindSuppressed(ContentKind kind) const void PDFTextLayoutGenerator::performOutputCharacter(const PDFTextCharacterInfo& info) { - if (!isContentSuppressed()) + if (!isContentSuppressed() && !info.character.isSpace()) { m_textLayout.addCharacter(info); } diff --git a/PdfForQtLib/sources/pdfdocumenttextflow.cpp b/PdfForQtLib/sources/pdfdocumenttextflow.cpp index dd7df46..3bee63e 100644 --- a/PdfForQtLib/sources/pdfdocumenttextflow.cpp +++ b/PdfForQtLib/sources/pdfdocumenttextflow.cpp @@ -26,6 +26,483 @@ namespace pdf { + +class PDFStructureTreeReferenceCollector : public PDFStructureTreeAbstractVisitor +{ +public: + explicit inline PDFStructureTreeReferenceCollector(std::map* mapping) : + m_mapping(mapping) + { + + } + + virtual void visitStructureTree(const PDFStructureTree* structureTree) override; + virtual void visitStructureElement(const PDFStructureElement* structureElement) override; + virtual void visitStructureMarkedContentReference(const PDFStructureMarkedContentReference* structureMarkedContentReference) override; + virtual void visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference) override; + +private: + void addReference(const PDFStructureItem* structureObjectReference); + + std::map* m_mapping; +}; + +void PDFStructureTreeReferenceCollector::visitStructureTree(const PDFStructureTree* structureTree) +{ + addReference(structureTree); + acceptChildren(structureTree); +} + +void PDFStructureTreeReferenceCollector::visitStructureElement(const PDFStructureElement* structureElement) +{ + addReference(structureElement); + acceptChildren(structureElement); +} + +void PDFStructureTreeReferenceCollector::visitStructureMarkedContentReference(const PDFStructureMarkedContentReference* structureMarkedContentReference) +{ + addReference(structureMarkedContentReference); + acceptChildren(structureMarkedContentReference); +} + +void PDFStructureTreeReferenceCollector::visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference) +{ + addReference(structureObjectReference); + acceptChildren(structureObjectReference); +} + +void PDFStructureTreeReferenceCollector::addReference(const PDFStructureItem* structureItem) +{ + if (structureItem->getSelfReference().isValid()) + { + (*m_mapping)[structureItem->getSelfReference()] = structureItem; + } +} + +struct PDFStructureTreeTextItem +{ + enum class Type + { + StartTag, + EndTag, + Text + }; + + PDFStructureTreeTextItem() = default; + PDFStructureTreeTextItem(Type type, const PDFStructureItem* item, QString text) : + type(type), item(item), text(qMove(text)) + { + + } + + static PDFStructureTreeTextItem createText(QString text) { return PDFStructureTreeTextItem(Type::Text, nullptr, qMove(text)); } + static PDFStructureTreeTextItem createStartTag(const PDFStructureItem* item) { return PDFStructureTreeTextItem(Type::StartTag, item, QString()); } + static PDFStructureTreeTextItem createEndTag(const PDFStructureItem* item) { return PDFStructureTreeTextItem(Type::EndTag, item, QString()); } + + Type type = Type::Text; + const PDFStructureItem* item = nullptr; + QString text; +}; + +using PDFStructureTreeTextSequence = std::vector; + +/// Text extractor for structure tree. Extracts sequences of structure items, +/// page sequences are stored in \p textSequences. They can be accessed using +/// getters. +class PDFStructureTreeTextExtractor +{ +public: + enum Option + { + None = 0x0000, + SkipArtifact = 0x0001, ///< Skip content marked as 'Artifact' + AdjustReversedText = 0x0002, ///< Adjust reversed text + CreateTreeMapping = 0x0004, ///< Create text mapping to structure tree item + }; + Q_DECLARE_FLAGS(Options, Option) + + explicit PDFStructureTreeTextExtractor(const PDFDocument* document, const PDFStructureTree* tree, Options options); + + /// Performs text extracting algorithm. Only \p pageIndices + /// pages are processed for text extraction. + /// \param pageIndices Page indices + void perform(const std::vector& pageIndices); + + /// Returns a list of errors/warnings + const QList& getErrors() const { return m_errors; } + + /// Returns a list of unmatched text + const QStringList& getUnmatchedText() const { return m_unmatchedText; } + + /// Returns text sequence for given page. If page number is invalid, + /// then empty text sequence is returned. + /// \param pageNumber Page number + const PDFStructureTreeTextSequence& getTextSequence(PDFInteger pageNumber) const; + + /// Returns text for given structure tree item. If structure tree item + /// is not found, then empty list is returned. This functionality + /// requires, that \p CreateTreeMapping flag is being set. + /// \param item Item + const QStringList& getText(const PDFStructureItem* item) const; + +private: + QList m_errors; + const PDFDocument* m_document; + const PDFStructureTree* m_tree; + QStringList m_unmatchedText; + std::map m_textSequences; + std::map m_textForItems; + Options m_options; +}; + +Q_DECLARE_OPERATORS_FOR_FLAGS(PDFStructureTreeTextExtractor::Options) + +class PDFStructureTreeTextContentProcessor : public PDFPageContentProcessor +{ + using BaseClass = PDFPageContentProcessor; + +public: + explicit PDFStructureTreeTextContentProcessor(PDFRenderer::Features features, + const PDFPage* page, + const PDFDocument* document, + const PDFFontCache* fontCache, + const PDFCMS* cms, + const PDFOptionalContentActivity* optionalContentActivity, + QMatrix pagePointToDevicePointMatrix, + const PDFMeshQualitySettings& meshQualitySettings, + const PDFStructureTree* tree, + const std::map* mapping) : + BaseClass(page, document, fontCache, cms, optionalContentActivity, pagePointToDevicePointMatrix, meshQualitySettings), + m_features(features), + m_tree(tree), + m_mapping(mapping) + { + + } + + PDFStructureTreeTextSequence& takeSequence() { return m_textSequence; } + QStringList& takeUnmatchedTexts() { return m_unmatchedText; } + +protected: + virtual bool isContentSuppressedByOC(PDFObjectReference ocgOrOcmd) override; + virtual bool isContentKindSuppressed(ContentKind kind) const override; + virtual void performOutputCharacter(const PDFTextCharacterInfo& info) override; + virtual void performMarkedContentBegin(const QByteArray& tag, const PDFObject& properties) override; + virtual void performMarkedContentEnd() override; + +private: + const PDFStructureItem* getStructureTreeItemFromMCID(PDFInteger mcid) const; + void finishText(); + + struct MarkedContentInfo + { + QByteArray tag; + PDFInteger mcid = -1; + const PDFStructureItem* structureTreeItem = nullptr; + }; + + PDFRenderer::Features m_features; + const PDFStructureTree* m_tree; + const std::map* m_mapping; + std::vector m_markedContentInfoStack; + QString m_currentText; + PDFStructureTreeTextSequence m_textSequence; + QStringList m_unmatchedText; +}; + +void PDFStructureTreeTextContentProcessor::finishText() +{ + m_currentText = m_currentText.trimmed(); + if (!m_currentText.isEmpty()) + { + m_textSequence.emplace_back(PDFStructureTreeTextItem::createText(qMove(m_currentText))); + } + m_currentText = QString(); +} + +void PDFStructureTreeTextContentProcessor::performMarkedContentBegin(const QByteArray& tag, const PDFObject& properties) +{ + MarkedContentInfo info; + info.tag = tag; + + if (properties.isDictionary()) + { + const PDFDictionary* dictionary = properties.getDictionary(); + PDFObject mcid = dictionary->get("MCID"); + if (mcid.isInt()) + { + // We must finish text, because we can have a sequence of text, + // then subitem, then text, and followed by another subitem. They + // can be interleaved. + finishText(); + + info.mcid = mcid.getInteger(); + info.structureTreeItem = getStructureTreeItemFromMCID(info.mcid); + + if (!info.structureTreeItem) + { + reportRenderError(RenderErrorType::Error, PDFTranslationContext::tr("Structure tree item for MCID %1 not found.").arg(info.mcid)); + } + + if (info.structureTreeItem) + { + m_textSequence.emplace_back(PDFStructureTreeTextItem::createStartTag(info.structureTreeItem)); + } + } + } + + m_markedContentInfoStack.emplace_back(qMove(info)); +} + +void PDFStructureTreeTextContentProcessor::performMarkedContentEnd() +{ + MarkedContentInfo info = qMove(m_markedContentInfoStack.back()); + m_markedContentInfoStack.pop_back(); + + if (info.mcid != -1) + { + finishText(); + if (info.structureTreeItem) + { + m_textSequence.emplace_back(PDFStructureTreeTextItem::createEndTag(info.structureTreeItem)); + } + } + + // Check for text, which doesn't belong to any structure tree item + if (m_markedContentInfoStack.empty()) + { + m_currentText = m_currentText.trimmed(); + if (!m_currentText.isEmpty()) + { + m_unmatchedText << qMove(m_currentText); + } + } +} + +const PDFStructureItem* PDFStructureTreeTextContentProcessor::getStructureTreeItemFromMCID(PDFInteger mcid) const +{ + auto it = m_mapping->find(m_tree->getParent(getStructuralParentKey(), mcid)); + if (it != m_mapping->cend()) + { + return it->second; + } + return nullptr; +} + +bool PDFStructureTreeTextContentProcessor::isContentSuppressedByOC(PDFObjectReference ocgOrOcmd) +{ + if (m_features.testFlag(PDFRenderer::IgnoreOptionalContent)) + { + return false; + } + + return PDFPageContentProcessor::isContentSuppressedByOC(ocgOrOcmd); +} + +bool PDFStructureTreeTextContentProcessor::isContentKindSuppressed(ContentKind kind) const +{ + switch (kind) + { + case ContentKind::Shapes: + case ContentKind::Text: + case ContentKind::Images: + case ContentKind::Shading: + return true; + + case ContentKind::Tiling: + return false; // Tiling can have text + + default: + { + Q_ASSERT(false); + break; + } + } + + return false; +} + +void PDFStructureTreeTextContentProcessor::performOutputCharacter(const PDFTextCharacterInfo& info) +{ + if (!isContentSuppressed()) + { + if (!info.character.isNull()) + { + m_currentText.push_back(info.character); + } + } +} + +PDFStructureTreeTextExtractor::PDFStructureTreeTextExtractor(const PDFDocument* document, const PDFStructureTree* tree, Options options) : + m_document(document), + m_tree(tree), + m_options(options) +{ + +} + +void PDFStructureTreeTextExtractor::perform(const std::vector& pageIndices) +{ + std::map mapping; + PDFStructureTreeReferenceCollector referenceCollector(&mapping); + m_tree->accept(&referenceCollector); + + PDFFontCache fontCache(DEFAULT_FONT_CACHE_LIMIT, DEFAULT_REALIZED_FONT_CACHE_LIMIT); + + QMutex mutex; + PDFCMSGeneric cms; + PDFMeshQualitySettings mqs; + PDFOptionalContentActivity oca(m_document, OCUsage::Export, nullptr); + pdf::PDFModifiedDocument md(const_cast(m_document), &oca); + fontCache.setDocument(md); + fontCache.setCacheShrinkEnabled(nullptr, false); + + auto generateTextLayout = [&, this](PDFInteger pageIndex) + { + const PDFCatalog* catalog = m_document->getCatalog(); + if (!catalog->getPage(pageIndex)) + { + // Invalid page index + return; + } + + const PDFPage* page = catalog->getPage(pageIndex); + Q_ASSERT(page); + + PDFStructureTreeTextContentProcessor processor(PDFRenderer::IgnoreOptionalContent, page, m_document, &fontCache, &cms, &oca, QMatrix(), mqs, m_tree, &mapping); + QList errors = processor.processContents(); + + QMutexLocker lock(&mutex); + m_textSequences[pageIndex] = qMove(processor.takeSequence()); + m_unmatchedText << qMove(processor.takeUnmatchedTexts()); + m_errors.append(qMove(errors)); + }; + + PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, pageIndices.begin(), pageIndices.end(), generateTextLayout); + + fontCache.setCacheShrinkEnabled(nullptr, true); + + if (m_options.testFlag(CreateTreeMapping)) + { + for (const auto& sequence : m_textSequences) + { + std::stack stack; + for (const PDFStructureTreeTextItem& sequenceItem : sequence.second) + { + switch (sequenceItem.type) + { + case PDFStructureTreeTextItem::Type::StartTag: + stack.push(sequenceItem.item); + break; + case PDFStructureTreeTextItem::Type::EndTag: + stack.pop(); + break; + case PDFStructureTreeTextItem::Type::Text: + if (!stack.empty()) + { + m_textForItems[stack.top()] << sequenceItem.text; + } + break; + } + } + } + } +} + +const PDFStructureTreeTextSequence& PDFStructureTreeTextExtractor::getTextSequence(PDFInteger pageIndex) const +{ + auto it = m_textSequences.find(pageIndex); + if (it != m_textSequences.cend()) + { + return it->second; + } + + static PDFStructureTreeTextSequence dummy; + return dummy; +} + +const QStringList& PDFStructureTreeTextExtractor::getText(const PDFStructureItem* item) const +{ + auto it = m_textForItems.find(item); + if (it != m_textForItems.cend()) + { + return it->second; + } + + static const QStringList dummy; + return dummy; +} + + +class PDFStructureTreeTextFlowCollector : public PDFStructureTreeAbstractVisitor +{ +public: + explicit PDFStructureTreeTextFlowCollector(PDFDocumentTextFlow::Items* items, const PDFStructureTreeTextExtractor* extractor) : + m_items(items), + m_extractor(extractor) + { + + } + + virtual void visitStructureTree(const PDFStructureTree* structureTree) override; + virtual void visitStructureElement(const PDFStructureElement* structureElement) override; + virtual void visitStructureMarkedContentReference(const PDFStructureMarkedContentReference* structureMarkedContentReference) override; + virtual void visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference) override; + +private: + PDFDocumentTextFlow::Items* m_items; + const PDFStructureTreeTextExtractor* m_extractor; + std::vector m_hasContentStack; +}; + +void PDFStructureTreeTextFlowCollector::visitStructureTree(const PDFStructureTree* structureTree) +{ + m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureItemStart, -1, QString()}); + acceptChildren(structureTree); + m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureItemEnd, -1, QString()}); +} + +void PDFStructureTreeTextFlowCollector::visitStructureElement(const PDFStructureElement* structureElement) +{ + size_t index = m_items->size(); + m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureItemStart, -1, QString()}); + + // Mark stack so we can delete unused items + m_hasContentStack.push_back(false); + + for (const QString& string : m_extractor->getText(structureElement)) + { + for (size_t i = 0; i < m_hasContentStack.size(); ++i) + { + m_hasContentStack[i] = true; + } + m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::Text, -1, string}); + } + + acceptChildren(structureElement); + + const bool hasContent = m_hasContentStack.back(); + m_hasContentStack.pop_back(); + + m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureItemEnd, -1, QString()}); + + if (!hasContent) + { + // Delete unused content + m_items->erase(std::next(m_items->begin(), index), m_items->end()); + } +} + +void PDFStructureTreeTextFlowCollector::visitStructureMarkedContentReference(const PDFStructureMarkedContentReference* structureMarkedContentReference) +{ + acceptChildren(structureMarkedContentReference); +} + +void PDFStructureTreeTextFlowCollector::visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference) +{ + acceptChildren(structureObjectReference); +} + PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* document, const std::vector& pageIndices, Algorithm algorithm) { PDFDocumentTextFlow result; @@ -121,9 +598,39 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume break; } - PDFStructureTreeTextExtractor extractor(document, &structureTree); + PDFStructureTreeTextExtractor extractor(document, &structureTree, PDFStructureTreeTextExtractor::SkipArtifact | PDFStructureTreeTextExtractor::AdjustReversedText | PDFStructureTreeTextExtractor::CreateTreeMapping); extractor.perform(pageIndices); + PDFDocumentTextFlow::Items flowItems; + PDFStructureTreeTextFlowCollector collector(&flowItems, &extractor); + structureTree.accept(&collector); + + result = PDFDocumentTextFlow(qMove(flowItems)); + m_errors.append(extractor.getErrors()); + break; + } + + case Algorithm::Content: + { + PDFStructureTreeTextExtractor extractor(document, &structureTree, PDFStructureTreeTextExtractor::None); + extractor.perform(pageIndices); + + PDFDocumentTextFlow::Items flowItems; + for (PDFInteger pageIndex : pageIndices) + { + flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::PageStart, pageIndex, PDFTranslationContext::tr("Page %1").arg(pageIndex + 1) }); + for (const PDFStructureTreeTextItem& sequenceItem : extractor.getTextSequence(pageIndex)) + { + if (sequenceItem.type == PDFStructureTreeTextItem::Type::Text) + { + flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::Text, pageIndex, sequenceItem.text }); + } + } + flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::PageEnd, pageIndex, QString() }); + } + + result = PDFDocumentTextFlow(qMove(flowItems)); + m_errors.append(extractor.getErrors()); break; } diff --git a/PdfForQtLib/sources/pdfpagecontentprocessor.cpp b/PdfForQtLib/sources/pdfpagecontentprocessor.cpp index 9e678a5..6b9554a 100644 --- a/PdfForQtLib/sources/pdfpagecontentprocessor.cpp +++ b/PdfForQtLib/sources/pdfpagecontentprocessor.cpp @@ -244,11 +244,14 @@ PDFPageContentProcessor::PDFPageContentProcessor(const PDFPage* page, m_drawingUncoloredTilingPatternState(0), m_patternBaseMatrix(pagePointToDevicePointMatrix), m_pagePointToDevicePointMatrix(pagePointToDevicePointMatrix), - m_meshQualitySettings(meshQualitySettings) + m_meshQualitySettings(meshQualitySettings), + m_structuralParentKey(0) { Q_ASSERT(page); Q_ASSERT(document); + m_structuralParentKey = page->getStructureParentKey(); + PDFExecutionPolicy::startProcessingContentStream(); QPainterPath pageRectPath; @@ -338,6 +341,7 @@ QList PDFPageContentProcessor::processContents() } } + finishMarkedContent(); return m_errorList; } @@ -659,9 +663,11 @@ void PDFPageContentProcessor::processForm(const QMatrix& matrix, const QRectF& boundingBox, const PDFObject& resources, const PDFObject& transparencyGroup, - const QByteArray& content) + const QByteArray& content, + PDFInteger formStructuralParent) { PDFPageContentProcessorStateGuard guard(this); + PDFTemporaryValueChange structuralParentChangeGuard(&m_structuralParentKey, formStructuralParent); std::unique_ptr guard2; if (transparencyGroup.isDictionary()) @@ -1738,6 +1744,19 @@ void PDFPageContentProcessor::setRenderingIntentByName(QByteArray renderingInten m_graphicState.setRenderingIntentName(renderingIntentName); } +void PDFPageContentProcessor::finishMarkedContent() +{ + if (!m_markedContentStack.empty()) + { + m_errorList.append(PDFRenderError(RenderErrorType::Error, PDFTranslationContext::tr("Marked content is not well formed (not enough EMC operators)."))); + } + + while (!m_markedContentStack.empty()) + { + operatorMarkedContentEnd(); + } +} + void PDFPageContentProcessor::reportRenderErrorOnce(RenderErrorType type, QString message) { if (!m_onceReportedErrors.count(message)) @@ -2936,7 +2955,10 @@ void PDFPageContentProcessor::operatorPaintXObject(PDFPageContentProcessor::PDFO // Transparency group PDFObject transparencyGroup = m_document->getObject(streamDictionary->get("Group")); - processForm(transformationMatrix, boundingBox, resources, transparencyGroup, content); + // Form structural parent key + const PDFInteger formStructuralParentKey = loader.readIntegerFromDictionary(streamDictionary, "StructParent", m_structuralParentKey); + + processForm(transformationMatrix, boundingBox, resources, transparencyGroup, content, formStructuralParentKey); } else { @@ -3075,32 +3097,34 @@ void PDFPageContentProcessor::drawText(const TextSequence& textSequence) if (item.glyph) { const QPainterPath& glyphPath = *item.glyph; + + QMatrix textRenderingMatrix = adjustMatrix * textMatrix; + QMatrix toDeviceSpaceTransform = textRenderingMatrix * m_graphicState.getCurrentTransformationMatrix(); + if (!glyphPath.isEmpty()) { - QMatrix textRenderingMatrix = adjustMatrix * textMatrix; - QMatrix toDeviceSpaceTransform = textRenderingMatrix * m_graphicState.getCurrentTransformationMatrix(); QPainterPath transformedGlyph = textRenderingMatrix.map(glyphPath); processPathPainting(transformedGlyph, stroke, fill, true, transformedGlyph.fillRule()); - if (!item.character.isNull() && !item.character.isSpace()) - { - // Output character - PDFTextCharacterInfo info; - info.character = item.character; - info.isVerticalWritingSystem = !isHorizontalWritingSystem; - info.advance = item.advance; - info.fontSize = fontSize; - info.outline = glyphPath; - info.matrix = toDeviceSpaceTransform; - performOutputCharacter(info); - } - if (clipped) { // Clipping is enabled, we must transform to the device coordinates m_textClippingPath = m_textClippingPath.united(toDeviceSpaceTransform.map(glyphPath)); } } + + if (!item.character.isNull()) + { + // Output character + PDFTextCharacterInfo info; + info.character = item.character; + info.isVerticalWritingSystem = !isHorizontalWritingSystem; + info.advance = item.advance; + info.fontSize = fontSize; + info.outline = glyphPath; + info.matrix = toDeviceSpaceTransform; + performOutputCharacter(info); + } } displacementX = advance.x(); @@ -3170,7 +3194,7 @@ void PDFPageContentProcessor::drawText(const TextSequence& textSequence) processContent(*item.characterContentStream); - if (!item.character.isNull() && !item.character.isSpace()) + if (!item.character.isNull()) { // Output character PDFTextCharacterInfo info; diff --git a/PdfForQtLib/sources/pdfpagecontentprocessor.h b/PdfForQtLib/sources/pdfpagecontentprocessor.h index 3d4e1a1..3cabb65 100644 --- a/PdfForQtLib/sources/pdfpagecontentprocessor.h +++ b/PdfForQtLib/sources/pdfpagecontentprocessor.h @@ -219,7 +219,13 @@ public: /// \param resources Resources, assigned to the form /// \param transparencyGroup Transparency group object /// \param content Content stream of the form - void processForm(const QMatrix& matrix, const QRectF& boundingBox, const PDFObject& resources, const PDFObject& transparencyGroup, const QByteArray& content); + /// \param formStructuralParent Structural parent key for form + void processForm(const QMatrix& matrix, + const QRectF& boundingBox, + const PDFObject& resources, + const PDFObject& transparencyGroup, + const QByteArray& content, + PDFInteger formStructuralParent); /// Initialize stream processor for processing content streams. For example, /// graphic state is initialized to default, and default color spaces are initialized. @@ -572,6 +578,9 @@ protected: /// shading, images, ...) virtual bool isContentKindSuppressed(ContentKind kind) const; + /// Returns current structural parent key + PDFInteger getStructuralParentKey() const { return m_structuralParentKey; } + /// Returns current graphic state const PDFPageContentProcessorState* getGraphicState() const { return &m_graphicState; } @@ -918,6 +927,9 @@ private: /// Set rendering intent by name void setRenderingIntentByName(QByteArray renderingIntentName); + /// Finishes marked content (if end of marked content is missing) + void finishMarkedContent(); + const PDFPage* m_page; const PDFDocument* m_document; const PDFFontCache* m_fontCache; @@ -990,6 +1002,9 @@ private: /// Set with rendering errors, which were reported (and should be reported once) std::set m_onceReportedErrors; + + /// Active structural parent key + PDFInteger m_structuralParentKey; }; } // namespace pdf diff --git a/PdfForQtLib/sources/pdfstructuretree.cpp b/PdfForQtLib/sources/pdfstructuretree.cpp index a2f86f1..ef03e7a 100644 --- a/PdfForQtLib/sources/pdfstructuretree.cpp +++ b/PdfForQtLib/sources/pdfstructuretree.cpp @@ -505,6 +505,19 @@ std::vector PDFStructureTree::getParents(PDFInteger id) cons return result; } +PDFObjectReference PDFStructureTree::getParent(PDFInteger id, PDFInteger index) const +{ + Q_ASSERT(std::is_sorted(m_parentTreeEntries.cbegin(), m_parentTreeEntries.cend())); + ParentTreeEntry entry{ id, PDFObjectReference() }; + auto [it, itEnd] = std::equal_range(m_parentTreeEntries.cbegin(), m_parentTreeEntries.cend(), entry); + const PDFInteger count = std::distance(it, itEnd); + if (index >= 0 && index < count) + { + return (*std::next(it, index)).reference; + } + return PDFObjectReference(); +} + PDFStructureItem::Type PDFStructureTree::getTypeFromRole(const QByteArray& role) const { auto it = m_roleMap.find(role); @@ -556,16 +569,14 @@ PDFStructureTree PDFStructureTree::parse(const PDFObjectStorage* storage, PDFObj return id < other.id; } - static ParentTreeParseEntry parse(PDFInteger id, const PDFObjectStorage*, const PDFObject& object) + static ParentTreeParseEntry parse(PDFInteger id, const PDFObjectStorage* storage, const PDFObject& object) { - if (object.isReference()) - { - return ParentTreeParseEntry{ id, { object.getReference() } }; - } - else if (object.isArray()) + const PDFObject& dereferencedObject = storage->getObject(object); + + if (dereferencedObject.isArray()) { std::vector references; - for (const PDFObject& object : *object.getArray()) + for (const PDFObject& object : *dereferencedObject.getArray()) { if (object.isReference()) { @@ -575,6 +586,10 @@ PDFStructureTree PDFStructureTree::parse(const PDFObjectStorage* storage, PDFObj return ParentTreeParseEntry{ id, qMove(references) }; } + else if (object.isReference()) + { + return ParentTreeParseEntry{ id, { object.getReference() } }; + } return ParentTreeParseEntry{ id, { } }; } @@ -629,6 +644,16 @@ PDFStructureTree PDFStructureTree::parse(const PDFObjectStorage* storage, PDFObj return tree; } +PDFStructureTree::ParentTreeEntry PDFStructureTree::getParentTreeEntry(PDFInteger index) const +{ + if (index >= 0 && index < PDFInteger(m_parentTreeEntries.size())) + { + return m_parentTreeEntries[index]; + } + + return ParentTreeEntry(); +} + PDFStructureItemPointer PDFStructureItem::parse(const PDFObjectStorage* storage, PDFObject object, PDFMarkedObjectsContext* context, PDFStructureItem* parent) { if (const PDFDictionary* dictionary = storage->getDictionaryFromObject(object)) @@ -939,237 +964,4 @@ void PDFStructureTreeAbstractVisitor::acceptChildren(const PDFStructureItem* ite } } -class PDFStructureTreeReferenceCollector : public PDFStructureTreeAbstractVisitor -{ -public: - explicit inline PDFStructureTreeReferenceCollector(std::map* mapping) : - m_mapping(mapping) - { - - } - - virtual void visitStructureTree(const PDFStructureTree* structureTree) override; - virtual void visitStructureElement(const PDFStructureElement* structureElement) override; - virtual void visitStructureMarkedContentReference(const PDFStructureMarkedContentReference* structureMarkedContentReference) override; - virtual void visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference) override; - -private: - void addReference(const PDFStructureItem* structureObjectReference); - - std::map* m_mapping; -}; - -void PDFStructureTreeReferenceCollector::visitStructureTree(const PDFStructureTree* structureTree) -{ - addReference(structureTree); - acceptChildren(structureTree); -} - -void PDFStructureTreeReferenceCollector::visitStructureElement(const PDFStructureElement* structureElement) -{ - addReference(structureElement); - acceptChildren(structureElement); -} - -void PDFStructureTreeReferenceCollector::visitStructureMarkedContentReference(const PDFStructureMarkedContentReference* structureMarkedContentReference) -{ - addReference(structureMarkedContentReference); - acceptChildren(structureMarkedContentReference); -} - -void PDFStructureTreeReferenceCollector::visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference) -{ - addReference(structureObjectReference); - acceptChildren(structureObjectReference); -} - -void PDFStructureTreeReferenceCollector::addReference(const PDFStructureItem* structureItem) -{ - if (structureItem->getSelfReference().isValid()) - { - (*m_mapping)[structureItem->getSelfReference()] = structureItem; - } -} - -class PDFStructureTreeTextContentProcessor : public PDFPageContentProcessor -{ - using BaseClass = PDFPageContentProcessor; - -public: - explicit PDFStructureTreeTextContentProcessor(PDFRenderer::Features features, - const PDFPage* page, - const PDFDocument* document, - const PDFFontCache* fontCache, - const PDFCMS* cms, - const PDFOptionalContentActivity* optionalContentActivity, - QMatrix pagePointToDevicePointMatrix, - const PDFMeshQualitySettings& meshQualitySettings) : - BaseClass(page, document, fontCache, cms, optionalContentActivity, pagePointToDevicePointMatrix, meshQualitySettings), - m_features(features) - { - - } - - std::map& takeTexts() { return m_text; } - QStringList& takeUnmatchedTexts() { return m_unmatchedText; } - -protected: - virtual bool isContentSuppressedByOC(PDFObjectReference ocgOrOcmd) override; - virtual bool isContentKindSuppressed(ContentKind kind) const override; - virtual void performOutputCharacter(const PDFTextCharacterInfo& info) override; - virtual void performMarkedContentBegin(const QByteArray& tag, const PDFObject& properties) override; - virtual void performMarkedContentEnd() override; - -private: - struct MarkedContentInfo - { - QByteArray tag; - PDFInteger mcid = -1; - }; - - PDFRenderer::Features m_features; - std::vector m_markedContentInfoStack; - QString m_currentText; - std::map m_text; - QStringList m_unmatchedText; -}; - -void PDFStructureTreeTextContentProcessor::performMarkedContentBegin(const QByteArray& tag, const PDFObject& properties) -{ - MarkedContentInfo info; - info.tag = tag; - - if (properties.isDictionary()) - { - const PDFDictionary* dictionary = properties.getDictionary(); - PDFObject mcid = dictionary->get("MCID"); - if (mcid.isInt()) - { - info.mcid = mcid.getInteger(); - } - } - - m_markedContentInfoStack.emplace_back(qMove(info)); -} - -void PDFStructureTreeTextContentProcessor::performMarkedContentEnd() -{ - MarkedContentInfo info = qMove(m_markedContentInfoStack.back()); - m_markedContentInfoStack.pop_back(); - - if (info.mcid != -1) - { - if (!m_currentText.isEmpty()) - { - m_text[info.mcid].push_back(qMove(m_currentText)); - } - m_currentText = QString(); - } - - if (m_markedContentInfoStack.empty() && !m_currentText.isEmpty()) - { - m_unmatchedText << qMove(m_currentText); - m_currentText = QString(); - } -} - -bool PDFStructureTreeTextContentProcessor::isContentSuppressedByOC(PDFObjectReference ocgOrOcmd) -{ - if (m_features.testFlag(PDFRenderer::IgnoreOptionalContent)) - { - return false; - } - - return PDFPageContentProcessor::isContentSuppressedByOC(ocgOrOcmd); -} - -bool PDFStructureTreeTextContentProcessor::isContentKindSuppressed(ContentKind kind) const -{ - switch (kind) - { - case ContentKind::Shapes: - case ContentKind::Text: - case ContentKind::Images: - case ContentKind::Shading: - return true; - - case ContentKind::Tiling: - return false; // Tiling can have text - - default: - { - Q_ASSERT(false); - break; - } - } - - return false; -} - -void PDFStructureTreeTextContentProcessor::performOutputCharacter(const PDFTextCharacterInfo& info) -{ - if (!isContentSuppressed()) - { - if (!info.character.isNull()) - { - m_currentText.push_back(info.character); - } - } -} - -PDFStructureTreeTextExtractor::PDFStructureTreeTextExtractor(const PDFDocument* document, const PDFStructureTree* tree) : - m_document(document), - m_tree(tree) -{ - -} - -void PDFStructureTreeTextExtractor::perform(const std::vector& pageIndices) -{ - std::map mapping; - PDFStructureTreeReferenceCollector referenceCollector(&mapping); - m_tree->accept(&referenceCollector); - - PDFFontCache fontCache(DEFAULT_FONT_CACHE_LIMIT, DEFAULT_REALIZED_FONT_CACHE_LIMIT); - - // Jakub Melka: maps text to structure tree items. Key is pair of (page index, mcid) - std::map, QStringList> extractedText; - - QMutex mutex; - PDFCMSGeneric cms; - PDFMeshQualitySettings mqs; - PDFOptionalContentActivity oca(m_document, OCUsage::Export, nullptr); - pdf::PDFModifiedDocument md(const_cast(m_document), &oca); - fontCache.setDocument(md); - fontCache.setCacheShrinkEnabled(nullptr, false); - - auto generateTextLayout = [this, &mutex, &extractedText, &fontCache, &cms, &mqs, &oca](PDFInteger pageIndex) - { - const PDFCatalog* catalog = m_document->getCatalog(); - if (!catalog->getPage(pageIndex)) - { - // Invalid page index - return; - } - - const PDFPage* page = catalog->getPage(pageIndex); - Q_ASSERT(page); - - PDFStructureTreeTextContentProcessor processor(PDFRenderer::IgnoreOptionalContent, page, m_document, &fontCache, &cms, &oca, QMatrix(), mqs); - QList errors = processor.processContents(); - - QMutexLocker lock(&mutex); - for (auto& item : processor.takeTexts()) - { - extractedText[std::make_pair(pageIndex, item.first)].append(qMove(item.second)); - } - m_unmatchedText << qMove(processor.takeUnmatchedTexts()); - m_errors.append(qMove(errors)); - }; - - PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, pageIndices.begin(), pageIndices.end(), generateTextLayout); - - fontCache.setCacheShrinkEnabled(nullptr, true); -} - } // namespace pdf diff --git a/PdfForQtLib/sources/pdfstructuretree.h b/PdfForQtLib/sources/pdfstructuretree.h index 3e2b7af..71369c3 100644 --- a/PdfForQtLib/sources/pdfstructuretree.h +++ b/PdfForQtLib/sources/pdfstructuretree.h @@ -382,6 +382,13 @@ public: /// \param id Id std::vector getParents(PDFInteger id) const; + /// Returns parent key for structural entry with given id, + /// and index. Id is, typically, structural tree parent key in page, + /// index is index into the marked content references array. + /// \param id Structural tree parent id + /// \param index Index into the subarray + PDFObjectReference getParent(PDFInteger id, PDFInteger index) const; + /// Returns type from role. Role can be an entry in RoleMap dictionary, /// or one of the standard roles. /// \param role Role @@ -410,8 +417,6 @@ public: /// \param object Structure tree root object static PDFStructureTree parse(const PDFObjectStorage* storage, PDFObject object); -private: - struct ParentTreeEntry { PDFInteger id = 0; @@ -422,6 +427,13 @@ private: return id < other.id; } }; + + /// Returns given page tree entry. If index is invalid, + /// empty parent tree entry is returned. + /// \param index Index + ParentTreeEntry getParentTreeEntry(PDFInteger index) const; + +private: using ParentTreeEntries = std::vector; std::map m_idTreeMap; @@ -598,27 +610,6 @@ private: PDFObjectReference m_objectReference; }; -/// Text extractor for structure tree. Can extract text to fill structure tree contents. -class PDFFORQTLIBSHARED_EXPORT PDFStructureTreeTextExtractor -{ -public: - explicit PDFStructureTreeTextExtractor(const PDFDocument* document, const PDFStructureTree* tree); - - /// Performs text extracting algorithm. Only \p pageIndices - /// pages are processed for text extraction. - /// \param pageIndices Page indices - void perform(const std::vector& pageIndices); - - /// Returns a list of errors/warnings - const QList& getErrors() const { return m_errors; } - -private: - QList m_errors; - const PDFDocument* m_document; - const PDFStructureTree* m_tree; - QStringList m_unmatchedText; -}; - } // namespace pdf #endif // PDFSTRUCTURETREE_H diff --git a/PdfTool/pdfoutputformatter.cpp b/PdfTool/pdfoutputformatter.cpp index ede25f0..acf6089 100644 --- a/PdfTool/pdfoutputformatter.cpp +++ b/PdfTool/pdfoutputformatter.cpp @@ -649,7 +649,14 @@ void PDFConsole::writeText(QString text, QString codecName) { // Write console failed. This can happen only, if outputHandle is not handle // to console screen buffer, but, for example a file or a pipe. - if (QTextCodec* codec = QTextCodec::codecForName(codecName.toLatin1())) + QTextCodec* codec = QTextCodec::codecForName(codecName.toLatin1()); + if (!codec) + { + codec = QTextCodec::codecForName("UTF-8"); + writeError(QString("No codec found for '%1'. Defaulting to text codec '%2'.").arg(codecName, QString::fromLatin1(codec->name())), codecName); + } + + if (codec) { QByteArray encodedData = codec->fromUnicode(text); WriteFile(outputHandle, encodedData.constData(), encodedData.size(), nullptr, nullptr); @@ -675,7 +682,13 @@ void PDFConsole::writeError(QString text, QString codecName) { // Write console failed. This can happen only, if outputHandle is not handle // to console screen buffer, but, for example a file or a pipe. - if (QTextCodec* codec = QTextCodec::codecForName(codecName.toLatin1())) + QTextCodec* codec = QTextCodec::codecForName(codecName.toLatin1()); + if (!codec) + { + codec = QTextCodec::codecForName("UTF-8"); + } + + if (codec) { QByteArray encodedData = codec->fromUnicode(text); WriteFile(outputHandle, encodedData.constData(), encodedData.size(), nullptr, nullptr); diff --git a/PdfTool/pdftoolabstractapplication.h b/PdfTool/pdftoolabstractapplication.h index 8d92375..3cd88f2 100644 --- a/PdfTool/pdftoolabstractapplication.h +++ b/PdfTool/pdftoolabstractapplication.h @@ -43,7 +43,7 @@ struct PDFToolOptions { // For option 'ConsoleFormat' PDFOutputFormatter::Style outputStyle = PDFOutputFormatter::Style::Text; - QString outputCodec; + QString outputCodec = "UTF-8"; // For option 'DateFormat' Qt::DateFormat outputDateFormat = Qt::DefaultLocaleShortDate; diff --git a/PdfTool/pdftoolfetchtext.cpp b/PdfTool/pdftoolfetchtext.cpp index b9e2b67..8565eae 100644 --- a/PdfTool/pdftoolfetchtext.cpp +++ b/PdfTool/pdftoolfetchtext.cpp @@ -65,17 +65,48 @@ int PDFToolFetchTextApplication::execute(const PDFToolOptions& options) pdf::PDFDocumentTextFlowFactory factory; pdf::PDFDocumentTextFlow documentTextFlow = factory.create(&document, pages, options.textAnalysisAlgorithm); + PDFOutputFormatter formatter(options.outputStyle, options.outputCodec); + formatter.beginDocument("text-extraction", QString()); + formatter.endl(); + + for (const pdf::PDFDocumentTextFlow::Item& item : documentTextFlow.getItems()) + { + if (item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureItemStart)) + { + formatter.beginHeader("item", item.text); + } + + if (!item.text.isEmpty()) + { + formatter.writeText("text", item.text); + } + + if (item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureItemEnd)) + { + formatter.endHeader(); + } + + if (item.flags.testFlag(pdf::PDFDocumentTextFlow::PageEnd)) + { + formatter.endl(); + } + } + + formatter.endDocument(); + for (const pdf::PDFRenderError& error : factory.getErrors()) { PDFConsole::writeError(error.message, options.outputCodec); } + PDFConsole::writeText(formatter.getString(), options.outputCodec); + return ExitSuccess; } PDFToolAbstractApplication::Options PDFToolFetchTextApplication::getOptionsFlags() const { - return ConsoleFormat | OpenDocument | TextAnalysis; + return ConsoleFormat | OpenDocument | PageSelector | TextAnalysis; } } // namespace pdftool