mirror of
https://github.com/JakubMelka/PDF4QT.git
synced 2025-01-22 21:30:06 +01:00
1093 lines
38 KiB
C++
1093 lines
38 KiB
C++
// Copyright (C) 2020-2022 Jakub Melka
|
|
//
|
|
// This file is part of PDF4QT.
|
|
//
|
|
// PDF4QT is free software: you can redistribute it and/or modify
|
|
// it under the terms of the GNU Lesser General Public License as published by
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
// with the written consent of the copyright owner, any later version.
|
|
//
|
|
// PDF4QT is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU Lesser General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU Lesser General Public License
|
|
// along with PDF4QT. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
#include "pdfdocumenttextflow.h"
|
|
#include "pdfdocument.h"
|
|
#include "pdfstructuretree.h"
|
|
#include "pdfexecutionpolicy.h"
|
|
#include "pdfconstants.h"
|
|
#include "pdfcms.h"
|
|
#include "pdftextlayoutgenerator.h"
|
|
#include "pdfpagecontentprocessor.h"
|
|
#include "pdfdbgheap.h"
|
|
|
|
namespace pdf
|
|
{
|
|
|
|
class PDFStructureTreeReferenceCollector : public PDFStructureTreeAbstractVisitor
|
|
{
|
|
public:
|
|
explicit inline PDFStructureTreeReferenceCollector(std::map<PDFObjectReference, const PDFStructureItem*>* mapping) :
|
|
m_mapping(mapping)
|
|
{
|
|
|
|
}
|
|
|
|
virtual void visitStructureTree(const PDFStructureTree* structureTree) override;
|
|
virtual void visitStructureElement(const PDFStructureElement* structureElement) override;
|
|
virtual void visitStructureMarkedContentReference(const PDFStructureMarkedContentReference* structureMarkedContentReference) override;
|
|
virtual void visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference) override;
|
|
|
|
private:
|
|
void addReference(const PDFStructureItem* structureObjectReference);
|
|
|
|
std::map<PDFObjectReference, const PDFStructureItem*>* m_mapping;
|
|
};
|
|
|
|
void PDFStructureTreeReferenceCollector::visitStructureTree(const PDFStructureTree* structureTree)
|
|
{
|
|
addReference(structureTree);
|
|
acceptChildren(structureTree);
|
|
}
|
|
|
|
void PDFStructureTreeReferenceCollector::visitStructureElement(const PDFStructureElement* structureElement)
|
|
{
|
|
addReference(structureElement);
|
|
acceptChildren(structureElement);
|
|
}
|
|
|
|
void PDFStructureTreeReferenceCollector::visitStructureMarkedContentReference(const PDFStructureMarkedContentReference* structureMarkedContentReference)
|
|
{
|
|
addReference(structureMarkedContentReference);
|
|
acceptChildren(structureMarkedContentReference);
|
|
}
|
|
|
|
void PDFStructureTreeReferenceCollector::visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference)
|
|
{
|
|
addReference(structureObjectReference);
|
|
acceptChildren(structureObjectReference);
|
|
}
|
|
|
|
void PDFStructureTreeReferenceCollector::addReference(const PDFStructureItem* structureItem)
|
|
{
|
|
if (structureItem->getSelfReference().isValid())
|
|
{
|
|
(*m_mapping)[structureItem->getSelfReference()] = structureItem;
|
|
}
|
|
}
|
|
|
|
struct PDFStructureTreeTextItem
|
|
{
|
|
enum class Type
|
|
{
|
|
StartTag,
|
|
EndTag,
|
|
Text
|
|
};
|
|
|
|
PDFStructureTreeTextItem() = default;
|
|
PDFStructureTreeTextItem(Type type, const PDFStructureItem* item, QString text, PDFInteger pageIndex, QRectF boundingRect, std::vector<QRectF> characterBoundingRects) :
|
|
type(type), item(item), text(qMove(text)), pageIndex(pageIndex), boundingRect(boundingRect), characterBoundingRects(std::move(characterBoundingRects))
|
|
{
|
|
|
|
}
|
|
|
|
static PDFStructureTreeTextItem createText(QString text, PDFInteger pageIndex, QRectF boundingRect, std::vector<QRectF> characterBoundingRects) { return PDFStructureTreeTextItem(Type::Text, nullptr, qMove(text), pageIndex, boundingRect, std::move(characterBoundingRects)); }
|
|
static PDFStructureTreeTextItem createStartTag(const PDFStructureItem* item) { return PDFStructureTreeTextItem(Type::StartTag, item, QString(), -1, QRectF(), { }); }
|
|
static PDFStructureTreeTextItem createEndTag(const PDFStructureItem* item) { return PDFStructureTreeTextItem(Type::EndTag, item, QString(), -1, QRectF(), { }); }
|
|
|
|
Type type = Type::Text;
|
|
const PDFStructureItem* item = nullptr;
|
|
QString text;
|
|
PDFInteger pageIndex = -1;
|
|
QRectF boundingRect;
|
|
std::vector<QRectF> characterBoundingRects;
|
|
};
|
|
|
|
using PDFStructureTreeTextSequence = std::vector<PDFStructureTreeTextItem>;
|
|
|
|
/// Text extractor for structure tree. Extracts sequences of structure items,
|
|
/// page sequences are stored in \p textSequences. They can be accessed using
|
|
/// getters.
|
|
class PDFStructureTreeTextExtractor
|
|
{
|
|
public:
|
|
enum Option
|
|
{
|
|
None = 0x0000,
|
|
SkipArtifact = 0x0001, ///< Skip content marked as 'Artifact'
|
|
AdjustReversedText = 0x0002, ///< Adjust reversed text
|
|
CreateTreeMapping = 0x0004, ///< Create text mapping to structure tree item
|
|
BoundingBoxes = 0x0008, ///< Compute bounding boxes of the texts
|
|
};
|
|
Q_DECLARE_FLAGS(Options, Option)
|
|
|
|
explicit PDFStructureTreeTextExtractor(const PDFDocument* document, const PDFStructureTree* tree, Options options);
|
|
|
|
/// Performs text extracting algorithm. Only \p pageIndices
|
|
/// pages are processed for text extraction.
|
|
/// \param pageIndices Page indices
|
|
void perform(const std::vector<PDFInteger>& pageIndices);
|
|
|
|
/// Returns a list of errors/warnings
|
|
const QList<PDFRenderError>& getErrors() const { return m_errors; }
|
|
|
|
/// Returns a list of unmatched text
|
|
const QStringList& getUnmatchedText() const { return m_unmatchedText; }
|
|
|
|
/// Returns text sequence for given page. If page number is invalid,
|
|
/// then empty text sequence is returned.
|
|
/// \param pageNumber Page number
|
|
const PDFStructureTreeTextSequence& getTextSequence(PDFInteger pageNumber) const;
|
|
|
|
struct TextItem
|
|
{
|
|
QRectF boundingRect;
|
|
PDFInteger pageIndex = -1;
|
|
QString text;
|
|
std::vector<QRectF> characterBoundingRects;
|
|
};
|
|
|
|
using TextItems = std::vector<TextItem>;
|
|
|
|
/// Returns text for given structure tree item. If structure tree item
|
|
/// is not found, then empty list is returned. This functionality
|
|
/// requires, that \p CreateTreeMapping flag is being set.
|
|
/// \param item Item
|
|
const TextItems& getText(const PDFStructureItem* item) const;
|
|
|
|
private:
|
|
QList<PDFRenderError> m_errors;
|
|
const PDFDocument* m_document;
|
|
const PDFStructureTree* m_tree;
|
|
QStringList m_unmatchedText;
|
|
std::map<PDFInteger, PDFStructureTreeTextSequence> m_textSequences;
|
|
std::map<const PDFStructureItem*, TextItems> m_textForItems;
|
|
Options m_options;
|
|
};
|
|
|
|
Q_DECLARE_OPERATORS_FOR_FLAGS(PDFStructureTreeTextExtractor::Options)
|
|
|
|
class PDFStructureTreeTextContentProcessor : public PDFPageContentProcessor
|
|
{
|
|
using BaseClass = PDFPageContentProcessor;
|
|
|
|
public:
|
|
explicit PDFStructureTreeTextContentProcessor(PDFRenderer::Features features,
|
|
const PDFPage* page,
|
|
const PDFDocument* document,
|
|
const PDFFontCache* fontCache,
|
|
const PDFCMS* cms,
|
|
const PDFOptionalContentActivity* optionalContentActivity,
|
|
QTransform pagePointToDevicePointMatrix,
|
|
const PDFMeshQualitySettings& meshQualitySettings,
|
|
const PDFStructureTree* tree,
|
|
const std::map<PDFObjectReference, const PDFStructureItem*>* mapping,
|
|
PDFStructureTreeTextExtractor::Options extractorOptions) :
|
|
BaseClass(page, document, fontCache, cms, optionalContentActivity, pagePointToDevicePointMatrix, meshQualitySettings),
|
|
m_features(features),
|
|
m_tree(tree),
|
|
m_mapping(mapping),
|
|
m_extractorOptions(extractorOptions),
|
|
m_pageIndex(document->getCatalog()->getPageIndexFromPageReference(page->getPageReference()))
|
|
{
|
|
|
|
}
|
|
|
|
PDFStructureTreeTextSequence& takeSequence() { return m_textSequence; }
|
|
QStringList& takeUnmatchedTexts() { return m_unmatchedText; }
|
|
|
|
protected:
|
|
virtual bool isContentSuppressedByOC(PDFObjectReference ocgOrOcmd) override;
|
|
virtual bool isContentKindSuppressed(ContentKind kind) const override;
|
|
virtual void performOutputCharacter(const PDFTextCharacterInfo& info) override;
|
|
virtual void performMarkedContentBegin(const QByteArray& tag, const PDFObject& properties) override;
|
|
virtual void performMarkedContentEnd() override;
|
|
|
|
private:
|
|
const PDFStructureItem* getStructureTreeItemFromMCID(PDFInteger mcid) const;
|
|
void finishText();
|
|
|
|
bool isArtifact() const;
|
|
bool isReversedText() const;
|
|
|
|
struct MarkedContentInfo
|
|
{
|
|
QByteArray tag;
|
|
PDFInteger mcid = -1;
|
|
const PDFStructureItem* structureTreeItem = nullptr;
|
|
bool isArtifact = false;
|
|
bool isReversedText = false;
|
|
};
|
|
|
|
PDFRenderer::Features m_features;
|
|
const PDFStructureTree* m_tree;
|
|
const std::map<PDFObjectReference, const PDFStructureItem*>* m_mapping;
|
|
std::vector<MarkedContentInfo> m_markedContentInfoStack;
|
|
QString m_currentText;
|
|
QRectF m_currentBoundingBox;
|
|
PDFStructureTreeTextSequence m_textSequence;
|
|
QStringList m_unmatchedText;
|
|
PDFStructureTreeTextExtractor::Options m_extractorOptions;
|
|
PDFInteger m_pageIndex;
|
|
std::vector<QRectF> m_characterBoundingRects;
|
|
};
|
|
|
|
void PDFStructureTreeTextContentProcessor::finishText()
|
|
{
|
|
QString trimmedText = m_currentText.trimmed();
|
|
const int index = m_currentText.indexOf(trimmedText);
|
|
Q_ASSERT(index != -1);
|
|
if (trimmedText.size() < m_currentText.size())
|
|
{
|
|
// Fix character bounding boxes...
|
|
if (m_characterBoundingRects.size() == static_cast<size_t>(m_currentText.size()))
|
|
{
|
|
std::vector<QRectF> boundingRects(std::next(m_characterBoundingRects.cbegin(), index), std::next(m_characterBoundingRects.cbegin(), index + trimmedText.length()));
|
|
m_characterBoundingRects = std::move(boundingRects);
|
|
}
|
|
m_currentText = std::move(trimmedText);
|
|
}
|
|
|
|
if (!m_currentText.isEmpty() && (!m_extractorOptions.testFlag(PDFStructureTreeTextExtractor::SkipArtifact) || !isArtifact()))
|
|
{
|
|
if (m_extractorOptions.testFlag(PDFStructureTreeTextExtractor::AdjustReversedText) && isReversedText())
|
|
{
|
|
QString reversed;
|
|
reversed.reserve(m_currentText.size());
|
|
for (auto it = m_currentText.rbegin(); it != m_currentText.rend(); ++it)
|
|
{
|
|
reversed.push_back(*it);
|
|
}
|
|
m_currentText = qMove(reversed);
|
|
std::reverse(m_characterBoundingRects.begin(), m_characterBoundingRects.end());
|
|
}
|
|
Q_ASSERT(static_cast<size_t>(m_currentText.size()) == m_characterBoundingRects.size() || m_characterBoundingRects.empty());
|
|
m_textSequence.emplace_back(PDFStructureTreeTextItem::createText(std::move(m_currentText), m_pageIndex, m_currentBoundingBox, std::move(m_characterBoundingRects)));
|
|
}
|
|
m_currentText = QString();
|
|
m_currentBoundingBox = QRectF();
|
|
m_characterBoundingRects.clear();
|
|
}
|
|
|
|
bool PDFStructureTreeTextContentProcessor::isArtifact() const
|
|
{
|
|
return std::any_of(m_markedContentInfoStack.cbegin(), m_markedContentInfoStack.cend(), [](const auto& item) { return item.isArtifact; });
|
|
}
|
|
|
|
bool PDFStructureTreeTextContentProcessor::isReversedText() const
|
|
{
|
|
return std::any_of(m_markedContentInfoStack.cbegin(), m_markedContentInfoStack.cend(), [](const auto& item) { return item.isReversedText; });
|
|
}
|
|
|
|
void PDFStructureTreeTextContentProcessor::performMarkedContentBegin(const QByteArray& tag, const PDFObject& properties)
|
|
{
|
|
MarkedContentInfo info;
|
|
info.tag = tag;
|
|
|
|
if (properties.isDictionary())
|
|
{
|
|
const PDFDictionary* dictionary = properties.getDictionary();
|
|
PDFObject mcid = dictionary->get("MCID");
|
|
if (mcid.isInt())
|
|
{
|
|
// We must finish text, because we can have a sequence of text,
|
|
// then subitem, then text, and followed by another subitem. They
|
|
// can be interleaved.
|
|
finishText();
|
|
|
|
info.mcid = mcid.getInteger();
|
|
info.structureTreeItem = getStructureTreeItemFromMCID(info.mcid);
|
|
info.isArtifact = tag == "Artifact";
|
|
info.isReversedText = tag == "ReversedChars";
|
|
|
|
if (!info.structureTreeItem)
|
|
{
|
|
reportRenderError(RenderErrorType::Error, PDFTranslationContext::tr("Structure tree item for MCID %1 not found.").arg(info.mcid));
|
|
}
|
|
|
|
if (info.structureTreeItem)
|
|
{
|
|
m_textSequence.emplace_back(PDFStructureTreeTextItem::createStartTag(info.structureTreeItem));
|
|
}
|
|
}
|
|
}
|
|
|
|
m_markedContentInfoStack.emplace_back(qMove(info));
|
|
}
|
|
|
|
void PDFStructureTreeTextContentProcessor::performMarkedContentEnd()
|
|
{
|
|
MarkedContentInfo info = qMove(m_markedContentInfoStack.back());
|
|
m_markedContentInfoStack.pop_back();
|
|
|
|
if (info.mcid != -1)
|
|
{
|
|
finishText();
|
|
if (info.structureTreeItem)
|
|
{
|
|
m_textSequence.emplace_back(PDFStructureTreeTextItem::createEndTag(info.structureTreeItem));
|
|
}
|
|
}
|
|
|
|
// Check for text, which doesn't belong to any structure tree item
|
|
if (m_markedContentInfoStack.empty())
|
|
{
|
|
m_currentText = m_currentText.trimmed();
|
|
if (!m_currentText.isEmpty())
|
|
{
|
|
m_unmatchedText << qMove(m_currentText);
|
|
}
|
|
m_currentBoundingBox = QRectF();
|
|
m_characterBoundingRects.clear();
|
|
}
|
|
}
|
|
|
|
/// Asks the structure tree for the parent of the marked content with
/// given identifier (using the structural parent key of the processed
/// content) and translates the result into a structure item via the
/// reference mapping.
/// \param mcid Marked content identifier
/// \returns Structure item, or nullptr, if it is not in the mapping
const PDFStructureItem* PDFStructureTreeTextContentProcessor::getStructureTreeItemFromMCID(PDFInteger mcid) const
{
    const auto parentReference = m_tree->getParent(getStructuralParentKey(), mcid);
    const auto found = m_mapping->find(parentReference);
    return (found != m_mapping->cend()) ? found->second : nullptr;
}
|
|
|
|
/// Optional content never suppresses anything, if the feature
/// IgnoreOptionalContent is set; otherwise the base class decides.
/// \param ocgOrOcmd Reference to optional content group or membership dictionary
bool PDFStructureTreeTextContentProcessor::isContentSuppressedByOC(PDFObjectReference ocgOrOcmd)
{
    const bool ignoreOptionalContent = m_features.testFlag(PDFRenderer::IgnoreOptionalContent);
    return !ignoreOptionalContent && PDFPageContentProcessor::isContentSuppressedByOC(ocgOrOcmd);
}
|
|
|
|
/// Reports text, shapes, images and shading as suppressed content kinds.
/// Tiling is not suppressed, because it can contain text.
/// \param kind Content kind
bool PDFStructureTreeTextContentProcessor::isContentKindSuppressed(ContentKind kind) const
{
    bool isSuppressed = false;

    switch (kind)
    {
        case ContentKind::Tiling:
            // Tiling can have text
            isSuppressed = false;
            break;

        case ContentKind::Text:
        case ContentKind::Shapes:
        case ContentKind::Images:
        case ContentKind::Shading:
            isSuppressed = true;
            break;

        default:
        {
            // Unknown content kind
            Q_ASSERT(false);
            break;
        }
    }

    return isSuppressed;
}
|
|
|
|
void PDFStructureTreeTextContentProcessor::performOutputCharacter(const PDFTextCharacterInfo& info)
|
|
{
|
|
if (!isContentSuppressed())
|
|
{
|
|
if (!info.character.isNull() && info.character != QChar(QChar::SoftHyphen))
|
|
{
|
|
m_currentText.push_back(info.character);
|
|
|
|
QPainterPath worldPath = info.matrix.map(info.outline);
|
|
if (!worldPath.isEmpty())
|
|
{
|
|
QRectF boundingRect = worldPath.controlPointRect();
|
|
m_currentBoundingBox = m_currentBoundingBox.united(boundingRect);
|
|
m_characterBoundingRects.push_back(boundingRect);
|
|
}
|
|
else
|
|
{
|
|
m_characterBoundingRects.push_back(QRectF());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Stores the document, the structure tree and extraction options.
/// No extraction is performed until perform() is called.
/// \param document Document
/// \param tree Structure tree
/// \param options Extraction options
PDFStructureTreeTextExtractor::PDFStructureTreeTextExtractor(const PDFDocument* document,
                                                             const PDFStructureTree* tree,
                                                             Options options) :
    m_document(document),
    m_tree(tree),
    m_options(options)
{

}
|
|
|
|
void PDFStructureTreeTextExtractor::perform(const std::vector<PDFInteger>& pageIndices)
|
|
{
|
|
std::map<PDFObjectReference, const PDFStructureItem*> mapping;
|
|
PDFStructureTreeReferenceCollector referenceCollector(&mapping);
|
|
m_tree->accept(&referenceCollector);
|
|
|
|
PDFFontCache fontCache(DEFAULT_FONT_CACHE_LIMIT, DEFAULT_REALIZED_FONT_CACHE_LIMIT);
|
|
|
|
QMutex mutex;
|
|
PDFCMSGeneric cms;
|
|
PDFMeshQualitySettings mqs;
|
|
PDFOptionalContentActivity oca(m_document, OCUsage::Export, nullptr);
|
|
pdf::PDFModifiedDocument md(const_cast<PDFDocument*>(m_document), &oca);
|
|
fontCache.setDocument(md);
|
|
fontCache.setCacheShrinkEnabled(nullptr, false);
|
|
|
|
auto generateTextLayout = [&, this](PDFInteger pageIndex)
|
|
{
|
|
const PDFCatalog* catalog = m_document->getCatalog();
|
|
if (!catalog->getPage(pageIndex))
|
|
{
|
|
// Invalid page index
|
|
return;
|
|
}
|
|
|
|
const PDFPage* page = catalog->getPage(pageIndex);
|
|
Q_ASSERT(page);
|
|
|
|
PDFStructureTreeTextContentProcessor processor(PDFRenderer::IgnoreOptionalContent, page, m_document, &fontCache, &cms, &oca, QTransform(), mqs, m_tree, &mapping, m_options);
|
|
QList<PDFRenderError> errors = processor.processContents();
|
|
|
|
QMutexLocker lock(&mutex);
|
|
m_textSequences[pageIndex] = qMove(processor.takeSequence());
|
|
m_unmatchedText << qMove(processor.takeUnmatchedTexts());
|
|
m_errors.append(qMove(errors));
|
|
};
|
|
|
|
PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, pageIndices.begin(), pageIndices.end(), generateTextLayout);
|
|
|
|
fontCache.setCacheShrinkEnabled(nullptr, true);
|
|
|
|
if (m_options.testFlag(CreateTreeMapping))
|
|
{
|
|
for (const auto& sequence : m_textSequences)
|
|
{
|
|
std::stack<const PDFStructureItem*> stack;
|
|
for (const PDFStructureTreeTextItem& sequenceItem : sequence.second)
|
|
{
|
|
switch (sequenceItem.type)
|
|
{
|
|
case PDFStructureTreeTextItem::Type::StartTag:
|
|
{
|
|
stack.push(sequenceItem.item);
|
|
break;
|
|
}
|
|
case PDFStructureTreeTextItem::Type::EndTag:
|
|
{
|
|
stack.pop();
|
|
break;
|
|
}
|
|
case PDFStructureTreeTextItem::Type::Text:
|
|
{
|
|
if (!stack.empty())
|
|
{
|
|
m_textForItems[stack.top()].emplace_back(TextItem{ sequenceItem.boundingRect, sequenceItem.pageIndex, sequenceItem.text, sequenceItem.characterBoundingRects });
|
|
}
|
|
break;
|
|
}
|
|
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Returns text sequence stored for given page index. If no sequence
/// is stored for the page, then a reference to a shared empty sequence
/// is returned.
/// \param pageIndex Page index
const PDFStructureTreeTextSequence& PDFStructureTreeTextExtractor::getTextSequence(PDFInteger pageIndex) const
{
    // Immutable fallback, const-qualified the same way as in getText()
    static const PDFStructureTreeTextSequence dummy;

    const auto it = m_textSequences.find(pageIndex);
    return (it != m_textSequences.cend()) ? it->second : dummy;
}
|
|
|
|
/// Returns texts assigned to given structure item. If nothing is
/// assigned to the item, then a reference to a shared empty list
/// is returned.
/// \param item Item
const PDFStructureTreeTextExtractor::TextItems& PDFStructureTreeTextExtractor::getText(const PDFStructureItem* item) const
{
    static const TextItems dummy;

    const auto found = m_textForItems.find(item);
    return (found != m_textForItems.cend()) ? found->second : dummy;
}
|
|
|
|
|
|
class PDFStructureTreeTextFlowCollector : public PDFStructureTreeAbstractVisitor
|
|
{
|
|
public:
|
|
explicit PDFStructureTreeTextFlowCollector(PDFDocumentTextFlow::Items* items, const PDFStructureTreeTextExtractor* extractor) :
|
|
m_items(items),
|
|
m_extractor(extractor)
|
|
{
|
|
|
|
}
|
|
|
|
virtual void visitStructureTree(const PDFStructureTree* structureTree) override;
|
|
virtual void visitStructureElement(const PDFStructureElement* structureElement) override;
|
|
virtual void visitStructureMarkedContentReference(const PDFStructureMarkedContentReference* structureMarkedContentReference) override;
|
|
virtual void visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference) override;
|
|
|
|
private:
|
|
void markHasContent();
|
|
|
|
PDFDocumentTextFlow::Items* m_items;
|
|
const PDFStructureTreeTextExtractor* m_extractor;
|
|
std::vector<bool> m_hasContentStack;
|
|
};
|
|
|
|
void PDFStructureTreeTextFlowCollector::visitStructureTree(const PDFStructureTree* structureTree)
|
|
{
|
|
m_items->push_back(PDFDocumentTextFlow::Item{ QRectF(), -1, QString(), PDFDocumentTextFlow::StructureItemStart, {} });
|
|
acceptChildren(structureTree);
|
|
m_items->push_back(PDFDocumentTextFlow::Item{ QRectF(), -1, QString(), PDFDocumentTextFlow::StructureItemEnd, {} });
|
|
}
|
|
|
|
void PDFStructureTreeTextFlowCollector::markHasContent()
|
|
{
|
|
for (size_t i = 0; i < m_hasContentStack.size(); ++i)
|
|
{
|
|
m_hasContentStack[i] = true;
|
|
}
|
|
}
|
|
|
|
void PDFStructureTreeTextFlowCollector::visitStructureElement(const PDFStructureElement* structureElement)
|
|
{
|
|
size_t index = m_items->size();
|
|
m_items->push_back(PDFDocumentTextFlow::Item{ QRectF(), -1, QString(), PDFDocumentTextFlow::StructureItemStart, {} });
|
|
|
|
// Mark stack so we can delete unused items
|
|
m_hasContentStack.push_back(false);
|
|
|
|
QString title = structureElement->getText(PDFStructureElement::Title);
|
|
QString language = structureElement->getText(PDFStructureElement::Language);
|
|
QString alternativeDescription = structureElement->getText(PDFStructureElement::AlternativeDescription);
|
|
QString expandedForm = structureElement->getText(PDFStructureElement::ExpandedForm);
|
|
QString actualText = structureElement->getText(PDFStructureElement::ActualText);
|
|
QString phoneme = structureElement->getText(PDFStructureElement::Phoneme);
|
|
|
|
if (!title.isEmpty())
|
|
{
|
|
markHasContent();
|
|
m_items->push_back(PDFDocumentTextFlow::Item{ QRectF(), -1, QString(), PDFDocumentTextFlow::StructureTitle, {} });
|
|
}
|
|
|
|
if (!language.isEmpty())
|
|
{
|
|
markHasContent();
|
|
m_items->push_back(PDFDocumentTextFlow::Item{ QRectF(), -1, language, PDFDocumentTextFlow::StructureLanguage, {} });
|
|
}
|
|
|
|
if (!alternativeDescription.isEmpty())
|
|
{
|
|
markHasContent();
|
|
m_items->push_back(PDFDocumentTextFlow::Item{ QRectF(), -1, alternativeDescription, PDFDocumentTextFlow::StructureAlternativeDescription, {} });
|
|
}
|
|
|
|
if (!expandedForm.isEmpty())
|
|
{
|
|
markHasContent();
|
|
m_items->push_back(PDFDocumentTextFlow::Item{ QRectF(), -1, expandedForm, PDFDocumentTextFlow::StructureExpandedForm, {} });
|
|
}
|
|
|
|
if (!actualText.isEmpty())
|
|
{
|
|
markHasContent();
|
|
m_items->push_back(PDFDocumentTextFlow::Item{ QRectF(), -1, actualText, PDFDocumentTextFlow::StructureActualText, {} });
|
|
}
|
|
|
|
if (!phoneme.isEmpty())
|
|
{
|
|
markHasContent();
|
|
m_items->push_back(PDFDocumentTextFlow::Item{ QRectF(), -1, phoneme, PDFDocumentTextFlow::StructurePhoneme, {} });
|
|
}
|
|
|
|
for (const auto& textItem : m_extractor->getText(structureElement))
|
|
{
|
|
markHasContent();
|
|
m_items->push_back(PDFDocumentTextFlow::Item{ textItem.boundingRect, textItem.pageIndex, textItem.text, PDFDocumentTextFlow::Text, textItem.characterBoundingRects });
|
|
}
|
|
|
|
acceptChildren(structureElement);
|
|
|
|
const bool hasContent = m_hasContentStack.back();
|
|
m_hasContentStack.pop_back();
|
|
|
|
m_items->push_back(PDFDocumentTextFlow::Item{ QRectF(), -1, QString(), PDFDocumentTextFlow::StructureItemEnd, {} });
|
|
|
|
if (!hasContent)
|
|
{
|
|
// Delete unused content
|
|
m_items->erase(std::next(m_items->begin(), index), m_items->end());
|
|
}
|
|
}
|
|
|
|
void PDFStructureTreeTextFlowCollector::visitStructureMarkedContentReference(const PDFStructureMarkedContentReference* structureMarkedContentReference)
|
|
{
|
|
acceptChildren(structureMarkedContentReference);
|
|
}
|
|
|
|
void PDFStructureTreeTextFlowCollector::visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference)
|
|
{
|
|
acceptChildren(structureObjectReference);
|
|
}
|
|
|
|
/// Creates document text flow for given pages using selected algorithm.
/// With Algorithm::Auto, the structure algorithm is chosen, if the catalog
/// reports logical structure as marked and the parsed structure tree is
/// valid; otherwise the layout algorithm is used. Errors reported during
/// processing are appended to the factory's error list.
/// \param document Document
/// \param pageIndices Indices of pages to be processed
/// \param algorithm Algorithm
/// \returns Text flow (empty, if structure algorithm is requested
///          for a document without valid structure tree)
PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* document, const std::vector<PDFInteger>& pageIndices, Algorithm algorithm)
{
    PDFDocumentTextFlow result;
    PDFStructureTree structureTree;

    const PDFCatalog* catalog = document->getCatalog();
    if (algorithm != Algorithm::Layout)
    {
        // Structure tree is needed by all algorithms except pure layout
        structureTree = PDFStructureTree::parse(&document->getStorage(), catalog->getStructureTreeRoot());
    }

    if (algorithm == Algorithm::Auto)
    {
        // Determine algorithm
        if (catalog->isLogicalStructureMarked() && structureTree.isValid())
        {
            algorithm = Algorithm::Structure;
        }
        else
        {
            algorithm = Algorithm::Layout;
        }
    }

    Q_ASSERT(algorithm != Algorithm::Auto);

    // Perform algorithm to retrieve document text
    switch (algorithm)
    {
        case Algorithm::Layout:
        {
            PDFFontCache fontCache(DEFAULT_FONT_CACHE_LIMIT, DEFAULT_REALIZED_FONT_CACHE_LIMIT);

            // Items of pages keyed by page index, so the result is ordered by page
            std::map<PDFInteger, PDFDocumentTextFlow::Items> items;

            QMutex mutex;
            PDFCMSGeneric cms;
            PDFMeshQualitySettings mqs;
            PDFOptionalContentActivity oca(document, OCUsage::Export, nullptr);
            pdf::PDFModifiedDocument md(const_cast<PDFDocument*>(document), &oca);
            fontCache.setDocument(md);
            fontCache.setCacheShrinkEnabled(nullptr, false);

            auto generateTextLayout = [this, &items, &mutex, &fontCache, &cms, &mqs, &oca, document, catalog](PDFInteger pageIndex)
            {
                const PDFPage* page = catalog->getPage(pageIndex);
                if (!page)
                {
                    // Invalid page index
                    return;
                }

                PDFTextLayoutGenerator generator(PDFRenderer::IgnoreOptionalContent, page, document, &fontCache, &cms, &oca, QTransform(), mqs);
                QList<PDFRenderError> errors = generator.processContents();
                PDFTextLayout textLayout = generator.createTextLayout();
                PDFTextFlows textFlows = PDFTextFlow::createTextFlows(textLayout, PDFTextFlow::FlowFlags(PDFTextFlow::SeparateBlocks) | PDFTextFlow::RemoveSoftHyphen, pageIndex);

                PDFDocumentTextFlow::Items flowItems;
                flowItems.emplace_back(PDFDocumentTextFlow::Item{ QRectF(), pageIndex, PDFTranslationContext::tr("Page %1").arg(pageIndex + 1), PDFDocumentTextFlow::PageStart, {} });
                for (const PDFTextFlow& textFlow : textFlows)
                {
                    flowItems.emplace_back(PDFDocumentTextFlow::Item{ textFlow.getBoundingBox(), pageIndex, textFlow.getText(), PDFDocumentTextFlow::Text, textFlow.getBoundingBoxes() });
                }
                flowItems.emplace_back(PDFDocumentTextFlow::Item{ QRectF(), pageIndex, QString(), PDFDocumentTextFlow::PageEnd, {} });

                // Shared containers are modified under the lock
                QMutexLocker lock(&mutex);
                items[pageIndex] = qMove(flowItems);
                m_errors.append(qMove(errors));
            };

            PDFExecutionPolicy::execute(PDFExecutionPolicy::Scope::Page, pageIndices.begin(), pageIndices.end(), generateTextLayout);

            fontCache.setCacheShrinkEnabled(nullptr, true);

            PDFDocumentTextFlow::Items flowItems;
            // Iterate by non-const reference; with a const reference the
            // move iterators would silently copy every item.
            for (auto& item : items)
            {
                flowItems.insert(flowItems.end(), std::make_move_iterator(item.second.begin()), std::make_move_iterator(item.second.end()));
            }

            result = PDFDocumentTextFlow(qMove(flowItems));
            break;
        }

        case Algorithm::Structure:
        {
            if (!structureTree.isValid())
            {
                m_errors << PDFRenderError(RenderErrorType::Error, PDFTranslationContext::tr("Valid tagged document required."));
                break;
            }

            PDFStructureTreeTextExtractor::Options options = PDFStructureTreeTextExtractor::SkipArtifact | PDFStructureTreeTextExtractor::AdjustReversedText | PDFStructureTreeTextExtractor::CreateTreeMapping;
            options.setFlag(PDFStructureTreeTextExtractor::BoundingBoxes, m_calculateBoundingBoxes);
            PDFStructureTreeTextExtractor extractor(document, &structureTree, options);
            extractor.perform(pageIndices);

            // Flatten the structure tree with assigned texts into flow items
            PDFDocumentTextFlow::Items flowItems;
            PDFStructureTreeTextFlowCollector collector(&flowItems, &extractor);
            structureTree.accept(&collector);

            result = PDFDocumentTextFlow(qMove(flowItems));
            m_errors.append(extractor.getErrors());
            break;
        }

        case Algorithm::Content:
        {
            PDFStructureTreeTextExtractor::Options options = PDFStructureTreeTextExtractor::None;
            options.setFlag(PDFStructureTreeTextExtractor::BoundingBoxes, m_calculateBoundingBoxes);
            PDFStructureTreeTextExtractor extractor(document, &structureTree, options);
            extractor.perform(pageIndices);

            PDFDocumentTextFlow::Items flowItems;
            for (PDFInteger pageIndex : pageIndices)
            {
                flowItems.emplace_back(PDFDocumentTextFlow::Item{ QRectF(), pageIndex, PDFTranslationContext::tr("Page %1").arg(pageIndex + 1), PDFDocumentTextFlow::PageStart, {} });
                for (const PDFStructureTreeTextItem& sequenceItem : extractor.getTextSequence(pageIndex))
                {
                    // Tags are ignored, only texts in content order are used
                    if (sequenceItem.type == PDFStructureTreeTextItem::Type::Text)
                    {
                        flowItems.emplace_back(PDFDocumentTextFlow::Item{ sequenceItem.boundingRect, pageIndex, sequenceItem.text, PDFDocumentTextFlow::Text, sequenceItem.characterBoundingRects });
                    }
                }
                flowItems.emplace_back(PDFDocumentTextFlow::Item{ QRectF(), pageIndex, QString(), PDFDocumentTextFlow::PageEnd, {} });
            }

            result = PDFDocumentTextFlow(qMove(flowItems));
            m_errors.append(extractor.getErrors());
            break;
        }

        default:
            Q_ASSERT(false);
            break;
    }

    return result;
}
|
|
|
|
/// Creates document text flow for all pages of the document.
/// \param document Document
/// \param algorithm Algorithm
PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* document, Algorithm algorithm)
{
    // Page indices 0, 1, ..., pageCount - 1
    std::vector<pdf::PDFInteger> allPageIndices(document->getCatalog()->getPageCount(), 0);
    std::iota(allPageIndices.begin(), allPageIndices.end(), 0);

    return create(document, allPageIndices, algorithm);
}
|
|
|
|
void PDFDocumentTextFlowFactory::setCalculateBoundingBoxes(bool calculateBoundingBoxes)
|
|
{
|
|
m_calculateBoundingBoxes = calculateBoundingBoxes;
|
|
}
|
|
|
|
/// Takes over the text flow as the original text flow and recreates
/// the edited text flow from it.
/// \param textFlow Text flow
void PDFDocumentTextFlowEditor::setTextFlow(PDFDocumentTextFlow textFlow)
{
    // Original must be stored first, edited flow is derived from it
    m_originalTextFlow = std::move(textFlow);
    createEditedFromOriginalTextFlow();
}
|
|
|
|
void PDFDocumentTextFlowEditor::setSelectionActive(bool active)
|
|
{
|
|
for (auto& item : m_editedTextFlow)
|
|
{
|
|
if (item.editedItemFlags.testFlag(Selected))
|
|
{
|
|
item.editedItemFlags.setFlag(Removed, !active);
|
|
}
|
|
}
|
|
}
|
|
|
|
void PDFDocumentTextFlowEditor::select(size_t index, bool select)
|
|
{
|
|
getEditedItem(index)->editedItemFlags.setFlag(Selected, select);
|
|
}
|
|
|
|
void PDFDocumentTextFlowEditor::deselect()
|
|
{
|
|
for (auto& item : m_editedTextFlow)
|
|
{
|
|
item.editedItemFlags.setFlag(Selected, false);
|
|
}
|
|
}
|
|
|
|
void PDFDocumentTextFlowEditor::removeItem(size_t index)
|
|
{
|
|
getEditedItem(index)->editedItemFlags.setFlag(Removed, true);
|
|
}
|
|
|
|
void PDFDocumentTextFlowEditor::addItem(size_t index)
|
|
{
|
|
getEditedItem(index)->editedItemFlags.setFlag(Removed, false);
|
|
}
|
|
|
|
void PDFDocumentTextFlowEditor::clear()
|
|
{
|
|
m_originalTextFlow = PDFDocumentTextFlow();
|
|
m_editedTextFlow.clear();
|
|
m_pageIndicesMapping.clear();
|
|
}
|
|
|
|
void PDFDocumentTextFlowEditor::setText(const QString& text, size_t index)
|
|
{
|
|
EditedItem* item = getEditedItem(index);
|
|
item->text = text;
|
|
updateModifiedFlag(index);
|
|
}
|
|
|
|
bool PDFDocumentTextFlowEditor::isSelectionEmpty() const
|
|
{
|
|
return std::all_of(m_editedTextFlow.cbegin(), m_editedTextFlow.cend(), [](const auto& item) { return !item.editedItemFlags.testFlag(Selected); });
|
|
}
|
|
|
|
bool PDFDocumentTextFlowEditor::isSelectionModified() const
|
|
{
|
|
return std::any_of(m_editedTextFlow.cbegin(), m_editedTextFlow.cend(), [](const auto& item) { return item.editedItemFlags.testFlag(Selected) && item.editedItemFlags.testFlag(Modified); });
|
|
}
|
|
|
|
/// Selects exactly those items, whose bounding rectangle is not empty
/// and is contained in given rectangle. All other items are deselected.
/// \param rectangle Selection rectangle
void PDFDocumentTextFlowEditor::selectByRectangle(QRectF rectangle)
{
    for (auto& editedItem : m_editedTextFlow)
    {
        const QRectF& itemRectangle = editedItem.boundingRect;

        // Items without geometry can never be selected by rectangle
        const bool isSelected = !itemRectangle.isEmpty() && rectangle.contains(itemRectangle);
        editedItem.editedItemFlags.setFlag(Selected, isSelected);
    }
}
|
|
|
|
/// Selects exactly those items, whose text contains given text
/// (case sensitive comparison). All other items are deselected.
/// \param text Searched text
void PDFDocumentTextFlowEditor::selectByContainedText(QString text)
{
    std::for_each(m_editedTextFlow.begin(), m_editedTextFlow.end(), [&text](auto& editedItem)
    {
        editedItem.editedItemFlags.setFlag(Selected, editedItem.text.contains(text, Qt::CaseSensitive));
    });
}
|
|
|
|
void PDFDocumentTextFlowEditor::selectByRegularExpression(const QRegularExpression& expression)
|
|
{
|
|
for (auto& item : m_editedTextFlow)
|
|
{
|
|
QRegularExpressionMatch match = expression.match(item.text, 0, QRegularExpression::NormalMatch, QRegularExpression::NoMatchOption);
|
|
const bool hasMatch = match.hasMatch();
|
|
item.editedItemFlags.setFlag(Selected, hasMatch);
|
|
}
|
|
}
|
|
|
|
/// Sets the Selected flag of every item according to whether the value
/// (item page index + 1) is present in the given interval set.
/// NOTE(review): the +1 suggests the set holds one-based page numbers - confirm with callers.
/// \param indices Set of pages to be selected
void PDFDocumentTextFlowEditor::selectByPageIndices(const pdf::PDFClosedIntervalSet& indices)
{
    // Sorted list of pages, so each item can be tested by binary search
    std::vector<PDFInteger> sortedPages = indices.unfold();
    std::sort(sortedPages.begin(), sortedPages.end());

    const auto pagesBegin = sortedPages.begin();
    const auto pagesEnd = sortedPages.end();

    for (auto& editedItem : m_editedTextFlow)
    {
        const bool isOnSelectedPage = std::binary_search(pagesBegin, pagesEnd, editedItem.pageIndex + 1);
        editedItem.editedItemFlags.setFlag(Selected, isOnSelectedPage);
    }
}
|
|
|
|
void PDFDocumentTextFlowEditor::restoreOriginalTexts()
|
|
{
|
|
for (auto& item : m_editedTextFlow)
|
|
{
|
|
if (item.editedItemFlags.testFlag(Selected))
|
|
{
|
|
item.text = m_originalTextFlow.getItem(item.originalIndex)->text;
|
|
item.editedItemFlags.setFlag(Modified, false);
|
|
}
|
|
}
|
|
}
|
|
|
|
void PDFDocumentTextFlowEditor::moveSelectionUp()
|
|
{
|
|
size_t originalSize = m_editedTextFlow.size();
|
|
size_t firstSelectedIndex = originalSize;
|
|
|
|
EditedItems selectedItems;
|
|
for (auto it = m_editedTextFlow.begin(); it != m_editedTextFlow.end();)
|
|
{
|
|
if (it->editedItemFlags.testFlag(Selected))
|
|
{
|
|
if (firstSelectedIndex == originalSize)
|
|
{
|
|
firstSelectedIndex = std::distance(m_editedTextFlow.begin(), it);
|
|
}
|
|
|
|
selectedItems.emplace_back(std::move(*it));
|
|
it = m_editedTextFlow.erase(it);
|
|
}
|
|
else
|
|
{
|
|
++it;
|
|
}
|
|
}
|
|
|
|
if (firstSelectedIndex > 0)
|
|
{
|
|
--firstSelectedIndex;
|
|
}
|
|
|
|
m_editedTextFlow.insert(std::next(m_editedTextFlow.begin(), firstSelectedIndex), std::make_move_iterator(selectedItems.begin()), std::make_move_iterator(selectedItems.end()));
|
|
}
|
|
|
|
void PDFDocumentTextFlowEditor::moveSelectionDown()
|
|
{
|
|
size_t originalSize = m_editedTextFlow.size();
|
|
size_t lastSelectedIndex = originalSize;
|
|
|
|
EditedItems selectedItems;
|
|
for (auto it = m_editedTextFlow.begin(); it != m_editedTextFlow.end();)
|
|
{
|
|
if (it->editedItemFlags.testFlag(Selected))
|
|
{
|
|
lastSelectedIndex = std::distance(m_editedTextFlow.begin(), it);
|
|
selectedItems.emplace_back(std::move(*it));
|
|
it = m_editedTextFlow.erase(it);
|
|
}
|
|
else
|
|
{
|
|
++it;
|
|
}
|
|
}
|
|
|
|
if (lastSelectedIndex < m_editedTextFlow.size())
|
|
{
|
|
++lastSelectedIndex;
|
|
}
|
|
|
|
m_editedTextFlow.insert(std::next(m_editedTextFlow.begin(), lastSelectedIndex), std::make_move_iterator(selectedItems.begin()), std::make_move_iterator(selectedItems.end()));
|
|
}
|
|
|
|
/// Returns range of page mapping entries, whose page index is equal to the
/// given page index. Only the page index (first member of the entry) takes
/// part in the comparison.
/// \param pageIndex Page index
PDFDocumentTextFlowEditor::PageIndicesMappingRange PDFDocumentTextFlowEditor::getItemsForPageIndex(PDFInteger pageIndex) const
{
    const auto comparePageIndex = [](const auto& left, const auto& right)
    {
        return left.first < right.first;
    };

    // Item index in the searched key is a dummy value, the comparator ignores it
    const auto searchedEntry = std::make_pair(pageIndex, size_t(0));

    return std::equal_range(m_pageIndicesMapping.cbegin(), m_pageIndicesMapping.cend(), searchedEntry, comparePageIndex);
}
|
|
|
|
/// Creates text flow from the edited items. Removed items are skipped. Each
/// resulting item is a copy of the corresponding original item, with the text
/// replaced by the edited text. Items are emitted in the order of the edited
/// text flow.
PDFDocumentTextFlow PDFDocumentTextFlowEditor::createEditedTextFlow() const
{
    const size_t itemCount = getItemCount();

    PDFDocumentTextFlow::Items items;
    items.reserve(itemCount);

    for (size_t i = 0; i < itemCount; ++i)
    {
        if (isRemoved(i))
        {
            continue;
        }

        // Index of the edited item is not the index of its original item
        // (whitespace-only original items are not present in the edited flow
        // and edited items can be reordered), so the original item must be
        // looked up using the stored original index.
        const size_t originalIndex = m_editedTextFlow[i].originalIndex;

        PDFDocumentTextFlow::Item item = *m_originalTextFlow.getItem(originalIndex);
        item.text = getText(i);
        items.emplace_back(std::move(item));
    }

    return PDFDocumentTextFlow(std::move(items));
}
|
|
|
|
void PDFDocumentTextFlowEditor::createPageMapping()
|
|
{
|
|
m_pageIndicesMapping.clear();
|
|
m_pageIndicesMapping.reserve(m_editedTextFlow.size());
|
|
|
|
for (size_t i = 0; i < m_editedTextFlow.size(); ++i)
|
|
{
|
|
m_pageIndicesMapping.emplace_back(m_editedTextFlow[i].pageIndex, i);
|
|
}
|
|
|
|
std::sort(m_pageIndicesMapping.begin(), m_pageIndicesMapping.end());
|
|
}
|
|
|
|
void PDFDocumentTextFlowEditor::createEditedFromOriginalTextFlow()
|
|
{
|
|
const size_t count = m_originalTextFlow.getSize();
|
|
m_editedTextFlow.reserve(count);
|
|
|
|
for (size_t i = 0; i < count; ++i)
|
|
{
|
|
const PDFDocumentTextFlow::Item* originalItem = getOriginalItem(i);
|
|
|
|
if (originalItem->text.trimmed().isEmpty())
|
|
{
|
|
continue;
|
|
}
|
|
|
|
EditedItem editedItem;
|
|
static_cast<PDFDocumentTextFlow::Item&>(editedItem) = *originalItem;
|
|
editedItem.originalIndex = i;
|
|
editedItem.editedItemFlags = originalItem->isText() ? None : Removed;
|
|
m_editedTextFlow.emplace_back(std::move(editedItem));
|
|
}
|
|
|
|
createPageMapping();
|
|
}
|
|
|
|
void PDFDocumentTextFlowEditor::updateModifiedFlag(size_t index)
|
|
{
|
|
const bool isModified = getText(index) != getOriginalItem(index)->text;
|
|
|
|
EditedItem* item = getEditedItem(index);
|
|
item->editedItemFlags.setFlag(Modified, isModified);
|
|
}
|
|
|
|
/// Splits the text flow into text flows per page. Only items, whose flags
/// have at least one bit in common with the mask, are taken into account.
/// \param mask Mask of item flags
/// \returns Map from page index to text flow consisting of matching items of that page
std::map<PDFInteger, PDFDocumentTextFlow> PDFDocumentTextFlow::split(Flags mask) const
{
    std::map<PDFInteger, PDFDocumentTextFlow> flowsByPage;

    for (const Item& currentItem : m_items)
    {
        if (currentItem.flags & mask)
        {
            // Text flow for the page is created on first access
            PDFDocumentTextFlow& pageFlow = flowsByPage[currentItem.pageIndex];
            pageFlow.addItem(currentItem);
        }
    }

    return flowsByPage;
}
|
|
|
|
void PDFDocumentTextFlow::append(const PDFDocumentTextFlow& textFlow)
|
|
{
|
|
m_items.insert(m_items.end(), textFlow.m_items.cbegin(), textFlow.m_items.cend());
|
|
}
|
|
|
|
/// Returns text of the whole text flow: trimmed texts of all items
/// joined by a single space.
QString PDFDocumentTextFlow::getText() const
{
    QStringList trimmedTexts;

    for (const auto& currentItem : m_items)
    {
        trimmedTexts.append(currentItem.text.trimmed());
    }

    return trimmedTexts.join(" ");
}
|
|
|
|
} // namespace pdf
|