From 16b583f39056c2a49a3fdfcb04a51e74c89bb104 Mon Sep 17 00:00:00 2001 From: Jakub Melka Date: Sun, 18 Oct 2020 12:57:27 +0200 Subject: [PATCH] Finishing of text extracting tool --- PdfForQtLib/sources/pdfdocumenttextflow.cpp | 65 ++++++++++++++++++--- PdfForQtLib/sources/pdfdocumenttextflow.h | 1 - PdfTool/pdftoolabstractapplication.cpp | 24 +++++++- PdfTool/pdftoolabstractapplication.h | 13 ++++- PdfTool/pdftoolfetchtext.cpp | 25 +++++++- 5 files changed, 114 insertions(+), 14 deletions(-) diff --git a/PdfForQtLib/sources/pdfdocumenttextflow.cpp b/PdfForQtLib/sources/pdfdocumenttextflow.cpp index 195254d..8e32c47 100644 --- a/PdfForQtLib/sources/pdfdocumenttextflow.cpp +++ b/PdfForQtLib/sources/pdfdocumenttextflow.cpp @@ -356,7 +356,7 @@ void PDFStructureTreeTextContentProcessor::performOutputCharacter(const PDFTextC { if (!isContentSuppressed()) { - if (!info.character.isNull()) + if (!info.character.isNull() && info.character != QChar(QChar::SoftHyphen)) { m_currentText.push_back(info.character); } @@ -480,6 +480,8 @@ public: virtual void visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference) override; private: + void markHasContent(); + PDFDocumentTextFlow::Items* m_items; const PDFStructureTreeTextExtractor* m_extractor; std::vector m_hasContentStack; @@ -492,6 +494,14 @@ void PDFStructureTreeTextFlowCollector::visitStructureTree(const PDFStructureTre m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureItemEnd, -1, QString()}); } +void PDFStructureTreeTextFlowCollector::markHasContent() +{ + for (size_t i = 0; i < m_hasContentStack.size(); ++i) + { + m_hasContentStack[i] = true; + } +} + void PDFStructureTreeTextFlowCollector::visitStructureElement(const PDFStructureElement* structureElement) { size_t index = m_items->size(); @@ -500,12 +510,52 @@ void PDFStructureTreeTextFlowCollector::visitStructure // Mark stack so we can delete unused items 
m_hasContentStack.push_back(false); + QString title = structureElement->getText(PDFStructureElement::Title); + QString language = structureElement->getText(PDFStructureElement::Language); + QString alternativeDescription = structureElement->getText(PDFStructureElement::AlternativeDescription); + QString expandedForm = structureElement->getText(PDFStructureElement::ExpandedForm); + QString actualText = structureElement->getText(PDFStructureElement::ActualText); + QString phoneme = structureElement->getText(PDFStructureElement::Phoneme); + + if (!title.isEmpty()) + { + markHasContent(); + m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureTitle, -1, title }); + } + + if (!language.isEmpty()) + { + markHasContent(); + m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureLanguage, -1, language }); + } + + if (!alternativeDescription.isEmpty()) + { + markHasContent(); + m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureAlternativeDescription, -1, alternativeDescription }); + } + + if (!expandedForm.isEmpty()) + { + markHasContent(); + m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureExpandedForm, -1, expandedForm }); + } + + if (!actualText.isEmpty()) + { + markHasContent(); + m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureActualText, -1, actualText }); + } + + if (!phoneme.isEmpty()) + { + markHasContent(); + m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructurePhoneme, -1, phoneme }); + } + for (const QString& string : m_extractor->getText(structureElement)) { - for (size_t i = 0; i < m_hasContentStack.size(); ++i) - { - m_hasContentStack[i] = true; - } + markHasContent(); m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::Text, -1, string}); } @@ -539,7 +589,7 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume PDFStructureTree structureTree; const PDFCatalog* catalog = 
document->getCatalog(); - if (algorithm == Algorithm::Auto || algorithm == Algorithm::Structure) + if (algorithm != Algorithm::Layout) { structureTree = PDFStructureTree::parse(&document->getStorage(), catalog->getStructureTreeRoot()); } @@ -559,8 +609,6 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume Q_ASSERT(algorithm != Algorithm::Auto); - QMutex mutex; - // Perform algorithm to retrieve document text switch (algorithm) { @@ -570,6 +618,7 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume std::map items; + QMutex mutex; PDFCMSGeneric cms; PDFMeshQualitySettings mqs; PDFOptionalContentActivity oca(document, OCUsage::Export, nullptr); diff --git a/PdfForQtLib/sources/pdfdocumenttextflow.h b/PdfForQtLib/sources/pdfdocumenttextflow.h index 15220d0..649508e 100644 --- a/PdfForQtLib/sources/pdfdocumenttextflow.h +++ b/PdfForQtLib/sources/pdfdocumenttextflow.h @@ -46,7 +46,6 @@ public: StructurePhoneme = 0x0100, ///< Structure tree item phoneme StructureItemStart = 0x0200, ///< Start of structure tree item StructureItemEnd = 0x0400, ///< End of structure tree item - StructureEmpty = 0x0800, ///< Structure tree item doesn't contain any text }; Q_DECLARE_FLAGS(Flags, Flag) diff --git a/PdfTool/pdftoolabstractapplication.cpp b/PdfTool/pdftoolabstractapplication.cpp index 32999f1..7937831 100644 --- a/PdfTool/pdftoolabstractapplication.cpp +++ b/PdfTool/pdftoolabstractapplication.cpp @@ -204,7 +204,18 @@ void PDFToolAbstractApplication::initializeCommandLineParser(QCommandLineParser* if (optionFlags.testFlag(TextAnalysis)) { - parser->addOption(QCommandLineOption("text-analysis-alg", "Text analysis algorithm (auto - select automatically, layout - perform automatic layout algorithm, content - simple content stream reading order, structure - use tagged document structure", "algorithm", "auto")); + parser->addOption(QCommandLineOption("text-analysis-alg", "Text analysis algorithm (auto - select 
automatically, layout - perform automatic layout algorithm, content - simple content stream reading order, structure - use tagged document structure).", "algorithm", "auto")); + } + + if (optionFlags.testFlag(TextShow)) + { + parser->addOption(QCommandLineOption("text-show-page-numbers", "Show page numbers in extracted text.")); + parser->addOption(QCommandLineOption("text-show-struct-title", "Show title extracted from structure tree.")); + parser->addOption(QCommandLineOption("text-show-struct-lang", "Show language extracted from structure tree.")); + parser->addOption(QCommandLineOption("text-show-struct-alt-desc", "Show alternative description extracted from structure tree.")); + parser->addOption(QCommandLineOption("text-show-struct-expanded-form", "Show expanded form extracted from structure tree.")); + parser->addOption(QCommandLineOption("text-show-struct-act-text", "Show actual text extracted from structure tree.")); + parser->addOption(QCommandLineOption("text-show-phoneme", "Show phoneme extracted from structure tree.")); } } @@ -338,6 +349,17 @@ PDFToolOptions PDFToolAbstractApplication::getOptions(QCommandLineParser* parser } } + if (optionFlags.testFlag(TextShow)) + { + options.textShowPageNumbers = parser->isSet("text-show-page-numbers"); + options.textShowStructTitles = parser->isSet("text-show-struct-title"); + options.textShowStructLanguage = parser->isSet("text-show-struct-lang"); + options.textShowStructAlternativeDescription = parser->isSet("text-show-struct-alt-desc"); + options.textShowStructExpandedForm = parser->isSet("text-show-struct-expanded-form"); + options.textShowStructActualText = parser->isSet("text-show-struct-act-text"); + options.textShowStructPhoneme = parser->isSet("text-show-phoneme"); + } + return options; } diff --git a/PdfTool/pdftoolabstractapplication.h b/PdfTool/pdftoolabstractapplication.h index 3cd88f2..49b8f89 100644 --- a/PdfTool/pdftoolabstractapplication.h +++ b/PdfTool/pdftoolabstractapplication.h @@ -84,6 +84,15 
@@ struct PDFToolOptions // For option 'TextAnalysis' pdf::PDFDocumentTextFlowFactory::Algorithm textAnalysisAlgorithm = pdf::PDFDocumentTextFlowFactory::Algorithm::Auto; + // For option 'TextShow' + bool textShowPageNumbers = false; + bool textShowStructTitles = false; + bool textShowStructLanguage = false; + bool textShowStructAlternativeDescription = false; + bool textShowStructExpandedForm = false; + bool textShowStructActualText = false; + bool textShowStructPhoneme = false; + /// Returns page range. If page range is invalid, then \p errorMessage is empty. /// \param pageCount Page count /// \param[out] errorMessage Error message @@ -105,7 +114,8 @@ public: ErrorNoDocumentSpecified, ErrorDocumentReading, ErrorInvalidArguments, - ErrorFailedWriteToFile + ErrorFailedWriteToFile, + ErrorPermissions }; enum StandardString @@ -126,6 +136,7 @@ public: ComputeHashes = 0x0040, ///< Compute hashes PageSelector = 0x0080, ///< Select page range (or all pages) TextAnalysis = 0x0100, ///< Text analysis options + TextShow = 0x0200, ///< Text extract and show options }; Q_DECLARE_FLAGS(Options, Option) diff --git a/PdfTool/pdftoolfetchtext.cpp b/PdfTool/pdftoolfetchtext.cpp index 8565eae..31d5536 100644 --- a/PdfTool/pdftoolfetchtext.cpp +++ b/PdfTool/pdftoolfetchtext.cpp @@ -34,7 +34,7 @@ QString PDFToolFetchTextApplication::getStandardString(PDFToolAbstractApplicatio return PDFToolTranslationContext::tr("Fetch text"); case Description: - return PDFToolTranslationContext::tr("Fetch text content from a document."); + return PDFToolTranslationContext::tr("Fetch text content from document."); default: Q_ASSERT(false); @@ -53,6 +53,12 @@ int PDFToolFetchTextApplication::execute(const PDFToolOptions& options) return ErrorDocumentReading; } + if (!document.getStorage().getSecurityHandler()->isAllowed(pdf::PDFSecurityHandler::Permission::CopyContent)) + { + PDFConsole::writeError(PDFToolTranslationContext::tr("Document doesn't allow to copy content."), options.outputCodec); + 
return ErrorPermissions; + } + QString parseError; std::vector pages = options.getPageRange(document.getCatalog()->getPageCount(), parseError, true); @@ -78,7 +84,20 @@ int PDFToolFetchTextApplication::execute(const PDFToolOptions& options) if (!item.text.isEmpty()) { - formatter.writeText("text", item.text); + bool showText = (item.flags.testFlag(pdf::PDFDocumentTextFlow::Text)) || + (item.flags.testFlag(pdf::PDFDocumentTextFlow::PageStart) && options.textShowPageNumbers) || + (item.flags.testFlag(pdf::PDFDocumentTextFlow::PageEnd) && options.textShowPageNumbers) || + (item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureTitle) && options.textShowStructTitles) || + (item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureLanguage) && options.textShowStructLanguage) || + (item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureAlternativeDescription) && options.textShowStructAlternativeDescription) || + (item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureExpandedForm) && options.textShowStructExpandedForm) || + (item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureActualText) && options.textShowStructActualText) || + (item.flags.testFlag(pdf::PDFDocumentTextFlow::StructurePhoneme) && options.textShowStructPhoneme); + + if (showText) + { + formatter.writeText("text", item.text); + } } if (item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureItemEnd)) @@ -106,7 +125,7 @@ int PDFToolFetchTextApplication::execute(const PDFToolOptions& options) PDFToolAbstractApplication::Options PDFToolFetchTextApplication::getOptionsFlags() const { - return ConsoleFormat | OpenDocument | PageSelector | TextAnalysis; + return ConsoleFormat | OpenDocument | PageSelector | TextAnalysis | TextShow; } } // namespace pdftool