diff --git a/PdfForQtLib/sources/pdfdocumenttextflow.cpp b/PdfForQtLib/sources/pdfdocumenttextflow.cpp index e411dac..0527a01 100644 --- a/PdfForQtLib/sources/pdfdocumenttextflow.cpp +++ b/PdfForQtLib/sources/pdfdocumenttextflow.cpp @@ -60,13 +60,15 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume case Algorithm::Layout: { PDFFontCache fontCache(DEFAULT_FONT_CACHE_LIMIT, DEFAULT_REALIZED_FONT_CACHE_LIMIT); - fontCache.setCacheShrinkEnabled(nullptr, false); std::map items; PDFCMSGeneric cms; PDFMeshQualitySettings mqs; PDFOptionalContentActivity oca(document, OCUsage::Export, nullptr); + pdf::PDFModifiedDocument md(const_cast<PDFDocument*>(document), &oca); /* 'document' is const here; NOTE(review): confirm the font cache never mutates it through 'md' */ + fontCache.setDocument(md); /* document is attached before cache shrinking is disabled below */ + fontCache.setCacheShrinkEnabled(nullptr, false); auto generateTextLayout = [this, &items, &mutex, &fontCache, &cms, &mqs, &oca, document, catalog](PDFInteger pageIndex) { @@ -85,7 +87,7 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume PDFTextFlows textFlows = PDFTextFlow::createTextFlows(textLayout, PDFTextFlow::FlowFlags(PDFTextFlow::SeparateBlocks) | PDFTextFlow::RemoveSoftHyphen, pageIndex); PDFDocumentTextFlow::Items flowItems; - flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::PageStart, pageIndex, QString() }); + flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::PageStart, pageIndex, PDFTranslationContext::tr("Page %1").arg(pageIndex + 1) }); /* pageIndex is zero-based; the label shown is one-based */ for (const PDFTextFlow& textFlow : textFlows) { flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::Text, pageIndex, textFlow.getText() }); diff --git a/PdfForQtLib/sources/pdfdocumenttextflow.h b/PdfForQtLib/sources/pdfdocumenttextflow.h index 332c787..15220d0 100644 --- a/PdfForQtLib/sources/pdfdocumenttextflow.h +++ b/PdfForQtLib/sources/pdfdocumenttextflow.h @@ -72,7 +72,7 @@ private: }; /// This factory creates text flow for whole document -class PDFDocumentTextFlowFactory +class PDFFORQTLIBSHARED_EXPORT 
PDFDocumentTextFlowFactory { public: explicit PDFDocumentTextFlowFactory() = default; diff --git a/PdfTool/PdfTool.pro b/PdfTool/PdfTool.pro index 3c090ea..7213667 100644 --- a/PdfTool/PdfTool.pro +++ b/PdfTool/PdfTool.pro @@ -43,6 +43,7 @@ SOURCES += \ pdfoutputformatter.cpp \ pdftoolabstractapplication.cpp \ pdftoolattachments.cpp \ + pdftoolfetchtext.cpp \ pdftoolinfo.cpp \ pdftoolinfojavascript.cpp \ pdftoolinfometadata.cpp \ @@ -65,6 +66,7 @@ HEADERS += \ pdfoutputformatter.h \ pdftoolabstractapplication.h \ pdftoolattachments.h \ + pdftoolfetchtext.h \ pdftoolinfo.h \ pdftoolinfojavascript.h \ pdftoolinfometadata.h \ diff --git a/PdfTool/main.cpp b/PdfTool/main.cpp index f3ae672..2b7c499 100644 --- a/PdfTool/main.cpp +++ b/PdfTool/main.cpp @@ -27,6 +27,8 @@ int main(int argc, char *argv[]) QCoreApplication::setApplicationName("PdfTool"); QCoreApplication::setApplicationVersion("1.0.0"); + QResource::registerResource(QString("cmaps.qrb")); /* NOTE(review): the bool result is ignored, so a missing resource file goes unreported */ + QStringList arguments = QCoreApplication::arguments(); QCommandLineParser parser; diff --git a/PdfTool/pdfoutputformatter.cpp b/PdfTool/pdfoutputformatter.cpp index f18ab22..ede25f0 100644 --- a/PdfTool/pdfoutputformatter.cpp +++ b/PdfTool/pdfoutputformatter.cpp @@ -662,6 +662,13 @@ void PDFConsole::writeText(QString text, QString codecName) void PDFConsole::writeError(QString text, QString codecName) { + if (text.isEmpty()) /* nothing to report: do not emit a bare newline */ + { + return; + } + + text += "\n"; /* every error message ends with exactly one line break on all platforms */ + #ifdef Q_OS_WIN HANDLE outputHandle = GetStdHandle(STD_ERROR_HANDLE); if (!WriteConsoleW(outputHandle, text.utf16(), text.size(), nullptr, nullptr)) @@ -675,7 +682,9 @@ void PDFConsole::writeError(QString text, QString codecName) } } #else - QTextStream(stdout) << text; + QTextStream stream(stderr); /* errors go to the error stream, matching STD_ERROR_HANDLE in the Windows branch */ + stream << text; + stream.flush(); /* 'text' already ends with '\n'; flush without appending a second line break */ #endif } diff --git a/PdfTool/pdftoolabstractapplication.cpp b/PdfTool/pdftoolabstractapplication.cpp index eb00bb0..32999f1 100644 --- a/PdfTool/pdftoolabstractapplication.cpp +++ b/PdfTool/pdftoolabstractapplication.cpp @@ -201,6 
+201,11 @@ void PDFToolAbstractApplication::initializeCommandLineParser(QCommandLineParser* parser->addOption(QCommandLineOption("page-last", "Last page of page range.", "number")); parser->addOption(QCommandLineOption("page-select", "Choose arbitrary pages, in form '1,5,3,7-11,-29,43-.'.", "number")); } + + if (optionFlags.testFlag(TextAnalysis)) /* option is registered only for tools that request TextAnalysis */ + { + parser->addOption(QCommandLineOption("text-analysis-alg", "Text analysis algorithm (auto - select automatically, layout - perform automatic layout algorithm, content - simple content stream reading order, structure - use tagged document structure)", "algorithm", "auto")); + } } PDFToolOptions PDFToolAbstractApplication::getOptions(QCommandLineParser* parser) const @@ -308,6 +313,31 @@ PDFToolOptions PDFToolAbstractApplication::getOptions(QCommandLineParser* parser options.pageSelectorSelection = parser->isSet("page-select") ? parser->value("page-select") : QString(); } + if (optionFlags.testFlag(TextAnalysis)) + { + QString algorithm = parser->value("text-analysis-alg"); /* the option declares "auto" as its default value */ + if (algorithm == "auto") + { + options.textAnalysisAlgorithm = pdf::PDFDocumentTextFlowFactory::Algorithm::Auto; + } + else if (algorithm == "layout") + { + options.textAnalysisAlgorithm = pdf::PDFDocumentTextFlowFactory::Algorithm::Layout; + } + else if (algorithm == "content") + { + options.textAnalysisAlgorithm = pdf::PDFDocumentTextFlowFactory::Algorithm::Content; + } + else if (algorithm == "structure") + { + options.textAnalysisAlgorithm = pdf::PDFDocumentTextFlowFactory::Algorithm::Structure; + } + else if (!algorithm.isEmpty()) /* unknown name: report it and keep the member's initial value (Auto) */ + { + PDFConsole::writeError(PDFToolTranslationContext::tr("Unknown text layout analysis algorithm '%1'. 
Defaulting to automatic algorithm selection.").arg(algorithm), options.outputCodec); + } + } + return options; } diff --git a/PdfTool/pdftoolabstractapplication.h b/PdfTool/pdftoolabstractapplication.h index 3a471ac..8d92375 100644 --- a/PdfTool/pdftoolabstractapplication.h +++ b/PdfTool/pdftoolabstractapplication.h @@ -20,6 +20,7 @@ #include "pdfoutputformatter.h" #include "pdfdocument.h" +#include "pdfdocumenttextflow.h" #include #include @@ -80,6 +81,9 @@ struct PDFToolOptions QString pageSelectorLastPage; QString pageSelectorSelection; + // For option 'TextAnalysis' + pdf::PDFDocumentTextFlowFactory::Algorithm textAnalysisAlgorithm = pdf::PDFDocumentTextFlowFactory::Algorithm::Auto; + /// Returns page range. If page range is invalid, then \p errorMessage is empty. /// \param pageCount Page count /// \param[out] errorMessage Error message @@ -121,6 +125,7 @@ public: DateFormat = 0x0020, ///< Date format ComputeHashes = 0x0040, ///< Compute hashes PageSelector = 0x0080, ///< Select page range (or all pages) + TextAnalysis = 0x0100, ///< Text analysis options }; Q_DECLARE_FLAGS(Options, Option) diff --git a/PdfTool/pdftoolfetchtext.cpp b/PdfTool/pdftoolfetchtext.cpp new file mode 100644 index 0000000..b9e2b67 --- /dev/null +++ b/PdfTool/pdftoolfetchtext.cpp @@ -0,0 +1,81 @@ +// Copyright (C) 2020 Jakub Melka +// +// This file is part of PdfForQt. +// +// PdfForQt is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// PdfForQt is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with PDFForQt. 
If not, see . + +#include "pdftoolfetchtext.h" +#include "pdfdocumenttextflow.h" + +namespace pdftool +{ + +static PDFToolFetchTextApplication s_fetchTextApplication; + +QString PDFToolFetchTextApplication::getStandardString(PDFToolAbstractApplication::StandardString standardString) const +{ + switch (standardString) + { + case Command: + return "fetch-text"; + + case Name: + return PDFToolTranslationContext::tr("Fetch text"); + + case Description: + return PDFToolTranslationContext::tr("Fetch text content from a document."); + + default: + Q_ASSERT(false); + break; + } + + return QString(); +} + +int PDFToolFetchTextApplication::execute(const PDFToolOptions& options) +{ + pdf::PDFDocument document; + QByteArray sourceData; + if (!readDocument(options, document, &sourceData)) + { + return ErrorDocumentReading; + } + + QString parseError; + std::vector pages = options.getPageRange(document.getCatalog()->getPageCount(), parseError, true); + + if (!parseError.isEmpty()) + { + PDFConsole::writeError(parseError, options.outputCodec); + return ErrorInvalidArguments; + } + + pdf::PDFDocumentTextFlowFactory factory; + pdf::PDFDocumentTextFlow documentTextFlow = factory.create(&document, pages, options.textAnalysisAlgorithm); + + for (const pdf::PDFRenderError& error : factory.getErrors()) + { + PDFConsole::writeError(error.message, options.outputCodec); + } + + return ExitSuccess; +} + +PDFToolAbstractApplication::Options PDFToolFetchTextApplication::getOptionsFlags() const +{ + return ConsoleFormat | OpenDocument | TextAnalysis; +} + +} // namespace pdftool diff --git a/PdfTool/pdftoolfetchtext.h b/PdfTool/pdftoolfetchtext.h new file mode 100644 index 0000000..34a2d25 --- /dev/null +++ b/PdfTool/pdftoolfetchtext.h @@ -0,0 +1,36 @@ +// Copyright (C) 2020 Jakub Melka +// +// This file is part of PdfForQt. 
+// +// PdfForQt is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// PdfForQt is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with PDFForQt. If not, see <https://www.gnu.org/licenses/>. + +#ifndef PDFTOOLFETCHTEXT_H +#define PDFTOOLFETCHTEXT_H + +#include "pdftoolabstractapplication.h" + +namespace pdftool +{ + +class PDFToolFetchTextApplication : public PDFToolAbstractApplication /* command-line tool application; overrides the three PDFToolAbstractApplication hooks declared below */ +{ +public: + virtual QString getStandardString(StandardString standardString) const override; + virtual int execute(const PDFToolOptions& options) override; + virtual Options getOptionsFlags() const override; +}; + +} // namespace pdftool + +#endif // PDFTOOLFETCHTEXT_H