Tool for fetching text (basics)

This commit is contained in:
Jakub Melka 2020-10-11 18:21:20 +02:00
parent a656e9857f
commit 08b38ce813
9 changed files with 171 additions and 4 deletions

View File

@ -60,13 +60,15 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume
case Algorithm::Layout:
{
PDFFontCache fontCache(DEFAULT_FONT_CACHE_LIMIT, DEFAULT_REALIZED_FONT_CACHE_LIMIT);
fontCache.setCacheShrinkEnabled(nullptr, false);
std::map<PDFInteger, PDFDocumentTextFlow::Items> items;
PDFCMSGeneric cms;
PDFMeshQualitySettings mqs;
PDFOptionalContentActivity oca(document, OCUsage::Export, nullptr);
pdf::PDFModifiedDocument md(const_cast<PDFDocument*>(document), &oca);
fontCache.setDocument(md);
fontCache.setCacheShrinkEnabled(nullptr, false);
auto generateTextLayout = [this, &items, &mutex, &fontCache, &cms, &mqs, &oca, document, catalog](PDFInteger pageIndex)
{
@ -85,7 +87,7 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume
PDFTextFlows textFlows = PDFTextFlow::createTextFlows(textLayout, PDFTextFlow::FlowFlags(PDFTextFlow::SeparateBlocks) | PDFTextFlow::RemoveSoftHyphen, pageIndex);
PDFDocumentTextFlow::Items flowItems;
flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::PageStart, pageIndex, QString() });
flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::PageStart, pageIndex, PDFTranslationContext::tr("Page %1").arg(pageIndex + 1) });
for (const PDFTextFlow& textFlow : textFlows)
{
flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::Text, pageIndex, textFlow.getText() });

View File

@ -72,7 +72,7 @@ private:
};
/// This factory creates text flow for whole document
class PDFDocumentTextFlowFactory
class PDFFORQTLIBSHARED_EXPORT PDFDocumentTextFlowFactory
{
public:
explicit PDFDocumentTextFlowFactory() = default;

View File

@ -43,6 +43,7 @@ SOURCES += \
pdfoutputformatter.cpp \
pdftoolabstractapplication.cpp \
pdftoolattachments.cpp \
pdftoolfetchtext.cpp \
pdftoolinfo.cpp \
pdftoolinfojavascript.cpp \
pdftoolinfometadata.cpp \
@ -65,6 +66,7 @@ HEADERS += \
pdfoutputformatter.h \
pdftoolabstractapplication.h \
pdftoolattachments.h \
pdftoolfetchtext.h \
pdftoolinfo.h \
pdftoolinfojavascript.h \
pdftoolinfometadata.h \

View File

@ -27,6 +27,8 @@ int main(int argc, char *argv[])
QCoreApplication::setApplicationName("PdfTool");
QCoreApplication::setApplicationVersion("1.0.0");
QResource::registerResource(QString("cmaps.qrb"));
QStringList arguments = QCoreApplication::arguments();
QCommandLineParser parser;

View File

@ -662,6 +662,13 @@ void PDFConsole::writeText(QString text, QString codecName)
void PDFConsole::writeError(QString text, QString codecName)
{
if (text.isEmpty())
{
return;
}
text += "\n";
#ifdef Q_OS_WIN
HANDLE outputHandle = GetStdHandle(STD_ERROR_HANDLE);
if (!WriteConsoleW(outputHandle, text.utf16(), text.size(), nullptr, nullptr))
@ -675,7 +682,9 @@ void PDFConsole::writeError(QString text, QString codecName)
}
}
#else
QTextStream(stdout) << text;
QTextStream stream(stdout);
stream << text;
stream << endl;
#endif
}

View File

@ -201,6 +201,11 @@ void PDFToolAbstractApplication::initializeCommandLineParser(QCommandLineParser*
parser->addOption(QCommandLineOption("page-last", "Last page of page range.", "number"));
parser->addOption(QCommandLineOption("page-select", "Choose arbitrary pages, in form '1,5,3,7-11,-29,43-.'.", "number"));
}
if (optionFlags.testFlag(TextAnalysis))
{
parser->addOption(QCommandLineOption("text-analysis-alg", "Text analysis algorithm (auto - select automatically, layout - perform automatic layout algorithm, content - simple content stream reading order, structure - use tagged document structure", "algorithm", "auto"));
}
}
PDFToolOptions PDFToolAbstractApplication::getOptions(QCommandLineParser* parser) const
@ -308,6 +313,31 @@ PDFToolOptions PDFToolAbstractApplication::getOptions(QCommandLineParser* parser
options.pageSelectorSelection = parser->isSet("page-select") ? parser->value("page-select") : QString();
}
if (optionFlags.testFlag(TextAnalysis))
{
QString algoritm = parser->value("text-analysis-alg");
if (algoritm == "auto")
{
options.textAnalysisAlgorithm = pdf::PDFDocumentTextFlowFactory::Algorithm::Auto;
}
else if (algoritm == "layout")
{
options.textAnalysisAlgorithm = pdf::PDFDocumentTextFlowFactory::Algorithm::Layout;
}
else if (algoritm == "content")
{
options.textAnalysisAlgorithm = pdf::PDFDocumentTextFlowFactory::Algorithm::Content;
}
else if (algoritm == "structure")
{
options.textAnalysisAlgorithm = pdf::PDFDocumentTextFlowFactory::Algorithm::Structure;
}
else if (!algoritm.isEmpty())
{
PDFConsole::writeError(PDFToolTranslationContext::tr("Unknown text layout analysis algorithm '%1'. Defaulting to automatic algorithm selection.").arg(algoritm), options.outputCodec);
}
}
return options;
}

View File

@ -20,6 +20,7 @@
#include "pdfoutputformatter.h"
#include "pdfdocument.h"
#include "pdfdocumenttextflow.h"
#include <QtGlobal>
#include <QString>
@ -80,6 +81,9 @@ struct PDFToolOptions
QString pageSelectorLastPage;
QString pageSelectorSelection;
// For option 'TextAnalysis'
pdf::PDFDocumentTextFlowFactory::Algorithm textAnalysisAlgorithm = pdf::PDFDocumentTextFlowFactory::Algorithm::Auto;
/// Returns page range. If page range is invalid, then \p errorMessage is not empty.
/// \param pageCount Page count
/// \param[out] errorMessage Error message
@ -121,6 +125,7 @@ public:
DateFormat = 0x0020, ///< Date format
ComputeHashes = 0x0040, ///< Compute hashes
PageSelector = 0x0080, ///< Select page range (or all pages)
TextAnalysis = 0x0100, ///< Text analysis options
};
Q_DECLARE_FLAGS(Options, Option)

View File

@ -0,0 +1,81 @@
// Copyright (C) 2020 Jakub Melka
//
// This file is part of PdfForQt.
//
// PdfForQt is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// PdfForQt is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with PdfForQt. If not, see <https://www.gnu.org/licenses/>.
#include "pdftoolfetchtext.h"
#include "pdfdocumenttextflow.h"
namespace pdftool
{
// File-scope instance of the fetch-text tool, constructed during static initialization.
// NOTE(review): presumably the PDFToolAbstractApplication constructor registers the
// instance so that it can be looked up by its command name - confirm in the base class.
static PDFToolFetchTextApplication s_fetchTextApplication;
/// Returns one of the standard strings describing this tool.
/// \param standardString Identifier of the requested string (command, name or description)
/// \returns Command name "fetch-text", translated tool name, or translated description.
///          For any other identifier, an assertion fires (in debug builds) and
///          an empty string is returned.
QString PDFToolFetchTextApplication::getStandardString(PDFToolAbstractApplication::StandardString standardString) const
{
    if (standardString == Command)
    {
        // Command line keyword, intentionally not translated
        return "fetch-text";
    }

    if (standardString == Name)
    {
        return PDFToolTranslationContext::tr("Fetch text");
    }

    if (standardString == Description)
    {
        return PDFToolTranslationContext::tr("Fetch text content from a document.");
    }

    // Unknown identifier - this is a programming error
    Q_ASSERT(false);
    return QString();
}
int PDFToolFetchTextApplication::execute(const PDFToolOptions& options)
{
pdf::PDFDocument document;
QByteArray sourceData;
if (!readDocument(options, document, &sourceData))
{
return ErrorDocumentReading;
}
QString parseError;
std::vector<pdf::PDFInteger> pages = options.getPageRange(document.getCatalog()->getPageCount(), parseError, true);
if (!parseError.isEmpty())
{
PDFConsole::writeError(parseError, options.outputCodec);
return ErrorInvalidArguments;
}
pdf::PDFDocumentTextFlowFactory factory;
pdf::PDFDocumentTextFlow documentTextFlow = factory.create(&document, pages, options.textAnalysisAlgorithm);
for (const pdf::PDFRenderError& error : factory.getErrors())
{
PDFConsole::writeError(error.message, options.outputCodec);
}
return ExitSuccess;
}
/// Returns option groups used by this tool. Page selector options are included,
/// because execute() resolves the processed pages by calling PDFToolOptions::getPageRange(),
/// so the user must be able to specify the page range on the command line.
/// \returns Combination of console format, document opening, page selector
///          and text analysis option flags
PDFToolAbstractApplication::Options PDFToolFetchTextApplication::getOptionsFlags() const
{
    return ConsoleFormat | OpenDocument | PageSelector | TextAnalysis;
}
} // namespace pdftool

View File

@ -0,0 +1,36 @@
// Copyright (C) 2020 Jakub Melka
//
// This file is part of PdfForQt.
//
// PdfForQt is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// PdfForQt is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with PdfForQt. If not, see <https://www.gnu.org/licenses/>.
#ifndef PDFTOOLFETCHTEXT_H
#define PDFTOOLFETCHTEXT_H
#include "pdftoolabstractapplication.h"
namespace pdftool
{
class PDFToolFetchTextApplication : public PDFToolAbstractApplication
{
public:
virtual QString getStandardString(StandardString standardString) const override;
virtual int execute(const PDFToolOptions& options) override;
virtual Options getOptionsFlags() const override;
};
} // namespace pdftool
#endif // PDFTOOLFETCHTEXT_H