mirror of https://github.com/JakubMelka/PDF4QT.git
Tool for fetching text (basics)
This commit is contained in:
parent
a656e9857f
commit
08b38ce813
|
@ -60,13 +60,15 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume
|
|||
case Algorithm::Layout:
|
||||
{
|
||||
PDFFontCache fontCache(DEFAULT_FONT_CACHE_LIMIT, DEFAULT_REALIZED_FONT_CACHE_LIMIT);
|
||||
fontCache.setCacheShrinkEnabled(nullptr, false);
|
||||
|
||||
std::map<PDFInteger, PDFDocumentTextFlow::Items> items;
|
||||
|
||||
PDFCMSGeneric cms;
|
||||
PDFMeshQualitySettings mqs;
|
||||
PDFOptionalContentActivity oca(document, OCUsage::Export, nullptr);
|
||||
pdf::PDFModifiedDocument md(const_cast<PDFDocument*>(document), &oca);
|
||||
fontCache.setDocument(md);
|
||||
fontCache.setCacheShrinkEnabled(nullptr, false);
|
||||
|
||||
auto generateTextLayout = [this, &items, &mutex, &fontCache, &cms, &mqs, &oca, document, catalog](PDFInteger pageIndex)
|
||||
{
|
||||
|
@ -85,7 +87,7 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume
|
|||
PDFTextFlows textFlows = PDFTextFlow::createTextFlows(textLayout, PDFTextFlow::FlowFlags(PDFTextFlow::SeparateBlocks) | PDFTextFlow::RemoveSoftHyphen, pageIndex);
|
||||
|
||||
PDFDocumentTextFlow::Items flowItems;
|
||||
flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::PageStart, pageIndex, QString() });
|
||||
flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::PageStart, pageIndex, PDFTranslationContext::tr("Page %1").arg(pageIndex + 1) });
|
||||
for (const PDFTextFlow& textFlow : textFlows)
|
||||
{
|
||||
flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::Text, pageIndex, textFlow.getText() });
|
||||
|
|
|
@ -72,7 +72,7 @@ private:
|
|||
};
|
||||
|
||||
/// This factory creates a text flow for the whole document.
|
||||
class PDFDocumentTextFlowFactory
|
||||
class PDFFORQTLIBSHARED_EXPORT PDFDocumentTextFlowFactory
|
||||
{
|
||||
public:
|
||||
explicit PDFDocumentTextFlowFactory() = default;
|
||||
|
|
|
@ -43,6 +43,7 @@ SOURCES += \
|
|||
pdfoutputformatter.cpp \
|
||||
pdftoolabstractapplication.cpp \
|
||||
pdftoolattachments.cpp \
|
||||
pdftoolfetchtext.cpp \
|
||||
pdftoolinfo.cpp \
|
||||
pdftoolinfojavascript.cpp \
|
||||
pdftoolinfometadata.cpp \
|
||||
|
@ -65,6 +66,7 @@ HEADERS += \
|
|||
pdfoutputformatter.h \
|
||||
pdftoolabstractapplication.h \
|
||||
pdftoolattachments.h \
|
||||
pdftoolfetchtext.h \
|
||||
pdftoolinfo.h \
|
||||
pdftoolinfojavascript.h \
|
||||
pdftoolinfometadata.h \
|
||||
|
|
|
@ -27,6 +27,8 @@ int main(int argc, char *argv[])
|
|||
QCoreApplication::setApplicationName("PdfTool");
|
||||
QCoreApplication::setApplicationVersion("1.0.0");
|
||||
|
||||
QResource::registerResource(QString("cmaps.qrb"));
|
||||
|
||||
QStringList arguments = QCoreApplication::arguments();
|
||||
|
||||
QCommandLineParser parser;
|
||||
|
|
|
@ -662,6 +662,13 @@ void PDFConsole::writeText(QString text, QString codecName)
|
|||
|
||||
void PDFConsole::writeError(QString text, QString codecName)
|
||||
{
|
||||
if (text.isEmpty())
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
text += "\n";
|
||||
|
||||
#ifdef Q_OS_WIN
|
||||
HANDLE outputHandle = GetStdHandle(STD_ERROR_HANDLE);
|
||||
if (!WriteConsoleW(outputHandle, text.utf16(), text.size(), nullptr, nullptr))
|
||||
|
@ -675,7 +682,9 @@ void PDFConsole::writeError(QString text, QString codecName)
|
|||
}
|
||||
}
|
||||
#else
|
||||
QTextStream(stdout) << text;
|
||||
QTextStream stream(stdout);
|
||||
stream << text;
|
||||
stream << endl;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
@ -201,6 +201,11 @@ void PDFToolAbstractApplication::initializeCommandLineParser(QCommandLineParser*
|
|||
parser->addOption(QCommandLineOption("page-last", "Last page of page range.", "number"));
|
||||
parser->addOption(QCommandLineOption("page-select", "Choose arbitrary pages, in form '1,5,3,7-11,-29,43-.'.", "number"));
|
||||
}
|
||||
|
||||
if (optionFlags.testFlag(TextAnalysis))
|
||||
{
|
||||
parser->addOption(QCommandLineOption("text-analysis-alg", "Text analysis algorithm (auto - select automatically, layout - perform automatic layout algorithm, content - simple content stream reading order, structure - use tagged document structure", "algorithm", "auto"));
|
||||
}
|
||||
}
|
||||
|
||||
PDFToolOptions PDFToolAbstractApplication::getOptions(QCommandLineParser* parser) const
|
||||
|
@ -308,6 +313,31 @@ PDFToolOptions PDFToolAbstractApplication::getOptions(QCommandLineParser* parser
|
|||
options.pageSelectorSelection = parser->isSet("page-select") ? parser->value("page-select") : QString();
|
||||
}
|
||||
|
||||
if (optionFlags.testFlag(TextAnalysis))
|
||||
{
|
||||
QString algoritm = parser->value("text-analysis-alg");
|
||||
if (algoritm == "auto")
|
||||
{
|
||||
options.textAnalysisAlgorithm = pdf::PDFDocumentTextFlowFactory::Algorithm::Auto;
|
||||
}
|
||||
else if (algoritm == "layout")
|
||||
{
|
||||
options.textAnalysisAlgorithm = pdf::PDFDocumentTextFlowFactory::Algorithm::Layout;
|
||||
}
|
||||
else if (algoritm == "content")
|
||||
{
|
||||
options.textAnalysisAlgorithm = pdf::PDFDocumentTextFlowFactory::Algorithm::Content;
|
||||
}
|
||||
else if (algoritm == "structure")
|
||||
{
|
||||
options.textAnalysisAlgorithm = pdf::PDFDocumentTextFlowFactory::Algorithm::Structure;
|
||||
}
|
||||
else if (!algoritm.isEmpty())
|
||||
{
|
||||
PDFConsole::writeError(PDFToolTranslationContext::tr("Unknown text layout analysis algorithm '%1'. Defaulting to automatic algorithm selection.").arg(algoritm), options.outputCodec);
|
||||
}
|
||||
}
|
||||
|
||||
return options;
|
||||
}
|
||||
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
|
||||
#include "pdfoutputformatter.h"
|
||||
#include "pdfdocument.h"
|
||||
#include "pdfdocumenttextflow.h"
|
||||
|
||||
#include <QtGlobal>
|
||||
#include <QString>
|
||||
|
@ -80,6 +81,9 @@ struct PDFToolOptions
|
|||
QString pageSelectorLastPage;
|
||||
QString pageSelectorSelection;
|
||||
|
||||
// For option 'TextAnalysis'
|
||||
pdf::PDFDocumentTextFlowFactory::Algorithm textAnalysisAlgorithm = pdf::PDFDocumentTextFlowFactory::Algorithm::Auto;
|
||||
|
||||
/// Returns page range. If page range is invalid, then \p errorMessage is not empty.
|
||||
/// \param pageCount Page count
|
||||
/// \param[out] errorMessage Error message
|
||||
|
@ -121,6 +125,7 @@ public:
|
|||
DateFormat = 0x0020, ///< Date format
|
||||
ComputeHashes = 0x0040, ///< Compute hashes
|
||||
PageSelector = 0x0080, ///< Select page range (or all pages)
|
||||
TextAnalysis = 0x0100, ///< Text analysis options
|
||||
};
|
||||
Q_DECLARE_FLAGS(Options, Option)
|
||||
|
||||
|
|
|
@ -0,0 +1,81 @@
|
|||
// Copyright (C) 2020 Jakub Melka
|
||||
//
|
||||
// This file is part of PdfForQt.
|
||||
//
|
||||
// PdfForQt is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Lesser General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// PdfForQt is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Lesser General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Lesser General Public License
|
||||
// along with PDFForQt. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
#include "pdftoolfetchtext.h"
|
||||
#include "pdfdocumenttextflow.h"
|
||||
|
||||
namespace pdftool
|
||||
{
|
||||
|
||||
// File-scope instance of the fetch-text tool, constructed during static initialization.
// NOTE(review): presumably the PDFToolAbstractApplication constructor registers the
// instance with the tool dispatcher - confirm against the base class.
static PDFToolFetchTextApplication s_fetchTextApplication;
|
||||
|
||||
/// Returns one of the fixed strings identifying this tool: the command keyword,
/// the translated display name or the translated description.
/// \param standardString Selects which of the standard strings is requested
/// \returns Requested string. For any other selector Q_ASSERT is tripped and
///          a default-constructed QString is returned.
QString PDFToolFetchTextApplication::getStandardString(PDFToolAbstractApplication::StandardString standardString) const
{
    if (standardString == Command)
    {
        // Command keyword is intentionally not translated
        return "fetch-text";
    }

    if (standardString == Name)
    {
        return PDFToolTranslationContext::tr("Fetch text");
    }

    if (standardString == Description)
    {
        return PDFToolTranslationContext::tr("Fetch text content from a document.");
    }

    // No other selector is handled by this tool
    Q_ASSERT(false);
    return QString();
}
|
||||
|
||||
int PDFToolFetchTextApplication::execute(const PDFToolOptions& options)
|
||||
{
|
||||
pdf::PDFDocument document;
|
||||
QByteArray sourceData;
|
||||
if (!readDocument(options, document, &sourceData))
|
||||
{
|
||||
return ErrorDocumentReading;
|
||||
}
|
||||
|
||||
QString parseError;
|
||||
std::vector<pdf::PDFInteger> pages = options.getPageRange(document.getCatalog()->getPageCount(), parseError, true);
|
||||
|
||||
if (!parseError.isEmpty())
|
||||
{
|
||||
PDFConsole::writeError(parseError, options.outputCodec);
|
||||
return ErrorInvalidArguments;
|
||||
}
|
||||
|
||||
pdf::PDFDocumentTextFlowFactory factory;
|
||||
pdf::PDFDocumentTextFlow documentTextFlow = factory.create(&document, pages, options.textAnalysisAlgorithm);
|
||||
|
||||
for (const pdf::PDFRenderError& error : factory.getErrors())
|
||||
{
|
||||
PDFConsole::writeError(error.message, options.outputCodec);
|
||||
}
|
||||
|
||||
return ExitSuccess;
|
||||
}
|
||||
|
||||
/// Returns the option groups accepted by the 'fetch-text' command.
/// PageSelector is included because execute() resolves the processed pages
/// via PDFToolOptions::getPageRange; without this flag the page range options
/// would not be offered for this tool.
/// \returns Combination of option flags used by this tool
PDFToolAbstractApplication::Options PDFToolFetchTextApplication::getOptionsFlags() const
{
    return ConsoleFormat | OpenDocument | PageSelector | TextAnalysis;
}
|
||||
|
||||
} // namespace pdftool
|
|
@ -0,0 +1,36 @@
|
|||
// Copyright (C) 2020 Jakub Melka
|
||||
//
|
||||
// This file is part of PdfForQt.
|
||||
//
|
||||
// PdfForQt is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Lesser General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// PdfForQt is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Lesser General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Lesser General Public License
|
||||
// along with PDFForQt. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
#ifndef PDFTOOLFETCHTEXT_H
|
||||
#define PDFTOOLFETCHTEXT_H
|
||||
|
||||
#include "pdftoolabstractapplication.h"
|
||||
|
||||
namespace pdftool
|
||||
{
|
||||
|
||||
class PDFToolFetchTextApplication : public PDFToolAbstractApplication
|
||||
{
|
||||
public:
|
||||
virtual QString getStandardString(StandardString standardString) const override;
|
||||
virtual int execute(const PDFToolOptions& options) override;
|
||||
virtual Options getOptionsFlags() const override;
|
||||
};
|
||||
|
||||
} // namespace pdftool
|
||||
|
||||
#endif // PDFTOOLFETCHTEXT_H
|
Loading…
Reference in New Issue