mirror of
https://github.com/JakubMelka/PDF4QT.git
synced 2025-06-05 21:59:17 +02:00
Tool for fetching text (basics)
This commit is contained in:
@ -60,13 +60,15 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume
|
|||||||
case Algorithm::Layout:
|
case Algorithm::Layout:
|
||||||
{
|
{
|
||||||
PDFFontCache fontCache(DEFAULT_FONT_CACHE_LIMIT, DEFAULT_REALIZED_FONT_CACHE_LIMIT);
|
PDFFontCache fontCache(DEFAULT_FONT_CACHE_LIMIT, DEFAULT_REALIZED_FONT_CACHE_LIMIT);
|
||||||
fontCache.setCacheShrinkEnabled(nullptr, false);
|
|
||||||
|
|
||||||
std::map<PDFInteger, PDFDocumentTextFlow::Items> items;
|
std::map<PDFInteger, PDFDocumentTextFlow::Items> items;
|
||||||
|
|
||||||
PDFCMSGeneric cms;
|
PDFCMSGeneric cms;
|
||||||
PDFMeshQualitySettings mqs;
|
PDFMeshQualitySettings mqs;
|
||||||
PDFOptionalContentActivity oca(document, OCUsage::Export, nullptr);
|
PDFOptionalContentActivity oca(document, OCUsage::Export, nullptr);
|
||||||
|
pdf::PDFModifiedDocument md(const_cast<PDFDocument*>(document), &oca);
|
||||||
|
fontCache.setDocument(md);
|
||||||
|
fontCache.setCacheShrinkEnabled(nullptr, false);
|
||||||
|
|
||||||
auto generateTextLayout = [this, &items, &mutex, &fontCache, &cms, &mqs, &oca, document, catalog](PDFInteger pageIndex)
|
auto generateTextLayout = [this, &items, &mutex, &fontCache, &cms, &mqs, &oca, document, catalog](PDFInteger pageIndex)
|
||||||
{
|
{
|
||||||
@ -85,7 +87,7 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume
|
|||||||
PDFTextFlows textFlows = PDFTextFlow::createTextFlows(textLayout, PDFTextFlow::FlowFlags(PDFTextFlow::SeparateBlocks) | PDFTextFlow::RemoveSoftHyphen, pageIndex);
|
PDFTextFlows textFlows = PDFTextFlow::createTextFlows(textLayout, PDFTextFlow::FlowFlags(PDFTextFlow::SeparateBlocks) | PDFTextFlow::RemoveSoftHyphen, pageIndex);
|
||||||
|
|
||||||
PDFDocumentTextFlow::Items flowItems;
|
PDFDocumentTextFlow::Items flowItems;
|
||||||
flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::PageStart, pageIndex, QString() });
|
flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::PageStart, pageIndex, PDFTranslationContext::tr("Page %1").arg(pageIndex + 1) });
|
||||||
for (const PDFTextFlow& textFlow : textFlows)
|
for (const PDFTextFlow& textFlow : textFlows)
|
||||||
{
|
{
|
||||||
flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::Text, pageIndex, textFlow.getText() });
|
flowItems.emplace_back(PDFDocumentTextFlow::Item{ PDFDocumentTextFlow::Text, pageIndex, textFlow.getText() });
|
||||||
|
@ -72,7 +72,7 @@ private:
|
|||||||
};
|
};
|
||||||
|
|
||||||
/// This factory creates text flow for whole document
|
/// This factory creates text flow for whole document
|
||||||
class PDFDocumentTextFlowFactory
|
class PDFFORQTLIBSHARED_EXPORT PDFDocumentTextFlowFactory
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
explicit PDFDocumentTextFlowFactory() = default;
|
explicit PDFDocumentTextFlowFactory() = default;
|
||||||
|
@ -43,6 +43,7 @@ SOURCES += \
|
|||||||
pdfoutputformatter.cpp \
|
pdfoutputformatter.cpp \
|
||||||
pdftoolabstractapplication.cpp \
|
pdftoolabstractapplication.cpp \
|
||||||
pdftoolattachments.cpp \
|
pdftoolattachments.cpp \
|
||||||
|
pdftoolfetchtext.cpp \
|
||||||
pdftoolinfo.cpp \
|
pdftoolinfo.cpp \
|
||||||
pdftoolinfojavascript.cpp \
|
pdftoolinfojavascript.cpp \
|
||||||
pdftoolinfometadata.cpp \
|
pdftoolinfometadata.cpp \
|
||||||
@ -65,6 +66,7 @@ HEADERS += \
|
|||||||
pdfoutputformatter.h \
|
pdfoutputformatter.h \
|
||||||
pdftoolabstractapplication.h \
|
pdftoolabstractapplication.h \
|
||||||
pdftoolattachments.h \
|
pdftoolattachments.h \
|
||||||
|
pdftoolfetchtext.h \
|
||||||
pdftoolinfo.h \
|
pdftoolinfo.h \
|
||||||
pdftoolinfojavascript.h \
|
pdftoolinfojavascript.h \
|
||||||
pdftoolinfometadata.h \
|
pdftoolinfometadata.h \
|
||||||
|
@ -27,6 +27,8 @@ int main(int argc, char *argv[])
|
|||||||
QCoreApplication::setApplicationName("PdfTool");
|
QCoreApplication::setApplicationName("PdfTool");
|
||||||
QCoreApplication::setApplicationVersion("1.0.0");
|
QCoreApplication::setApplicationVersion("1.0.0");
|
||||||
|
|
||||||
|
QResource::registerResource(QString("cmaps.qrb"));
|
||||||
|
|
||||||
QStringList arguments = QCoreApplication::arguments();
|
QStringList arguments = QCoreApplication::arguments();
|
||||||
|
|
||||||
QCommandLineParser parser;
|
QCommandLineParser parser;
|
||||||
|
@ -662,6 +662,13 @@ void PDFConsole::writeText(QString text, QString codecName)
|
|||||||
|
|
||||||
void PDFConsole::writeError(QString text, QString codecName)
|
void PDFConsole::writeError(QString text, QString codecName)
|
||||||
{
|
{
|
||||||
|
if (text.isEmpty())
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
text += "\n";
|
||||||
|
|
||||||
#ifdef Q_OS_WIN
|
#ifdef Q_OS_WIN
|
||||||
HANDLE outputHandle = GetStdHandle(STD_ERROR_HANDLE);
|
HANDLE outputHandle = GetStdHandle(STD_ERROR_HANDLE);
|
||||||
if (!WriteConsoleW(outputHandle, text.utf16(), text.size(), nullptr, nullptr))
|
if (!WriteConsoleW(outputHandle, text.utf16(), text.size(), nullptr, nullptr))
|
||||||
@ -675,7 +682,9 @@ void PDFConsole::writeError(QString text, QString codecName)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
QTextStream(stdout) << text;
|
QTextStream stream(stdout);
|
||||||
|
stream << text;
|
||||||
|
stream << endl;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -201,6 +201,11 @@ void PDFToolAbstractApplication::initializeCommandLineParser(QCommandLineParser*
|
|||||||
parser->addOption(QCommandLineOption("page-last", "Last page of page range.", "number"));
|
parser->addOption(QCommandLineOption("page-last", "Last page of page range.", "number"));
|
||||||
parser->addOption(QCommandLineOption("page-select", "Choose arbitrary pages, in form '1,5,3,7-11,-29,43-.'.", "number"));
|
parser->addOption(QCommandLineOption("page-select", "Choose arbitrary pages, in form '1,5,3,7-11,-29,43-.'.", "number"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (optionFlags.testFlag(TextAnalysis))
|
||||||
|
{
|
||||||
|
parser->addOption(QCommandLineOption("text-analysis-alg", "Text analysis algorithm (auto - select automatically, layout - perform automatic layout algorithm, content - simple content stream reading order, structure - use tagged document structure", "algorithm", "auto"));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
PDFToolOptions PDFToolAbstractApplication::getOptions(QCommandLineParser* parser) const
|
PDFToolOptions PDFToolAbstractApplication::getOptions(QCommandLineParser* parser) const
|
||||||
@ -308,6 +313,31 @@ PDFToolOptions PDFToolAbstractApplication::getOptions(QCommandLineParser* parser
|
|||||||
options.pageSelectorSelection = parser->isSet("page-select") ? parser->value("page-select") : QString();
|
options.pageSelectorSelection = parser->isSet("page-select") ? parser->value("page-select") : QString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (optionFlags.testFlag(TextAnalysis))
|
||||||
|
{
|
||||||
|
QString algoritm = parser->value("text-analysis-alg");
|
||||||
|
if (algoritm == "auto")
|
||||||
|
{
|
||||||
|
options.textAnalysisAlgorithm = pdf::PDFDocumentTextFlowFactory::Algorithm::Auto;
|
||||||
|
}
|
||||||
|
else if (algoritm == "layout")
|
||||||
|
{
|
||||||
|
options.textAnalysisAlgorithm = pdf::PDFDocumentTextFlowFactory::Algorithm::Layout;
|
||||||
|
}
|
||||||
|
else if (algoritm == "content")
|
||||||
|
{
|
||||||
|
options.textAnalysisAlgorithm = pdf::PDFDocumentTextFlowFactory::Algorithm::Content;
|
||||||
|
}
|
||||||
|
else if (algoritm == "structure")
|
||||||
|
{
|
||||||
|
options.textAnalysisAlgorithm = pdf::PDFDocumentTextFlowFactory::Algorithm::Structure;
|
||||||
|
}
|
||||||
|
else if (!algoritm.isEmpty())
|
||||||
|
{
|
||||||
|
PDFConsole::writeError(PDFToolTranslationContext::tr("Unknown text layout analysis algorithm '%1'. Defaulting to automatic algorithm selection.").arg(algoritm), options.outputCodec);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return options;
|
return options;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -20,6 +20,7 @@
|
|||||||
|
|
||||||
#include "pdfoutputformatter.h"
|
#include "pdfoutputformatter.h"
|
||||||
#include "pdfdocument.h"
|
#include "pdfdocument.h"
|
||||||
|
#include "pdfdocumenttextflow.h"
|
||||||
|
|
||||||
#include <QtGlobal>
|
#include <QtGlobal>
|
||||||
#include <QString>
|
#include <QString>
|
||||||
@ -80,6 +81,9 @@ struct PDFToolOptions
|
|||||||
QString pageSelectorLastPage;
|
QString pageSelectorLastPage;
|
||||||
QString pageSelectorSelection;
|
QString pageSelectorSelection;
|
||||||
|
|
||||||
|
// For option 'TextAnalysis'
|
||||||
|
pdf::PDFDocumentTextFlowFactory::Algorithm textAnalysisAlgorithm = pdf::PDFDocumentTextFlowFactory::Algorithm::Auto;
|
||||||
|
|
||||||
/// Returns page range. If page range is invalid, then \p errorMessage is set (not empty).
|
/// Returns page range. If page range is invalid, then \p errorMessage is set (not empty).
|
||||||
/// \param pageCount Page count
|
/// \param pageCount Page count
|
||||||
/// \param[out] errorMessage Error message
|
/// \param[out] errorMessage Error message
|
||||||
@ -121,6 +125,7 @@ public:
|
|||||||
DateFormat = 0x0020, ///< Date format
|
DateFormat = 0x0020, ///< Date format
|
||||||
ComputeHashes = 0x0040, ///< Compute hashes
|
ComputeHashes = 0x0040, ///< Compute hashes
|
||||||
PageSelector = 0x0080, ///< Select page range (or all pages)
|
PageSelector = 0x0080, ///< Select page range (or all pages)
|
||||||
|
TextAnalysis = 0x0100, ///< Text analysis options
|
||||||
};
|
};
|
||||||
Q_DECLARE_FLAGS(Options, Option)
|
Q_DECLARE_FLAGS(Options, Option)
|
||||||
|
|
||||||
|
81
PdfTool/pdftoolfetchtext.cpp
Normal file
81
PdfTool/pdftoolfetchtext.cpp
Normal file
@ -0,0 +1,81 @@
|
|||||||
|
// Copyright (C) 2020 Jakub Melka
|
||||||
|
//
|
||||||
|
// This file is part of PdfForQt.
|
||||||
|
//
|
||||||
|
// PdfForQt is free software: you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU Lesser General Public License as published by
|
||||||
|
// the Free Software Foundation, either version 3 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
//
|
||||||
|
// PdfForQt is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU Lesser General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU Lesser General Public License
|
||||||
|
// along with PDFForQt. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
#include "pdftoolfetchtext.h"
|
||||||
|
#include "pdfdocumenttextflow.h"
|
||||||
|
|
||||||
|
namespace pdftool
|
||||||
|
{
|
||||||
|
|
||||||
|
// File-local instance of the fetch-text tool application.
// NOTE(review): presumably constructing it makes the tool known to the
// application framework (e.g. via the base class constructor) - confirm.
static PDFToolFetchTextApplication s_fetchTextApplication;
|
||||||
|
|
||||||
|
/// Returns one of the fixed strings describing this tool.
/// \param standardString Which string is requested (Command, Name or Description)
/// \returns The requested string; for any other value the function asserts
///          and returns an empty string.
QString PDFToolFetchTextApplication::getStandardString(PDFToolAbstractApplication::StandardString standardString) const
{
    if (standardString == Command)
    {
        return "fetch-text";
    }

    if (standardString == Name)
    {
        return PDFToolTranslationContext::tr("Fetch text");
    }

    if (standardString == Description)
    {
        return PDFToolTranslationContext::tr("Fetch text content from a document.");
    }

    // No other value is handled by this tool.
    Q_ASSERT(false);
    return QString();
}
|
||||||
|
|
||||||
|
int PDFToolFetchTextApplication::execute(const PDFToolOptions& options)
|
||||||
|
{
|
||||||
|
pdf::PDFDocument document;
|
||||||
|
QByteArray sourceData;
|
||||||
|
if (!readDocument(options, document, &sourceData))
|
||||||
|
{
|
||||||
|
return ErrorDocumentReading;
|
||||||
|
}
|
||||||
|
|
||||||
|
QString parseError;
|
||||||
|
std::vector<pdf::PDFInteger> pages = options.getPageRange(document.getCatalog()->getPageCount(), parseError, true);
|
||||||
|
|
||||||
|
if (!parseError.isEmpty())
|
||||||
|
{
|
||||||
|
PDFConsole::writeError(parseError, options.outputCodec);
|
||||||
|
return ErrorInvalidArguments;
|
||||||
|
}
|
||||||
|
|
||||||
|
pdf::PDFDocumentTextFlowFactory factory;
|
||||||
|
pdf::PDFDocumentTextFlow documentTextFlow = factory.create(&document, pages, options.textAnalysisAlgorithm);
|
||||||
|
|
||||||
|
for (const pdf::PDFRenderError& error : factory.getErrors())
|
||||||
|
{
|
||||||
|
PDFConsole::writeError(error.message, options.outputCodec);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ExitSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the option groups this tool uses.
/// PageSelector is included because execute() resolves the pages to process
/// via PDFToolOptions::getPageRange(); without this flag the page range
/// options would not be offered for this tool.
/// \returns Combination of ConsoleFormat, OpenDocument, PageSelector and TextAnalysis
PDFToolAbstractApplication::Options PDFToolFetchTextApplication::getOptionsFlags() const
{
    return ConsoleFormat | OpenDocument | PageSelector | TextAnalysis;
}
|
||||||
|
|
||||||
|
} // namespace pdftool
|
36
PdfTool/pdftoolfetchtext.h
Normal file
36
PdfTool/pdftoolfetchtext.h
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
// Copyright (C) 2020 Jakub Melka
|
||||||
|
//
|
||||||
|
// This file is part of PdfForQt.
|
||||||
|
//
|
||||||
|
// PdfForQt is free software: you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU Lesser General Public License as published by
|
||||||
|
// the Free Software Foundation, either version 3 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
//
|
||||||
|
// PdfForQt is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU Lesser General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU Lesser General Public License
|
||||||
|
// along with PDFForQt. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
#ifndef PDFTOOLFETCHTEXT_H
|
||||||
|
#define PDFTOOLFETCHTEXT_H
|
||||||
|
|
||||||
|
#include "pdftoolabstractapplication.h"
|
||||||
|
|
||||||
|
namespace pdftool
|
||||||
|
{
|
||||||
|
|
||||||
|
class PDFToolFetchTextApplication : public PDFToolAbstractApplication
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
virtual QString getStandardString(StandardString standardString) const override;
|
||||||
|
virtual int execute(const PDFToolOptions& options) override;
|
||||||
|
virtual Options getOptionsFlags() const override;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace pdftool
|
||||||
|
|
||||||
|
#endif // PDFTOOLFETCHTEXT_H
|
Reference in New Issue
Block a user