Finishing of text extracting tool

This commit is contained in:
Jakub Melka
2020-10-18 12:57:27 +02:00
parent 724d58194e
commit 16b583f390
5 changed files with 114 additions and 14 deletions

View File

@@ -204,7 +204,18 @@ void PDFToolAbstractApplication::initializeCommandLineParser(QCommandLineParser*
if (optionFlags.testFlag(TextAnalysis))
{
parser->addOption(QCommandLineOption("text-analysis-alg", "Text analysis algorithm (auto - select automatically, layout - perform automatic layout algorithm, content - simple content stream reading order, structure - use tagged document structure", "algorithm", "auto"));
parser->addOption(QCommandLineOption("text-analysis-alg", "Text analysis algorithm (auto - select automatically, layout - perform automatic layout algorithm, content - simple content stream reading order, structure - use tagged document structure).", "algorithm", "auto"));
}
if (optionFlags.testFlag(TextShow))
{
parser->addOption(QCommandLineOption("text-show-page-numbers", "Show page numbers in extracted text."));
parser->addOption(QCommandLineOption("text-show-struct-title", "Show title extracted from structure tree."));
parser->addOption(QCommandLineOption("text-show-struct-lang", "Show language extracted from structure tree."));
parser->addOption(QCommandLineOption("text-show-struct-alt-desc", "Show alternative description extracted from structure tree."));
parser->addOption(QCommandLineOption("text-show-struct-expanded-form", "Show expanded form extracted from structure tree."));
parser->addOption(QCommandLineOption("text-show-struct-act-text", "Show actual text extracted from structure tree."));
parser->addOption(QCommandLineOption("text-show-phoneme", "Show phoneme extracted from structure tree."));
}
}
@@ -338,6 +349,17 @@ PDFToolOptions PDFToolAbstractApplication::getOptions(QCommandLineParser* parser
}
}
if (optionFlags.testFlag(TextShow))
{
options.textShowPageNumbers = parser->isSet("text-show-page-numbers");
options.textShowStructTitles = parser->isSet("text-show-struct-title");
options.textShowStructLanguage = parser->isSet("text-show-struct-lang");
options.textShowStructAlternativeDescription = parser->isSet("text-show-struct-alt-desc");
options.textShowStructExpandedForm = parser->isSet("text-show-struct-expanded-form");
options.textShowStructActualText = parser->isSet("text-show-struct-act-text");
options.textShowStructPhoneme = parser->isSet("text-show-phoneme");
}
return options;
}

View File

@@ -84,6 +84,15 @@ struct PDFToolOptions
// For option 'TextAnalysis'
pdf::PDFDocumentTextFlowFactory::Algorithm textAnalysisAlgorithm = pdf::PDFDocumentTextFlowFactory::Algorithm::Auto;
// For option 'TextShow'
bool textShowPageNumbers = false;
bool textShowStructTitles = false;
bool textShowStructLanguage = false;
bool textShowStructAlternativeDescription = false;
bool textShowStructExpandedForm = false;
bool textShowStructActualText = false;
bool textShowStructPhoneme = false;
/// Returns page range. If page range is invalid, then \p errorMessage is not empty.
/// \param pageCount Page count
/// \param[out] errorMessage Error message
@@ -105,7 +114,8 @@ public:
ErrorNoDocumentSpecified,
ErrorDocumentReading,
ErrorInvalidArguments,
ErrorFailedWriteToFile
ErrorFailedWriteToFile,
ErrorPermissions
};
enum StandardString
@@ -126,6 +136,7 @@ public:
ComputeHashes = 0x0040, ///< Compute hashes
PageSelector = 0x0080, ///< Select page range (or all pages)
TextAnalysis = 0x0100, ///< Text analysis options
TextShow = 0x0200, ///< Text extract and show options
};
Q_DECLARE_FLAGS(Options, Option)

View File

@@ -34,7 +34,7 @@ QString PDFToolFetchTextApplication::getStandardString(PDFToolAbstractApplicatio
return PDFToolTranslationContext::tr("Fetch text");
case Description:
return PDFToolTranslationContext::tr("Fetch text content from a document.");
return PDFToolTranslationContext::tr("Fetch text content from document.");
default:
Q_ASSERT(false);
@@ -53,6 +53,12 @@ int PDFToolFetchTextApplication::execute(const PDFToolOptions& options)
return ErrorDocumentReading;
}
if (!document.getStorage().getSecurityHandler()->isAllowed(pdf::PDFSecurityHandler::Permission::CopyContent))
{
PDFConsole::writeError(PDFToolTranslationContext::tr("Document doesn't allow to copy content."), options.outputCodec);
return ErrorPermissions;
}
QString parseError;
std::vector<pdf::PDFInteger> pages = options.getPageRange(document.getCatalog()->getPageCount(), parseError, true);
@@ -78,7 +84,20 @@ int PDFToolFetchTextApplication::execute(const PDFToolOptions& options)
if (!item.text.isEmpty())
{
formatter.writeText("text", item.text);
bool showText = (item.flags.testFlag(pdf::PDFDocumentTextFlow::Text)) ||
(item.flags.testFlag(pdf::PDFDocumentTextFlow::PageStart) && options.textShowPageNumbers) ||
(item.flags.testFlag(pdf::PDFDocumentTextFlow::PageEnd) && options.textShowPageNumbers) ||
(item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureTitle) && options.textShowStructTitles) ||
(item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureLanguage) && options.textShowStructLanguage) ||
(item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureAlternativeDescription) && options.textShowStructAlternativeDescription) ||
(item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureExpandedForm) && options.textShowStructExpandedForm) ||
(item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureActualText) && options.textShowStructActualText) ||
(item.flags.testFlag(pdf::PDFDocumentTextFlow::StructurePhoneme) && options.textShowStructPhoneme);
if (showText)
{
formatter.writeText("text", item.text);
}
}
if (item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureItemEnd))
@@ -106,7 +125,7 @@ int PDFToolFetchTextApplication::execute(const PDFToolOptions& options)
/// Returns the option groups used by the fetch text tool.
///
/// The TextShow flag has to be part of the result: the 'text-show-*' command
/// line options are added to the parser and read into PDFToolOptions only when
/// the TextShow flag is set, and execute() consults those options to decide
/// which non-text items (page numbers, structure titles, ...) are written out.
/// \returns Combination of option flags for this tool
PDFToolAbstractApplication::Options PDFToolFetchTextApplication::getOptionsFlags() const
{
    // Single return statement - a preceding return without TextShow would make
    // this one unreachable and the text show options would never be enabled.
    return ConsoleFormat | OpenDocument | PageSelector | TextAnalysis | TextShow;
}
} // namespace pdftool