Finish the text extraction tool

This commit is contained in:
Jakub Melka 2020-10-18 12:57:27 +02:00
parent 724d58194e
commit 16b583f390
5 changed files with 114 additions and 14 deletions

View File

@ -356,7 +356,7 @@ void PDFStructureTreeTextContentProcessor::performOutputCharacter(const PDFTextC
{
if (!isContentSuppressed())
{
if (!info.character.isNull())
if (!info.character.isNull() && info.character != QChar(QChar::SoftHyphen))
{
m_currentText.push_back(info.character);
}
@ -480,6 +480,8 @@ public:
virtual void visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference) override;
private:
void markHasContent();
PDFDocumentTextFlow::Items* m_items;
const PDFStructureTreeTextExtractor* m_extractor;
std::vector<bool> m_hasContentStack;
@ -492,6 +494,14 @@ void PDFStructureTreeTextFlowCollector::visitStructureTree(const PDFStructureTre
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureItemEnd, -1, QString()});
}
/// Sets every entry of m_hasContentStack to true.
///
/// visitStructureElement() pushes one `false` entry per structure element it
/// enters; calling this function records that content was produced while all
/// of those elements were open (presumably so that elements whose entry stays
/// false can be discarded afterwards).
void PDFStructureTreeTextFlowCollector::markHasContent()
{
    // m_hasContentStack is a std::vector<bool>, whose iterators yield proxy
    // objects rather than bool&, so bind each entry with a forwarding reference.
    for (auto&& hasContent : m_hasContentStack)
    {
        hasContent = true;
    }
}
void PDFStructureTreeTextFlowCollector::visitStructureElement(const PDFStructureElement* structureElement)
{
size_t index = m_items->size();
@ -500,12 +510,52 @@ void PDFStructureTreeTextFlowCollector::visitStructureElement(const PDFStructure
// Mark stack so we can delete unused items
m_hasContentStack.push_back(false);
QString title = structureElement->getText(PDFStructureElement::Title);
QString language = structureElement->getText(PDFStructureElement::Language);
QString alternativeDescription = structureElement->getText(PDFStructureElement::AlternativeDescription);
QString expandedForm = structureElement->getText(PDFStructureElement::ExpandedForm);
QString actualText = structureElement->getText(PDFStructureElement::ActualText);
QString phoneme = structureElement->getText(PDFStructureElement::Phoneme);
if (!title.isEmpty())
{
    // Emit the element's title as a StructureTitle item. The text has to be
    // passed explicitly: omitting the third initializer leaves the item's text
    // value-initialized (empty), so the title would never reach the output.
    markHasContent();
    m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureTitle, -1, title });
}
if (!language.isEmpty())
{
markHasContent();
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureLanguage, -1, language });
}
if (!alternativeDescription.isEmpty())
{
markHasContent();
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureAlternativeDescription, -1, alternativeDescription });
}
if (!expandedForm.isEmpty())
{
markHasContent();
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureExpandedForm, -1, expandedForm });
}
if (!actualText.isEmpty())
{
markHasContent();
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureActualText, -1, actualText });
}
if (!phoneme.isEmpty())
{
markHasContent();
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructurePhoneme, -1, phoneme });
}
for (const QString& string : m_extractor->getText(structureElement))
{
for (size_t i = 0; i < m_hasContentStack.size(); ++i)
{
m_hasContentStack[i] = true;
}
markHasContent();
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::Text, -1, string});
}
@ -539,7 +589,7 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume
PDFStructureTree structureTree;
const PDFCatalog* catalog = document->getCatalog();
if (algorithm == Algorithm::Auto || algorithm == Algorithm::Structure)
if (algorithm != Algorithm::Layout)
{
structureTree = PDFStructureTree::parse(&document->getStorage(), catalog->getStructureTreeRoot());
}
@ -559,8 +609,6 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume
Q_ASSERT(algorithm != Algorithm::Auto);
QMutex mutex;
// Perform algorithm to retrieve document text
switch (algorithm)
{
@ -570,6 +618,7 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume
std::map<PDFInteger, PDFDocumentTextFlow::Items> items;
QMutex mutex;
PDFCMSGeneric cms;
PDFMeshQualitySettings mqs;
PDFOptionalContentActivity oca(document, OCUsage::Export, nullptr);

View File

@ -46,7 +46,6 @@ public:
StructurePhoneme = 0x0100, ///< Structure tree item phoneme
StructureItemStart = 0x0200, ///< Start of structure tree item
StructureItemEnd = 0x0400, ///< End of structure tree item
StructureEmpty = 0x0800, ///< Structure tree item doesn't contain any text
};
Q_DECLARE_FLAGS(Flags, Flag)

View File

@ -204,7 +204,18 @@ void PDFToolAbstractApplication::initializeCommandLineParser(QCommandLineParser*
if (optionFlags.testFlag(TextAnalysis))
{
parser->addOption(QCommandLineOption("text-analysis-alg", "Text analysis algorithm (auto - select automatically, layout - perform automatic layout algorithm, content - simple content stream reading order, structure - use tagged document structure", "algorithm", "auto"));
parser->addOption(QCommandLineOption("text-analysis-alg", "Text analysis algorithm (auto - select automatically, layout - perform automatic layout algorithm, content - simple content stream reading order, structure - use tagged document structure).", "algorithm", "auto"));
}
if (optionFlags.testFlag(TextShow))
{
parser->addOption(QCommandLineOption("text-show-page-numbers", "Show page numbers in extracted text."));
parser->addOption(QCommandLineOption("text-show-struct-title", "Show title extracted from structure tree."));
parser->addOption(QCommandLineOption("text-show-struct-lang", "Show language extracted from structure tree."));
parser->addOption(QCommandLineOption("text-show-struct-alt-desc", "Show alternative description extracted from structure tree."));
parser->addOption(QCommandLineOption("text-show-struct-expanded-form", "Show expanded form extracted from structure tree."));
parser->addOption(QCommandLineOption("text-show-struct-act-text", "Show actual text extracted from structure tree."));
parser->addOption(QCommandLineOption("text-show-phoneme", "Show phoneme extracted from structure tree."));
}
}
@ -338,6 +349,17 @@ PDFToolOptions PDFToolAbstractApplication::getOptions(QCommandLineParser* parser
}
}
if (optionFlags.testFlag(TextShow))
{
options.textShowPageNumbers = parser->isSet("text-show-page-numbers");
options.textShowStructTitles = parser->isSet("text-show-struct-title");
options.textShowStructLanguage = parser->isSet("text-show-struct-lang");
options.textShowStructAlternativeDescription = parser->isSet("text-show-struct-alt-desc");
options.textShowStructExpandedForm = parser->isSet("text-show-struct-expanded-form");
options.textShowStructActualText = parser->isSet("text-show-struct-act-text");
options.textShowStructPhoneme = parser->isSet("text-show-phoneme");
}
return options;
}

View File

@ -84,6 +84,15 @@ struct PDFToolOptions
// For option 'TextAnalysis'
pdf::PDFDocumentTextFlowFactory::Algorithm textAnalysisAlgorithm = pdf::PDFDocumentTextFlowFactory::Algorithm::Auto;
// For option 'TextShow'
bool textShowPageNumbers = false;
bool textShowStructTitles = false;
bool textShowStructLanguage = false;
bool textShowStructAlternativeDescription = false;
bool textShowStructExpandedForm = false;
bool textShowStructActualText = false;
bool textShowStructPhoneme = false;
/// Returns page range. If page range is invalid, then \p errorMessage is empty.
/// \param pageCount Page count
/// \param[out] errorMessage Error message
@ -105,7 +114,8 @@ public:
ErrorNoDocumentSpecified,
ErrorDocumentReading,
ErrorInvalidArguments,
ErrorFailedWriteToFile
ErrorFailedWriteToFile,
ErrorPermissions
};
enum StandardString
@ -126,6 +136,7 @@ public:
ComputeHashes = 0x0040, ///< Compute hashes
PageSelector = 0x0080, ///< Select page range (or all pages)
TextAnalysis = 0x0100, ///< Text analysis options
TextShow = 0x0200, ///< Text extract and show options
};
Q_DECLARE_FLAGS(Options, Option)

View File

@ -34,7 +34,7 @@ QString PDFToolFetchTextApplication::getStandardString(PDFToolAbstractApplicatio
return PDFToolTranslationContext::tr("Fetch text");
case Description:
return PDFToolTranslationContext::tr("Fetch text content from a document.");
return PDFToolTranslationContext::tr("Fetch text content from document.");
default:
Q_ASSERT(false);
@ -53,6 +53,12 @@ int PDFToolFetchTextApplication::execute(const PDFToolOptions& options)
return ErrorDocumentReading;
}
if (!document.getStorage().getSecurityHandler()->isAllowed(pdf::PDFSecurityHandler::Permission::CopyContent))
{
PDFConsole::writeError(PDFToolTranslationContext::tr("Document doesn't allow to copy content."), options.outputCodec);
return ErrorPermissions;
}
QString parseError;
std::vector<pdf::PDFInteger> pages = options.getPageRange(document.getCatalog()->getPageCount(), parseError, true);
@ -77,9 +83,22 @@ int PDFToolFetchTextApplication::execute(const PDFToolOptions& options)
}
if (!item.text.isEmpty())
{
bool showText = (item.flags.testFlag(pdf::PDFDocumentTextFlow::Text)) ||
(item.flags.testFlag(pdf::PDFDocumentTextFlow::PageStart) && options.textShowPageNumbers) ||
(item.flags.testFlag(pdf::PDFDocumentTextFlow::PageEnd) && options.textShowPageNumbers) ||
(item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureTitle) && options.textShowStructTitles) ||
(item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureLanguage) && options.textShowStructLanguage) ||
(item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureAlternativeDescription) && options.textShowStructAlternativeDescription) ||
(item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureExpandedForm) && options.textShowStructExpandedForm) ||
(item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureActualText) && options.textShowStructActualText) ||
(item.flags.testFlag(pdf::PDFDocumentTextFlow::StructurePhoneme) && options.textShowStructPhoneme);
if (showText)
{
formatter.writeText("text", item.text);
}
}
if (item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureItemEnd))
{
@ -106,7 +125,7 @@ int PDFToolFetchTextApplication::execute(const PDFToolOptions& options)
PDFToolAbstractApplication::Options PDFToolFetchTextApplication::getOptionsFlags() const
{
return ConsoleFormat | OpenDocument | PageSelector | TextAnalysis;
return ConsoleFormat | OpenDocument | PageSelector | TextAnalysis | TextShow;
}
} // namespace pdftool