mirror of https://github.com/JakubMelka/PDF4QT.git
Finishing of text extracting tool
This commit is contained in:
parent
724d58194e
commit
16b583f390
|
@ -356,7 +356,7 @@ void PDFStructureTreeTextContentProcessor::performOutputCharacter(const PDFTextC
|
|||
{
|
||||
if (!isContentSuppressed())
|
||||
{
|
||||
if (!info.character.isNull())
|
||||
if (!info.character.isNull() && info.character != QChar(QChar::SoftHyphen))
|
||||
{
|
||||
m_currentText.push_back(info.character);
|
||||
}
|
||||
|
@ -480,6 +480,8 @@ public:
|
|||
virtual void visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference) override;
|
||||
|
||||
private:
|
||||
void markHasContent();
|
||||
|
||||
PDFDocumentTextFlow::Items* m_items;
|
||||
const PDFStructureTreeTextExtractor* m_extractor;
|
||||
std::vector<bool> m_hasContentStack;
|
||||
|
@ -492,6 +494,14 @@ void PDFStructureTreeTextFlowCollector::visitStructureTree(const PDFStructureTre
|
|||
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureItemEnd, -1, QString()});
|
||||
}
|
||||
|
||||
void PDFStructureTreeTextFlowCollector::markHasContent()
|
||||
{
|
||||
for (size_t i = 0; i < m_hasContentStack.size(); ++i)
|
||||
{
|
||||
m_hasContentStack[i] = true;
|
||||
}
|
||||
}
|
||||
|
||||
void PDFStructureTreeTextFlowCollector::visitStructureElement(const PDFStructureElement* structureElement)
|
||||
{
|
||||
size_t index = m_items->size();
|
||||
|
@ -500,12 +510,52 @@ void PDFStructureTreeTextFlowCollector::visitStructureElement(const PDFStructure
|
|||
// Mark stack so we can delete unused items
|
||||
m_hasContentStack.push_back(false);
|
||||
|
||||
QString title = structureElement->getText(PDFStructureElement::Title);
|
||||
QString language = structureElement->getText(PDFStructureElement::Language);
|
||||
QString alternativeDescription = structureElement->getText(PDFStructureElement::AlternativeDescription);
|
||||
QString expandedForm = structureElement->getText(PDFStructureElement::ExpandedForm);
|
||||
QString actualText = structureElement->getText(PDFStructureElement::ActualText);
|
||||
QString phoneme = structureElement->getText(PDFStructureElement::Phoneme);
|
||||
|
||||
// Emit the element's title. The title string itself must be stored in the
// item: with the third initializer omitted the item's text stays empty, and
// the fetch-text tool only prints items whose text is non-empty.
if (!title.isEmpty())
{
    markHasContent();
    m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureTitle, -1, title });
}
|
||||
|
||||
if (!language.isEmpty())
|
||||
{
|
||||
markHasContent();
|
||||
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureLanguage, -1, language });
|
||||
}
|
||||
|
||||
if (!alternativeDescription.isEmpty())
|
||||
{
|
||||
markHasContent();
|
||||
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureAlternativeDescription, -1, alternativeDescription });
|
||||
}
|
||||
|
||||
if (!expandedForm.isEmpty())
|
||||
{
|
||||
markHasContent();
|
||||
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureExpandedForm, -1, expandedForm });
|
||||
}
|
||||
|
||||
if (!actualText.isEmpty())
|
||||
{
|
||||
markHasContent();
|
||||
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureActualText, -1, actualText });
|
||||
}
|
||||
|
||||
if (!phoneme.isEmpty())
|
||||
{
|
||||
markHasContent();
|
||||
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructurePhoneme, -1, phoneme });
|
||||
}
|
||||
|
||||
for (const QString& string : m_extractor->getText(structureElement))
|
||||
{
|
||||
for (size_t i = 0; i < m_hasContentStack.size(); ++i)
|
||||
{
|
||||
m_hasContentStack[i] = true;
|
||||
}
|
||||
markHasContent();
|
||||
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::Text, -1, string});
|
||||
}
|
||||
|
||||
|
@ -539,7 +589,7 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume
|
|||
PDFStructureTree structureTree;
|
||||
|
||||
const PDFCatalog* catalog = document->getCatalog();
|
||||
if (algorithm == Algorithm::Auto || algorithm == Algorithm::Structure)
|
||||
if (algorithm != Algorithm::Layout)
|
||||
{
|
||||
structureTree = PDFStructureTree::parse(&document->getStorage(), catalog->getStructureTreeRoot());
|
||||
}
|
||||
|
@ -559,8 +609,6 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume
|
|||
|
||||
Q_ASSERT(algorithm != Algorithm::Auto);
|
||||
|
||||
QMutex mutex;
|
||||
|
||||
// Perform algorithm to retrieve document text
|
||||
switch (algorithm)
|
||||
{
|
||||
|
@ -570,6 +618,7 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume
|
|||
|
||||
std::map<PDFInteger, PDFDocumentTextFlow::Items> items;
|
||||
|
||||
QMutex mutex;
|
||||
PDFCMSGeneric cms;
|
||||
PDFMeshQualitySettings mqs;
|
||||
PDFOptionalContentActivity oca(document, OCUsage::Export, nullptr);
|
||||
|
|
|
@ -46,7 +46,6 @@ public:
|
|||
StructurePhoneme = 0x0100, ///< Structure tree item phoneme
|
||||
StructureItemStart = 0x0200, ///< Start of structure tree item
|
||||
StructureItemEnd = 0x0400, ///< End of structure tree item
|
||||
StructureEmpty = 0x0800, ///< Structure tree item doesn't contain any text
|
||||
};
|
||||
Q_DECLARE_FLAGS(Flags, Flag)
|
||||
|
||||
|
|
|
@ -204,7 +204,18 @@ void PDFToolAbstractApplication::initializeCommandLineParser(QCommandLineParser*
|
|||
|
||||
if (optionFlags.testFlag(TextAnalysis))
|
||||
{
|
||||
parser->addOption(QCommandLineOption("text-analysis-alg", "Text analysis algorithm (auto - select automatically, layout - perform automatic layout algorithm, content - simple content stream reading order, structure - use tagged document structure", "algorithm", "auto"));
|
||||
parser->addOption(QCommandLineOption("text-analysis-alg", "Text analysis algorithm (auto - select automatically, layout - perform automatic layout algorithm, content - simple content stream reading order, structure - use tagged document structure).", "algorithm", "auto"));
|
||||
}
|
||||
|
||||
if (optionFlags.testFlag(TextShow))
|
||||
{
|
||||
parser->addOption(QCommandLineOption("text-show-page-numbers", "Show page numbers in extracted text."));
|
||||
parser->addOption(QCommandLineOption("text-show-struct-title", "Show title extracted from structure tree."));
|
||||
parser->addOption(QCommandLineOption("text-show-struct-lang", "Show language extracted from structure tree."));
|
||||
parser->addOption(QCommandLineOption("text-show-struct-alt-desc", "Show alternative description extracted from structure tree."));
|
||||
parser->addOption(QCommandLineOption("text-show-struct-expanded-form", "Show expanded form extracted from structure tree."));
|
||||
parser->addOption(QCommandLineOption("text-show-struct-act-text", "Show actual text extracted from structure tree."));
|
||||
parser->addOption(QCommandLineOption("text-show-phoneme", "Show phoneme extracted from structure tree."));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -338,6 +349,17 @@ PDFToolOptions PDFToolAbstractApplication::getOptions(QCommandLineParser* parser
|
|||
}
|
||||
}
|
||||
|
||||
if (optionFlags.testFlag(TextShow))
|
||||
{
|
||||
options.textShowPageNumbers = parser->isSet("text-show-page-numbers");
|
||||
options.textShowStructTitles = parser->isSet("text-show-struct-title");
|
||||
options.textShowStructLanguage = parser->isSet("text-show-struct-lang");
|
||||
options.textShowStructAlternativeDescription = parser->isSet("text-show-struct-alt-desc");
|
||||
options.textShowStructExpandedForm = parser->isSet("text-show-struct-expanded-form");
|
||||
options.textShowStructActualText = parser->isSet("text-show-struct-act-text");
|
||||
options.textShowStructPhoneme = parser->isSet("text-show-phoneme");
|
||||
}
|
||||
|
||||
return options;
|
||||
}
|
||||
|
||||
|
|
|
@ -84,6 +84,15 @@ struct PDFToolOptions
|
|||
// For option 'TextAnalysis'
|
||||
pdf::PDFDocumentTextFlowFactory::Algorithm textAnalysisAlgorithm = pdf::PDFDocumentTextFlowFactory::Algorithm::Auto;
|
||||
|
||||
// For option 'TextShow'
|
||||
bool textShowPageNumbers = false;
|
||||
bool textShowStructTitles = false;
|
||||
bool textShowStructLanguage = false;
|
||||
bool textShowStructAlternativeDescription = false;
|
||||
bool textShowStructExpandedForm = false;
|
||||
bool textShowStructActualText = false;
|
||||
bool textShowStructPhoneme = false;
|
||||
|
||||
/// Returns page range. If page range is invalid, then \p errorMessage is empty.
|
||||
/// \param pageCount Page count
|
||||
/// \param[out] errorMessage Error message
|
||||
|
@ -105,7 +114,8 @@ public:
|
|||
ErrorNoDocumentSpecified,
|
||||
ErrorDocumentReading,
|
||||
ErrorInvalidArguments,
|
||||
ErrorFailedWriteToFile
|
||||
ErrorFailedWriteToFile,
|
||||
ErrorPermissions
|
||||
};
|
||||
|
||||
enum StandardString
|
||||
|
@ -126,6 +136,7 @@ public:
|
|||
ComputeHashes = 0x0040, ///< Compute hashes
|
||||
PageSelector = 0x0080, ///< Select page range (or all pages)
|
||||
TextAnalysis = 0x0100, ///< Text analysis options
|
||||
TextShow = 0x0200, ///< Text extract and show options
|
||||
};
|
||||
Q_DECLARE_FLAGS(Options, Option)
|
||||
|
||||
|
|
|
@ -34,7 +34,7 @@ QString PDFToolFetchTextApplication::getStandardString(PDFToolAbstractApplicatio
|
|||
return PDFToolTranslationContext::tr("Fetch text");
|
||||
|
||||
case Description:
|
||||
return PDFToolTranslationContext::tr("Fetch text content from a document.");
|
||||
return PDFToolTranslationContext::tr("Fetch text content from document.");
|
||||
|
||||
default:
|
||||
Q_ASSERT(false);
|
||||
|
@ -53,6 +53,12 @@ int PDFToolFetchTextApplication::execute(const PDFToolOptions& options)
|
|||
return ErrorDocumentReading;
|
||||
}
|
||||
|
||||
if (!document.getStorage().getSecurityHandler()->isAllowed(pdf::PDFSecurityHandler::Permission::CopyContent))
|
||||
{
|
||||
PDFConsole::writeError(PDFToolTranslationContext::tr("Document doesn't allow to copy content."), options.outputCodec);
|
||||
return ErrorPermissions;
|
||||
}
|
||||
|
||||
QString parseError;
|
||||
std::vector<pdf::PDFInteger> pages = options.getPageRange(document.getCatalog()->getPageCount(), parseError, true);
|
||||
|
||||
|
@ -77,9 +83,22 @@ int PDFToolFetchTextApplication::execute(const PDFToolOptions& options)
|
|||
}
|
||||
|
||||
if (!item.text.isEmpty())
|
||||
{
|
||||
bool showText = (item.flags.testFlag(pdf::PDFDocumentTextFlow::Text)) ||
|
||||
(item.flags.testFlag(pdf::PDFDocumentTextFlow::PageStart) && options.textShowPageNumbers) ||
|
||||
(item.flags.testFlag(pdf::PDFDocumentTextFlow::PageEnd) && options.textShowPageNumbers) ||
|
||||
(item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureTitle) && options.textShowStructTitles) ||
|
||||
(item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureLanguage) && options.textShowStructLanguage) ||
|
||||
(item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureAlternativeDescription) && options.textShowStructAlternativeDescription) ||
|
||||
(item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureExpandedForm) && options.textShowStructExpandedForm) ||
|
||||
(item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureActualText) && options.textShowStructActualText) ||
|
||||
(item.flags.testFlag(pdf::PDFDocumentTextFlow::StructurePhoneme) && options.textShowStructPhoneme);
|
||||
|
||||
if (showText)
|
||||
{
|
||||
formatter.writeText("text", item.text);
|
||||
}
|
||||
}
|
||||
|
||||
if (item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureItemEnd))
|
||||
{
|
||||
|
@ -106,7 +125,7 @@ int PDFToolFetchTextApplication::execute(const PDFToolOptions& options)
|
|||
|
||||
/// Returns the option groups used by the fetch-text tool.
PDFToolAbstractApplication::Options PDFToolFetchTextApplication::getOptionsFlags() const
{
    // TextShow is required here: registration and reading of the
    // "text-show-*" command line options is gated on this flag.
    return ConsoleFormat | OpenDocument | PageSelector | TextAnalysis | TextShow;
}
|
||||
|
||||
} // namespace pdftool
|
||||
|
|
Loading…
Reference in New Issue