mirror of
https://github.com/JakubMelka/PDF4QT.git
synced 2025-06-05 21:59:17 +02:00
Finishing of text extracting tool
This commit is contained in:
@ -356,7 +356,7 @@ void PDFStructureTreeTextContentProcessor::performOutputCharacter(const PDFTextC
|
|||||||
{
|
{
|
||||||
if (!isContentSuppressed())
|
if (!isContentSuppressed())
|
||||||
{
|
{
|
||||||
if (!info.character.isNull())
|
if (!info.character.isNull() && info.character != QChar(QChar::SoftHyphen))
|
||||||
{
|
{
|
||||||
m_currentText.push_back(info.character);
|
m_currentText.push_back(info.character);
|
||||||
}
|
}
|
||||||
@ -480,6 +480,8 @@ public:
|
|||||||
virtual void visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference) override;
|
virtual void visitStructureObjectReference(const PDFStructureObjectReference* structureObjectReference) override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
void markHasContent();
|
||||||
|
|
||||||
PDFDocumentTextFlow::Items* m_items;
|
PDFDocumentTextFlow::Items* m_items;
|
||||||
const PDFStructureTreeTextExtractor* m_extractor;
|
const PDFStructureTreeTextExtractor* m_extractor;
|
||||||
std::vector<bool> m_hasContentStack;
|
std::vector<bool> m_hasContentStack;
|
||||||
@ -492,6 +494,14 @@ void PDFStructureTreeTextFlowCollector::visitStructureTree(const PDFStructureTre
|
|||||||
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureItemEnd, -1, QString()});
|
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureItemEnd, -1, QString()});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void PDFStructureTreeTextFlowCollector::markHasContent()
|
||||||
|
{
|
||||||
|
for (size_t i = 0; i < m_hasContentStack.size(); ++i)
|
||||||
|
{
|
||||||
|
m_hasContentStack[i] = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void PDFStructureTreeTextFlowCollector::visitStructureElement(const PDFStructureElement* structureElement)
|
void PDFStructureTreeTextFlowCollector::visitStructureElement(const PDFStructureElement* structureElement)
|
||||||
{
|
{
|
||||||
size_t index = m_items->size();
|
size_t index = m_items->size();
|
||||||
@ -500,12 +510,52 @@ void PDFStructureTreeTextFlowCollector::visitStructureElement(const PDFStructure
|
|||||||
// Mark stack so we can delete unused items
|
// Mark stack so we can delete unused items
|
||||||
m_hasContentStack.push_back(false);
|
m_hasContentStack.push_back(false);
|
||||||
|
|
||||||
|
QString title = structureElement->getText(PDFStructureElement::Title);
|
||||||
|
QString language = structureElement->getText(PDFStructureElement::Language);
|
||||||
|
QString alternativeDescription = structureElement->getText(PDFStructureElement::AlternativeDescription);
|
||||||
|
QString expandedForm = structureElement->getText(PDFStructureElement::ExpandedForm);
|
||||||
|
QString actualText = structureElement->getText(PDFStructureElement::ActualText);
|
||||||
|
QString phoneme = structureElement->getText(PDFStructureElement::Phoneme);
|
||||||
|
|
||||||
|
// Emit the structure element's title as a separate text flow item.
if (!title.isEmpty())
{
    markHasContent();
    // Pass the title string explicitly, as the sibling branches do with their strings;
    // leaving the last initializer out would produce an item without the title text.
    m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureTitle, -1, title });
}
|
||||||
|
|
||||||
|
if (!language.isEmpty())
|
||||||
|
{
|
||||||
|
markHasContent();
|
||||||
|
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureLanguage, -1, language });
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!alternativeDescription.isEmpty())
|
||||||
|
{
|
||||||
|
markHasContent();
|
||||||
|
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureAlternativeDescription, -1, alternativeDescription });
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!expandedForm.isEmpty())
|
||||||
|
{
|
||||||
|
markHasContent();
|
||||||
|
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureExpandedForm, -1, expandedForm });
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!actualText.isEmpty())
|
||||||
|
{
|
||||||
|
markHasContent();
|
||||||
|
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructureActualText, -1, actualText });
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!phoneme.isEmpty())
|
||||||
|
{
|
||||||
|
markHasContent();
|
||||||
|
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::StructurePhoneme, -1, phoneme });
|
||||||
|
}
|
||||||
|
|
||||||
for (const QString& string : m_extractor->getText(structureElement))
|
for (const QString& string : m_extractor->getText(structureElement))
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < m_hasContentStack.size(); ++i)
|
markHasContent();
|
||||||
{
|
|
||||||
m_hasContentStack[i] = true;
|
|
||||||
}
|
|
||||||
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::Text, -1, string});
|
m_items->push_back(PDFDocumentTextFlow::Item{PDFDocumentTextFlow::Text, -1, string});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -539,7 +589,7 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume
|
|||||||
PDFStructureTree structureTree;
|
PDFStructureTree structureTree;
|
||||||
|
|
||||||
const PDFCatalog* catalog = document->getCatalog();
|
const PDFCatalog* catalog = document->getCatalog();
|
||||||
if (algorithm == Algorithm::Auto || algorithm == Algorithm::Structure)
|
if (algorithm != Algorithm::Layout)
|
||||||
{
|
{
|
||||||
structureTree = PDFStructureTree::parse(&document->getStorage(), catalog->getStructureTreeRoot());
|
structureTree = PDFStructureTree::parse(&document->getStorage(), catalog->getStructureTreeRoot());
|
||||||
}
|
}
|
||||||
@ -559,8 +609,6 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume
|
|||||||
|
|
||||||
Q_ASSERT(algorithm != Algorithm::Auto);
|
Q_ASSERT(algorithm != Algorithm::Auto);
|
||||||
|
|
||||||
QMutex mutex;
|
|
||||||
|
|
||||||
// Perform algorithm to retrieve document text
|
// Perform algorithm to retrieve document text
|
||||||
switch (algorithm)
|
switch (algorithm)
|
||||||
{
|
{
|
||||||
@ -570,6 +618,7 @@ PDFDocumentTextFlow PDFDocumentTextFlowFactory::create(const PDFDocument* docume
|
|||||||
|
|
||||||
std::map<PDFInteger, PDFDocumentTextFlow::Items> items;
|
std::map<PDFInteger, PDFDocumentTextFlow::Items> items;
|
||||||
|
|
||||||
|
QMutex mutex;
|
||||||
PDFCMSGeneric cms;
|
PDFCMSGeneric cms;
|
||||||
PDFMeshQualitySettings mqs;
|
PDFMeshQualitySettings mqs;
|
||||||
PDFOptionalContentActivity oca(document, OCUsage::Export, nullptr);
|
PDFOptionalContentActivity oca(document, OCUsage::Export, nullptr);
|
||||||
|
@ -46,7 +46,6 @@ public:
|
|||||||
StructurePhoneme = 0x0100, ///< Structure tree item phoneme
|
StructurePhoneme = 0x0100, ///< Structure tree item phoneme
|
||||||
StructureItemStart = 0x0200, ///< Start of structure tree item
|
StructureItemStart = 0x0200, ///< Start of structure tree item
|
||||||
StructureItemEnd = 0x0400, ///< End of structure tree item
|
StructureItemEnd = 0x0400, ///< End of structure tree item
|
||||||
StructureEmpty = 0x0800, ///< Structure tree item doesn't contain any text
|
|
||||||
};
|
};
|
||||||
Q_DECLARE_FLAGS(Flags, Flag)
|
Q_DECLARE_FLAGS(Flags, Flag)
|
||||||
|
|
||||||
|
@ -204,7 +204,18 @@ void PDFToolAbstractApplication::initializeCommandLineParser(QCommandLineParser*
|
|||||||
|
|
||||||
if (optionFlags.testFlag(TextAnalysis))
|
if (optionFlags.testFlag(TextAnalysis))
|
||||||
{
|
{
|
||||||
parser->addOption(QCommandLineOption("text-analysis-alg", "Text analysis algorithm (auto - select automatically, layout - perform automatic layout algorithm, content - simple content stream reading order, structure - use tagged document structure", "algorithm", "auto"));
|
parser->addOption(QCommandLineOption("text-analysis-alg", "Text analysis algorithm (auto - select automatically, layout - perform automatic layout algorithm, content - simple content stream reading order, structure - use tagged document structure).", "algorithm", "auto"));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (optionFlags.testFlag(TextShow))
|
||||||
|
{
|
||||||
|
parser->addOption(QCommandLineOption("text-show-page-numbers", "Show page numbers in extracted text."));
|
||||||
|
parser->addOption(QCommandLineOption("text-show-struct-title", "Show title extracted from structure tree."));
|
||||||
|
parser->addOption(QCommandLineOption("text-show-struct-lang", "Show language extracted from structure tree."));
|
||||||
|
parser->addOption(QCommandLineOption("text-show-struct-alt-desc", "Show alternative description extracted from structure tree."));
|
||||||
|
parser->addOption(QCommandLineOption("text-show-struct-expanded-form", "Show expanded form extracted from structure tree."));
|
||||||
|
parser->addOption(QCommandLineOption("text-show-struct-act-text", "Show actual text extracted from structure tree."));
|
||||||
|
parser->addOption(QCommandLineOption("text-show-phoneme", "Show phoneme extracted from structure tree."));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -338,6 +349,17 @@ PDFToolOptions PDFToolAbstractApplication::getOptions(QCommandLineParser* parser
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (optionFlags.testFlag(TextShow))
|
||||||
|
{
|
||||||
|
options.textShowPageNumbers = parser->isSet("text-show-page-numbers");
|
||||||
|
options.textShowStructTitles = parser->isSet("text-show-struct-title");
|
||||||
|
options.textShowStructLanguage = parser->isSet("text-show-struct-lang");
|
||||||
|
options.textShowStructAlternativeDescription = parser->isSet("text-show-struct-alt-desc");
|
||||||
|
options.textShowStructExpandedForm = parser->isSet("text-show-struct-expanded-form");
|
||||||
|
options.textShowStructActualText = parser->isSet("text-show-struct-act-text");
|
||||||
|
options.textShowStructPhoneme = parser->isSet("text-show-phoneme");
|
||||||
|
}
|
||||||
|
|
||||||
return options;
|
return options;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -84,6 +84,15 @@ struct PDFToolOptions
|
|||||||
// For option 'TextAnalysis'
|
// For option 'TextAnalysis'
|
||||||
pdf::PDFDocumentTextFlowFactory::Algorithm textAnalysisAlgorithm = pdf::PDFDocumentTextFlowFactory::Algorithm::Auto;
|
pdf::PDFDocumentTextFlowFactory::Algorithm textAnalysisAlgorithm = pdf::PDFDocumentTextFlowFactory::Algorithm::Auto;
|
||||||
|
|
||||||
|
// For option 'TextShow'
|
||||||
|
bool textShowPageNumbers = false;
|
||||||
|
bool textShowStructTitles = false;
|
||||||
|
bool textShowStructLanguage = false;
|
||||||
|
bool textShowStructAlternativeDescription = false;
|
||||||
|
bool textShowStructExpandedForm = false;
|
||||||
|
bool textShowStructActualText = false;
|
||||||
|
bool textShowStructPhoneme = false;
|
||||||
|
|
||||||
/// Returns page range. If page range is invalid, then \p errorMessage is empty.
|
/// Returns page range. If page range is invalid, then \p errorMessage is empty.
|
||||||
/// \param pageCount Page count
|
/// \param pageCount Page count
|
||||||
/// \param[out] errorMessage Error message
|
/// \param[out] errorMessage Error message
|
||||||
@ -105,7 +114,8 @@ public:
|
|||||||
ErrorNoDocumentSpecified,
|
ErrorNoDocumentSpecified,
|
||||||
ErrorDocumentReading,
|
ErrorDocumentReading,
|
||||||
ErrorInvalidArguments,
|
ErrorInvalidArguments,
|
||||||
ErrorFailedWriteToFile
|
ErrorFailedWriteToFile,
|
||||||
|
ErrorPermissions
|
||||||
};
|
};
|
||||||
|
|
||||||
enum StandardString
|
enum StandardString
|
||||||
@ -126,6 +136,7 @@ public:
|
|||||||
ComputeHashes = 0x0040, ///< Compute hashes
|
ComputeHashes = 0x0040, ///< Compute hashes
|
||||||
PageSelector = 0x0080, ///< Select page range (or all pages)
|
PageSelector = 0x0080, ///< Select page range (or all pages)
|
||||||
TextAnalysis = 0x0100, ///< Text analysis options
|
TextAnalysis = 0x0100, ///< Text analysis options
|
||||||
|
TextShow = 0x0200, ///< Text extract and show options
|
||||||
};
|
};
|
||||||
Q_DECLARE_FLAGS(Options, Option)
|
Q_DECLARE_FLAGS(Options, Option)
|
||||||
|
|
||||||
|
@ -34,7 +34,7 @@ QString PDFToolFetchTextApplication::getStandardString(PDFToolAbstractApplicatio
|
|||||||
return PDFToolTranslationContext::tr("Fetch text");
|
return PDFToolTranslationContext::tr("Fetch text");
|
||||||
|
|
||||||
case Description:
|
case Description:
|
||||||
return PDFToolTranslationContext::tr("Fetch text content from a document.");
|
return PDFToolTranslationContext::tr("Fetch text content from document.");
|
||||||
|
|
||||||
default:
|
default:
|
||||||
Q_ASSERT(false);
|
Q_ASSERT(false);
|
||||||
@ -53,6 +53,12 @@ int PDFToolFetchTextApplication::execute(const PDFToolOptions& options)
|
|||||||
return ErrorDocumentReading;
|
return ErrorDocumentReading;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!document.getStorage().getSecurityHandler()->isAllowed(pdf::PDFSecurityHandler::Permission::CopyContent))
|
||||||
|
{
|
||||||
|
PDFConsole::writeError(PDFToolTranslationContext::tr("Document doesn't allow to copy content."), options.outputCodec);
|
||||||
|
return ErrorPermissions;
|
||||||
|
}
|
||||||
|
|
||||||
QString parseError;
|
QString parseError;
|
||||||
std::vector<pdf::PDFInteger> pages = options.getPageRange(document.getCatalog()->getPageCount(), parseError, true);
|
std::vector<pdf::PDFInteger> pages = options.getPageRange(document.getCatalog()->getPageCount(), parseError, true);
|
||||||
|
|
||||||
@ -78,7 +84,20 @@ int PDFToolFetchTextApplication::execute(const PDFToolOptions& options)
|
|||||||
|
|
||||||
if (!item.text.isEmpty())
|
if (!item.text.isEmpty())
|
||||||
{
|
{
|
||||||
formatter.writeText("text", item.text);
|
bool showText = (item.flags.testFlag(pdf::PDFDocumentTextFlow::Text)) ||
|
||||||
|
(item.flags.testFlag(pdf::PDFDocumentTextFlow::PageStart) && options.textShowPageNumbers) ||
|
||||||
|
(item.flags.testFlag(pdf::PDFDocumentTextFlow::PageEnd) && options.textShowPageNumbers) ||
|
||||||
|
(item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureTitle) && options.textShowStructTitles) ||
|
||||||
|
(item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureLanguage) && options.textShowStructLanguage) ||
|
||||||
|
(item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureAlternativeDescription) && options.textShowStructAlternativeDescription) ||
|
||||||
|
(item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureExpandedForm) && options.textShowStructExpandedForm) ||
|
||||||
|
(item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureActualText) && options.textShowStructActualText) ||
|
||||||
|
(item.flags.testFlag(pdf::PDFDocumentTextFlow::StructurePhoneme) && options.textShowStructPhoneme);
|
||||||
|
|
||||||
|
if (showText)
|
||||||
|
{
|
||||||
|
formatter.writeText("text", item.text);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureItemEnd))
|
if (item.flags.testFlag(pdf::PDFDocumentTextFlow::StructureItemEnd))
|
||||||
@ -106,7 +125,7 @@ int PDFToolFetchTextApplication::execute(const PDFToolOptions& options)
|
|||||||
|
|
||||||
PDFToolAbstractApplication::Options PDFToolFetchTextApplication::getOptionsFlags() const
|
PDFToolAbstractApplication::Options PDFToolFetchTextApplication::getOptionsFlags() const
|
||||||
{
|
{
|
||||||
return ConsoleFormat | OpenDocument | PageSelector | TextAnalysis;
|
return ConsoleFormat | OpenDocument | PageSelector | TextAnalysis | TextShow;
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace pdftool
|
} // namespace pdftool
|
||||||
|
Reference in New Issue
Block a user