mirror of
https://github.com/JakubMelka/PDF4QT.git
synced 2025-06-05 21:59:17 +02:00
Advanced find (algorithms)
This commit is contained in:
@@ -92,7 +92,7 @@ private:
|
||||
std::map<PDFInteger, CompileTask> m_tasks;
|
||||
};
|
||||
|
||||
class PDFAsynchronousTextLayoutCompiler : public QObject
|
||||
class PDFFORQTLIBSHARED_EXPORT PDFAsynchronousTextLayoutCompiler : public QObject
|
||||
{
|
||||
Q_OBJECT
|
||||
|
||||
@@ -130,6 +130,9 @@ public:
|
||||
/// \p textLayoutChanged is emitted.
|
||||
void makeTextLayout();
|
||||
|
||||
/// Returns true, if text layout is ready
|
||||
bool isTextLayoutReady() const { return m_textLayouts.has_value(); }
|
||||
|
||||
signals:
|
||||
void textLayoutChanged();
|
||||
|
||||
|
@@ -994,6 +994,7 @@ QRectF PDFDrawWidgetProxy::fromDeviceSpace(const QRectF& rect) const
|
||||
void PDFDrawWidgetProxy::onTextLayoutChanged()
|
||||
{
|
||||
emit repaintNeeded();
|
||||
emit textLayoutChanged();
|
||||
}
|
||||
|
||||
bool PDFDrawWidgetProxy::isBlockMode() const
|
||||
|
@@ -276,6 +276,7 @@ public:
|
||||
const PDFCMSManager* getCMSManager() const;
|
||||
PDFProgress* getProgress() const { return m_progress; }
|
||||
void setProgress(PDFProgress* progress) { m_progress = progress; }
|
||||
PDFAsynchronousTextLayoutCompiler* getTextLayoutCompiler() const { return m_textLayoutCompiler; }
|
||||
|
||||
void setFeatures(PDFRenderer::Features features);
|
||||
void setPreferredMeshResolutionRatio(PDFReal ratio);
|
||||
@@ -291,6 +292,7 @@ signals:
|
||||
void renderingError(PDFInteger pageIndex, const QList<PDFRenderError>& errors);
|
||||
void repaintNeeded();
|
||||
void pageImageChanged(bool all, const std::vector<PDFInteger>& pages);
|
||||
void textLayoutChanged();
|
||||
|
||||
private:
|
||||
struct LayoutItem
|
||||
|
@@ -1,4 +1,4 @@
|
||||
// Copyright (C) 2019 Jakub Melka
|
||||
// Copyright (C) 2019-2020 Jakub Melka
|
||||
//
|
||||
// This file is part of PdfForQt.
|
||||
//
|
||||
@@ -593,4 +593,170 @@ QDataStream& operator>>(QDataStream& stream, PDFTextLayoutSettings& settings)
|
||||
return stream;
|
||||
}
|
||||
|
||||
/// Constructs the text selection from \p items. The items are moved into
/// the selection, not copied.
/// \param items Selection items, which will form this selection
PDFTextSelection::PDFTextSelection(PDFTextSelectionItems&& items)
    : m_items(qMove(items))
{
}
|
||||
|
||||
/// Finds all occurrences of \p text in this text flow. Occurrences may overlap,
/// because the search is restarted one character after the start of each match.
/// Matches, which have no text selection items, are not reported.
/// \param text Text to be found (empty text yields no results)
/// \param caseSensitivity Case sensitivity
/// \returns Found occurrences in the order of their position in the flow
PDFFindResults PDFTextFlow::find(const QString& text, Qt::CaseSensitivity caseSensitivity) const
{
    PDFFindResults results;

    // An empty pattern is found by QString::indexOf at every position and has zero
    // length, which getContext() does not accept (it requires length > 0).
    if (text.isEmpty())
    {
        return results;
    }

    const size_t matchLength = size_t(text.length());

    int index = m_text.indexOf(text, 0, caseSensitivity);
    while (index != -1)
    {
        PDFTextSelectionItems selectionItems = getTextSelectionItems(size_t(index), matchLength);

        // Build the (more expensive) context only for matches we really report
        if (!selectionItems.empty())
        {
            PDFFindResult result;
            result.matched = text;
            result.textSelectionItems = qMove(selectionItems);
            result.context = getContext(size_t(index), matchLength);
            results.emplace_back(qMove(result));
        }

        index = m_text.indexOf(text, index + 1, caseSensitivity);
    }

    return results;
}
|
||||
|
||||
void PDFTextFlow::merge(const PDFTextFlow& next)
|
||||
{
|
||||
m_text += next.m_text;
|
||||
m_characterPointers.insert(m_characterPointers.end(), next.m_characterPointers.cbegin(), next.m_characterPointers.cend());
|
||||
}
|
||||
|
||||
/// Creates text flows from text layout, according to creation flags. For each
/// character appended to the flow text, one character pointer is appended too,
/// so the text and the pointers have always the same length. Characters, which
/// are not present in the layout (guessed spaces, line breaks), get a default
/// constructed (invalid) character pointer.
/// \param layout Layout, from which is text flow created
/// \param flags Flow creation flags
/// \param pageIndex Page index, stored into each created character pointer
/// \returns One flow per text block, if SeparateBlocks is set, otherwise single flow
PDFTextFlows PDFTextFlow::createTextFlows(const PDFTextLayout& layout, FlowFlags flags, PDFInteger pageIndex)
{
    PDFTextFlows result;

    if (!flags.testFlag(SeparateBlocks))
    {
        // All blocks will be merged into this single flow
        result.emplace_back();
    }

    // Without AddLineBreaks, lines are separated by a single space
    QString lineBreak(" ");
    if (flags.testFlag(AddLineBreaks))
    {
#if defined(Q_OS_WIN)
        lineBreak = QString("\r\n");
#elif defined(Q_OS_UNIX)
        lineBreak = QString("\n");
#elif defined(Q_OS_MAC)
        // NOTE(review): Qt documents Q_OS_UNIX as being defined on macOS as well,
        // so this branch is presumably never selected - confirm before relying on it.
        lineBreak = QString("\r");
#else
        static_assert(false, "Fix this code!");
#endif
    }

    size_t textBlockIndex = 0;
    for (const PDFTextBlock& textBlock : layout.getTextBlocks())
    {
        PDFTextFlow currentFlow;

        size_t textLineIndex = 0;
        for (const PDFTextLine& textLine : textBlock.getLines())
        {
            const TextCharacters& characters = textLine.getCharacters();
            for (size_t i = 0, characterCount = characters.size(); i < characterCount; ++i)
            {
                const TextCharacter& currentCharacter = characters[i];
                if (i > 0 && !currentCharacter.character.isSpace())
                {
                    // Jakub Melka: try to guess space between letters
                    const TextCharacter& previousCharacter = characters[i - 1];
                    if (!previousCharacter.character.isSpace() && QLineF(previousCharacter.position, currentCharacter.position).length() > previousCharacter.advance * 1.1)
                    {
                        // Guessed space has no counterpart in the layout => invalid pointer
                        currentFlow.m_text += QChar(' ');
                        currentFlow.m_characterPointers.emplace_back();
                    }
                }

                currentFlow.m_text += currentCharacter.character;

                PDFCharacterPointer pointer;
                pointer.pageIndex = pageIndex;
                pointer.blockIndex = textBlockIndex;
                pointer.lineIndex = textLineIndex;
                pointer.characterIndex = i;
                currentFlow.m_characterPointers.emplace_back(qMove(pointer));
            }

            // Remove soft hyphen, if it is enabled
            if (flags.testFlag(RemoveSoftHyphen) && !characters.empty() && currentFlow.m_text.back() == QChar(QChar::SoftHyphen))
            {
                currentFlow.m_text.chop(1);
                currentFlow.m_characterPointers.pop_back();

                if (!flags.testFlag(AddLineBreaks))
                {
                    // Do not add single empty space - because soft hyphen probably breaks a word
                    ++textLineIndex;
                    continue;
                }
            }

            // Add line break (its characters have invalid pointers)
            currentFlow.m_text += lineBreak;
            currentFlow.m_characterPointers.insert(currentFlow.m_characterPointers.end(), lineBreak.length(), PDFCharacterPointer());

            ++textLineIndex;
        }

        // If we are producing separate blocks, then make flow for each
        // text block, otherwise join flows.
        if (flags.testFlag(SeparateBlocks))
        {
            result.emplace_back(qMove(currentFlow));
        }
        else
        {
            result.back().merge(currentFlow);
        }

        ++textBlockIndex;
    }

    return result;
}
|
||||
|
||||
/// Returns text selection items for the subrange of this flow given by \p index
/// and \p length. Invalid character pointers (characters without counterpart in
/// the real text, such as guessed spaces or line breaks) are skipped. Consecutive
/// valid pointers with the same page and block form one item, stored as a pair
/// (first character, last character); a new item is started whenever the page or
/// the block changes. The result is empty, if the subrange contains no valid pointer.
/// The subrange is clamped to the available character pointers.
/// \param index Index of text selection subrange
/// \param length Length of text selection
PDFTextSelectionItems PDFTextFlow::getTextSelectionItems(size_t index, size_t length) const
{
    PDFTextSelectionItems items;

    const size_t pointerCount = m_characterPointers.size();
    if (length == 0 || index >= pointerCount)
    {
        return items;
    }

    // Clamp the end of the range (written this way to avoid overflow of index + length)
    const size_t endIndex = (length < pointerCount - index) ? (index + length) : pointerCount;

    size_t current = index;
    while (current < endIndex)
    {
        const PDFCharacterPointer& first = m_characterPointers[current];
        if (!first.isValid())
        {
            // Character without counterpart in real text
            ++current;
            continue;
        }

        // Extend the item while we stay in the same block of the same page. Invalid
        // pointers inside the run do not split it, they are just not used as end.
        size_t lastValid = current;
        size_t scan = current + 1;
        for (; scan < endIndex; ++scan)
        {
            const PDFCharacterPointer& candidate = m_characterPointers[scan];
            if (!candidate.isValid())
            {
                continue;
            }

            if (candidate.pageIndex != first.pageIndex || candidate.blockIndex != first.blockIndex)
            {
                break;
            }

            lastValid = scan;
        }

        items.emplace_back(first, m_characterPointers[lastValid]);
        current = scan;
    }

    return items;
}
|
||||
|
||||
/// Returns context of the subrange given by \p index and \p length. The subrange
/// is extended to the left and to the right, as long as the neighbouring character
/// pointers report the same line (see PDFCharacterPointer::hasSameLine), and the
/// corresponding part of the flow text is returned.
/// \param index Index of text selection subrange
/// \param length Length of text selection (must be greater than zero)
QString PDFTextFlow::getContext(size_t index, size_t length) const
{
    Q_ASSERT(length > 0);

    // Position of the last character of the original subrange
    size_t lastPosition = index + length - 1;

    // Extend to the left
    size_t firstPosition = index;
    for (; firstPosition > 0; --firstPosition)
    {
        if (!m_characterPointers[firstPosition - 1].hasSameLine(m_characterPointers[firstPosition]))
        {
            break;
        }
    }

    // Extend to the right
    const size_t maxPosition = m_characterPointers.size() - 1;
    for (; lastPosition < maxPosition; ++lastPosition)
    {
        if (!m_characterPointers[lastPosition].hasSameLine(m_characterPointers[lastPosition + 1]))
        {
            break;
        }
    }

    const size_t contextLength = lastPosition - firstPosition + 1;
    return m_text.mid(int(firstPosition), int(contextLength));
}
|
||||
|
||||
bool PDFCharacterPointer::hasSameLine(const PDFCharacterPointer& other) const
|
||||
{
|
||||
return pageIndex == other.pageIndex && blockIndex == other.blockIndex && lineIndex == other.lineIndex;
|
||||
}
|
||||
|
||||
} // namespace pdf
|
||||
|
@@ -1,4 +1,4 @@
|
||||
// Copyright (C) 2019 Jakub Melka
|
||||
// Copyright (C) 2019-2020 Jakub Melka
|
||||
//
|
||||
// This file is part of PdfForQt.
|
||||
//
|
||||
@@ -24,9 +24,11 @@
|
||||
#include <QPainterPath>
|
||||
|
||||
#include <set>
|
||||
#include <compare>
|
||||
|
||||
namespace pdf
|
||||
{
|
||||
class PDFTextLayout;
|
||||
|
||||
struct PDFTextCharacterInfo
|
||||
{
|
||||
@@ -152,6 +154,100 @@ private:
|
||||
|
||||
using PDFTextBlocks = std::vector<PDFTextBlock>;
|
||||
|
||||
/// Character pointer points to some character in text layout.
|
||||
/// It also has page index to decide, which page the pointer points to.
|
||||
struct PDFCharacterPointer
|
||||
{
|
||||
auto operator<=>(const PDFCharacterPointer&) const = default;
|
||||
|
||||
/// Returns true, if character pointer is valid and points to the correct location
|
||||
bool isValid() const { return pageIndex > -1; }
|
||||
|
||||
/// Returns true, if character belongs to same line
|
||||
bool hasSameLine(const PDFCharacterPointer& other) const;
|
||||
|
||||
int pageIndex = -1;
|
||||
size_t blockIndex = 0;
|
||||
size_t lineIndex = 0;
|
||||
size_t characterIndex = 0;
|
||||
};
|
||||
|
||||
using PDFTextSelectionItem = std::pair<PDFCharacterPointer, PDFCharacterPointer>;
|
||||
using PDFTextSelectionItems = std::vector<PDFTextSelectionItem>;
|
||||
|
||||
/// Text selection, can be used across multiple pages.
|
||||
class PDFTextSelection
|
||||
{
|
||||
public:
|
||||
explicit PDFTextSelection(PDFTextSelectionItems&& items);
|
||||
|
||||
private:
|
||||
PDFTextSelectionItems m_items;
|
||||
};
|
||||
|
||||
struct PDFFindResult
|
||||
{
|
||||
/// Matched string during search
|
||||
QString matched;
|
||||
|
||||
/// Context (text before and after match)
|
||||
QString context;
|
||||
|
||||
/// Matched selection (can be multiple items, if selection
|
||||
/// is spanned between multiple blocks)
|
||||
PDFTextSelectionItems textSelectionItems;
|
||||
};
|
||||
using PDFFindResults = std::vector<PDFFindResult>;
|
||||
|
||||
class PDFTextFlow;
|
||||
using PDFTextFlows = std::vector<PDFTextFlow>;
|
||||
|
||||
/// This class represents a portion of continuous text on the page. It can
|
||||
/// consists of multiple blocks (which follow reading order).
|
||||
class PDFTextFlow
|
||||
{
|
||||
public:
|
||||
|
||||
enum FlowFlag
|
||||
{
|
||||
None = 0x0000,
|
||||
SeparateBlocks = 0x0001, ///< Create flow for each block
|
||||
RemoveSoftHyphen = 0x0002, ///< Removes 'soft hyphen' unicode character from end-of-line (character 0x00AD)
|
||||
AddLineBreaks = 0x0004, ///< Add line break characters to the end of line
|
||||
};
|
||||
Q_DECLARE_FLAGS(FlowFlags, FlowFlag)
|
||||
|
||||
/// Finds simple text in current text flow. All text occurences are returned.
|
||||
/// \param text Text to be found
|
||||
/// \param caseSensitivity Case sensitivity
|
||||
PDFFindResults find(const QString& text, Qt::CaseSensitivity caseSensitivity) const;
|
||||
|
||||
/// Merge data from \p next flow (i.e. connect two consecutive flows)
|
||||
void merge(const PDFTextFlow& next);
|
||||
|
||||
/// Creates text flows from text layout, according to creation flags.
|
||||
/// \param layout Layout, from which is text flow created
|
||||
/// \param flags Flow creation flags
|
||||
/// \param pageIndex Page index
|
||||
static PDFTextFlows createTextFlows(const PDFTextLayout& layout, FlowFlags flags, PDFInteger pageIndex);
|
||||
|
||||
private:
|
||||
/// Returns text selection from index and length. Returned text selection can also
|
||||
/// be empty (for example, if only single space character is selected, which has
|
||||
/// no counterpart in real text)
|
||||
/// \param index Index of text selection subrange
|
||||
/// \param length Length of text selection
|
||||
PDFTextSelectionItems getTextSelectionItems(size_t index, size_t length) const;
|
||||
|
||||
/// Returns context for text selection (or empty string, if text selection is empty)
|
||||
/// \param index Index of text selection subrange
|
||||
/// \param length Length of text selection
|
||||
QString getContext(size_t index, size_t length) const;
|
||||
|
||||
QString m_text;
|
||||
std::vector<PDFCharacterPointer> m_characterPointers;
|
||||
};
|
||||
|
||||
/// Text layout of single page. Can handle various fonts, various angles of lines
|
||||
/// and vertically oriented text. It performs the "docstrum" algorithm.
|
||||
class PDFTextLayout
|
||||
|
Reference in New Issue
Block a user