Advanced find (algorithms)

This commit is contained in:
Jakub Melka
2020-01-03 18:11:03 +01:00
parent b490dc7c89
commit 54ea7dcb7d
12 changed files with 654 additions and 5 deletions

View File

@@ -92,7 +92,7 @@ private:
std::map<PDFInteger, CompileTask> m_tasks;
};
class PDFAsynchronousTextLayoutCompiler : public QObject
class PDFFORQTLIBSHARED_EXPORT PDFAsynchronousTextLayoutCompiler : public QObject
{
Q_OBJECT
@@ -130,6 +130,9 @@ public:
/// \p textLayoutChanged is emitted.
void makeTextLayout();
/// Returns true, if text layout is ready
bool isTextLayoutReady() const { return m_textLayouts.has_value(); }
signals:
void textLayoutChanged();

View File

@@ -994,6 +994,7 @@ QRectF PDFDrawWidgetProxy::fromDeviceSpace(const QRectF& rect) const
/// Reacts to a change of the text layout: first requests a repaint via
/// repaintNeeded() and then forwards the notification to listeners of this
/// proxy via textLayoutChanged().
/// NOTE(review): presumably connected to the text layout compiler's
/// textLayoutChanged() signal - confirm against the connect() call.
void PDFDrawWidgetProxy::onTextLayoutChanged()
{
emit repaintNeeded();
emit textLayoutChanged();
}
bool PDFDrawWidgetProxy::isBlockMode() const

View File

@@ -276,6 +276,7 @@ public:
const PDFCMSManager* getCMSManager() const;
PDFProgress* getProgress() const { return m_progress; }
void setProgress(PDFProgress* progress) { m_progress = progress; }
PDFAsynchronousTextLayoutCompiler* getTextLayoutCompiler() const { return m_textLayoutCompiler; }
void setFeatures(PDFRenderer::Features features);
void setPreferredMeshResolutionRatio(PDFReal ratio);
@@ -291,6 +292,7 @@ signals:
void renderingError(PDFInteger pageIndex, const QList<PDFRenderError>& errors);
void repaintNeeded();
void pageImageChanged(bool all, const std::vector<PDFInteger>& pages);
void textLayoutChanged();
private:
struct LayoutItem

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2019 Jakub Melka
// Copyright (C) 2019-2020 Jakub Melka
//
// This file is part of PdfForQt.
//
@@ -593,4 +593,170 @@ QDataStream& operator>>(QDataStream& stream, PDFTextLayoutSettings& settings)
return stream;
}
/// Constructs the text selection by taking ownership of \p items.
/// \param items Selection items, moved into the selection
PDFTextSelection::PDFTextSelection(PDFTextSelectionItems&& items)
    : m_items(std::move(items))
{
    // Nothing else to initialize
}
/// Finds all occurrences (including overlapping ones) of \p text in this
/// text flow. Matches which have no counterpart in the real text (their
/// selection items are empty) are not reported.
/// \param text Text to be found; an empty text yields no results
/// \param caseSensitivity Case sensitivity of the search
/// \returns Find results in order of occurrence in the flow
PDFFindResults PDFTextFlow::find(const QString& text, Qt::CaseSensitivity caseSensitivity) const
{
    PDFFindResults results;

    // An empty pattern "matches" at every position with zero length, which
    // would violate the precondition of getContext() (length > 0).
    if (text.isEmpty())
    {
        return results;
    }

    const auto matchLength = text.length();
    for (auto index = m_text.indexOf(text, 0, caseSensitivity);
         index != -1;
         index = m_text.indexOf(text, index + 1, caseSensitivity))
    {
        PDFTextSelectionItems selectionItems = getTextSelectionItems(index, matchLength);
        if (selectionItems.empty())
        {
            // Nothing selectable - do not spend time on computing the context
            continue;
        }

        PDFFindResult result;
        result.matched = text;
        result.textSelectionItems = qMove(selectionItems);
        result.context = getContext(index, matchLength);
        results.emplace_back(qMove(result));
    }

    return results;
}
void PDFTextFlow::merge(const PDFTextFlow& next)
{
m_text += next.m_text;
m_characterPointers.insert(m_characterPointers.end(), next.m_characterPointers.cbegin(), next.m_characterPointers.cend());
}
/// Creates text flows from the text layout of a single page.
///
/// The text of each flow is accompanied by a vector of character pointers of
/// the same length. Characters taken from the layout get a valid pointer;
/// synthetic characters (guessed spaces, line separators) get a
/// default-constructed (invalid) pointer.
///
/// \param layout Layout, from which the text flows are created
/// \param flags Flow creation flags. Without SeparateBlocks, a single flow
///        containing all blocks is returned (even for an empty layout).
/// \param pageIndex Page index stored into each valid character pointer
/// \returns Created text flows
PDFTextFlows PDFTextFlow::createTextFlows(const PDFTextLayout& layout, FlowFlags flags, PDFInteger pageIndex)
{
    PDFTextFlows result;

    const bool separateBlocks = flags.testFlag(SeparateBlocks);
    const bool addLineBreaks = flags.testFlag(AddLineBreaks);
    const bool removeSoftHyphen = flags.testFlag(RemoveSoftHyphen);

    if (!separateBlocks)
    {
        // All blocks are merged into this single flow
        result.emplace_back();
    }

    // Lines are separated by a single space, unless line breaks are requested
    QString lineBreak(" ");
    if (addLineBreaks)
    {
        // Note: Qt defines Q_OS_UNIX also on macOS, so a separate macOS branch
        // would never be reached (and macOS uses "\n" as line break anyway).
#if defined(Q_OS_WIN)
        lineBreak = QString("\r\n");
#elif defined(Q_OS_UNIX)
        lineBreak = QString("\n");
#else
        static_assert(false, "Fix this code!");
#endif
    }

    size_t textBlockIndex = 0;
    for (const PDFTextBlock& textBlock : layout.getTextBlocks())
    {
        PDFTextFlow currentFlow;

        size_t textLineIndex = 0;
        for (const PDFTextLine& textLine : textBlock.getLines())
        {
            const TextCharacters& characters = textLine.getCharacters();
            for (size_t i = 0, characterCount = characters.size(); i < characterCount; ++i)
            {
                const TextCharacter& currentCharacter = characters[i];

                if (i > 0 && !currentCharacter.character.isSpace())
                {
                    // Jakub Melka: try to guess space between letters
                    const TextCharacter& previousCharacter = characters[i - 1];
                    if (!previousCharacter.character.isSpace() &&
                        QLineF(previousCharacter.position, currentCharacter.position).length() > previousCharacter.advance * 1.1)
                    {
                        // Synthetic space has no counterpart in the layout,
                        // so it gets an invalid character pointer
                        currentFlow.m_text += QChar(' ');
                        currentFlow.m_characterPointers.emplace_back();
                    }
                }

                currentFlow.m_text += currentCharacter.character;

                PDFCharacterPointer pointer;
                pointer.pageIndex = pageIndex;
                pointer.blockIndex = textBlockIndex;
                pointer.lineIndex = textLineIndex;
                pointer.characterIndex = i;
                currentFlow.m_characterPointers.emplace_back(qMove(pointer));
            }

            bool appendLineSeparator = true;

            // Remove soft hyphen, if it is enabled
            if (removeSoftHyphen && !characters.empty() && currentFlow.m_text.back() == QChar(QChar::SoftHyphen))
            {
                currentFlow.m_text.chop(1);
                currentFlow.m_characterPointers.pop_back();

                // Do not add single empty space - because soft hyphen probably
                // breaks a word. Explicit line breaks are still added.
                appendLineSeparator = addLineBreaks;
            }

            if (appendLineSeparator)
            {
                // Add line break (one invalid pointer per separator character)
                currentFlow.m_text += lineBreak;
                currentFlow.m_characterPointers.insert(currentFlow.m_characterPointers.end(), lineBreak.length(), PDFCharacterPointer());
            }

            ++textLineIndex;
        }

        // If we are producing separate blocks, then make flow for each
        // text block, otherwise join flows.
        if (separateBlocks)
        {
            result.emplace_back(qMove(currentFlow));
        }
        else
        {
            result.back().merge(currentFlow);
        }

        ++textBlockIndex;
    }

    return result;
}
/// Returns text selection items for the subrange [index, index + length) of
/// the flow text. One item (pair of first and last character pointer) is
/// produced for each run of characters lying in the same page and text block.
/// Synthetic characters (invalid pointers, such as guessed spaces or line
/// separators) never start or end an item, but they do not split a run either.
/// The result is empty, if the range contains no real character.
/// NOTE(review): grouping by block follows the PDFFindResult documentation
/// ("multiple items, if selection is spanned between multiple blocks") -
/// confirm against the consumer of the selection items.
/// \param index Index of text selection subrange
/// \param length Length of text selection
PDFTextSelectionItems PDFTextFlow::getTextSelectionItems(size_t index, size_t length) const
{
    PDFTextSelectionItems items;

    // Clamp the requested range to the available character pointers
    const size_t pointerCount = m_characterPointers.size();
    const size_t rangeBegin = qMin(index, pointerCount);
    const size_t rangeEnd = rangeBegin + qMin(length, pointerCount - rangeBegin);

    size_t current = rangeBegin;
    while (current < rangeEnd)
    {
        const PDFCharacterPointer& runStart = m_characterPointers[current];
        if (!runStart.isValid())
        {
            // Synthetic character cannot start a selection item
            ++current;
            continue;
        }

        // Extend the run while we stay in the same page and block
        size_t lastValid = current;
        size_t next = current + 1;
        for (; next < rangeEnd; ++next)
        {
            const PDFCharacterPointer& candidate = m_characterPointers[next];
            if (!candidate.isValid())
            {
                // Synthetic character - bridge it, but never end an item on it
                continue;
            }

            if (candidate.pageIndex != runStart.pageIndex || candidate.blockIndex != runStart.blockIndex)
            {
                break;
            }

            lastValid = next;
        }

        items.emplace_back(runStart, m_characterPointers[lastValid]);
        current = next;
    }

    return items;
}
/// Returns context of the subrange [index, index + length) of the flow text.
/// The subrange is extended to both sides as long as the neighbouring
/// character pointers report the same line (see PDFCharacterPointer::hasSameLine).
/// \param index Index of text selection subrange
/// \param length Length of text selection, must be positive
QString PDFTextFlow::getContext(size_t index, size_t length) const
{
    Q_ASSERT(length > 0);

    // Extend the beginning to the left
    size_t contextBegin = index;
    for (; contextBegin > 0; --contextBegin)
    {
        if (!m_characterPointers[contextBegin - 1].hasSameLine(m_characterPointers[contextBegin]))
        {
            break;
        }
    }

    // Extend the (inclusive) end to the right
    const size_t lastPointerIndex = m_characterPointers.size() - 1;
    size_t contextEnd = index + length - 1;
    for (; contextEnd < lastPointerIndex; ++contextEnd)
    {
        if (!m_characterPointers[contextEnd].hasSameLine(m_characterPointers[contextEnd + 1]))
        {
            break;
        }
    }

    const size_t contextLength = contextEnd - contextBegin + 1;
    return m_text.mid(int(contextBegin), int(contextLength));
}
/// Returns true, if \p other points to the same page, the same text block
/// and the same line as this pointer (character index is not compared).
bool PDFCharacterPointer::hasSameLine(const PDFCharacterPointer& other) const
{
    if (pageIndex != other.pageIndex)
    {
        return false;
    }

    if (blockIndex != other.blockIndex)
    {
        return false;
    }

    return lineIndex == other.lineIndex;
}
} // namespace pdf

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2019 Jakub Melka
// Copyright (C) 2019-2020 Jakub Melka
//
// This file is part of PdfForQt.
//
@@ -24,9 +24,11 @@
#include <QPainterPath>
#include <set>
#include <compare>
namespace pdf
{
class PDFTextLayout;
struct PDFTextCharacterInfo
{
@@ -152,6 +154,100 @@ private:
using PDFTextBlocks = std::vector<PDFTextBlock>;
/// Character pointer points to some character in text layout, identified by
/// its block, line and character index. It also has page index to decide,
/// which page the pointer points to. A default-constructed pointer has page
/// index -1 and is therefore invalid.
struct PDFCharacterPointer
{
/// Defaulted three-way comparison of all members
auto operator<=>(const PDFCharacterPointer&) const = default;
/// Returns true, if character pointer is valid, i.e. its page index is not negative
bool isValid() const { return pageIndex > -1; }
/// Returns true, if \p other has the same page, block and line index as this pointer
bool hasSameLine(const PDFCharacterPointer& other) const;
int pageIndex = -1; ///< Index of the page, -1 for invalid pointer
size_t blockIndex = 0; ///< Index of the text block in the text layout
size_t lineIndex = 0; ///< Index of the line in the text block
size_t characterIndex = 0; ///< Index of the character in the line
};
using PDFTextSelectionItem = std::pair<PDFCharacterPointer, PDFCharacterPointer>;
using PDFTextSelectionItems = std::vector<PDFTextSelectionItem>;
/// Text selection, can be used across multiple pages. The selection is
/// stored as a list of items, each item being a pair of character pointers.
class PDFTextSelection
{
public:
/// Constructs the selection from \p items, which are moved into the selection
explicit PDFTextSelection(PDFTextSelectionItems&& items);
private:
/// Items of the selection
PDFTextSelectionItems m_items;
};
/// Single result of a text search in a text flow.
struct PDFFindResult
{
/// Matched string during search
QString matched;
/// Context (text before and after match)
QString context;
/// Matched selection (can be multiple items, if the selection
/// spans multiple blocks)
PDFTextSelectionItems textSelectionItems;
};
using PDFFindResults = std::vector<PDFFindResult>;
class PDFTextFlow;
using PDFTextFlows = std::vector<PDFTextFlow>;
/// This class represents a portion of continuous text on the page. It can
/// consist of multiple blocks (which follow reading order). Each character
/// of the text has an associated character pointer into the text layout.
class PDFTextFlow
{
public:
/// Flags controlling creation of text flows
enum FlowFlag
{
None = 0x0000,
SeparateBlocks = 0x0001, ///< Create flow for each block
RemoveSoftHyphen = 0x0002, ///< Removes 'soft hyphen' unicode character from end-of-line (character 0x00AD)
AddLineBreaks = 0x0004, ///< Add line break characters to the end of line
};
Q_DECLARE_FLAGS(FlowFlags, FlowFlag)
/// Finds simple text in current text flow. All text occurrences are returned.
/// \param text Text to be found
/// \param caseSensitivity Case sensitivity
PDFFindResults find(const QString& text, Qt::CaseSensitivity caseSensitivity) const;
/// Merge data from \p next flow (i.e. connect two consecutive flows)
void merge(const PDFTextFlow& next);
/// Creates text flows from text layout, according to creation flags.
/// \param layout Layout, from which the text flow is created
/// \param flags Flow creation flags
/// \param pageIndex Page index
static PDFTextFlows createTextFlows(const PDFTextLayout& layout, FlowFlags flags, PDFInteger pageIndex);
private:
/// Returns text selection from index and length. Returned text selection can also
/// be empty (for example, if only single space character is selected, which has
/// no counterpart in real text)
/// \param index Index of text selection subrange
/// \param length Length of text selection
PDFTextSelectionItems getTextSelectionItems(size_t index, size_t length) const;
/// Returns context for text selection (or empty string, if text selection is empty)
/// \param index Index of text selection subrange
/// \param length Length of text selection
QString getContext(size_t index, size_t length) const;
/// Text of the flow
QString m_text;
/// Character pointers; the flow creation code appends one pointer per character of m_text
std::vector<PDFCharacterPointer> m_characterPointers;
};
/// Text layout of single page. Can handle various fonts, various angles of lines
/// and vertically oriented text. It performs the "docstrum" algorithm.
class PDFTextLayout