Advanced search (second part)

This commit is contained in:
Jakub Melka 2020-01-04 17:58:55 +01:00
parent 54ea7dcb7d
commit bf4ede1574
6 changed files with 274 additions and 29 deletions

View File

@ -133,6 +133,9 @@ public:
/// Returns true, if text layout is ready
bool isTextLayoutReady() const { return m_textLayouts.has_value(); }
/// Returns text layout storage (if it is ready), or nullptr
const PDFTextLayoutStorage* getTextLayoutStorage() const { return isTextLayoutReady() ? &m_textLayouts.value() : nullptr; }
signals:
void textLayoutChanged();

View File

@ -571,6 +571,64 @@ void PDFTextLayoutStorage::setTextLayout(PDFInteger pageIndex, const PDFTextLayo
layoutStream << result;
}
/// Searches every page of the stored text layouts for occurrences of \p text.
/// Pages are processed with the parallel execution policy; matches from all pages
/// are gathered into one container, which is sorted before it is returned.
/// \param text Text to be found
/// \param caseSensitivity Case sensitivity
/// \param flowFlags Text flow flags used when text flows of a page are created
PDFFindResults PDFTextLayoutStorage::find(const QString& text, Qt::CaseSensitivity caseSensitivity, PDFTextFlow::FlowFlags flowFlags) const
{
    // Shared output of all page searches, the mutex serializes appends to it
    PDFFindResults collected;
    QMutex collectedMutex;

    auto searchPage = [this, &text, caseSensitivity, flowFlags, &collected, &collectedMutex](size_t pageIndex)
    {
        PDFTextLayout pageLayout = getTextLayout(pageIndex);
        PDFTextFlows pageFlows = PDFTextFlow::createTextFlows(pageLayout, flowFlags, pageIndex);

        for (const PDFTextFlow& flow : pageFlows)
        {
            PDFFindResults found = flow.find(text, caseSensitivity);
            if (found.empty())
            {
                // Nothing matched in this flow, so there is no reason to take the lock
                continue;
            }

            QMutexLocker guard(&collectedMutex);
            collected.insert(collected.end(), found.begin(), found.end());
        }
    };

    auto pages = PDFIntegerRange<size_t>(0, m_offsets.size());
    std::for_each(std::execution::par, pages.begin(), pages.end(), searchPage);

    // Appends arrive in arbitrary order, sorting makes the output deterministic
    std::sort(collected.begin(), collected.end());
    return collected;
}
/// Searches every page of the stored text layouts for matches of \p expression.
/// Pages are processed with the parallel execution policy; matches from all pages
/// are gathered into one container, which is sorted before it is returned.
/// \param expression Regular expression to be matched
/// \param flowFlags Text flow flags used when text flows of a page are created
PDFFindResults PDFTextLayoutStorage::find(const QRegularExpression& expression, PDFTextFlow::FlowFlags flowFlags) const
{
    // Shared output of all page searches, the mutex serializes appends to it
    PDFFindResults collected;
    QMutex collectedMutex;

    auto searchPage = [this, &expression, flowFlags, &collected, &collectedMutex](size_t pageIndex)
    {
        PDFTextLayout pageLayout = getTextLayout(pageIndex);
        PDFTextFlows pageFlows = PDFTextFlow::createTextFlows(pageLayout, flowFlags, pageIndex);

        for (const PDFTextFlow& flow : pageFlows)
        {
            PDFFindResults found = flow.find(expression);
            if (found.empty())
            {
                // Nothing matched in this flow, so there is no reason to take the lock
                continue;
            }

            QMutexLocker guard(&collectedMutex);
            collected.insert(collected.end(), found.begin(), found.end());
        }
    };

    auto pages = PDFIntegerRange<size_t>(0, m_offsets.size());
    std::for_each(std::execution::par, pages.begin(), pages.end(), searchPage);

    // Appends arrive in arbitrary order, sorting makes the output deterministic
    std::sort(collected.begin(), collected.end());
    return collected;
}
QDataStream& operator<<(QDataStream& stream, const PDFTextLayoutSettings& settings)
{
stream << settings.samples;
@ -622,6 +680,33 @@ PDFFindResults PDFTextFlow::find(const QString& text, Qt::CaseSensitivity caseSe
return results;
}
/// Finds all matches of \p expression in the text of this flow. For each match,
/// the matched string, its text selection items and its context are stored.
/// Matches, for which no text selection items are produced, are not reported.
/// \param expression Regular expression to be matched
PDFFindResults PDFTextFlow::find(const QRegularExpression& expression) const
{
    PDFFindResults found;

    QRegularExpressionMatchIterator matchIterator = expression.globalMatch(m_text,
                                                                           0,
                                                                           QRegularExpression::NormalMatch,
                                                                           QRegularExpression::NoMatchOption);
    for (; matchIterator.hasNext(); )
    {
        QRegularExpressionMatch currentMatch = matchIterator.next();
        Q_ASSERT(currentMatch.hasMatch());

        const int matchStart = currentMatch.capturedStart();
        const int matchLength = currentMatch.capturedLength();

        PDFFindResult item;
        item.matched = currentMatch.captured();
        item.textSelectionItems = getTextSelectionItems(matchStart, matchLength);
        item.context = getContext(matchStart, matchLength);

        if (item.textSelectionItems.empty())
        {
            // Nothing selectable for this match, do not report it
            continue;
        }

        found.emplace_back(qMove(item));
    }

    return found;
}
void PDFTextFlow::merge(const PDFTextFlow& next)
{
m_text += next.m_text;
@ -728,8 +813,25 @@ PDFTextSelectionItems PDFTextFlow::getTextSelectionItems(size_t index, size_t le
auto it = std::next(m_characterPointers.cbegin(), index);
auto itEnd = std::next(m_characterPointers.cbegin(), index + length);
s
while (it != itEnd)
{
// Skip invalid items, find first valid
if (!it->isValid())
{
++it;
continue;
}
auto itSelectionStart = it;
while (it != itEnd && it->isValid() && it->hasSameBlock(*itSelectionStart))
{
++it;
}
auto itSelectionEnd = std::prev(it);
items.emplace_back(*itSelectionStart, *itSelectionEnd);
}
std::sort(items.begin(), items.end());
return items;
}
@ -754,9 +856,22 @@ QString PDFTextFlow::getContext(size_t index, size_t length) const
return m_text.mid(int(index), int(length));
}
/// Returns true, if this character and \p other have equal page index
/// and equal block index.
bool PDFCharacterPointer::hasSameBlock(const PDFCharacterPointer& other) const
{
    if (pageIndex != other.pageIndex)
    {
        return false;
    }

    return blockIndex == other.blockIndex;
}
/// Returns true, if this character and \p other belong to the same block
/// (see hasSameBlock) and have equal line index.
bool PDFCharacterPointer::hasSameLine(const PDFCharacterPointer& other) const
{
    // Single return statement; the block comparison is delegated to hasSameBlock,
    // so page/block equality is defined in exactly one place.
    return hasSameBlock(other) && lineIndex == other.lineIndex;
}
/// Orders find results by their first text selection item. Both compared
/// results are expected to have at least one text selection item (asserted).
bool PDFFindResult::operator<(const PDFFindResult& other) const
{
    Q_ASSERT(!textSelectionItems.empty());
    Q_ASSERT(!other.textSelectionItems.empty());

    const auto& ownFirstItem = textSelectionItems.front();
    const auto& otherFirstItem = other.textSelectionItems.front();
    return ownFirstItem < otherFirstItem;
}
} // namespace pdf

View File

@ -163,6 +163,9 @@ struct PDFCharacterPointer
/// Returns true, if character pointer is valid and points to the correct location
bool isValid() const { return pageIndex > -1; }
/// Returns true, if character belongs to same block
bool hasSameBlock(const PDFCharacterPointer& other) const;
/// Returns true, if character belongs to same line
bool hasSameLine(const PDFCharacterPointer& other) const;
@ -187,6 +190,8 @@ private:
struct PDFFindResult
{
bool operator<(const PDFFindResult& other) const;
/// Matched string during search
QString matched;
@ -222,6 +227,10 @@ public:
/// \param caseSensitivity Case sensitivity
PDFFindResults find(const QString& text, Qt::CaseSensitivity caseSensitivity) const;
/// Finds regular expression matches in current text flow. All text occurrences are returned.
/// \param expression Regular expression to be matched
PDFFindResults find(const QRegularExpression& expression) const;
/// Merge data from \p next flow (i.e. connect two consecutive flows)
void merge(const PDFTextFlow& next);
@ -297,7 +306,7 @@ private:
/// For writing, mutex is used to synchronize asynchronous writes, for reading
/// no mutex is used at all. For this reason, both reading/writing at the same time
/// is prohibited, it is not thread safe.
class PDFTextLayoutStorage
class PDFFORQTLIBSHARED_EXPORT PDFTextLayoutStorage
{
public:
explicit inline PDFTextLayoutStorage() = default;
@ -320,6 +329,17 @@ public:
/// \param mutex Mutex for locking (calls of setTextLayout from multiple threads)
void setTextLayout(PDFInteger pageIndex, const PDFTextLayout& layout, QMutex* mutex);
/// Finds simple text in all pages. All text occurrences are returned.
/// \param text Text to be found
/// \param caseSensitivity Case sensitivity
/// \param flowFlags Text flow flags
PDFFindResults find(const QString& text, Qt::CaseSensitivity caseSensitivity, PDFTextFlow::FlowFlags flowFlags) const;
/// Finds regular expression matches in all pages. All text occurrences are returned.
/// \param expression Regular expression to be matched
/// \param flowFlags Text flow flags
PDFFindResults find(const QRegularExpression& expression, PDFTextFlow::FlowFlags flowFlags) const;
private:
std::vector<int> m_offsets;
QByteArray m_textLayouts;

View File

@ -35,6 +35,8 @@ PDFAdvancedFindWidget::PDFAdvancedFindWidget(pdf::PDFDrawWidgetProxy* proxy, QWi
{
ui->setupUi(this);
ui->resultsTableWidget->setHorizontalHeaderLabels({ tr("Page No."), tr("Phrase"), tr("Context") });
connect(ui->regularExpressionsCheckbox, &QCheckBox::clicked, this, &PDFAdvancedFindWidget::updateUI);
connect(m_proxy, &pdf::PDFDrawWidgetProxy::textLayoutChanged, this, &PDFAdvancedFindWidget::performSearch);
updateUI();
@ -62,6 +64,7 @@ void PDFAdvancedFindWidget::on_searchButton_clicked()
m_parameters.isRegularExpression = ui->regularExpressionsCheckbox->isChecked();
m_parameters.isDotMatchingEverything = ui->dotMatchesEverythingCheckBox->isChecked();
m_parameters.isMultiline = ui->multilineMatchingCheckBox->isChecked();
m_parameters.isSoftHyphenRemoved = ui->removeSoftHyphenCheckBox->isChecked();
m_parameters.isSearchFinished = m_parameters.phrase.isEmpty();
if (m_parameters.isSearchFinished)
@ -85,6 +88,9 @@ void PDFAdvancedFindWidget::on_searchButton_clicked()
}
}
m_findResults.clear();
updateResultsUI();
pdf::PDFAsynchronousTextLayoutCompiler* compiler = m_proxy->getTextLayoutCompiler();
if (compiler->isTextLayoutReady())
{
@ -104,6 +110,25 @@ void PDFAdvancedFindWidget::updateUI()
ui->regularExpressionSettingsGroupBox->setEnabled(enableRegularExpressionUI);
}
void PDFAdvancedFindWidget::updateResultsUI()
{
ui->tabWidget->setTabText(ui->tabWidget->indexOf(ui->resultsTab), !m_findResults.empty() ? tr("Results (%1)").arg(m_findResults.size()) : tr("Results"));
ui->resultsTableWidget->setRowCount(static_cast<int>(m_findResults.size()));
for (int i = 0, rowCount = int(m_findResults.size()); i < rowCount; ++i)
{
const pdf::PDFFindResult& findResult = m_findResults[i];
ui->resultsTableWidget->setItem(i, 0, new QTableWidgetItem(QString::number(findResult.textSelectionItems.front().first.pageIndex + 1)));
ui->resultsTableWidget->setItem(i, 1, new QTableWidgetItem(findResult.matched));
ui->resultsTableWidget->setItem(i, 2, new QTableWidgetItem(findResult.context));
}
if (!m_findResults.empty())
{
ui->tabWidget->setCurrentWidget(ui->resultsTab);
}
}
void PDFAdvancedFindWidget::performSearch()
{
if (m_parameters.isSearchFinished)
@ -120,7 +145,62 @@ void PDFAdvancedFindWidget::performSearch()
return;
}
// Prepare string to search
bool useRegularExpression = m_parameters.isRegularExpression;
QString expression = m_parameters.phrase;
if (m_parameters.isWholeWordsOnly)
{
if (useRegularExpression)
{
expression = QString("\\b%1\\b").arg(expression);
}
else
{
expression = QString("\\b%1\\b").arg(QRegularExpression::escape(expression));
}
useRegularExpression = true;
}
pdf::PDFTextFlow::FlowFlags flowFlags = pdf::PDFTextFlow::SeparateBlocks;
if (m_parameters.isSoftHyphenRemoved)
{
flowFlags |= pdf::PDFTextFlow::RemoveSoftHyphen;
}
if (m_parameters.isRegularExpression)
{
flowFlags |= pdf::PDFTextFlow::AddLineBreaks;
}
const pdf::PDFTextLayoutStorage* textLayoutStorage = compiler->getTextLayoutStorage();
if (!useRegularExpression)
{
// Use simple text search
Qt::CaseSensitivity caseSensitivity = m_parameters.isCaseSensitive ? Qt::CaseSensitive : Qt::CaseInsensitive;
m_findResults = textLayoutStorage->find(expression, caseSensitivity, flowFlags);
}
else
{
// Use regular expression search
QRegularExpression::PatternOptions patternOptions = QRegularExpression::UseUnicodePropertiesOption | QRegularExpression::OptimizeOnFirstUsageOption;
if (!m_parameters.isCaseSensitive)
{
patternOptions |= QRegularExpression::CaseInsensitiveOption;
}
if (m_parameters.isDotMatchingEverything)
{
patternOptions |= QRegularExpression::DotMatchesEverythingOption;
}
if (m_parameters.isMultiline)
{
patternOptions |= QRegularExpression::MultilineOption;
}
QRegularExpression regularExpression(expression, patternOptions);
m_findResults = textLayoutStorage->find(regularExpression, flowFlags);
}
updateResultsUI();
}
} // namespace pdfviewer

View File

@ -19,6 +19,7 @@
#define PDFADVANCEDFINDWIDGET_H
#include "pdfglobal.h"
#include "pdftextlayout.h"
#include <QWidget>
@ -51,6 +52,7 @@ private slots:
private:
void updateUI();
void updateResultsUI();
void performSearch();
struct SearchParameters
@ -62,6 +64,7 @@ private:
bool isDotMatchingEverything = false;
bool isMultiline = false;
bool isSearchFinished = false;
bool isSoftHyphenRemoved = false;
};
Ui::PDFAdvancedFindWidget* ui;
@ -69,6 +72,7 @@ private:
pdf::PDFDrawWidgetProxy* m_proxy;
const pdf::PDFDocument* m_document;
SearchParameters m_parameters;
pdf::PDFFindResults m_findResults;
};
} // namespace pdfviewer

View File

@ -30,27 +30,13 @@
<string>Search Settings</string>
</property>
<layout class="QGridLayout" name="searchSettingsGroupBoxLayout" columnstretch="0,1,0">
<item row="0" column="0">
<widget class="QLabel" name="searchLabel">
<property name="text">
<string>Search for:</string>
</property>
</widget>
</item>
<item row="0" column="1" colspan="2">
<widget class="QLineEdit" name="searchPhraseEdit"/>
</item>
<item row="4" column="2">
<widget class="QPushButton" name="searchButton">
<item row="3" column="1" colspan="2">
<widget class="QCheckBox" name="regularExpressionsCheckbox">
<property name="text">
<string>Search</string>
</property>
</widget>
</item>
<item row="1" column="1" colspan="2">
<widget class="QCheckBox" name="caseSensitiveCheckBox">
<property name="text">
<string>Case sensitive</string>
<string>Use regular expressions</string>
</property>
</widget>
</item>
@ -61,10 +47,31 @@
</property>
</widget>
</item>
<item row="3" column="1" colspan="2">
<widget class="QCheckBox" name="regularExpressionsCheckbox">
<item row="0" column="0">
<widget class="QLabel" name="searchLabel">
<property name="text">
<string>Use regular expressions</string>
<string>Search for:</string>
</property>
</widget>
</item>
<item row="1" column="1" colspan="2">
<widget class="QCheckBox" name="caseSensitiveCheckBox">
<property name="text">
<string>Case sensitive</string>
</property>
</widget>
</item>
<item row="4" column="1">
<widget class="QCheckBox" name="removeSoftHyphenCheckBox">
<property name="text">
<string>Remove soft hyphen at end of line</string>
</property>
</widget>
</item>
<item row="5" column="2">
<widget class="QPushButton" name="searchButton">
<property name="text">
<string>Search</string>
</property>
</widget>
</item>
@ -118,12 +125,28 @@
</attribute>
<layout class="QVBoxLayout" name="verticalLayout_3">
<item>
<widget class="QTreeWidget" name="treeWidget">
<column>
<property name="text">
<string notr="true">1</string>
</property>
</column>
<widget class="QTableWidget" name="resultsTableWidget">
<property name="editTriggers">
<set>QAbstractItemView::NoEditTriggers</set>
</property>
<property name="alternatingRowColors">
<bool>true</bool>
</property>
<property name="selectionBehavior">
<enum>QAbstractItemView::SelectRows</enum>
</property>
<property name="sortingEnabled">
<bool>true</bool>
</property>
<property name="columnCount">
<number>3</number>
</property>
<attribute name="horizontalHeaderStretchLastSection">
<bool>true</bool>
</attribute>
<column/>
<column/>
<column/>
</widget>
</item>
</layout>