2020-01-03 18:11:03 +01:00
|
|
|
// Copyright (C) 2019-2020 Jakub Melka
|
2019-12-28 19:21:29 +01:00
|
|
|
//
|
|
|
|
// This file is part of PdfForQt.
|
|
|
|
//
|
|
|
|
// PdfForQt is free software: you can redistribute it and/or modify
|
|
|
|
// it under the terms of the GNU Lesser General Public License as published by
|
|
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
|
|
// (at your option) any later version.
|
|
|
|
//
|
|
|
|
// PdfForQt is distributed in the hope that it will be useful,
|
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
// GNU Lesser General Public License for more details.
|
|
|
|
//
|
|
|
|
// You should have received a copy of the GNU Lesser General Public License
|
|
|
|
// along with PDFForQt. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
|
|
|
|
#ifndef PDFTEXTLAYOUT_H
|
|
|
|
#define PDFTEXTLAYOUT_H
|
|
|
|
|
|
|
|
#include "pdfglobal.h"
|
|
|
|
|
2019-12-31 17:39:31 +01:00
|
|
|
#include <QDataStream>
|
2019-12-28 19:21:29 +01:00
|
|
|
#include <QPainterPath>
|
|
|
|
|
|
|
|
#include <set>
|
2020-01-03 18:11:03 +01:00
|
|
|
#include <compare>
|
2019-12-28 19:21:29 +01:00
|
|
|
|
|
|
|
namespace pdf
|
|
|
|
{
|
2020-01-03 18:11:03 +01:00
|
|
|
// Forward declaration: PDFTextFlow::createTextFlows takes a PDFTextLayout
// reference, but the class itself is defined further below in this header.
class PDFTextLayout;
|
2019-12-28 19:21:29 +01:00
|
|
|
|
|
|
|
struct PDFTextCharacterInfo
|
|
|
|
{
|
|
|
|
/// Character
|
|
|
|
QChar character;
|
|
|
|
|
|
|
|
/// Character path
|
|
|
|
QPainterPath outline;
|
|
|
|
|
|
|
|
/// Do we use a vertical writing system?
|
|
|
|
bool isVerticalWritingSystem = false;
|
|
|
|
|
|
|
|
/// Advance (in character space, it must be translated
|
|
|
|
/// into device space), for both vertical/horizontal modes.
|
|
|
|
PDFReal advance = 0.0;
|
|
|
|
|
|
|
|
/// Font size (in character space, it must be translated
|
|
|
|
/// into device space)
|
|
|
|
PDFReal fontSize = 0.0;
|
|
|
|
|
|
|
|
/// Transformation matrix from character space to device space
|
|
|
|
QMatrix matrix;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct PDFTextLayoutSettings
|
|
|
|
{
|
|
|
|
/// Number of samples for 'docstrum' algorithm, i.e. number of
|
|
|
|
/// nearest characters. By default, 5 characters should fit.
|
|
|
|
size_t samples = 5;
|
|
|
|
|
|
|
|
/// Distance sensitivity to determine, if characters are close enough.
|
|
|
|
/// Maximal distance is computed as current character advance multiplied
|
|
|
|
/// by this constant.
|
|
|
|
PDFReal distanceSensitivity = 4.0;
|
|
|
|
|
|
|
|
/// Maximal vertical distance, in portion of font size, of two characters
|
|
|
|
/// to be considered they lie on same line.
|
|
|
|
PDFReal charactersOnLineSensitivity = 0.25;
|
|
|
|
|
|
|
|
/// Maximal ratio between font size of characters to be considered
|
|
|
|
/// that they lie on same line.
|
|
|
|
PDFReal fontSensitivity = 2.0;
|
|
|
|
|
|
|
|
/// Maximal space ratio between two lines of block. Default coefficient
|
|
|
|
/// means, that height ratio limit is (height1 + height2)
|
|
|
|
PDFReal blockVerticalSensitivity = 1.5;
|
|
|
|
|
|
|
|
/// Minimal horizontal overlap for two lines considered to be in one block
|
|
|
|
PDFReal blockOverlapSensitivity = 0.3;
|
2019-12-31 17:39:31 +01:00
|
|
|
|
|
|
|
friend QDataStream& operator<<(QDataStream& stream, const PDFTextLayoutSettings& settings);
|
|
|
|
friend QDataStream& operator>>(QDataStream& stream, PDFTextLayoutSettings& settings);
|
2019-12-28 19:21:29 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
/// Represents character in device space coordinates. All values (dimensions,
|
|
|
|
/// bounding box, etc. are in device space coordinates).
|
|
|
|
struct TextCharacter
|
|
|
|
{
|
|
|
|
QChar character;
|
|
|
|
QPointF position;
|
|
|
|
PDFReal angle = 0.0;
|
|
|
|
PDFReal fontSize = 0.0;
|
|
|
|
PDFReal advance = 0.0;
|
|
|
|
QPainterPath boundingBox;
|
2019-12-29 13:50:00 +01:00
|
|
|
|
|
|
|
void applyTransform(const QMatrix& matrix);
|
2019-12-31 17:39:31 +01:00
|
|
|
|
|
|
|
friend QDataStream& operator<<(QDataStream& stream, const TextCharacter& character);
|
|
|
|
friend QDataStream& operator>>(QDataStream& stream, TextCharacter& character);
|
2019-12-28 19:21:29 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
using TextCharacters = std::vector<TextCharacter>;
|
|
|
|
|
|
|
|
/// Represents text line consisting of set of characters and line bounding box.
|
|
|
|
class PDFTextLine
|
|
|
|
{
|
|
|
|
public:
|
2019-12-31 17:39:31 +01:00
|
|
|
explicit inline PDFTextLine() = default;
|
|
|
|
|
2019-12-28 19:21:29 +01:00
|
|
|
/// Construct new line from characters. Characters are sorted in x-coordinate
|
|
|
|
/// and bounding box is computed.
|
|
|
|
/// \param characters
|
|
|
|
explicit PDFTextLine(TextCharacters characters);
|
|
|
|
|
|
|
|
const TextCharacters& getCharacters() const { return m_characters; }
|
|
|
|
const QPainterPath& getBoundingBox() const { return m_boundingBox; }
|
2019-12-29 17:25:18 +01:00
|
|
|
const QPointF& getTopLeft() const { return m_topLeft; }
|
2019-12-28 19:21:29 +01:00
|
|
|
|
2019-12-29 13:50:00 +01:00
|
|
|
void applyTransform(const QMatrix& matrix);
|
|
|
|
|
2019-12-31 17:39:31 +01:00
|
|
|
friend QDataStream& operator<<(QDataStream& stream, const PDFTextLine& line);
|
|
|
|
friend QDataStream& operator>>(QDataStream& stream, PDFTextLine& line);
|
|
|
|
|
2019-12-28 19:21:29 +01:00
|
|
|
private:
|
|
|
|
TextCharacters m_characters;
|
|
|
|
QPainterPath m_boundingBox;
|
2019-12-29 17:25:18 +01:00
|
|
|
QPointF m_topLeft;
|
2019-12-28 19:21:29 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
using PDFTextLines = std::vector<PDFTextLine>;
|
|
|
|
|
|
|
|
/// Represents text block consisting of set of lines and block bounding box.
|
|
|
|
class PDFTextBlock
|
|
|
|
{
|
|
|
|
public:
|
2019-12-31 17:39:31 +01:00
|
|
|
explicit inline PDFTextBlock() = default;
|
2019-12-28 19:21:29 +01:00
|
|
|
explicit inline PDFTextBlock(PDFTextLines textLines);
|
|
|
|
|
|
|
|
const PDFTextLines& getLines() const { return m_lines; }
|
|
|
|
const QPainterPath& getBoundingBox() const { return m_boundingBox; }
|
2019-12-29 17:25:18 +01:00
|
|
|
const QPointF& getTopLeft() const { return m_topLeft; }
|
2019-12-28 19:21:29 +01:00
|
|
|
|
2019-12-29 13:50:00 +01:00
|
|
|
void applyTransform(const QMatrix& matrix);
|
|
|
|
|
2019-12-31 17:39:31 +01:00
|
|
|
friend QDataStream& operator<<(QDataStream& stream, const PDFTextBlock& block);
|
|
|
|
friend QDataStream& operator>>(QDataStream& stream, PDFTextBlock& block);
|
|
|
|
|
2019-12-28 19:21:29 +01:00
|
|
|
private:
|
|
|
|
PDFTextLines m_lines;
|
|
|
|
QPainterPath m_boundingBox;
|
2019-12-29 17:25:18 +01:00
|
|
|
QPointF m_topLeft;
|
2019-12-28 19:21:29 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
using PDFTextBlocks = std::vector<PDFTextBlock>;
|
|
|
|
|
2020-01-03 18:11:03 +01:00
|
|
|
/// Character pointer points to some character in text layout.
|
|
|
|
/// It also has page index to decide, which page the pointer points to.
|
|
|
|
struct PDFCharacterPointer
|
|
|
|
{
|
|
|
|
auto operator<=>(const PDFCharacterPointer&) const = default;
|
|
|
|
|
|
|
|
/// Returns true, if character pointer is valid and points to the correct location
|
|
|
|
bool isValid() const { return pageIndex > -1; }
|
|
|
|
|
2020-01-04 17:58:55 +01:00
|
|
|
/// Returns true, if character belongs to same block
|
|
|
|
bool hasSameBlock(const PDFCharacterPointer& other) const;
|
|
|
|
|
2020-01-03 18:11:03 +01:00
|
|
|
/// Returns true, if character belongs to same line
|
|
|
|
bool hasSameLine(const PDFCharacterPointer& other) const;
|
|
|
|
|
|
|
|
int pageIndex = -1;
|
|
|
|
size_t blockIndex = 0;
|
|
|
|
size_t lineIndex = 0;
|
|
|
|
size_t characterIndex = 0;
|
|
|
|
};
|
|
|
|
|
|
|
|
using PDFTextSelectionItem = std::pair<PDFCharacterPointer, PDFCharacterPointer>;
|
|
|
|
using PDFTextSelectionItems = std::vector<PDFTextSelectionItem>;
|
|
|
|
|
|
|
|
/// Text selection, can be used across multiple pages.
|
|
|
|
class PDFTextSelection
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
explicit PDFTextSelection(PDFTextSelectionItems&& items);
|
|
|
|
|
|
|
|
private:
|
|
|
|
PDFTextSelectionItems m_items;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct PDFFindResult
|
|
|
|
{
|
2020-01-04 17:58:55 +01:00
|
|
|
bool operator<(const PDFFindResult& other) const;
|
|
|
|
|
2020-01-03 18:11:03 +01:00
|
|
|
/// Matched string during search
|
|
|
|
QString matched;
|
|
|
|
|
|
|
|
/// Context (text before and after match)
|
|
|
|
QString context;
|
|
|
|
|
|
|
|
/// Matched selection (can be multiple items, if selection
|
|
|
|
/// is spanned between multiple blocks)
|
|
|
|
PDFTextSelectionItems textSelectionItems;
|
|
|
|
};
|
|
|
|
using PDFFindResults = std::vector<PDFFindResult>;
|
|
|
|
|
|
|
|
class PDFTextFlow;
|
|
|
|
using PDFTextFlows = std::vector<PDFTextFlow>;
|
|
|
|
|
|
|
|
/// This class represents a portion of continuous text on the page. It can
|
|
|
|
/// consists of multiple blocks (which follow reading order).
|
|
|
|
class PDFTextFlow
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
|
|
|
|
enum FlowFlag
|
|
|
|
{
|
|
|
|
None = 0x0000,
|
|
|
|
SeparateBlocks = 0x0001, ///< Create flow for each block
|
|
|
|
RemoveSoftHyphen = 0x0002, ///< Removes 'soft hyphen' unicode character from end-of-line (character 0x00AD)
|
|
|
|
AddLineBreaks = 0x0004, ///< Add line break characters to the end of line
|
|
|
|
};
|
|
|
|
Q_DECLARE_FLAGS(FlowFlags, FlowFlag)
|
|
|
|
|
|
|
|
/// Finds simple text in current text flow. All text occurences are returned.
|
|
|
|
/// \param text Text to be found
|
|
|
|
/// \param caseSensitivity Case sensitivity
|
|
|
|
PDFFindResults find(const QString& text, Qt::CaseSensitivity caseSensitivity) const;
|
|
|
|
|
2020-01-04 17:58:55 +01:00
|
|
|
/// Finds regular expression matches in current text flow. All text occurences are returned.
|
|
|
|
/// \param expression Regular expression to be matched
|
|
|
|
PDFFindResults find(const QRegularExpression& expression) const;
|
|
|
|
|
2020-01-03 18:11:03 +01:00
|
|
|
/// Merge data from \p next flow (i.e. connect two consecutive flows)
|
|
|
|
void merge(const PDFTextFlow& next);
|
|
|
|
|
|
|
|
/// Creates text flows from text layout, according to creation flags.
|
|
|
|
/// \param layout Layout, from which is text flow created
|
|
|
|
/// \param flags Flow creation flags
|
|
|
|
/// \param pageIndex Page index
|
|
|
|
static PDFTextFlows createTextFlows(const PDFTextLayout& layout, FlowFlags flags, PDFInteger pageIndex);
|
|
|
|
|
|
|
|
private:
|
|
|
|
/// Returns text selection from index and length. Returned text selection can also
|
|
|
|
/// be empty (for example, if only single space character is selected, which has
|
|
|
|
/// no counterpart in real text)
|
|
|
|
/// \param index Index of text selection subrange
|
|
|
|
/// \param length Length of text selection
|
|
|
|
PDFTextSelectionItems getTextSelectionItems(size_t index, size_t length) const;
|
|
|
|
|
|
|
|
/// Returns context for text selection (or empty string, if text selection is empty)
|
|
|
|
/// \param index Index of text selection subrange
|
|
|
|
/// \param length Length of text selection
|
|
|
|
QString getContext(size_t index, size_t length) const;
|
|
|
|
|
|
|
|
QString m_text;
|
|
|
|
std::vector<PDFCharacterPointer> m_characterPointers;
|
|
|
|
};
|
|
|
|
|
2019-12-28 19:21:29 +01:00
|
|
|
/// Text layout of single page. Can handle various fonts, various angles of lines
|
|
|
|
/// and vertically oriented text. It performs the "docstrum" algorithm.
|
|
|
|
class PDFTextLayout
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
explicit PDFTextLayout();
|
|
|
|
|
|
|
|
/// Adds character to the layout
|
|
|
|
void addCharacter(const PDFTextCharacterInfo& info);
|
|
|
|
|
|
|
|
/// Perorms text layout algorithm
|
|
|
|
void perform();
|
|
|
|
|
|
|
|
/// Optimizes layout memory allocation to contain less space
|
|
|
|
void optimize();
|
|
|
|
|
|
|
|
/// Returns estimate of number of bytes, which this mesh occupies in memory
|
|
|
|
qint64 getMemoryConsumptionEstimate() const;
|
|
|
|
|
2019-12-29 17:25:18 +01:00
|
|
|
/// Returns recognized text blocks
|
|
|
|
const PDFTextBlocks& getTextBlocks() const { return m_blocks; }
|
|
|
|
|
2019-12-31 17:39:31 +01:00
|
|
|
friend QDataStream& operator<<(QDataStream& stream, const PDFTextLayout& layout);
|
|
|
|
friend QDataStream& operator>>(QDataStream& stream, PDFTextLayout& layout);
|
|
|
|
|
2019-12-28 19:21:29 +01:00
|
|
|
private:
|
|
|
|
/// Makes layout for particular angle
|
|
|
|
void performDoLayout(PDFReal angle);
|
|
|
|
|
|
|
|
/// Returns a list of characters for particular angle. Exact match is used
|
|
|
|
/// for angle, even if angle is floating point number.
|
|
|
|
/// \param angle Angle
|
|
|
|
TextCharacters getCharactersForAngle(PDFReal angle) const;
|
|
|
|
|
|
|
|
/// Applies transform to text characters (positions and bounding boxes)
|
|
|
|
/// \param characters Characters
|
|
|
|
/// \param matrix Transform matrix
|
|
|
|
void applyTransform(TextCharacters& characters, const QMatrix& matrix);
|
|
|
|
|
|
|
|
TextCharacters m_characters;
|
|
|
|
std::set<PDFReal> m_angles;
|
|
|
|
PDFTextLayoutSettings m_settings;
|
2019-12-29 13:50:00 +01:00
|
|
|
PDFTextBlocks m_blocks;
|
2019-12-28 19:21:29 +01:00
|
|
|
};
|
|
|
|
|
2019-12-31 17:39:31 +01:00
|
|
|
/// Storage for text layouts. For reading and writing, this object is thread safe.
|
|
|
|
/// For writing, mutex is used to synchronize asynchronous writes, for reading
|
|
|
|
/// no mutex is used at all. For this reason, both reading/writing at the same time
|
|
|
|
/// is prohibited, it is not thread safe.
|
2020-01-04 17:58:55 +01:00
|
|
|
class PDFFORQTLIBSHARED_EXPORT PDFTextLayoutStorage
|
2019-12-31 17:39:31 +01:00
|
|
|
{
|
|
|
|
public:
|
|
|
|
explicit inline PDFTextLayoutStorage() = default;
|
|
|
|
explicit inline PDFTextLayoutStorage(PDFInteger pageCount) :
|
|
|
|
m_offsets(pageCount, 0)
|
|
|
|
{
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns text layout for particular page. If page index is invalid,
|
|
|
|
/// then empty text layout is returned. Function is not thread safe, if
|
|
|
|
/// function \p setTextLayout is called from another thread.
|
|
|
|
/// \param pageIndex Page index
|
|
|
|
PDFTextLayout getTextLayout(PDFInteger pageIndex) const;
|
|
|
|
|
|
|
|
/// Sets text layout to the particular index. Index must be valid and from
|
|
|
|
/// range 0 to \p pageCount - 1. Function is not thread safe.
|
|
|
|
/// \param pageIndex Page index
|
|
|
|
/// \param layout Text layout
|
|
|
|
/// \param mutex Mutex for locking (calls of setTextLayout from multiple threads)
|
|
|
|
void setTextLayout(PDFInteger pageIndex, const PDFTextLayout& layout, QMutex* mutex);
|
|
|
|
|
2020-01-04 17:58:55 +01:00
|
|
|
/// Finds simple text in all pages. All text occurences are returned.
|
|
|
|
/// \param text Text to be found
|
|
|
|
/// \param caseSensitivity Case sensitivity
|
|
|
|
/// \param flowFlags Text flow flags
|
|
|
|
PDFFindResults find(const QString& text, Qt::CaseSensitivity caseSensitivity, PDFTextFlow::FlowFlags flowFlags) const;
|
|
|
|
|
|
|
|
/// Finds regular expression matches in current text flow. All text occurences are returned.
|
|
|
|
/// \param expression Regular expression to be matched
|
|
|
|
/// \param flowFlags Text flow flags
|
|
|
|
PDFFindResults find(const QRegularExpression& expression, PDFTextFlow::FlowFlags flowFlags) const;
|
|
|
|
|
2019-12-31 17:39:31 +01:00
|
|
|
private:
|
|
|
|
std::vector<int> m_offsets;
|
|
|
|
QByteArray m_textLayouts;
|
|
|
|
};
|
|
|
|
|
2019-12-28 19:21:29 +01:00
|
|
|
} // namespace pdf
|
|
|
|
|
|
|
|
#endif // PDFTEXTLAYOUT_H
|