// PDF4QT/PdfForQtLib/sources/pdftextlayout.h
// 2019-12-31 17:39:31 +01:00
//
// 235 lines
// 7.9 KiB
// C++

// Copyright (C) 2019 Jakub Melka
//
// This file is part of PdfForQt.
//
// PdfForQt is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// PdfForQt is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with PDFForQt. If not, see <https://www.gnu.org/licenses/>.
#ifndef PDFTEXTLAYOUT_H
#define PDFTEXTLAYOUT_H
#include "pdfglobal.h"

#include <QByteArray>
#include <QChar>
#include <QDataStream>
#include <QMatrix>
#include <QMutex>
#include <QPainterPath>
#include <QPointF>

#include <set>
#include <vector>
namespace pdf
{
/// Description of a single character as it is handed over to the text
/// layout (see PDFTextLayout::addCharacter). Advance and font size are given
/// in character space; \p matrix maps character space to device space.
struct PDFTextCharacterInfo
{
    /// The character itself
    QChar character;

    /// Path (outline) of the character
    QPainterPath outline;

    /// True, if a vertical writing system is used
    bool isVerticalWritingSystem = false;

    /// Advance of the character, used for both vertical and horizontal
    /// writing modes. The value is in character space and must be
    /// translated into device space.
    PDFReal advance = 0.0;

    /// Font size. The value is in character space and must be
    /// translated into device space.
    PDFReal fontSize = 0.0;

    /// Transformation matrix from character space to device space
    QMatrix matrix;
};
/// Tunable parameters of the text layout algorithm. Every member carries
/// its default value; the settings can be written to / read from
/// a QDataStream using the stream operators declared below.
struct PDFTextLayoutSettings
{
    /// Number of samples for the 'docstrum' algorithm, i.e. the number of
    /// nearest characters taken into account. The default of 5 characters
    /// should fit.
    size_t samples = 5;

    /// Sensitivity used to decide whether characters are close enough.
    /// The maximal distance is the advance of the current character
    /// multiplied by this constant.
    PDFReal distanceSensitivity = 4.0;

    /// Maximal vertical distance of two characters, expressed as a portion
    /// of the font size, for which they are still considered to lie on
    /// the same line.
    PDFReal charactersOnLineSensitivity = 0.25;

    /// Maximal ratio between font sizes of two characters for which they
    /// are still considered to lie on the same line.
    PDFReal fontSensitivity = 2.0;

    /// Maximal space ratio between two lines of one block.
    /// NOTE(review): the original comment states that the default
    /// coefficient means the height ratio limit is (height1 + height2);
    /// the sentence looks unfinished - confirm against the implementation.
    PDFReal blockVerticalSensitivity = 1.5;

    /// Minimal horizontal overlap of two lines for which they are
    /// considered to belong to one block.
    PDFReal blockOverlapSensitivity = 0.3;

    /// Stream output operator for the settings
    friend QDataStream& operator<<(QDataStream& stream, const PDFTextLayoutSettings& settings);

    /// Stream input operator for the settings
    friend QDataStream& operator>>(QDataStream& stream, PDFTextLayoutSettings& settings);
};
/// A character expressed in device space coordinates. All values of this
/// structure (dimensions, bounding box, etc.) are in device space.
struct TextCharacter
{
    /// The character itself
    QChar character;

    /// Position of the character
    QPointF position;

    /// Angle of the character (unit is not stated in this header - TODO confirm)
    PDFReal angle = 0.0;

    /// Font size of the character
    PDFReal fontSize = 0.0;

    /// Advance of the character
    PDFReal advance = 0.0;

    /// Bounding box of the character, stored as a painter path
    QPainterPath boundingBox;

    /// Applies transformation \p matrix to this character
    /// \param matrix Transformation matrix
    void applyTransform(const QMatrix& matrix);

    /// Stream output operator for the character
    friend QDataStream& operator<<(QDataStream& stream, const TextCharacter& character);

    /// Stream input operator for the character
    friend QDataStream& operator>>(QDataStream& stream, TextCharacter& character);
};

/// Sequence of characters in device space
using TextCharacters = std::vector<TextCharacter>;
/// A text line: a set of characters together with the bounding box of
/// the whole line.
class PDFTextLine
{
public:
    /// Creates an empty line. A constructor defaulted inside the class
    /// body is implicitly inline, so no \c inline keyword is needed.
    explicit PDFTextLine() = default;

    /// Creates a new line from the given characters. The characters are
    /// sorted by their x-coordinate and the bounding box is computed.
    /// \param characters Characters forming the line
    explicit PDFTextLine(TextCharacters characters);

    /// Returns characters of this line
    const TextCharacters& getCharacters() const { return m_characters; }

    /// Returns bounding box of this line
    const QPainterPath& getBoundingBox() const { return m_boundingBox; }

    /// Returns the stored top-left point of this line
    const QPointF& getTopLeft() const { return m_topLeft; }

    /// Applies transformation \p matrix to this line
    /// \param matrix Transformation matrix
    void applyTransform(const QMatrix& matrix);

    /// Stream output operator for the line
    friend QDataStream& operator<<(QDataStream& stream, const PDFTextLine& line);

    /// Stream input operator for the line
    friend QDataStream& operator>>(QDataStream& stream, PDFTextLine& line);

private:
    TextCharacters m_characters;
    QPainterPath m_boundingBox;
    QPointF m_topLeft;
};

/// Sequence of text lines
using PDFTextLines = std::vector<PDFTextLine>;
/// A text block: a set of text lines together with the bounding box of
/// the whole block.
class PDFTextBlock
{
public:
    /// Creates an empty block. A constructor defaulted inside the class
    /// body is implicitly inline, so no \c inline keyword is needed.
    explicit PDFTextBlock() = default;

    /// Creates a block from the given lines.
    ///
    /// This constructor has no definition in this header, therefore it must
    /// not be declared \c inline: an inline function has to be defined in
    /// every translation unit in which it is used, otherwise any use outside
    /// of the defining source file is ill-formed (typically a linker error).
    /// \param textLines Lines forming the block
    explicit PDFTextBlock(PDFTextLines textLines);

    /// Returns lines of this block
    const PDFTextLines& getLines() const { return m_lines; }

    /// Returns bounding box of this block
    const QPainterPath& getBoundingBox() const { return m_boundingBox; }

    /// Returns the stored top-left point of this block
    const QPointF& getTopLeft() const { return m_topLeft; }

    /// Applies transformation \p matrix to this block
    /// \param matrix Transformation matrix
    void applyTransform(const QMatrix& matrix);

    /// Stream output operator for the block
    friend QDataStream& operator<<(QDataStream& stream, const PDFTextBlock& block);

    /// Stream input operator for the block
    friend QDataStream& operator>>(QDataStream& stream, PDFTextBlock& block);

private:
    PDFTextLines m_lines;
    QPainterPath m_boundingBox;
    QPointF m_topLeft;
};

/// Sequence of text blocks
using PDFTextBlocks = std::vector<PDFTextBlock>;
/// Text layout of a single page. It can handle various fonts, various
/// angles of lines and vertically oriented text. The layout is computed
/// by the "docstrum" algorithm.
class PDFTextLayout
{
public:
    explicit PDFTextLayout();

    /// Adds a character to the layout
    /// \param info Description of the character
    void addCharacter(const PDFTextCharacterInfo& info);

    /// Performs the text layout algorithm
    void perform();

    /// Optimizes memory allocation of the layout, so it occupies less space
    void optimize();

    /// Returns an estimate of the number of bytes which this layout
    /// occupies in memory
    qint64 getMemoryConsumptionEstimate() const;

    /// Returns recognized text blocks
    const PDFTextBlocks& getTextBlocks() const { return m_blocks; }

    /// Stream output operator for the layout
    friend QDataStream& operator<<(QDataStream& stream, const PDFTextLayout& layout);

    /// Stream input operator for the layout
    friend QDataStream& operator>>(QDataStream& stream, PDFTextLayout& layout);

private:
    /// Makes the layout for a particular angle
    /// \param angle Angle
    void performDoLayout(PDFReal angle);

    /// Returns a list of characters for a particular angle. The angle is
    /// matched exactly, even though it is a floating point number.
    /// \param angle Angle
    TextCharacters getCharactersForAngle(PDFReal angle) const;

    /// Applies a transform to text characters (positions and bounding boxes)
    /// \param characters Characters
    /// \param matrix Transform matrix
    void applyTransform(TextCharacters& characters, const QMatrix& matrix);

    TextCharacters m_characters;
    std::set<PDFReal> m_angles;
    PDFTextLayoutSettings m_settings;

    /// Recognized text blocks, exposed by getTextBlocks()
    PDFTextBlocks m_blocks;
};
/// Storage for text layouts of pages.
///
/// Threading contract, as described by the original author: concurrent
/// writes are synchronized by a mutex (see \p setTextLayout), while reads
/// use no mutex at all. Consequently concurrent writers are fine and
/// concurrent readers are fine, but reading and writing at the same time is
/// prohibited - that combination is not thread safe.
class PDFTextLayoutStorage
{
public:
    /// Creates an empty storage
    explicit PDFTextLayoutStorage() = default;

    /// Creates a storage for the given number of pages; one zero offset
    /// is created for each page.
    /// \param pageCount Number of pages
    explicit PDFTextLayoutStorage(PDFInteger pageCount) :
        m_offsets(pageCount, 0)
    {

    }

    /// Returns the text layout of a particular page. If the page index is
    /// invalid, an empty text layout is returned. The function is not
    /// thread safe, if \p setTextLayout is called from another thread at
    /// the same time.
    /// \param pageIndex Page index
    PDFTextLayout getTextLayout(PDFInteger pageIndex) const;

    /// Stores the text layout under the particular page index. The index
    /// must be valid, i.e. from range 0 to \p pageCount - 1. The function
    /// is not thread safe with respect to \p getTextLayout; calls of this
    /// function from multiple threads are meant to be synchronized via
    /// \p mutex (the locking itself is not visible in this header).
    /// \param pageIndex Page index
    /// \param layout Text layout
    /// \param mutex Mutex for locking (calls of setTextLayout from multiple threads)
    void setTextLayout(PDFInteger pageIndex, const PDFTextLayout& layout, QMutex* mutex);

private:
    /// One entry per page, zero-initialized by the page-count constructor
    std::vector<int> m_offsets;

    /// Byte buffer of the storage (presumably the serialized layouts
    /// addressed by m_offsets - confirm in the implementation)
    QByteArray m_textLayouts;
};
} // namespace pdf
#endif // PDFTEXTLAYOUT_H