// Copyright (C) 2019 Jakub Melka // // This file is part of PdfForQt. // // PdfForQt is free software: you can redistribute it and/or modify // it under the terms of the GNU Lesser General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // PdfForQt is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public License // along with PDFForQt. If not, see . #ifndef PDFTEXTLAYOUT_H #define PDFTEXTLAYOUT_H #include "pdfglobal.h" #include #include #include namespace pdf { struct PDFTextCharacterInfo { /// Character QChar character; /// Character path QPainterPath outline; /// Do we use a vertical writing system? bool isVerticalWritingSystem = false; /// Advance (in character space, it must be translated /// into device space), for both vertical/horizontal modes. PDFReal advance = 0.0; /// Font size (in character space, it must be translated /// into device space) PDFReal fontSize = 0.0; /// Transformation matrix from character space to device space QMatrix matrix; }; struct PDFTextLayoutSettings { /// Number of samples for 'docstrum' algorithm, i.e. number of /// nearest characters. By default, 5 characters should fit. size_t samples = 5; /// Distance sensitivity to determine, if characters are close enough. /// Maximal distance is computed as current character advance multiplied /// by this constant. PDFReal distanceSensitivity = 4.0; /// Maximal vertical distance, in portion of font size, of two characters /// to be considered they lie on same line. PDFReal charactersOnLineSensitivity = 0.25; /// Maximal ratio between font size of characters to be considered /// that they lie on same line. PDFReal fontSensitivity = 2.0; /// Maximal space ratio between two lines of block. Default coefficient /// means, that height ratio limit is (height1 + height2) PDFReal blockVerticalSensitivity = 1.5; /// Minimal horizontal overlap for two lines considered to be in one block PDFReal blockOverlapSensitivity = 0.3; friend QDataStream& operator<<(QDataStream& stream, const PDFTextLayoutSettings& settings); friend QDataStream& operator>>(QDataStream& stream, PDFTextLayoutSettings& settings); }; /// Represents character in device space coordinates. All values (dimensions, /// bounding box, etc. are in device space coordinates). struct TextCharacter { QChar character; QPointF position; PDFReal angle = 0.0; PDFReal fontSize = 0.0; PDFReal advance = 0.0; QPainterPath boundingBox; void applyTransform(const QMatrix& matrix); friend QDataStream& operator<<(QDataStream& stream, const TextCharacter& character); friend QDataStream& operator>>(QDataStream& stream, TextCharacter& character); }; using TextCharacters = std::vector; /// Represents text line consisting of set of characters and line bounding box. class PDFTextLine { public: explicit inline PDFTextLine() = default; /// Construct new line from characters. Characters are sorted in x-coordinate /// and bounding box is computed. /// \param characters explicit PDFTextLine(TextCharacters characters); const TextCharacters& getCharacters() const { return m_characters; } const QPainterPath& getBoundingBox() const { return m_boundingBox; } const QPointF& getTopLeft() const { return m_topLeft; } void applyTransform(const QMatrix& matrix); friend QDataStream& operator<<(QDataStream& stream, const PDFTextLine& line); friend QDataStream& operator>>(QDataStream& stream, PDFTextLine& line); private: TextCharacters m_characters; QPainterPath m_boundingBox; QPointF m_topLeft; }; using PDFTextLines = std::vector; /// Represents text block consisting of set of lines and block bounding box. class PDFTextBlock { public: explicit inline PDFTextBlock() = default; explicit inline PDFTextBlock(PDFTextLines textLines); const PDFTextLines& getLines() const { return m_lines; } const QPainterPath& getBoundingBox() const { return m_boundingBox; } const QPointF& getTopLeft() const { return m_topLeft; } void applyTransform(const QMatrix& matrix); friend QDataStream& operator<<(QDataStream& stream, const PDFTextBlock& block); friend QDataStream& operator>>(QDataStream& stream, PDFTextBlock& block); private: PDFTextLines m_lines; QPainterPath m_boundingBox; QPointF m_topLeft; }; using PDFTextBlocks = std::vector; /// Text layout of single page. Can handle various fonts, various angles of lines /// and vertically oriented text. It performs the "docstrum" algorithm. class PDFTextLayout { public: explicit PDFTextLayout(); /// Adds character to the layout void addCharacter(const PDFTextCharacterInfo& info); /// Perorms text layout algorithm void perform(); /// Optimizes layout memory allocation to contain less space void optimize(); /// Returns estimate of number of bytes, which this mesh occupies in memory qint64 getMemoryConsumptionEstimate() const; /// Returns recognized text blocks const PDFTextBlocks& getTextBlocks() const { return m_blocks; } friend QDataStream& operator<<(QDataStream& stream, const PDFTextLayout& layout); friend QDataStream& operator>>(QDataStream& stream, PDFTextLayout& layout); private: /// Makes layout for particular angle void performDoLayout(PDFReal angle); /// Returns a list of characters for particular angle. Exact match is used /// for angle, even if angle is floating point number. /// \param angle Angle TextCharacters getCharactersForAngle(PDFReal angle) const; /// Applies transform to text characters (positions and bounding boxes) /// \param characters Characters /// \param matrix Transform matrix void applyTransform(TextCharacters& characters, const QMatrix& matrix); TextCharacters m_characters; std::set m_angles; PDFTextLayoutSettings m_settings; PDFTextBlocks m_blocks; }; /// Storage for text layouts. For reading and writing, this object is thread safe. /// For writing, mutex is used to synchronize asynchronous writes, for reading /// no mutex is used at all. For this reason, both reading/writing at the same time /// is prohibited, it is not thread safe. class PDFTextLayoutStorage { public: explicit inline PDFTextLayoutStorage() = default; explicit inline PDFTextLayoutStorage(PDFInteger pageCount) : m_offsets(pageCount, 0) { } /// Returns text layout for particular page. If page index is invalid, /// then empty text layout is returned. Function is not thread safe, if /// function \p setTextLayout is called from another thread. /// \param pageIndex Page index PDFTextLayout getTextLayout(PDFInteger pageIndex) const; /// Sets text layout to the particular index. Index must be valid and from /// range 0 to \p pageCount - 1. Function is not thread safe. /// \param pageIndex Page index /// \param layout Text layout /// \param mutex Mutex for locking (calls of setTextLayout from multiple threads) void setTextLayout(PDFInteger pageIndex, const PDFTextLayout& layout, QMutex* mutex); private: std::vector m_offsets; QByteArray m_textLayouts; }; } // namespace pdf #endif // PDFTEXTLAYOUT_H