From 0c97e21f54d9fd3584e523d889dbbe174c2e8228 Mon Sep 17 00:00:00 2001 From: Jakub Melka Date: Sun, 29 Dec 2019 13:50:00 +0100 Subject: [PATCH] =?UTF-8?q?Dokon=C4=8Den=C3=AD=20algoritmu=20pro=20layout?= =?UTF-8?q?=20textu?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- PdfForQtLib/sources/pdftextlayout.cpp | 158 +++++++++++++++++++++++--- PdfForQtLib/sources/pdftextlayout.h | 8 +- 2 files changed, 152 insertions(+), 14 deletions(-) diff --git a/PdfForQtLib/sources/pdftextlayout.cpp b/PdfForQtLib/sources/pdftextlayout.cpp index 34422e3..0ff3de9 100644 --- a/PdfForQtLib/sources/pdftextlayout.cpp +++ b/PdfForQtLib/sources/pdftextlayout.cpp @@ -75,6 +75,14 @@ qint64 PDFTextLayout::getMemoryConsumptionEstimate() const return estimate; } +struct NearestCharacterInfo +{ + size_t index = std::numeric_limits::max(); + PDFReal distance = std::numeric_limits::infinity(); + + inline bool operator<(const NearestCharacterInfo& other) const { return distance < other.distance; } +}; + void PDFTextLayout::performDoLayout(PDFReal angle) { // We will implement variation of 'docstrum' algorithm, we have divided characters by angles, @@ -97,19 +105,11 @@ void PDFTextLayout::performDoLayout(PDFReal angle) applyTransform(characters, angleMatrix); // Step 2) - find k-nearest characters - struct NearestCharacterInfo - { - size_t index = std::numeric_limits::max(); - PDFReal distance = std::numeric_limits::infinity(); - - inline bool operator<(const NearestCharacterInfo& other) const { return distance < other.distance; } - }; - const size_t characterCount = characters.size(); const size_t bucketSize = m_settings.samples + 1; std::vector nearestCharacters(bucketSize * characters.size(), NearestCharacterInfo()); - auto findNearestCharacters = [&](size_t currentCharacterIndex) + auto findNearestCharacters = [this, bucketSize, characterCount, &characters, &nearestCharacters](size_t currentCharacterIndex) { // It will be iterator to the start of the nearest neighbour sequence auto it = std::next(nearestCharacters.begin(), currentCharacterIndex * bucketSize); @@ -129,8 +129,29 @@ void PDFTextLayout::performDoLayout(PDFReal angle) // Now, use insert sort to sort the array of samples + 1 elements (#samples elements // are sorted, we use only insert sort on the last element). - auto itInsert = std::upper_bound(it, itLast, *itLast); - std::rotate(itInsert, itLast, itLast + 1); + auto itLeft = std::prev(itLast); + auto itRight = itLast; + while (true) + { + if (*itRight < *itLeft) + { + std::swap(*itRight, *itLeft); + itRight = itLeft; + + if (itLeft == it) + { + // We have reached the end + break; + } + + --itLeft; + } + else + { + // We have proper order, break the cycle + break; + } + } } }; @@ -222,8 +243,95 @@ void PDFTextLayout::performDoLayout(PDFReal angle) blocks.emplace_back(qMove(item.second)); } - // Transform blocks back to original coordinate system - volatile int i = 0; + // 5) Sort block by topological ordering. We will use approache described in paper + // "High Performance Document Layout Analysis", T.M. Breuel, 2003, where are described + // two rules, which are used to determine block precedence. + // + // Rule 1: a bBB.top(); + return isOverlappedOnHorizontalAxis && isAoverB; + }; + auto isBeforeByRule2 = [&blocks](const size_t aIndex, const size_t bIndex) + { + QRectF aBB = blocks[aIndex].getBoundingBox().boundingRect(); + QRectF bBB = blocks[bIndex].getBoundingBox().boundingRect(); + QRectF abBB = aBB.united(bBB); + + if (aBB.right() < bBB.left()) + { + // Check, if 'c' block doesn't exist + for (size_t i = 0, count = blocks.size(); i < count; ++i) + { + if (i == aIndex || i == bIndex) + { + continue; + } + + QRectF cBB = blocks[i].getBoundingBox().boundingRect(); + if (cBB.top() >= abBB.top() && cBB.bottom() <= abBB.bottom()) + { + const bool isAOverlappedOnHorizontalAxis = (aBB.right() < cBB.left() && aBB.left() < cBB.right()) || (cBB.right() < aBB.left() && cBB.left() < aBB.right()); + const bool isBOverlappedOnHorizontalAxis = (cBB.right() < bBB.left() && cBB.left() < bBB.right()) || (bBB.right() < cBB.left() && bBB.left() < cBB.right()); + if (isAOverlappedOnHorizontalAxis && isBOverlappedOnHorizontalAxis) + { + return false; + } + } + } + + return true; + } + + return false; + }; + + // Order blocks using topological sort (https://en.wikipedia.org/wiki/Topological_sorting, + // Kahn's algorithm is used) + std::set workBlocks; + std::vector ordering; + std::vector> orderingEdges(blocks.size(), std::set()); + ordering.reserve(blocks.size()); + for (size_t i = 0; i < blocks.size(); ++i) + { + workBlocks.insert(workBlocks.end(), i); + for (size_t j = 0; j < blocks.size(); ++j) + { + if (isBeforeByRule1(j, i) || isBeforeByRule2(j, i)) + { + orderingEdges[i].insert(j); + } + } + } + + // Topological sort + QMatrix invertedAngleMatrix = angleMatrix.inverted(); + while (!workBlocks.empty()) + { + auto it = std::min_element(workBlocks.begin(), workBlocks.end(), [&orderingEdges](const size_t l, const size_t r) { return orderingEdges[l].size() < orderingEdges[r].size(); }); + ordering.push_back(*it); + for (std::set& edges : orderingEdges) + { + edges.erase(*it); + } + + blocks[*it].applyTransform(invertedAngleMatrix); + m_blocks.emplace_back(qMove(blocks[*it])); + workBlocks.erase(it); + } } TextCharacters PDFTextLayout::getCharactersForAngle(PDFReal angle) const @@ -255,6 +363,15 @@ PDFTextLine::PDFTextLine(TextCharacters characters) : m_boundingBox.addRect(boundingBox); } +void PDFTextLine::applyTransform(const QMatrix& matrix) +{ + m_boundingBox = matrix.map(m_boundingBox); + for (TextCharacter& character : m_characters) + { + character.applyTransform(matrix); + } +} + PDFTextBlock::PDFTextBlock(PDFTextLines textLines) : m_lines(qMove(textLines)) { @@ -278,4 +395,19 @@ PDFTextBlock::PDFTextBlock(PDFTextLines textLines) : m_boundingBox.addRect(boundingBox); } +void PDFTextBlock::applyTransform(const QMatrix& matrix) +{ + m_boundingBox = matrix.map(m_boundingBox); + for (PDFTextLine& textLine : m_lines) + { + textLine.applyTransform(matrix); + } +} + +void TextCharacter::applyTransform(const QMatrix& matrix) +{ + position = matrix.map(position); + boundingBox = matrix.map(boundingBox); +} + } // namespace pdf diff --git a/PdfForQtLib/sources/pdftextlayout.h b/PdfForQtLib/sources/pdftextlayout.h index 562059a..3abf448 100644 --- a/PdfForQtLib/sources/pdftextlayout.h +++ b/PdfForQtLib/sources/pdftextlayout.h @@ -87,6 +87,8 @@ struct TextCharacter PDFReal fontSize = 0.0; PDFReal advance = 0.0; QPainterPath boundingBox; + + void applyTransform(const QMatrix& matrix); }; using TextCharacters = std::vector; @@ -103,6 +105,8 @@ public: const TextCharacters& getCharacters() const { return m_characters; } const QPainterPath& getBoundingBox() const { return m_boundingBox; } + void applyTransform(const QMatrix& matrix); + private: TextCharacters m_characters; QPainterPath m_boundingBox; @@ -119,6 +123,8 @@ public: const PDFTextLines& getLines() const { return m_lines; } const QPainterPath& getBoundingBox() const { return m_boundingBox; } + void applyTransform(const QMatrix& matrix); + private: PDFTextLines m_lines; QPainterPath m_boundingBox; @@ -146,7 +152,6 @@ public: qint64 getMemoryConsumptionEstimate() const; private: - /// Makes layout for particular angle void performDoLayout(PDFReal angle); @@ -163,6 +168,7 @@ private: TextCharacters m_characters; std::set m_angles; PDFTextLayoutSettings m_settings; + PDFTextBlocks m_blocks; }; } // namespace pdf