mirror of
https://github.com/JakubMelka/PDF4QT.git
synced 2025-01-30 09:04:48 +01:00
Dokončení algoritmu pro layout textu
This commit is contained in:
parent
0ec9d6cf0e
commit
0c97e21f54
@ -75,6 +75,14 @@ qint64 PDFTextLayout::getMemoryConsumptionEstimate() const
|
|||||||
return estimate;
|
return estimate;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct NearestCharacterInfo
|
||||||
|
{
|
||||||
|
size_t index = std::numeric_limits<size_t>::max();
|
||||||
|
PDFReal distance = std::numeric_limits<PDFReal>::infinity();
|
||||||
|
|
||||||
|
inline bool operator<(const NearestCharacterInfo& other) const { return distance < other.distance; }
|
||||||
|
};
|
||||||
|
|
||||||
void PDFTextLayout::performDoLayout(PDFReal angle)
|
void PDFTextLayout::performDoLayout(PDFReal angle)
|
||||||
{
|
{
|
||||||
// We will implement variation of 'docstrum' algorithm, we have divided characters by angles,
|
// We will implement variation of 'docstrum' algorithm, we have divided characters by angles,
|
||||||
@ -97,19 +105,11 @@ void PDFTextLayout::performDoLayout(PDFReal angle)
|
|||||||
applyTransform(characters, angleMatrix);
|
applyTransform(characters, angleMatrix);
|
||||||
|
|
||||||
// Step 2) - find k-nearest characters
|
// Step 2) - find k-nearest characters
|
||||||
struct NearestCharacterInfo
|
|
||||||
{
|
|
||||||
size_t index = std::numeric_limits<size_t>::max();
|
|
||||||
PDFReal distance = std::numeric_limits<PDFReal>::infinity();
|
|
||||||
|
|
||||||
inline bool operator<(const NearestCharacterInfo& other) const { return distance < other.distance; }
|
|
||||||
};
|
|
||||||
|
|
||||||
const size_t characterCount = characters.size();
|
const size_t characterCount = characters.size();
|
||||||
const size_t bucketSize = m_settings.samples + 1;
|
const size_t bucketSize = m_settings.samples + 1;
|
||||||
std::vector<NearestCharacterInfo> nearestCharacters(bucketSize * characters.size(), NearestCharacterInfo());
|
std::vector<NearestCharacterInfo> nearestCharacters(bucketSize * characters.size(), NearestCharacterInfo());
|
||||||
|
|
||||||
auto findNearestCharacters = [&](size_t currentCharacterIndex)
|
auto findNearestCharacters = [this, bucketSize, characterCount, &characters, &nearestCharacters](size_t currentCharacterIndex)
|
||||||
{
|
{
|
||||||
// It will be iterator to the start of the nearest neighbour sequence
|
// It will be iterator to the start of the nearest neighbour sequence
|
||||||
auto it = std::next(nearestCharacters.begin(), currentCharacterIndex * bucketSize);
|
auto it = std::next(nearestCharacters.begin(), currentCharacterIndex * bucketSize);
|
||||||
@ -129,8 +129,29 @@ void PDFTextLayout::performDoLayout(PDFReal angle)
|
|||||||
|
|
||||||
// Now, use insert sort to sort the array of samples + 1 elements (#samples elements
|
// Now, use insert sort to sort the array of samples + 1 elements (#samples elements
|
||||||
// are sorted, we use only insert sort on the last element).
|
// are sorted, we use only insert sort on the last element).
|
||||||
auto itInsert = std::upper_bound(it, itLast, *itLast);
|
auto itLeft = std::prev(itLast);
|
||||||
std::rotate(itInsert, itLast, itLast + 1);
|
auto itRight = itLast;
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
if (*itRight < *itLeft)
|
||||||
|
{
|
||||||
|
std::swap(*itRight, *itLeft);
|
||||||
|
itRight = itLeft;
|
||||||
|
|
||||||
|
if (itLeft == it)
|
||||||
|
{
|
||||||
|
// We have reached the end
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
--itLeft;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// We have proper order, break the cycle
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -222,8 +243,95 @@ void PDFTextLayout::performDoLayout(PDFReal angle)
|
|||||||
blocks.emplace_back(qMove(item.second));
|
blocks.emplace_back(qMove(item.second));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Transform blocks back to original coordinate system
|
// 5) Sort blocks by topological ordering. We will use the approach described in the paper
|
||||||
volatile int i = 0;
|
// "High Performance Document Layout Analysis", T.M. Breuel, 2003, which describes
|
||||||
|
// two rules, which are used to determine block precedence.
|
||||||
|
//
|
||||||
|
// Rule 1: a<b, if:
|
||||||
|
// - blocks a,b have overlap in x-axis
|
||||||
|
// - block a is above block b
|
||||||
|
//
|
||||||
|
// Rule 2: a<b, if:
|
||||||
|
// - block a is entirely on left side of block b
|
||||||
|
// - there doesn't exist block c, which is between a,b in y-axis
|
||||||
|
// and moreover, overlaps both a and b in x-axis.
|
||||||
|
|
||||||
|
auto isBeforeByRule1 = [&blocks](const size_t aIndex, const size_t bIndex)
|
||||||
|
{
|
||||||
|
QRectF aBB = blocks[aIndex].getBoundingBox().boundingRect();
|
||||||
|
QRectF bBB = blocks[bIndex].getBoundingBox().boundingRect();
|
||||||
|
|
||||||
|
const bool isOverlappedOnHorizontalAxis = (aBB.right() < bBB.left() && aBB.left() < bBB.right()) || (bBB.right() < aBB.left() && bBB.left() < aBB.right());
|
||||||
|
const bool isAoverB = aBB.bottom() > bBB.top();
|
||||||
|
return isOverlappedOnHorizontalAxis && isAoverB;
|
||||||
|
};
|
||||||
|
auto isBeforeByRule2 = [&blocks](const size_t aIndex, const size_t bIndex)
|
||||||
|
{
|
||||||
|
QRectF aBB = blocks[aIndex].getBoundingBox().boundingRect();
|
||||||
|
QRectF bBB = blocks[bIndex].getBoundingBox().boundingRect();
|
||||||
|
QRectF abBB = aBB.united(bBB);
|
||||||
|
|
||||||
|
if (aBB.right() < bBB.left())
|
||||||
|
{
|
||||||
|
// Check, if 'c' block doesn't exist
|
||||||
|
for (size_t i = 0, count = blocks.size(); i < count; ++i)
|
||||||
|
{
|
||||||
|
if (i == aIndex || i == bIndex)
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
QRectF cBB = blocks[i].getBoundingBox().boundingRect();
|
||||||
|
if (cBB.top() >= abBB.top() && cBB.bottom() <= abBB.bottom())
|
||||||
|
{
|
||||||
|
const bool isAOverlappedOnHorizontalAxis = (aBB.right() < cBB.left() && aBB.left() < cBB.right()) || (cBB.right() < aBB.left() && cBB.left() < aBB.right());
|
||||||
|
const bool isBOverlappedOnHorizontalAxis = (cBB.right() < bBB.left() && cBB.left() < bBB.right()) || (bBB.right() < cBB.left() && bBB.left() < cBB.right());
|
||||||
|
if (isAOverlappedOnHorizontalAxis && isBOverlappedOnHorizontalAxis)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Order blocks using topological sort (https://en.wikipedia.org/wiki/Topological_sorting,
|
||||||
|
// Kahn's algorithm is used)
|
||||||
|
std::set<size_t> workBlocks;
|
||||||
|
std::vector<size_t> ordering;
|
||||||
|
std::vector<std::set<size_t>> orderingEdges(blocks.size(), std::set<size_t>());
|
||||||
|
ordering.reserve(blocks.size());
|
||||||
|
for (size_t i = 0; i < blocks.size(); ++i)
|
||||||
|
{
|
||||||
|
workBlocks.insert(workBlocks.end(), i);
|
||||||
|
for (size_t j = 0; j < blocks.size(); ++j)
|
||||||
|
{
|
||||||
|
if (isBeforeByRule1(j, i) || isBeforeByRule2(j, i))
|
||||||
|
{
|
||||||
|
orderingEdges[i].insert(j);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Topological sort
|
||||||
|
QMatrix invertedAngleMatrix = angleMatrix.inverted();
|
||||||
|
while (!workBlocks.empty())
|
||||||
|
{
|
||||||
|
auto it = std::min_element(workBlocks.begin(), workBlocks.end(), [&orderingEdges](const size_t l, const size_t r) { return orderingEdges[l].size() < orderingEdges[r].size(); });
|
||||||
|
ordering.push_back(*it);
|
||||||
|
for (std::set<size_t>& edges : orderingEdges)
|
||||||
|
{
|
||||||
|
edges.erase(*it);
|
||||||
|
}
|
||||||
|
|
||||||
|
blocks[*it].applyTransform(invertedAngleMatrix);
|
||||||
|
m_blocks.emplace_back(qMove(blocks[*it]));
|
||||||
|
workBlocks.erase(it);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TextCharacters PDFTextLayout::getCharactersForAngle(PDFReal angle) const
|
TextCharacters PDFTextLayout::getCharactersForAngle(PDFReal angle) const
|
||||||
@ -255,6 +363,15 @@ PDFTextLine::PDFTextLine(TextCharacters characters) :
|
|||||||
m_boundingBox.addRect(boundingBox);
|
m_boundingBox.addRect(boundingBox);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void PDFTextLine::applyTransform(const QMatrix& matrix)
|
||||||
|
{
|
||||||
|
m_boundingBox = matrix.map(m_boundingBox);
|
||||||
|
for (TextCharacter& character : m_characters)
|
||||||
|
{
|
||||||
|
character.applyTransform(matrix);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
PDFTextBlock::PDFTextBlock(PDFTextLines textLines) :
|
PDFTextBlock::PDFTextBlock(PDFTextLines textLines) :
|
||||||
m_lines(qMove(textLines))
|
m_lines(qMove(textLines))
|
||||||
{
|
{
|
||||||
@ -278,4 +395,19 @@ PDFTextBlock::PDFTextBlock(PDFTextLines textLines) :
|
|||||||
m_boundingBox.addRect(boundingBox);
|
m_boundingBox.addRect(boundingBox);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void PDFTextBlock::applyTransform(const QMatrix& matrix)
|
||||||
|
{
|
||||||
|
m_boundingBox = matrix.map(m_boundingBox);
|
||||||
|
for (PDFTextLine& textLine : m_lines)
|
||||||
|
{
|
||||||
|
textLine.applyTransform(matrix);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void TextCharacter::applyTransform(const QMatrix& matrix)
|
||||||
|
{
|
||||||
|
position = matrix.map(position);
|
||||||
|
boundingBox = matrix.map(boundingBox);
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace pdf
|
} // namespace pdf
|
||||||
|
@ -87,6 +87,8 @@ struct TextCharacter
|
|||||||
PDFReal fontSize = 0.0;
|
PDFReal fontSize = 0.0;
|
||||||
PDFReal advance = 0.0;
|
PDFReal advance = 0.0;
|
||||||
QPainterPath boundingBox;
|
QPainterPath boundingBox;
|
||||||
|
|
||||||
|
void applyTransform(const QMatrix& matrix);
|
||||||
};
|
};
|
||||||
|
|
||||||
using TextCharacters = std::vector<TextCharacter>;
|
using TextCharacters = std::vector<TextCharacter>;
|
||||||
@ -103,6 +105,8 @@ public:
|
|||||||
const TextCharacters& getCharacters() const { return m_characters; }
|
const TextCharacters& getCharacters() const { return m_characters; }
|
||||||
const QPainterPath& getBoundingBox() const { return m_boundingBox; }
|
const QPainterPath& getBoundingBox() const { return m_boundingBox; }
|
||||||
|
|
||||||
|
void applyTransform(const QMatrix& matrix);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
TextCharacters m_characters;
|
TextCharacters m_characters;
|
||||||
QPainterPath m_boundingBox;
|
QPainterPath m_boundingBox;
|
||||||
@ -119,6 +123,8 @@ public:
|
|||||||
const PDFTextLines& getLines() const { return m_lines; }
|
const PDFTextLines& getLines() const { return m_lines; }
|
||||||
const QPainterPath& getBoundingBox() const { return m_boundingBox; }
|
const QPainterPath& getBoundingBox() const { return m_boundingBox; }
|
||||||
|
|
||||||
|
void applyTransform(const QMatrix& matrix);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
PDFTextLines m_lines;
|
PDFTextLines m_lines;
|
||||||
QPainterPath m_boundingBox;
|
QPainterPath m_boundingBox;
|
||||||
@ -146,7 +152,6 @@ public:
|
|||||||
qint64 getMemoryConsumptionEstimate() const;
|
qint64 getMemoryConsumptionEstimate() const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
/// Makes layout for particular angle
|
/// Makes layout for particular angle
|
||||||
void performDoLayout(PDFReal angle);
|
void performDoLayout(PDFReal angle);
|
||||||
|
|
||||||
@ -163,6 +168,7 @@ private:
|
|||||||
TextCharacters m_characters;
|
TextCharacters m_characters;
|
||||||
std::set<PDFReal> m_angles;
|
std::set<PDFReal> m_angles;
|
||||||
PDFTextLayoutSettings m_settings;
|
PDFTextLayoutSettings m_settings;
|
||||||
|
PDFTextBlocks m_blocks;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace pdf
|
} // namespace pdf
|
||||||
|
Loading…
x
Reference in New Issue
Block a user