mirror of
https://github.com/JakubMelka/PDF4QT.git
synced 2025-06-05 21:59:17 +02:00
Text layout - first part
This commit is contained in:
@ -55,6 +55,7 @@ SOURCES += \
|
|||||||
sources/pdfpattern.cpp \
|
sources/pdfpattern.cpp \
|
||||||
sources/pdfprogress.cpp \
|
sources/pdfprogress.cpp \
|
||||||
sources/pdfsecurityhandler.cpp \
|
sources/pdfsecurityhandler.cpp \
|
||||||
|
sources/pdftextlayout.cpp \
|
||||||
sources/pdfutils.cpp \
|
sources/pdfutils.cpp \
|
||||||
sources/pdfxreftable.cpp \
|
sources/pdfxreftable.cpp \
|
||||||
sources/pdfvisitor.cpp \
|
sources/pdfvisitor.cpp \
|
||||||
@ -98,6 +99,7 @@ HEADERS += \
|
|||||||
sources/pdfpattern.h \
|
sources/pdfpattern.h \
|
||||||
sources/pdfprogress.h \
|
sources/pdfprogress.h \
|
||||||
sources/pdfsecurityhandler.h \
|
sources/pdfsecurityhandler.h \
|
||||||
|
sources/pdftextlayout.h \
|
||||||
sources/pdfxreftable.h \
|
sources/pdfxreftable.h \
|
||||||
sources/pdfflatmap.h \
|
sources/pdfflatmap.h \
|
||||||
sources/pdfvisitor.h \
|
sources/pdfvisitor.h \
|
||||||
|
@ -397,7 +397,7 @@ private:
|
|||||||
struct Glyph
|
struct Glyph
|
||||||
{
|
{
|
||||||
QPainterPath glyph;
|
QPainterPath glyph;
|
||||||
PDFReal advance;
|
PDFReal advance = 0.0;
|
||||||
};
|
};
|
||||||
|
|
||||||
static int outlineMoveTo(const FT_Vector* to, void* user);
|
static int outlineMoveTo(const FT_Vector* to, void* user);
|
||||||
|
@ -387,6 +387,11 @@ void PDFPageContentProcessor::performEndTransparencyGroup(ProcessOrder order, co
|
|||||||
Q_UNUSED(transparencyGroup);
|
Q_UNUSED(transparencyGroup);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void PDFPageContentProcessor::performOutputCharacter(const PDFTextCharacterInfo& info)
|
||||||
|
{
|
||||||
|
Q_UNUSED(info);
|
||||||
|
}
|
||||||
|
|
||||||
bool PDFPageContentProcessor::isContentSuppressed() const
|
bool PDFPageContentProcessor::isContentSuppressed() const
|
||||||
{
|
{
|
||||||
return std::any_of(m_markedContentStack.cbegin(), m_markedContentStack.cend(), [](const MarkedContentState& state) { return state.contentSuppressed; });
|
return std::any_of(m_markedContentStack.cbegin(), m_markedContentStack.cend(), [](const MarkedContentState& state) { return state.contentSuppressed; });
|
||||||
@ -2893,13 +2898,26 @@ void PDFPageContentProcessor::drawText(const TextSequence& textSequence)
|
|||||||
if (!glyphPath.isEmpty())
|
if (!glyphPath.isEmpty())
|
||||||
{
|
{
|
||||||
QMatrix textRenderingMatrix = adjustMatrix * textMatrix;
|
QMatrix textRenderingMatrix = adjustMatrix * textMatrix;
|
||||||
|
QMatrix toDeviceSpaceTransform = textRenderingMatrix * m_graphicState.getCurrentTransformationMatrix();
|
||||||
QPainterPath transformedGlyph = textRenderingMatrix.map(glyphPath);
|
QPainterPath transformedGlyph = textRenderingMatrix.map(glyphPath);
|
||||||
processPathPainting(transformedGlyph, stroke, fill, true, transformedGlyph.fillRule());
|
processPathPainting(transformedGlyph, stroke, fill, true, transformedGlyph.fillRule());
|
||||||
|
|
||||||
|
if (!item.character.isNull() && !item.character.isSpace())
|
||||||
|
{
|
||||||
|
// Output character
|
||||||
|
PDFTextCharacterInfo info;
|
||||||
|
info.character = item.character;
|
||||||
|
info.isVerticalWritingSystem = !isHorizontalWritingSystem;
|
||||||
|
info.advance = item.advance;
|
||||||
|
info.fontSize = fontSize;
|
||||||
|
info.outline = glyphPath;
|
||||||
|
info.matrix = toDeviceSpaceTransform;
|
||||||
|
performOutputCharacter(info);
|
||||||
|
}
|
||||||
|
|
||||||
if (clipped)
|
if (clipped)
|
||||||
{
|
{
|
||||||
// Clipping is enabled, we must transform to the device coordinates
|
// Clipping is enabled, we must transform to the device coordinates
|
||||||
QMatrix toDeviceSpaceTransform = textRenderingMatrix * m_graphicState.getCurrentTransformationMatrix();
|
|
||||||
m_textClippingPath = m_textClippingPath.united(toDeviceSpaceTransform.map(glyphPath));
|
m_textClippingPath = m_textClippingPath.united(toDeviceSpaceTransform.map(glyphPath));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2970,6 +2988,18 @@ void PDFPageContentProcessor::drawText(const TextSequence& textSequence)
|
|||||||
updateGraphicState();
|
updateGraphicState();
|
||||||
|
|
||||||
processContent(*item.characterContentStream);
|
processContent(*item.characterContentStream);
|
||||||
|
|
||||||
|
if (!item.character.isNull() && !item.character.isSpace())
|
||||||
|
{
|
||||||
|
// Output character
|
||||||
|
PDFTextCharacterInfo info;
|
||||||
|
info.character = item.character;
|
||||||
|
info.isVerticalWritingSystem = !isHorizontalWritingSystem;
|
||||||
|
info.advance = item.advance;
|
||||||
|
info.fontSize = fontSize;
|
||||||
|
info.matrix = worldMatrix;
|
||||||
|
performOutputCharacter(info);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
textMatrix.translate(displacementX, 0.0);
|
textMatrix.translate(displacementX, 0.0);
|
||||||
|
@ -25,6 +25,7 @@
|
|||||||
#include "pdfutils.h"
|
#include "pdfutils.h"
|
||||||
#include "pdfmeshqualitysettings.h"
|
#include "pdfmeshqualitysettings.h"
|
||||||
#include "pdfblendfunction.h"
|
#include "pdfblendfunction.h"
|
||||||
|
#include "pdftextlayout.h"
|
||||||
|
|
||||||
#include <QMatrix>
|
#include <QMatrix>
|
||||||
#include <QPainterPath>
|
#include <QPainterPath>
|
||||||
@ -489,6 +490,9 @@ protected:
|
|||||||
/// \param transparencyGroup Transparency group
|
/// \param transparencyGroup Transparency group
|
||||||
virtual void performEndTransparencyGroup(ProcessOrder order, const PDFTransparencyGroup& transparencyGroup);
|
virtual void performEndTransparencyGroup(ProcessOrder order, const PDFTransparencyGroup& transparencyGroup);
|
||||||
|
|
||||||
|
/// Implement to react on character printing
|
||||||
|
virtual void performOutputCharacter(const PDFTextCharacterInfo& info);
|
||||||
|
|
||||||
/// Returns current graphic state
|
/// Returns current graphic state
|
||||||
const PDFPageContentProcessorState* getGraphicState() const { return &m_graphicState; }
|
const PDFPageContentProcessorState* getGraphicState() const { return &m_graphicState; }
|
||||||
|
|
||||||
|
@ -447,6 +447,11 @@ void PDFPrecompiledPageGenerator::setCompositionMode(QPainter::CompositionMode m
|
|||||||
m_precompiledPage->addSetCompositionMode(mode);
|
m_precompiledPage->addSetCompositionMode(mode);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void PDFPrecompiledPageGenerator::performOutputCharacter(const PDFTextCharacterInfo& info)
|
||||||
|
{
|
||||||
|
m_precompiledPage->addCharacter(info);
|
||||||
|
}
|
||||||
|
|
||||||
void PDFPrecompiledPage::draw(QPainter* painter, const QRectF& cropBox, const QMatrix& pagePointToDevicePointMatrix, PDFRenderer::Features features) const
|
void PDFPrecompiledPage::draw(QPainter* painter, const QRectF& cropBox, const QMatrix& pagePointToDevicePointMatrix, PDFRenderer::Features features) const
|
||||||
{
|
{
|
||||||
Q_ASSERT(painter);
|
Q_ASSERT(painter);
|
||||||
@ -600,6 +605,16 @@ void PDFPrecompiledPage::addSetCompositionMode(QPainter::CompositionMode composi
|
|||||||
m_compositionModes.push_back(compositionMode);
|
m_compositionModes.push_back(compositionMode);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void PDFPrecompiledPage::addCharacter(const PDFTextCharacterInfo& info)
|
||||||
|
{
|
||||||
|
m_textLayout.addCharacter(info);
|
||||||
|
}
|
||||||
|
|
||||||
|
void PDFPrecompiledPage::createTextLayout()
|
||||||
|
{
|
||||||
|
m_textLayout.perform();
|
||||||
|
}
|
||||||
|
|
||||||
void PDFPrecompiledPage::optimize()
|
void PDFPrecompiledPage::optimize()
|
||||||
{
|
{
|
||||||
m_instructions.shrink_to_fit();
|
m_instructions.shrink_to_fit();
|
||||||
@ -609,6 +624,7 @@ void PDFPrecompiledPage::optimize()
|
|||||||
m_meshes.shrink_to_fit();
|
m_meshes.shrink_to_fit();
|
||||||
m_matrices.shrink_to_fit();
|
m_matrices.shrink_to_fit();
|
||||||
m_compositionModes.shrink_to_fit();
|
m_compositionModes.shrink_to_fit();
|
||||||
|
m_textLayout.optimize();
|
||||||
}
|
}
|
||||||
|
|
||||||
void PDFPrecompiledPage::finalize(qint64 compilingTimeNS, QList<PDFRenderError> errors)
|
void PDFPrecompiledPage::finalize(qint64 compilingTimeNS, QList<PDFRenderError> errors)
|
||||||
@ -651,6 +667,8 @@ void PDFPrecompiledPage::finalize(qint64 compilingTimeNS, QList<PDFRenderError>
|
|||||||
{
|
{
|
||||||
m_memoryConsumptionEstimate += data.mesh.getMemoryConsumptionEstimate();
|
m_memoryConsumptionEstimate += data.mesh.getMemoryConsumptionEstimate();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
m_memoryConsumptionEstimate += m_textLayout.getMemoryConsumptionEstimate();
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace pdf
|
} // namespace pdf
|
||||||
|
@ -22,6 +22,7 @@
|
|||||||
#include "pdfpattern.h"
|
#include "pdfpattern.h"
|
||||||
#include "pdfrenderer.h"
|
#include "pdfrenderer.h"
|
||||||
#include "pdfpagecontentprocessor.h"
|
#include "pdfpagecontentprocessor.h"
|
||||||
|
#include "pdftextlayout.h"
|
||||||
|
|
||||||
#include <QPen>
|
#include <QPen>
|
||||||
#include <QBrush>
|
#include <QBrush>
|
||||||
@ -191,6 +192,10 @@ public:
|
|||||||
void addRestoreGraphicState() { m_instructions.emplace_back(InstructionType::RestoreGraphicState, 0); }
|
void addRestoreGraphicState() { m_instructions.emplace_back(InstructionType::RestoreGraphicState, 0); }
|
||||||
void addSetWorldMatrix(const QMatrix& matrix);
|
void addSetWorldMatrix(const QMatrix& matrix);
|
||||||
void addSetCompositionMode(QPainter::CompositionMode compositionMode);
|
void addSetCompositionMode(QPainter::CompositionMode compositionMode);
|
||||||
|
void addCharacter(const PDFTextCharacterInfo& info);
|
||||||
|
|
||||||
|
/// Creates text layout for the page
|
||||||
|
void createTextLayout();
|
||||||
|
|
||||||
/// Optimizes page memory allocation to contain less space
|
/// Optimizes page memory allocation to contain less space
|
||||||
void optimize();
|
void optimize();
|
||||||
@ -279,6 +284,7 @@ private:
|
|||||||
std::vector<QMatrix> m_matrices;
|
std::vector<QMatrix> m_matrices;
|
||||||
std::vector<QPainter::CompositionMode> m_compositionModes;
|
std::vector<QPainter::CompositionMode> m_compositionModes;
|
||||||
QList<PDFRenderError> m_errors;
|
QList<PDFRenderError> m_errors;
|
||||||
|
PDFTextLayout m_textLayout;
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Processor, which processes PDF's page commands and writes them to the precompiled page.
|
/// Processor, which processes PDF's page commands and writes them to the precompiled page.
|
||||||
@ -311,6 +317,7 @@ protected:
|
|||||||
virtual void performRestoreGraphicState(ProcessOrder order) override;
|
virtual void performRestoreGraphicState(ProcessOrder order) override;
|
||||||
virtual void setWorldMatrix(const QMatrix& matrix) override;
|
virtual void setWorldMatrix(const QMatrix& matrix) override;
|
||||||
virtual void setCompositionMode(QPainter::CompositionMode mode) override;
|
virtual void setCompositionMode(QPainter::CompositionMode mode) override;
|
||||||
|
virtual void performOutputCharacter(const PDFTextCharacterInfo& info) override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
PDFPrecompiledPage* m_precompiledPage;
|
PDFPrecompiledPage* m_precompiledPage;
|
||||||
|
@ -145,6 +145,7 @@ void PDFRenderer::compile(PDFPrecompiledPage* precompiledPage, size_t pageIndex)
|
|||||||
PDFPrecompiledPageGenerator generator(precompiledPage, m_features, page, m_document, m_fontCache, m_cms, m_optionalContentActivity, m_meshQualitySettings);
|
PDFPrecompiledPageGenerator generator(precompiledPage, m_features, page, m_document, m_fontCache, m_cms, m_optionalContentActivity, m_meshQualitySettings);
|
||||||
QList<PDFRenderError> errors = generator.processContents();
|
QList<PDFRenderError> errors = generator.processContents();
|
||||||
precompiledPage->optimize();
|
precompiledPage->optimize();
|
||||||
|
precompiledPage->createTextLayout();
|
||||||
precompiledPage->finalize(timer.nsecsElapsed(), qMove(errors));
|
precompiledPage->finalize(timer.nsecsElapsed(), qMove(errors));
|
||||||
timer.invalidate();
|
timer.invalidate();
|
||||||
}
|
}
|
||||||
|
281
PdfForQtLib/sources/pdftextlayout.cpp
Normal file
281
PdfForQtLib/sources/pdftextlayout.cpp
Normal file
@ -0,0 +1,281 @@
|
|||||||
|
// Copyright (C) 2019 Jakub Melka
|
||||||
|
//
|
||||||
|
// This file is part of PdfForQt.
|
||||||
|
//
|
||||||
|
// PdfForQt is free software: you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU Lesser General Public License as published by
|
||||||
|
// the Free Software Foundation, either version 3 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
//
|
||||||
|
// PdfForQt is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU Lesser General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU Lesser General Public License
|
||||||
|
// along with PDFForQt. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
#include "pdftextlayout.h"
|
||||||
|
#include "pdfutils.h"
|
||||||
|
|
||||||
|
#include <execution>
|
||||||
|
|
||||||
|
namespace pdf
|
||||||
|
{
|
||||||
|
|
||||||
|
/// Creates an empty layout; all members rely on their own default initialization.
PDFTextLayout::PDFTextLayout() = default;
|
||||||
|
|
||||||
|
void PDFTextLayout::addCharacter(const PDFTextCharacterInfo& info)
|
||||||
|
{
|
||||||
|
TextCharacter character;
|
||||||
|
|
||||||
|
// Fill the basic info. For computing the angle, we must consider, if we are
|
||||||
|
// in vertical writing system. If yes, take vertical edge of the character,
|
||||||
|
// otherwise take horizontal edge.
|
||||||
|
character.character = info.character;
|
||||||
|
character.position = info.matrix.map(QPointF(0.0, 0.0));
|
||||||
|
|
||||||
|
QLineF testLine(QPointF(0.0, 0.0), QPointF(info.isVerticalWritingSystem ? 0.0 : info.advance, !info.isVerticalWritingSystem ? 0.0 : info.advance));
|
||||||
|
QLineF mappedLine = info.matrix.map(testLine);
|
||||||
|
character.advance = mappedLine.length();
|
||||||
|
character.angle = qRound(mappedLine.angle());
|
||||||
|
|
||||||
|
QLineF fontTestLine(QPointF(0.0, 0.0), QPointF(0.0, info.fontSize));
|
||||||
|
QLineF fontMappedLine = info.matrix.map(fontTestLine);
|
||||||
|
character.fontSize = fontMappedLine.length();
|
||||||
|
|
||||||
|
QRectF boundingBox = info.outline.boundingRect();
|
||||||
|
character.boundingBox.addPolygon(info.matrix.map(boundingBox));
|
||||||
|
|
||||||
|
m_characters.emplace_back(qMove(character));
|
||||||
|
m_angles.insert(character.angle);
|
||||||
|
}
|
||||||
|
|
||||||
|
void PDFTextLayout::perform()
|
||||||
|
{
|
||||||
|
for (PDFReal angle : m_angles)
|
||||||
|
{
|
||||||
|
performDoLayout(angle);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void PDFTextLayout::optimize()
|
||||||
|
{
|
||||||
|
m_characters.shrink_to_fit();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Estimates the number of bytes occupied by this layout: the object itself,
/// the allocated character storage and the stored angles.
qint64 PDFTextLayout::getMemoryConsumptionEstimate() const
{
    // Character storage is counted by capacity (allocated slots), angles by element count
    const auto charactersBytes = sizeof(decltype(m_characters)::value_type) * m_characters.capacity();
    const auto anglesBytes = sizeof(decltype(m_angles)::value_type) * m_angles.size();

    return static_cast<qint64>(sizeof(*this) + charactersBytes + anglesBytes);
}
|
||||||
|
|
||||||
|
/// Runs the layout pass for all characters having exactly the given angle:
/// groups them into text lines and groups the lines into text blocks.
/// \param angle Angle (as stored in m_angles) selecting the characters to process
void PDFTextLayout::performDoLayout(PDFReal angle)
{
    // We will implement variation of 'docstrum' algorithm, we have divided characters by angles,
    // for each angle we get characters for that particular angle, and run 'docstrum' algorithm.
    // We will do following steps:
    //      1) Rotate the plane with characters so that they are all in horizontal line
    //      2) Find k-nearest characters for each character (so each character will have
    //         k pointers to the nearest characters)
    //      3) Find text lines. We will do that by creating transitive closure of characters, i.e.
    //         characters, which are close and are on horizontal line, are marked as in one text line.
    //         Consider also font size and empty space size between different characters.
    //      4) Merge text lines into text blocks using various criteria, such as overlap,
    //         distance between the lines, and also using again, transitive closure.
    //      5) Sort blocks using topological ordering
    TextCharacters characters = getCharactersForAngle(angle);

    // Step 1) - rotate blocks
    QMatrix angleMatrix;
    angleMatrix.rotate(angle);
    applyTransform(characters, angleMatrix);

    // Step 2) - find k-nearest characters
    struct NearestCharacterInfo
    {
        size_t index = std::numeric_limits<size_t>::max();
        PDFReal distance = std::numeric_limits<PDFReal>::infinity();

        inline bool operator<(const NearestCharacterInfo& other) const { return distance < other.distance; }
    };

    const size_t characterCount = characters.size();

    // Each character owns a bucket of 'samples' sorted neighbours plus one scratch slot
    // at the end, which is used to insert a new candidate.
    const size_t bucketSize = m_settings.samples + 1;
    std::vector<NearestCharacterInfo> nearestCharacters(bucketSize * characterCount, NearestCharacterInfo());

    auto findNearestCharacters = [&](size_t currentCharacterIndex)
    {
        // It will be iterator to the start of the nearest neighbour sequence
        auto it = std::next(nearestCharacters.begin(), currentCharacterIndex * bucketSize);
        auto itLast = std::next(it, m_settings.samples);
        NearestCharacterInfo& insertInfo = *itLast;
        QPointF currentPoint = characters[currentCharacterIndex].position;

        for (size_t i = 0; i < characterCount; ++i)
        {
            if (i == currentCharacterIndex)
            {
                continue;
            }

            insertInfo.index = i;
            insertInfo.distance = QLineF(currentPoint, characters[i].position).length();

            // Now, use insert sort to sort the array of samples + 1 elements (#samples elements
            // are sorted, we use only insert sort on the last element).
            auto itInsert = std::upper_bound(it, itLast, *itLast);
            std::rotate(itInsert, itLast, itLast + 1);
        }
    };

    // Every invocation writes only into the bucket of its own character, so the
    // invocations are independent of each other.
    auto range = PDFIntegerRange<size_t>(0, characterCount);
    std::for_each(std::execution::par, range.begin(), range.end(), findNearestCharacters);

    // Step 3) - detect lines
    PDFUnionFindAlgorithm<size_t> textLinesUF(characterCount);
    for (size_t i = 0; i < characterCount; ++i)
    {
        auto it = std::next(nearestCharacters.begin(), i * bucketSize);
        auto itEnd = std::next(it, m_settings.samples);

        for (; it != itEnd; ++it)
        {
            const NearestCharacterInfo& info = *it;
            if (info.index == std::numeric_limits<size_t>::max())
            {
                // We have reached the end - or we do not have enough characters
                break;
            }

            // Criteria:
            //      1) Distance of characters is not too large
            //      2) Characters are approximately at same line
            //      3) Font size of characters are approximately equal

            PDFReal fontSizeMax = qMax(characters[i].fontSize, characters[info.index].fontSize);
            PDFReal fontSizeMin = qMin(characters[i].fontSize, characters[info.index].fontSize);

            // Note: if fontSizeMin is zero, criterion 2) is false (distance can't be less
            // than zero), so the division in criterion 3) is not evaluated.
            if (info.distance < m_settings.distanceSensitivity * characters[i].advance && // 1)
                std::fabs(characters[i].position.y() - characters[info.index].position.y()) < fontSizeMin * m_settings.charactersOnLineSensitivity && // 2)
                fontSizeMax / fontSizeMin < m_settings.fontSensitivity) // 3)
            {
                textLinesUF.unify(i, info.index);
            }
        }
    }

    // Group characters by the representative of their union-find set
    std::map<size_t, TextCharacters> lineToCharactersMap;
    for (size_t i = 0; i < characterCount; ++i)
    {
        lineToCharactersMap[textLinesUF.find(i)].push_back(characters[i]);
    }

    PDFTextLines lines;
    lines.reserve(lineToCharactersMap.size());
    for (auto& item : lineToCharactersMap)
    {
        lines.emplace_back(qMove(item.second));
    }

    // Step 4) - detect text blocks
    const size_t lineCount = lines.size();

    // Bounding rectangles of lines do not change during the pairwise comparison,
    // so compute them once instead of in every iteration of the nested loop.
    std::vector<QRectF> lineRects;
    lineRects.reserve(lineCount);
    for (const PDFTextLine& line : lines)
    {
        lineRects.push_back(line.getBoundingBox().boundingRect());
    }

    PDFUnionFindAlgorithm<size_t> textBlocksUF(lineCount);
    for (size_t i = 0; i < lineCount; ++i)
    {
        const QRectF& bb1 = lineRects[i];

        for (size_t j = i + 1; j < lineCount; ++j)
        {
            const QRectF& bb2 = lineRects[j];

            // Jakub Melka: we will join two blocks, if these two conditions both holds:
            //     1) bounding boxes overlap horizontally by large portion
            //     2) vertical space between bounding boxes is not too large

            QRectF bbUnion = bb1.united(bb2);
            const PDFReal height = bbUnion.height();
            const PDFReal heightLimit = (bb1.height() + bb2.height()) * m_settings.blockVerticalSensitivity;
            const PDFReal overlap = qMax(0.0, bb1.width() + bb2.width() - bbUnion.width());
            const PDFReal minimalOverlap = qMin(bb1.width(), bb2.width()) * m_settings.blockOverlapSensitivity;
            if (height < heightLimit && overlap > minimalOverlap)
            {
                textBlocksUF.unify(i, j);
            }
        }
    }

    // Group lines by the representative of their union-find set
    std::map<size_t, PDFTextLines> blockToLines;
    for (size_t i = 0; i < lineCount; ++i)
    {
        blockToLines[textBlocksUF.find(i)].push_back(qMove(lines[i]));
    }

    PDFTextBlocks blocks;
    blocks.reserve(blockToLines.size());
    for (auto& item : blockToLines)
    {
        blocks.emplace_back(qMove(item.second));
    }

    // TODO: Transform blocks back to original coordinate system. At this point the
    // blocks are still in the rotated coordinate system and are not stored anywhere.
}
|
||||||
|
|
||||||
|
/// Collects copies of all stored characters whose angle equals the given angle
/// (exact floating point comparison), preserving their stored order.
/// \param angle Angle to select
TextCharacters PDFTextLayout::getCharactersForAngle(PDFReal angle) const
{
    TextCharacters matching;

    for (const TextCharacter& candidate : m_characters)
    {
        if (candidate.angle == angle)
        {
            matching.push_back(candidate);
        }
    }

    return matching;
}
|
||||||
|
|
||||||
|
/// Maps the position and the bounding box of every character by the given matrix, in place.
/// \param characters Characters to be transformed
/// \param matrix Transform matrix
void PDFTextLayout::applyTransform(TextCharacters& characters, const QMatrix& matrix)
{
    std::for_each(characters.begin(), characters.end(), [&matrix](TextCharacter& item)
    {
        item.position = matrix.map(item.position);
        item.boundingBox = matrix.map(item.boundingBox);
    });
}
|
||||||
|
|
||||||
|
/// Takes ownership of the characters, orders them by ascending x-coordinate
/// and stores the union of their bounding rectangles as the line bounding box.
/// \param characters Characters forming the line
PDFTextLine::PDFTextLine(TextCharacters characters) :
    m_characters(qMove(characters))
{
    const auto byHorizontalPosition = [](const TextCharacter& lhs, const TextCharacter& rhs)
    {
        return lhs.position.x() < rhs.position.x();
    };
    std::sort(m_characters.begin(), m_characters.end(), byHorizontalPosition);

    // Accumulate the union of bounding rectangles of all characters
    QRectF unitedRect;
    for (const TextCharacter& item : m_characters)
    {
        unitedRect |= item.boundingBox.boundingRect();
    }

    m_boundingBox.addRect(unitedRect);
}
|
||||||
|
|
||||||
|
/// Takes ownership of the lines, orders them by the y-coordinate of their bounding
/// rectangle (rounded to hundredths) and then by x-coordinate, and stores the union
/// of their bounding rectangles as the block bounding box.
/// \param textLines Lines forming the block
PDFTextBlock::PDFTextBlock(PDFTextLines textLines) :
    m_lines(qMove(textLines))
{
    const auto lineOrder = [](const PDFTextLine& l, const PDFTextLine& r)
    {
        const QRectF leftRect = l.getBoundingBox().boundingRect();
        const QRectF rightRect = r.getBoundingBox().boundingRect();

        // Primary key: y rounded to 1/100, so nearly equal y-coordinates compare as equal
        const PDFReal yL = qRound(leftRect.y() * 100.0);
        const PDFReal yR = qRound(rightRect.y() * 100.0);
        if (yL != yR)
        {
            return yL < yR;
        }

        // Secondary key: x-coordinate
        return leftRect.x() < rightRect.x();
    };
    std::sort(m_lines.begin(), m_lines.end(), lineOrder);

    // Accumulate the union of bounding rectangles of all lines
    QRectF unitedRect;
    for (const PDFTextLine& currentLine : m_lines)
    {
        unitedRect = unitedRect.united(currentLine.getBoundingBox().boundingRect());
    }

    m_boundingBox.addRect(unitedRect);
}
|
||||||
|
|
||||||
|
} // namespace pdf
|
170
PdfForQtLib/sources/pdftextlayout.h
Normal file
170
PdfForQtLib/sources/pdftextlayout.h
Normal file
@ -0,0 +1,170 @@
|
|||||||
|
// Copyright (C) 2019 Jakub Melka
|
||||||
|
//
|
||||||
|
// This file is part of PdfForQt.
|
||||||
|
//
|
||||||
|
// PdfForQt is free software: you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU Lesser General Public License as published by
|
||||||
|
// the Free Software Foundation, either version 3 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
//
|
||||||
|
// PdfForQt is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU Lesser General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU Lesser General Public License
|
||||||
|
// along with PDFForQt. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
#ifndef PDFTEXTLAYOUT_H
|
||||||
|
#define PDFTEXTLAYOUT_H
|
||||||
|
|
||||||
|
#include "pdfglobal.h"
|
||||||
|
|
||||||
|
#include <QPainterPath>
|
||||||
|
|
||||||
|
#include <set>
|
||||||
|
|
||||||
|
namespace pdf
|
||||||
|
{
|
||||||
|
|
||||||
|
struct PDFTextCharacterInfo
|
||||||
|
{
|
||||||
|
/// Character
|
||||||
|
QChar character;
|
||||||
|
|
||||||
|
/// Character path
|
||||||
|
QPainterPath outline;
|
||||||
|
|
||||||
|
/// Do we use a vertical writing system?
|
||||||
|
bool isVerticalWritingSystem = false;
|
||||||
|
|
||||||
|
/// Advance (in character space, it must be translated
|
||||||
|
/// into device space), for both vertical/horizontal modes.
|
||||||
|
PDFReal advance = 0.0;
|
||||||
|
|
||||||
|
/// Font size (in character space, it must be translated
|
||||||
|
/// into device space)
|
||||||
|
PDFReal fontSize = 0.0;
|
||||||
|
|
||||||
|
/// Transformation matrix from character space to device space
|
||||||
|
QMatrix matrix;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct PDFTextLayoutSettings
|
||||||
|
{
|
||||||
|
/// Number of samples for 'docstrum' algorithm, i.e. number of
|
||||||
|
/// nearest characters. By default, 5 characters should fit.
|
||||||
|
size_t samples = 5;
|
||||||
|
|
||||||
|
/// Distance sensitivity to determine, if characters are close enough.
|
||||||
|
/// Maximal distance is computed as current character advance multiplied
|
||||||
|
/// by this constant.
|
||||||
|
PDFReal distanceSensitivity = 4.0;
|
||||||
|
|
||||||
|
/// Maximal vertical distance, in portion of font size, of two characters
|
||||||
|
/// to be considered they lie on same line.
|
||||||
|
PDFReal charactersOnLineSensitivity = 0.25;
|
||||||
|
|
||||||
|
/// Maximal ratio between font size of characters to be considered
|
||||||
|
/// that they lie on same line.
|
||||||
|
PDFReal fontSensitivity = 2.0;
|
||||||
|
|
||||||
|
/// Maximal space ratio between two lines of block. Default coefficient
|
||||||
|
/// means, that height ratio limit is (height1 + height2)
|
||||||
|
PDFReal blockVerticalSensitivity = 1.5;
|
||||||
|
|
||||||
|
/// Minimal horizontal overlap for two lines considered to be in one block
|
||||||
|
PDFReal blockOverlapSensitivity = 0.3;
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Represents character in device space coordinates. All values (dimensions,
|
||||||
|
/// bounding box, etc. are in device space coordinates).
|
||||||
|
struct TextCharacter
|
||||||
|
{
|
||||||
|
QChar character;
|
||||||
|
QPointF position;
|
||||||
|
PDFReal angle = 0.0;
|
||||||
|
PDFReal fontSize = 0.0;
|
||||||
|
PDFReal advance = 0.0;
|
||||||
|
QPainterPath boundingBox;
|
||||||
|
};
|
||||||
|
|
||||||
|
using TextCharacters = std::vector<TextCharacter>;
|
||||||
|
|
||||||
|
/// Represents text line consisting of set of characters and line bounding box.
|
||||||
|
class PDFTextLine
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
/// Construct new line from characters. Characters are sorted in x-coordinate
|
||||||
|
/// and bounding box is computed.
|
||||||
|
/// \param characters
|
||||||
|
explicit PDFTextLine(TextCharacters characters);
|
||||||
|
|
||||||
|
const TextCharacters& getCharacters() const { return m_characters; }
|
||||||
|
const QPainterPath& getBoundingBox() const { return m_boundingBox; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
TextCharacters m_characters;
|
||||||
|
QPainterPath m_boundingBox;
|
||||||
|
};
|
||||||
|
|
||||||
|
using PDFTextLines = std::vector<PDFTextLine>;
|
||||||
|
|
||||||
|
/// Represents text block consisting of set of lines and block bounding box.
|
||||||
|
class PDFTextBlock
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
explicit inline PDFTextBlock(PDFTextLines textLines);
|
||||||
|
|
||||||
|
const PDFTextLines& getLines() const { return m_lines; }
|
||||||
|
const QPainterPath& getBoundingBox() const { return m_boundingBox; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
PDFTextLines m_lines;
|
||||||
|
QPainterPath m_boundingBox;
|
||||||
|
};
|
||||||
|
|
||||||
|
using PDFTextBlocks = std::vector<PDFTextBlock>;
|
||||||
|
|
||||||
|
/// Text layout of single page. Can handle various fonts, various angles of lines
|
||||||
|
/// and vertically oriented text. It performs the "docstrum" algorithm.
|
||||||
|
class PDFTextLayout
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
explicit PDFTextLayout();
|
||||||
|
|
||||||
|
/// Adds character to the layout
|
||||||
|
void addCharacter(const PDFTextCharacterInfo& info);
|
||||||
|
|
||||||
|
/// Perorms text layout algorithm
|
||||||
|
void perform();
|
||||||
|
|
||||||
|
/// Optimizes layout memory allocation to contain less space
|
||||||
|
void optimize();
|
||||||
|
|
||||||
|
/// Returns estimate of number of bytes, which this mesh occupies in memory
|
||||||
|
qint64 getMemoryConsumptionEstimate() const;
|
||||||
|
|
||||||
|
private:
|
||||||
|
|
||||||
|
/// Makes layout for particular angle
|
||||||
|
void performDoLayout(PDFReal angle);
|
||||||
|
|
||||||
|
/// Returns a list of characters for particular angle. Exact match is used
|
||||||
|
/// for angle, even if angle is floating point number.
|
||||||
|
/// \param angle Angle
|
||||||
|
TextCharacters getCharactersForAngle(PDFReal angle) const;
|
||||||
|
|
||||||
|
/// Applies transform to text characters (positions and bounding boxes)
|
||||||
|
/// \param characters Characters
|
||||||
|
/// \param matrix Transform matrix
|
||||||
|
void applyTransform(TextCharacters& characters, const QMatrix& matrix);
|
||||||
|
|
||||||
|
TextCharacters m_characters;
|
||||||
|
std::set<PDFReal> m_angles;
|
||||||
|
PDFTextLayoutSettings m_settings;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace pdf
|
||||||
|
|
||||||
|
#endif // PDFTEXTLAYOUT_H
|
@ -362,6 +362,51 @@ public:
|
|||||||
static std::vector<PDFDependentLibraryInfo> getLibraryInfo();
|
static std::vector<PDFDependentLibraryInfo> getLibraryInfo();
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// Union-find algorithm, which uses path compression optimization. It can run in time
/// O(n + f * (1 + log(n) / log(2 + f / n))), where n is number of unions (resp. size of the
/// array) and f is number of find operations.
/// \tparam T Integer type used for indices of elements
template<typename T>
class PDFUnionFindAlgorithm
{
public:
    /// Creates \p size disjoint sets, each element being its own representative.
    /// \param size Number of elements (indices 0 .. size - 1)
    explicit PDFUnionFindAlgorithm(T size)
    {
        m_indices.resize(size, T(0));

        // Seed with T(0), so the sequence is generated in type T (not in int)
        std::iota(m_indices.begin(), m_indices.end(), T(0));
    }

    /// Returns representative of the set containing \p index, and compresses
    /// the path from \p index to the representative.
    /// \param index Index of the element
    T find(T index)
    {
        // Unions are linked by index (not by rank), so chains can become long.
        // Therefore the search is iterative, which keeps stack usage constant.

        // First pass: walk up to the representative
        T root = index;
        while (m_indices[root] != root)
        {
            root = m_indices[root];
        }

        // Second pass: point every element on the path directly to the representative
        while (m_indices[index] != root)
        {
            const T next = m_indices[index];
            m_indices[index] = root;
            index = next;
        }

        return root;
    }

    /// Merges sets containing \p x and \p y. The smaller of the two representatives
    /// becomes the representative of the merged set.
    /// \param x Index of the first element
    /// \param y Index of the second element
    void unify(T x, T y)
    {
        T xRoot = find(x);
        T yRoot = find(y);

        if (xRoot < yRoot)
        {
            m_indices[yRoot] = xRoot;
        }
        else if (xRoot > yRoot)
        {
            m_indices[xRoot] = yRoot;
        }
    }

private:
    /// m_indices[i] is the parent of element i; representatives are their own parents
    std::vector<T> m_indices;
};
|
||||||
|
|
||||||
} // namespace pdf
|
} // namespace pdf
|
||||||
|
|
||||||
#endif // PDFUTILS_H
|
#endif // PDFUTILS_H
|
||||||
|
Reference in New Issue
Block a user