Font info tool - added character maps

This commit is contained in:
Jakub Melka 2020-10-25 13:51:57 +01:00
parent acee5f2186
commit f43459b88e
5 changed files with 239 additions and 3 deletions

View File

@ -363,6 +363,9 @@ public:
/// Returns postscript name of the font
virtual QString getPostScriptName() const { return QString(); }
/// Returns character info
virtual CharacterInfos getCharacterInfos() const = 0;
};
/// Implementation of the PDFRealizedFont class using PIMPL pattern for Type 3 fonts
@ -374,6 +377,7 @@ public:
virtual void fillTextSequence(const QByteArray& byteArray, TextSequence& textSequence, PDFRenderErrorReporter* reporter) override;
virtual bool isHorizontalWritingSystem() const override;
virtual CharacterInfos getCharacterInfos() const override;
private:
/// Pixel size of the font
@ -394,6 +398,7 @@ public:
virtual bool isHorizontalWritingSystem() const override { return !m_isVertical; }
virtual void dumpFontToTreeItem(QTreeWidgetItem* item) const override;
virtual QString getPostScriptName() const override { return m_postScriptName; }
virtual CharacterInfos getCharacterInfos() const override;
static constexpr const PDFReal PIXEL_SIZE_MULTIPLIER = 100.0;
@ -581,6 +586,107 @@ void PDFRealizedFontImpl::fillTextSequence(const QByteArray& byteArray, TextSequ
}
}
/// Collects (glyph index, unicode character) pairs for the realized font.
/// Simple fonts (Type1, TrueType, MMType1) are enumerated via the parent font's
/// encoding table; Type0 fonts are enumerated via the FreeType character map,
/// with a fallback that probes CIDs one by one. Any other font type yields
/// an empty result (and asserts in debug builds).
CharacterInfos PDFRealizedFontImpl::getCharacterInfos() const
{
    CharacterInfos result;

    // Shared helper, which stores one entry into the result
    auto appendInfo = [&result](GID gid, QChar character)
    {
        CharacterInfo info;
        info.gid = gid;
        info.character = character;
        result.emplace_back(qMove(info));
    };

    switch (m_parentFont->getFontType())
    {
        case FontType::Type1:
        case FontType::TrueType:
        case FontType::MMType1:
        {
            // Simple fonts - the encoding table gives us the characters
            Q_ASSERT(dynamic_cast<PDFSimpleFont*>(m_parentFont.get()));
            const PDFSimpleFont* simpleFont = static_cast<PDFSimpleFont*>(m_parentFont.get());

            const encoding::EncodingTable* encodingTable = simpleFont->getEncoding();
            const GlyphIndices* glyphTable = simpleFont->getGlyphIndices();

            for (size_t code = 0; code < encodingTable->size(); ++code)
            {
                const QChar unicodeCharacter = (*encodingTable)[code];
                GID gid = (*glyphTable)[static_cast<uint8_t>(code)];

                // No glyph in the font's own table - ask FreeType, but only
                // if the active charmap is a unicode one.
                if (!gid && m_face->charmap && m_face->charmap->encoding == FT_ENCODING_UNICODE)
                {
                    gid = FT_Get_Char_Index(m_face, unicodeCharacter.unicode());
                }

                if (!gid)
                {
                    // Still no glyph, character is not reported
                    continue;
                }

                appendInfo(gid, unicodeCharacter);
            }

            break;
        }

        case FontType::Type0:
        {
            Q_ASSERT(dynamic_cast<PDFType0Font*>(m_parentFont.get()));
            const PDFType0Font* compositeFont = static_cast<PDFType0Font*>(m_parentFont.get());

            const PDFFontCMap* toUnicodeMap = compositeFont->getToUnicode();
            const PDFCIDtoGIDMapper* mapper = compositeFont->getCIDtoGIDMapper();

            // First attempt - walk through the character map of the FreeType face.
            // Glyph index zero marks the end of the enumeration.
            FT_UInt glyph = 0;
            for (FT_ULong charCode = FT_Get_First_Char(m_face, &glyph);
                 glyph != 0;
                 charCode = FT_Get_Next_Char(m_face, charCode, &glyph))
            {
                const GID gid = glyph;
                const CID cid = mapper->unmap(gid);
                appendInfo(gid, toUnicodeMap->getToUnicode(cid));
            }

            if (result.empty())
            {
                // Character map gave us nothing - probe every reasonable CID
                // and keep those, whose glyph can actually be loaded.
                for (CID cid = 0; cid < QChar::LastValidCodePoint; ++cid)
                {
                    const GID gid = mapper->map(cid);
                    if (gid && FT_Load_Glyph(m_face, gid, FT_LOAD_NO_BITMAP | FT_LOAD_NO_HINTING) == 0)
                    {
                        appendInfo(gid, toUnicodeMap->getToUnicode(cid));
                    }
                }
            }

            break;
        }

        default:
        {
            // Font type is not handled by this implementation
            Q_ASSERT(false);
            break;
        }
    }

    return result;
}
void PDFRealizedFontImpl::dumpFontToTreeItem(QTreeWidgetItem* item) const
{
QTreeWidgetItem* root = new QTreeWidgetItem(item, { PDFTranslationContext::tr("Details") });
@ -786,6 +892,11 @@ QString PDFRealizedFont::getPostScriptName() const
return m_impl->getPostScriptName();
}
/// Returns character infos of this realized font. The call is forwarded
/// to the private implementation object (m_impl), which does the actual work.
CharacterInfos PDFRealizedFont::getCharacterInfos() const
{
return m_impl->getCharacterInfos();
}
PDFRealizedFontPointer PDFRealizedFont::createRealizedFont(PDFFontPointer font, PDFReal pixelSize, PDFRenderErrorReporter* reporter)
{
PDFRealizedFontPointer result;
@ -2261,4 +2372,22 @@ bool PDFRealizedType3FontImpl::isHorizontalWritingSystem() const
return true;
}
/// Collects character infos of a Type 3 font. One entry is produced for each
/// content stream of the parent font; the content stream key is used as
/// the glyph index and the character is obtained from the parent font's
/// getUnicode function for the same key.
CharacterInfos PDFRealizedType3FontImpl::getCharacterInfos() const
{
    Q_ASSERT(dynamic_cast<const PDFType3Font*>(m_parentFont.get()));
    const PDFType3Font* type3Font = static_cast<const PDFType3Font*>(m_parentFont.get());

    CharacterInfos characterInfos;

    const auto& contentStreams = type3Font->getContentStreams();
    for (auto it = contentStreams.cbegin(); it != contentStreams.cend(); ++it)
    {
        CharacterInfo entry;
        entry.gid = it->first;
        entry.character = type3Font->getUnicode(it->first);
        characterInfos.emplace_back(qMove(entry));
    }

    return characterInfos;
}
} // namespace pdf

View File

@ -219,6 +219,13 @@ class IRealizedFontImpl;
using PDFRealizedFontPointer = QSharedPointer<PDFRealizedFont>;
/// Single entry of a font's character map - pairs a glyph index
/// with the character assigned to that glyph.
struct CharacterInfo
{
GID gid = 0;        ///< Glyph index, defaults to zero
QChar character;    ///< Character for the glyph; default constructed (null) if not set
};
/// Collection of character map entries, as returned by getCharacterInfos()
using CharacterInfos = std::vector<CharacterInfo>;
/// Font, which has fixed pixel size. It is programmed as PIMPL, because we need
/// to remove FreeType types from the interface (so we do not include FreeType in the interface).
class PDFFORQTLIBSHARED_EXPORT PDFRealizedFont
@ -242,6 +249,9 @@ public:
/// Returns postscript name of the font
QString getPostScriptName() const;
/// Returns character info
CharacterInfos getCharacterInfos() const;
/// Creates new realized font from the standard font. If font can't be created,
/// then exception is thrown.
static PDFRealizedFontPointer createRealizedFont(PDFFontPointer font, PDFReal pixelSize, PDFRenderErrorReporter* reporter);
@ -441,6 +451,31 @@ public:
return 0;
}
/// Maps GID to CID (inverse mapping). If the mapping table is empty, identity
/// mapping is assumed and \p gid is returned unchanged. Otherwise CIDs are
/// scanned from zero upwards and the first one, which \p map translates
/// to \p gid, is returned. If no such CID is found, zero is returned.
/// \param gid Glyph index
CID unmap(GID gid) const
{
    if (m_mapping.isEmpty())
    {
        // Empty table means identity mapping
        return gid;
    }

    // Number of scanned CIDs is half of the mapping table's byte size
    const CID cidCount = CID(m_mapping.size() / 2);

    CID cid = 0;
    while (cid < cidCount)
    {
        if (map(cid) == gid)
        {
            return cid;
        }

        ++cid;
    }

    // Nothing maps to the given glyph index. This should happen only for
    // bad (damaged) PDF files, where the encoding is missing. Zero is
    // returned as the invalid character index.
    return 0;
}
private:
QByteArray m_mapping;
};
@ -542,6 +577,7 @@ public:
const QMatrix& getFontMatrix() const { return m_fontMatrix; }
const PDFObject& getResources() const { return m_resources; }
const std::map<int, QByteArray>& getContentStreams() const { return m_characterContentStreams; }
/// Returns unicode character for given character index. If unicode mapping is not
/// present, empty (null) character is returned.

View File

@ -236,6 +236,11 @@ void PDFToolAbstractApplication::initializeCommandLineParser(QCommandLineParser*
parser->addOption(QCommandLineOption("say-struct-exp-form", "Say expanded form extracted from structure tree (only for tagged pdf)."));
parser->addOption(QCommandLineOption("say-struct-act-text", "Say actual text extracted from structure tree (only for tagged pdf)."));
}
if (optionFlags.testFlag(CharacterMaps))
{
parser->addOption(QCommandLineOption("character-maps", "Show character maps for embedded fonts."));
}
}
PDFToolOptions PDFToolAbstractApplication::getOptions(QCommandLineParser* parser) const
@ -404,6 +409,11 @@ PDFToolOptions PDFToolAbstractApplication::getOptions(QCommandLineParser* parser
options.textSpeechSayStructActualText = parser->isSet("say-struct-act-text");
}
if (optionFlags.testFlag(CharacterMaps))
{
options.showCharacterMapsForEmbeddedFonts = parser->isSet("character-maps");
}
return options;
}

View File

@ -108,6 +108,9 @@ struct PDFToolOptions
bool textSpeechSayStructActualText = false;
QString textSpeechAudioFormat = "mp3";
// For option 'CharacterMaps'
bool showCharacterMapsForEmbeddedFonts = false;
/// Returns page range. If page range is invalid, then \p errorMessage is empty.
/// \param pageCount Page count
/// \param[out] errorMessage Error message
@ -157,6 +160,7 @@ public:
TextShow = 0x0200, ///< Text extract and show options
VoiceSelector = 0x0400, ///< Select voice from SAPI
TextSpeech = 0x0800, ///< Text speech options
CharacterMaps = 0x1000, ///< Character maps for embedded fonts
};
Q_DECLARE_FLAGS(Options, Option)

View File

@ -50,6 +50,7 @@ QString PDFToolInfoFonts::getStandardString(StandardString standardString) const
struct FontInfo
{
pdf::PDFClosedIntervalSet pages;
QString fontFullName;
QString fontName;
QString fontTypeName;
QString encoding;
@ -58,6 +59,7 @@ struct FontInfo
bool isToUnicodePresent = false;
pdf::PDFObjectReference reference;
QString substitutedFont;
pdf::CharacterInfos characterInfos;
};
int PDFToolInfoFonts::execute(const PDFToolOptions& options)
@ -127,6 +129,7 @@ int PDFToolInfoFonts::execute(const PDFToolOptions& options)
const pdf::FontType fontType = font->getFontType();
const pdf::FontDescriptor* fontDescriptor = font->getFontDescriptor();
QString fontName = fontDescriptor->fontName;
QString fontFullName = fontName;
int plusPos = fontName.lastIndexOf('+');
// Jakub Melka: Detect, if font is subset. Font subsets have special form,
@ -190,6 +193,7 @@ int PDFToolInfoFonts::execute(const PDFToolOptions& options)
FontInfo info;
info.fontName = fontName;
info.fontFullName = fontFullName;
info.pages.addValue(pageIndex + 1);
info.fontTypeName = fontTypeName;
info.isEmbedded = fontDescriptor->isEmbedded() || fontType == pdf::FontType::Type3;
@ -198,6 +202,11 @@ int PDFToolInfoFonts::execute(const PDFToolOptions& options)
info.reference = fontReference;
info.substitutedFont = realizedFont->getPostScriptName();
if (options.showCharacterMapsForEmbeddedFonts && info.isEmbedded)
{
info.characterInfos = realizedFont->getCharacterInfos();
}
const pdf::PDFSimpleFont* simpleFont = dynamic_cast<const pdf::PDFSimpleFont*>(font.data());
if (simpleFont)
{
@ -292,6 +301,7 @@ int PDFToolInfoFonts::execute(const PDFToolOptions& options)
QString noText = PDFToolTranslationContext::tr("No");
QString noRef = PDFToolTranslationContext::tr("--");
bool hasEmbedded = false;
bool hasSubstitutions = false;
int ref = 1;
for (const FontInfo& info : directFonts)
@ -319,6 +329,7 @@ int PDFToolInfoFonts::execute(const PDFToolOptions& options)
}
hasSubstitutions = hasSubstitutions || !info.isEmbedded;
hasEmbedded = hasEmbedded || info.isEmbedded;
formatter.endTableRow();
++ref;
@ -326,10 +337,9 @@ int PDFToolInfoFonts::execute(const PDFToolOptions& options)
formatter.endTable();
formatter.endl();
if (hasSubstitutions)
{
formatter.endl();
formatter.beginTable("fonts-substitutions", PDFToolTranslationContext::tr("Substitutions"));
formatter.beginTableHeaderRow("header");
@ -374,6 +384,53 @@ int PDFToolInfoFonts::execute(const PDFToolOptions& options)
formatter.endTable();
}
// Optional output section - one character map table per embedded font. It is
// written only if the user asked for it and at least one embedded font exists.
if (options.showCharacterMapsForEmbeddedFonts && hasEmbedded)
{
    formatter.endl();
    formatter.beginHeader("font-character-maps", PDFToolTranslationContext::tr("Font Character Maps"));

    for (const FontInfo& info : directFonts)
    {
        if (!info.isEmbedded)
        {
            // Only embedded fonts are listed in this section
            continue;
        }

        formatter.beginTable("font-character-map", PDFToolTranslationContext::tr("Character Map for Font '%1'").arg(info.fontFullName));

        formatter.beginTableHeaderRow("header");
        formatter.writeTableHeaderColumn("no", PDFToolTranslationContext::tr("No."), Qt::AlignLeft);
        formatter.writeTableHeaderColumn("glyph-index", PDFToolTranslationContext::tr("Glyph Index"), Qt::AlignLeft);
        formatter.writeTableHeaderColumn("character", PDFToolTranslationContext::tr("Character"), Qt::AlignLeft);
        formatter.writeTableHeaderColumn("unicode", PDFToolTranslationContext::tr("Unicode"), Qt::AlignLeft);
        formatter.endTableHeaderRow();

        // Running row number within the current font's table (starts at one)
        int characterIndex = 1;
        for (const pdf::CharacterInfo& characterInfo : info.characterInfos)
        {
            formatter.beginTableRow("character", characterInfo.gid);

            // Null character means that no unicode value is known - show placeholder
            QString character = characterInfo.character.isNull() ? "??" : QString(1, characterInfo.character);

            // Code point as upper case hexadecimal number, zero padded to at least 4 digits
            QString unicode = QString("0x%1").arg(QString::number(characterInfo.character.unicode(), 16).toUpper().rightJustified(4, QChar('0')));

            formatter.writeTableColumn("no", locale.toString(characterIndex++), Qt::AlignRight);
            formatter.writeTableColumn("glyph-index", locale.toString(characterInfo.gid), Qt::AlignRight);
            formatter.writeTableColumn("character", character);
            formatter.writeTableColumn("unicode", unicode);
            formatter.endTableRow();
        }

        formatter.endTable();
        formatter.endl();
    }

    // Close the element opened by beginHeader above. All per-font tables are
    // already closed inside the loop, so endTable() does not pair with anything here.
    formatter.endHeader();
}
formatter.endDocument();
PDFConsole::writeText(formatter.getString(), options.outputCodec);
@ -382,7 +439,7 @@ int PDFToolInfoFonts::execute(const PDFToolOptions& options)
/// Returns option groups used by the font info tool. The CharacterMaps group
/// must be present here, otherwise the 'character-maps' command line option
/// is neither registered nor read for this tool.
PDFToolAbstractApplication::Options PDFToolInfoFonts::getOptionsFlags() const
{
    // Single return statement - an earlier return without CharacterMaps would
    // make this one unreachable and silently disable the character map output.
    return ConsoleFormat | OpenDocument | PageSelector | CharacterMaps;
}
} // namespace pdftool