From f43459b88e4480b6a796bb11bc85731afbdf4df4 Mon Sep 17 00:00:00 2001 From: Jakub Melka Date: Sun, 25 Oct 2020 13:51:57 +0100 Subject: [PATCH] Font info tool - added character maps --- PdfForQtLib/sources/pdffont.cpp | 129 +++++++++++++++++++++++++ PdfForQtLib/sources/pdffont.h | 36 +++++++ PdfTool/pdftoolabstractapplication.cpp | 10 ++ PdfTool/pdftoolabstractapplication.h | 4 + PdfTool/pdftoolinfofonts.cpp | 63 +++++++++++- 5 files changed, 239 insertions(+), 3 deletions(-) diff --git a/PdfForQtLib/sources/pdffont.cpp b/PdfForQtLib/sources/pdffont.cpp index 495df9e..b5aaa9c 100644 --- a/PdfForQtLib/sources/pdffont.cpp +++ b/PdfForQtLib/sources/pdffont.cpp @@ -363,6 +363,9 @@ public: /// Returns postscript name of the font virtual QString getPostScriptName() const { return QString(); } + + /// Returns character info + virtual CharacterInfos getCharacterInfos() const = 0; }; /// Implementation of the PDFRealizedFont class using PIMPL pattern for Type 3 fonts @@ -374,6 +377,7 @@ public: virtual void fillTextSequence(const QByteArray& byteArray, TextSequence& textSequence, PDFRenderErrorReporter* reporter) override; virtual bool isHorizontalWritingSystem() const override; + virtual CharacterInfos getCharacterInfos() const override; private: /// Pixel size of the font @@ -394,6 +398,7 @@ public: virtual bool isHorizontalWritingSystem() const override { return !m_isVertical; } virtual void dumpFontToTreeItem(QTreeWidgetItem* item) const override; virtual QString getPostScriptName() const override { return m_postScriptName; } + virtual CharacterInfos getCharacterInfos() const override; static constexpr const PDFReal PIXEL_SIZE_MULTIPLIER = 100.0; @@ -581,6 +586,107 @@ void PDFRealizedFontImpl::fillTextSequence(const QByteArray& byteArray, TextSequ } } +CharacterInfos PDFRealizedFontImpl::getCharacterInfos() const +{ + CharacterInfos result; + + switch (m_parentFont->getFontType()) + { + case FontType::Type1: + case FontType::TrueType: + case FontType::MMType1: + { + // 
We can use encoding + Q_ASSERT(dynamic_cast(m_parentFont.get())); + const PDFSimpleFont* font = static_cast(m_parentFont.get()); + const encoding::EncodingTable* encoding = font->getEncoding(); + const GlyphIndices* glyphIndices = font->getGlyphIndices(); + + for (size_t i = 0; i < encoding->size(); ++i) + { + QChar character = (*encoding)[i]; + GID glyphIndex = (*glyphIndices)[static_cast(i)]; + + if (!glyphIndex) + { + // Try to obtain glyph index from unicode + if (m_face->charmap && m_face->charmap->encoding == FT_ENCODING_UNICODE) + { + glyphIndex = FT_Get_Char_Index(m_face, character.unicode()); + } + } + + if (glyphIndex) + { + CharacterInfo info; + info.gid = glyphIndex; + info.character = character; + result.emplace_back(qMove(info)); + } + } + + break; + } + + case FontType::Type0: + { + Q_ASSERT(dynamic_cast(m_parentFont.get())); + const PDFType0Font* font = static_cast(m_parentFont.get()); + + const PDFFontCMap* toUnicode = font->getToUnicode(); + const PDFCIDtoGIDMapper* CIDtoGIDmapper = font->getCIDtoGIDMapper(); + + FT_UInt index = 0; + FT_ULong character = FT_Get_First_Char(m_face, &index); + while (index != 0) + { + const GID gid = index; + const CID cid = CIDtoGIDmapper->unmap(gid); + + CharacterInfo info; + info.gid = gid; + info.character = toUnicode->getToUnicode(cid); + result.emplace_back(qMove(info)); + + character = FT_Get_Next_Char(m_face, character, &index); + } + + if (result.empty()) + { + // We will try all reasonable high CIDs + for (CID cid = 0; cid < QChar::LastValidCodePoint; ++cid) + { + const GID gid = CIDtoGIDmapper->map(cid); + + if (!gid) + { + continue; + } + + if (!FT_Load_Glyph(m_face, gid, FT_LOAD_NO_BITMAP | FT_LOAD_NO_HINTING)) + { + CharacterInfo info; + info.gid = gid; + info.character = toUnicode->getToUnicode(cid); + result.emplace_back(qMove(info)); + } + } + } + + break; + } + + default: + { + // Unhandled font type + Q_ASSERT(false); + break; + } + } + + return result; +} + void 
PDFRealizedFontImpl::dumpFontToTreeItem(QTreeWidgetItem* item) const { QTreeWidgetItem* root = new QTreeWidgetItem(item, { PDFTranslationContext::tr("Details") }); @@ -786,6 +892,11 @@ QString PDFRealizedFont::getPostScriptName() const return m_impl->getPostScriptName(); } +CharacterInfos PDFRealizedFont::getCharacterInfos() const +{ + return m_impl->getCharacterInfos(); +} + PDFRealizedFontPointer PDFRealizedFont::createRealizedFont(PDFFontPointer font, PDFReal pixelSize, PDFRenderErrorReporter* reporter) { PDFRealizedFontPointer result; @@ -2261,4 +2372,22 @@ bool PDFRealizedType3FontImpl::isHorizontalWritingSystem() const return true; } +CharacterInfos PDFRealizedType3FontImpl::getCharacterInfos() const +{ + CharacterInfos result; + + Q_ASSERT(dynamic_cast(m_parentFont.get())); + const PDFType3Font* parentFont = static_cast(m_parentFont.get()); + + for (const auto& contentStreamItem : parentFont->getContentStreams()) + { + CharacterInfo info; + info.gid = contentStreamItem.first; + info.character = parentFont->getUnicode(contentStreamItem.first); + result.emplace_back(qMove(info)); + } + + return result; +} + } // namespace pdf diff --git a/PdfForQtLib/sources/pdffont.h b/PdfForQtLib/sources/pdffont.h index fe46ec1..9ef25de 100644 --- a/PdfForQtLib/sources/pdffont.h +++ b/PdfForQtLib/sources/pdffont.h @@ -219,6 +219,13 @@ class IRealizedFontImpl; using PDFRealizedFontPointer = QSharedPointer; +struct CharacterInfo +{ + GID gid = 0; + QChar character; +}; +using CharacterInfos = std::vector; + /// Font, which has fixed pixel size. It is programmed as PIMPL, because we need /// to remove FreeType types from the interface (so we do not include FreeType in the interface). class PDFFORQTLIBSHARED_EXPORT PDFRealizedFont @@ -242,6 +249,9 @@ public: /// Returns postscript name of the font QString getPostScriptName() const; + /// Returns character info + CharacterInfos getCharacterInfos() const; + /// Creates new realized font from the standard font. 
If font can't be created, /// then exception is thrown. static PDFRealizedFontPointer createRealizedFont(PDFFontPointer font, PDFReal pixelSize, PDFRenderErrorReporter* reporter); @@ -441,6 +451,31 @@ public: return 0; } + /// Maps GID to CID (inverse of map()); linear search, first matching CID wins, 0 if none + CID unmap(GID gid) const + { + if (m_mapping.isEmpty()) + { + // Empty table is treated as identity mapping, so GID and CID coincide + return gid; + } + else + { + CID lastCid = CID(m_mapping.size() / 2); // presumably two bytes per table entry - confirm against map() + for (CID i = 0; i < lastCid; ++i) + { + if (map(i) == gid) + { + return i; + } + } + } + + // No CID maps to this GID - presumably only for a bad (damaged) PDF file, where the + // encoding is missing. Returns 0, which callers cannot distinguish from a real CID 0. + return 0; + } + private: QByteArray m_mapping; }; @@ -542,6 +577,7 @@ public: const QMatrix& getFontMatrix() const { return m_fontMatrix; } const PDFObject& getResources() const { return m_resources; } + const std::map& getContentStreams() const { return m_characterContentStreams; } /// Returns unicode character for given character index. If unicode mapping is not /// present, empty (null) character is returned. 
diff --git a/PdfTool/pdftoolabstractapplication.cpp b/PdfTool/pdftoolabstractapplication.cpp index 17b4c96..c5778a7 100644 --- a/PdfTool/pdftoolabstractapplication.cpp +++ b/PdfTool/pdftoolabstractapplication.cpp @@ -236,6 +236,11 @@ void PDFToolAbstractApplication::initializeCommandLineParser(QCommandLineParser* parser->addOption(QCommandLineOption("say-struct-exp-form", "Say expanded form extracted from structure tree (only for tagged pdf).")); parser->addOption(QCommandLineOption("say-struct-act-text", "Say actual text extracted from structure tree (only for tagged pdf).")); } + + if (optionFlags.testFlag(CharacterMaps)) + { + parser->addOption(QCommandLineOption("character-maps", "Show character maps for embedded fonts.")); + } } PDFToolOptions PDFToolAbstractApplication::getOptions(QCommandLineParser* parser) const @@ -404,6 +409,11 @@ PDFToolOptions PDFToolAbstractApplication::getOptions(QCommandLineParser* parser options.textSpeechSayStructActualText = parser->isSet("say-struct-act-text"); } + if (optionFlags.testFlag(CharacterMaps)) + { + options.showCharacterMapsForEmbeddedFonts = parser->isSet("character-maps"); + } + return options; } diff --git a/PdfTool/pdftoolabstractapplication.h b/PdfTool/pdftoolabstractapplication.h index 5a7a0f9..18f242b 100644 --- a/PdfTool/pdftoolabstractapplication.h +++ b/PdfTool/pdftoolabstractapplication.h @@ -108,6 +108,9 @@ struct PDFToolOptions bool textSpeechSayStructActualText = false; QString textSpeechAudioFormat = "mp3"; + // For option 'CharacterMaps' + bool showCharacterMapsForEmbeddedFonts = false; + /// Returns page range. If page range is invalid, then \p errorMessage is empty. 
/// \param pageCount Page count /// \param[out] errorMessage Error message @@ -157,6 +160,7 @@ public: TextShow = 0x0200, ///< Text extract and show options VoiceSelector = 0x0400, ///< Select voice from SAPI TextSpeech = 0x0800, ///< Text speech options + CharacterMaps = 0x1000, ///< Character maps for embedded fonts }; Q_DECLARE_FLAGS(Options, Option) diff --git a/PdfTool/pdftoolinfofonts.cpp b/PdfTool/pdftoolinfofonts.cpp index c0fad15..90586c7 100644 --- a/PdfTool/pdftoolinfofonts.cpp +++ b/PdfTool/pdftoolinfofonts.cpp @@ -50,6 +50,7 @@ QString PDFToolInfoFonts::getStandardString(StandardString standardString) const struct FontInfo { pdf::PDFClosedIntervalSet pages; + QString fontFullName; QString fontName; QString fontTypeName; QString encoding; @@ -58,6 +59,7 @@ struct FontInfo bool isToUnicodePresent = false; pdf::PDFObjectReference reference; QString substitutedFont; + pdf::CharacterInfos characterInfos; }; int PDFToolInfoFonts::execute(const PDFToolOptions& options) @@ -127,6 +129,7 @@ int PDFToolInfoFonts::execute(const PDFToolOptions& options) const pdf::FontType fontType = font->getFontType(); const pdf::FontDescriptor* fontDescriptor = font->getFontDescriptor(); QString fontName = fontDescriptor->fontName; + QString fontFullName = fontName; int plusPos = fontName.lastIndexOf('+'); // Jakub Melka: Detect, if font is subset. 
Font subsets have special form, @@ -190,6 +193,7 @@ int PDFToolInfoFonts::execute(const PDFToolOptions& options) FontInfo info; info.fontName = fontName; + info.fontFullName = fontFullName; info.pages.addValue(pageIndex + 1); info.fontTypeName = fontTypeName; info.isEmbedded = fontDescriptor->isEmbedded() || fontType == pdf::FontType::Type3; @@ -198,6 +202,11 @@ int PDFToolInfoFonts::execute(const PDFToolOptions& options) info.reference = fontReference; info.substitutedFont = realizedFont->getPostScriptName(); + if (options.showCharacterMapsForEmbeddedFonts && info.isEmbedded) + { + info.characterInfos = realizedFont->getCharacterInfos(); + } + const pdf::PDFSimpleFont* simpleFont = dynamic_cast(font.data()); if (simpleFont) { @@ -292,6 +301,7 @@ int PDFToolInfoFonts::execute(const PDFToolOptions& options) QString noText = PDFToolTranslationContext::tr("No"); QString noRef = PDFToolTranslationContext::tr("--"); + bool hasEmbedded = false; bool hasSubstitutions = false; int ref = 1; for (const FontInfo& info : directFonts) @@ -319,6 +329,7 @@ int PDFToolInfoFonts::execute(const PDFToolOptions& options) } hasSubstitutions = hasSubstitutions || !info.isEmbedded; + hasEmbedded = hasEmbedded || info.isEmbedded; formatter.endTableRow(); ++ref; @@ -326,10 +337,9 @@ int PDFToolInfoFonts::execute(const PDFToolOptions& options) formatter.endTable(); - formatter.endl(); - if (hasSubstitutions) { + formatter.endl(); formatter.beginTable("fonts-substitutions", PDFToolTranslationContext::tr("Substitutions")); formatter.beginTableHeaderRow("header"); @@ -374,6 +384,53 @@ int PDFToolInfoFonts::execute(const PDFToolOptions& options) formatter.endTable(); } + if (options.showCharacterMapsForEmbeddedFonts && hasEmbedded) + { + formatter.endl(); + formatter.beginHeader("font-character-maps", PDFToolTranslationContext::tr("Font Character Maps")); + + int fontRef = 1; + for (const FontInfo& info : directFonts) + { + if (!info.isEmbedded) + { + continue; + } + + 
formatter.beginTable("font-character-map", PDFToolTranslationContext::tr("Character Map for Font '%1'").arg(info.fontFullName)); + + formatter.beginTableHeaderRow("header"); + formatter.writeTableHeaderColumn("no", PDFToolTranslationContext::tr("No."), Qt::AlignLeft); + formatter.writeTableHeaderColumn("glyph-index", PDFToolTranslationContext::tr("Glyph Index"), Qt::AlignLeft); + formatter.writeTableHeaderColumn("character", PDFToolTranslationContext::tr("Character"), Qt::AlignLeft); + formatter.writeTableHeaderColumn("unicode", PDFToolTranslationContext::tr("Unicode"), Qt::AlignLeft); + formatter.endTableHeaderRow(); + + int characterIndex = 1; + for (const pdf::CharacterInfo& characterInfo : info.characterInfos) + { + formatter.beginTableRow("character", characterInfo.gid); + + QString character = characterInfo.character.isNull() ? "??" : QString(1, characterInfo.character); + QString unicode = QString("0x%1").arg(QString::number(characterInfo.character.unicode(), 16).toUpper().rightJustified(4, QChar('0'))); + + formatter.writeTableColumn("no", locale.toString(characterIndex++), Qt::AlignRight); + formatter.writeTableColumn("glyph-index", locale.toString(characterInfo.gid), Qt::AlignRight); + formatter.writeTableColumn("character", character); + formatter.writeTableColumn("unicode", unicode); + + formatter.endTableRow(); + } + + formatter.endTable(); + ++fontRef; + + formatter.endl(); + } + + formatter.endHeader(); // closes the "font-character-maps" header opened by beginHeader(), not a table + } + formatter.endDocument(); PDFConsole::writeText(formatter.getString(), options.outputCodec); @@ -382,7 +439,7 @@ int PDFToolInfoFonts::execute(const PDFToolOptions& options) PDFToolAbstractApplication::Options PDFToolInfoFonts::getOptionsFlags() const { - return ConsoleFormat | OpenDocument | PageSelector; + return ConsoleFormat | OpenDocument | PageSelector | CharacterMaps; } } // namespace pdftool