To Unicode mapping

2025-06-05 21:59:17 +02:00 · 2019-05-03 18:06:00 +02:00
parent 8667cbbf90
commit 3ad7485dbf
3 changed files with 99 additions and 9 deletions
--- a/PdfForQtLib/sources/pdffont.cpp
+++ b/PdfForQtLib/sources/pdffont.cpp
@@ -461,6 +461,7 @@ void PDFRealizedFontImpl::fillTextSequence(const QByteArray& byteArray, TextSequ
            const PDFType0Font* font = static_cast<PDFType0Font*>(m_parentFont.get());

            const PDFFontCMap* cmap = font->getCMap();
+            const PDFFontCMap* toUnicode = font->getToUnicode();
            const PDFCIDtoGIDMapper* CIDtoGIDmapper = font->getCIDtoGIDMapper();

            std::vector<CID> cids = cmap->interpret(byteArray);
@@ -472,9 +473,9 @@ void PDFRealizedFontImpl::fillTextSequence(const QByteArray& byteArray, TextSequ

                if (glyphIndex)
                {
-                    // TODO: Dodelat mapovani na unicode
+                    QChar character = toUnicode->getToUnicode(cid);
                    const Glyph& glyph = getGlyph(glyphIndex);
-                    textSequence.items.emplace_back(&glyph.glyph, QChar(), glyph.advance);
+                    textSequence.items.emplace_back(&glyph.glyph, character, glyph.advance);
                }
                else
                {
@@ -717,7 +718,7 @@ PDFFontPointer PDFFont::createFont(const PDFObject& object, const PDFDocument* d
    const PDFDictionary* fontDictionary = dereferencedFontDictionary.getDictionary();
    PDFDocumentDataLoaderDecorator fontLoader(document);

-    // TODO: Fonts - implement all types of the font
+    // TODO: Fonts - Implement Type 3 font
    // First, determine the font subtype
    constexpr const std::array<std::pair<const char*, FontType>, 3> fontTypes = {
        std::pair<const char*, FontType>{ "Type0", FontType::Type0 },
@@ -1111,7 +1112,20 @@ PDFFontPointer PDFFont::createFont(const PDFObject& object, const PDFDocument* d
                 }
            }

-            return PDFFontPointer(new PDFType0Font(qMove(fontDescriptor), qMove(cmap), qMove(cidToGidMapper), defaultWidth, qMove(advances)));
+            PDFFontCMap toUnicodeCMap;
+            const PDFObject& toUnicode = document->getObject(fontDictionary->get("ToUnicode"));
+            if (toUnicode.isName())
+            {
+                toUnicodeCMap = PDFFontCMap::createFromName(toUnicode.getString());
+            }
+            else if (toUnicode.isStream())
+            {
+                const PDFStream* stream = toUnicode.getStream();
+                QByteArray decodedStream = document->getDecodedStream(stream);
+                toUnicodeCMap = PDFFontCMap::createFromData(decodedStream);
+            }
+
+            return PDFFontPointer(new PDFType0Font(qMove(fontDescriptor), qMove(cmap), qMove(toUnicodeCMap), qMove(cidToGidMapper), defaultWidth, qMove(advances)));
        }

        default:
@@ -1136,9 +1150,6 @@ PDFFontPointer PDFFont::createFont(const PDFObject& object, const PDFDocument* d
        }
    }

-    // Read To Unicode
-    // TODO: Read To Unicode
-
    return PDFFontPointer();
 }

@@ -1360,6 +1371,25 @@ PDFFontCMap PDFFontCMap::createFromData(const QByteArray& data)
            return 0;
        };

+        // Reads the target value of a ToUnicode mapping from a string token.
+        // Only a two-byte string is decoded; its bytes are combined most significant
+        // byte first into one 16-bit value. Any other token yields 0.
+        auto fetchUnicode = [&parser](const PDFLexicalAnalyzer::Token& currentToken) -> CID
+        {
+            if (currentToken.type == PDFLexicalAnalyzer::TokenType::String)
+            {
+                const QByteArray byteArray = currentToken.data.toByteArray();

+                if (byteArray.size() == 2)
+                {
+                    CID unicodeValue = 0;
+                    for (int i = 0; i < byteArray.size(); ++i)
+                    {
+                        unicodeValue = (unicodeValue << 8) + static_cast<unsigned char>(byteArray[i]);
+                    }

+                    // Return the decoded value; previously it was computed and then
+                    // discarded, so the lambda always fell through to "return 0".
+                    return unicodeValue;
+                }
+            }

+            return 0;
+        };
+
        if (token.type == PDFLexicalAnalyzer::TokenType::Command)
        {
            QByteArray command = token.data.toByteArray();
@@ -1374,6 +1404,25 @@ PDFFontCMap PDFFontCMap::createFromData(const QByteArray& data)
                    throw PDFParserException(PDFTranslationContext::tr("Can't use cmap inside cmap file."));
                }
            }
+            else if (command == "beginbfrange")
+            {
+                // Read <from> <to> <unicode> triplets until "endbfrange" is reached.
+                // The loop matches the structure of the "begincidrange" and "beginbfchar"
+                // sections: every triplet of the section is consumed, and "break" leaves
+                // only this section instead of the enclosing construct.
+                // NOTE(review): only a single token is passed to fetchUnicode as the third
+                // operand; confirm how an array-valued third operand should be handled.
+                while (true)
+                {
+                    PDFLexicalAnalyzer::Token token1 = parser.fetch();

+                    if (token1.type == PDFLexicalAnalyzer::TokenType::Command &&
+                        token1.data.toByteArray() == "endbfrange")
+                    {
+                        break;
+                    }

+                    PDFLexicalAnalyzer::Token token2 = parser.fetch();
+                    PDFLexicalAnalyzer::Token token3 = parser.fetch();

+                    std::pair<unsigned int, unsigned int> from = fetchCode(token1);
+                    std::pair<unsigned int, unsigned int> to = fetchCode(token2);
+                    CID cid = fetchUnicode(token3);

+                    // The entry spans [from, to]; the larger of the two byte counts is stored.
+                    entries.emplace_back(from.first, to.first, qMax(from.second, to.second), cid);
+                }
+            }
            else if (command == "begincidrange")
            {
                while (true)
@@ -1413,6 +1462,26 @@ PDFFontCMap PDFFontCMap::createFromData(const QByteArray& data)
                    std::pair<unsigned int, unsigned int> code = fetchCode(token1);
                    CID cid = fetchCID(token2);

+                    entries.emplace_back(code.first, code.first, code.second, cid);
+                }
+            }
+            else if (command == "beginbfchar")
+            {
+                while (true)
+                {
+                    PDFLexicalAnalyzer::Token token1 = parser.fetch();
+
+                    if (token1.type == PDFLexicalAnalyzer::TokenType::Command &&
+                        token1.data.toByteArray() == "endbfchar")
+                    {
+                        break;
+                    }
+
+                    PDFLexicalAnalyzer::Token token2 = parser.fetch();
+
+                    std::pair<unsigned int, unsigned int> code = fetchCode(token1);
+                    CID cid = fetchUnicode(token2);
+
                    entries.emplace_back(code.first, code.first, code.second, cid);
                }
            }
@@ -1516,6 +1585,22 @@ std::vector<CID> PDFFontCMap::interpret(const QByteArray& byteArray) const
    return result;
 }

+/// Looks up \p cid in the entries of this CMap and returns the mapped value as a QChar.
+/// Returns a default-constructed QChar when the CMap is not valid or when no entry
+/// range contains \p cid.
+QChar PDFFontCMap::getToUnicode(CID cid) const
+{
+    if (!isValid())
+    {
+        return QChar();
+    }

+    // First entry whose inclusive [from, to] range contains the requested CID
+    const auto containsCid = [cid](const Entry& entry) { return entry.from <= cid && entry.to >= cid; };
+    const auto match = std::find_if(m_entries.cbegin(), m_entries.cend(), containsCid);
+    if (match == m_entries.cend())
+    {
+        return QChar();
+    }

+    // Apply the CID's offset within the matched range to the range's base value
+    const CID mappedValue = cid - match->from + match->cid;
+    return QChar(mappedValue);
+}
+
 PDFFontCMap::PDFFontCMap(Entries&& entries, bool vertical) :
    m_entries(qMove(entries)),
    m_maxKeyLength(0),
--- a/PdfForQtLib/sources/pdffont.h
+++ b/PdfForQtLib/sources/pdffont.h
@@ -419,6 +419,9 @@ public:
    /// Converts byte array to array of CIDs
    std::vector<CID> interpret(const QByteArray& byteArray) const;

+    /// Converts a CID to a QChar. Use only on ToUnicode CMaps.
+    QChar getToUnicode(CID cid) const;
+
 private:

    struct Entry
@@ -468,9 +471,10 @@ private:
 class PDFType0Font : public PDFFont
 {
 public:
-    explicit inline PDFType0Font(FontDescriptor fontDescriptor, PDFFontCMap cmap, PDFCIDtoGIDMapper mapper, PDFReal defaultAdvance, std::unordered_map<CID, PDFReal> advances) :
+    explicit inline PDFType0Font(FontDescriptor fontDescriptor, PDFFontCMap cmap, PDFFontCMap toUnicode, PDFCIDtoGIDMapper mapper, PDFReal defaultAdvance, std::unordered_map<CID, PDFReal> advances) :
        PDFFont(qMove(fontDescriptor)),
        m_cmap(qMove(cmap)),
+        m_toUnicode(qMove(toUnicode)),
        m_mapper(qMove(mapper)),
        m_defaultAdvance(defaultAdvance),
        m_advances(qMove(advances))
@@ -483,6 +487,7 @@ public:
    virtual FontType getFontType() const override { return FontType::Type0; }

    const PDFFontCMap* getCMap() const { return &m_cmap; }
+    const PDFFontCMap* getToUnicode() const { return &m_toUnicode; }
    const PDFCIDtoGIDMapper* getCIDtoGIDMapper() const { return &m_mapper; }

    /// Returns the glyph advance, if it can be obtained, or zero, if it cannot
@@ -492,6 +497,7 @@ public:

 private:
    PDFFontCMap m_cmap;
+    PDFFontCMap m_toUnicode;
    PDFCIDtoGIDMapper m_mapper;
    PDFReal m_defaultAdvance;
    std::unordered_map<CID, PDFReal> m_advances;
--- a/PdfForQtLib/sources/pdfpagecontentprocessor.cpp
+++ b/PdfForQtLib/sources/pdfpagecontentprocessor.cpp
@@ -1753,7 +1753,6 @@ void PDFPageContentProcessor::operatorTextSetSpacingAndShowText(PDFReal t_w, PDF

 void PDFPageContentProcessor::drawText(const TextSequence& textSequence)
 {
-    // TODO: Kdyz nejsme v text rezimu, tak nekreslime text
    if (textSequence.items.empty())
    {
        // Do not display empty text