Editor plugin: Encoding content - text encoding

This commit is contained in:
Jakub Melka 2024-05-08 17:27:27 +02:00
parent eeadf328b2
commit 70b7c2464f
4 changed files with 220 additions and 0 deletions

View File

@ -1290,6 +1290,47 @@ QByteArray PDFFont::getFontId() const
return m_fontId; return m_fontId;
} }
/// Encodes \p text into the byte representation used by this font. Each
/// character is first mapped to a CID through the ToUnicode CMap and the CID
/// is then converted to bytes through the font's CMap.
/// \param text Text to be encoded
/// \returns Encoded bytes together with a validity flag. The error string
///          mirrors the input text: every converted character is replaced by
///          '_' and every character which could not be converted is kept as is.
///          If the font has no CMap or no ToUnicode CMap, the result is marked
///          invalid and the error string contains an error message instead.
PDFEncodedText PDFFont::encodeText(const QString& text) const
{
    PDFEncodedText result;

    const PDFFontCMap* cmap = getCMap();
    const PDFFontCMap* toUnicode = getToUnicode();

    if (!cmap || !toUnicode)
    {
        // Without both maps nothing can be encoded. Report the failure, so the
        // caller does not treat an empty result as a successful conversion.
        result.isValid = false;
        result.errorString = PDFTranslationContext::tr("Invalid font encoding.");
        return result;
    }

    result.isValid = true;

    for (const QChar& character : text)
    {
        // Default constructed CID means "no mapping found" in getFromUnicode,
        // in that case we do not even try to encode it.
        const CID cid = toUnicode->getFromUnicode(character);
        const QByteArray encoded = (cid != CID()) ? cmap->encode(cid) : QByteArray();

        if (!encoded.isEmpty())
        {
            result.encodedText.append(encoded);
            result.errorString += "_";
        }
        else
        {
            // Either unicode -> CID or CID -> bytes failed
            result.isValid = false;
            result.errorString += character;
        }
    }

    return result;
}
PDFFontPointer PDFFont::createFont(const PDFObject& object, QByteArray fontId, const PDFDocument* document) PDFFontPointer PDFFont::createFont(const PDFObject& object, QByteArray fontId, const PDFDocument* document)
{ {
const PDFObject& dereferencedFontDictionary = document->getObject(object); const PDFObject& dereferencedFontDictionary = document->getObject(object);
@ -1929,6 +1970,44 @@ PDFInteger PDFSimpleFont::getGlyphAdvance(size_t index) const
return 0; return 0;
} }
/// Encodes \p text using the simple font's encoding table. For each character,
/// the table is searched for a slot holding the same unicode value, and the
/// index of the first such slot is emitted as a single byte. In the error string,
/// converted characters are replaced by '_', characters which are not present
/// in the table are kept as is (and the result is marked as invalid).
PDFEncodedText PDFSimpleFont::encodeText(const QString& text) const
{
    const encoding::EncodingTable* table = getEncoding();

    PDFEncodedText encoded;
    encoded.isValid = true;

    for (const QChar& ch : text)
    {
        const ushort codePoint = ch.unicode();

        // Linear search for the first slot, which holds the code point
        size_t slot = 0;
        while (slot < table->size() && !(codePoint == (*table)[static_cast<unsigned char>(slot)]))
        {
            ++slot;
        }

        if (slot >= table->size())
        {
            // Character is not representable in this encoding
            encoded.isValid = false;
            encoded.errorString += ch;
            continue;
        }

        const unsigned char byteValue = static_cast<unsigned char>(slot);
        encoded.encodedText.append(static_cast<char>(byteValue));
        encoded.errorString += "_";
    }

    return encoded;
}
void PDFSimpleFont::dumpFontToTreeItem(ITreeFactory* treeFactory) const void PDFSimpleFont::dumpFontToTreeItem(ITreeFactory* treeFactory) const
{ {
BaseClass::dumpFontToTreeItem(treeFactory); BaseClass::dumpFontToTreeItem(treeFactory);
@ -2496,6 +2575,35 @@ std::vector<CID> PDFFontCMap::interpret(const QByteArray& byteArray) const
return result; return result;
} }
/// Encodes \p cid into the byte sequence of the first entry whose CID range
/// contains it. The bytes are written in big endian order, using as many bytes
/// as the entry's byte count says.
/// \param cid CID to be encoded
/// \returns Encoded bytes, or empty byte array, if no entry covers the CID
QByteArray PDFFontCMap::encode(CID cid) const
{
    QByteArray byteArray;

    for (const auto& entry : m_entries)
    {
        // An entry maps codes [from, to] onto CIDs [entry.cid, entry.cid + (to - from)],
        // which is the same range getFromUnicode() uses. Test the offset instead
        // of summed bounds, so the computed code can never leave [from, to].
        // NOTE(review): assumes interpret() computes cid as entry.cid + (code - entry.from) - confirm.
        if (entry.to < entry.from || cid < entry.cid)
        {
            continue;
        }

        const unsigned int offset = cid - entry.cid;
        if (offset > entry.to - entry.from)
        {
            continue;
        }

        // Calculate the original value from cid
        const unsigned int value = entry.from + offset;

        byteArray.reserve(entry.byteCount);

        // Construct byte array for this value based on the entry's byteCount,
        // most significant byte first
        for (int i = static_cast<int>(entry.byteCount) - 1; i >= 0; --i)
        {
            byteArray.append(static_cast<char>((value >> (8 * i)) & 0xFF));
        }

        break;
    }

    return byteArray;
}
QChar PDFFontCMap::getToUnicode(CID cid) const QChar PDFFontCMap::getToUnicode(CID cid) const
{ {
if (isValid()) if (isValid())
@ -2512,6 +2620,29 @@ QChar PDFFontCMap::getToUnicode(CID cid) const
return QChar(); return QChar();
} }
/// Reverse lookup in the ToUnicode CMap: finds the first entry whose unicode
/// range contains \p character and returns the corresponding code.
/// \param character Character to be looked up
/// \returns Code of the character, or default constructed CID, if the character
///          is null or no entry contains it
CID PDFFontCMap::getFromUnicode(QChar character) const
{
    if (character.isNull())
    {
        return CID();
    }

    // UTF-16 code unit of the character, widened to the CID type
    const CID wanted = character.unicode();

    for (const Entry& range : m_entries)
    {
        // Entry stores the first unicode value in 'cid', the range has
        // the same length as the code range [from, to]
        const CID firstUnicode = range.cid;
        const CID lastUnicode = (range.to - range.from) + range.cid;

        if (wanted < firstUnicode || wanted > lastUnicode)
        {
            continue;
        }

        const CID code = wanted + range.from - range.cid;
        return code;
    }

    return CID();
}
PDFFontCMap::PDFFontCMap(Entries&& entries, bool vertical) : PDFFontCMap::PDFFontCMap(Entries&& entries, bool vertical) :
m_entries(qMove(entries)), m_entries(qMove(entries)),
m_maxKeyLength(0), m_maxKeyLength(0),

View File

@ -291,6 +291,13 @@ private:
IRealizedFontImpl* m_impl; IRealizedFontImpl* m_impl;
}; };
/// Result of encoding a text into a font's encoding (see PDFFont::encodeText)
struct PDFEncodedText
{
    /// Bytes of all characters which were successfully encoded
    QByteArray encodedText;

    /// Diagnostic text filled by the encoder. The encodeText implementations
    /// write '_' for each converted character and the original character for
    /// each one that failed; it can also hold a plain error message.
    QString errorString;

    /// True, if encoding succeeded (false by default, encoder must set it)
    bool isValid = false;
};
/// Base class representing font in the PDF file /// Base class representing font in the PDF file
class PDF4QTLIBCORESHARED_EXPORT PDFFont class PDF4QTLIBCORESHARED_EXPORT PDFFont
{ {
@ -335,6 +342,9 @@ public:
/// Returns font id from the font dictionary /// Returns font id from the font dictionary
QByteArray getFontId() const; QByteArray getFontId() const;
/// Encodes text into font encoding. Returns the encoded bytes together with
/// a validity flag and an error string describing characters, which
/// could not be converted.
/// \param text Text to be encoded
virtual PDFEncodedText encodeText(const QString& text) const;
protected: protected:
CIDSystemInfo m_CIDSystemInfo; CIDSystemInfo m_CIDSystemInfo;
FontDescriptor m_fontDescriptor; FontDescriptor m_fontDescriptor;
@ -368,6 +378,8 @@ public:
/// Returns the glyph advance (or zero, if glyph advance is invalid) /// Returns the glyph advance (or zero, if glyph advance is invalid)
PDFInteger getGlyphAdvance(size_t index) const; PDFInteger getGlyphAdvance(size_t index) const;
/// Encodes text into single-byte codes using the font's encoding table.
/// Characters not present in the table make the result invalid.
/// \param text Text to be encoded
virtual PDFEncodedText encodeText(const QString& text) const override;
virtual void dumpFontToTreeItem(ITreeFactory* treeFactory) const override; virtual void dumpFontToTreeItem(ITreeFactory* treeFactory) const override;
protected: protected:
@ -556,9 +568,15 @@ public:
/// Converts byte array to array of CIDs /// Converts byte array to array of CIDs
std::vector<CID> interpret(const QByteArray& byteArray) const; std::vector<CID> interpret(const QByteArray& byteArray) const;
/// Encodes CID to byte array (most significant byte first). Returns empty
/// byte array, if no entry of this CMap matches the CID.
QByteArray encode(CID cid) const;
/// Converts CID to QChar, use only on ToUnicode CMaps /// Converts CID to QChar, use only on ToUnicode CMaps
QChar getToUnicode(CID cid) const; QChar getToUnicode(CID cid) const;
/// Converts QChar to CID, use only on ToUnicode CMaps. Returns default
/// constructed CID, if character is null or is not found in the CMap.
CID getFromUnicode(QChar character) const;
private: private:
struct Entry struct Entry

View File

@ -16,6 +16,7 @@
// along with PDF4QT. If not, see <https://www.gnu.org/licenses/>. // along with PDF4QT. If not, see <https://www.gnu.org/licenses/>.
#include "pdfpagecontenteditorprocessor.h" #include "pdfpagecontenteditorprocessor.h"
#include "pdfdocumentbuilder.h"
#include <QStringBuilder> #include <QStringBuilder>
#include <QXmlStreamReader> #include <QXmlStreamReader>
@ -890,6 +891,7 @@ void PDFPageContentEditorContentStreamBuilder::writeText(QTextStream& stream, co
stream << "q BT" << Qt::endl; stream << "q BT" << Qt::endl;
QXmlStreamReader reader(text); QXmlStreamReader reader(text);
m_textFont = m_currentState.getTextFont();
auto isCommand = [&reader](const char* tag) -> bool auto isCommand = [&reader](const char* tag) -> bool
{ {
@ -1009,6 +1011,7 @@ void PDFPageContentEditorContentStreamBuilder::writeText(QTextStream& stream, co
} }
else else
{ {
v1 = selectFont(v1);
stream << "/" << v1 << " " << v2 << " Tf" << Qt::endl; stream << "/" << v1 << " " << v2 << " Tf" << Qt::endl;
} }
} }
@ -1087,12 +1090,77 @@ void PDFPageContentEditorContentStreamBuilder::writeText(QTextStream& stream, co
if (reader.isCharacters()) if (reader.isCharacters())
{ {
QString characters = reader.text().toString(); QString characters = reader.text().toString();
if (m_textFont)
{
PDFEncodedText encodedText = m_textFont->encodeText(characters);
if (!encodedText.encodedText.isEmpty())
{
stream << "<" << encodedText.encodedText.toHex() << "> Tj" << Qt::endl;
}
if (!encodedText.isValid)
{
addError(PDFTranslationContext::tr("Error during converting text to font encoding. Some characters were not converted: '%1'.").arg(encodedText.errorString));
}
}
else
{
addError(PDFTranslationContext::tr("Text font not defined!"));
}
} }
} }
stream << "ET Q" << Qt::endl; stream << "ET Q" << Qt::endl;
} }
/// Selects font used for encoding of subsequently written text. The font is
/// looked up in the font dictionary under the key \p font. If it is missing or
/// cannot be created, a default font (Type1 Helvetica with WinAnsiEncoding) is
/// registered in the font dictionary under the key "PDF4QT_DefFnt" and used instead.
/// \param font Key of the requested font in the font dictionary
/// \returns Key of the font which was actually selected; this is the name, which
///          should be written to the content stream in the Tf operator
QByteArray PDFPageContentEditorContentStreamBuilder::selectFont(const QByteArray& font)
{
    m_textFont = nullptr;
    QByteArray selectedFont = font;

    PDFObject fontObject = m_fontDictionary.get(font);
    if (!fontObject.isNull())
    {
        try
        {
            m_textFont = PDFFont::createFont(fontObject, font, m_document);
        }
        catch (const PDFException&)
        {
            addError(PDFTranslationContext::tr("Font '%1' is invalid.").arg(QString::fromLatin1(font)));
        }
    }

    if (!m_textFont)
    {
        const QByteArray defaultFontKey = "PDF4QT_DefFnt";
        if (!m_fontDictionary.hasKey(defaultFontKey))
        {
            PDFObjectFactory defaultFontFactory;
            defaultFontFactory.beginDictionary();
            defaultFontFactory.beginDictionaryItem("Type");
            defaultFontFactory << WrapName("Font");
            defaultFontFactory.endDictionaryItem();
            defaultFontFactory.beginDictionaryItem("Subtype");
            defaultFontFactory << WrapName("Type1");
            defaultFontFactory.endDictionaryItem();
            defaultFontFactory.beginDictionaryItem("BaseFont");
            defaultFontFactory << WrapName("Helvetica");
            defaultFontFactory.endDictionaryItem();
            defaultFontFactory.beginDictionaryItem("Encoding");
            defaultFontFactory << WrapName("WinAnsiEncoding");
            defaultFontFactory.endDictionaryItem();
            defaultFontFactory.endDictionary();
            m_fontDictionary.setEntry(PDFInplaceOrMemoryString(defaultFontKey), defaultFontFactory.takeObject());
        }

        try
        {
            // Create the font from the default entry, not from the requested
            // font object, which is missing or has already failed above.
            m_textFont = PDFFont::createFont(m_fontDictionary.get(defaultFontKey), defaultFontKey, m_document);
            selectedFont = defaultFontKey;
        }
        catch (const PDFException&)
        {
            // Leave m_textFont empty; text writing reports a missing font itself
            addError(PDFTranslationContext::tr("Font '%1' is invalid.").arg(QString::fromLatin1(defaultFontKey)));
        }
    }

    return selectedFont;
}
void PDFPageContentEditorContentStreamBuilder::addError(const QString& error) void PDFPageContentEditorContentStreamBuilder::addError(const QString& error)
{ {

View File

@ -225,12 +225,15 @@ private:
bool isFilling); bool isFilling);
void writeText(QTextStream& stream, const QString& text); void writeText(QTextStream& stream, const QString& text);
QByteArray selectFont(const QByteArray& font);
void addError(const QString& error); void addError(const QString& error);
PDFDocument* m_document = nullptr;
PDFDictionary m_fontDictionary; PDFDictionary m_fontDictionary;
PDFDictionary m_xobjectDictionary; PDFDictionary m_xobjectDictionary;
QByteArray m_outputContent; QByteArray m_outputContent;
PDFPageContentProcessorState m_currentState; PDFPageContentProcessorState m_currentState;
PDFFontPointer m_textFont;
}; };
class PDF4QTLIBCORESHARED_EXPORT PDFPageContentEditorProcessor : public PDFPageContentProcessor class PDF4QTLIBCORESHARED_EXPORT PDFPageContentEditorProcessor : public PDFPageContentProcessor