Editor plugin: Encoding content - text encoding

This commit is contained in:
Jakub Melka 2024-05-08 17:27:27 +02:00
parent eeadf328b2
commit 70b7c2464f
4 changed files with 220 additions and 0 deletions

View File

@ -1290,6 +1290,47 @@ QByteArray PDFFont::getFontId() const
return m_fontId;
}
/// Encodes unicode text into the byte representation used by this font.
///
/// Each character is first translated to a CID via the ToUnicode map and
/// the CID is then translated to bytes via the font's CMap. For every
/// character one symbol is appended to PDFEncodedText::errorString: "_"
/// if the character was encoded, the character itself if it was not.
///
/// \param text Text to be encoded
/// \returns Encoded bytes, validity flag and error string. The result is
///          invalid if the font lacks a CMap or ToUnicode map, or if at
///          least one character could not be encoded.
PDFEncodedText PDFFont::encodeText(const QString& text) const
{
    PDFEncodedText result;

    const PDFFontCMap* cmap = getCMap();
    const PDFFontCMap* toUnicode = getToUnicode();

    if (!cmap || !toUnicode)
    {
        // Nothing can be encoded without both maps. The result must be
        // flagged as invalid, otherwise callers checking isValid would
        // silently drop the text without reporting any error.
        result.isValid = false;
        result.errorString = PDFTranslationContext::tr("Invalid font encoding.");
        return result;
    }

    result.isValid = true;

    for (const QChar& character : text)
    {
        // CID() is what getFromUnicode returns for an unmapped character
        const CID cid = toUnicode->getFromUnicode(character);
        const QByteArray encoded = (cid != CID()) ? cmap->encode(cid) : QByteArray();

        if (!encoded.isEmpty())
        {
            result.encodedText.append(encoded);
            result.errorString += "_";
        }
        else
        {
            result.isValid = false;
            result.errorString += character;
        }
    }

    return result;
}
PDFFontPointer PDFFont::createFont(const PDFObject& object, QByteArray fontId, const PDFDocument* document)
{
const PDFObject& dereferencedFontDictionary = document->getObject(object);
@ -1929,6 +1970,44 @@ PDFInteger PDFSimpleFont::getGlyphAdvance(size_t index) const
return 0;
}
/// Encodes text for a simple font: every character is looked up in the
/// font's encoding table and, if present, emitted as the single byte equal
/// to its table index. Characters missing from the table make the result
/// invalid and are copied into the error string; encoded characters are
/// represented by "_" in the error string.
PDFEncodedText PDFSimpleFont::encodeText(const QString& text) const
{
    const encoding::EncodingTable* table = getEncoding();

    PDFEncodedText encoded;
    encoded.isValid = true;

    for (const QChar& ch : text)
    {
        const ushort codePoint = ch.unicode();

        // Linear scan for the first table slot holding this code point
        size_t slot = 0;
        while (slot < table->size() && !(codePoint == (*table)[static_cast<unsigned char>(slot)]))
        {
            ++slot;
        }

        if (slot < table->size())
        {
            const unsigned char byteValue = static_cast<unsigned char>(slot);
            encoded.encodedText.append(static_cast<char>(byteValue));
            encoded.errorString += "_";
            continue;
        }

        // Character is not representable in this encoding
        encoded.isValid = false;
        encoded.errorString += ch;
    }

    return encoded;
}
void PDFSimpleFont::dumpFontToTreeItem(ITreeFactory* treeFactory) const
{
BaseClass::dumpFontToTreeItem(treeFactory);
@ -2496,6 +2575,35 @@ std::vector<CID> PDFFontCMap::interpret(const QByteArray& byteArray) const
return result;
}
/// Encodes a CID into the character code bytes of this CMap.
///
/// An entry maps the code range [from, to] onto consecutive CIDs starting
/// at entry.cid, so it covers the CIDs [entry.cid, entry.cid + (to - from)].
/// This is the same range convention getFromUnicode() uses. The first entry
/// covering the CID is used; the code is written big-endian using the
/// entry's byte count.
///
/// \param cid CID to be encoded
/// \returns Encoded bytes, or empty byte array if no entry covers the CID
QByteArray PDFFontCMap::encode(CID cid) const
{
    QByteArray byteArray;

    for (const auto& entry : m_entries)
    {
        // Skip malformed ranges and CIDs lying below the start of the entry
        if (entry.to < entry.from || cid < entry.cid)
        {
            continue;
        }

        // Offset of the CID within the entry; compared against the range
        // length instead of an absolute upper bound to avoid overflow.
        const unsigned int offset = cid - entry.cid;
        if (offset > entry.to - entry.from)
        {
            continue;
        }

        // Calculate the original code value from the CID
        const unsigned int value = entry.from + offset;

        // Construct byte array for this value based on the entry's byteCount
        byteArray.reserve(entry.byteCount);
        for (int i = entry.byteCount - 1; i >= 0; --i)
        {
            byteArray.append(static_cast<char>((value >> (8 * i)) & 0xFF));
        }
        break;
    }

    return byteArray;
}
QChar PDFFontCMap::getToUnicode(CID cid) const
{
if (isValid())
@ -2512,6 +2620,29 @@ QChar PDFFontCMap::getToUnicode(CID cid) const
return QChar();
}
/// Reverse lookup in a ToUnicode CMap: finds the first entry whose unicode
/// range contains the character and returns the corresponding code.
/// Returns default-constructed CID for a null character or when no entry
/// contains the character.
CID PDFFontCMap::getFromUnicode(QChar character) const
{
    if (character.isNull())
    {
        return CID();
    }

    const CID wanted = character.unicode();

    for (const Entry& range : m_entries)
    {
        // Unicode values covered by this entry
        const CID first = range.cid;
        const CID last = (range.to - range.from) + range.cid;

        if (wanted < first || wanted > last)
        {
            continue;
        }

        return wanted + range.from - range.cid;
    }

    return CID();
}
PDFFontCMap::PDFFontCMap(Entries&& entries, bool vertical) :
m_entries(qMove(entries)),
m_maxKeyLength(0),

View File

@ -291,6 +291,13 @@ private:
IRealizedFontImpl* m_impl;
};
/// Result of encoding a unicode text into a font's encoding,
/// as returned by PDFFont::encodeText.
struct PDFEncodedText
{
/// Bytes of all characters which were successfully encoded
QByteArray encodedText;
/// Diagnostic string. The encoders append "_" for each encoded character
/// and the character itself for each character which could not be encoded;
/// it may also hold a plain error message (invalid font encoding).
QString errorString;
/// False if at least one character could not be encoded
bool isValid = false;
};
/// Base class representing font in the PDF file
class PDF4QTLIBCORESHARED_EXPORT PDFFont
{
@ -335,6 +342,9 @@ public:
/// Returns font id from the font dictionary
QByteArray getFontId() const;
/// Encodes text into font encoding. The base implementation uses the
/// font's ToUnicode map and CMap; subclasses may override it.
/// \param text Text to be encoded
/// \returns Encoded bytes together with validity flag and error string
virtual PDFEncodedText encodeText(const QString& text) const;
protected:
CIDSystemInfo m_CIDSystemInfo;
FontDescriptor m_fontDescriptor;
@ -368,6 +378,8 @@ public:
/// Returns the glyph advance (or zero, if glyph advance is invalid)
PDFInteger getGlyphAdvance(size_t index) const;
/// Encodes text using the font's encoding table, one byte per character
virtual PDFEncodedText encodeText(const QString& text) const override;
virtual void dumpFontToTreeItem(ITreeFactory* treeFactory) const override;
protected:
@ -556,9 +568,15 @@ public:
/// Converts byte array to array of CIDs
std::vector<CID> interpret(const QByteArray& byteArray) const;
/// Encodes character (given as CID) to byte array. Returns empty
/// byte array, if the CID is not covered by any entry of this CMap.
QByteArray encode(CID cid) const;
/// Converts CID to QChar, use only on ToUnicode CMaps
QChar getToUnicode(CID cid) const;
/// Converts QChar to CID, use only on ToUnicode CMaps. Returns default
/// constructed CID, if character is null or is not found in the CMap.
CID getFromUnicode(QChar character) const;
private:
struct Entry

View File

@ -16,6 +16,7 @@
// along with PDF4QT. If not, see <https://www.gnu.org/licenses/>.
#include "pdfpagecontenteditorprocessor.h"
#include "pdfdocumentbuilder.h"
#include <QStringBuilder>
#include <QXmlStreamReader>
@ -890,6 +891,7 @@ void PDFPageContentEditorContentStreamBuilder::writeText(QTextStream& stream, co
stream << "q BT" << Qt::endl;
QXmlStreamReader reader(text);
m_textFont = m_currentState.getTextFont();
auto isCommand = [&reader](const char* tag) -> bool
{
@ -1009,6 +1011,7 @@ void PDFPageContentEditorContentStreamBuilder::writeText(QTextStream& stream, co
}
else
{
v1 = selectFont(v1);
stream << "/" << v1 << " " << v2 << " Tf" << Qt::endl;
}
}
@ -1087,12 +1090,77 @@ void PDFPageContentEditorContentStreamBuilder::writeText(QTextStream& stream, co
if (reader.isCharacters())
{
QString characters = reader.text().toString();
if (m_textFont)
{
PDFEncodedText encodedText = m_textFont->encodeText(characters);
if (!encodedText.encodedText.isEmpty())
{
stream << "<" << encodedText.encodedText.toHex() << "> Tj" << Qt::endl;
}
if (!encodedText.isValid)
{
addError(PDFTranslationContext::tr("Error during converting text to font encoding. Some characters were not converted: '%1'.").arg(encodedText.errorString));
}
}
else
{
addError(PDFTranslationContext::tr("Text font not defined!"));
}
}
}
stream << "ET Q" << Qt::endl;
}
/// Selects the font used for encoding of subsequent text and stores it
/// in m_textFont.
///
/// The font is looked up in the font dictionary under the given resource
/// name. If it is missing or cannot be created, a default font (Helvetica,
/// WinAnsiEncoding) is registered in the font dictionary under the key
/// "PDF4QT_DefFnt" and used instead.
///
/// \param font Resource name of the requested font
/// \returns Resource name of the font which was actually selected, i.e.
///          the name which has to be written as operand of the Tf operator
QByteArray PDFPageContentEditorContentStreamBuilder::selectFont(const QByteArray& font)
{
    m_textFont = nullptr;

    PDFObject fontObject = m_fontDictionary.get(font);
    if (!fontObject.isNull())
    {
        try
        {
            m_textFont = PDFFont::createFont(fontObject, font, m_document);
        }
        catch (const PDFException&)
        {
            addError(PDFTranslationContext::tr("Font '%1' is invalid.").arg(QString::fromLatin1(font)));
        }
    }

    if (m_textFont)
    {
        return font;
    }

    // Requested font is not usable, fall back to the default font
    QByteArray defaultFontKey = "PDF4QT_DefFnt";
    if (!m_fontDictionary.hasKey(defaultFontKey))
    {
        PDFObjectFactory defaultFontFactory;
        defaultFontFactory.beginDictionary();
        defaultFontFactory.beginDictionaryItem("Type");
        defaultFontFactory << WrapName("Font");
        defaultFontFactory.endDictionaryItem();
        defaultFontFactory.beginDictionaryItem("Subtype");
        defaultFontFactory << WrapName("Type1");
        defaultFontFactory.endDictionaryItem();
        defaultFontFactory.beginDictionaryItem("BaseFont");
        defaultFontFactory << WrapName("Helvetica");
        defaultFontFactory.endDictionaryItem();
        defaultFontFactory.beginDictionaryItem("Encoding");
        defaultFontFactory << WrapName("WinAnsiEncoding");
        defaultFontFactory.endDictionaryItem();
        defaultFontFactory.endDictionary();
        m_fontDictionary.setEntry(PDFInplaceOrMemoryString(defaultFontKey), defaultFontFactory.takeObject());
    }

    try
    {
        // Create the font from the default font entry, not from the
        // requested (missing or invalid) font object.
        m_textFont = PDFFont::createFont(m_fontDictionary.get(defaultFontKey), defaultFontKey, m_document);
    }
    catch (const PDFException&)
    {
        addError(PDFTranslationContext::tr("Font '%1' is invalid.").arg(QString::fromLatin1(defaultFontKey)));
    }

    return defaultFontKey;
}
void PDFPageContentEditorContentStreamBuilder::addError(const QString& error)
{

View File

@ -225,12 +225,15 @@ private:
bool isFilling);
void writeText(QTextStream& stream, const QString& text);
/// Selects the text font by its resource name and stores it in m_textFont.
/// The caller writes the returned name as the operand of the Tf operator.
QByteArray selectFont(const QByteArray& font);
void addError(const QString& error);
PDFDocument* m_document = nullptr;
PDFDictionary m_fontDictionary;
PDFDictionary m_xobjectDictionary;
QByteArray m_outputContent;
PDFPageContentProcessorState m_currentState;
/// Font currently used by writeText for encoding of text into font encoding
PDFFontPointer m_textFont;
};
class PDF4QTLIBCORESHARED_EXPORT PDFPageContentEditorProcessor : public PDFPageContentProcessor