UTF-8 support (PDF 2.0 compliance) and ProcedureSets (for compatibility)

This commit is contained in:
Jakub Melka
2020-08-02 15:29:10 +02:00
parent fa30ed37bb
commit a2c5de0fe4
4 changed files with 89 additions and 4 deletions

View File

@ -2193,6 +2193,10 @@ QString PDFEncoding::convertTextString(const QByteArray& stream)
{ {
return convertFromUnicode(stream); return convertFromUnicode(stream);
} }
else if (hasUTF8LeadMarkings(stream))
{
return QString::fromUtf8(stream);
}
else else
{ {
return convert(stream, Encoding::PDFDoc); return convert(stream, Encoding::PDFDoc);
@ -2348,4 +2352,20 @@ bool PDFEncoding::hasUnicodeLeadMarkings(const QByteArray& stream)
return false; return false;
} }
/// Tests whether \p stream begins with the three-byte UTF-8 marker
/// (0xEF 0xBB 0xBF). Streams shorter than three bytes never match.
bool PDFEncoding::hasUTF8LeadMarkings(const QByteArray& stream)
{
    // Too short to hold the whole marker
    if (stream.size() < 3)
    {
        return false;
    }

    // QByteArray yields (possibly signed) char, compare as unsigned bytes
    const unsigned char firstByte = static_cast<unsigned char>(stream[0]);
    const unsigned char secondByte = static_cast<unsigned char>(stream[1]);
    const unsigned char thirdByte = static_cast<unsigned char>(stream[2]);

    return firstByte == 0xEF && secondByte == 0xBB && thirdByte == 0xBF;
}
} // namespace pdf } // namespace pdf

View File

@ -105,11 +105,18 @@ public:
static const encoding::EncodingTable* getTableForEncoding(Encoding encoding); static const encoding::EncodingTable* getTableForEncoding(Encoding encoding);
private: private:
/// Returns true, if byte array has UTF-16BE unicode marking bytes at the /// Returns true, if byte array has UTF-16BE/LE unicode marking bytes at the
/// stream start. If they are present, then byte stream is probably encoded /// stream start. If they are present, then byte stream is probably encoded
/// as unicode. /// as unicode.
/// \param stream Stream to be tested /// \param stream Stream to be tested
static bool hasUnicodeLeadMarkings(const QByteArray& stream); static bool hasUnicodeLeadMarkings(const QByteArray& stream);
/// Checks whether the byte array starts with the UTF-8 marking bytes,
/// i.e. the three bytes 0xEF 0xBB 0xBF (239, 187, 191). If they are
/// present, then byte stream is probably encoded as UTF-8 string.
/// \note UTF-8 strings were added in PDF 2.0 specification
/// \param stream Stream to be tested
/// \returns True, if the stream has at least three bytes and the first
///          three are equal to the marking bytes, false otherwise
static bool hasUTF8LeadMarkings(const QByteArray& stream);
}; };
} // namespace pdf } // namespace pdf

View File

@ -156,10 +156,10 @@ static constexpr const std::pair<const char*, PDFPageContentProcessor::Operator>
}; };
void PDFPageContentProcessor::initDictionaries(const PDFObject& resourcesObject) void PDFPageContentProcessor::initDictionaries(const PDFObject& resourcesObject)
{
auto getDictionary = [this, &resourcesObject](const char* resourceName) -> const pdf::PDFDictionary*
{ {
const PDFObject& resources = m_document->getObject(resourcesObject); const PDFObject& resources = m_document->getObject(resourcesObject);
auto getDictionary = [this, &resources](const char* resourceName) -> const pdf::PDFDictionary*
{
if (resources.isDictionary() && resources.getDictionary()->hasKey(resourceName)) if (resources.isDictionary() && resources.getDictionary()->hasKey(resourceName))
{ {
const PDFObject& resourceDictionary = m_document->getObject(resources.getDictionary()->get(resourceName)); const PDFObject& resourceDictionary = m_document->getObject(resources.getDictionary()->get(resourceName));
@ -179,6 +179,43 @@ void PDFPageContentProcessor::initDictionaries(const PDFObject& resourcesObject)
m_propertiesDictionary = getDictionary("Properties"); m_propertiesDictionary = getDictionary("Properties");
m_shadingDictionary = getDictionary("Shading"); m_shadingDictionary = getDictionary("Shading");
m_patternDictionary = getDictionary("Pattern"); m_patternDictionary = getDictionary("Pattern");
m_procedureSets = NoProcSet;
if (resources.isDictionary() && resources.getDictionary()->hasKey("ProcSet"))
{
PDFDocumentDataLoaderDecorator loader(m_document);
std::vector<QByteArray> procedureSetNames = loader.readNameArrayFromDictionary(resources.getDictionary(), "ProcSet");
ProcedureSets newProcSet = EmptyProcSet;
for (const QByteArray& procedureSetName : procedureSetNames)
{
if (procedureSetName == "PDF")
{
newProcSet.setFlag(PDF);
}
else if (procedureSetName == "Text")
{
newProcSet.setFlag(Text);
}
else if (procedureSetName == "ImageB")
{
newProcSet.setFlag(ImageB);
}
else if (procedureSetName == "ImageC")
{
newProcSet.setFlag(ImageC);
}
else if (procedureSetName == "ImageI")
{
newProcSet.setFlag(ImageI);
}
}
if (newProcSet)
{
m_procedureSets = newProcSet;
}
}
} }
PDFPageContentProcessor::PDFPageContentProcessor(const PDFPage* page, PDFPageContentProcessor::PDFPageContentProcessor(const PDFPage* page,
@ -200,6 +237,7 @@ PDFPageContentProcessor::PDFPageContentProcessor(const PDFPage* page,
m_propertiesDictionary(nullptr), m_propertiesDictionary(nullptr),
m_shadingDictionary(nullptr), m_shadingDictionary(nullptr),
m_patternDictionary(nullptr), m_patternDictionary(nullptr),
m_procedureSets(NoProcSet),
m_textBeginEndState(0), m_textBeginEndState(0),
m_compatibilityBeginEndState(0), m_compatibilityBeginEndState(0),
m_drawingUncoloredTilingPatternState(0), m_drawingUncoloredTilingPatternState(0),
@ -3501,7 +3539,8 @@ PDFPageContentProcessor::PDFPageContentProcessorStateGuard::PDFPageContentProces
m_extendedGraphicStateDictionary(processor->m_extendedGraphicStateDictionary), m_extendedGraphicStateDictionary(processor->m_extendedGraphicStateDictionary),
m_propertiesDictionary(processor->m_propertiesDictionary), m_propertiesDictionary(processor->m_propertiesDictionary),
m_shadingDictionary(processor->m_shadingDictionary), m_shadingDictionary(processor->m_shadingDictionary),
m_patternDictionary(processor->m_patternDictionary) m_patternDictionary(processor->m_patternDictionary),
m_procedureSets(processor->m_procedureSets)
{ {
m_processor->operatorSaveGraphicState(); m_processor->operatorSaveGraphicState();
} }
@ -3516,6 +3555,7 @@ PDFPageContentProcessor::PDFPageContentProcessorStateGuard::~PDFPageContentProce
m_processor->m_propertiesDictionary = m_propertiesDictionary; m_processor->m_propertiesDictionary = m_propertiesDictionary;
m_processor->m_shadingDictionary = m_shadingDictionary; m_processor->m_shadingDictionary = m_shadingDictionary;
m_processor->m_patternDictionary = m_patternDictionary; m_processor->m_patternDictionary = m_patternDictionary;
m_processor->m_procedureSets = m_procedureSets;
m_processor->operatorRestoreGraphicState(); m_processor->operatorRestoreGraphicState();
} }

View File

@ -197,6 +197,18 @@ public:
Invalid ///< Invalid operator, use for error reporting Invalid ///< Invalid operator, use for error reporting
}; };
/// Procedure set flags. Each enumerator (except EmptyProcSet) occupies its
/// own bit, so the values can be combined into a ProcedureSets flag set.
enum ProcedureSet
{
    EmptyProcSet = 0,       ///< No bit set, starting value when names from /ProcSet are collected
    NoProcSet    = 1 << 0,  ///< Used when no known procedure set name was read from resources
    PDF          = 1 << 1,  ///< Set for the procedure set name "PDF"
    Text         = 1 << 2,  ///< Set for the procedure set name "Text"
    ImageB       = 1 << 3,  ///< Set for the procedure set name "ImageB"
    ImageC       = 1 << 4,  ///< Set for the procedure set name "ImageC"
    ImageI       = 1 << 5   ///< Set for the procedure set name "ImageI"
};
Q_DECLARE_FLAGS(ProcedureSets, ProcedureSet)
/// Process the contents of the page /// Process the contents of the page
QList<PDFRenderError> processContents(); QList<PDFRenderError> processContents();
@ -547,6 +559,10 @@ protected:
/// Returns page bounding rectangle in device space /// Returns page bounding rectangle in device space
const QRectF& getPageBoundingRectDeviceSpace() const { return m_pageBoundingRectDeviceSpace; } const QRectF& getPageBoundingRectDeviceSpace() const { return m_pageBoundingRectDeviceSpace; }
/// Gives read access to the procedure sets currently held by the processor.
/// Procedure sets are deprecated in PDF 2.0 and are kept only for compatibility
/// purposes, see chapter 14.2 in PDF 2.0 specification.
/// \returns Flags of current procedure sets
ProcedureSets getProcedureSets() const
{
    return m_procedureSets;
}
private: private:
/// Initializes the resources dictionaries /// Initializes the resources dictionaries
void initDictionaries(const PDFObject& resourcesObject); void initDictionaries(const PDFObject& resourcesObject);
@ -621,6 +637,7 @@ private:
const PDFDictionary* m_propertiesDictionary; const PDFDictionary* m_propertiesDictionary;
const PDFDictionary* m_shadingDictionary; const PDFDictionary* m_shadingDictionary;
const PDFDictionary* m_patternDictionary; const PDFDictionary* m_patternDictionary;
ProcedureSets m_procedureSets;
}; };
class PDFPageContentProcessorGraphicStateSaveRestoreGuard class PDFPageContentProcessorGraphicStateSaveRestoreGuard
@ -878,6 +895,7 @@ private:
const PDFDictionary* m_propertiesDictionary; const PDFDictionary* m_propertiesDictionary;
const PDFDictionary* m_shadingDictionary; const PDFDictionary* m_shadingDictionary;
const PDFDictionary* m_patternDictionary; const PDFDictionary* m_patternDictionary;
ProcedureSets m_procedureSets;
// Default color spaces // Default color spaces
PDFColorSpacePointer m_deviceGrayColorSpace; PDFColorSpacePointer m_deviceGrayColorSpace;