UTF-8 support (PDF 2.0 compliance) and ProcedureSets (for compatibility)

This commit is contained in:
Jakub Melka
2020-08-02 15:29:10 +02:00
parent fa30ed37bb
commit a2c5de0fe4
4 changed files with 89 additions and 4 deletions

View File

@ -2193,6 +2193,10 @@ QString PDFEncoding::convertTextString(const QByteArray& stream)
{
return convertFromUnicode(stream);
}
else if (hasUTF8LeadMarkings(stream))
{
return QString::fromUtf8(stream);
}
else
{
return convert(stream, Encoding::PDFDoc);
@ -2348,4 +2352,20 @@ bool PDFEncoding::hasUnicodeLeadMarkings(const QByteArray& stream)
return false;
}
bool PDFEncoding::hasUTF8LeadMarkings(const QByteArray& stream)
{
    // The UTF-8 byte order mark is three bytes long, shorter input cannot carry it
    if (stream.size() < 3)
    {
        return false;
    }

    // QByteArray yields (possibly signed) char, so compare the bytes as unsigned values
    const auto byteAt = [&stream](int index)
    {
        return static_cast<unsigned char>(stream[index]);
    };

    // UTF-8 byte order mark: EF BB BF
    return byteAt(0) == 0xEF && byteAt(1) == 0xBB && byteAt(2) == 0xBF;
}
} // namespace pdf

View File

@ -105,11 +105,18 @@ public:
static const encoding::EncodingTable* getTableForEncoding(Encoding encoding);
private:
/// Returns true, if byte array has UTF-16BE unicode marking bytes at the
/// Returns true, if byte array has UTF-16BE/LE unicode marking bytes at the
/// stream start. If they are present, then byte stream is probably encoded
/// as unicode.
/// \param stream Stream to be tested
static bool hasUnicodeLeadMarkings(const QByteArray& stream);
/// Returns true, if byte array has UTF-8 unicode marking bytes at the stream
/// start. If they are present, then byte stream is probably encoded
/// as UTF-8 string.
/// \note UTF-8 strings were added in PDF 2.0 specification
/// \param stream Stream to be tested
static bool hasUTF8LeadMarkings(const QByteArray& stream);
};
} // namespace pdf

View File

@ -157,9 +157,9 @@ static constexpr const std::pair<const char*, PDFPageContentProcessor::Operator>
void PDFPageContentProcessor::initDictionaries(const PDFObject& resourcesObject)
{
auto getDictionary = [this, &resourcesObject](const char* resourceName) -> const pdf::PDFDictionary*
const PDFObject& resources = m_document->getObject(resourcesObject);
auto getDictionary = [this, &resources](const char* resourceName) -> const pdf::PDFDictionary*
{
const PDFObject& resources = m_document->getObject(resourcesObject);
if (resources.isDictionary() && resources.getDictionary()->hasKey(resourceName))
{
const PDFObject& resourceDictionary = m_document->getObject(resources.getDictionary()->get(resourceName));
@ -179,6 +179,43 @@ void PDFPageContentProcessor::initDictionaries(const PDFObject& resourcesObject)
m_propertiesDictionary = getDictionary("Properties");
m_shadingDictionary = getDictionary("Shading");
m_patternDictionary = getDictionary("Pattern");
m_procedureSets = NoProcSet;
if (resources.isDictionary() && resources.getDictionary()->hasKey("ProcSet"))
{
PDFDocumentDataLoaderDecorator loader(m_document);
std::vector<QByteArray> procedureSetNames = loader.readNameArrayFromDictionary(resources.getDictionary(), "ProcSet");
ProcedureSets newProcSet = EmptyProcSet;
for (const QByteArray& procedureSetName : procedureSetNames)
{
if (procedureSetName == "PDF")
{
newProcSet.setFlag(PDF);
}
else if (procedureSetName == "Text")
{
newProcSet.setFlag(Text);
}
else if (procedureSetName == "ImageB")
{
newProcSet.setFlag(ImageB);
}
else if (procedureSetName == "ImageC")
{
newProcSet.setFlag(ImageC);
}
else if (procedureSetName == "ImageI")
{
newProcSet.setFlag(ImageI);
}
}
if (newProcSet)
{
m_procedureSets = newProcSet;
}
}
}
PDFPageContentProcessor::PDFPageContentProcessor(const PDFPage* page,
@ -200,6 +237,7 @@ PDFPageContentProcessor::PDFPageContentProcessor(const PDFPage* page,
m_propertiesDictionary(nullptr),
m_shadingDictionary(nullptr),
m_patternDictionary(nullptr),
m_procedureSets(NoProcSet),
m_textBeginEndState(0),
m_compatibilityBeginEndState(0),
m_drawingUncoloredTilingPatternState(0),
@ -3501,7 +3539,8 @@ PDFPageContentProcessor::PDFPageContentProcessorStateGuard::PDFPageContentProces
m_extendedGraphicStateDictionary(processor->m_extendedGraphicStateDictionary),
m_propertiesDictionary(processor->m_propertiesDictionary),
m_shadingDictionary(processor->m_shadingDictionary),
m_patternDictionary(processor->m_patternDictionary)
m_patternDictionary(processor->m_patternDictionary),
m_procedureSets(processor->m_procedureSets)
{
m_processor->operatorSaveGraphicState();
}
@ -3516,6 +3555,7 @@ PDFPageContentProcessor::PDFPageContentProcessorStateGuard::~PDFPageContentProce
m_processor->m_propertiesDictionary = m_propertiesDictionary;
m_processor->m_shadingDictionary = m_shadingDictionary;
m_processor->m_patternDictionary = m_patternDictionary;
m_processor->m_procedureSets = m_procedureSets;
m_processor->operatorRestoreGraphicState();
}

View File

@ -197,6 +197,18 @@ public:
Invalid ///< Invalid operator, use for error reporting
};
/// Procedure set flags. Each named procedure set occupies a single bit, so the
/// values can be combined together in the ProcedureSets flags type.
enum ProcedureSet
{
    EmptyProcSet = 0,       ///< No flag set, used as starting value when the flags are collected
    NoProcSet    = 1 << 0,  ///< Resources do not declare any recognized procedure set
    PDF          = 1 << 1,  ///< Procedure set with name "PDF"
    Text         = 1 << 2,  ///< Procedure set with name "Text"
    ImageB       = 1 << 3,  ///< Procedure set with name "ImageB"
    ImageC       = 1 << 4,  ///< Procedure set with name "ImageC"
    ImageI       = 1 << 5   ///< Procedure set with name "ImageI"
};
Q_DECLARE_FLAGS(ProcedureSets, ProcedureSet)
/// Process the contents of the page
QList<PDFRenderError> processContents();
@ -547,6 +559,10 @@ protected:
/// Returns page bounding rectangle in device space
const QRectF& getPageBoundingRectDeviceSpace() const { return m_pageBoundingRectDeviceSpace; }
/// Gives access to the procedure sets which are currently stored in the processor.
/// \note Procedure sets are deprecated in PDF 2.0 and are kept only for compatibility
///       purposes, see chapter 14.2 in PDF 2.0 specification.
/// \returns Copy of the current procedure set flags
ProcedureSets getProcedureSets() const
{
    return m_procedureSets;
}
private:
/// Initializes the resources dictionaries
void initDictionaries(const PDFObject& resourcesObject);
@ -621,6 +637,7 @@ private:
const PDFDictionary* m_propertiesDictionary;
const PDFDictionary* m_shadingDictionary;
const PDFDictionary* m_patternDictionary;
ProcedureSets m_procedureSets;
};
class PDFPageContentProcessorGraphicStateSaveRestoreGuard
@ -878,6 +895,7 @@ private:
const PDFDictionary* m_propertiesDictionary;
const PDFDictionary* m_shadingDictionary;
const PDFDictionary* m_patternDictionary;
ProcedureSets m_procedureSets;
// Default color spaces
PDFColorSpacePointer m_deviceGrayColorSpace;