Encoding tables

This commit is contained in:
Jakub Melka 2018-12-02 17:53:19 +01:00
parent bc8617751e
commit 2e805b198c
8 changed files with 2321 additions and 9 deletions

View File

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE QtCreatorProject>
<!-- Written by QtCreator 4.7.2, 2018-11-30T19:34:52. -->
<!-- Written by QtCreator 4.7.2, 2018-12-02T11:05:49. -->
<qtcreator>
<data>
<variable>EnvironmentId</variable>
@ -67,7 +67,7 @@
<value type="QString" key="ProjectExplorer.ProjectConfiguration.DefaultDisplayName">Desktop Qt 5.11.2 MSVC2017 64bit</value>
<value type="QString" key="ProjectExplorer.ProjectConfiguration.DisplayName">Desktop Qt 5.11.2 MSVC2017 64bit</value>
<value type="QString" key="ProjectExplorer.ProjectConfiguration.Id">qt.qt5.5112.win64_msvc2017_64_kit</value>
<value type="int" key="ProjectExplorer.Target.ActiveBuildConfiguration">0</value>
<value type="int" key="ProjectExplorer.Target.ActiveBuildConfiguration">1</value>
<value type="int" key="ProjectExplorer.Target.ActiveDeployConfiguration">0</value>
<value type="int" key="ProjectExplorer.Target.ActiveRunConfiguration">1</value>
<valuemap type="QVariantMap" key="ProjectExplorer.Target.BuildConfiguration.0">
@ -296,7 +296,7 @@
<value type="QString" key="Qt4ProjectManager.Qt4RunConfiguration.CommandLineArguments"></value>
<value type="QString" key="Qt4ProjectManager.Qt4RunConfiguration.ProFile">UnitTests/UnitTests.pro</value>
<value type="QString" key="Qt4ProjectManager.Qt4RunConfiguration.UserWorkingDirectory"></value>
<value type="QString" key="Qt4ProjectManager.Qt4RunConfiguration.UserWorkingDirectory.default">K:/Programming/PDF/PDF_For_Qt/bin_debug/UnitTests/..</value>
<value type="QString" key="Qt4ProjectManager.Qt4RunConfiguration.UserWorkingDirectory.default">K:/Programming/PDF/PDF_For_Qt/bin_release/UnitTests/..</value>
<value type="uint" key="RunConfiguration.QmlDebugServerPort">3768</value>
<value type="bool" key="RunConfiguration.UseCppDebugger">false</value>
<value type="bool" key="RunConfiguration.UseCppDebuggerAuto">true</value>
@ -353,7 +353,7 @@
<value type="QString" key="Qt4ProjectManager.Qt4RunConfiguration.CommandLineArguments"></value>
<value type="QString" key="Qt4ProjectManager.Qt4RunConfiguration.ProFile">PdfForQtViewer/PdfForQtViewer.pro</value>
<value type="QString" key="Qt4ProjectManager.Qt4RunConfiguration.UserWorkingDirectory"></value>
<value type="QString" key="Qt4ProjectManager.Qt4RunConfiguration.UserWorkingDirectory.default">K:/Programming/PDF/PDF_For_Qt/bin_debug/PdfForQtViewer/..</value>
<value type="QString" key="Qt4ProjectManager.Qt4RunConfiguration.UserWorkingDirectory.default">K:/Programming/PDF/PDF_For_Qt/bin_release/PdfForQtViewer/..</value>
<value type="uint" key="RunConfiguration.QmlDebugServerPort">3768</value>
<value type="bool" key="RunConfiguration.UseCppDebugger">false</value>
<value type="bool" key="RunConfiguration.UseCppDebuggerAuto">true</value>

View File

@ -41,7 +41,8 @@ SOURCES += \
sources/pdfdocument.cpp \
sources/pdfdocumentreader.cpp \
sources/pdfxreftable.cpp \
sources/pdfvisitor.cpp
sources/pdfvisitor.cpp \
sources/pdfencoding.cpp
HEADERS += \
sources/pdfobject.h \
@ -52,7 +53,8 @@ HEADERS += \
sources/pdfdocumentreader.h \
sources/pdfxreftable.h \
sources/pdfflatmap.h \
sources/pdfvisitor.h
sources/pdfvisitor.h \
sources/pdfencoding.h
unix {
target.path = /usr/lib
@ -63,4 +65,4 @@ unix {
CONFIG += force_debug_info
QMAKE_CXXFLAGS += /std:c++latest
QMAKE_CXXFLAGS += /std:c++latest /utf-8

View File

@ -17,4 +17,147 @@
#include "pdfdocument.h"
#include "pdfparser.h"
#include "pdfencoding.h"
namespace pdf
{
// Entries for "Info" entry in trailer dictionary

// Key under which the trailer dictionary stores the document info dictionary
static constexpr const char* PDF_DOCUMENT_INFO_ENTRY = "Info";

// Keys of the text string entries of the document info dictionary
static constexpr const char* PDF_DOCUMENT_INFO_ENTRY_TITLE = "Title";
static constexpr const char* PDF_DOCUMENT_INFO_ENTRY_AUTHOR = "Author";
static constexpr const char* PDF_DOCUMENT_INFO_ENTRY_SUBJECT = "Subject";
static constexpr const char* PDF_DOCUMENT_INFO_ENTRY_KEYWORDS = "Keywords";
static constexpr const char* PDF_DOCUMENT_INFO_ENTRY_CREATOR = "Creator";
static constexpr const char* PDF_DOCUMENT_INFO_ENTRY_PRODUCER = "Producer";

// Keys of the date entries of the document info dictionary
static constexpr const char* PDF_DOCUMENT_INFO_ENTRY_CREATION_DATE = "CreationDate";
static constexpr const char* PDF_DOCUMENT_INFO_ENTRY_MODIFIED_DATE = "ModDate";

// Key of the trapping entry, followed by the name values accepted for it
static constexpr const char* PDF_DOCUMENT_INFO_ENTRY_TRAPPED = "Trapped";
static constexpr const char* PDF_DOCUMENT_INFO_ENTRY_TRAPPED_TRUE = "True";
static constexpr const char* PDF_DOCUMENT_INFO_ENTRY_TRAPPED_FALSE = "False";
static constexpr const char* PDF_DOCUMENT_INFO_ENTRY_TRAPPED_UNKNOWN = "Unknown";
// Initializes the data of the document which are derived from the object
// storage. At present this only loads the document info; any exception
// thrown by initInfo() propagates to the caller.
void PDFDocument::init()
{
initInfo();
}
// Fills m_info from the document info dictionary referenced by the "Info"
// entry of the trailer dictionary.
//
// Entries which are missing, or whose value resolves to the null object
// (for example a reference which cannot be dereferenced), are left at their
// default values. Throws PDFParserException if the info object is present but
// is not a dictionary, or if a present entry has an unexpected type or value.
void PDFDocument::initInfo()
{
    const PDFObject& trailerDictionary = m_pdfObjectStorage.getTrailerDictionary();

    // Trailer object should be dictionary here. It is verified in the document reader.
    Q_ASSERT(trailerDictionary.isDictionary());
    const PDFDictionary* dictionary = trailerDictionary.getDictionary();
    Q_ASSERT(dictionary);

    if (!dictionary->hasKey(PDF_DOCUMENT_INFO_ENTRY))
    {
        // No document info present, keep the default info
        return;
    }

    const PDFObject& info = getObject(dictionary->get(PDF_DOCUMENT_INFO_ENTRY));
    if (!info.isDictionary())
    {
        // Info may be invalid (null object); this is tolerated. Anything else is an error.
        if (!info.isNull())
        {
            throw PDFParserException(tr("Bad format of document info entry in trailer dictionary."));
        }
        return;
    }

    const PDFDictionary* infoDictionary = info.getDictionary();
    Q_ASSERT(infoDictionary);

    // Reads a text string entry into fillEntry. A missing or null entry leaves
    // fillEntry untouched, any other non-string value is an error.
    auto readTextString = [this, infoDictionary](const char* entry, QString& fillEntry)
    {
        if (!infoDictionary->hasKey(entry))
        {
            return;
        }

        const PDFObject& stringObject = getObject(infoDictionary->get(entry));
        if (stringObject.isString())
        {
            // We have successfully read the string, convert it according to encoding
            fillEntry = PDFEncoding::convertTextString(stringObject.getString());
        }
        else if (!stringObject.isNull())
        {
            throw PDFParserException(tr("Bad format of document info entry in trailer dictionary. String expected."));
        }
    };

    readTextString(PDF_DOCUMENT_INFO_ENTRY_TITLE, m_info.title);
    readTextString(PDF_DOCUMENT_INFO_ENTRY_AUTHOR, m_info.author);
    readTextString(PDF_DOCUMENT_INFO_ENTRY_SUBJECT, m_info.subject);
    readTextString(PDF_DOCUMENT_INFO_ENTRY_KEYWORDS, m_info.keywords);
    readTextString(PDF_DOCUMENT_INFO_ENTRY_CREATOR, m_info.creator);
    readTextString(PDF_DOCUMENT_INFO_ENTRY_PRODUCER, m_info.producer);

    // Reads a date entry into fillEntry. A missing or null entry leaves
    // fillEntry untouched; a non-string value, or a string which cannot be
    // converted to a valid date time, is an error.
    auto readDate = [this, infoDictionary](const char* entry, QDateTime& fillEntry)
    {
        if (!infoDictionary->hasKey(entry))
        {
            return;
        }

        const PDFObject& stringObject = getObject(infoDictionary->get(entry));
        if (stringObject.isString())
        {
            // We have successfully read the string, convert it to date time
            fillEntry = PDFEncoding::convertToDateTime(stringObject.getString());
            if (!fillEntry.isValid())
            {
                throw PDFParserException(tr("Bad format of document info entry in trailer dictionary. String with date time format expected."));
            }
        }
        else if (!stringObject.isNull())
        {
            throw PDFParserException(tr("Bad format of document info entry in trailer dictionary. String with date time format expected."));
        }
    };

    readDate(PDF_DOCUMENT_INFO_ENTRY_CREATION_DATE, m_info.creationDate);
    readDate(PDF_DOCUMENT_INFO_ENTRY_MODIFIED_DATE, m_info.modifiedDate);

    if (infoDictionary->hasKey(PDF_DOCUMENT_INFO_ENTRY_TRAPPED))
    {
        const PDFObject& nameObject = getObject(infoDictionary->get(PDF_DOCUMENT_INFO_ENTRY_TRAPPED));
        if (nameObject.isName())
        {
            const QByteArray& name = nameObject.getString();
            if (name == PDF_DOCUMENT_INFO_ENTRY_TRAPPED_TRUE)
            {
                m_info.trapped = Info::Trapped::True;
            }
            else if (name == PDF_DOCUMENT_INFO_ENTRY_TRAPPED_FALSE)
            {
                m_info.trapped = Info::Trapped::False;
            }
            else if (name == PDF_DOCUMENT_INFO_ENTRY_TRAPPED_UNKNOWN)
            {
                m_info.trapped = Info::Trapped::Unknown;
            }
            else
            {
                throw PDFParserException(tr("Bad format of document info entry in trailer dictionary. Trapping information expected"));
            }
        }
        else if (!nameObject.isNull())
        {
            // A null value is treated as an absent entry, consistently with the
            // text string and date entries above; the default trapping value is kept.
            throw PDFParserException(tr("Bad format of document info entry in trailer dictionary. Trapping information expected"));
        }
    }
}
// Looks up the object addressed by the given reference. The object is
// returned only if the object number lies inside the storage and the stored
// generation equals the generation of the reference. Otherwise a shared,
// default-constructed object is returned; no exception is thrown.
const PDFObject& PDFObjectStorage::getObject(PDFObjectReference reference) const
{
    const auto index = reference.objectNumber;
    const bool isIndexInRange = index >= 0 && index < static_cast<PDFInteger>(m_objects.size());

    // The generation is inspected only when the index is known to be valid
    if (isIndexInRange && m_objects[index].generation == reference.generation)
    {
        return m_objects[index].object;
    }

    // Fallback for every reference which cannot be resolved
    static const PDFObject dummy;
    return dummy;
}
} // namespace pdf

View File

@ -22,6 +22,9 @@
#include "pdfglobal.h"
#include "pdfobject.h"
#include <QtCore>
#include <QDateTime>
namespace pdf
{
@ -56,6 +59,10 @@ public:
}
/// Returns object from the object storage. If invalid reference is passed,
/// then null object is returned (no exception is thrown).
const PDFObject& getObject(PDFObjectReference reference) const;
/// Returns array of objects stored in this storage
const PDFObjects& getObjects() const { return m_objects; }
@ -70,24 +77,82 @@ private:
/// PDF document main class. Holds the storage of the document's objects
/// together with the data derived from it (currently the document info).
class PDFDocument
{
// Provides the static tr() translation functions for this non-QObject class
Q_DECLARE_TR_FUNCTIONS(pdf::PDFDocument)
public:
/// Constructs a document with default-constructed object storage and
/// default document info. No initialization from the storage is performed.
explicit PDFDocument() = default;
/// Returns the object storage of this document
const PDFObjectStorage& getStorage() const { return m_pdfObjectStorage; }
/// Info about the document. Title, Author, Keywords...
struct Info
{
/// Indicates whether the document has been modified to include trapping information.
/// See PDF Reference 1.7, Section 10.10.5 "Trapping Support".
enum class Trapped
{
True, ///< Fully trapped
False, ///< Not yet trapped
Unknown ///< Either unknown, or it has been trapped partly, not fully
};
QString title;
QString author;
QString subject;
QString keywords;
QString creator;
QString producer;
QDateTime creationDate;
QDateTime modifiedDate;
/// Trapping state; Unknown unless the document info says otherwise
Trapped trapped = Trapped::Unknown;
};
/// Returns info about the document (title, author, etc.). The returned
/// pointer refers to a member of this document, so it is never null.
const Info* getInfo() const { return &m_info; }
private:
// The friend declaration gives the reader access to the private constructor below
friend class PDFDocumentReader;
/// Constructs the document from the given object storage (which is moved
/// into the document) and initializes the derived data by calling init().
/// Exceptions thrown by init() propagate out of the constructor.
explicit PDFDocument(PDFObjectStorage&& storage) :
m_pdfObjectStorage(std::move(storage))
{
init();
}
/// Initialize data based on object in the storage.
/// Can throw exception if error is detected.
void init();
/// Initialize the document info from the trailer dictionary.
/// If document info is not present, then default document
/// info is used. If error is detected, exception is thrown.
void initInfo();
/// If the object is a reference, a dereference attempt is performed
/// and the referenced object is returned. If it is not a reference,
/// then the object itself is returned. If the dereference attempt
/// fails, then null object is returned (no exception is thrown).
const PDFObject& getObject(const PDFObject& object) const;
/// Storage of objects
PDFObjectStorage m_pdfObjectStorage;
/// Info about the PDF document
Info m_info;
};
// Resolves one level of indirection: a reference is looked up in the object
// storage (which yields its fallback object if the lookup fails), any other
// object is handed back unchanged.
inline
const PDFObject& PDFDocument::getObject(const PDFObject& object) const
{
    return object.isReference() ? m_pdfObjectStorage.getObject(object.getReference())
                                : object;
}
} // namespace pdf
#endif // PDFDOCUMENT_H

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,90 @@
// Copyright (C) 2018 Jakub Melka
//
// This file is part of PdfForQt.
//
// PdfForQt is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// PdfForQt is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with PDFForQt. If not, see <https://www.gnu.org/licenses/>.
#ifndef PDFENCODING_H
#define PDFENCODING_H
#include <QString>
#include <QDateTime>
#include <array>
namespace pdf
{
namespace encoding
{
/// Conversion table holding one unicode character (QChar) for each of the
/// 256 possible values of a byte.
/// NOTE(review): presumably indexed directly by the byte value - confirm
/// against the table definitions and PDFEncoding::convert.
using EncodingTable = std::array<QChar, 256>;
}
/// This class can convert byte stream to the QString in unicode encoding.
/// PDF has several encodings, see PDF Reference 1.7, Appendix D.
/// All functions are static; the class cannot be instantiated.
class PDFEncoding
{
public:
// Deleted constructor - the class only groups static functions
explicit PDFEncoding() = delete;
/// Single byte encodings which can be passed to convert()
enum class Encoding
{
Standard, ///< Appendix D, Section D.1, StandardEncoding
MacRoman, ///< Appendix D, Section D.1, MacRomanEncoding
WinAnsi, ///< Appendix D, Section D.1, WinAnsiEncoding
PDFDoc, ///< Appendix D, Section D.1/D.2, PDFDocEncoding
MacExpert, ///< Appendix D, Section D.3, MacExpertEncoding
Symbol, ///< Appendix D, Section D.4, Symbol Set and Encoding
ZapfDingbats ///< Appendix D, Section D.5, Zapf Dingbats Encoding
};
/// Converts byte array to the unicode string using specified encoding
/// \param stream Stream (byte array string) to be processed
/// \param encoding Encoding used to convert to unicode string
/// \returns Converted unicode string
static QString convert(const QByteArray& stream, Encoding encoding);
/// Convert text string to the unicode string, using either PDFDocEncoding,
/// or UTF-16BE encoding. Please see PDF Reference 1.7, Chapter 3.8.1. If
/// UTF-16BE encoding is used, then leading bytes should be 0xFE and 0xFF
/// \param stream Text string (byte array) to be converted
/// \returns Converted unicode string
static QString convertTextString(const QByteArray& stream);
/// Converts byte array containing text in UTF-16BE encoding to QString.
/// \param stream Byte array with UTF-16BE encoded text
/// \returns Converted unicode string
static QString convertFromUnicode(const QByteArray& stream);
/// Convert stream to date time according to PDF Reference 1.7, Chapter 3.8.1.
/// If date cannot be converted (string is invalid), then invalid QDateTime
/// is returned.
/// \param stream Stream, from which date/time is read
/// \returns Converted date time, or invalid QDateTime on failure
static QDateTime convertToDateTime(const QByteArray& stream);
private:
/// Returns conversion table for particular encoding
/// \param encoding Encoding
/// \returns Pointer to the conversion table of the given encoding
static const encoding::EncodingTable* getTableForEncoding(Encoding encoding);
/// Returns true, if byte array has UTF-16BE unicode marking bytes at the
/// stream start. If they are present, then byte stream is probably encoded
/// as unicode.
/// \param stream Stream to be tested
static bool hasUnicodeLeadMarkings(const QByteArray& stream);
};
} // namespace pdf
#endif // PDFENCODING_H

View File

@ -22,7 +22,7 @@ DEFINES += QT_DEPRECATED_WARNINGS
# You can also select to disable deprecated APIs only up to a certain version of Qt.
#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000 # disables all the APIs deprecated before Qt 6.0.0
QMAKE_CXXFLAGS += /std:c++latest
QMAKE_CXXFLAGS += /std:c++latest /utf-8
INCLUDEPATH += $$PWD/../PDFForQtLib/Sources

View File

@ -103,6 +103,7 @@ void LexicalAnalyzerTest::test_strings()
testTokens("(Text with special character: \\))", { Token(Type::String, QByteArray("Text with special character: )")) });
testTokens("(Text with special character: \\\\)", { Token(Type::String, QByteArray("Text with special character: \\")) });
testTokens("(\53)", { Token(Type::String, QByteArray("+")) });
testTokens("(\376\377)", { Token(Type::String, QByteArray("\376\377")) });
testTokens("(\0533)", { Token(Type::String, QByteArray("+3")) });
testTokens("(\053)", { Token(Type::String, QByteArray("+")) });
testTokens("(\053053)", { Token(Type::String, QByteArray("+053")) });