From a6163978bc91af969b652ece85c87f9bb638607f Mon Sep 17 00:00:00 2001 From: Jakub Melka Date: Wed, 30 Sep 2020 18:41:22 +0200 Subject: [PATCH] Tool for exporting internal format of PDF to xml file --- PdfForQtLib/sources/pdfencoding.cpp | 70 ++++++++ PdfForQtLib/sources/pdfencoding.h | 16 ++ PdfForQtLib/sources/pdfobject.h | 2 +- PdfForQtLib/sources/pdfvisitor.h | 2 +- PdfTool/PdfTool.pro | 6 +- PdfTool/pdftoolabstractapplication.cpp | 51 ++++++ PdfTool/pdftoolabstractapplication.h | 12 +- PdfTool/pdftoolxml.cpp | 212 +++++++++++++++++++++++++ PdfTool/pdftoolxml.h | 36 +++++ 9 files changed, 402 insertions(+), 5 deletions(-) create mode 100644 PdfTool/pdftoolxml.cpp create mode 100644 PdfTool/pdftoolxml.h diff --git a/PdfForQtLib/sources/pdfencoding.cpp b/PdfForQtLib/sources/pdfencoding.cpp index fd1a3d6..6f468d0 100644 --- a/PdfForQtLib/sources/pdfencoding.cpp +++ b/PdfForQtLib/sources/pdfencoding.cpp @@ -18,6 +18,7 @@ #include "pdfencoding.h" #include +#include #include @@ -2187,6 +2188,21 @@ bool PDFEncoding::canConvertToEncoding(const QString& string, PDFEncoding::Encod return true; } +bool PDFEncoding::canConvertFromEncoding(const QByteArray& stream, PDFEncoding::Encoding encoding) +{ + const encoding::EncodingTable* table = getTableForEncoding(encoding); + for (const unsigned char index : stream) + { + QChar character = (*table)[index]; + if (character == QChar(0xfffd)) + { + return false; + } + } + + return true; +} + QString PDFEncoding::convertTextString(const QByteArray& stream) { if (hasUnicodeLeadMarkings(stream)) @@ -2333,6 +2349,60 @@ const encoding::EncodingTable* PDFEncoding::getTableForEncoding(Encoding encodin return nullptr; } +QString PDFEncoding::convertSmartFromByteStringToUnicode(const QByteArray& stream, bool* isBinary) +{ + if (isBinary) + { + *isBinary = false; + } + + if (hasUnicodeLeadMarkings(stream)) + { + QTextCodec::ConverterState state = { }; + + { + QTextCodec* codec = QTextCodec::codecForName("UTF-16BE"); + QString text = 
codec->toUnicode(stream.constData(), stream.length(), &state); + if (state.invalidChars == 0) + { + return text; + } + } + { + QTextCodec::ConverterState stateLE = { }; // fresh state: 'state' was already updated by the UTF-16BE attempt + QTextCodec* codec = QTextCodec::codecForName("UTF-16LE"); + QString text = codec->toUnicode(stream.constData(), stream.length(), &stateLE); + if (stateLE.invalidChars == 0) + { + return text; + } + } + } + + if (hasUTF8LeadMarkings(stream)) + { + QTextCodec::ConverterState state = { }; + + QTextCodec* codec = QTextCodec::codecForName("UTF-8"); + QString text = codec->toUnicode(stream.constData(), stream.length(), &state); + if (state.invalidChars == 0) + { + return text; + } + } + + if (canConvertFromEncoding(stream, Encoding::PDFDoc)) + { + return convert(stream, Encoding::PDFDoc); + } + + if (isBinary) + { + *isBinary = true; + } + return QString::fromLatin1(stream.toHex()).toUpper(); +} + bool PDFEncoding::hasUnicodeLeadMarkings(const QByteArray& stream) { if (stream.size() >= 2) diff --git a/PdfForQtLib/sources/pdfencoding.h b/PdfForQtLib/sources/pdfencoding.h index b172333..b5f070c 100644 --- a/PdfForQtLib/sources/pdfencoding.h +++ b/PdfForQtLib/sources/pdfencoding.h @@ -77,6 +77,13 @@ public: /// \param encoding Encoding used in verification of conversion static bool canConvertToEncoding(const QString& string, Encoding encoding); + /// Checks, if stream can be converted to string using encoding (i.e. all + /// characters are defined). If all characters are valid, then true is + /// returned. This is only a guess. + /// \param stream Stream + /// \param encoding Source encoding of the stream + static bool canConvertFromEncoding(const QByteArray& stream, Encoding encoding); + /// Convert text string to the unicode string, using either PDFDocEncoding, /// or UTF-16BE encoding. Please see PDF Reference 1.7, Chapter 3.8.1. 
If /// UTF-16BE encoding is used, then leading bytes should be 0xFE and 0xFF @@ -104,6 +111,15 @@ public: /// \param encoding Encoding static const encoding::EncodingTable* getTableForEncoding(Encoding encoding); + /// Tries to convert stream to unicode string. Stream can be binary. + /// If this is the case, then hexadecimal representation of stream is returned. + /// Function checks if stream can be converted to unicode by heuristic + /// way, it is not always reliable. + /// \param stream Stream + /// \param isBinary If specified, it is set to true if conversion failed + /// \returns Unicode string or string converted to hexadecimal representation + static QString convertSmartFromByteStringToUnicode(const QByteArray& stream, bool* isBinary); + private: /// Returns true, if byte array has UTF-16BE/LE unicode marking bytes at the /// stream start. If they are present, then byte stream is probably encoded diff --git a/PdfForQtLib/sources/pdfobject.h b/PdfForQtLib/sources/pdfobject.h index ff69fb2..4225cff 100644 --- a/PdfForQtLib/sources/pdfobject.h +++ b/PdfForQtLib/sources/pdfobject.h @@ -111,7 +111,7 @@ struct PDFInplaceString }; /// Reference to the string implementations -struct PDFStringRef +struct PDFFORQTLIBSHARED_EXPORT PDFStringRef { const PDFInplaceString* inplaceString = nullptr; const PDFString* memoryString = nullptr; diff --git a/PdfForQtLib/sources/pdfvisitor.h b/PdfForQtLib/sources/pdfvisitor.h index 3ebb0f1..2e4e118 100644 --- a/PdfForQtLib/sources/pdfvisitor.h +++ b/PdfForQtLib/sources/pdfvisitor.h @@ -32,7 +32,7 @@ namespace pdf { /// Abstract visitor, can iterate trough object tree -class PDFAbstractVisitor +class PDFFORQTLIBSHARED_EXPORT PDFAbstractVisitor { public: diff --git a/PdfTool/PdfTool.pro b/PdfTool/PdfTool.pro index f34991e..d3ff9d8 100644 --- a/PdfTool/PdfTool.pro +++ b/PdfTool/PdfTool.pro @@ -42,7 +42,8 @@ SOURCES += \ main.cpp \ pdfoutputformatter.cpp \ pdftoolabstractapplication.cpp \ - pdftoolverifysignatures.cpp + 
pdftoolverifysignatures.cpp \ + pdftoolxml.cpp # Default rules for deployment. qnx: target.path = /tmp/$${TARGET}/bin @@ -56,4 +57,5 @@ INSTALLS += application HEADERS += \ pdfoutputformatter.h \ pdftoolabstractapplication.h \ - pdftoolverifysignatures.h + pdftoolverifysignatures.h \ + pdftoolxml.h diff --git a/PdfTool/pdftoolabstractapplication.cpp b/PdfTool/pdftoolabstractapplication.cpp index bfbeead..1d77212 100644 --- a/PdfTool/pdftoolabstractapplication.cpp +++ b/PdfTool/pdftoolabstractapplication.cpp @@ -16,6 +16,7 @@ // along with PDFForQt. If not, see . #include "pdftoolabstractapplication.h" +#include "pdfdocumentreader.h" #include @@ -147,6 +148,12 @@ void PDFToolAbstractApplication::initializeCommandLineParser(QCommandLineParser* parser->addOption(QCommandLineOption("ver-ignore-exp-date", "Ignore certificate expiration date.")); parser->addOption(QCommandLineOption("ver-date-format", "Console output date/time format (valid values: short|long|iso|rfc2822).", "ver-date-format", "short")); } + + if (optionFlags.testFlag(XmlExport)) + { + parser->addOption(QCommandLineOption("xml-export-streams", "Export streams as hexadecimally encoded data. By default, stream data are not exported.")); + parser->addOption(QCommandLineOption("xml-export-streams-as-text", "Export streams as text, if possible. 
This flag enforces exporting stream data (possibly as hexadecimal strings).")); + } } PDFToolOptions PDFToolAbstractApplication::getOptions(QCommandLineParser* parser) const @@ -223,6 +230,50 @@ PDFToolOptions PDFToolAbstractApplication::getOptions(QCommandLineParser* parser return options; } +bool PDFToolAbstractApplication::readDocument(const PDFToolOptions& options, pdf::PDFDocument& document) +{ + bool isFirstPasswordAttempt = true; + auto passwordCallback = [&options, &isFirstPasswordAttempt](bool* ok) -> QString + { + *ok = isFirstPasswordAttempt; + isFirstPasswordAttempt = false; + return options.password; + }; + pdf::PDFDocumentReader reader(nullptr, passwordCallback, options.permissiveReading); + document = reader.readFromFile(options.document); + + switch (reader.getReadingResult()) + { + case pdf::PDFDocumentReader::Result::OK: + break; + + case pdf::PDFDocumentReader::Result::Cancelled: + { + PDFConsole::writeError(PDFToolTranslationContext::tr("Invalid password provided.")); + return false; + } + + case pdf::PDFDocumentReader::Result::Failed: + { + PDFConsole::writeError(PDFToolTranslationContext::tr("Error occured during document reading. 
%1").arg(reader.getErrorMessage())); + return false; + } + + default: + { + Q_ASSERT(false); + return false; + } + } + + for (const QString& warning : reader.getWarnings()) + { + PDFConsole::writeError(PDFToolTranslationContext::tr("Warning: %1").arg(warning)); + } + + return true; +} + PDFToolAbstractApplication* PDFToolApplicationStorage::getApplicationByCommand(const QString& command) { for (PDFToolAbstractApplication* application : getInstance()->m_applications) diff --git a/PdfTool/pdftoolabstractapplication.h b/PdfTool/pdftoolabstractapplication.h index 5aa2d98..ca5b884 100644 --- a/PdfTool/pdftoolabstractapplication.h +++ b/PdfTool/pdftoolabstractapplication.h @@ -19,6 +19,7 @@ #define PDFTOOLABSTRACTAPPLICATION_H #include "pdfoutputformatter.h" +#include "pdfdocument.h" #include #include @@ -54,6 +55,12 @@ struct PDFToolOptions bool verificationPrintCertificateDetails = false; bool verificationIgnoreExpirationDate = false; Qt::DateFormat verificationDateFormat = Qt::DefaultLocaleShortDate; + + // For option 'XmlExport' + bool xmlExportStreams = false; // TODO: finish the options + bool xmlExportStreamsAsText = false; + bool xmlUseIndent = true; // TODO: finish + bool xmlAlwaysBinary = false; // TODO: finish }; /// Base class for all applications @@ -81,7 +88,8 @@ public: { ConsoleFormat = 0x0001, ///< Set format of console output (text, xml or html) OpenDocument = 0x0002, ///< Flags for document reading - SignatureVerification = 0x0004, ///< Flags for signature verification + SignatureVerification = 0x0004, ///< Flags for signature verification, + XmlExport = 0x0008, ///< Flags for xml export }; Q_DECLARE_FLAGS(Options, Option) @@ -91,6 +99,8 @@ public: void initializeCommandLineParser(QCommandLineParser* parser) const; PDFToolOptions getOptions(QCommandLineParser* parser) const; + + bool readDocument(const PDFToolOptions& options, pdf::PDFDocument& document); }; /// This class stores information about all applications available. 
Application diff --git a/PdfTool/pdftoolxml.cpp b/PdfTool/pdftoolxml.cpp new file mode 100644 index 0000000..0b92205 --- /dev/null +++ b/PdfTool/pdftoolxml.cpp @@ -0,0 +1,212 @@ +// Copyright (C) 2020 Jakub Melka +// +// This file is part of PdfForQt. +// +// PdfForQt is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// PdfForQt is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with PDFForQt. If not, see <https://www.gnu.org/licenses/>. + +#include "pdftoolxml.h" +#include "pdfvisitor.h" +#include "pdfencoding.h" + +#include <QXmlStreamWriter> + +namespace pdftool +{ + +static PDFToolXmlApplication s_xmlApplication; + +class PDFXmlExportVisitor : public pdf::PDFAbstractVisitor +{ +public: + explicit PDFXmlExportVisitor(QXmlStreamWriter* writer) : + m_writer(writer) + { + + } + + virtual void visitNull() override; + virtual void visitBool(bool value) override; + virtual void visitInt(pdf::PDFInteger value) override; + virtual void visitReal(pdf::PDFReal value) override; + virtual void visitString(pdf::PDFStringRef string) override; + virtual void visitName(pdf::PDFStringRef name) override; + virtual void visitArray(const pdf::PDFArray* array) override; + virtual void visitDictionary(const pdf::PDFDictionary* dictionary) override; + virtual void visitStream(const pdf::PDFStream* stream) override; + virtual void visitReference(const pdf::PDFObjectReference reference) override; + +private: + void writeTextOrBinary(const QByteArray& stream, QString name); + + QXmlStreamWriter* m_writer; +}; + +void PDFXmlExportVisitor::visitNull() +{ + 
m_writer->writeEmptyElement("null"); +} + +void PDFXmlExportVisitor::visitBool(bool value) +{ + m_writer->writeTextElement("bool", value ? "true" : "false"); +} + +void PDFXmlExportVisitor::visitInt(pdf::PDFInteger value) +{ + m_writer->writeTextElement("int", QString::number(value)); +} + +void PDFXmlExportVisitor::visitReal(pdf::PDFReal value) +{ + m_writer->writeTextElement("real", QString::number(value)); +} + +void PDFXmlExportVisitor::visitString(pdf::PDFStringRef string) +{ + writeTextOrBinary(string.getString(), "string"); +} + +void PDFXmlExportVisitor::visitName(pdf::PDFStringRef name) +{ + writeTextOrBinary(name.getString(), "name"); +} + +void PDFXmlExportVisitor::visitArray(const pdf::PDFArray* array) +{ + m_writer->writeStartElement("array"); + acceptArray(array); + m_writer->writeEndElement(); +} + +void PDFXmlExportVisitor::visitDictionary(const pdf::PDFDictionary* dictionary) +{ + m_writer->writeStartElement("dictionary"); + + const size_t count = dictionary->getCount(); + for (size_t i = 0; i < count; ++i) + { + m_writer->writeStartElement("entry"); + m_writer->writeAttribute("key", QString::fromLatin1(dictionary->getKey(i).getString())); + dictionary->getValue(i).accept(this); + m_writer->writeEndElement(); + } + + m_writer->writeEndElement(); +} + +void PDFXmlExportVisitor::visitStream(const pdf::PDFStream* stream) +{ + m_writer->writeStartElement("stream"); + visitDictionary(stream->getDictionary()); + // TODO: export stream data (only the stream dictionary is written so far) + m_writer->writeEndElement(); +} + +void PDFXmlExportVisitor::visitReference(const pdf::PDFObjectReference reference) +{ + m_writer->writeStartElement("reference"); + m_writer->writeAttribute("id", QString::number(reference.objectNumber)); + m_writer->writeAttribute("gen", QString::number(reference.generation)); + m_writer->writeCharacters(QString("%1 %2 R").arg(reference.objectNumber).arg(reference.generation)); + m_writer->writeEndElement(); +} + +void PDFXmlExportVisitor::writeTextOrBinary(const QByteArray& stream, QString 
name) +{ + bool isBinary = false; + m_writer->writeStartElement(name); + QString text = pdf::PDFEncoding::convertSmartFromByteStringToUnicode(stream, &isBinary); + m_writer->writeAttribute("form", isBinary ? "binary" : "text"); + m_writer->writeCharacters(text); + m_writer->writeEndElement(); +} + +QString PDFToolXmlApplication::getStandardString(PDFToolAbstractApplication::StandardString standardString) const +{ + switch (standardString) + { + case Command: + return "xml"; + + case Name: + return PDFToolTranslationContext::tr("XML export"); + + case Description: + return PDFToolTranslationContext::tr("Export internal data structure to xml."); + + default: + Q_ASSERT(false); + break; + } + + return QString(); +} + +int PDFToolXmlApplication::execute(const PDFToolOptions& options) +{ + pdf::PDFDocument document; + if (!readDocument(options, document)) + { + return ErrorDocumentReading; + } + + QString xmlString; + QXmlStreamWriter writer(&xmlString); + + if (options.xmlUseIndent) + { + writer.setAutoFormatting(true); + writer.setAutoFormattingIndent(2); + } + + QString comment = QString("Processed by %1 %2").arg(QCoreApplication::applicationName(), QCoreApplication::applicationVersion()); + writer.writeStartDocument(); + writer.writeComment(comment); + writer.writeStartElement("document"); + + PDFXmlExportVisitor visitor(&writer); + writer.writeStartElement("trailer"); + document.getStorage().getTrailerDictionary().accept(&visitor); + writer.writeEndElement(); + + pdf::PDFObjectStorage::PDFObjects entries = document.getStorage().getObjects(); + for (pdf::PDFInteger i = 0; i < pdf::PDFInteger(entries.size()); ++i) + { + const pdf::PDFObjectStorage::Entry& entry = entries[i]; + + if (entry.object.isNull()) + { + continue; + } + + writer.writeStartElement("pdfobject"); + writer.writeAttribute("id", QString::number(i)); + writer.writeAttribute("gen", QString::number(entry.generation)); + entry.object.accept(&visitor); + writer.writeEndElement(); + } + + 
writer.writeEndElement(); + writer.writeEndDocument(); + + PDFConsole::writeText(xmlString); + return ExitSuccess; +} + +PDFToolAbstractApplication::Options PDFToolXmlApplication::getOptionsFlags() const +{ + return OpenDocument | XmlExport; +} + +} // namespace pdftool diff --git a/PdfTool/pdftoolxml.h b/PdfTool/pdftoolxml.h new file mode 100644 index 0000000..de8b144 --- /dev/null +++ b/PdfTool/pdftoolxml.h @@ -0,0 +1,36 @@ +// Copyright (C) 2020 Jakub Melka +// +// This file is part of PdfForQt. +// +// PdfForQt is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// PdfForQt is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with PDFForQt. If not, see <https://www.gnu.org/licenses/>. + +#ifndef PDFTOOLXML_H +#define PDFTOOLXML_H + +#include "pdftoolabstractapplication.h" + +namespace pdftool +{ + +class PDFToolXmlApplication : public PDFToolAbstractApplication +{ +public: + virtual QString getStandardString(StandardString standardString) const override; + virtual int execute(const PDFToolOptions& options) override; + virtual Options getOptionsFlags() const override; +}; + +} // namespace pdftool + +#endif // PDFTOOLXML_H