commit 58ad59e4071b894b9ccfe0eefb01e1aa31baad81 Author: Jakub Melka Date: Sat Nov 17 16:48:30 2018 +0100 Initial commit diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..11e8067 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. 
+ + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. 
+ + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. 
(If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. 
+ + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. diff --git a/PdfForQt.pro b/PdfForQt.pro new file mode 100644 index 0000000..26b66a3 --- /dev/null +++ b/PdfForQt.pro @@ -0,0 +1,23 @@ +# Copyright (C) 2018 Jakub Melka +# +# This file is part of PdfForQt. +# +# PdfForQt is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# PdfForQt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with PDFForQt. If not, see . 
+ +TEMPLATE = subdirs + +SUBDIRS += \ + PdfForQtLib \ + UnitTests + diff --git a/PdfForQt.pro.user b/PdfForQt.pro.user new file mode 100644 index 0000000..f8e82b3 --- /dev/null +++ b/PdfForQt.pro.user @@ -0,0 +1,322 @@ + + + + + + EnvironmentId + {dbb5431a-2266-4222-a26c-4e68ccc98ac1} + + + ProjectExplorer.Project.ActiveTarget + 0 + + + ProjectExplorer.Project.EditorSettings + + true + false + true + + Cpp + + CppGlobal + + + + QmlJS + + QmlJSGlobal + + + 2 + UTF-8 + false + 4 + false + 80 + true + true + 1 + true + false + 0 + true + true + 0 + 8 + true + 1 + true + true + true + false + + + + ProjectExplorer.Project.PluginSettings + + + -fno-delayed-template-parsing + + true + + + + ProjectExplorer.Project.Target.0 + + Desktop Qt 5.11.2 MSVC2017 64bit + Desktop Qt 5.11.2 MSVC2017 64bit + qt.qt5.5112.win64_msvc2017_64_kit + 0 + 0 + 0 + + K:/Programming/PDF/PDF_For_Qt/bin_debug + + + true + qmake + + QtProjectManager.QMakeBuildStep + true + + false + false + false + + + true + Make + + Qt4ProjectManager.MakeStep + + false + + + + 2 + Build + + ProjectExplorer.BuildSteps.Build + + + + true + Make + + Qt4ProjectManager.MakeStep + + true + clean + + + 1 + Clean + + ProjectExplorer.BuildSteps.Clean + + 2 + false + + Ladění + Ladění + Qt4ProjectManager.Qt4BuildConfiguration + 2 + true + + + K:/Programming/PDF/PDF_For_Qt/bin_release + + + true + qmake + + QtProjectManager.QMakeBuildStep + false + + false + false + true + + + true + Make + + Qt4ProjectManager.MakeStep + + false + + + + 2 + Build + + ProjectExplorer.BuildSteps.Build + + + + true + Make + + Qt4ProjectManager.MakeStep + + true + clean + + + 1 + Clean + + ProjectExplorer.BuildSteps.Clean + + 2 + false + + Vydání + Vydání + Qt4ProjectManager.Qt4BuildConfiguration + 0 + true + + + K:/Programming/PDF/PDF_For_Qt/bin_profile + + + true + qmake + + QtProjectManager.QMakeBuildStep + true + + false + true + true + + + true + Make + + Qt4ProjectManager.MakeStep + + false + + + + 2 + Build + + 
ProjectExplorer.BuildSteps.Build + + + + true + Make + + Qt4ProjectManager.MakeStep + + true + clean + + + 1 + Clean + + ProjectExplorer.BuildSteps.Clean + + 2 + false + + Profile + Profile + Qt4ProjectManager.Qt4BuildConfiguration + 0 + true + + 3 + + + 0 + Nasazení + + ProjectExplorer.BuildSteps.Deploy + + 1 + Deploy Configuration + + ProjectExplorer.DefaultDeployConfiguration + + 1 + + + false + false + 1000 + + true + + false + false + false + false + true + 0.01 + 10 + true + 1 + 25 + + 1 + true + false + true + valgrind + + 0 + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + 11 + 12 + 13 + 14 + + 2 + + UnitTests + + Qt4ProjectManager.Qt4RunConfiguration:K:/Programming/PDF/PDF_For_Qt/PdfForQt/UnitTests/UnitTests.pro + true + + UnitTests/UnitTests.pro + + + 3768 + false + true + false + false + true + + 1 + + + + ProjectExplorer.Project.TargetCount + 1 + + + ProjectExplorer.Project.Updater.FileVersion + 18 + + + Version + 18 + + diff --git a/PdfForQtLib/PdfForQtLib.pro b/PdfForQtLib/PdfForQtLib.pro new file mode 100644 index 0000000..53ab146 --- /dev/null +++ b/PdfForQtLib/PdfForQtLib.pro @@ -0,0 +1,57 @@ +# Copyright (C) 2018 Jakub Melka +# +# This file is part of PdfForQt. +# +# PdfForQt is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# PdfForQt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with PDFForQt. If not, see . 
+ +QT -= gui + +TARGET = PdfForQtLib +TEMPLATE = lib + +DEFINES += PDFFORQTLIB_LIBRARY + +# The following define makes your compiler emit warnings if you use +# any feature of Qt which has been marked as deprecated (the exact warnings +# depend on your compiler). Please consult the documentation of the +# deprecated API in order to know how to port your code away from it. +DEFINES += QT_DEPRECATED_WARNINGS + +# You can also make your code fail to compile if you use deprecated APIs. +# In order to do so, uncomment the following line. +# You can also select to disable deprecated APIs only up to a certain version of Qt. +#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000 # disables all the APIs deprecated before Qt 6.0.0 + +DESTDIR = $$OUT_PWD/.. + +SOURCES += \ + sources/pdfobject.cpp \ + sources/pdfparser.cpp \ + sources/pdfdocument.cpp \ + sources/pdfdocumentreader.cpp + +HEADERS += \ + sources/pdfobject.h \ + sources/pdfparser.h \ + sources/pdfglobal.h \ + sources/pdfconstants.h \ + sources/pdfdocument.h \ + sources/pdfdocumentreader.h + +unix { + target.path = /usr/lib + INSTALLS += target +} + +QMAKE_CXXFLAGS += /std:c++latest diff --git a/PdfForQtLib/sources/pdfconstants.h b/PdfForQtLib/sources/pdfconstants.h new file mode 100644 index 0000000..8536659 --- /dev/null +++ b/PdfForQtLib/sources/pdfconstants.h @@ -0,0 +1,47 @@ +// Copyright (C) 2018 Jakub Melka +// +// This file is part of PdfForQt. +// +// PdfForQt is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// PdfForQt is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public License +// along with PDFForQt. If not, see . + + +#ifndef PDFCONSTANTS_H +#define PDFCONSTANTS_H + +namespace pdf +{ + +// Structure file constants +static constexpr const char* PDF_END_OF_FILE_MARK = "%%EOF"; +static constexpr const char* PDF_START_OF_XREF_MARK = "startxref"; + +static constexpr const char* PDF_FILE_HEADER_V1 = "%PDF-?.?"; +static constexpr const char* PDF_FILE_HEADER_V2 = "%!PS-Adobe-?.? PDF-?.?"; +static constexpr const char* PDF_FILE_HEADER_REGEXP = "%PDF-([[:digit:]]\\.[[:digit:]])|%!PS-Adobe-[[:digit:]]\\.[[:digit:]] PDF-([[:digit:]]\\.[[:digit:]])"; + +static constexpr const int PDF_HEADER_SCAN_LIMIT = 1024; +static constexpr const int PDF_FOOTER_SCAN_LIMIT = 1024; + +// Stream dictionary constants - entries common to all stream dictionaries +static constexpr const char* PDF_STREAM_DICT_LENGTH = "Length"; +static constexpr const char* PDF_STREAM_DICT_FILTER = "Filter"; +static constexpr const char* PDF_STREAM_DICT_DECODE_PARMS = "DecodeParms"; +static constexpr const char* PDF_STREAM_DICT_FILE_SPECIFICATION = "F"; +static constexpr const char* PDF_STREAM_DICT_FILE_FILTER = "FFilter"; +static constexpr const char* PDF_STREAM_DICT_FDECODE_PARMS = "FDecodeParms"; +static constexpr const char* PDF_STREAM_DICT_DECODED_LENGTH = "DL"; + +} // namespace pdf + +#endif // PDFCONSTANTS_H diff --git a/PdfForQtLib/sources/pdfdocument.cpp b/PdfForQtLib/sources/pdfdocument.cpp new file mode 100644 index 0000000..f300a3c --- /dev/null +++ b/PdfForQtLib/sources/pdfdocument.cpp @@ -0,0 +1,20 @@ +// Copyright (C) 2018 Jakub Melka +// +// This file is part of PdfForQt. +// +// PdfForQt is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. 
+// +// PdfForQt is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with PDFForQt. If not, see . + + +#include "pdfdocument.h" + diff --git a/PdfForQtLib/sources/pdfdocument.h b/PdfForQtLib/sources/pdfdocument.h new file mode 100644 index 0000000..029cc3c --- /dev/null +++ b/PdfForQtLib/sources/pdfdocument.h @@ -0,0 +1,35 @@ +// Copyright (C) 2018 Jakub Melka +// +// This file is part of PdfForQt. +// +// PdfForQt is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// PdfForQt is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with PDFForQt. If not, see . + + +#ifndef PDFDOCUMENT_H +#define PDFDOCUMENT_H + +#include "pdfglobal.h" + +namespace pdf +{ + +class PDFDocument +{ +public: + explicit PDFDocument() = default; +}; + +} // namespace pdf + +#endif // PDFDOCUMENT_H diff --git a/PdfForQtLib/sources/pdfdocumentreader.cpp b/PdfForQtLib/sources/pdfdocumentreader.cpp new file mode 100644 index 0000000..475a9da --- /dev/null +++ b/PdfForQtLib/sources/pdfdocumentreader.cpp @@ -0,0 +1,207 @@ +// Copyright (C) 2018 Jakub Melka +// +// This file is part of PdfForQt. 
+// +// PdfForQt is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// PdfForQt is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with PDFForQt. If not, see . + + +#include "pdfdocumentreader.h" +#include "pdfparser.h" +#include "pdfconstants.h" + +#include + +#include +#include +#include + +namespace pdf +{ + +PDFDocumentReader::PDFDocumentReader() : + m_successfull(true) +{ + +} + +PDFDocument PDFDocumentReader::readFromFile(const QString& fileName) +{ + QFile file(fileName); + + reset(); + + if (file.exists()) + { + if (file.open(QFile::ReadOnly)) + { + PDFDocument document = readFromDevice(&file); + file.close(); + return document; + } + else + { + m_successfull = false; + m_errorMessage = tr("File '%1' cannot be opened for reading. %2").arg(fileName, file.errorString()); // %1 = file name, %2 = reason reported by QFile + } + } + else + { + m_successfull = false; + m_errorMessage = tr("File '%1' doesn't exist.").arg(fileName); + } + + return PDFDocument(); +} + +PDFDocument PDFDocumentReader::readFromDevice(QIODevice* device) +{ + reset(); + + if (device->isOpen()) + { + if (device->isReadable()) + { + // Do not close the device, it was not opened by us. 
+ return readFromBuffer(device->readAll()); + } + else + { + m_successfull = false; + m_errorMessage = tr("Device is not opened for reading."); + } + } + else if (device->open(QIODevice::ReadOnly)) + { + QByteArray byteArray = device->readAll(); + device->close(); + return readFromBuffer(byteArray); + } + else + { + m_successfull = false; + m_errorMessage = tr("Can't open device for reading."); + } + + return PDFDocument(); +} + +PDFDocument PDFDocumentReader::readFromBuffer(const QByteArray& buffer) +{ + try + { + // FOOTER CHECKING + // 1) Check, if EOF marking is present + // 2) Find start of cross reference table + if (findFromEnd(PDF_END_OF_FILE_MARK, buffer, PDF_FOOTER_SCAN_LIMIT) == FIND_NOT_FOUND_RESULT) + { + throw PDFParserException(tr("End of file marking was not found.")); + } + + const int startXRefPosition = findFromEnd(PDF_START_OF_XREF_MARK, buffer, PDF_FOOTER_SCAN_LIMIT); + if (startXRefPosition == FIND_NOT_FOUND_RESULT) + { + throw PDFParserException(tr("Start of object reference table not found.")); + } + + // HEADER CHECKING + // 1) Check if header is present + // 2) Scan header version + + // According to PDF Reference 1.7, Appendix H, file header can have two formats: + // - %PDF-x.x + // - %!PS-Adobe-y.y PDF-x.x + // We will search for both of these formats. 
+ + std::regex headerRegExp(PDF_FILE_HEADER_REGEXP); + std::cmatch headerMatch; + + auto itBegin = buffer.cbegin(); + auto itEnd = std::next(buffer.cbegin(), qMin(buffer.size(), PDF_HEADER_SCAN_LIMIT)); + + if (std::regex_search(itBegin, itEnd, headerMatch, headerRegExp)) + { + // Size depends on regular expression, not on the text (if regular expression is matched): whole match + one "x.x" version group per header format + Q_ASSERT(headerMatch.size() == 3); + Q_ASSERT(headerMatch[1].matched != headerMatch[2].matched); + + for (int i : { 1, 2 }) + { + if (headerMatch[i].matched) + { + Q_ASSERT(std::distance(headerMatch[i].first, headerMatch[i].second) == 3); + m_version = PDFVersion(*headerMatch[i].first - '0', *std::prev(headerMatch[i].second) - '0'); + break; + } + } + } + else + { + throw PDFParserException(tr("Header of PDF file was not found.")); + } + + // Check, if version is valid + if (!m_version.isValid()) + { + throw PDFParserException(tr("Version of the PDF file is not valid.")); + } + + + } + catch (PDFParserException parserException) + { + m_successfull = false; + m_errorMessage = parserException.getMessage(); + } + + return PDFDocument(); +} + +void PDFDocumentReader::reset() +{ + m_successfull = true; + m_errorMessage = QString(); + m_version = PDFVersion(); +} + +int PDFDocumentReader::findFromEnd(const char* what, const QByteArray& byteArray, int limit) +{ + if (byteArray.isEmpty()) + { + // Byte array is empty, no value found + return FIND_NOT_FOUND_RESULT; + } + + const int size = byteArray.size(); + const int adjustedLimit = qMin(byteArray.size(), limit); + const int whatLength = static_cast(std::strlen(what)); + + if (adjustedLimit < whatLength) + { + // Buffer is smaller than scan string + return FIND_NOT_FOUND_RESULT; + } + + auto itBegin = std::next(byteArray.cbegin(), size - adjustedLimit); + auto itEnd = byteArray.cend(); + auto it = std::find_end(itBegin, itEnd, what, std::next(what, whatLength)); + + if (it != 
byteArray.cend()) + { + return std::distance(byteArray.cbegin(), it); + } + + return FIND_NOT_FOUND_RESULT; +} + +} // namespace pdf diff --git a/PdfForQtLib/sources/pdfdocumentreader.h b/PdfForQtLib/sources/pdfdocumentreader.h new file mode 100644 index 0000000..f1978b1 --- /dev/null +++ b/PdfForQtLib/sources/pdfdocumentreader.h @@ -0,0 +1,86 @@ +// Copyright (C) 2018 Jakub Melka +// +// This file is part of PdfForQt. +// +// PdfForQt is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// PdfForQt is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with PDFForQt. If not, see . + + +#ifndef PDFDOCUMENTREADER_H +#define PDFDOCUMENTREADER_H + +#include "pdfglobal.h" +#include "pdfdocument.h" + +#include +#include + +namespace pdf +{ + +/// This class is a reader of PDF document from various devices (file, io device, +/// byte buffer). This class doesn't throw exceptions, to check errors, use +/// appropriate functions. +class PDFDocumentReader +{ + Q_DECLARE_TR_FUNCTIONS(pdf::PDFDocumentReader) + +public: + explicit PDFDocumentReader(); + + /// Reads a PDF document from the specified file. If file doesn't exist, + /// cannot be opened or contain invalid pdf, empty PDF file is returned. + /// No exception is thrown. + PDFDocument readFromFile(const QString& fileName); + + /// Reads a PDF document from the specified device. If device is not opened + /// for reading, then function tries it to open for reading. If it is opened, + /// but not for reading, empty PDF document is returned. 
This also occurs + /// when incorrect PDF is read. No exception is thrown. + PDFDocument readFromDevice(QIODevice* device); + + /// Reads a PDF document from the specified buffer (byte array). If incorrect + /// PDF is read, then empty PDF document is returned. No exception is thrown. + PDFDocument readFromBuffer(const QByteArray& buffer); + + /// Returns true, if document was successfully read from device + bool isSuccessfull() const { return m_successfull; } + +private: + static constexpr const int FIND_NOT_FOUND_RESULT = -1; + + /// Resets the internal state and prepares it for new reading cycle + void reset(); + + /// Find a last string in the byte array, scan only \p limit bytes. If string + /// is not found, then FIND_NOT_FOUND_RESULT is returned, if it is found, then + /// it position from the beginning of byte array is returned. + /// \param what String to be found + /// \param byteArray Byte array to be scanned from the end + /// \param limit Scan up to this value bytes from the end + /// \returns Position of string, or FIND_NOT_FOUND_RESULT + int findFromEnd(const char* what, const QByteArray& byteArray, int limit); + + /// This bool flag is set, if pdf document was successfully read from the device + bool m_successfull; + + /// In case if error occurs, it is stored here + QString m_errorMessage; + + /// Version of the scanned file + PDFVersion m_version; +}; + +} // namespace pdf + +#endif // PDFDOCUMENTREADER_H diff --git a/PdfForQtLib/sources/pdfglobal.h b/PdfForQtLib/sources/pdfglobal.h new file mode 100644 index 0000000..7934a55 --- /dev/null +++ b/PdfForQtLib/sources/pdfglobal.h @@ -0,0 +1,103 @@ +// Copyright (C) 2018 Jakub Melka +// +// This file is part of PdfForQt. +// +// PdfForQt is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. 
+// +// PdfForQt is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with PDFForQt. If not, see . + + +#ifndef PDFGLOBAL_H +#define PDFGLOBAL_H + +#include + +#include +#include + +#if defined(PDFFORQTLIB_LIBRARY) +# define PDFFORQTLIBSHARED_EXPORT Q_DECL_EXPORT +#else +# define PDFFORQTLIBSHARED_EXPORT Q_DECL_IMPORT +#endif + +namespace pdf +{ + +using PDFInteger = int64_t; +using PDFReal = double; + +// These constants define minimum/maximum integer and are defined in such a way, +// that even 100 times bigger integers are representable. + +constexpr PDFInteger PDF_INTEGER_MIN = std::numeric_limits::min() / 100; +constexpr PDFInteger PDF_INTEGER_MAX = std::numeric_limits::max() / 100; + +static constexpr bool isValidInteger(PDFInteger integer) +{ + return integer >= PDF_INTEGER_MIN && integer <= PDF_INTEGER_MAX; +} + +/// This structure represents a reference to the object - consisting of the +/// object number, and generation number. 
+struct PDFObjectReference +{ + constexpr inline PDFObjectReference() : + objectNumber(0), + generation(0) + { + + } + + constexpr inline PDFObjectReference(PDFInteger objectNumber, PDFInteger generation) : + objectNumber(objectNumber), + generation(generation) + { + + } + + PDFInteger objectNumber; + PDFInteger generation; + + constexpr bool operator==(const PDFObjectReference& other) const + { + return objectNumber == other.objectNumber && generation == other.generation; + } + + constexpr bool operator!=(const PDFObjectReference& other) const { return !(*this == other); } + + constexpr bool operator<(const PDFObjectReference& other) const + { + return std::tie(objectNumber, generation) < std::tie(other.objectNumber, other.generation); + } +}; + +/// Represents version identification +struct PDFVersion +{ + constexpr explicit PDFVersion() = default; + constexpr explicit PDFVersion(uint16_t major, uint16_t minor) : + major(major), + minor(minor) + { + + } + + uint16_t major = 0; + uint16_t minor = 0; + + bool isValid() const { return major > 0; } +}; + +} // namespace pdf + +#endif // PDFGLOBAL_H diff --git a/PdfForQtLib/sources/pdfobject.cpp b/PdfForQtLib/sources/pdfobject.cpp new file mode 100644 index 0000000..3be9fc9 --- /dev/null +++ b/PdfForQtLib/sources/pdfobject.cpp @@ -0,0 +1,137 @@ +// Copyright (C) 2018 Jakub Melka +// +// This file is part of PdfForQt. +// +// PdfForQt is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// PdfForQt is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public License +// along with PDFForQt. If not, see . + + +#include "pdfobject.h" + +namespace pdf +{ + +QByteArray PDFObject::getString() const +{ + const PDFObjectContentPointer& objectContent = std::get(m_data); + + Q_ASSERT(dynamic_cast(objectContent.get())); + const PDFString* string = static_cast(objectContent.get()); + return string->getString(); +} + +bool PDFObject::operator==(const PDFObject &other) const +{ + if (m_type == other.m_type) + { + Q_ASSERT(std::holds_alternative(m_data) == std::holds_alternative(other.m_data)); + + // If we have content object defined, then use its equal operator, + // otherwise use default compare operator. The only problem with + // default compare operator can occur, when we have a double + // with NaN value. Then operator == can return false, even if + // values are "equal" (NaN == NaN returns false) + if (std::holds_alternative(m_data)) + { + Q_ASSERT(std::get(m_data)); + return std::get(m_data)->equals(std::get(other.m_data).get()); + } + + return m_data == other.m_data; + } + + return false; +} + +bool PDFString::equals(const PDFObjectContent* other) const +{ + Q_ASSERT(dynamic_cast(other)); + const PDFString* otherString = static_cast(other); + return m_string == otherString->m_string; +} + +QByteArray PDFString::getString() const +{ + return m_string; +} + +void PDFString::setString(const QByteArray& string) +{ + m_string = string; +} + +bool PDFArray::equals(const PDFObjectContent* other) const +{ + Q_ASSERT(dynamic_cast(other)); + const PDFArray* otherArray = static_cast(other); + return m_objects == otherArray->m_objects; +} + +void PDFArray::appendItem(PDFObject object) +{ + m_objects.push_back(std::move(object)); +} + +bool PDFDictionary::equals(const PDFObjectContent* other) const +{ + Q_ASSERT(dynamic_cast(other)); + const PDFDictionary* otherStream = static_cast(other); + return m_dictionary == otherStream->m_dictionary; +} + +const PDFObject& 
PDFDictionary::get(const QByteArray& key) const +{ + auto it = find(key); + if (it != m_dictionary.cend()) + { + return it->second; + } + else + { + static PDFObject dummy; + return dummy; + } +} + +const PDFObject& PDFDictionary::get(const char* key) const +{ + auto it = find(key); + if (it != m_dictionary.cend()) + { + return it->second; + } + else + { + static PDFObject dummy; + return dummy; + } +} + +std::vector::const_iterator PDFDictionary::find(const QByteArray& key) const +{ + return std::find_if(m_dictionary.cbegin(), m_dictionary.cend(), [&key](const DictionaryEntry& entry) { return entry.first == key; }); +} + +std::vector::const_iterator PDFDictionary::find(const char* key) const +{ + return std::find_if(m_dictionary.cbegin(), m_dictionary.cend(), [&key](const DictionaryEntry& entry) { return entry.first == key; }); +} + +bool PDFStream::equals(const PDFObjectContent* other) const +{ + Q_ASSERT(dynamic_cast(other)); + const PDFStream* otherStream = static_cast(other); + return m_dictionary.equals(&otherStream->m_dictionary) && m_content == otherStream->m_content; +} + +} // namespace pdf diff --git a/PdfForQtLib/sources/pdfobject.h b/PdfForQtLib/sources/pdfobject.h new file mode 100644 index 0000000..b254084 --- /dev/null +++ b/PdfForQtLib/sources/pdfobject.h @@ -0,0 +1,274 @@ +// Copyright (C) 2018 Jakub Melka +// +// This file is part of PdfForQt. +// +// PdfForQt is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// PdfForQt is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public License +// along with PDFForQt. If not, see . + + +#ifndef PDFOBJECT_H +#define PDFOBJECT_H + +#include "pdfglobal.h" + +#include + +#include +#include +#include + +namespace pdf +{ + +/// This class represents a content of the PDF object. It can be +/// array of objects, dictionary, content stream data, or string data. +class PDFObjectContent +{ +public: + constexpr PDFObjectContent() = default; + virtual ~PDFObjectContent() = default; + + /// Equals operator. Returns true, if content of this object is + /// equal to the content of the other object. + virtual bool equals(const PDFObjectContent* other) const = 0; +}; + +class PDFFORQTLIBSHARED_EXPORT PDFObject +{ +public: + enum class Type + { + // Simple PDF objects + Null, + Bool, + Int, + Real, + String, + Name, + + // Complex PDF objects + Array, + Dictionary, + Stream, + Reference + }; + + typedef std::shared_ptr PDFObjectContentPointer; + + // Default constructor should be constexpr + constexpr inline PDFObject() : + m_type(Type::Null), + m_data() + { + + } + + // Default destructor should be OK + inline ~PDFObject() = default; + + // Enforce default copy constructor and default move constructor + constexpr inline PDFObject(const PDFObject&) = default; + constexpr inline PDFObject(PDFObject&&) = default; + + // Enforce default copy assignment operator and move assignment operator + constexpr inline PDFObject& operator=(const PDFObject&) = default; + constexpr inline PDFObject& operator=(PDFObject&&) = default; + + // Test operators + inline bool isNull() const { return m_type == Type::Null; } + inline bool isBool() const { return m_type == Type::Bool; } + inline bool isInt() const { return m_type == Type::Int; } + inline bool isReal() const { return m_type == Type::Real; } + inline bool isString() const { return m_type == Type::String; } + inline bool isName() const { return m_type == Type::Name; } + inline bool isArray() const { return 
m_type == Type::Array; } + inline bool isDictionary() const { return m_type == Type::Dictionary; } + inline bool isStream() const { return m_type == Type::Stream; } + inline bool isReference() const { return m_type == Type::Reference; } + + inline PDFInteger getInteger() const { return std::get(m_data); } + QByteArray getString() const; + + bool operator==(const PDFObject& other) const; + bool operator!=(const PDFObject& other) const { return !(*this == other); } + + /// Creates a null object + static inline PDFObject createNull() { return PDFObject(); } + + /// Creates a boolean object + static inline PDFObject createBool(bool value) { return PDFObject(Type::Bool, value); } + + /// Creates an integer object + static inline PDFObject createInteger(PDFInteger value) { return PDFObject(Type::Int, value); } + + /// Creates an object with real number + static inline PDFObject createReal(PDFReal value) { return PDFObject(Type::Real, value); } + + /// Creates a name object + static inline PDFObject createName(PDFObjectContentPointer&& value) { return PDFObject(Type::Name, std::move(value)); } + + /// Creates a reference object + static inline PDFObject createReference(const PDFObjectReference& reference) { return PDFObject(Type::Reference, reference); } + + /// Creates a string object + static inline PDFObject createString(PDFObjectContentPointer&& value) { return PDFObject(Type::String, std::move(value)); } + + /// Creates an array object + static inline PDFObject createArray(PDFObjectContentPointer&& value) { return PDFObject(Type::Array, std::move(value)); } + + /// Creates a dictionary object + static inline PDFObject createDictionary(PDFObjectContentPointer&& value) { return PDFObject(Type::Dictionary, std::move(value)); } + + /// Creates a stream object + static inline PDFObject createStream(PDFObjectContentPointer&& value) { return PDFObject(Type::Stream, std::move(value)); } + +private: + template + constexpr inline PDFObject(Type type, T&& value) : + 
m_type(type), + m_data(std::forward(value)) + { + + } + + Type m_type; + std::variant m_data; +}; + +/// Represents raw string in the PDF file. No conversions are performed, this is +/// reason, that we do not use QString, but QByteArray instead. QString is +/// encoded int UTF-8. +class PDFString : public PDFObjectContent +{ +public: + inline explicit PDFString() = default; + inline explicit PDFString(QByteArray&& value) : + m_string(std::move(value)) + { + + } + + virtual ~PDFString() override = default; + + virtual bool equals(const PDFObjectContent* other) const override; + + QByteArray getString() const; + void setString(const QByteArray &getString); + +private: + QByteArray m_string; +}; + +/// Represents an array of objects in the PDF file. +class PDFArray : public PDFObjectContent +{ +public: + inline constexpr PDFArray() = default; + virtual ~PDFArray() override = default; + + virtual bool equals(const PDFObjectContent* other) const override; + + /// Returns item at the specified index. If index is invalid, + /// then it throws an exception. + const PDFObject& getItem(size_t index) const { return m_objects.at(index); } + + /// Returns size of the array (number of elements) + size_t getCount() const { return m_objects.size(); } + + /// Appends object to the end of object list + void appendItem(PDFObject object); + +private: + std::vector m_objects; +}; + +/// Represents a dictionary of objects in the PDF file. Dictionary is +/// an array of pairs key-value, where key is name object and value is any +/// PDF object. For this reason, we use QByteArray for key. We do not use +/// map, because dictionaries are usually small. +class PDFDictionary : public PDFObjectContent +{ +private: + using DictionaryEntry = std::pair; + +public: + inline constexpr PDFDictionary() = default; + virtual ~PDFDictionary() override = default; + + virtual bool equals(const PDFObjectContent* other) const override; + + /// Returns object for the key. 
If key is not found in the dictionary, + /// then valid reference to the null object is returned. + /// \param key Key + const PDFObject& get(const QByteArray& key) const; + + /// Returns object for the key. If key is not found in the dictionary, + /// then valid reference to the null object is returned. + /// \param key Key + const PDFObject& get(const char* key) const; + + /// Returns true, if dictionary contains a particular key + /// \param key Key to be found in the dictionary + bool hasKey(const QByteArray& key) const { return find(key) != m_dictionary.cend(); } + + /// Returns true, if dictionary contains a particular key + /// \param key Key to be found in the dictionary + bool hasKey(const char* key) const { return find(key) != m_dictionary.cend(); } + + /// Adds a new entry to the dictionary. + /// \param key Key + /// \param value Value + void addEntry(QByteArray&& key, PDFObject&& value) { m_dictionary.emplace_back(std::move(key), std::move(value)); } + +private: + /// Finds an item in the dictionary array, if the item is not in the dictionary, + /// then end iterator is returned. + /// \param key Key to be found + std::vector::const_iterator find(const QByteArray& key) const; + + /// Finds an item in the dictionary array, if the item is not in the dictionary, + /// then end iterator is returned. + /// \param key Key to be found + std::vector::const_iterator find(const char* key) const; + + std::vector m_dictionary; +}; + +/// Represents a stream object in the PDF file. Stream consists of dictionary +/// and stream content - byte array. 
+class PDFStream : public PDFObjectContent +{ +public: + inline explicit constexpr PDFStream() = default; + inline explicit PDFStream(PDFDictionary&& dictionary, QByteArray&& content) : + m_dictionary(std::move(dictionary)), + m_content(std::move(content)) + { + + } + + virtual ~PDFStream() override = default; + + virtual bool equals(const PDFObjectContent* other) const override; + + /// Returns dictionary for this content stream + const PDFDictionary* getDictionary() const { return &m_dictionary; } + +private: + PDFDictionary m_dictionary; + QByteArray m_content; +}; + +} // namespace pdf + +#endif // PDFOBJECT_H diff --git a/PdfForQtLib/sources/pdfparser.cpp b/PdfForQtLib/sources/pdfparser.cpp new file mode 100644 index 0000000..cface04 --- /dev/null +++ b/PdfForQtLib/sources/pdfparser.cpp @@ -0,0 +1,861 @@ +// Copyright (C) 2018 Jakub Melka +// +// This file is part of PdfForQt. +// +// PdfForQt is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// PdfForQt is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with PDFForQt. If not, see . 
+ + +#include "pdfparser.h" +#include "pdfconstants.h" + +#include + +#include +#include + +namespace pdf +{ + +PDFLexicalAnalyzer::PDFLexicalAnalyzer(const char* begin, const char* end) : + m_begin(begin), + m_current(begin), + m_end(end) +{ + +} + +PDFLexicalAnalyzer::Token PDFLexicalAnalyzer::fetch() +{ + // Skip whitespace/comments at first + skipWhitespaceAndComments(); + + // If we are at end of token, then return immediately + if (isAtEnd()) + { + return Token(TokenType::EndOfFile); + } + + switch (lookChar()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case '+': + case '-': + case '.': + { + // Scan integer or real number. If integer overflows, then it is converted to the real number. If + // real number overflow, then error is reported. This behaviour is according to the PDF 1.7 specification, + // chapter 3.2.2. + + // First, treat special characters + bool positive = fetchChar('+'); + bool negative = fetchChar('-'); + bool dot = fetchChar('.'); + bool treatAsReal = dot; + bool atLeastOneDigit = false; + + if (isAtEnd()) + { + error(tr("Expected a number, but end of stream reached.")); + } + + PDFInteger integer = 0; + PDFReal real = 0.0; + PDFReal scale = 0.1; + + // Now, we can only have digits and a single dot + while (!isAtEnd()) + { + if (!dot && fetchChar('.')) + { + // Entering real mode + dot = true; + treatAsReal = true; + real = integer; + } + else if (std::isdigit(lookChar())) + { + atLeastOneDigit = true; + PDFInteger digit = lookChar() - '0'; + ++m_current; + + if (!treatAsReal) + { + // Treat value as integer + integer = integer * 10 + digit; + + // Check, if integer has not overflown, if yes, treat him as real + // according to the PDF 1.7 specification. 
+ if (!isValidInteger(integer)) + { + treatAsReal = true; + real = integer; + } + } + else + { + // Treat value as real + if (!dot) + { + real = real * 10.0 + digit; + } + else + { + real = real + scale * digit; + scale *= 0.1; + } + } + } + else if (isWhitespace(lookChar())) + { + // Whitespace appeared - whitespaces delimits tokens - break + break; + } + else + { + // Another character other than dot and digit appeared - this is an error + error(tr("Invalid format of number. Character '%1' appeared.").arg(lookChar())); + } + } + + // Now, we have scanned whole token number, check for errors. + if (positive && negative) + { + error(tr("Both '+' and '-' appeared in number. Invalid format of number.")); + } + + if (!atLeastOneDigit) + { + error(tr("Bad format of number - no digits appeared.")); + } + + // Check for real overflow + if (treatAsReal && !std::isfinite(real)) + { + error(tr("Real number overflow.")); + } + + if (negative) + { + integer = -integer; + real = -real; + } + + return !treatAsReal ? Token(TokenType::Integer, integer) : Token(TokenType::Real, real); + } + + case CHAR_LEFT_BRACKET: + { + // String '(', sequence of literal characters enclosed in "()", see PDF 1.7 Reference, + // chapter 3.2.3. Note: literal string can have properly balanced brackets inside. + + int parenthesisBalance = 1; + QByteArray string; + string.reserve(STRING_BUFFER_RESERVE); + + // Skip first character + fetchChar(); + + while (true) + { + // Scan string, see, what next char is. + const char character = fetchChar(); + switch (character) + { + case CHAR_LEFT_BRACKET: + { + ++parenthesisBalance; + string.push_back(character); + break; + } + case CHAR_RIGHT_BRACKET: + { + if (--parenthesisBalance == 0) + { + // We are done. + return Token(TokenType::String, string); + } + else + { + string.push_back(character); + } + break; + } + + case CHAR_BACKSLASH: + { + // Escape sequence. Check, what it means. 
Possible values are in PDF 1.7 Reference, + // chapter 3.2.3, Table 3.2 - Escape Sequence in Literal Strings + const char escaped = fetchChar(); + switch (escaped) + { + case 'n': + { + string += '\n'; + break; + } + case 'r': + { + string += '\r'; + break; + } + case 't': + { + string += '\t'; + break; + } + case 'b': + { + string += '\b'; + break; + } + case 'f': + { + string += '\f'; + break; + } + case '\\': + case '(': + case ')': + { + string += escaped; + break; + } + + case '\n': + { + // Nothing done here, EOL is not part of the string, because it was escaped + break; + } + + case '\r': + { + // Skip EOL + fetchChar('\n'); + break; + } + + default: + { + // Try to scan octal value. Octal number can have 3 digits in this case. + // According to specification, overflow value can be truncated. + int octalNumber = -1; + if (fetchOctalNumber(3, &octalNumber)) + { + string += static_cast(octalNumber); + } + else + { + error(tr("Expected octal number with 1-3 digits.")); + } + + break; + } + } + + break; + } + + default: + { + // Normal character + string.push_back(character); + break; + } + } + } + + // This code should be unreachable. Either normal string is scanned - then it is returned + // in the while cycle above, or exception is thrown. + Q_ASSERT(false); + return Token(TokenType::EndOfFile); + } + + case CHAR_SLASH: + { + // Name object. According to the PDF Reference 1.7, chapter 3.2.4 name object can have zero length, + // and can contain #XX characters, where XX is hexadecimal number. + + fetchChar(); + + QByteArray name; + name.reserve(NAME_BUFFER_RESERVE); + + while (!isAtEnd()) + { + if (fetchChar(CHAR_MARK)) + { + const char hexHighCharacter = fetchChar(); + const char hexLowCharacter = fetchChar(); + + if (isHexCharacter(hexHighCharacter) && isHexCharacter(hexLowCharacter)) + { + name += QByteArray::fromHex(QByteArray::fromRawData(m_current - 2, 2)); + } + else + { + // Throw an error - hexadecimal number is expected. 
+ error(tr("Hexadecimal number must follow character '#' in the name.")); + } + + continue; + } + + // Now, we have other character, than '#', if it is a regular character, + // then add it to the name, otherwise end scanning. + const char character = lookChar(); + + if (isRegular(character)) + { + name += character; + ++m_current; + } + else + { + // Matched non-regular character - end of name. + break; + } + } + + return Token(TokenType::Name, std::move(name)); + } + + case CHAR_ARRAY_START: + { + ++m_current; + return Token(TokenType::ArrayStart); + } + + case CHAR_ARRAY_END: + { + ++m_current; + return Token(TokenType::ArrayEnd); + } + + case CHAR_LEFT_ANGLE: + { + ++m_current; + + // Check if it is dictionary start + if (fetchChar(CHAR_LEFT_ANGLE)) + { + return Token(TokenType::DictionaryStart); + } + else + { + // Reserve two times normal size, because in hexadecimal string, each character + // is represented as a pair of hexadecimal numbers. + QByteArray hexadecimalString; + hexadecimalString.reserve(STRING_BUFFER_RESERVE * 2); + + // Scan hexadecimal string + while (!isAtEnd()) + { + const char character = fetchChar(); + if (isHexCharacter(character)) + { + hexadecimalString += character; + } + else if (character == CHAR_RIGHT_ANGLE) + { + // End of string mark. According to the specification, string can contain odd number + // of hexadecimal digits, in this case, zero is appended to the string. + if (hexadecimalString.size() % 2 == 1) + { + hexadecimalString += '0'; + } + + QByteArray decodedString = QByteArray::fromHex(hexadecimalString); + return Token(TokenType::String, std::move(decodedString)); + } + else + { + // This is unexpected. Invalid character in hexadecimal string. 
+ error(tr("Invalid character in hexadecimal string.")); + } + } + + error(tr("Unexpected end of stream reached while scanning hexadecimal string.")); + } + break; + } + + case CHAR_RIGHT_ANGLE: + { + // This must be a mark of dictionary end, because in other way, we should reach end of + // string in the code above. + ++m_current; + + if (fetchChar(CHAR_RIGHT_ANGLE)) + { + return Token(TokenType::DictionaryEnd); + } + + error(tr("Invalid character '%1'").arg(CHAR_RIGHT_ANGLE)); + break; + } + + default: + { + // Now, we have skipped whitespaces. So actual character must be either regular, or it is special. + // We have treated all special characters above. For this reason, if we match special character, + // then we report an error. + Q_ASSERT(!isWhitespace(lookChar())); + + if (isRegular(lookChar())) + { + // It should be sequence of regular characters - command, true, false, null... + QByteArray command; + command.reserve(COMMAND_BUFFER_RESERVE); + + while (!isAtEnd() && isRegular(lookChar())) + { + command += fetchChar(); + } + + if (command == BOOL_OBJECT_TRUE_STRING) + { + return Token(TokenType::Boolean, true); + } + else if (command == BOOL_OBJECT_FALSE_STRING) + { + return Token(TokenType::Boolean, false); + } + else if (command == NULL_OBJECT_STRING) + { + return Token(TokenType::Null); + } + else + { + return Token(TokenType::Command, std::move(command)); + } + } + else + { + error(tr("Unexpected character '%1' in the stream.").arg(lookChar())); + } + break; + } + } + + return Token(TokenType::EndOfFile); +} + +void PDFLexicalAnalyzer::skipWhitespaceAndComments() +{ + bool isComment = false; + + while (m_current != m_end) + { + if (isComment) + { + // Comment ends at end of line + if (*m_current == CHAR_CARRIAGE_RETURN || *m_current == CHAR_LINE_FEED) + { + isComment = false; + } + + // Commented character - step to the next character + ++m_current; + } + else if (*m_current == CHAR_PERCENT) + { + isComment = true; + ++m_current; + } + else if 
(isWhitespace(*m_current)) + { + ++m_current; + } + else + { + // Not a whitespace and not in comment + break; + } + } +} + +void PDFLexicalAnalyzer::skipStreamStart() +{ + // According to the PDF Reference 1.7, chapter 3.2.7, after the 'stream' keyword, + // either carriage return + line feed, or just line feed can appear. Eat them. + fetchChar(CHAR_CARRIAGE_RETURN); + fetchChar(CHAR_LINE_FEED); +} + +QByteArray PDFLexicalAnalyzer::fetchByteArray(PDFInteger length) +{ + Q_ASSERT(length >= 0); + + if (std::distance(m_current, m_end) < length) + { + error(tr("Can't read %1 bytes from the input stream. Input stream end reached.").arg(length)); + } + + QByteArray result(m_current, length); + std::advance(m_current, length); + return result; +} + +bool PDFLexicalAnalyzer::fetchChar(const char character) +{ + if (!isAtEnd() && lookChar() == character) + { + ++m_current; + return true; + } + + return false; +} + +char PDFLexicalAnalyzer::fetchChar() +{ + if (!isAtEnd()) + { + return *m_current++; + } + + error(tr("Unexpected end of stream reached.")); + + return 0; +} + +bool PDFLexicalAnalyzer::fetchOctalNumber(int maxDigits, int* output) +{ + Q_ASSERT(output); + + *output = 0; + int fetchedNumbers = 0; + + while (!isAtEnd() && fetchedNumbers < maxDigits) + { + const char c = lookChar(); + if (c >= '0' && c <= '7') + { + // Valid octal characters + const int number = c - '0'; + *output = *output * 8 + number; + ++m_current; + ++fetchedNumbers; + } + else + { + // Non-octal character reached + break; + } + } + + return fetchedNumbers >= 1; +} + +constexpr bool PDFLexicalAnalyzer::isHexCharacter(const char character) +{ + return (character >= '0' && character <= '9') || (character >= 'A' && character <= 'F') || (character >= 'a' && character <= 'f'); +} + +void PDFLexicalAnalyzer::error(const QString& message) const +{ + throw PDFParserException(message); +} + +PDFObject PDFParsingContext::getObject(const PDFObject& object) const +{ + Q_ASSERT(false); + return 
PDFObject(); +} + +void PDFParsingContext::beginParsingObject(PDFObjectReference reference) +{ + if (m_activeParsedObjectSet.count(reference)) + { + throw PDFParserException(tr("Cyclical reference found while parsing object %1 %2.").arg(reference.objectNumber).arg(reference.generation)); + } + else + { + m_activeParsedObjectSet.insert(reference); + } +} + +void PDFParsingContext::endParsingObject(PDFObjectReference reference) +{ + Q_ASSERT(m_activeParsedObjectSet.count(reference)); + m_activeParsedObjectSet.erase(reference); +} + +PDFParser::PDFParser(const char* begin, const char* end, PDFParsingContext* context) : + m_context(context), + m_lexicalAnalyzer(begin, end) +{ + m_lookAhead1 = m_lexicalAnalyzer.fetch(); + m_lookAhead2 = m_lexicalAnalyzer.fetch(); +} + +PDFObject PDFParser::getObject() +{ + /* + * + // Complex PDF objects + , + Dictionary, + Stream, + */ + switch (m_lookAhead1.type) + { + case PDFLexicalAnalyzer::TokenType::Boolean: + { + Q_ASSERT(m_lookAhead1.data.type() == QVariant::Bool); + const bool value = m_lookAhead1.data.toBool(); + shift(); + return PDFObject::createBool(value); + } + + case PDFLexicalAnalyzer::TokenType::Integer: + { + Q_ASSERT(m_lookAhead1.data.type() == QVariant::LongLong); + const PDFInteger value = m_lookAhead1.data.toLongLong(); + shift(); + + // We must check, if we are reading reference. In this case, + // actual value is integer and next value is command "R". 
+ if (m_lookAhead1.type == PDFLexicalAnalyzer::TokenType::Integer && + m_lookAhead2.type == PDFLexicalAnalyzer::TokenType::Command && + m_lookAhead2.data.toByteArray() == PDF_REFERENCE_COMMAND) + { + Q_ASSERT(m_lookAhead1.data.type() == QVariant::LongLong); + const PDFInteger generation = m_lookAhead1.data.toLongLong(); + shift(); + shift(); + return PDFObject::createReference(PDFObjectReference(value, generation)); + } + else + { + // Just normal integer + return PDFObject::createInteger(value); + } + } + + case PDFLexicalAnalyzer::TokenType::Real: + { + Q_ASSERT(m_lookAhead1.data.type() == QVariant::Double); + const PDFReal value = m_lookAhead1.data.toDouble(); + shift(); + return PDFObject::createReal(value); + } + + case PDFLexicalAnalyzer::TokenType::String: + { + Q_ASSERT(m_lookAhead1.data.type() == QVariant::ByteArray); + QByteArray array = m_lookAhead1.data.toByteArray(); + array.shrink_to_fit(); + shift(); + return PDFObject::createString(std::make_shared(std::move(array))); + } + + case PDFLexicalAnalyzer::TokenType::Name: + { + Q_ASSERT(m_lookAhead1.data.type() == QVariant::ByteArray); + QByteArray array = m_lookAhead1.data.toByteArray(); + array.shrink_to_fit(); + shift(); + return PDFObject::createName(std::make_shared(std::move(array))); + } + + case PDFLexicalAnalyzer::TokenType::ArrayStart: + { + shift(); + + // Create shared pointer to the array (if the exception is thrown, array + // will be properly destroyed by the shared array destructor) + std::shared_ptr arraySharedPointer = std::make_shared(); + PDFArray* array = static_cast(arraySharedPointer.get()); + + while (m_lookAhead1.type != PDFLexicalAnalyzer::TokenType::EndOfFile && + m_lookAhead1.type != PDFLexicalAnalyzer::TokenType::ArrayEnd) + { + array->appendItem(getObject()); + } + + // Now, we have either end of file, or array end. If former appears, then + // it is an error - error should be reported. 
+ if (m_lookAhead1.type == PDFLexicalAnalyzer::TokenType::EndOfFile) + { + error(tr("Stream ended inside array.")); + } + else + { + shift(); + return PDFObject::createArray(std::move(arraySharedPointer)); + } + } + case PDFLexicalAnalyzer::TokenType::DictionaryStart: + { + shift(); + + // Start reading the dictionary. BEWARE! It can also be a stream. In this case, + // we must load also the stream content. + std::shared_ptr dictionarySharedPointer = std::make_shared(); + PDFDictionary* dictionary = dictionarySharedPointer.get(); + + // Now, scan key/value pairs + while (m_lookAhead1.type != PDFLexicalAnalyzer::TokenType::EndOfFile && + m_lookAhead1.type != PDFLexicalAnalyzer::TokenType::DictionaryEnd) + { + // First value should be a key + if (m_lookAhead1.type != PDFLexicalAnalyzer::TokenType::Name) + { + error(tr("Dictionary key must be a name.")); + } + + QByteArray key = m_lookAhead1.data.toByteArray(); + shift(); + + // Second value should be a value + PDFObject object = getObject(); + + dictionary->addEntry(std::move(key), std::move(object)); + } + + // Now, we should reach dictionary end. If it is not the case, then end of stream occured. + if (m_lookAhead1.type != PDFLexicalAnalyzer::TokenType::DictionaryEnd) + { + error(tr("End of stream inside dictionary reached.")); + } + + // Is it a content stream? + if (m_lookAhead2.type == PDFLexicalAnalyzer::TokenType::Command && + m_lookAhead2.data.toByteArray() == PDF_STREAM_START_COMMAND) + { + // Read stream content. According to the PDF Reference 1.7, chapter 3.2.7, stream + // content can be placed in the file. If this is the case, then try to load file + // content in the memory. But even in this case, stream content should be skipped. + + if (!dictionary->hasKey(PDF_STREAM_DICT_LENGTH)) + { + error(tr("Stream length is not specified.")); + } + + PDFObject lengthObject = m_context->getObject(dictionary->get(PDF_STREAM_DICT_LENGTH)); + if (!lengthObject.isInt()) + { + error(tr("Bad value of stream length. 
It should be an integer number.")); + } + PDFInteger length = lengthObject.getInteger(); + + if (length < 0) + { + error(tr("Length of the stream buffer is negative (%1). It must be a positive number.").arg(length)); + } + + QByteArray buffer = m_lexicalAnalyzer.fetchByteArray(length); + + // According to the PDF Reference 1.7, chapter 3.2.7, stream content can also be specified + // in the external file. If this is the case, then we must try to load the stream data + // from the external file. + if (dictionary->hasKey(PDF_STREAM_DICT_FILE_SPECIFICATION)) + { + PDFObject fileName = m_context->getObject(dictionary->get(PDF_STREAM_DICT_FILE_SPECIFICATION)); + + if (!fileName.isString()) + { + error(tr("Stream data should be in external file, but invalid file name is specified.")); + } + + QFile streamDataFile(fileName.getString()); + if (streamDataFile.open(QFile::ReadOnly)) + { + buffer = streamDataFile.readAll(); + streamDataFile.close(); + } + else + { + error(tr("Can't open stream data stored in external file '%1'.").arg(QString(fileName.getString()))); + } + } + + // Refill lookahead tokens + m_lookAhead1 = m_lexicalAnalyzer.fetch(); + m_lookAhead2 = m_lexicalAnalyzer.fetch(); + + if (m_lookAhead1.type == PDFLexicalAnalyzer::TokenType::Command && + m_lookAhead1.data.toByteArray() == PDF_STREAM_END_COMMAND) + { + // Everything OK, just advance and return stream object + shift(); + return PDFObject::createStream(std::make_shared(std::move(*dictionary), std::move(buffer))); + } + else + { + error(tr("End of stream should end in keyword 'endstream'.")); + } + } + else + { + // Just shift (eat dictionary end) and return dictionary + shift(); + return PDFObject::createDictionary(std::move(dictionarySharedPointer)); + } + } + + case PDFLexicalAnalyzer::TokenType::Null: + { + shift(); + return PDFObject::createNull(); + } + + case PDFLexicalAnalyzer::TokenType::ArrayEnd: + case PDFLexicalAnalyzer::TokenType::DictionaryEnd: + case PDFLexicalAnalyzer::TokenType::Command: 
+ { + error(tr("Cannot read object. Unexpected token appeared.")); + break; + } + + case PDFLexicalAnalyzer::TokenType::EndOfFile: + { + error(tr("Cannot read object. End of stream reached.")); + break; + } + } + + // This code should be unreachable. All values should be handled in the switch above. + Q_ASSERT(false); + return PDFObject::createNull(); +} + +PDFObject PDFParser::getObject(PDFObjectReference reference) +{ + PDFParsingContext::PDFParsingContextGuard guard(m_context, reference); + return getObject(); +} + +void PDFParser::error(const QString& message) const +{ + throw new PDFParserException(message); +} + +void PDFParser::shift() +{ + m_lookAhead1 = std::move(m_lookAhead2); + m_lookAhead2 = m_lexicalAnalyzer.fetch(); +} + +} // namespace pdf diff --git a/PdfForQtLib/sources/pdfparser.h b/PdfForQtLib/sources/pdfparser.h new file mode 100644 index 0000000..ae0be62 --- /dev/null +++ b/PdfForQtLib/sources/pdfparser.h @@ -0,0 +1,315 @@ +// Copyright (C) 2018 Jakub Melka +// +// This file is part of PdfForQt. +// +// PdfForQt is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// PdfForQt is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with PDFForQt. If not, see . 
+ + +#ifndef PDFPARSER_H +#define PDFPARSER_H + +#include "pdfglobal.h" +#include "pdfobject.h" + +#include +#include +#include + +#include + +namespace pdf +{ + +// Group of whitespace characters + +constexpr const char CHAR_NULL = 0x00; +constexpr const char CHAR_TAB = 0x09; +constexpr const char CHAR_LINE_FEED = 0x0A; +constexpr const char CHAR_FORM_FEED = 0x0C; +constexpr const char CHAR_CARRIAGE_RETURN = 0x0D; +constexpr const char CHAR_SPACE = 0x20; + +// According to specification, chapter 3.1, EOL marker is one of the following characters: +// 1) Either CHAR_CARRIAGE_RETURN, or CHAR_LINE_FEED, +// 2) CHAR_CARRIAGE_RETURN followed immediately by CHAR_LINE_FEED + +// Group of delimiter characters + +constexpr const char CHAR_LEFT_BRACKET = '('; +constexpr const char CHAR_RIGHT_BRACKET = ')'; +constexpr const char CHAR_LEFT_ANGLE = '<'; +constexpr const char CHAR_RIGHT_ANGLE = '>'; +constexpr const char CHAR_ARRAY_START = '['; +constexpr const char CHAR_ARRAY_END = ']'; +constexpr const char CHAR_SLASH = '/'; +constexpr const char CHAR_PERCENT = '%'; +constexpr const char CHAR_BACKSLASH = '\\'; +constexpr const char CHAR_MARK = '#'; + +// These constants reserves memory while reading string or name + +constexpr const int STRING_BUFFER_RESERVE = 32; +constexpr const int NAME_BUFFER_RESERVE = 16; +constexpr const int COMMAND_BUFFER_RESERVE = 16; + +// Special objects - bool, null object + +constexpr const char* BOOL_OBJECT_TRUE_STRING = "true"; +constexpr const char* BOOL_OBJECT_FALSE_STRING = "false"; +constexpr const char* NULL_OBJECT_STRING = "null"; + +// Special commands +constexpr const char* PDF_REFERENCE_COMMAND = "R"; +constexpr const char* PDF_STREAM_START_COMMAND = "stream"; +constexpr const char* PDF_STREAM_END_COMMAND = "endstream"; + +class PDFParserException : public std::exception +{ +public: + PDFParserException(const QString& message) : + m_message(message) + { + + } + + /// Returns error message + const QString& getMessage() const { return 
m_message; } + +private: + QString m_message; +}; + +class PDFFORQTLIBSHARED_EXPORT PDFLexicalAnalyzer +{ + Q_GADGET + Q_DECLARE_TR_FUNCTIONS(pdf::PDFLexicalAnalyzer) + +public: + PDFLexicalAnalyzer(const char* begin, const char* end); + + enum class TokenType + { + Boolean, + Integer, + Real, + String, + Name, + ArrayStart, + ArrayEnd, + DictionaryStart, + DictionaryEnd, + Null, + Command, + EndOfFile + }; + + Q_ENUM(TokenType) + + struct Token + { + explicit Token() : type(TokenType::EndOfFile) { } + explicit Token(TokenType type) : type(type) { } + explicit Token(TokenType type, QVariant data) : type(type), data(qMove(data)) { } + + Token(const Token&) = default; + Token(Token&&) = default; + + Token& operator=(const Token&) = default; + Token& operator=(Token&&) = default; + + bool operator==(const Token& other) const { return type == other.type && data == other.data; } + + TokenType type; + QVariant data; + }; + + /// Fetches a new token from the input stream. If we are at end of the input + /// stream, then EndOfFile token is returned. + Token fetch(); + + /// Skips whitespace and comments + void skipWhitespaceAndComments(); + + /// Skips stream start + void skipStreamStart(); + + /// Reads number of bytes from the buffer and creates a byte array from it. + /// If end of stream appears before desired end byte, exception is thrown. 
+ /// \param length Length of the buffer + QByteArray fetchByteArray(PDFInteger length); + + /// Returns true, if whole stream was scanned + inline bool isAtEnd() const { return m_current == m_end; } + + /// Returns true, if character is a whitespace character according to the PDF 1.7 specification + /// \param character Character to be tested + static constexpr bool isWhitespace(char character); + + /// Returns true, if character is a delimiter character according to the PDF 1.7 specification + /// \param character Character to be tested + static constexpr bool isDelimiter(char character); + + /// Returns true, if character is a regular character according to the PDF 1.7 specification + /// \param character Character to be tested + static constexpr bool isRegular(char character) { return !isWhitespace(character) && !isDelimiter(character); } + +private: + inline char lookChar() const { Q_ASSERT(m_current != m_end); return *m_current; } + + /// If current char is equal to the argument, then move position by one character and return true. + /// If not, then return false and current position will be unchanged. + /// \param character Character to be fetched + bool fetchChar(const char character); + + /// Forcefully fetches next char from the stream. If stream is at end, then exception is thrown. + /// Current position will be advanced to the next one. + char fetchChar(); + + /// Tries to fetch octal number with at least 1 digit and specified maximum number of digits. + /// If octal number cannot be fetched, then false is returned, otherwise true is returned. + /// Result number is stored in the pointer. + /// \param maxDigits Maximum number of digits + /// \param output Non-null pointer to the result number + bool fetchOctalNumber(int maxDigits, int* output); + + /// Returns true, if character represents hexadecimal number, i.e. digit 0-9, + /// or letter A-F, or small letter a-f.
+ static constexpr bool isHexCharacter(const char character); + + /// Throws an error exception + void error(const QString& message) const; + + const char* m_begin; + const char* m_current; + const char* m_end; +}; + +/// Parsing context. Used for example to detect cyclic reference errors. +class PDFParsingContext +{ + Q_DECLARE_TR_FUNCTIONS(pdf::PDFParsingContext) + +public: + + /// Guard guarding the cyclical references. + class PDFParsingContextGuard + { + public: + explicit inline PDFParsingContextGuard(PDFParsingContext* context, PDFObjectReference reference) : + m_context(context), + m_reference(reference) + { + m_context->beginParsingObject(m_reference); + } + + inline ~PDFParsingContextGuard() + { + m_context->endParsingObject(m_reference); + } + + private: + PDFParsingContext* m_context; + PDFObjectReference m_reference; + }; + + /// Returns dereferenced object, if object is a reference. If it is not a reference, + /// then same object is returned. + PDFObject getObject(const PDFObject& object) const; + +private: + void beginParsingObject(PDFObjectReference reference); + void endParsingObject(PDFObjectReference reference); + + /// Set containing objects currently being parsed. + std::set m_activeParsedObjectSet; +}; + +/// Class for parsing objects. Checks cyclical references. If +/// the object cannot be obtained from the stream, exception is thrown. +class PDFParser +{ + Q_DECLARE_TR_FUNCTIONS(pdf::PDFParser) + +public: + explicit PDFParser(const char* begin, const char* end, PDFParsingContext* context); + + /// Fetches single object from the stream. Does not check + /// cyclical references. If object cannot be fetched, then + /// exception is thrown. + PDFObject getObject(); + + /// Fetches signle object from the stream. Performs check for + /// cyclical references. If object cannot be fetched, then + /// exception is thrown. 
+ PDFObject getObject(PDFObjectReference reference); + + /// Throws an error exception + void error(const QString& message) const; + +private: + void shift(); + + /// Parsing context (multiple parsers can share it) + PDFParsingContext* m_context; + + /// Lexical analyzer for scanning tokens + PDFLexicalAnalyzer m_lexicalAnalyzer; + + PDFLexicalAnalyzer::Token m_lookAhead1; + PDFLexicalAnalyzer::Token m_lookAhead2; +}; + +// Implementation + +inline +constexpr bool PDFLexicalAnalyzer::isWhitespace(char character) +{ + switch (character) + { + case CHAR_NULL: + case CHAR_TAB: + case CHAR_LINE_FEED: + case CHAR_FORM_FEED: + case CHAR_CARRIAGE_RETURN: + case CHAR_SPACE: + return true; + + default: + return false; + } +} + +inline +constexpr bool PDFLexicalAnalyzer::isDelimiter(char character) +{ + switch (character) + { + case CHAR_LEFT_BRACKET: + case CHAR_RIGHT_BRACKET: + case CHAR_LEFT_ANGLE: + case CHAR_RIGHT_ANGLE: + case CHAR_ARRAY_START: + case CHAR_ARRAY_END: + case CHAR_SLASH: + case CHAR_PERCENT: + return true; + + default: + return false; + } +} + +} // namespace pdf + +#endif // PDFPARSER_H diff --git a/UnitTests/UnitTests.pro b/UnitTests/UnitTests.pro new file mode 100644 index 0000000..4b195ea --- /dev/null +++ b/UnitTests/UnitTests.pro @@ -0,0 +1,20 @@ +QT += testlib +QT -= gui + +CONFIG += qt console warn_on depend_includepath testcase +CONFIG -= app_bundle + +TEMPLATE = app + +INCLUDEPATH += $$PWD/../PDFForQtLib/Sources + +DESTDIR = $$OUT_PWD/.. + +LIBS += -L$$OUT_PWD/.. + +LIBS += -lPDFForQtLib + +QMAKE_CXXFLAGS += /std:c++latest + +SOURCES += \ + tst_lexicalanalyzertest.cpp diff --git a/UnitTests/UnitTests.pro.autosave b/UnitTests/UnitTests.pro.autosave new file mode 100644 index 0000000..fddbd1a --- /dev/null +++ b/UnitTests/UnitTests.pro.autosave @@ -0,0 +1,38 @@ +# Copyright (C) 2018 Jakub Melka +# +# This file is part of PdfForQt. 
+# +# PdfForQt is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# PdfForQt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with PDFForQt. If not, see <https://www.gnu.org/licenses/>. + + +QT += testlib +QT -= gui + +CONFIG += qt console warn_on depend_includepath testcase +CONFIG -= app_bundle + +TEMPLATE = app + +INCLUDEPATH += $$PWD/../PDFForQtLib/Sources + +DESTDIR = $$OUT_PWD/.. + +LIBS += -L$$OUT_PWD/.. + +LIBS += -lPDFForQtLib + +QMAKE_CXXFLAGS += /std:c++latest + +SOURCES += \ + tst_lexicalanalyzertest.cpp diff --git a/UnitTests/tst_lexicalanalyzertest.cpp b/UnitTests/tst_lexicalanalyzertest.cpp new file mode 100644 index 0000000..71df4e3 --- /dev/null +++ b/UnitTests/tst_lexicalanalyzertest.cpp @@ -0,0 +1,297 @@ +// Copyright (C) 2018 Jakub Melka +// +// This file is part of PdfForQt. +// +// PdfForQt is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// PdfForQt is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with PDFForQt. If not, see <https://www.gnu.org/licenses/>.
+ + +#include +#include + +#include "pdfparser.h" +#include "pdfconstants.h" + +#include + +class LexicalAnalyzerTest : public QObject +{ + Q_OBJECT + +public: + explicit LexicalAnalyzerTest(); + virtual ~LexicalAnalyzerTest() override; + +private slots: + void test_null(); + void test_numbers(); + void test_strings(); + void test_name(); + void test_bool(); + void test_ad(); + void test_command(); + void test_invalid_input(); + void test_header_regexp(); + +private: + void scanWholeStream(const char* stream); + void testTokens(const char* stream, const std::vector<pdf::PDFLexicalAnalyzer::Token>& tokens); + + QString getStringFromTokens(const std::vector<pdf::PDFLexicalAnalyzer::Token>& tokens); +}; + +LexicalAnalyzerTest::LexicalAnalyzerTest() +{ + +} + +LexicalAnalyzerTest::~LexicalAnalyzerTest() +{ + +} + +void LexicalAnalyzerTest::test_null() +{ + using Token = pdf::PDFLexicalAnalyzer::Token; + using Type = pdf::PDFLexicalAnalyzer::TokenType; + + testTokens("null", { Token(Type::Null) }); + testTokens(" null ", { Token(Type::Null), Token(Type::EndOfFile) }); + testTokens("%null\n null %comment", { Token(Type::Null), Token(Type::EndOfFile) }); + testTokens(" \n\t null\n", { Token(Type::Null), Token(Type::EndOfFile) }); + testTokens(" null %and null\n null", { Token(Type::Null), Token(Type::Null) }); + testTokens(" null %and null\n null ", { Token(Type::Null), Token(Type::Null), Token(Type::EndOfFile) }); +} + +void LexicalAnalyzerTest::test_numbers() +{ + using Token = pdf::PDFLexicalAnalyzer::Token; + using Type = pdf::PDFLexicalAnalyzer::TokenType; + + testTokens("1 +2 -3 +40 -55", { Token(Type::Integer, 1), Token(Type::Integer, 2), Token(Type::Integer, -3), Token(Type::Integer, 40), Token(Type::Integer, -55) }); + testTokens(".0 0.1 3.5 -4.
+5.0 -6.58 7.478", { Token(Type::Real, 0.0), Token(Type::Real, 0.1), Token(Type::Real, 3.5), Token(Type::Real, -4.0), Token(Type::Real, 5.0), Token(Type::Real, -6.58), Token(Type::Real, 7.478) }); + testTokens("1000000000000000000000000000", { Token(Type::Real, 1e27) }); +} + +void LexicalAnalyzerTest::test_strings() +{ + using Token = pdf::PDFLexicalAnalyzer::Token; + using Type = pdf::PDFLexicalAnalyzer::TokenType; + + testTokens("(Simple string)", { Token(Type::String, QByteArray("Simple string")) }); + testTokens("(String with (brackets))", { Token(Type::String, QByteArray("String with (brackets)")) }); + testTokens("(String with \\( unbalanced brackets \\(\\))", { Token(Type::String, QByteArray("String with ( unbalanced brackets ()")) }); + testTokens("()", { Token(Type::String, QByteArray("")) }); + testTokens("(Text with special character: \\n)", { Token(Type::String, QByteArray("Text with special character: \n")) }); + testTokens("(Text with special character: \\r)", { Token(Type::String, QByteArray("Text with special character: \r")) }); + testTokens("(Text with special character: \\t)", { Token(Type::String, QByteArray("Text with special character: \t")) }); + testTokens("(Text with special character: \\b)", { Token(Type::String, QByteArray("Text with special character: \b")) }); + testTokens("(Text with special character: \\f)", { Token(Type::String, QByteArray("Text with special character: \f")) }); + testTokens("(Text with special character: \\()", { Token(Type::String, QByteArray("Text with special character: (")) }); + testTokens("(Text with special character: \\))", { Token(Type::String, QByteArray("Text with special character: )")) }); + testTokens("(Text with special character: \\\\)", { Token(Type::String, QByteArray("Text with special character: \\")) }); + testTokens("(\53)", { Token(Type::String, QByteArray("+")) }); + testTokens("(\0533)", { Token(Type::String, QByteArray("+3")) }); + testTokens("(\053)", { Token(Type::String, 
QByteArray("+")) }); + testTokens("(\053053)", { Token(Type::String, QByteArray("+053")) }); + testTokens("(\5)", { Token(Type::String, QByteArray("\5")) }); + testTokens("<901FA3>", { Token(Type::String, QByteArray("\220\037\243")) }); + testTokens("<901fa3>", { Token(Type::String, QByteArray("\220\037\243")) }); + testTokens("<901fa>", { Token(Type::String, QByteArray("\220\037\240")) }); + testTokens("<901FA>", { Token(Type::String, QByteArray("\220\037\240")) }); + testTokens("<>", { Token(Type::String, QByteArray("")) }); + + testTokens("(Simple string)(Simple string)", { Token(Type::String, QByteArray("Simple string")), Token(Type::String, QByteArray("Simple string")) }); + testTokens("(String with (brackets))(String with (brackets))", { Token(Type::String, QByteArray("String with (brackets)")), Token(Type::String, QByteArray("String with (brackets)")) }); + testTokens("(String with \\( unbalanced brackets \\(\\))(String with \\( unbalanced brackets \\(\\))", { Token(Type::String, QByteArray("String with ( unbalanced brackets ()")), Token(Type::String, QByteArray("String with ( unbalanced brackets ()")) }); + testTokens("()()", { Token(Type::String, QByteArray("")), Token(Type::String, QByteArray("")) }); + testTokens("(Text with special character: \\n)(Text with special character: \\n)", { Token(Type::String, QByteArray("Text with special character: \n")), Token(Type::String, QByteArray("Text with special character: \n")) }); + testTokens("(Text with special character: \\r)(Text with special character: \\r)", { Token(Type::String, QByteArray("Text with special character: \r")), Token(Type::String, QByteArray("Text with special character: \r")) }); + testTokens("(Text with special character: \\t)(Text with special character: \\t)", { Token(Type::String, QByteArray("Text with special character: \t")), Token(Type::String, QByteArray("Text with special character: \t")) }); + testTokens("(Text with special character: \\b)(Text with special character: \\b)", { 
Token(Type::String, QByteArray("Text with special character: \b")), Token(Type::String, QByteArray("Text with special character: \b")) }); + testTokens("(Text with special character: \\f)(Text with special character: \\f)", { Token(Type::String, QByteArray("Text with special character: \f")), Token(Type::String, QByteArray("Text with special character: \f")) }); + testTokens("(Text with special character: \\()(Text with special character: \\()", { Token(Type::String, QByteArray("Text with special character: (")), Token(Type::String, QByteArray("Text with special character: (")) }); + testTokens("(Text with special character: \\))(Text with special character: \\))", { Token(Type::String, QByteArray("Text with special character: )")), Token(Type::String, QByteArray("Text with special character: )")) }); + testTokens("(Text with special character: \\\\)(Text with special character: \\\\)", { Token(Type::String, QByteArray("Text with special character: \\")), Token(Type::String, QByteArray("Text with special character: \\")) }); + testTokens("(\53)(\53)", { Token(Type::String, QByteArray("+")), Token(Type::String, QByteArray("+")) }); + testTokens("(\0533)(\0533)", { Token(Type::String, QByteArray("+3")), Token(Type::String, QByteArray("+3")) }); + testTokens("(\053)(\053)", { Token(Type::String, QByteArray("+")), Token(Type::String, QByteArray("+")) }); + testTokens("(\053053)(\053053)", { Token(Type::String, QByteArray("+053")), Token(Type::String, QByteArray("+053")) }); + testTokens("(\5)(\5)", { Token(Type::String, QByteArray("\5")), Token(Type::String, QByteArray("\5")) }); + testTokens("<901FA3><901FA3>", { Token(Type::String, QByteArray("\220\037\243")), Token(Type::String, QByteArray("\220\037\243")) }); + testTokens("<901fa3><901fa3>", { Token(Type::String, QByteArray("\220\037\243")), Token(Type::String, QByteArray("\220\037\243")) }); + testTokens("<901fa><901fa>", { Token(Type::String, QByteArray("\220\037\240")), Token(Type::String, 
QByteArray("\220\037\240")) }); + testTokens("<901FA><901FA>", { Token(Type::String, QByteArray("\220\037\240")), Token(Type::String, QByteArray("\220\037\240")) }); + testTokens("<><>", { Token(Type::String, QByteArray("")), Token(Type::String, QByteArray("")) }); +} + +void LexicalAnalyzerTest::test_name() +{ + using Token = pdf::PDFLexicalAnalyzer::Token; + using Type = pdf::PDFLexicalAnalyzer::TokenType; + + testTokens("/Name123", { Token(Type::Name, QByteArray("Name123")) }); + testTokens("/VeryLongName", { Token(Type::Name, QByteArray("VeryLongName")) }); + testTokens("/A;Name_With^Various***Characters", { Token(Type::Name, QByteArray("A;Name_With^Various***Characters")) }); + testTokens("/1.2", { Token(Type::Name, QByteArray("1.2")) }); + testTokens("/$$", { Token(Type::Name, QByteArray("$$")) }); + testTokens("/@MatchedPattern", { Token(Type::Name, QByteArray("@MatchedPattern")) }); + testTokens("/.undefined", { Token(Type::Name, QByteArray(".undefined")) }); + testTokens("/The#20Major#20And#20The#20#23", { Token(Type::Name, QByteArray("The Major And The #")) }); + testTokens("/A#42", { Token(Type::Name, QByteArray("AB")) }); + testTokens("/#20", { Token(Type::Name, QByteArray(" ")) }); + testTokens("/#23#20#23/AB", { Token(Type::Name, QByteArray("# #")), Token(Type::Name, QByteArray("AB")) }); + + testTokens("/Name123/Name123", { Token(Type::Name, QByteArray("Name123")), Token(Type::Name, QByteArray("Name123")) }); + testTokens("/VeryLongName/VeryLongName", { Token(Type::Name, QByteArray("VeryLongName")), Token(Type::Name, QByteArray("VeryLongName")) }); + testTokens("/A;Name_With^Various***Characters/A;Name_With^Various***Characters", { Token(Type::Name, QByteArray("A;Name_With^Various***Characters")), Token(Type::Name, QByteArray("A;Name_With^Various***Characters")) }); + testTokens("/1.2/1.2", { Token(Type::Name, QByteArray("1.2")), Token(Type::Name, QByteArray("1.2")) }); + testTokens("/$$/$$", { Token(Type::Name, QByteArray("$$")), Token(Type::Name, 
QByteArray("$$")) }); + testTokens("/@MatchedPattern/@MatchedPattern", { Token(Type::Name, QByteArray("@MatchedPattern")), Token(Type::Name, QByteArray("@MatchedPattern")) }); + testTokens("/.undefined/.undefined", { Token(Type::Name, QByteArray(".undefined")), Token(Type::Name, QByteArray(".undefined")) }); + testTokens("/The#20Major#20And#20The#20#23/The#20Major#20And#20The#20#23", { Token(Type::Name, QByteArray("The Major And The #")), Token(Type::Name, QByteArray("The Major And The #")) }); + testTokens("/A#42/A#42", { Token(Type::Name, QByteArray("AB")), Token(Type::Name, QByteArray("AB")) }); + testTokens("/#20/#20", { Token(Type::Name, QByteArray(" ")), Token(Type::Name, QByteArray(" ")) }); + testTokens("/#23#20#23/AB/#23#20#23/AB", { Token(Type::Name, QByteArray("# #")), Token(Type::Name, QByteArray("AB")), Token(Type::Name, QByteArray("# #")), Token(Type::Name, QByteArray("AB")) }); +} + +void LexicalAnalyzerTest::test_bool() +{ + using Token = pdf::PDFLexicalAnalyzer::Token; + using Type = pdf::PDFLexicalAnalyzer::TokenType; + + testTokens("true", { Token(Type::Boolean, true) }); + testTokens("false", { Token(Type::Boolean, false) }); + testTokens("true false true false", { Token(Type::Boolean, true), Token(Type::Boolean, false), Token(Type::Boolean, true), Token(Type::Boolean, false) }); +} + +void LexicalAnalyzerTest::test_ad() +{ + using Token = pdf::PDFLexicalAnalyzer::Token; + using Type = pdf::PDFLexicalAnalyzer::TokenType; + + testTokens("<<", { Token(Type::DictionaryStart) }); + testTokens("%comment\n<<", { Token(Type::DictionaryStart) }); + testTokens(">>", { Token(Type::DictionaryEnd) }); + testTokens("[", { Token(Type::ArrayStart) }); + testTokens("]", { Token(Type::ArrayEnd) }); +} + +void LexicalAnalyzerTest::test_command() +{ + using Token = pdf::PDFLexicalAnalyzer::Token; + using Type = pdf::PDFLexicalAnalyzer::TokenType; + + testTokens("command", { Token(Type::Command, QByteArray("command")) }); + testTokens("command1 command2", { 
Token(Type::Command, QByteArray("command1")), Token(Type::Command, QByteArray("command2")) }); +} + +void LexicalAnalyzerTest::test_invalid_input() +{ + QByteArray bigNumber(500, '0'); + bigNumber.front() = '1'; + bigNumber.back() = 0; + + QVERIFY_EXCEPTION_THROWN(scanWholeStream("(\\9adoctalnumber)"), pdf::PDFParserException); + QVERIFY_EXCEPTION_THROWN(scanWholeStream("(\\)"), pdf::PDFParserException); + QVERIFY_EXCEPTION_THROWN(scanWholeStream("123 456 +4-5"), pdf::PDFParserException); + QVERIFY_EXCEPTION_THROWN(scanWholeStream("123 456 +"), pdf::PDFParserException); + QVERIFY_EXCEPTION_THROWN(scanWholeStream("123 456 + 45"), pdf::PDFParserException); + QVERIFY_EXCEPTION_THROWN(scanWholeStream(bigNumber.constData()), pdf::PDFParserException); + QVERIFY_EXCEPTION_THROWN(scanWholeStream("/#Q1FF"), pdf::PDFParserException); + QVERIFY_EXCEPTION_THROWN(scanWholeStream("/#1QFF"), pdf::PDFParserException); + QVERIFY_EXCEPTION_THROWN(scanWholeStream("/# "), pdf::PDFParserException); + QVERIFY_EXCEPTION_THROWN(scanWholeStream(""), pdf::PDFParserException); + QVERIFY_EXCEPTION_THROWN(scanWholeStream("<1FA3"), pdf::PDFParserException); + QVERIFY_EXCEPTION_THROWN(scanWholeStream("<1FA"), pdf::PDFParserException); + QVERIFY_EXCEPTION_THROWN(scanWholeStream("> albatros"), pdf::PDFParserException); + QVERIFY_EXCEPTION_THROWN(scanWholeStream(")"), pdf::PDFParserException); +} + +void LexicalAnalyzerTest::test_header_regexp() +{ + std::regex regex(pdf::PDF_FILE_HEADER_REGEXP); + + for (const char* string : { "%PDF-1.4", " %PDF-1.4abs", "%PDF-1.4", "%test %PDF %PDF-1.4", "%!PS-Adobe-3.0 PDF-1.4"}) + { + std::cmatch cmatch; + const bool matched = std::regex_search(string, string + strlen(string), cmatch, regex); + QVERIFY(matched); + + if (matched) + { + QVERIFY(cmatch.size() == 3); + QVERIFY(cmatch[1].matched || cmatch[2].matched); + } + } +} + +void LexicalAnalyzerTest::scanWholeStream(const char* stream) +{ + pdf::PDFLexicalAnalyzer analyzer(stream, stream + strlen(stream)); + 
+ // Scan whole stream + while (!analyzer.isAtEnd()) + { + analyzer.fetch(); + } +} + +void LexicalAnalyzerTest::testTokens(const char* stream, const std::vector& tokens) +{ + pdf::PDFLexicalAnalyzer analyzer(stream, stream + strlen(stream)); + + std::vector scanned; + scanned.reserve(tokens.size()); + + // Scan whole stream + while (!analyzer.isAtEnd()) + { + scanned.emplace_back(analyzer.fetch()); + } + + // Format error message + QString actual = getStringFromTokens(scanned); + QString expected = getStringFromTokens(tokens); + + // Now, compare scanned tokens + QVERIFY2(scanned == tokens, qPrintable(QString("stream: %1, actual = %2, expected = %3").arg(QString(stream), actual, expected))); +} + +QString LexicalAnalyzerTest::getStringFromTokens(const std::vector& tokens) +{ + QStringList stringTokens; + + QMetaEnum metaEnum = QMetaEnum::fromType(); + Q_ASSERT(metaEnum.isValid()); + + for (const pdf::PDFLexicalAnalyzer::Token& token : tokens) + { + QString tokenTypeAsString = metaEnum.valueToKey(static_cast(token.type)); + + if (!token.data.isValid()) + { + stringTokens << tokenTypeAsString; + } + else + { + stringTokens << QString("%1(%2)").arg(tokenTypeAsString, token.data.toString()); + } + } + + return QString("{ %1 }").arg(stringTokens.join(", ")); +} + +QTEST_APPLESS_MAIN(LexicalAnalyzerTest) + +#include "tst_lexicalanalyzertest.moc"