2021-04-30 20:12:10 +02:00
|
|
|
// Copyright (C) 2018-2021 Jakub Melka
|
2018-11-17 16:48:30 +01:00
|
|
|
//
|
2020-12-20 19:03:58 +01:00
|
|
|
// This file is part of Pdf4Qt.
|
2018-11-17 16:48:30 +01:00
|
|
|
//
|
2020-12-20 19:03:58 +01:00
|
|
|
// Pdf4Qt is free software: you can redistribute it and/or modify
|
2018-11-17 16:48:30 +01:00
|
|
|
// it under the terms of the GNU Lesser General Public License as published by
|
|
|
|
// the Free Software Foundation, either version 3 of the License, or
|
2021-04-30 20:12:10 +02:00
|
|
|
// with the written consent of the copyright owner, any later version.
|
2018-11-17 16:48:30 +01:00
|
|
|
//
|
2020-12-20 19:03:58 +01:00
|
|
|
// Pdf4Qt is distributed in the hope that it will be useful,
|
2018-11-17 16:48:30 +01:00
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
// GNU Lesser General Public License for more details.
|
|
|
|
//
|
|
|
|
// You should have received a copy of the GNU Lesser General Public License
|
2020-12-20 19:03:58 +01:00
|
|
|
// along with Pdf4Qt. If not, see <https://www.gnu.org/licenses/>.
|
2018-11-17 16:48:30 +01:00
|
|
|
|
|
|
|
|
|
|
|
#ifndef PDFDOCUMENTREADER_H
|
|
|
|
#define PDFDOCUMENTREADER_H
|
|
|
|
|
|
|
|
#include "pdfglobal.h"
|
|
|
|
#include "pdfdocument.h"
|
2019-11-09 17:27:17 +01:00
|
|
|
#include "pdfprogress.h"
|
2020-09-17 19:17:16 +02:00
|
|
|
#include "pdfxreftable.h"
|
2018-11-17 16:48:30 +01:00
|
|
|
|
|
|
|
#include <QtCore>
|
|
|
|
#include <QIODevice>
|
|
|
|
|
|
|
|
namespace pdf
|
|
|
|
{
|
2020-09-17 19:17:16 +02:00
|
|
|
class PDFXRefTable;
|
|
|
|
class PDFParsingContext;
|
2018-11-17 16:48:30 +01:00
|
|
|
|
|
|
|
/// This class is a reader of PDF document from various devices (file, io device,
|
|
|
|
/// byte buffer). This class doesn't throw exceptions, to check errors, use
|
|
|
|
/// appropriate functions.
|
2020-12-20 19:03:58 +01:00
|
|
|
class Pdf4QtLIBSHARED_EXPORT PDFDocumentReader
|
2018-11-17 16:48:30 +01:00
|
|
|
{
|
|
|
|
Q_DECLARE_TR_FUNCTIONS(pdf::PDFDocumentReader)
|
|
|
|
|
|
|
|
public:
|
2021-05-11 18:46:33 +02:00
|
|
|
explicit PDFDocumentReader(PDFProgress* progress, const std::function<QString(bool*)>& getPasswordCallback, bool permissive, bool authorizeOwnerOnly);
|
2018-11-17 16:48:30 +01:00
|
|
|
|
2018-11-21 19:30:15 +01:00
|
|
|
constexpr inline PDFDocumentReader(const PDFDocumentReader&) = delete;
|
|
|
|
constexpr inline PDFDocumentReader(PDFDocumentReader&&) = delete;
|
|
|
|
constexpr inline PDFDocumentReader& operator=(const PDFDocumentReader&) = delete;
|
|
|
|
constexpr inline PDFDocumentReader& operator=(PDFDocumentReader&&) = delete;
|
|
|
|
|
2019-08-12 12:02:40 +02:00
|
|
|
enum class Result
|
|
|
|
{
|
|
|
|
OK, ///< Document was successfully loaded
|
|
|
|
Failed, ///< Error occured during document reading
|
|
|
|
Cancelled ///< User cancelled document reading
|
|
|
|
};
|
|
|
|
|
2018-11-17 16:48:30 +01:00
|
|
|
/// Reads a PDF document from the specified file. If file doesn't exist,
|
|
|
|
/// cannot be opened or contain invalid pdf, empty PDF file is returned.
|
|
|
|
/// No exception is thrown.
|
|
|
|
PDFDocument readFromFile(const QString& fileName);
|
|
|
|
|
|
|
|
/// Reads a PDF document from the specified device. If device is not opened
|
|
|
|
/// for reading, then function tries it to open for reading. If it is opened,
|
|
|
|
/// but not for reading, empty PDF document is returned. This also occurs
|
|
|
|
/// when incorrect PDF is read. No exception is thrown.
|
|
|
|
PDFDocument readFromDevice(QIODevice* device);
|
|
|
|
|
|
|
|
/// Reads a PDF document from the specified buffer (byte array). If incorrect
|
|
|
|
/// PDF is read, then empty PDF document is returned. No exception is thrown.
|
|
|
|
PDFDocument readFromBuffer(const QByteArray& buffer);
|
|
|
|
|
2019-08-12 12:02:40 +02:00
|
|
|
/// Returns result code for reading document from the device
|
|
|
|
Result getReadingResult() const { return m_result; }
|
2018-11-17 16:48:30 +01:00
|
|
|
|
2018-11-21 19:30:15 +01:00
|
|
|
/// Returns error message, if document reading was unsuccessfull
|
|
|
|
const QString& getErrorMessage() const { return m_errorMessage; }
|
|
|
|
|
2020-06-14 18:40:42 +02:00
|
|
|
/// Get source data of the document
|
|
|
|
const QByteArray& getSource() const { return m_source; }
|
|
|
|
|
2020-09-17 19:17:16 +02:00
|
|
|
/// Returns warning messages
|
|
|
|
const QStringList& getWarnings() const { return m_warnings; }
|
|
|
|
|
2018-11-17 16:48:30 +01:00
|
|
|
private:
|
|
|
|
static constexpr const int FIND_NOT_FOUND_RESULT = -1;
|
|
|
|
|
|
|
|
/// Resets the internal state and prepares it for new reading cycle
|
|
|
|
void reset();
|
|
|
|
|
|
|
|
/// Find a last string in the byte array, scan only \p limit bytes. If string
|
|
|
|
/// is not found, then FIND_NOT_FOUND_RESULT is returned, if it is found, then
|
|
|
|
/// it position from the beginning of byte array is returned.
|
|
|
|
/// \param what String to be found
|
|
|
|
/// \param byteArray Byte array to be scanned from the end
|
|
|
|
/// \param limit Scan up to this value bytes from the end
|
|
|
|
/// \returns Position of string, or FIND_NOT_FOUND_RESULT
|
|
|
|
int findFromEnd(const char* what, const QByteArray& byteArray, int limit);
|
|
|
|
|
2020-09-17 19:17:16 +02:00
|
|
|
void checkFooter(const QByteArray& buffer);
|
|
|
|
void checkHeader(const QByteArray& buffer);
|
|
|
|
const PDFInteger findXrefTableOffset(const QByteArray& buffer);
|
|
|
|
Result processReferenceTableEntries(PDFXRefTable* xrefTable, const std::vector<PDFXRefTable::Entry>& occupiedEntries, PDFObjectStorage::PDFObjects& objects);
|
|
|
|
Result processSecurityHandler(const PDFObject& trailerDictionaryObject, const std::vector<PDFXRefTable::Entry>& occupiedEntries, PDFObjectStorage::PDFObjects& objects);
|
|
|
|
void processObjectStreams(PDFXRefTable* xrefTable, PDFObjectStorage::PDFObjects& objects);
|
|
|
|
|
|
|
|
/// This function fetches object from the buffer from the specified offset.
|
|
|
|
/// Can throw exception, returns a pair of scanned reference and object content.
|
2020-09-19 14:42:40 +02:00
|
|
|
/// \param context Context
|
|
|
|
/// \param offset Offset
|
|
|
|
/// \param reference Reference to parsed object
|
2020-09-17 19:17:16 +02:00
|
|
|
PDFObject getObject(PDFParsingContext* context, PDFInteger offset, PDFObjectReference reference) const;
|
|
|
|
|
2020-09-19 14:42:40 +02:00
|
|
|
/// Tries to restore objects from object list. This function can be used in multiple pass, because
|
|
|
|
/// for example streams, can have length defined in referred object. If such is the case, then
|
|
|
|
/// second pass is needed. Returns true, if all object were correctly read.
|
|
|
|
/// \param restoredObjects Map of restored objects
|
|
|
|
/// \param offsets Offsets, from which are objects being read
|
|
|
|
bool restoreObjects(std::map<PDFObjectReference, PDFObject>& restoredObjects, const std::vector<std::pair<int, int>>& offsets);
|
|
|
|
|
2020-09-17 19:17:16 +02:00
|
|
|
/// Fetch object from reference table
|
|
|
|
PDFObject getObjectFromXrefTable(PDFXRefTable* xrefTable, PDFParsingContext* context, PDFObjectReference reference) const;
|
|
|
|
|
2020-09-19 14:42:40 +02:00
|
|
|
/// Tries to read damaged trailer dictionary
|
|
|
|
PDFObject readDamagedTrailerDictionary() const;
|
|
|
|
|
|
|
|
/// Attempts to read a damaged PDF document from the specified buffer (byte array). If incorrect
|
|
|
|
/// PDF is read, then empty PDF document is returned. No exception is thrown.
|
|
|
|
PDFDocument readDamagedDocumentFromBuffer(const QByteArray& buffer);
|
|
|
|
|
|
|
|
/// This function is used, when damaged pdf document is being restored. It returns
|
|
|
|
/// array of hints, where objects should appear. It constists of pair of start offset,
|
|
|
|
/// and end offset. Start offset is always a valid index to the buffer, end offset
|
|
|
|
/// can be one index after the buffers end (it is end iterator).
|
|
|
|
/// \param buffer Buffer
|
|
|
|
std::vector<std::pair<int, int>> findObjectByteOffsets(const QByteArray& buffer) const;
|
|
|
|
|
2020-01-01 18:23:18 +01:00
|
|
|
void progressStart(size_t stepCount, QString text);
|
2019-11-09 17:27:17 +01:00
|
|
|
void progressStep();
|
|
|
|
void progressFinish();
|
|
|
|
|
2018-11-25 14:48:08 +01:00
|
|
|
/// Mutex for access to variables of this reader from more threads
|
|
|
|
/// (providing thread safety)
|
|
|
|
QMutex m_mutex;
|
|
|
|
|
2019-08-12 12:02:40 +02:00
|
|
|
/// Result of document reading from the device
|
|
|
|
std::atomic<Result> m_result;
|
2018-11-17 16:48:30 +01:00
|
|
|
|
|
|
|
/// In case if error occurs, it is stored here
|
|
|
|
QString m_errorMessage;
|
|
|
|
|
|
|
|
/// Version of the scanned file
|
|
|
|
PDFVersion m_version;
|
2019-08-10 17:24:12 +02:00
|
|
|
|
|
|
|
/// Callback to obtain password from the user
|
|
|
|
std::function<QString(bool*)> m_getPasswordCallback;
|
2019-11-09 17:27:17 +01:00
|
|
|
|
|
|
|
/// Progress indicator
|
|
|
|
PDFProgress* m_progress;
|
2020-06-14 18:40:42 +02:00
|
|
|
|
|
|
|
/// Raw document data (byte array containing source data for created document)
|
|
|
|
QByteArray m_source;
|
2020-09-17 19:17:16 +02:00
|
|
|
|
|
|
|
/// Security handler
|
|
|
|
PDFSecurityHandlerPointer m_securityHandler;
|
|
|
|
|
|
|
|
/// Be permissive when reading, tolerate errors and try to fix broken document
|
|
|
|
bool m_permissive;
|
|
|
|
|
2021-05-11 18:46:33 +02:00
|
|
|
/// Authorize as owner only (if owner authorization fails, then whole document
|
|
|
|
/// reading fails)
|
|
|
|
bool m_authorizeOwnerOnly;
|
|
|
|
|
2020-09-17 19:17:16 +02:00
|
|
|
/// Warnings
|
|
|
|
QStringList m_warnings;
|
2018-11-17 16:48:30 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
} // namespace pdf
|
|
|
|
|
|
|
|
#endif // PDFDOCUMENTREADER_H
|