mirror of
				https://github.com/JakubMelka/PDF4QT.git
				synced 2025-06-05 21:59:17 +02:00 
			
		
		
		
	
		
			
				
	
	
		
			394 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			394 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| //    Copyright (C) 2018-2021 Jakub Melka
 | |
| //
 | |
| //    This file is part of Pdf4Qt.
 | |
| //
 | |
| //    Pdf4Qt is free software: you can redistribute it and/or modify
 | |
| //    it under the terms of the GNU Lesser General Public License as published by
 | |
| //    the Free Software Foundation, either version 3 of the License, or
 | |
| //    with the written consent of the copyright owner, any later version.
 | |
| //
 | |
| //    Pdf4Qt is distributed in the hope that it will be useful,
 | |
| //    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
| //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
| //    GNU Lesser General Public License for more details.
 | |
| //
 | |
| //    You should have received a copy of the GNU Lesser General Public License
 | |
| //    along with Pdf4Qt.  If not, see <https://www.gnu.org/licenses/>.
 | |
| 
 | |
| 
 | |
| #ifndef PDFPARSER_H
 | |
| #define PDFPARSER_H
 | |
| 
 | |
| #include "pdfglobal.h"
 | |
| #include "pdfobject.h"
 | |
| #include "pdfflatmap.h"
 | |
| 
 | |
| #include <QtCore>
 | |
| #include <QVariant>
 | |
| #include <QByteArray>
 | |
| 
 | |
| #include <set>
 | |
| #include <functional>
 | |
| 
 | |
| namespace pdf
 | |
| {
 | |
| 
 | |
// Group of whitespace characters

constexpr char CHAR_NULL              = 0x00;
constexpr char CHAR_TAB               = 0x09;
constexpr char CHAR_LINE_FEED         = 0x0A;
constexpr char CHAR_FORM_FEED         = 0x0C;
constexpr char CHAR_CARRIAGE_RETURN   = 0x0D;
constexpr char CHAR_SPACE             = 0x20;

// According to the specification, chapter 3.1, an EOL marker is one of the following:
//  1) either CHAR_CARRIAGE_RETURN, or CHAR_LINE_FEED,
//  2) CHAR_CARRIAGE_RETURN followed immediately by CHAR_LINE_FEED

// Group of delimiter characters and other characters with a special lexical
// meaning. Note that CHAR_BACKSLASH and CHAR_MARK are listed here as well,
// but PDFLexicalAnalyzer::isDelimiter does not report them as delimiters.

constexpr char CHAR_LEFT_BRACKET          = '(';
constexpr char CHAR_RIGHT_BRACKET         = ')';
constexpr char CHAR_LEFT_ANGLE            = '<';
constexpr char CHAR_RIGHT_ANGLE           = '>';
constexpr char CHAR_ARRAY_START           = '[';
constexpr char CHAR_ARRAY_END             = ']';
constexpr char CHAR_LEFT_CURLY_BRACKET    = '{';
constexpr char CHAR_RIGHT_CURLY_BRACKET   = '}';
constexpr char CHAR_SLASH                 = '/';
constexpr char CHAR_PERCENT               = '%';
constexpr char CHAR_BACKSLASH             = '\\';
constexpr char CHAR_MARK                  = '#';

// These constants give the amount of memory reserved while reading a string,
// a name, or a command

constexpr int STRING_BUFFER_RESERVE = 32;
constexpr int NAME_BUFFER_RESERVE = 16;
constexpr int COMMAND_BUFFER_RESERVE = 16;

// Special objects - bool, null object

constexpr const char* BOOL_OBJECT_TRUE_STRING = "true";
constexpr const char* BOOL_OBJECT_FALSE_STRING = "false";
constexpr const char* NULL_OBJECT_STRING = "null";

// Special commands
constexpr const char* PDF_REFERENCE_COMMAND = "R";
constexpr const char* PDF_STREAM_START_COMMAND = "stream";
constexpr const char* PDF_STREAM_END_COMMAND = "endstream";
 | |
| 
 | |
| class Pdf4QtLIBSHARED_EXPORT PDFLexicalAnalyzer
 | |
| {
 | |
|     Q_GADGET
 | |
|     Q_DECLARE_TR_FUNCTIONS(pdf::PDFLexicalAnalyzer)
 | |
| 
 | |
| public:
 | |
|     PDFLexicalAnalyzer(const char* begin, const char* end);
 | |
| 
 | |
|     enum class TokenType
 | |
|     {
 | |
|         Boolean,
 | |
|         Integer,
 | |
|         Real,
 | |
|         String,
 | |
|         Name,
 | |
|         ArrayStart,
 | |
|         ArrayEnd,
 | |
|         DictionaryStart,
 | |
|         DictionaryEnd,
 | |
|         Null,
 | |
|         Command,
 | |
|         EndOfFile
 | |
|     };
 | |
| 
 | |
|     Q_ENUM(TokenType)
 | |
| 
 | |
|     struct Token
 | |
|     {
 | |
|         explicit Token() : type(TokenType::EndOfFile) { }
 | |
|         explicit Token(TokenType type) : type(type) { }
 | |
|         explicit Token(TokenType type, QVariant data) : type(type), data(qMove(data)) { }
 | |
| 
 | |
|         Token(const Token&) = default;
 | |
|         Token(Token&&) = default;
 | |
| 
 | |
|         Token& operator=(const Token&) = default;
 | |
|         Token& operator=(Token&&) = default;
 | |
| 
 | |
|         bool operator==(const Token& other) const { return type == other.type && data == other.data; }
 | |
| 
 | |
|         TokenType type;
 | |
|         QVariant data;
 | |
|     };
 | |
| 
 | |
|     /// Fetches a new token from the input stream. If we are at end of the input
 | |
|     /// stream, then EndOfFile token is returned.
 | |
|     Token fetch();
 | |
| 
 | |
|     /// Seeks stream from the start. If stream cannot be seeked (position is invalid),
 | |
|     /// then exception is thrown.
 | |
|     void seek(PDFInteger offset);
 | |
| 
 | |
|     /// Skips whitespace and comments
 | |
|     void skipWhitespaceAndComments();
 | |
| 
 | |
|     /// Skips stream start
 | |
|     void skipStreamStart();
 | |
| 
 | |
|     /// Reads number of bytes from the buffer and creates a byte array from it.
 | |
|     /// If end of stream appears before desired end byte, exception is thrown.
 | |
|     /// \param length Length of the buffer
 | |
|     QByteArray fetchByteArray(PDFInteger length);
 | |
| 
 | |
|     /// Returns, if whole stream was scanned
 | |
|     inline bool isAtEnd() const { return m_current == m_end; }
 | |
| 
 | |
|     /// Returns current position in the stream
 | |
|     inline PDFInteger pos() const { return std::distance(m_begin, m_current); }
 | |
| 
 | |
|     /// Finds substring in the content stream, search is started from position.
 | |
|     /// If position is invalid, or substring is not found, -1 is returned.
 | |
|     /// \param str String to be found
 | |
|     /// \param position Position, from which the search is started
 | |
|     PDFInteger findSubstring(const char* str, PDFInteger position) const;
 | |
| 
 | |
|     /// Switch parser mode for tokenizing PostScript function
 | |
|     void setTokenizingPostScriptFunction() { m_tokenizingPostScriptFunction = true; }
 | |
| 
 | |
|     /// Returns true, if character is a whitespace character according to the PDF 1.7 specification
 | |
|     /// \param character Character to be tested
 | |
|     static constexpr bool isWhitespace(char character);
 | |
| 
 | |
|     /// Returns true, if character is a delimiter character according to the PDF 1.7 specification
 | |
|     /// \param character Character to be tested
 | |
|     static constexpr bool isDelimiter(char character);
 | |
| 
 | |
|     /// Returns true, if character is a regular character according to the PDF 1.7 specification
 | |
|     /// \param character Character to be tested
 | |
|     static constexpr bool isRegular(char character) { return !isWhitespace(character) && !isDelimiter(character); }
 | |
| 
 | |
|     /// Returns string from operand type
 | |
|     /// \param type Token type
 | |
|     static QString getStringFromOperandType(TokenType type);
 | |
| 
 | |
| private:
 | |
|     inline char lookChar() const { Q_ASSERT(m_current != m_end); return *m_current; }
 | |
| 
 | |
|     /// If current char is equal to the argument, then move position by one character and return true.
 | |
|     /// If not, then return false and current position will be unchanged.
 | |
|     /// \param character Character to be fetched
 | |
|     bool fetchChar(const char character);
 | |
| 
 | |
|     /// Forcefully fetches next char from the stream. If stream is at end, then exception is thrown.
 | |
|     /// Current position will be advanced to the next one.
 | |
|     char fetchChar();
 | |
| 
 | |
|     /// Tries to fetch octal number with minimum 1 digits and specified maximum number of digits.
 | |
|     /// If octal number cannot be fetched, then false is returned, otherwise true is returned.
 | |
|     /// Result number is stored in the pointer.
 | |
|     /// \param maxDigits Maximum number of digits
 | |
|     /// \param output Non-null pointer to the result number
 | |
|     bool fetchOctalNumber(int maxDigits, int* output);
 | |
| 
 | |
|     /// Returns true, if charachter represents hexadecimal number, i.e. digit 0-9,
 | |
|     /// or letter A-F, or small letter a-f.
 | |
|     static constexpr bool isHexCharacter(const char character);
 | |
| 
 | |
|     /// Throws an error exception
 | |
|     void error(const QString& message) const;
 | |
| 
 | |
|     const char* m_begin;
 | |
|     const char* m_current;
 | |
|     const char* m_end;
 | |
|     bool m_tokenizingPostScriptFunction;
 | |
| };
 | |
| 
 | |
| /// Parsing context. Used for example to detect cyclic reference errors.
 | |
| class PDFParsingContext
 | |
| {
 | |
|     Q_DECLARE_TR_FUNCTIONS(pdf::PDFParsingContext)
 | |
| 
 | |
| public:
 | |
|     explicit PDFParsingContext(std::function<PDFObject(PDFParsingContext*, PDFObjectReference)> objectFetcher) :
 | |
|         m_objectFetcher(std::move(objectFetcher))
 | |
|     {
 | |
| 
 | |
|     }
 | |
| 
 | |
|     /// Guard guarding the cyclical references (by reference).
 | |
|     class PDFParsingContextGuard
 | |
|     {
 | |
|     public:
 | |
|         explicit inline PDFParsingContextGuard(PDFParsingContext* context, PDFObjectReference reference) :
 | |
|             m_context(context),
 | |
|             m_reference(reference)
 | |
|         {
 | |
|             m_context->beginParsingObject(m_reference);
 | |
|         }
 | |
| 
 | |
|         inline ~PDFParsingContextGuard()
 | |
|         {
 | |
|             m_context->endParsingObject(m_reference);
 | |
|         }
 | |
| 
 | |
|     private:
 | |
|         PDFParsingContext* m_context;
 | |
|         PDFObjectReference m_reference;
 | |
|     };
 | |
| 
 | |
|     /// Guard guarding the cyclical references (by object).
 | |
|     class PDFParsingContextObjectGuard
 | |
|     {
 | |
|     public:
 | |
|         explicit inline PDFParsingContextObjectGuard(PDFParsingContext* context, const PDFObject* object) :
 | |
|             m_context(context),
 | |
|             m_object(object)
 | |
|         {
 | |
|             if (object->isReference())
 | |
|             {
 | |
|                 m_context->beginParsingObject(object->getReference());
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         inline ~PDFParsingContextObjectGuard()
 | |
|         {
 | |
|             if (m_object->isReference())
 | |
|             {
 | |
|                 m_context->endParsingObject(m_object->getReference());
 | |
|             }
 | |
|         }
 | |
| 
 | |
|     private:
 | |
|         PDFParsingContext* m_context;
 | |
|         const PDFObject* m_object;
 | |
|     };
 | |
| 
 | |
|     /// Returns dereferenced object, if object is a reference. If it is not a reference,
 | |
|     /// then same object is returned.
 | |
|     PDFObject getObject(const PDFObject& object);
 | |
| 
 | |
| private:
 | |
|     void beginParsingObject(PDFObjectReference reference);
 | |
|     void endParsingObject(PDFObjectReference reference);
 | |
| 
 | |
|     using KeySet = PDFFlatMap<PDFObjectReference, 2>;
 | |
| 
 | |
|     /// This function fetches object, if it is needed
 | |
|     std::function<PDFObject(PDFParsingContext*, PDFObjectReference)> m_objectFetcher;
 | |
| 
 | |
|     /// Set containing objects currently being parsed.
 | |
|     KeySet m_activeParsedObjectSet;
 | |
| };
 | |
| 
 | |
| /// Class for parsing objects. Checks cyclical references. If
 | |
| /// the object cannot be obtained from the stream, exception is thrown.
 | |
| class Pdf4QtLIBSHARED_EXPORT PDFParser
 | |
| {
 | |
|     Q_DECLARE_TR_FUNCTIONS(pdf::PDFParser)
 | |
| 
 | |
| public:
 | |
|     enum Feature
 | |
|     {
 | |
|         None            = 0x0000,
 | |
|         AllowStreams    = 0x0001
 | |
|     };
 | |
| 
 | |
|     Q_DECLARE_FLAGS(Features, Feature)
 | |
| 
 | |
|     explicit PDFParser(const QByteArray& data, PDFParsingContext* context, Features features);
 | |
|     explicit PDFParser(const char* begin, const char* end, PDFParsingContext* context, Features features);
 | |
|     explicit PDFParser(std::function<PDFLexicalAnalyzer::Token(void)> tokenFetcher);
 | |
| 
 | |
|     /// Fetches single object from the stream. Does not check
 | |
|     /// cyclical references. If object cannot be fetched, then
 | |
|     /// exception is thrown.
 | |
|     PDFObject getObject();
 | |
| 
 | |
|     /// Fetches single object from the stream. Performs check for
 | |
|     /// cyclical references. If object cannot be fetched, then
 | |
|     /// exception is thrown.
 | |
|     PDFObject getObject(PDFObjectReference reference);
 | |
| 
 | |
|     /// Throws an error exception
 | |
|     void error(const QString& message) const;
 | |
| 
 | |
|     /// Seeks stream from the start. If stream cannot be seeked (position is invalid),
 | |
|     /// then exception is thrown.
 | |
|     void seek(PDFInteger offset);
 | |
| 
 | |
|     /// Returns currently scanned token
 | |
|     const PDFLexicalAnalyzer::Token& lookahead() const { return m_lookAhead1; }
 | |
| 
 | |
|     /// If current token is a command with same string, then eat this command
 | |
|     /// and return true. Otherwise do nothing and return false.
 | |
|     /// \param command Command to be fetched
 | |
|     bool fetchCommand(const char* command);
 | |
| 
 | |
| private:
 | |
|     void shift();
 | |
| 
 | |
|     PDFLexicalAnalyzer::Token fetch();
 | |
| 
 | |
|     /// Functor for fetching tokens
 | |
|     std::function<PDFLexicalAnalyzer::Token(void)> m_tokenFetcher;
 | |
| 
 | |
|     /// Parsing context (multiple parsers can share it)
 | |
|     PDFParsingContext* m_context;
 | |
| 
 | |
|     /// Enabled features
 | |
|     Features m_features;
 | |
| 
 | |
|     /// Lexical analyzer for scanning tokens
 | |
|     PDFLexicalAnalyzer m_lexicalAnalyzer;
 | |
| 
 | |
|     PDFLexicalAnalyzer::Token m_lookAhead1;
 | |
|     PDFLexicalAnalyzer::Token m_lookAhead2;
 | |
| };
 | |
| 
 | |
| // Implementation
 | |
| 
 | |
| inline
 | |
| constexpr bool PDFLexicalAnalyzer::isWhitespace(char character)
 | |
| {
 | |
|     switch (character)
 | |
|     {
 | |
|         case CHAR_NULL:
 | |
|         case CHAR_TAB:
 | |
|         case CHAR_LINE_FEED:
 | |
|         case CHAR_FORM_FEED:
 | |
|         case CHAR_CARRIAGE_RETURN:
 | |
|         case CHAR_SPACE:
 | |
|             return true;
 | |
| 
 | |
|         default:
 | |
|             return false;
 | |
|     }
 | |
| }
 | |
| 
 | |
| inline
 | |
| constexpr bool PDFLexicalAnalyzer::isDelimiter(char character)
 | |
| {
 | |
|     switch (character)
 | |
|     {
 | |
|         case CHAR_LEFT_BRACKET:
 | |
|         case CHAR_RIGHT_BRACKET:
 | |
|         case CHAR_LEFT_ANGLE:
 | |
|         case CHAR_RIGHT_ANGLE:
 | |
|         case CHAR_ARRAY_START:
 | |
|         case CHAR_ARRAY_END:
 | |
|         case CHAR_LEFT_CURLY_BRACKET:
 | |
|         case CHAR_RIGHT_CURLY_BRACKET:
 | |
|         case CHAR_SLASH:
 | |
|         case CHAR_PERCENT:
 | |
|             return true;
 | |
| 
 | |
|         default:
 | |
|             return false;
 | |
|     }
 | |
| }
 | |
| 
 | |
| }   // namespace pdf
 | |
| 
 | |
| #endif // PDFPARSER_H
 |