// Copyright (C) 2018 Jakub Melka // // This file is part of PdfForQt. // // PdfForQt is free software: you can redistribute it and/or modify // it under the terms of the GNU Lesser General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // PdfForQt is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public License // along with PDFForQt. If not, see . #include "pdfparser.h" #include "pdfconstants.h" #include #include #include #include namespace pdf { PDFLexicalAnalyzer::PDFLexicalAnalyzer(const char* begin, const char* end) : m_begin(begin), m_current(begin), m_end(end) { } PDFLexicalAnalyzer::Token PDFLexicalAnalyzer::fetch() { // Skip whitespace/comments at first skipWhitespaceAndComments(); // If we are at end of token, then return immediately if (isAtEnd()) { return Token(TokenType::EndOfFile); } switch (lookChar()) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case '+': case '-': case '.': { // Scan integer or real number. If integer overflows, then it is converted to the real number. If // real number overflow, then error is reported. This behaviour is according to the PDF 1.7 specification, // chapter 3.2.2. // First, treat special characters bool positive = fetchChar('+'); bool negative = fetchChar('-'); bool dot = fetchChar('.'); bool treatAsReal = dot; bool atLeastOneDigit = false; if (isAtEnd()) { error(tr("Expected a number, but end of stream reached.")); } PDFInteger integer = 0; PDFReal real = 0.0; PDFReal scale = 0.1; // Now, we can only have digits and a single dot while (!isAtEnd()) { if (!dot && fetchChar('.')) { // Entering real mode dot = true; treatAsReal = true; real = integer; } else if (std::isdigit(static_cast(lookChar()))) { atLeastOneDigit = true; PDFInteger digit = lookChar() - '0'; ++m_current; if (!treatAsReal) { // Treat value as integer integer = integer * 10 + digit; // Check, if integer has not overflown, if yes, treat him as real // according to the PDF 1.7 specification. if (!isValidInteger(integer)) { treatAsReal = true; real = integer; } } else { // Treat value as real if (!dot) { real = real * 10.0 + digit; } else { real = real + scale * digit; scale *= 0.1; } } } else if (isWhitespace(lookChar()) || isDelimiter(lookChar())) { // Whitespace appeared - whitespaces/delimiters delimits tokens - break break; } else { // Another character other than dot and digit appeared - this is an error error(tr("Invalid format of number. Character '%1' appeared.").arg(lookChar())); } } // Now, we have scanned whole token number, check for errors. if (positive && negative) { error(tr("Both '+' and '-' appeared in number. Invalid format of number.")); } if (!atLeastOneDigit) { error(tr("Bad format of number - no digits appeared.")); } // Check for real overflow if (treatAsReal && !std::isfinite(real)) { error(tr("Real number overflow.")); } if (negative) { integer = -integer; real = -real; } return !treatAsReal ? Token(TokenType::Integer, integer) : Token(TokenType::Real, real); } case CHAR_LEFT_BRACKET: { // String '(', sequence of literal characters enclosed in "()", see PDF 1.7 Reference, // chapter 3.2.3. Note: literal string can have properly balanced brackets inside. int parenthesisBalance = 1; QByteArray string; string.reserve(STRING_BUFFER_RESERVE); // Skip first character fetchChar(); while (true) { // Scan string, see, what next char is. const char character = fetchChar(); switch (character) { case CHAR_LEFT_BRACKET: { ++parenthesisBalance; string.push_back(character); break; } case CHAR_RIGHT_BRACKET: { if (--parenthesisBalance == 0) { // We are done. return Token(TokenType::String, string); } else { string.push_back(character); } break; } case CHAR_BACKSLASH: { // Escape sequence. Check, what it means. Possible values are in PDF 1.7 Reference, // chapter 3.2.3, Table 3.2 - Escape Sequence in Literal Strings const char escaped = fetchChar(); switch (escaped) { case 'n': { string += '\n'; break; } case 'r': { string += '\r'; break; } case 't': { string += '\t'; break; } case 'b': { string += '\b'; break; } case 'f': { string += '\f'; break; } case '\\': case '(': case ')': { string += escaped; break; } case '\n': { // Nothing done here, EOL is not part of the string, because it was escaped break; } case '\r': { // Skip EOL fetchChar('\n'); break; } default: { // Undo fetch char, we do not want to miss first digit --m_current; // Try to scan octal value. Octal number can have 3 digits in this case. // According to specification, overflow value can be truncated. int octalNumber = -1; if (fetchOctalNumber(3, &octalNumber)) { string += static_cast(octalNumber); } else { error(tr("Expected octal number with 1-3 digits.")); } break; } } break; } default: { // Normal character string.push_back(character); break; } } } // This code should be unreachable. Either normal string is scanned - then it is returned // in the while cycle above, or exception is thrown. Q_ASSERT(false); return Token(TokenType::EndOfFile); } case CHAR_SLASH: { // Name object. According to the PDF Reference 1.7, chapter 3.2.4 name object can have zero length, // and can contain #XX characters, where XX is hexadecimal number. fetchChar(); QByteArray name; name.reserve(NAME_BUFFER_RESERVE); while (!isAtEnd()) { if (fetchChar(CHAR_MARK)) { const char hexHighCharacter = fetchChar(); const char hexLowCharacter = fetchChar(); if (isHexCharacter(hexHighCharacter) && isHexCharacter(hexLowCharacter)) { name += QByteArray::fromHex(QByteArray::fromRawData(m_current - 2, 2)); } else { // Throw an error - hexadecimal number is expected. error(tr("Hexadecimal number must follow character '#' in the name.")); } continue; } // Now, we have other character, than '#', if it is a regular character, // then add it to the name, otherwise end scanning. const char character = lookChar(); if (isRegular(character)) { name += character; ++m_current; } else { // Matched non-regular character - end of name. break; } } return Token(TokenType::Name, std::move(name)); } case CHAR_ARRAY_START: { ++m_current; return Token(TokenType::ArrayStart); } case CHAR_ARRAY_END: { ++m_current; return Token(TokenType::ArrayEnd); } case CHAR_LEFT_ANGLE: { ++m_current; // Check if it is dictionary start if (fetchChar(CHAR_LEFT_ANGLE)) { return Token(TokenType::DictionaryStart); } else { // Reserve two times normal size, because in hexadecimal string, each character // is represented as a pair of hexadecimal numbers. QByteArray hexadecimalString; hexadecimalString.reserve(STRING_BUFFER_RESERVE * 2); // Scan hexadecimal string while (!isAtEnd()) { const char character = fetchChar(); if (isHexCharacter(character)) { hexadecimalString += character; } else if (character == CHAR_RIGHT_ANGLE) { // End of string mark. According to the specification, string can contain odd number // of hexadecimal digits, in this case, zero is appended to the string. if (hexadecimalString.size() % 2 == 1) { hexadecimalString += '0'; } QByteArray decodedString = QByteArray::fromHex(hexadecimalString); return Token(TokenType::String, std::move(decodedString)); } else { // This is unexpected. Invalid character in hexadecimal string. error(tr("Invalid character in hexadecimal string.")); } } error(tr("Unexpected end of stream reached while scanning hexadecimal string.")); } break; } case CHAR_RIGHT_ANGLE: { // This must be a mark of dictionary end, because in other way, we should reach end of // string in the code above. ++m_current; if (fetchChar(CHAR_RIGHT_ANGLE)) { return Token(TokenType::DictionaryEnd); } error(tr("Invalid character '%1'").arg(CHAR_RIGHT_ANGLE)); break; } default: { // Now, we have skipped whitespaces. So actual character must be either regular, or it is special. // We have treated all special characters above. For this reason, if we match special character, // then we report an error. Q_ASSERT(!isWhitespace(lookChar())); if (isRegular(lookChar())) { // It should be sequence of regular characters - command, true, false, null... QByteArray command; command.reserve(COMMAND_BUFFER_RESERVE); while (!isAtEnd() && isRegular(lookChar())) { command += fetchChar(); } if (command == BOOL_OBJECT_TRUE_STRING) { return Token(TokenType::Boolean, true); } else if (command == BOOL_OBJECT_FALSE_STRING) { return Token(TokenType::Boolean, false); } else if (command == NULL_OBJECT_STRING) { return Token(TokenType::Null); } else { return Token(TokenType::Command, std::move(command)); } } else { error(tr("Unexpected character '%1' in the stream.").arg(lookChar())); } break; } } return Token(TokenType::EndOfFile); } void PDFLexicalAnalyzer::seek(PDFInteger offset) { const PDFInteger limit = std::distance(m_begin, m_end); if (offset >= 0 && offset < limit) { m_current = std::next(m_begin, offset); } else { error(tr("Trying to seek stream position to %1 bytes from the start, byte offset is invalid.").arg(offset)); } } void PDFLexicalAnalyzer::skipWhitespaceAndComments() { bool isComment = false; while (m_current != m_end) { if (isComment) { // Comment ends at end of line if (*m_current == CHAR_CARRIAGE_RETURN || *m_current == CHAR_LINE_FEED) { isComment = false; } // Commented character - step to the next character ++m_current; } else if (*m_current == CHAR_PERCENT) { isComment = true; ++m_current; } else if (isWhitespace(*m_current)) { ++m_current; } else { // Not a whitespace and not in comment break; } } } void PDFLexicalAnalyzer::skipStreamStart() { // According to the PDF Reference 1.7, chapter 3.2.7, after the 'stream' keyword, // either carriage return + line feed, or just line feed can appear. Eat them. fetchChar(CHAR_CARRIAGE_RETURN); fetchChar(CHAR_LINE_FEED); } QByteArray PDFLexicalAnalyzer::fetchByteArray(PDFInteger length) { Q_ASSERT(length >= 0); if (std::distance(m_current, m_end) < length) { error(tr("Can't read %1 bytes from the input stream. Input stream end reached.").arg(length)); } QByteArray result(m_current, length); std::advance(m_current, length); return result; } QString PDFLexicalAnalyzer::getStringFromOperandType(TokenType type) { QMetaEnum metaEnum = QMetaEnum::fromType(); Q_ASSERT(metaEnum.isValid()); const char* typeName = metaEnum.valueToKey(static_cast(type)); Q_ASSERT(typeName); return typeName; } bool PDFLexicalAnalyzer::fetchChar(const char character) { if (!isAtEnd() && lookChar() == character) { ++m_current; return true; } return false; } char PDFLexicalAnalyzer::fetchChar() { if (!isAtEnd()) { return *m_current++; } error(tr("Unexpected end of stream reached.")); return 0; } bool PDFLexicalAnalyzer::fetchOctalNumber(int maxDigits, int* output) { Q_ASSERT(output); *output = 0; int fetchedNumbers = 0; while (!isAtEnd() && fetchedNumbers < maxDigits) { const char c = lookChar(); if (c >= '0' && c <= '7') { // Valid octal characters const int number = c - '0'; *output = *output * 8 + number; ++m_current; ++fetchedNumbers; } else { // Non-octal character reached break; } } return fetchedNumbers >= 1; } constexpr bool PDFLexicalAnalyzer::isHexCharacter(const char character) { return (character >= '0' && character <= '9') || (character >= 'A' && character <= 'F') || (character >= 'a' && character <= 'f'); } void PDFLexicalAnalyzer::error(const QString& message) const { std::size_t distance = std::distance(m_begin, m_current); throw PDFParserException(tr("Error near position %1. %2").arg(distance).arg(message)); } PDFObject PDFParsingContext::getObject(const PDFObject& object) { if (object.isReference()) { Q_ASSERT(m_objectFetcher); return m_objectFetcher(this, object.getReference()); } return object; } void PDFParsingContext::beginParsingObject(PDFObjectReference reference) { if (m_activeParsedObjectSet.search(reference)) { throw PDFParserException(tr("Cyclical reference found while parsing object %1 %2.").arg(reference.objectNumber).arg(reference.generation)); } else { m_activeParsedObjectSet.insert(reference); } } void PDFParsingContext::endParsingObject(PDFObjectReference reference) { Q_ASSERT(m_activeParsedObjectSet.search(reference)); m_activeParsedObjectSet.erase(reference); } PDFParser::PDFParser(const QByteArray& data, PDFParsingContext* context, Features features) : m_context(context), m_lexicalAnalyzer(data.constData(), data.constData() + data.size()), m_features(features) { m_lookAhead1 = m_lexicalAnalyzer.fetch(); m_lookAhead2 = m_lexicalAnalyzer.fetch(); } PDFParser::PDFParser(const char* begin, const char* end, PDFParsingContext* context, Features features) : m_context(context), m_lexicalAnalyzer(begin, end), m_features(features) { m_lookAhead1 = m_lexicalAnalyzer.fetch(); m_lookAhead2 = m_lexicalAnalyzer.fetch(); } PDFObject PDFParser::getObject() { switch (m_lookAhead1.type) { case PDFLexicalAnalyzer::TokenType::Boolean: { Q_ASSERT(m_lookAhead1.data.type() == QVariant::Bool); const bool value = m_lookAhead1.data.toBool(); shift(); return PDFObject::createBool(value); } case PDFLexicalAnalyzer::TokenType::Integer: { Q_ASSERT(m_lookAhead1.data.type() == QVariant::LongLong); const PDFInteger value = m_lookAhead1.data.toLongLong(); shift(); // We must check, if we are reading reference. In this case, // actual value is integer and next value is command "R". if (m_lookAhead1.type == PDFLexicalAnalyzer::TokenType::Integer && m_lookAhead2.type == PDFLexicalAnalyzer::TokenType::Command && m_lookAhead2.data.toByteArray() == PDF_REFERENCE_COMMAND) { Q_ASSERT(m_lookAhead1.data.type() == QVariant::LongLong); const PDFInteger generation = m_lookAhead1.data.toLongLong(); shift(); shift(); return PDFObject::createReference(PDFObjectReference(value, generation)); } else { // Just normal integer return PDFObject::createInteger(value); } } case PDFLexicalAnalyzer::TokenType::Real: { Q_ASSERT(m_lookAhead1.data.type() == QVariant::Double); const PDFReal value = m_lookAhead1.data.toDouble(); shift(); return PDFObject::createReal(value); } case PDFLexicalAnalyzer::TokenType::String: { Q_ASSERT(m_lookAhead1.data.type() == QVariant::ByteArray); QByteArray array = m_lookAhead1.data.toByteArray(); array.shrink_to_fit(); shift(); return PDFObject::createString(std::make_shared(std::move(array))); } case PDFLexicalAnalyzer::TokenType::Name: { Q_ASSERT(m_lookAhead1.data.type() == QVariant::ByteArray); QByteArray array = m_lookAhead1.data.toByteArray(); array.shrink_to_fit(); shift(); return PDFObject::createName(std::make_shared(std::move(array))); } case PDFLexicalAnalyzer::TokenType::ArrayStart: { shift(); // Create shared pointer to the array (if the exception is thrown, array // will be properly destroyed by the shared array destructor) std::shared_ptr arraySharedPointer = std::make_shared(); PDFArray* array = static_cast(arraySharedPointer.get()); while (m_lookAhead1.type != PDFLexicalAnalyzer::TokenType::EndOfFile && m_lookAhead1.type != PDFLexicalAnalyzer::TokenType::ArrayEnd) { array->appendItem(getObject()); } // Now, we have either end of file, or array end. If former appears, then // it is an error - error should be reported. if (m_lookAhead1.type == PDFLexicalAnalyzer::TokenType::EndOfFile) { error(tr("Stream ended inside array.")); } else { shift(); return PDFObject::createArray(std::move(arraySharedPointer)); } } case PDFLexicalAnalyzer::TokenType::DictionaryStart: { shift(); // Start reading the dictionary. BEWARE! It can also be a stream. In this case, // we must load also the stream content. std::shared_ptr dictionarySharedPointer = std::make_shared(); PDFDictionary* dictionary = dictionarySharedPointer.get(); // Now, scan key/value pairs while (m_lookAhead1.type != PDFLexicalAnalyzer::TokenType::EndOfFile && m_lookAhead1.type != PDFLexicalAnalyzer::TokenType::DictionaryEnd) { // First value should be a key if (m_lookAhead1.type != PDFLexicalAnalyzer::TokenType::Name) { error(tr("Dictionary key must be a name.")); } QByteArray key = m_lookAhead1.data.toByteArray(); shift(); // Second value should be a value PDFObject object = getObject(); dictionary->addEntry(std::move(key), std::move(object)); } // Now, we should reach dictionary end. If it is not the case, then end of stream occured. if (m_lookAhead1.type != PDFLexicalAnalyzer::TokenType::DictionaryEnd) { error(tr("End of stream inside dictionary reached.")); } // Is it a content stream? if (m_lookAhead2.type == PDFLexicalAnalyzer::TokenType::Command && m_lookAhead2.data.toByteArray() == PDF_STREAM_START_COMMAND) { if (!m_features.testFlag(AllowStreams)) { error(tr("Streams are not allowed in this context.")); } // Read stream content. According to the PDF Reference 1.7, chapter 3.2.7, stream // content can be placed in the file. If this is the case, then try to load file // content in the memory. But even in this case, stream content should be skipped. if (!dictionary->hasKey(PDF_STREAM_DICT_LENGTH)) { error(tr("Stream length is not specified.")); } PDFObject lengthObject = m_context->getObject(dictionary->get(PDF_STREAM_DICT_LENGTH)); if (!lengthObject.isInt()) { error(tr("Bad value of stream length. It should be an integer number.")); } PDFInteger length = lengthObject.getInteger(); if (length < 0) { error(tr("Length of the stream buffer is negative (%1). It must be a positive number.").arg(length)); } // Skip the stream start, then fetch data of the stream m_lexicalAnalyzer.skipStreamStart(); QByteArray buffer = m_lexicalAnalyzer.fetchByteArray(length); // According to the PDF Reference 1.7, chapter 3.2.7, stream content can also be specified // in the external file. If this is the case, then we must try to load the stream data // from the external file. if (dictionary->hasKey(PDF_STREAM_DICT_FILE_SPECIFICATION)) { PDFObject fileName = m_context->getObject(dictionary->get(PDF_STREAM_DICT_FILE_SPECIFICATION)); if (!fileName.isString()) { error(tr("Stream data should be in external file, but invalid file name is specified.")); } QFile streamDataFile(fileName.getString()); if (streamDataFile.open(QFile::ReadOnly)) { buffer = streamDataFile.readAll(); streamDataFile.close(); } else { error(tr("Can't open stream data stored in external file '%1'.").arg(QString(fileName.getString()))); } } // Refill lookahead tokens m_lookAhead1 = m_lexicalAnalyzer.fetch(); m_lookAhead2 = m_lexicalAnalyzer.fetch(); if (m_lookAhead1.type == PDFLexicalAnalyzer::TokenType::Command && m_lookAhead1.data.toByteArray() == PDF_STREAM_END_COMMAND) { // Everything OK, just advance and return stream object shift(); return PDFObject::createStream(std::make_shared(std::move(*dictionary), std::move(buffer))); } else { error(tr("End of stream should end in keyword 'endstream'.")); } } else { // Just shift (eat dictionary end) and return dictionary shift(); return PDFObject::createDictionary(std::move(dictionarySharedPointer)); } } case PDFLexicalAnalyzer::TokenType::Null: { shift(); return PDFObject::createNull(); } case PDFLexicalAnalyzer::TokenType::ArrayEnd: case PDFLexicalAnalyzer::TokenType::DictionaryEnd: case PDFLexicalAnalyzer::TokenType::Command: { error(tr("Cannot read object. Unexpected token appeared.")); break; } case PDFLexicalAnalyzer::TokenType::EndOfFile: { error(tr("Cannot read object. End of stream reached.")); break; } } // This code should be unreachable. All values should be handled in the switch above. Q_ASSERT(false); return PDFObject::createNull(); } PDFObject PDFParser::getObject(PDFObjectReference reference) { PDFParsingContext::PDFParsingContextGuard guard(m_context, reference); return getObject(); } void PDFParser::error(const QString& message) const { throw new PDFParserException(message); } void PDFParser::seek(PDFInteger offset) { m_lexicalAnalyzer.seek(offset); // We must read lookahead symbols, because we invalidated them m_lookAhead1 = m_lexicalAnalyzer.fetch(); m_lookAhead2 = m_lexicalAnalyzer.fetch(); } bool PDFParser::fetchCommand(const char* command) { if (m_lookAhead1.type == PDFLexicalAnalyzer::TokenType::Command && m_lookAhead1.data.toByteArray() == command) { shift(); return true; } return false; } void PDFParser::shift() { m_lookAhead1 = std::move(m_lookAhead2); m_lookAhead2 = m_lexicalAnalyzer.fetch(); } } // namespace pdf