2020-01-18 11:38:54 +01:00
|
|
|
// Copyright (C) 2018-2020 Jakub Melka
|
2018-11-17 16:48:30 +01:00
|
|
|
//
|
|
|
|
// This file is part of PdfForQt.
|
|
|
|
//
|
|
|
|
// PdfForQt is free software: you can redistribute it and/or modify
|
|
|
|
// it under the terms of the GNU Lesser General Public License as published by
|
|
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
|
|
// (at your option) any later version.
|
|
|
|
//
|
|
|
|
// PdfForQt is distributed in the hope that it will be useful,
|
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
// GNU Lesser General Public License for more details.
|
|
|
|
//
|
|
|
|
// You should have received a copy of the GNU Lesser General Public License
|
|
|
|
//    along with PdfForQt. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
|
|
|
|
|
|
|
|
#include "pdfparser.h"
|
|
|
|
#include "pdfconstants.h"
|
2019-04-29 17:03:19 +02:00
|
|
|
#include "pdfexception.h"
|
2018-11-17 16:48:30 +01:00
|
|
|
|
|
|
|
#include <QFile>
|
2018-11-25 14:48:08 +01:00
|
|
|
#include <QThread>
|
2018-11-17 16:48:30 +01:00
|
|
|
|
|
|
|
#include <cctype>
|
|
|
|
#include <memory>
|
|
|
|
|
|
|
|
namespace pdf
|
|
|
|
{
|
|
|
|
|
|
|
|
PDFLexicalAnalyzer::PDFLexicalAnalyzer(const char* begin, const char* end) :
|
|
|
|
m_begin(begin),
|
|
|
|
m_current(begin),
|
|
|
|
m_end(end)
|
|
|
|
{
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
// Scans the next token from the current position and returns it.
//
// Leading whitespace and comments are skipped first. When the end of the
// input is reached, a token of type EndOfFile is returned. Otherwise the
// first character decides what is scanned:
//   - digit, '+', '-', '.'  -> Integer or Real token
//   - '('                   -> String token (literal string)
//   - '/'                   -> Name token
//   - '[' / ']'             -> ArrayStart / ArrayEnd token
//   - '<<' / '>>'           -> DictionaryStart / DictionaryEnd token
//   - '<'                   -> String token (hexadecimal string)
//   - regular character     -> Boolean, Null or Command token
//
// Malformed input is reported via error(), which throws PDFException.
PDFLexicalAnalyzer::Token PDFLexicalAnalyzer::fetch()
{
    // Skip whitespace/comments at first
    skipWhitespaceAndComments();

    // If we are at the end of the stream, then return immediately
    if (isAtEnd())
    {
        return Token(TokenType::EndOfFile);
    }

    switch (lookChar())
    {
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
        case '+':
        case '-':
        case '.':
        {
            // Scan integer or real number. If integer overflows, then it is converted to the real number. If
            // real number overflows, then error is reported. This behaviour is according to the PDF 1.7 specification,
            // chapter 3.2.2.

            // First, treat special characters (optional sign and optional leading dot)
            bool positive = fetchChar('+');
            bool negative = fetchChar('-');
            bool dot = fetchChar('.');
            bool treatAsReal = dot;
            bool atLeastOneDigit = false;

            if (isAtEnd())
            {
                error(tr("Expected a number, but end of stream reached."));
            }

            // 'integer' accumulates the value while we are in integer mode, 'real' accumulates
            // it in real mode, 'scale' is the weight of the next digit after the decimal dot.
            PDFInteger integer = 0;
            PDFReal real = 0.0;
            PDFReal scale = 0.1;

            // Now, we can only have digits and a single dot
            while (!isAtEnd())
            {
                if (!dot && fetchChar('.'))
                {
                    // Entering fractional part. The real accumulator is seeded from the integer
                    // accumulator only if we are still in integer mode. If the integer has overflown
                    // before, then 'real' already holds the complete value and 'integer' is stale
                    // (it stopped being updated at the overflow), so it must not overwrite 'real'.
                    if (!treatAsReal)
                    {
                        real = integer;
                    }

                    dot = true;
                    treatAsReal = true;
                }
                else if (std::isdigit(static_cast<unsigned char>(lookChar())))
                {
                    atLeastOneDigit = true;
                    PDFInteger digit = lookChar() - '0';
                    ++m_current;

                    if (!treatAsReal)
                    {
                        // Treat value as integer
                        integer = integer * 10 + digit;

                        // Check, if the integer has not overflown; if yes, treat it as real
                        // according to the PDF 1.7 specification.
                        if (!isValidInteger(integer))
                        {
                            treatAsReal = true;
                            real = integer;
                        }
                    }
                    else
                    {
                        // Treat value as real
                        if (!dot)
                        {
                            // Still in front of the decimal dot (integer overflow happened)
                            real = real * 10.0 + digit;
                        }
                        else
                        {
                            // Behind the decimal dot
                            real = real + scale * digit;
                            scale *= 0.1;
                        }
                    }
                }
                else if (isWhitespace(lookChar()) || isDelimiter(lookChar()))
                {
                    // Whitespace/delimiter appeared - whitespaces/delimiters delimit tokens - break
                    break;
                }
                else
                {
                    // Another character other than dot and digit appeared - this is an error
                    error(tr("Invalid format of number. Character '%1' appeared.").arg(lookChar()));
                }
            }

            // Now, we have scanned whole token number, check for errors.
            if (positive && negative)
            {
                error(tr("Both '+' and '-' appeared in number. Invalid format of number."));
            }

            if (!atLeastOneDigit)
            {
                error(tr("Bad format of number - no digits appeared."));
            }

            // Check for real overflow
            if (treatAsReal && !std::isfinite(real))
            {
                error(tr("Real number overflow."));
            }

            if (negative)
            {
                integer = -integer;
                real = -real;
            }

            return !treatAsReal ? Token(TokenType::Integer, integer) : Token(TokenType::Real, real);
        }

        case CHAR_LEFT_BRACKET:
        {
            // String '(', sequence of literal characters enclosed in "()", see PDF 1.7 Reference,
            // chapter 3.2.3. Note: literal string can have properly balanced brackets inside.

            int parenthesisBalance = 1;
            QByteArray string;
            string.reserve(STRING_BUFFER_RESERVE);

            // Skip first character (the opening bracket)
            fetchChar();

            while (true)
            {
                // Scan string, see, what next char is. fetchChar() reports an error,
                // if the end of the stream is reached inside the string.
                const char character = fetchChar();
                switch (character)
                {
                    case CHAR_LEFT_BRACKET:
                    {
                        ++parenthesisBalance;
                        string.push_back(character);
                        break;
                    }

                    case CHAR_RIGHT_BRACKET:
                    {
                        if (--parenthesisBalance == 0)
                        {
                            // We are done.
                            return Token(TokenType::String, string);
                        }
                        else
                        {
                            // Closing bracket of a nested (balanced) pair is part of the string
                            string.push_back(character);
                        }
                        break;
                    }

                    case CHAR_BACKSLASH:
                    {
                        // Escape sequence. Check, what it means. Possible values are in PDF 1.7 Reference,
                        // chapter 3.2.3, Table 3.2 - Escape Sequence in Literal Strings
                        const char escaped = fetchChar();
                        switch (escaped)
                        {
                            case 'n':
                            {
                                string += '\n';
                                break;
                            }

                            case 'r':
                            {
                                string += '\r';
                                break;
                            }

                            case 't':
                            {
                                string += '\t';
                                break;
                            }

                            case 'b':
                            {
                                string += '\b';
                                break;
                            }

                            case 'f':
                            {
                                string += '\f';
                                break;
                            }

                            case '\\':
                            case '(':
                            case ')':
                            {
                                string += escaped;
                                break;
                            }

                            case '\n':
                            {
                                // Nothing done here, EOL is not part of the string, because it was escaped
                                break;
                            }

                            case '\r':
                            {
                                // Skip EOL (the line feed of an escaped CR+LF pair, if present)
                                fetchChar('\n');
                                break;
                            }

                            default:
                            {
                                // Undo fetch char, we do not want to miss first digit
                                --m_current;

                                // Try to scan octal value. Octal number can have 3 digits in this case.
                                // According to specification, overflow value can be truncated.
                                int octalNumber = -1;
                                if (fetchOctalNumber(3, &octalNumber))
                                {
                                    string += static_cast<const char>(octalNumber);
                                }
                                else
                                {
                                    error(tr("Expected octal number with 1-3 digits."));
                                }

                                break;
                            }
                        }

                        break;
                    }

                    default:
                    {
                        // Normal character
                        string.push_back(character);
                        break;
                    }
                }
            }

            // This code should be unreachable. Either normal string is scanned - then it is returned
            // in the while cycle above, or exception is thrown.
            Q_ASSERT(false);
            return Token(TokenType::EndOfFile);
        }

        case CHAR_SLASH:
        {
            // Name object. According to the PDF Reference 1.7, chapter 3.2.4 name object can have zero length,
            // and can contain #XX characters, where XX is hexadecimal number.

            // Skip the slash
            fetchChar();

            QByteArray name;
            name.reserve(NAME_BUFFER_RESERVE);

            while (!isAtEnd())
            {
                if (fetchChar(CHAR_MARK))
                {
                    const char hexHighCharacter = fetchChar();
                    const char hexLowCharacter = fetchChar();

                    if (isHexCharacter(hexHighCharacter) && isHexCharacter(hexLowCharacter))
                    {
                        // The two hexadecimal digits were just consumed, so they are located
                        // right before the current position.
                        name += QByteArray::fromHex(QByteArray::fromRawData(m_current - 2, 2));
                    }
                    else
                    {
                        // Throw an error - hexadecimal number is expected.
                        error(tr("Hexadecimal number must follow character '#' in the name."));
                    }

                    continue;
                }

                // Now, we have other character, than '#', if it is a regular character,
                // then add it to the name, otherwise end scanning.
                const char character = lookChar();

                if (isRegular(character))
                {
                    name += character;
                    ++m_current;
                }
                else
                {
                    // Matched non-regular character - end of name.
                    break;
                }
            }

            return Token(TokenType::Name, std::move(name));
        }

        case CHAR_ARRAY_START:
        {
            ++m_current;
            return Token(TokenType::ArrayStart);
        }

        case CHAR_ARRAY_END:
        {
            ++m_current;
            return Token(TokenType::ArrayEnd);
        }

        case CHAR_LEFT_ANGLE:
        {
            ++m_current;

            // Check if it is dictionary start
            if (fetchChar(CHAR_LEFT_ANGLE))
            {
                return Token(TokenType::DictionaryStart);
            }
            else
            {
                // Reserve two times normal size, because in hexadecimal string, each character
                // is represented as a pair of hexadecimal numbers.
                QByteArray hexadecimalString;
                hexadecimalString.reserve(STRING_BUFFER_RESERVE * 2);

                // Scan hexadecimal string
                while (!isAtEnd())
                {
                    const char character = fetchChar();
                    if (isHexCharacter(character))
                    {
                        hexadecimalString += character;
                    }
                    else if (character == CHAR_RIGHT_ANGLE)
                    {
                        // End of string mark. According to the specification, string can contain odd number
                        // of hexadecimal digits, in this case, zero is appended to the string.
                        if (hexadecimalString.size() % 2 == 1)
                        {
                            hexadecimalString += '0';
                        }

                        QByteArray decodedString = QByteArray::fromHex(hexadecimalString);
                        return Token(TokenType::String, std::move(decodedString));
                    }
                    else if (isWhitespace(character))
                    {
                        // Whitespace characters inside a hexadecimal string are ignored,
                        // see PDF 1.7 Reference, chapter 3.2.3 (Hexadecimal Strings).
                        continue;
                    }
                    else
                    {
                        // This is unexpected. Invalid character in hexadecimal string.
                        error(tr("Invalid character in hexadecimal string."));
                    }
                }

                error(tr("Unexpected end of stream reached while scanning hexadecimal string."));
            }

            break;
        }

        case CHAR_RIGHT_ANGLE:
        {
            // This must be a mark of dictionary end, because in other way, we should reach end of
            // string in the code above.
            ++m_current;

            if (fetchChar(CHAR_RIGHT_ANGLE))
            {
                return Token(TokenType::DictionaryEnd);
            }

            error(tr("Invalid character '%1'").arg(CHAR_RIGHT_ANGLE));
            break;
        }

        default:
        {
            // Now, we have skipped whitespaces. So actual character must be either regular, or it is special.
            // We have treated all special characters above. For this reason, if we match special character,
            // then we report an error.
            Q_ASSERT(!isWhitespace(lookChar()));

            if (isRegular(lookChar()))
            {
                // It should be sequence of regular characters - command, true, false, null...
                QByteArray command;
                command.reserve(COMMAND_BUFFER_RESERVE);

                while (!isAtEnd() && isRegular(lookChar()))
                {
                    command += fetchChar();
                }

                if (command == BOOL_OBJECT_TRUE_STRING)
                {
                    return Token(TokenType::Boolean, true);
                }
                else if (command == BOOL_OBJECT_FALSE_STRING)
                {
                    return Token(TokenType::Boolean, false);
                }
                else if (command == NULL_OBJECT_STRING)
                {
                    return Token(TokenType::Null);
                }
                else
                {
                    return Token(TokenType::Command, std::move(command));
                }
            }
            else
            {
                error(tr("Unexpected character '%1' in the stream.").arg(lookChar()));
            }

            break;
        }
    }

    // Reached only if error() returned without throwing, which it never does.
    return Token(TokenType::EndOfFile);
}
|
|
|
|
|
2018-11-21 19:30:15 +01:00
|
|
|
// Moves the read position to 'offset' bytes from the start of the data.
// Valid offsets are 0 .. size of the data (inclusive, so that seeking to the
// very end is allowed). An invalid offset is reported via error().
void PDFLexicalAnalyzer::seek(PDFInteger offset)
{
    const PDFInteger dataSize = std::distance(m_begin, m_end);
    const bool isOutOfRange = offset < 0 || offset > dataSize;

    if (isOutOfRange)
    {
        error(tr("Trying to seek stream position to %1 bytes from the start, byte offset is invalid.").arg(offset));
    }
    else
    {
        m_current = std::next(m_begin, offset);
    }
}
|
|
|
|
|
2018-11-17 16:48:30 +01:00
|
|
|
void PDFLexicalAnalyzer::skipWhitespaceAndComments()
|
|
|
|
{
|
|
|
|
bool isComment = false;
|
|
|
|
|
|
|
|
while (m_current != m_end)
|
|
|
|
{
|
|
|
|
if (isComment)
|
|
|
|
{
|
|
|
|
// Comment ends at end of line
|
|
|
|
if (*m_current == CHAR_CARRIAGE_RETURN || *m_current == CHAR_LINE_FEED)
|
|
|
|
{
|
|
|
|
isComment = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Commented character - step to the next character
|
|
|
|
++m_current;
|
|
|
|
}
|
|
|
|
else if (*m_current == CHAR_PERCENT)
|
|
|
|
{
|
|
|
|
isComment = true;
|
|
|
|
++m_current;
|
|
|
|
}
|
|
|
|
else if (isWhitespace(*m_current))
|
|
|
|
{
|
|
|
|
++m_current;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
// Not a whitespace and not in comment
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void PDFLexicalAnalyzer::skipStreamStart()
|
|
|
|
{
|
|
|
|
// According to the PDF Reference 1.7, chapter 3.2.7, after the 'stream' keyword,
|
|
|
|
// either carriage return + line feed, or just line feed can appear. Eat them.
|
|
|
|
fetchChar(CHAR_CARRIAGE_RETURN);
|
|
|
|
fetchChar(CHAR_LINE_FEED);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Reads exactly 'length' bytes from the current position, advances the read
// position behind them and returns them as a byte array (deep copy).
// If fewer than 'length' bytes remain, an error is reported via error().
// 'length' must not be negative.
QByteArray PDFLexicalAnalyzer::fetchByteArray(PDFInteger length)
{
    Q_ASSERT(length >= 0);

    const auto remainingBytes = std::distance(m_current, m_end);
    if (remainingBytes < length)
    {
        error(tr("Can't read %1 bytes from the input stream. Input stream end reached.").arg(length));
    }

    QByteArray chunk(m_current, length);
    m_current += length;
    return chunk;
}
|
|
|
|
|
2019-07-21 17:31:39 +02:00
|
|
|
// Searches the whole data (not only the part behind the current read
// position) for the zero-terminated string 'str', starting at byte offset
// 'position'. Returns the offset of the first occurrence measured from the
// start of the data, or -1 if 'position' is out of range or no occurrence
// is found.
PDFInteger PDFLexicalAnalyzer::findSubstring(const char* str, PDFInteger position) const
{
    const PDFInteger dataLength = std::distance(m_begin, m_end);
    const bool isPositionValid = position >= 0 && position < dataLength;
    if (!isPositionValid)
    {
        return -1;
    }

    const PDFInteger needleLength = qstrlen(str);

    // Last offset at which the whole needle still fits into the data
    const PDFInteger lastCandidate = dataLength - needleLength;

    PDFInteger candidate = position;
    while (candidate <= lastCandidate)
    {
        Q_ASSERT(std::distance(m_begin + candidate + needleLength - 1, m_end) >= 0);

        if (memcmp(m_begin + candidate, str, needleLength) == 0)
        {
            return candidate;
        }

        ++candidate;
    }

    return -1;
}
|
|
|
|
|
2019-02-14 19:45:07 +01:00
|
|
|
// Returns the name of the enumerator 'type' as registered in the Qt meta
// object system for the TokenType enum.
QString PDFLexicalAnalyzer::getStringFromOperandType(TokenType type)
{
    const QMetaEnum tokenTypeEnum = QMetaEnum::fromType<TokenType>();
    Q_ASSERT(tokenTypeEnum.isValid());

    const char* enumeratorName = tokenTypeEnum.valueToKey(static_cast<int>(type));
    Q_ASSERT(enumeratorName);

    return QString(enumeratorName);
}
|
|
|
|
|
2018-11-17 16:48:30 +01:00
|
|
|
bool PDFLexicalAnalyzer::fetchChar(const char character)
|
|
|
|
{
|
|
|
|
if (!isAtEnd() && lookChar() == character)
|
|
|
|
{
|
|
|
|
++m_current;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
char PDFLexicalAnalyzer::fetchChar()
|
|
|
|
{
|
|
|
|
if (!isAtEnd())
|
|
|
|
{
|
|
|
|
return *m_current++;
|
|
|
|
}
|
|
|
|
|
|
|
|
error(tr("Unexpected end of stream reached."));
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool PDFLexicalAnalyzer::fetchOctalNumber(int maxDigits, int* output)
|
|
|
|
{
|
|
|
|
Q_ASSERT(output);
|
|
|
|
|
|
|
|
*output = 0;
|
|
|
|
int fetchedNumbers = 0;
|
|
|
|
|
|
|
|
while (!isAtEnd() && fetchedNumbers < maxDigits)
|
|
|
|
{
|
|
|
|
const char c = lookChar();
|
|
|
|
if (c >= '0' && c <= '7')
|
|
|
|
{
|
|
|
|
// Valid octal characters
|
|
|
|
const int number = c - '0';
|
|
|
|
*output = *output * 8 + number;
|
|
|
|
++m_current;
|
|
|
|
++fetchedNumbers;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
// Non-octal character reached
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return fetchedNumbers >= 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns true if 'character' is a hexadecimal digit, i.e. one of
// '0'-'9', 'A'-'F' or 'a'-'f'.
constexpr bool PDFLexicalAnalyzer::isHexCharacter(const char character)
{
    return ('0' <= character && character <= '9') ||
           ('A' <= character && character <= 'F') ||
           ('a' <= character && character <= 'f');
}
|
|
|
|
|
|
|
|
void PDFLexicalAnalyzer::error(const QString& message) const
|
|
|
|
{
|
2018-11-25 14:48:08 +01:00
|
|
|
std::size_t distance = std::distance(m_begin, m_current);
|
2019-09-27 18:41:56 +02:00
|
|
|
throw PDFException(tr("Error near position %1. %2").arg(distance).arg(message));
|
2018-11-17 16:48:30 +01:00
|
|
|
}
|
|
|
|
|
2018-11-25 17:57:39 +01:00
|
|
|
// Resolves 'object': if it is a reference, the referenced object is obtained
// from the object fetcher function; any other object is returned unchanged.
PDFObject PDFParsingContext::getObject(const PDFObject& object)
{
    if (!object.isReference())
    {
        // Not a reference - nothing to resolve
        return object;
    }

    Q_ASSERT(m_objectFetcher);
    return m_objectFetcher(this, object.getReference());
}
|
|
|
|
|
|
|
|
// Marks 'reference' as an object which is currently being parsed. If it is
// already marked (i.e. it is in the set of active objects), the object
// refers to itself through some chain of references and PDFException is
// thrown.
void PDFParsingContext::beginParsingObject(PDFObjectReference reference)
{
    if (m_activeParsedObjectSet.search(reference))
    {
        throw PDFException(tr("Cyclical reference found while parsing object %1 %2.").arg(reference.objectNumber).arg(reference.generation));
    }

    m_activeParsedObjectSet.insert(reference);
}
|
|
|
|
|
|
|
|
// Removes 'reference' from the set of objects which are currently being
// parsed. The reference is expected to be present in the set (it is
// inserted there by beginParsingObject).
void PDFParsingContext::endParsingObject(PDFObjectReference reference)
{
    // Ending an object which was never started is a programming error
    Q_ASSERT(m_activeParsedObjectSet.search(reference));

    m_activeParsedObjectSet.erase(reference);
}
|
|
|
|
|
2018-11-21 19:30:15 +01:00
|
|
|
// Creates a parser over the content of 'data'. The parser reads directly
// from the buffer of the byte array (no copy is made). Construction is
// delegated to the pointer-range constructor, which also prefetches the two
// lookahead tokens.
PDFParser::PDFParser(const QByteArray& data, PDFParsingContext* context, Features features) :
    PDFParser(data.constData(), data.constData() + data.size(), context, features)
{

}
|
|
|
|
|
|
|
|
PDFParser::PDFParser(const char* begin, const char* end, PDFParsingContext* context, Features features) :
|
|
|
|
m_context(context),
|
|
|
|
m_lexicalAnalyzer(begin, end),
|
|
|
|
m_features(features)
|
2018-11-17 16:48:30 +01:00
|
|
|
{
|
2019-07-04 17:52:38 +02:00
|
|
|
m_lookAhead1 = fetch();
|
|
|
|
m_lookAhead2 = fetch();
|
|
|
|
}
|
|
|
|
|
|
|
|
PDFParser::PDFParser(std::function<PDFLexicalAnalyzer::Token ()> tokenFetcher) :
|
|
|
|
m_tokenFetcher(qMove(tokenFetcher)),
|
|
|
|
m_context(nullptr),
|
|
|
|
m_lexicalAnalyzer(nullptr, nullptr),
|
|
|
|
m_features(None)
|
|
|
|
{
|
|
|
|
m_lookAhead1 = fetch();
|
|
|
|
m_lookAhead2 = fetch();
|
2018-11-17 16:48:30 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// Parses one complete object, which starts at the first lookahead token, and
// returns it. Consumed tokens are shifted out of the lookahead. Arrays and
// dictionaries are parsed recursively; a dictionary followed by the stream
// start command is parsed as a stream (including the stream data).
// All errors are reported via error(), which throws PDFException.
PDFObject PDFParser::getObject()
{
    using TokenType = PDFLexicalAnalyzer::TokenType;

    switch (m_lookAhead1.type)
    {
        case TokenType::Boolean:
        {
            Q_ASSERT(m_lookAhead1.data.type() == QVariant::Bool);
            const bool booleanValue = m_lookAhead1.data.toBool();
            shift();
            return PDFObject::createBool(booleanValue);
        }

        case TokenType::Integer:
        {
            Q_ASSERT(m_lookAhead1.data.type() == QVariant::LongLong);
            const PDFInteger number = m_lookAhead1.data.toLongLong();
            shift();

            // An indirect reference is written as two integers followed by the reference
            // command. The first integer is already consumed, so check the next two tokens.
            const bool isReference = m_lookAhead1.type == TokenType::Integer &&
                                     m_lookAhead2.type == TokenType::Command &&
                                     m_lookAhead2.data.toByteArray() == PDF_REFERENCE_COMMAND;
            if (!isReference)
            {
                // Just normal integer
                return PDFObject::createInteger(number);
            }

            Q_ASSERT(m_lookAhead1.data.type() == QVariant::LongLong);
            const PDFInteger generation = m_lookAhead1.data.toLongLong();

            // Eat the generation number and the reference command
            shift();
            shift();
            return PDFObject::createReference(PDFObjectReference(number, generation));
        }

        case TokenType::Real:
        {
            Q_ASSERT(m_lookAhead1.data.type() == QVariant::Double);
            const PDFReal realValue = m_lookAhead1.data.toDouble();
            shift();
            return PDFObject::createReal(realValue);
        }

        case TokenType::String:
        {
            Q_ASSERT(m_lookAhead1.data.type() == QVariant::ByteArray);
            QByteArray stringData = m_lookAhead1.data.toByteArray();
            stringData.shrink_to_fit();
            shift();
            return PDFObject::createString(std::move(stringData));
        }

        case TokenType::Name:
        {
            Q_ASSERT(m_lookAhead1.data.type() == QVariant::ByteArray);
            QByteArray nameData = m_lookAhead1.data.toByteArray();
            nameData.shrink_to_fit();
            shift();
            return PDFObject::createName(std::move(nameData));
        }

        case TokenType::ArrayStart:
        {
            // Eat the array start
            shift();

            // The array is owned by a shared pointer from the very beginning, so it is
            // released properly, if an exception is thrown while parsing the items.
            std::shared_ptr<PDFObjectContent> arrayContent = std::make_shared<PDFArray>();
            PDFArray* items = static_cast<PDFArray*>(arrayContent.get());

            while (!(m_lookAhead1.type == TokenType::EndOfFile ||
                     m_lookAhead1.type == TokenType::ArrayEnd))
            {
                items->appendItem(getObject());
            }

            // The loop stops either at the array end, or at the end of the input.
            // The latter is an error.
            if (m_lookAhead1.type == TokenType::EndOfFile)
            {
                error(tr("Stream ended inside array."));
            }

            // Eat the array end
            shift();
            return PDFObject::createArray(std::move(arrayContent));
        }

        case TokenType::DictionaryStart:
        {
            // Eat the dictionary start
            shift();

            // Start reading the dictionary. BEWARE! It can also be a stream. In this case,
            // we must load also the stream content.
            std::shared_ptr<PDFDictionary> dictionaryContent = std::make_shared<PDFDictionary>();
            PDFDictionary* entries = dictionaryContent.get();

            // Scan key/value pairs
            while (!(m_lookAhead1.type == TokenType::EndOfFile ||
                     m_lookAhead1.type == TokenType::DictionaryEnd))
            {
                // First token of the pair must be a key (name)
                if (m_lookAhead1.type != TokenType::Name)
                {
                    error(tr("Dictionary key must be a name."));
                }

                QByteArray key = m_lookAhead1.data.toByteArray();
                shift();

                // The key is followed by an arbitrary object - the value
                PDFObject value = getObject();

                entries->addEntry(PDFInplaceOrMemoryString(std::move(key)), std::move(value));
            }

            // Now, we should be at the dictionary end. If it is not the case, then the end
            // of the input occurred.
            if (m_lookAhead1.type != TokenType::DictionaryEnd)
            {
                error(tr("End of stream inside dictionary reached."));
            }

            // Is it a content stream? The dictionary end is still in the first lookahead
            // slot, so the stream start command would be in the second one.
            const bool isStream = m_lookAhead2.type == TokenType::Command &&
                                  m_lookAhead2.data.toByteArray() == PDF_STREAM_START_COMMAND;
            if (!isStream)
            {
                // Just shift (eat dictionary end) and return dictionary
                shift();
                return PDFObject::createDictionary(std::move(dictionaryContent));
            }

            if (!m_features.testFlag(AllowStreams))
            {
                error(tr("Streams are not allowed in this context."));
            }

            // Read stream content. According to the PDF Reference 1.7, chapter 3.2.7, stream
            // content can be placed in an external file. If this is the case, then the file
            // content is loaded into memory. But even in this case, the stream content
            // in the input must be skipped.

            if (!entries->hasKey(PDF_STREAM_DICT_LENGTH))
            {
                error(tr("Stream length is not specified."));
            }

            // The length can be an indirect object; resolve it, if we have a context
            PDFObject lengthObject = m_context ? m_context->getObject(entries->get(PDF_STREAM_DICT_LENGTH)) : entries->get(PDF_STREAM_DICT_LENGTH);
            if (!lengthObject.isInt())
            {
                error(tr("Bad value of stream length. It should be an integer number."));
            }

            PDFInteger length = lengthObject.getInteger();
            if (length < 0)
            {
                error(tr("Length of the stream buffer is negative (%1). It must be a positive number.").arg(length));
            }

            // Skip the stream start, then fetch data of the stream. The lexical analyzer
            // is positioned right behind the stream start command (second lookahead token).
            m_lexicalAnalyzer.skipStreamStart();
            QByteArray streamData = m_lexicalAnalyzer.fetchByteArray(length);

            // According to the PDF Reference 1.7, chapter 3.2.7, stream content can also be specified
            // in the external file. If this is the case, then we must try to load the stream data
            // from the external file.
            // NOTE(review): the file name comes from the parsed document, so a document can make
            // the parser read an arbitrary local file - confirm this is intended for untrusted input.
            if (entries->hasKey(PDF_STREAM_DICT_FILE_SPECIFICATION))
            {
                PDFObject fileName = m_context ? m_context->getObject(entries->get(PDF_STREAM_DICT_FILE_SPECIFICATION)) : entries->get(PDF_STREAM_DICT_FILE_SPECIFICATION);

                if (!fileName.isString())
                {
                    error(tr("Stream data should be in external file, but invalid file name is specified."));
                }

                QFile externalFile(fileName.getString());
                if (!externalFile.open(QFile::ReadOnly))
                {
                    error(tr("Can't open stream data stored in external file '%1'.").arg(QString(fileName.getString())));
                }
                else
                {
                    streamData = externalFile.readAll();
                    externalFile.close();
                }
            }

            // Refill lookahead tokens - they were scanned before the stream data were consumed
            m_lookAhead1 = fetch();
            m_lookAhead2 = fetch();

            const bool isStreamEnd = m_lookAhead1.type == TokenType::Command &&
                                     m_lookAhead1.data.toByteArray() == PDF_STREAM_END_COMMAND;
            if (!isStreamEnd)
            {
                error(tr("End of stream should end in keyword 'endstream'."));
            }

            // Everything OK, just advance and return stream object
            shift();
            return PDFObject::createStream(std::make_shared<PDFStream>(std::move(*entries), std::move(streamData)));
        }

        case TokenType::Null:
        {
            shift();
            return PDFObject::createNull();
        }

        case TokenType::ArrayEnd:
        case TokenType::DictionaryEnd:
        case TokenType::Command:
        {
            error(tr("Cannot read object. Unexpected token appeared."));
            break;
        }

        case TokenType::EndOfFile:
        {
            error(tr("Cannot read object. End of stream reached."));
            break;
        }
    }

    // This code should be unreachable. All values should be handled in the switch above.
    Q_ASSERT(false);
    return PDFObject::createNull();
}
|
|
|
|
|
|
|
|
// Parses one object in the same way as getObject(), while a
// PDFParsingContextGuard constructed from the parsing context and
// 'reference' is alive.
// NOTE(review): the guard presumably registers 'reference' as being parsed
// (cyclic reference detection) for the duration of the call - confirm
// against the guard's definition.
PDFObject PDFParser::getObject(PDFObjectReference reference)
{
    PDFParsingContext::PDFParsingContextGuard parsingGuard(m_context, reference);
    return getObject();
}
|
|
|
|
|
|
|
|
void PDFParser::error(const QString& message) const
|
|
|
|
{
|
2019-09-27 18:41:56 +02:00
|
|
|
throw PDFException(message);
|
2018-11-17 16:48:30 +01:00
|
|
|
}
|
|
|
|
|
2018-11-21 19:30:15 +01:00
|
|
|
// Moves the lexical analyzer to 'offset' bytes from the start of the data
// and refetches both lookahead tokens from the new position.
void PDFParser::seek(PDFInteger offset)
{
    m_lexicalAnalyzer.seek(offset);

    // The lookahead tokens were scanned at the old position, so they are
    // no longer valid and must be read again.
    m_lookAhead1 = fetch();
    m_lookAhead2 = fetch();
}
|
|
|
|
|
|
|
|
bool PDFParser::fetchCommand(const char* command)
|
|
|
|
{
|
|
|
|
if (m_lookAhead1.type == PDFLexicalAnalyzer::TokenType::Command &&
|
|
|
|
m_lookAhead1.data.toByteArray() == command)
|
|
|
|
{
|
|
|
|
shift();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2018-11-17 16:48:30 +01:00
|
|
|
void PDFParser::shift()
|
|
|
|
{
|
|
|
|
m_lookAhead1 = std::move(m_lookAhead2);
|
2019-07-04 17:52:38 +02:00
|
|
|
m_lookAhead2 = fetch();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns the next token. If a token fetcher function was supplied, it is
// used as the token source; otherwise the token is scanned by the internal
// lexical analyzer.
PDFLexicalAnalyzer::Token PDFParser::fetch()
{
    if (m_tokenFetcher)
    {
        return m_tokenFetcher();
    }

    return m_lexicalAnalyzer.fetch();
}
|
|
|
|
|
|
|
|
} // namespace pdf
|