Object classifier

This commit is contained in:
Jakub Melka
2021-06-11 19:01:18 +02:00
parent 2745c7828c
commit eb5f904842
9 changed files with 428 additions and 40 deletions

View File

@@ -2413,6 +2413,28 @@ QString PDFEncoding::convertSmartFromByteStringToUnicode(const QByteArray& strea
return QString::fromLatin1(stream.toHex()).toUpper();
}
/// Produces a displayable string for an arbitrary byte string.
/// Byte strings starting with "D:" are first tried as a date; otherwise the
/// heuristic unicode conversion is used, and if that reports binary data,
/// the bytes are percent-encoded (spaces are left unencoded).
/// \param stream Byte string to be converted
/// \returns Date text, unicode text, or percent-encoded representation
QString PDFEncoding::convertSmartFromByteStringToRepresentableQString(const QByteArray& stream)
{
    // Date candidates are recognized by the "D:" prefix; an invalid date
    // falls through to the generic conversion below.
    if (stream.startsWith("D:"))
    {
        const QDateTime parsedDate = convertToDateTime(stream);
        if (parsedDate.isValid())
        {
            return parsedDate.toString(Qt::TextDate);
        }
    }

    bool conversionFailed = false;
    QString unicodeText = convertSmartFromByteStringToUnicode(stream, &conversionFailed);
    if (conversionFailed)
    {
        // Percent-encoded output consists of ASCII characters only
        const QByteArray percentEncoded = stream.toPercentEncoding(" ", QByteArray(), '%');
        return QString::fromLatin1(percentEncoded);
    }

    return unicodeText;
}
QString PDFEncoding::getEncodingCharacters(Encoding encoding)
{
QString string;

View File

@@ -117,10 +117,16 @@ public:
/// Converts a byte string to unicode using a heuristic, which is not
/// always reliable.
/// \param stream Stream
/// \param[out] isBinary If specified, it is set to true if conversion failed
/// \returns Unicode string or string converted to hexadecimal representation
static QString convertSmartFromByteStringToUnicode(const QByteArray& stream, bool* isBinary);
/// Tries to convert stream to a representable string. A stream starting
/// with "D:" which converts to a valid date is returned as a date text.
/// Otherwise the heuristic unicode conversion is tried; if it reports
/// binary data, percent encoding is used (spaces are kept unencoded).
/// \param stream Stream
/// \returns Date text, unicode string, or string converted to percent-encoded representation
static QString convertSmartFromByteStringToRepresentableQString(const QByteArray& stream);
/// Returns all characters of the given encoding
/// \param encoding Encoding
/// \returns All characters representable by the encoding.

View File

@@ -15,7 +15,6 @@
// You should have received a copy of the GNU Lesser General Public License
// along with Pdf4Qt. If not, see <https://www.gnu.org/licenses/>.
#include "pdfobjectutils.h"
#include "pdfvisitor.h"
@@ -210,6 +209,16 @@ std::set<PDFObjectReference> PDFObjectUtils::getReferences(const std::vector<PDF
return references;
}
std::set<PDFObjectReference> PDFObjectUtils::getDirectReferences(const PDFObject& object)
{
std::set<PDFObjectReference> references;
PDFCollectReferencesVisitor collectReferencesVisitor(references);
object.accept(&collectReferencesVisitor);
return references;
}
PDFObject PDFObjectUtils::replaceReferences(const PDFObject& object, const std::map<PDFObjectReference, PDFObjectReference>& referenceMapping)
{
PDFReplaceReferencesVisitor replaceReferencesVisitor(referenceMapping);
@@ -217,4 +226,162 @@ PDFObject PDFObjectUtils::replaceReferences(const PDFObject& object, const std::
return replaceReferencesVisitor.getObject();
}
void PDFObjectClassifier::classify(const PDFDocument* document)
{
// Clear old classification, if it exist
m_classification.clear();
m_allTypesUsed = None;
if (!document)
{
return;
}
PDFDocumentDataLoaderDecorator loader(document);
const PDFObjectStorage& storage = document->getStorage();
const PDFObjectStorage::PDFObjects& objects = storage.getObjects();
m_classification.resize(objects.size(), Classification());
for (size_t i = 0; i < objects.size(); ++i)
{
PDFObjectReference reference(i, objects[i].generation);
m_classification[i].reference = reference;
}
// First, iterate trough pages of the document
const PDFCatalog* catalog = document->getCatalog();
const size_t pageCount = catalog->getPageCount();
for (size_t i = 0; i < pageCount; ++i)
{
const PDFPage* page = catalog->getPage(i);
if (!page)
{
continue;
}
// Handle page itself
if (hasObject(page->getPageReference()))
{
mark(page->getPageReference(), Page);
}
// Handle annotations
for (const PDFObjectReference& reference : page->getAnnotations())
{
if (hasObject(reference))
{
mark(reference, Annotation);
}
}
// Handle contents
PDFObject pageObject = document->getObjectByReference(page->getPageReference());
Q_ASSERT(pageObject.isDictionary());
const PDFDictionary* dictionary = pageObject.getDictionary();
const PDFObject& contentsObject = dictionary->get("Contents");
if (contentsObject.isReference())
{
mark(contentsObject.getReference(), ContentStream);
}
// Handle resources
if (const PDFDictionary* resourcesDictionary = document->getDictionaryFromObject(dictionary->get("Resources")))
{
markDictionary(document, resourcesDictionary->get("ExtGState"), GraphicState);
markDictionary(document, resourcesDictionary->get("ColorSpace"), ColorSpace);
markDictionary(document, resourcesDictionary->get("Pattern"), Pattern);
markDictionary(document, resourcesDictionary->get("Shading"), Shading);
markDictionary(document, resourcesDictionary->get("Font"), Font);
if (const PDFDictionary* xobjectDictionary = document->getDictionaryFromObject(resourcesDictionary->get("XObject")))
{
const size_t count = xobjectDictionary->getCount();
for (size_t i = 0; i < count; ++i)
{
const PDFObject& item = xobjectDictionary->getValue(i);
if (item.isReference() && hasObject(item.getReference()))
{
if (const PDFDictionary* xobjectItemDictionary = document->getDictionaryFromObject(item))
{
QByteArray subtype = loader.readNameFromDictionary(xobjectItemDictionary, "Subtype");
if (subtype == "Image")
{
mark(item.getReference(), Image);
}
else if (subtype == "Form")
{
mark(item.getReference(), Form);
}
}
}
}
}
}
}
for (Classification& classification : m_classification)
{
if (const PDFDictionary* dictionary = document->getDictionaryFromObject(document->getObjectByReference(classification.reference)))
{
QByteArray typeName = loader.readNameFromDictionary(dictionary, "Type");
if (typeName == "Action")
{
classification.types.setFlag(Action);
}
}
}
for (const Classification& classification : m_classification)
{
m_allTypesUsed |= classification.types;
}
}
/// Tests whether the classification table contains an entry for the reference.
/// \param reference Reference to be tested
/// \returns True, if reference is valid, its object number lies within
///          the table, and the stored reference equals the given one
bool PDFObjectClassifier::hasObject(PDFObjectReference reference) const
{
    if (!reference.isValid())
    {
        return false;
    }

    const PDFInteger entryCount = PDFInteger(m_classification.size());
    if (reference.objectNumber >= entryCount)
    {
        return false;
    }

    // The stored reference must match exactly
    return m_classification[reference.objectNumber].reference == reference;
}
/// Gathers references of all classified objects which carry the given type flag.
/// \param type Type flag to be searched
/// \returns References in the order of the classification table
std::vector<PDFObjectReference> PDFObjectClassifier::getObjectsByType(Type type) const
{
    std::vector<PDFObjectReference> matchingReferences;

    for (auto it = m_classification.cbegin(); it != m_classification.cend(); ++it)
    {
        if (!it->types.testFlag(type))
        {
            continue;
        }

        matchingReferences.push_back(it->reference);
    }

    return matchingReferences;
}
/// Sets the type flag on the classification entry of the referenced object.
/// The reference must satisfy hasObject(); this is only asserted, not checked.
/// \param reference Reference of the object to be marked
/// \param type Type flag to be set
void PDFObjectClassifier::mark(PDFObjectReference reference, Type type)
{
    Q_ASSERT(hasObject(reference));

    Classification& entry = m_classification[reference.objectNumber];
    entry.types.setFlag(type, true);
}
void PDFObjectClassifier::markDictionary(const PDFDocument* document, PDFObject object, Type type)
{
if (const PDFDictionary* dictionary = document->getDictionaryFromObject(object))
{
const size_t count = dictionary->getCount();
for (size_t i = 0; i < count; ++i)
{
const PDFObject& item = dictionary->getValue(i);
if (item.isReference() && hasObject(item.getReference()))
{
mark(item.getReference(), type);
}
}
}
}
} // namespace pdf

View File

@@ -20,23 +20,30 @@
#include "pdfobject.h"
#include <QtCore>
#include <set>
#include <vector>
namespace pdf
{
class PDFObjectStorage;
class PDFDocument;
/// Utilities for manipulation with objects
class PDFObjectUtils
{
public:
/// Returns a list of references referenced by \p objects. So, all references, which are present
/// in objects, appear in the result set, including objects, which are referenced by referenced
/// objects (so, transitive closure above reference graph is returned).
/// \param objects Objects
/// \param storage Storage
static std::set<PDFObjectReference> getReferences(const std::vector<PDFObject>& objects, const PDFObjectStorage& storage);
/// Returns a set of references directly referenced from the object.
/// The references themselves are not followed.
/// \param object Object
static std::set<PDFObjectReference> getDirectReferences(const PDFObject& object);
/// Returns the object produced by a reference replacing visitor, which is
/// constructed from \p referenceMapping.
/// NOTE(review): presumably each reference found as a key in the mapping is
/// replaced by the mapped reference - confirm against the visitor.
/// \param object Object
/// \param referenceMapping Mapping of references (presumably old -> new)
static PDFObject replaceReferences(const PDFObject& object, const std::map<PDFObjectReference, PDFObjectReference>& referenceMapping);
private:
@@ -97,6 +104,67 @@ private:
bool m_locked;
};
/// Assigns type flags (page, font, image, ...) to objects of a document.
/// Some heuristic is used when object type is missing or document
/// is not well-formed.
class Pdf4QtLIBSHARED_EXPORT PDFObjectClassifier
{
public:
    PDFObjectClassifier() = default;

    /// Classifies objects of the given document. Any previous
    /// classification is discarded. If document is null, then
    /// classification stays empty.
    /// \param document Document
    void classify(const PDFDocument* document);

    /// Object type flags; a single object can carry several of them
    enum Type : uint32_t
    {
        None            = 0u,
        Page            = 1u << 0,
        ContentStream   = 1u << 1,
        GraphicState    = 1u << 2,
        ColorSpace      = 1u << 3,
        Pattern         = 1u << 4,
        Shading         = 1u << 5,
        Image           = 1u << 6,
        Form            = 1u << 7,
        Font            = 1u << 8,
        Action          = 1u << 9,
        Annotation      = 1u << 10
    };
    Q_DECLARE_FLAGS(Types, Type)

    /// Returns true, if the reference is valid and the classification
    /// table holds an entry with exactly this reference.
    /// \param reference Reference
    bool hasObject(PDFObjectReference reference) const;

    /// Returns true, if at least one classified object has the given type
    /// \param type Object type
    bool hasType(Type type) const { return m_allTypesUsed.testFlag(type); }

    /// Returns references of all objects marked with the given type
    /// \param type Type
    std::vector<PDFObjectReference> getObjectsByType(Type type) const;

private:
    /// Classification entry of a single object
    struct Classification
    {
        PDFObjectReference reference;
        Types types = None;
    };

    /// Sets the type flag on the object with the given reference
    void mark(PDFObjectReference reference, Type type);

    /// Sets the type flag on all known objects referenced by dictionary values
    void markDictionary(const PDFDocument* document, PDFObject object, Type type);

    /// Classification entries, indexed by object number
    std::vector<Classification> m_classification;

    /// Union of type flags of all classified objects
    Types m_allTypesUsed;
};
} // namespace pdf
#endif // PDFOBJECTUTILS_H