mirror of
https://github.com/JakubMelka/PDF4QT.git
synced 2025-06-05 21:59:17 +02:00
Object classifier
This commit is contained in:
@@ -2413,6 +2413,28 @@ QString PDFEncoding::convertSmartFromByteStringToUnicode(const QByteArray& strea
|
||||
return QString::fromLatin1(stream.toHex()).toUpper();
|
||||
}
|
||||
|
||||
QString PDFEncoding::convertSmartFromByteStringToRepresentableQString(const QByteArray& stream)
{
    // Byte strings carrying the "D:" prefix are tried as a date first;
    // if the date conversion yields an invalid value, we fall through
    // to the generic text handling below.
    if (stream.startsWith("D:"))
    {
        QDateTime parsedDate = convertToDateTime(stream);
        if (parsedDate.isValid())
        {
            return parsedDate.toString(Qt::TextDate);
        }
    }

    // Heuristic unicode conversion; the flag is set to true by the callee
    // when the conversion failed.
    bool conversionFailed = false;
    QString unicodeText = convertSmartFromByteStringToUnicode(stream, &conversionFailed);

    if (conversionFailed)
    {
        // Not representable as text: use percent encoding, keeping spaces as they are
        return stream.toPercentEncoding(" ", QByteArray(), '%');
    }

    return unicodeText;
}
|
||||
|
||||
QString PDFEncoding::getEncodingCharacters(Encoding encoding)
|
||||
{
|
||||
QString string;
|
||||
|
@@ -117,10 +117,16 @@ public:
|
||||
/// Function checks if stream can be converted to unicode by heuristic
|
||||
/// way, it is not always reliable.
|
||||
/// \param stream Stream
|
||||
/// \param isBinary If specified, it is set to true if conversion failed
|
||||
/// \param[out] isBinary If specified, it is set to true if conversion failed
|
||||
/// \returns Unicode string or string converted to hexadecimal representation
|
||||
static QString convertSmartFromByteStringToUnicode(const QByteArray& stream, bool* isBinary);
|
||||
|
||||
/// Tries to convert stream to representable string. If it cannot be done,
|
||||
/// percentage encoding is used.
|
||||
/// \param stream Stream
|
||||
/// \returns Unicode string or string converted to percentage representation
|
||||
static QString convertSmartFromByteStringToRepresentableQString(const QByteArray& stream);
|
||||
|
||||
/// Returns all characters of the given encoding
|
||||
/// \param encoding Encoding
|
||||
/// \returns All characters representable by encoding.
|
||||
|
@@ -15,7 +15,6 @@
|
||||
// You should have received a copy of the GNU Lesser General Public License
|
||||
// along with Pdf4Qt. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
#include "pdfobjectutils.h"
|
||||
#include "pdfvisitor.h"
|
||||
|
||||
@@ -210,6 +209,16 @@ std::set<PDFObjectReference> PDFObjectUtils::getReferences(const std::vector<PDF
|
||||
return references;
|
||||
}
|
||||
|
||||
std::set<PDFObjectReference> PDFObjectUtils::getDirectReferences(const PDFObject& object)
|
||||
{
|
||||
std::set<PDFObjectReference> references;
|
||||
|
||||
PDFCollectReferencesVisitor collectReferencesVisitor(references);
|
||||
object.accept(&collectReferencesVisitor);
|
||||
|
||||
return references;
|
||||
}
|
||||
|
||||
PDFObject PDFObjectUtils::replaceReferences(const PDFObject& object, const std::map<PDFObjectReference, PDFObjectReference>& referenceMapping)
|
||||
{
|
||||
PDFReplaceReferencesVisitor replaceReferencesVisitor(referenceMapping);
|
||||
@@ -217,4 +226,162 @@ PDFObject PDFObjectUtils::replaceReferences(const PDFObject& object, const std::
|
||||
return replaceReferencesVisitor.getObject();
|
||||
}
|
||||
|
||||
void PDFObjectClassifier::classify(const PDFDocument* document)
|
||||
{
|
||||
// Clear old classification, if it exist
|
||||
m_classification.clear();
|
||||
m_allTypesUsed = None;
|
||||
|
||||
if (!document)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
PDFDocumentDataLoaderDecorator loader(document);
|
||||
const PDFObjectStorage& storage = document->getStorage();
|
||||
const PDFObjectStorage::PDFObjects& objects = storage.getObjects();
|
||||
|
||||
m_classification.resize(objects.size(), Classification());
|
||||
for (size_t i = 0; i < objects.size(); ++i)
|
||||
{
|
||||
PDFObjectReference reference(i, objects[i].generation);
|
||||
m_classification[i].reference = reference;
|
||||
}
|
||||
|
||||
// First, iterate trough pages of the document
|
||||
const PDFCatalog* catalog = document->getCatalog();
|
||||
const size_t pageCount = catalog->getPageCount();
|
||||
for (size_t i = 0; i < pageCount; ++i)
|
||||
{
|
||||
const PDFPage* page = catalog->getPage(i);
|
||||
|
||||
if (!page)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// Handle page itself
|
||||
if (hasObject(page->getPageReference()))
|
||||
{
|
||||
mark(page->getPageReference(), Page);
|
||||
}
|
||||
|
||||
// Handle annotations
|
||||
for (const PDFObjectReference& reference : page->getAnnotations())
|
||||
{
|
||||
if (hasObject(reference))
|
||||
{
|
||||
mark(reference, Annotation);
|
||||
}
|
||||
}
|
||||
|
||||
// Handle contents
|
||||
PDFObject pageObject = document->getObjectByReference(page->getPageReference());
|
||||
Q_ASSERT(pageObject.isDictionary());
|
||||
|
||||
const PDFDictionary* dictionary = pageObject.getDictionary();
|
||||
const PDFObject& contentsObject = dictionary->get("Contents");
|
||||
if (contentsObject.isReference())
|
||||
{
|
||||
mark(contentsObject.getReference(), ContentStream);
|
||||
}
|
||||
|
||||
// Handle resources
|
||||
if (const PDFDictionary* resourcesDictionary = document->getDictionaryFromObject(dictionary->get("Resources")))
|
||||
{
|
||||
markDictionary(document, resourcesDictionary->get("ExtGState"), GraphicState);
|
||||
markDictionary(document, resourcesDictionary->get("ColorSpace"), ColorSpace);
|
||||
markDictionary(document, resourcesDictionary->get("Pattern"), Pattern);
|
||||
markDictionary(document, resourcesDictionary->get("Shading"), Shading);
|
||||
markDictionary(document, resourcesDictionary->get("Font"), Font);
|
||||
|
||||
if (const PDFDictionary* xobjectDictionary = document->getDictionaryFromObject(resourcesDictionary->get("XObject")))
|
||||
{
|
||||
const size_t count = xobjectDictionary->getCount();
|
||||
for (size_t i = 0; i < count; ++i)
|
||||
{
|
||||
const PDFObject& item = xobjectDictionary->getValue(i);
|
||||
if (item.isReference() && hasObject(item.getReference()))
|
||||
{
|
||||
if (const PDFDictionary* xobjectItemDictionary = document->getDictionaryFromObject(item))
|
||||
{
|
||||
QByteArray subtype = loader.readNameFromDictionary(xobjectItemDictionary, "Subtype");
|
||||
|
||||
if (subtype == "Image")
|
||||
{
|
||||
mark(item.getReference(), Image);
|
||||
}
|
||||
else if (subtype == "Form")
|
||||
{
|
||||
mark(item.getReference(), Form);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (Classification& classification : m_classification)
|
||||
{
|
||||
if (const PDFDictionary* dictionary = document->getDictionaryFromObject(document->getObjectByReference(classification.reference)))
|
||||
{
|
||||
QByteArray typeName = loader.readNameFromDictionary(dictionary, "Type");
|
||||
if (typeName == "Action")
|
||||
{
|
||||
classification.types.setFlag(Action);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (const Classification& classification : m_classification)
|
||||
{
|
||||
m_allTypesUsed |= classification.types;
|
||||
}
|
||||
}
|
||||
|
||||
bool PDFObjectClassifier::hasObject(PDFObjectReference reference) const
{
    if (!reference.isValid())
    {
        return false;
    }

    // Object number must fit into the classification table
    const PDFInteger entryCount = PDFInteger(m_classification.size());
    if (reference.objectNumber >= entryCount)
    {
        return false;
    }

    // Entry stored for this object number must be the very same reference
    return m_classification[reference.objectNumber].reference == reference;
}
|
||||
|
||||
std::vector<PDFObjectReference> PDFObjectClassifier::getObjectsByType(Type type) const
{
    // Collect references of all entries having the requested flag set,
    // in the order in which they are stored in the classification table.
    std::vector<PDFObjectReference> matching;

    for (auto it = m_classification.cbegin(); it != m_classification.cend(); ++it)
    {
        if (!it->types.testFlag(type))
        {
            continue;
        }

        matching.push_back(it->reference);
    }

    return matching;
}
|
||||
|
||||
void PDFObjectClassifier::mark(PDFObjectReference reference, Type type)
{
    // Caller is responsible for passing only references accepted by hasObject()
    Q_ASSERT(hasObject(reference));

    Classification& entry = m_classification[reference.objectNumber];
    entry.types |= type;
}
|
||||
|
||||
void PDFObjectClassifier::markDictionary(const PDFDocument* document, PDFObject object, Type type)
|
||||
{
|
||||
if (const PDFDictionary* dictionary = document->getDictionaryFromObject(object))
|
||||
{
|
||||
const size_t count = dictionary->getCount();
|
||||
for (size_t i = 0; i < count; ++i)
|
||||
{
|
||||
const PDFObject& item = dictionary->getValue(i);
|
||||
if (item.isReference() && hasObject(item.getReference()))
|
||||
{
|
||||
mark(item.getReference(), type);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace pdf
|
||||
|
@@ -20,23 +20,30 @@
|
||||
|
||||
#include "pdfobject.h"
|
||||
|
||||
#include <QtCore>
|
||||
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
namespace pdf
|
||||
{
|
||||
class PDFObjectStorage;
|
||||
class PDFDocument;
|
||||
|
||||
/// Utilities for manipulation with objects
|
||||
class PDFObjectUtils
|
||||
{
|
||||
public:
|
||||
/// Returns list of references referenced by \p objects. So, all references, which are present
|
||||
/// Returns a list of references referenced by \p objects. So, all references, which are present
|
||||
/// in objects, appear in the result set, including objects, which are referenced by referenced
|
||||
/// objects (so, transitive closure above reference graph is returned).
|
||||
/// \param objects Objects
|
||||
/// \param storage Storage
|
||||
static std::set<PDFObjectReference> getReferences(const std::vector<PDFObject>& objects, const PDFObjectStorage& storage);
|
||||
|
||||
/// Returns a list of references directly referenced from object. The references themselves are not followed.
|
||||
static std::set<PDFObjectReference> getDirectReferences(const PDFObject& object);
|
||||
|
||||
static PDFObject replaceReferences(const PDFObject& object, const std::map<PDFObjectReference, PDFObjectReference>& referenceMapping);
|
||||
|
||||
private:
|
||||
@@ -97,6 +104,67 @@ private:
|
||||
bool m_locked;
|
||||
};
|
||||
|
||||
/// Classifies objects according to their type. Some heuristic is used
|
||||
/// when object type is missing or document is not well-formed.
|
||||
class Pdf4QtLIBSHARED_EXPORT PDFObjectClassifier
|
||||
{
|
||||
public:
|
||||
|
||||
inline PDFObjectClassifier() = default;
|
||||
|
||||
/// Performs object classification on a document. Old classification
|
||||
/// is being cleared.
|
||||
/// \param document Document
|
||||
void classify(const PDFDocument* document);
|
||||
|
||||
enum Type : uint32_t
|
||||
{
|
||||
None = 0x00000000,
|
||||
Page = 0x00000001,
|
||||
ContentStream = 0x00000002,
|
||||
GraphicState = 0x00000004,
|
||||
ColorSpace = 0x00000008,
|
||||
Pattern = 0x00000010,
|
||||
Shading = 0x00000020,
|
||||
Image = 0x00000040,
|
||||
Form = 0x00000080,
|
||||
Font = 0x00000100,
|
||||
Action = 0x00000200,
|
||||
Annotation = 0x00000400
|
||||
};
|
||||
|
||||
Q_DECLARE_FLAGS(Types, Type)
|
||||
|
||||
/// Returns true, if object with given reference exists
|
||||
/// and was classified.
|
||||
/// \param reference Reference
|
||||
bool hasObject(PDFObjectReference reference) const;
|
||||
|
||||
/// Returns true, if any object with given type is present in a document
|
||||
/// \param type Object type
|
||||
bool hasType(Type type) const { return m_allTypesUsed.testFlag(type); }
|
||||
|
||||
/// Returns a list of objects with a given type
|
||||
/// \param type Type
|
||||
std::vector<PDFObjectReference> getObjectsByType(Type type) const;
|
||||
|
||||
private:
|
||||
struct Classification
|
||||
{
|
||||
PDFObjectReference reference;
|
||||
Types types = None;
|
||||
};
|
||||
|
||||
/// Marks object with a given type
|
||||
void mark(PDFObjectReference reference, Type type);
|
||||
|
||||
/// Marks objects in dictionary with a given type
|
||||
void markDictionary(const PDFDocument* document, PDFObject object, Type type);
|
||||
|
||||
std::vector<Classification> m_classification;
|
||||
Types m_allTypesUsed;
|
||||
};
|
||||
|
||||
} // namespace pdf
|
||||
|
||||
#endif // PDFOBJECTUTILS_H
|
||||
|
Reference in New Issue
Block a user