Separate tool for extracting pages

This commit is contained in:
Jakub Melka
2020-10-31 14:23:13 +01:00
parent 4051591929
commit c58158e3ee
12 changed files with 591 additions and 21 deletions

View File

@@ -180,7 +180,7 @@ PDFCatalog PDFCatalog::parse(const PDFObject& catalog, const PDFDocument* docume
PDFCatalog catalogObject;
catalogObject.m_viewerPreferences = PDFViewerPreferences::parse(catalog, document);
catalogObject.m_pages = PDFPage::parse(document, catalogDictionary->get("Pages"));
catalogObject.m_pages = PDFPage::parse(&document->getStorage(), catalogDictionary->get("Pages"));
catalogObject.m_pageLabels = PDFNumberTreeLoader<PDFPageLabel>::parse(&document->getStorage(), catalogDictionary->get("PageLabels"));
if (catalogDictionary->hasKey("OCProperties"))

View File

@@ -862,6 +862,113 @@ QRectF PDFDocumentBuilder::getPolygonsBoundingRect(const Polygons& polygons) con
return rect;
}
void PDFDocumentBuilder::flattenPageTree()
{
PDFObjectReference pageTreeRoot = getPageTreeRoot();
PDFObject pageTree = PDFObject::createReference(pageTreeRoot);
std::vector<PDFPage> pages = PDFPage::parse(&m_storage, pageTree);
std::vector<PDFObjectReference> pageReferences;
// First, fill inheritable attributes to pages and correct parent
for (const PDFPage& page : pages)
{
PDFObjectFactory objectBuilder;
objectBuilder.beginDictionary();
objectBuilder.beginDictionaryItem("Parent");
objectBuilder << pageTreeRoot;
objectBuilder.endDictionaryItem();
objectBuilder.beginDictionaryItem("MediaBox");
objectBuilder << page.getMediaBox();
objectBuilder.endDictionaryItem();
if (page.getCropBox().isValid())
{
objectBuilder.beginDictionaryItem("CropBox");
objectBuilder << page.getCropBox();
objectBuilder.endDictionaryItem();
}
if (!page.getResources().isNull())
{
objectBuilder.beginDictionaryItem("Resources");
objectBuilder << page.getResources();
objectBuilder.endDictionaryItem();
}
if (page.getPageRotation() != PageRotation::None)
{
PDFInteger angle = 0;
switch (page.getPageRotation())
{
case PageRotation::Rotate90:
angle = 90;
break;
case PageRotation::Rotate180:
angle = 180;
break;
case PageRotation::Rotate270:
angle = 270;
break;
default:
break;
}
objectBuilder.beginDictionaryItem("Rotate");
objectBuilder << angle;
objectBuilder.endDictionaryItem();
}
objectBuilder.endDictionary();
mergeTo(page.getPageReference(), objectBuilder.takeObject());
pageReferences.push_back(page.getPageReference());
}
setPages(pageReferences);
}
void PDFDocumentBuilder::setPages(const std::vector<PDFObjectReference>& pageReferences)
{
PDFObjectFactory objectBuilder;
objectBuilder.beginDictionary();
objectBuilder.beginDictionaryItem("Kids");
objectBuilder.beginArray();
for (const PDFObjectReference& pageReference : pageReferences)
{
objectBuilder << pageReference;
}
objectBuilder.endArray();
objectBuilder.endDictionaryItem();
objectBuilder.beginDictionaryItem("Count");
objectBuilder << PDFInteger(pageReferences.size());
objectBuilder.endDictionaryItem();
objectBuilder.endDictionary();
mergeTo(getPageTreeRoot(), objectBuilder.takeObject());
}
std::vector<PDFObjectReference> PDFDocumentBuilder::getPages() const
{
std::vector<PDFObjectReference> result;
if (const PDFDictionary* pageTreeRoot = m_storage.getDictionaryFromObject(m_storage.getObject(getPageTreeRoot())))
{
PDFDocumentDataLoaderDecorator loader(&m_storage);
result = loader.readReferenceArrayFromDictionary(pageTreeRoot, "Kids");
}
return result;
}
std::vector<PDFObject> PDFDocumentBuilder::copyFrom(const std::vector<PDFObject>& objects, const PDFObjectStorage& storage, bool createReferences)
{
// 1) Collect all references, which we must copy. If object is referenced, then
@@ -2756,6 +2863,20 @@ PDFObject PDFDocumentBuilder::createTrailerDictionary(PDFObjectReference catalog
}
void PDFDocumentBuilder::removeOutline()
{
PDFObjectFactory objectBuilder;
objectBuilder.beginDictionary();
objectBuilder.beginDictionaryItem("Outlines");
objectBuilder << PDFObject();
objectBuilder.endDictionaryItem();
objectBuilder.endDictionary();
PDFObject updatedCatalog = objectBuilder.takeObject();
mergeTo(getCatalogReference(), updatedCatalog);
}
void PDFDocumentBuilder::setAnnotationAppearanceState(PDFObjectReference annotation,
QByteArray appearanceState)
{
@@ -3124,6 +3245,54 @@ void PDFDocumentBuilder::updateTrailerDictionary(PDFInteger objectCount)
}
void PDFDocumentBuilder::removeThreads()
{
PDFObjectFactory objectBuilder;
objectBuilder.beginDictionary();
objectBuilder.beginDictionaryItem("Threads");
objectBuilder << PDFObject();
objectBuilder.endDictionaryItem();
objectBuilder.endDictionary();
PDFObject updatedCatalog = objectBuilder.takeObject();
mergeTo(getCatalogReference(), updatedCatalog);
}
void PDFDocumentBuilder::removeDocumentActions()
{
PDFObjectFactory objectBuilder;
objectBuilder.beginDictionary();
objectBuilder.beginDictionaryItem("OpenAction");
objectBuilder << PDFObject();
objectBuilder.endDictionaryItem();
objectBuilder.beginDictionaryItem("AA");
objectBuilder << PDFObject();
objectBuilder.endDictionaryItem();
objectBuilder.endDictionary();
PDFObject updatedCatalog = objectBuilder.takeObject();
mergeTo(getCatalogReference(), updatedCatalog);
}
void PDFDocumentBuilder::removeStructureTree()
{
PDFObjectFactory objectBuilder;
objectBuilder.beginDictionary();
objectBuilder.beginDictionaryItem("StructTreeRoot");
objectBuilder << PDFObject();
objectBuilder.endDictionaryItem();
objectBuilder.beginDictionaryItem("MarkInfo");
objectBuilder << PDFObject();
objectBuilder.endDictionaryItem();
objectBuilder.endDictionary();
PDFObject updatedCatalog = objectBuilder.takeObject();
mergeTo(getCatalogReference(), updatedCatalog);
}
/* END GENERATED CODE */
} // namespace pdf

View File

@@ -289,6 +289,19 @@ public:
const PDFFormManager* getFormManager() const;
void setFormManager(const PDFFormManager* formManager);
/// Flattens page tree. The page tree is parsed and for each page, its
/// media box, crop box (if valid), resources (if not null) and rotation
/// (if any) are merged directly into the page object, so attributes
/// inherited from non-leaf nodes end up in the pages themselves. Parent
/// of each page is set to the page tree root, and kids and page count
/// of the page tree root are rewritten so they refer to the pages directly.
/// Templates will be lost. \sa setPages
void flattenPageTree();
/// Sets a list of page references to page tree. The "Kids" array
/// of the page tree root is set to the given references and the "Count"
/// entry is set to the number of given references. Page tree must
/// be flattened to use this function. \sa flattenPageTree
/// \param pageReferences Page references
void setPages(const std::vector<PDFObjectReference>& pageReferences);
/// Returns a list of page references, as read from the "Kids" array
/// of the page tree root. If the dictionary of the page tree root
/// cannot be obtained, empty list is returned. Page tree must
/// be flattened to use this function. \sa flattenPageTree
std::vector<PDFObjectReference> getPages() const;
/* START GENERATED CODE */
/// Appends a new page after last page.
@@ -810,6 +823,10 @@ public:
PDFObject createTrailerDictionary(PDFObjectReference catalog);
/// Removes outline tree from document catalog. This is done
/// by merging an empty "Outlines" entry into the catalog.
void removeOutline();
/// Sets annotation appearance state.
/// \param annotation Annotation
/// \param appearanceState Appearance state
@@ -971,6 +988,18 @@ public:
void updateTrailerDictionary(PDFInteger objectCount);
/// Removes threads from document catalog. This is done
/// by merging an empty "Threads" entry into the catalog.
void removeThreads();
/// Removes document actions from document catalog. This is done
/// by merging empty "OpenAction" and "AA" entries into the catalog.
void removeDocumentActions();
/// Removes structure tree from document catalog. This is done
/// by merging empty "StructTreeRoot" and "MarkInfo" entries into the catalog.
void removeStructureTree();
/* END GENERATED CODE */
private:

View File

@@ -94,4 +94,6 @@ private:
} // namespace pdf
Q_DECLARE_OPERATORS_FOR_FLAGS(pdf::PDFOptimizer::OptimizationFlags)
#endif // PDFOPTIMIZER_H

View File

@@ -25,14 +25,14 @@ namespace pdf
PDFPageInheritableAttributes PDFPageInheritableAttributes::parse(const PDFPageInheritableAttributes& templateAttributes,
const PDFObject& dictionary,
const PDFDocument* document)
const PDFObjectStorage* storage)
{
PDFPageInheritableAttributes result(templateAttributes);
const PDFObject& dereferencedDictionary = document->getObject(dictionary);
const PDFObject& dereferencedDictionary = storage->getObject(dictionary);
if (dereferencedDictionary.isDictionary())
{
PDFDocumentDataLoaderDecorator loader(document);
PDFDocumentDataLoaderDecorator loader(storage);
const PDFDictionary* dictionary = dereferencedDictionary.getDictionary();
if (dictionary->hasKey("MediaBox"))
@@ -106,11 +106,11 @@ PageRotation PDFPageInheritableAttributes::getPageRotation() const
return PageRotation::None;
}
std::vector<PDFPage> PDFPage::parse(const PDFDocument* document, const PDFObject& root)
std::vector<PDFPage> PDFPage::parse(const PDFObjectStorage* storage, const PDFObject& root)
{
std::vector<PDFPage> result;
std::set<PDFObjectReference> visited;
parseImpl(result, visited, PDFPageInheritableAttributes(), root, document);
parseImpl(result, visited, PDFPageInheritableAttributes(), root, storage);
return result;
}
@@ -228,24 +228,24 @@ void PDFPage::parseImpl(std::vector<PDFPage>& pages,
std::set<PDFObjectReference>& visitedReferences,
const PDFPageInheritableAttributes& templateAttributes,
const PDFObject& root,
const PDFDocument* document)
const PDFObjectStorage* storage)
{
// Are we in internal node, or leaf (page object)?
PDFObjectReference objectReference = root.isReference() ? root.getReference() : PDFObjectReference();
const PDFObject& dereferenced = document->getObject(root);
const PDFObject& dereferenced = storage->getObject(root);
if (dereferenced.isDictionary())
{
const PDFDictionary* dictionary = dereferenced.getDictionary();
const PDFObject& typeObject = document->getObject(dictionary->get("Type"));
const PDFObject& typeObject = storage->getObject(dictionary->get("Type"));
if (typeObject.isName())
{
PDFPageInheritableAttributes currentInheritableAttributes = PDFPageInheritableAttributes::parse(templateAttributes, root, document);
PDFPageInheritableAttributes currentInheritableAttributes = PDFPageInheritableAttributes::parse(templateAttributes, root, storage);
QByteArray typeString = typeObject.getString();
if (typeString == "Pages")
{
const PDFObject& kids = document->getObject(dictionary->get("Kids"));
const PDFObject& kids = storage->getObject(dictionary->get("Kids"));
if (kids.isArray())
{
const PDFArray* kidsArray = kids.getArray();
@@ -268,7 +268,7 @@ void PDFPage::parseImpl(std::vector<PDFPage>& pages,
}
visitedReferences.insert(kid.getReference());
parseImpl(pages, visitedReferences, currentInheritableAttributes, kid, document);
parseImpl(pages, visitedReferences, currentInheritableAttributes, kid, storage);
}
}
else
@@ -284,7 +284,7 @@ void PDFPage::parseImpl(std::vector<PDFPage>& pages,
page.m_pageReference = objectReference;
page.m_mediaBox = currentInheritableAttributes.getMediaBox();
page.m_cropBox = currentInheritableAttributes.getCropBox();
page.m_resources = document->getObject(currentInheritableAttributes.getResources());
page.m_resources = storage->getObject(currentInheritableAttributes.getResources());
page.m_pageRotation = currentInheritableAttributes.getPageRotation();
if (!page.m_cropBox.isValid())
@@ -292,11 +292,11 @@ void PDFPage::parseImpl(std::vector<PDFPage>& pages,
page.m_cropBox = page.m_mediaBox;
}
PDFDocumentDataLoaderDecorator loader(document);
PDFDocumentDataLoaderDecorator loader(storage);
page.m_bleedBox = loader.readRectangle(dictionary->get("BleedBox"), page.getCropBox());
page.m_trimBox = loader.readRectangle(dictionary->get("TrimBox"), page.getCropBox());
page.m_artBox = loader.readRectangle(dictionary->get("ArtBox"), page.getCropBox());
page.m_contents = document->getObject(dictionary->get("Contents"));
page.m_contents = storage->getObject(dictionary->get("Contents"));
page.m_annots = loader.readReferenceArrayFromDictionary(dictionary, "Annots");
page.m_lastModified = PDFEncoding::convertToDateTime(loader.readStringFromDictionary(dictionary, "LastModified"));
page.m_thumbnailReference = loader.readReferenceFromDictionary(dictionary, "Thumb");

View File

@@ -94,8 +94,8 @@ public:
/// Parses inheritable attributes from the page tree node
/// \param templateAttributes Template attributes
/// \param dictionary Dictionary, from which the data will be read
/// \param document Document owning this data
static PDFPageInheritableAttributes parse(const PDFPageInheritableAttributes& templateAttributes, const PDFObject& dictionary, const PDFDocument* document);
/// \param storage Storage owning this data
static PDFPageInheritableAttributes parse(const PDFPageInheritableAttributes& templateAttributes, const PDFObject& dictionary, const PDFObjectStorage* storage);
const QRectF& getMediaBox() const { return m_mediaBox; }
const QRectF& getCropBox() const { return m_cropBox; }
@@ -117,9 +117,9 @@ public:
explicit PDFPage() = default;
/// Parses the page tree. If error occurs, then exception is thrown.
/// \param document Document owning this tree
/// \param storage Storage owning this tree
/// \param root Root object of page tree
static std::vector<PDFPage> parse(const PDFDocument* document, const PDFObject& root);
static std::vector<PDFPage> parse(const PDFObjectStorage* storage, const PDFObject& root);
inline const QRectF& getMediaBox() const { return m_mediaBox; }
inline const QRectF& getCropBox() const { return m_cropBox; }
@@ -249,12 +249,12 @@ private:
/// \param visitedReferences Visited references (to check cycles in page tree and avoid hangup)
/// \param templateAttributes Template attributes (inheritable attributes defined in parent)
/// \param root Root object of page tree
/// \param document Document owning this tree
/// \param storage Storage owning this tree
static void parseImpl(std::vector<PDFPage>& pages,
std::set<PDFObjectReference>& visitedReferences,
const PDFPageInheritableAttributes& templateAttributes,
const PDFObject& root,
const PDFDocument* document);
const PDFObjectStorage* storage);
/// Returns object from page dictionary. This function requires,
/// that storage of object is present, for object fetching. Objects