From cc74b3fc691724cc4b1d79f596b9348f42cc86e6 Mon Sep 17 00:00:00 2001 From: John Whitington Date: Sun, 16 Apr 2023 14:44:34 +0100 Subject: [PATCH] more --- cpdfattach.ml | 45 +++++++++++++++++++++++++++++++++++++++++++++ cpdfattach.mli | 3 +++ cpdfcomposition.ml | 16 ++++++---------- cpdfmanual.tex | 2 +- 4 files changed, 55 insertions(+), 11 deletions(-) diff --git a/cpdfattach.ml b/cpdfattach.ml index eac7739..82a0cee 100644 --- a/cpdfattach.ml +++ b/cpdfattach.ml @@ -302,3 +302,48 @@ let dump_attached_files pdf out = iter (dump_attached_page pdf out) (Pdfpage.pages_of_pagetree pdf) with e -> error (Printf.sprintf "Couldn't dump attached files: %s\n" (Printexc.to_string e)) + +let size_attachment pdf (_, embeddedfile) = + match Pdf.lookup_direct pdf "/F" embeddedfile with + | Some (Pdf.String s) -> + begin match Pdf.lookup_direct pdf "/EF" embeddedfile with + | Some d -> + let stream = + match Pdf.lookup_direct pdf "/F" d with + | Some s -> s + | None -> error "Bad embedded file stream" + in + begin match stream with Pdf.Stream {contents = (_, Pdf.Got b)} -> bytes_size b | _ -> error "Bad embedded file stream" end + | _ -> error "Bad embedded file stream" + end + | _ -> 0 + +let size_page_files pdf page = + let annots = + match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with + | Some (Pdf.Array l) -> l + | _ -> [] + in + let efannots = + keep + (fun annot -> + match Pdf.lookup_direct pdf "/Subtype" annot with + | Some (Pdf.Name "/FileAttachment") -> true + | _ -> false) + annots + in + let fsannots = option_map (Pdf.lookup_direct pdf "/FS") efannots in + map (size_attachment pdf) (map (fun x -> 0, x) fsannots) + +let size_document_files pdf = + let root = Pdf.lookup_obj pdf pdf.Pdf.root in + let names = + match Pdf.lookup_direct pdf "/Names" root with Some n -> n | _ -> Pdf.Dictionary [] + in + match Pdf.lookup_direct pdf "/EmbeddedFiles" names with + | Some x -> + sum (map (size_attachment pdf) (Pdf.contents_of_nametree pdf x)) + | None -> 0 + +let 
size_attached_files pdf = + size_document_files pdf + sum (flatten (map (size_page_files pdf) (Pdfpage.pages_of_pagetree pdf))) diff --git a/cpdfattach.mli b/cpdfattach.mli index 4e32872..a90e04d 100644 --- a/cpdfattach.mli +++ b/cpdfattach.mli @@ -22,3 +22,6 @@ val list_attached_files : Pdf.t -> attachment list (** Dump attached files to a given directory. *) val dump_attached_files : Pdf.t -> string -> unit + +(** Total size in bytes of all attached files. *) +val size_attached_files : Pdf.t -> int diff --git a/cpdfcomposition.ml b/cpdfcomposition.ml index aa9b9eb..77859ea 100644 --- a/cpdfcomposition.ml +++ b/cpdfcomposition.ml @@ -73,14 +73,11 @@ let find_composition_content_streams pdf i obj marked = [i] | _ -> [] -let find_composition_embedded_files pdf i obj marked = [] - let find_composition pdf = let marked = null_hash () in let images = ref [] in let fonts = ref [] in let content_streams = ref [] in - let embedded_files = ref [] in Pdf.objiter (fun i obj -> (*Printf.printf "Looking at object %i\n" i; @@ -89,13 +86,12 @@ let find_composition pdf = Hashtbl.iter (fun k () -> Printf.printf "%i " k) marked; Printf.printf "\n";*) match Hashtbl.find marked i with _ -> () | exception Not_found -> - embedded_files := find_composition_embedded_files pdf i obj marked @ !embedded_files; images := find_composition_images pdf i obj marked @ !images; content_streams := find_composition_content_streams pdf i obj marked @ !content_streams; fonts := find_composition_fonts pdf i obj marked @ !fonts) pdf; let structure_info = find_composition_structure_info pdf marked in - (!images, !fonts, !content_streams, structure_info, !embedded_files) + (!images, !fonts, !content_streams, structure_info) let size pdf i = String.length (Pdfwrite.string_of_pdf_including_data (Pdf.lookup_obj pdf i)) @@ -123,21 +119,21 @@ let compressed_xref_table_size pdf = let show_composition_json filesize pdf = let perc x = float_of_int x /. float_of_int filesize *. 100. 
in - let o_images, o_fonts, o_content_streams, o_structure_info, o_embedded_files = find_composition pdf in - let images, fonts, content_streams, structure_info, embedded_files, xref_table = + let o_images, o_fonts, o_content_streams, o_structure_info = find_composition pdf in + let images, fonts, content_streams, structure_info, attached_files, xref_table = compressed_size pdf o_images, compressed_size pdf o_fonts, compressed_size pdf o_content_streams, compressed_size pdf o_structure_info, - compressed_size pdf o_embedded_files, + Cpdfattach.size_attached_files pdf, compressed_xref_table_size pdf in - let r = images + fonts + content_streams + structure_info + embedded_files + xref_table in + let r = images + fonts + content_streams + structure_info + attached_files + xref_table in `List [`Tuple [`String "Images"; `Int images; `Float (perc images)]; `Tuple [`String "Fonts"; `Int fonts; `Float (perc fonts)]; `Tuple [`String "Content streams"; `Int content_streams; `Float (perc content_streams)]; `Tuple [`String "Structure Info"; `Int structure_info; `Float (perc structure_info)]; - `Tuple [`String "Embedded Files"; `Int embedded_files; `Float (perc embedded_files)]; + `Tuple [`String "Attached Files"; `Int attached_files; `Float (perc attached_files)]; `Tuple [`String "XRef Table"; `Int xref_table; `Float (perc xref_table)]; `Tuple [`String "Unclassified"; `Int (filesize - r); `Float (perc (filesize - r))]] diff --git a/cpdfmanual.tex b/cpdfmanual.tex index 8c96c41..6b82f4f 100644 --- a/cpdfmanual.tex +++ b/cpdfmanual.tex @@ -5,7 +5,7 @@ %Document -list-annotations[-json] now obey page range %Document round-tripping of annotations, supersede -copy-annotations. %Document -utf for JSON and mark -clean-strings as deprecated since can fail to round-trip binary strings which begin with a BOM? 
-%Document -composition[-json] +%Document -composition[-json] - mention that the Unclassified residue (file size minus the classified totals) may be negative %Document discourage GhostScript usage, since it can strip data (-gs-malformed, embed missing fonts) %Document [ ] pagespecs %Document extensions to -info