This commit is contained in:
John Whitington 2023-04-16 14:44:34 +01:00
parent e54868a64b
commit cc74b3fc69
4 changed files with 55 additions and 11 deletions

View File

@ -302,3 +302,48 @@ let dump_attached_files pdf out =
iter (dump_attached_page pdf out) (Pdfpage.pages_of_pagetree pdf) iter (dump_attached_page pdf out) (Pdfpage.pages_of_pagetree pdf)
with with
e -> error (Printf.sprintf "Couldn't dump attached files: %s\n" (Printexc.to_string e)) e -> error (Printf.sprintf "Couldn't dump attached files: %s\n" (Printexc.to_string e))
(* Size in bytes of the data held by one file specification.

   The argument is a pair whose first component is ignored and whose second
   component is the file specification dictionary. If that dictionary has no
   string-valued /F entry, the result is 0. Otherwise the stream found at
   /EF -> /F is measured with [bytes_size]; no decoding is applied here, so
   the count is of the stream data as held in the PDF object.

   Calls [error "Bad embedded file stream"] when /EF is missing, when its /F
   entry is missing or is not a stream, or when the stream data is not
   available after loading. *)
let size_attachment pdf (_, embeddedfile) =
  match Pdf.lookup_direct pdf "/F" embeddedfile with
  | Some (Pdf.String _) ->
      begin match Pdf.lookup_direct pdf "/EF" embeddedfile with
      | Some d ->
          begin match Pdf.lookup_direct pdf "/F" d with
          | Some (Pdf.Stream _ as stream) ->
              (* The stream may not have been read into memory yet (Pdf.ToGet).
                 Force it to be loaded so that it can be measured rather than
                 being rejected as a bad stream. *)
              Pdf.getstream stream;
              begin match stream with
              | Pdf.Stream {contents = (_, Pdf.Got b)} -> bytes_size b
              | _ -> error "Bad embedded file stream"
              end
          | _ -> error "Bad embedded file stream"
          end
      | None -> error "Bad embedded file stream"
      end
  | _ -> 0
(* Sizes of the files attached to a single page, one list element per
   /FileAttachment annotation which has an /FS entry. Pages with no /Annots
   array yield the empty list. *)
let size_page_files pdf page =
  (* Does this annotation have /Subtype /FileAttachment? *)
  let is_file_attachment annot =
    match Pdf.lookup_direct pdf "/Subtype" annot with
    | Some (Pdf.Name "/FileAttachment") -> true
    | _ -> false
  in
  let page_annots =
    match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
    | Some (Pdf.Array items) -> items
    | _ -> []
  in
  (* Keep only the file specifications of the file attachment annotations;
     annotations lacking /FS are dropped. *)
  let filespecs =
    option_map (Pdf.lookup_direct pdf "/FS") (keep is_file_attachment page_annots)
  in
  (* size_attachment ignores the first component of its pair, so 0 is used
     as a placeholder. *)
  map (fun filespec -> size_attachment pdf (0, filespec)) filespecs
(* Sum of the sizes of the document-level attachments, i.e. the entries of
   the /EmbeddedFiles name tree under /Names in the document root. The result
   is 0 when there is no /EmbeddedFiles entry. *)
let size_document_files pdf =
  let catalog = Pdf.lookup_obj pdf pdf.Pdf.root in
  (* With no /Names dictionary, fall back to an empty one so that the lookup
     below simply finds nothing. *)
  let name_dict =
    match Pdf.lookup_direct pdf "/Names" catalog with
    | Some d -> d
    | _ -> Pdf.Dictionary []
  in
  match Pdf.lookup_direct pdf "/EmbeddedFiles" name_dict with
  | None -> 0
  | Some tree ->
      let entries = Pdf.contents_of_nametree pdf tree in
      sum (map (size_attachment pdf) entries)
(* Total size of all attached files: the document-level ones plus those
   attached to each page of the page tree. *)
let size_attached_files pdf =
  let page_total =
    sum (flatten (map (size_page_files pdf) (Pdfpage.pages_of_pagetree pdf)))
  in
  size_document_files pdf + page_total

View File

@ -22,3 +22,6 @@ val list_attached_files : Pdf.t -> attachment list
(** Dump attached files to a given directory. *) (** Dump attached files to a given directory. *)
val dump_attached_files : Pdf.t -> string -> unit val dump_attached_files : Pdf.t -> string -> unit
(** Total size in bytes of all attached files: the document-level files in
    the /EmbeddedFiles name tree, plus the files belonging to /FileAttachment
    annotations on every page. *)
val size_attached_files : Pdf.t -> int

View File

@ -73,14 +73,11 @@ let find_composition_content_streams pdf i obj marked =
[i] [i]
| _ -> [] | _ -> []
let find_composition_embedded_files pdf i obj marked = []
let find_composition pdf = let find_composition pdf =
let marked = null_hash () in let marked = null_hash () in
let images = ref [] in let images = ref [] in
let fonts = ref [] in let fonts = ref [] in
let content_streams = ref [] in let content_streams = ref [] in
let embedded_files = ref [] in
Pdf.objiter Pdf.objiter
(fun i obj -> (fun i obj ->
(*Printf.printf "Looking at object %i\n" i; (*Printf.printf "Looking at object %i\n" i;
@ -89,13 +86,12 @@ let find_composition pdf =
Hashtbl.iter (fun k () -> Printf.printf "%i " k) marked; Hashtbl.iter (fun k () -> Printf.printf "%i " k) marked;
Printf.printf "\n";*) Printf.printf "\n";*)
match Hashtbl.find marked i with _ -> () | exception Not_found -> match Hashtbl.find marked i with _ -> () | exception Not_found ->
embedded_files := find_composition_embedded_files pdf i obj marked @ !embedded_files;
images := find_composition_images pdf i obj marked @ !images; images := find_composition_images pdf i obj marked @ !images;
content_streams := find_composition_content_streams pdf i obj marked @ !content_streams; content_streams := find_composition_content_streams pdf i obj marked @ !content_streams;
fonts := find_composition_fonts pdf i obj marked @ !fonts) fonts := find_composition_fonts pdf i obj marked @ !fonts)
pdf; pdf;
let structure_info = find_composition_structure_info pdf marked in let structure_info = find_composition_structure_info pdf marked in
(!images, !fonts, !content_streams, structure_info, !embedded_files) (!images, !fonts, !content_streams, structure_info)
let size pdf i = let size pdf i =
String.length (Pdfwrite.string_of_pdf_including_data (Pdf.lookup_obj pdf i)) String.length (Pdfwrite.string_of_pdf_including_data (Pdf.lookup_obj pdf i))
@ -123,21 +119,21 @@ let compressed_xref_table_size pdf =
let show_composition_json filesize pdf = let show_composition_json filesize pdf =
let perc x = float_of_int x /. float_of_int filesize *. 100. in let perc x = float_of_int x /. float_of_int filesize *. 100. in
let o_images, o_fonts, o_content_streams, o_structure_info, o_embedded_files = find_composition pdf in let o_images, o_fonts, o_content_streams, o_structure_info = find_composition pdf in
let images, fonts, content_streams, structure_info, embedded_files, xref_table = let images, fonts, content_streams, structure_info, attached_files, xref_table =
compressed_size pdf o_images, compressed_size pdf o_images,
compressed_size pdf o_fonts, compressed_size pdf o_fonts,
compressed_size pdf o_content_streams, compressed_size pdf o_content_streams,
compressed_size pdf o_structure_info, compressed_size pdf o_structure_info,
compressed_size pdf o_embedded_files, Cpdfattach.size_attached_files pdf,
compressed_xref_table_size pdf compressed_xref_table_size pdf
in in
let r = images + fonts + content_streams + structure_info + embedded_files + xref_table in let r = images + fonts + content_streams + structure_info + attached_files + xref_table in
`List [`Tuple [`String "Images"; `Int images; `Float (perc images)]; `List [`Tuple [`String "Images"; `Int images; `Float (perc images)];
`Tuple [`String "Fonts"; `Int fonts; `Float (perc fonts)]; `Tuple [`String "Fonts"; `Int fonts; `Float (perc fonts)];
`Tuple [`String "Content streams"; `Int content_streams; `Float (perc content_streams)]; `Tuple [`String "Content streams"; `Int content_streams; `Float (perc content_streams)];
`Tuple [`String "Structure Info"; `Int structure_info; `Float (perc structure_info)]; `Tuple [`String "Structure Info"; `Int structure_info; `Float (perc structure_info)];
`Tuple [`String "Embedded Files"; `Int embedded_files; `Float (perc embedded_files)]; `Tuple [`String "Attached Files"; `Int attached_files; `Float (perc attached_files)];
`Tuple [`String "XRef Table"; `Int xref_table; `Float (perc xref_table)]; `Tuple [`String "XRef Table"; `Int xref_table; `Float (perc xref_table)];
`Tuple [`String "Unclassified"; `Int (filesize - r); `Float (perc (filesize - r))]] `Tuple [`String "Unclassified"; `Int (filesize - r); `Float (perc (filesize - r))]]

View File

@ -5,7 +5,7 @@
%Document -list-annotations[-json] now obey page range %Document -list-annotations[-json] now obey page range
%Document round-tripping of annotations, supersede -copy-annotations. %Document round-tripping of annotations, supersede -copy-annotations.
%Document -utf for JSON and mark -clean-strings as deprecated since can fail to round-trip binary strings which begin with a BOM? %Document -utf for JSON and mark -clean-strings as deprecated since can fail to round-trip binary strings which begin with a BOM?
%Document -composition[-json] %Document -composition[-json] - mention residue may be negative
%Document discourage GhostScript usage, since it can strip data (-gs-malformed, embed missing fonts) %Document discourage GhostScript usage, since it can strip data (-gs-malformed, embed missing fonts)
%Document [ ] pagespecs %Document [ ] pagespecs
%Document extensions to -info %Document extensions to -info