This commit is contained in:
John Whitington 2023-04-16 14:44:34 +01:00
parent e54868a64b
commit cc74b3fc69
4 changed files with 55 additions and 11 deletions

View File

@ -302,3 +302,48 @@ let dump_attached_files pdf out =
iter (dump_attached_page pdf out) (Pdfpage.pages_of_pagetree pdf)
with
e -> error (Printf.sprintf "Couldn't dump attached files: %s\n" (Printexc.to_string e))
(* [size_attachment pdf (_, embeddedfile)] is the number of bytes held by the
   embedded file stream of the file specification [embeddedfile].

   The first component of the pair is ignored.

   The stream is found under the /F entry of the /EF dictionary of
   [embeddedfile]. Its bytes are counted as they are held in the stream; no
   decoding is applied here.

   Returns 0 when [embeddedfile] has no string-valued /F entry. Calls [error]
   with "Bad embedded file stream" when /EF is missing, when /EF has no /F
   entry, or when that entry is not a stream whose data can be obtained. *)
let size_attachment pdf (_, embeddedfile) =
  match Pdf.lookup_direct pdf "/F" embeddedfile with
  | Some (Pdf.String _) ->
      begin match Pdf.lookup_direct pdf "/EF" embeddedfile with
      | Some ef ->
          begin match Pdf.lookup_direct pdf "/F" ef with
          | Some (Pdf.Stream _ as stream) ->
              (* The stream data may not have been read in yet. Force it, so
                 that a not-yet-loaded stream is measured rather than being
                 reported as a bad stream. *)
              Pdf.getstream stream;
              begin match stream with
              | Pdf.Stream {contents = (_, Pdf.Got b)} -> bytes_size b
              | _ -> error "Bad embedded file stream"
              end
          | _ -> error "Bad embedded file stream"
          end
      | None -> error "Bad embedded file stream"
      end
  | _ -> 0
(* [size_page_files pdf page] lists the sizes, one per attachment, of the
   files attached to [page] through annotations.

   Only annotations in the page's /Annots array whose /Subtype is
   /FileAttachment are considered, and of those only the ones carrying an /FS
   entry. Each /FS value is measured with [size_attachment]. A page with no
   /Annots array yields the empty list. *)
let size_page_files pdf page =
  let is_file_attachment annot =
    match Pdf.lookup_direct pdf "/Subtype" annot with
    | Some (Pdf.Name "/FileAttachment") -> true
    | _ -> false
  in
  let annotations =
    match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
    | Some (Pdf.Array items) -> items
    | _ -> []
  in
  let attachments = keep is_file_attachment annotations in
  let filespecs = option_map (Pdf.lookup_direct pdf "/FS") attachments in
  (* [size_attachment] takes a pair and ignores its first component, so pair
     each file specification with a dummy 0. *)
  let tagged = map (fun filespec -> (0, filespec)) filespecs in
  map (size_attachment pdf) tagged
(* [size_document_files pdf] is the summed size of the files listed in the
   document-level /EmbeddedFiles name tree, which is looked up inside the
   /Names entry of the document root. Each name tree entry is measured with
   [size_attachment]. The result is 0 when there is no /EmbeddedFiles entry;
   a missing /Names entry is treated as an empty dictionary. *)
let size_document_files pdf =
  let catalog = Pdf.lookup_obj pdf pdf.Pdf.root in
  let name_dict =
    match Pdf.lookup_direct pdf "/Names" catalog with
    | None -> Pdf.Dictionary []
    | Some dict -> dict
  in
  match Pdf.lookup_direct pdf "/EmbeddedFiles" name_dict with
  | None -> 0
  | Some tree ->
      let entries = Pdf.contents_of_nametree pdf tree in
      sum (map (size_attachment pdf) entries)
(* [size_attached_files pdf] is the total size of all attached files: the
   document-level total from [size_document_files] plus every per-page size
   reported by [size_page_files] over the pages of the page tree. *)
let size_attached_files pdf =
  let pages = Pdfpage.pages_of_pagetree pdf in
  let page_sizes = flatten (map (size_page_files pdf) pages) in
  size_document_files pdf + sum page_sizes

View File

@ -22,3 +22,6 @@ val list_attached_files : Pdf.t -> attachment list
(** Dump attached files to a given directory. *)
val dump_attached_files : Pdf.t -> string -> unit
(** Total size in bytes of all attached files. This counts both the files in
    the document-level /EmbeddedFiles name tree and the files attached to
    pages through /FileAttachment annotations. Sizes are those of the stream
    data as held in the file's embedded file streams; no decoding is applied. *)
val size_attached_files : Pdf.t -> int

View File

@ -73,14 +73,11 @@ let find_composition_content_streams pdf i obj marked =
[i]
| _ -> []
(* Stub: ignores all of its arguments and always returns the empty list, so no
   object is ever classified as an embedded file by this function. *)
let find_composition_embedded_files pdf i obj marked = []
let find_composition pdf =
let marked = null_hash () in
let images = ref [] in
let fonts = ref [] in
let content_streams = ref [] in
let embedded_files = ref [] in
Pdf.objiter
(fun i obj ->
(*Printf.printf "Looking at object %i\n" i;
@ -89,13 +86,12 @@ let find_composition pdf =
Hashtbl.iter (fun k () -> Printf.printf "%i " k) marked;
Printf.printf "\n";*)
match Hashtbl.find marked i with _ -> () | exception Not_found ->
embedded_files := find_composition_embedded_files pdf i obj marked @ !embedded_files;
images := find_composition_images pdf i obj marked @ !images;
content_streams := find_composition_content_streams pdf i obj marked @ !content_streams;
fonts := find_composition_fonts pdf i obj marked @ !fonts)
pdf;
let structure_info = find_composition_structure_info pdf marked in
(!images, !fonts, !content_streams, structure_info, !embedded_files)
(!images, !fonts, !content_streams, structure_info)
(* [size pdf i] is the length of the string that
   [Pdfwrite.string_of_pdf_including_data] produces for object number [i] of
   [pdf]. *)
let size pdf i =
  let obj = Pdf.lookup_obj pdf i in
  let serialised = Pdfwrite.string_of_pdf_including_data obj in
  String.length serialised
@ -123,21 +119,21 @@ let compressed_xref_table_size pdf =
let show_composition_json filesize pdf =
let perc x = float_of_int x /. float_of_int filesize *. 100. in
let o_images, o_fonts, o_content_streams, o_structure_info, o_embedded_files = find_composition pdf in
let images, fonts, content_streams, structure_info, embedded_files, xref_table =
let o_images, o_fonts, o_content_streams, o_structure_info = find_composition pdf in
let images, fonts, content_streams, structure_info, attached_files, xref_table =
compressed_size pdf o_images,
compressed_size pdf o_fonts,
compressed_size pdf o_content_streams,
compressed_size pdf o_structure_info,
compressed_size pdf o_embedded_files,
Cpdfattach.size_attached_files pdf,
compressed_xref_table_size pdf
in
let r = images + fonts + content_streams + structure_info + embedded_files + xref_table in
let r = images + fonts + content_streams + structure_info + attached_files + xref_table in
`List [`Tuple [`String "Images"; `Int images; `Float (perc images)];
`Tuple [`String "Fonts"; `Int fonts; `Float (perc fonts)];
`Tuple [`String "Content streams"; `Int content_streams; `Float (perc content_streams)];
`Tuple [`String "Structure Info"; `Int structure_info; `Float (perc structure_info)];
`Tuple [`String "Embedded Files"; `Int embedded_files; `Float (perc embedded_files)];
`Tuple [`String "Attached Files"; `Int attached_files; `Float (perc attached_files)];
`Tuple [`String "XRef Table"; `Int xref_table; `Float (perc xref_table)];
`Tuple [`String "Unclassified"; `Int (filesize - r); `Float (perc (filesize - r))]]

View File

@ -5,7 +5,7 @@
%Document -list-annotations[-json] now obeys page range
%Document round-tripping of annotations, supersede -copy-annotations.
%Document -utf for JSON and mark -clean-strings as deprecated since can fail to round-trip binary strings which begin with a BOM?
%Document -composition[-json]
%Document -composition[-json] - mention residue may be negative
%Document discourage GhostScript usage, since it can strip data (-gs-malformed, embed missing fonts)
%Document [ ] pagespecs
%Document extensions to -info