From cc74b3fc691724cc4b1d79f596b9348f42cc86e6 Mon Sep 17 00:00:00 2001
From: John Whitington <john@coherentgraphics.co.uk>
Date: Sun, 16 Apr 2023 14:44:34 +0100
Subject: [PATCH] more

---
 cpdfattach.ml      | 45 +++++++++++++++++++++++++++++++++++++++++++++
 cpdfattach.mli     |  3 +++
 cpdfcomposition.ml | 16 ++++++----------
 cpdfmanual.tex     |  2 +-
 4 files changed, 55 insertions(+), 11 deletions(-)

diff --git a/cpdfattach.ml b/cpdfattach.ml
index eac7739..82a0cee 100644
--- a/cpdfattach.ml
+++ b/cpdfattach.ml
@@ -302,3 +302,48 @@ let dump_attached_files pdf out =
     iter (dump_attached_page pdf out) (Pdfpage.pages_of_pagetree pdf)
   with
     e -> error (Printf.sprintf "Couldn't dump attached files: %s\n" (Printexc.to_string e))
+
+(* Stored (undecoded) size in bytes of one embedded file; 0 when the file
+   specification has no string /F entry. Fails via [error] if malformed. *)
+let size_attachment pdf (_, embeddedfile) =
+  let bad () = error "Bad embedded file stream" in
+  match Pdf.lookup_direct pdf "/F" embeddedfile with
+  | Some (Pdf.String _) ->
+      let ef = match Pdf.lookup_direct pdf "/EF" embeddedfile with Some d -> d | None -> bad () in
+      begin match Pdf.lookup_direct pdf "/F" ef with
+      | Some (Pdf.Stream _ as stream) ->
+          Pdf.getstream stream; (* force loading: the data may not be Pdf.Got yet *)
+          begin match stream with Pdf.Stream {contents = (_, Pdf.Got b)} -> bytes_size b | _ -> bad () end
+      | _ -> bad ()
+      end
+  | _ -> 0
+
+(* Sizes of the files attached to one page: one entry per /FileAttachment
+   annotation in the page's /Annots array which has an /FS entry. *)
+let size_page_files pdf page =
+  let is_file_attachment annot =
+    match Pdf.lookup_direct pdf "/Subtype" annot with
+    | Some (Pdf.Name "/FileAttachment") -> true
+    | _ -> false
+  in
+  let annots =
+    match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
+    | Some (Pdf.Array l) -> l
+    | _ -> []
+  in
+  let filespecs = option_map (Pdf.lookup_direct pdf "/FS") (keep is_file_attachment annots) in
+    (* The first tuple component is ignored by size_attachment. *)
+    map (fun filespec -> size_attachment pdf (0, filespec)) filespecs
+
+(* Combined size of the document-level attachments: the entries of the
+   /EmbeddedFiles name tree found under /Names in the root; 0 if absent. *)
+let size_document_files pdf =
+  let catalog = Pdf.lookup_obj pdf pdf.Pdf.root in
+  let names =
+    match Pdf.lookup_direct pdf "/Names" catalog with Some n -> n | None -> Pdf.Dictionary [] in
+  match Pdf.lookup_direct pdf "/EmbeddedFiles" names with
+  | None -> 0
+  | Some tree -> sum (map (size_attachment pdf) (Pdf.contents_of_nametree pdf tree))
+
+let size_attached_files pdf = (* document-level total plus the sizes of every page's attachments *)
+  size_document_files pdf + sum (flatten (map (size_page_files pdf) (Pdfpage.pages_of_pagetree pdf)))
diff --git a/cpdfattach.mli b/cpdfattach.mli
index 4e32872..a90e04d 100644
--- a/cpdfattach.mli
+++ b/cpdfattach.mli
@@ -22,3 +22,6 @@ val list_attached_files : Pdf.t -> attachment list
 
 (** Dump attached files to a given directory. *)
 val dump_attached_files : Pdf.t -> string -> unit
+
+(** Total size in bytes of all attached files, both document-level and page-level. *)
+val size_attached_files : Pdf.t -> int
diff --git a/cpdfcomposition.ml b/cpdfcomposition.ml
index aa9b9eb..77859ea 100644
--- a/cpdfcomposition.ml
+++ b/cpdfcomposition.ml
@@ -73,14 +73,11 @@ let find_composition_content_streams pdf i obj marked =
           [i]
       | _ -> []
 
-let find_composition_embedded_files pdf i obj marked = []
-
 let find_composition pdf =
   let marked = null_hash () in
   let images = ref [] in
   let fonts = ref [] in
   let content_streams = ref [] in
-  let embedded_files = ref [] in
     Pdf.objiter
       (fun i obj ->
         (*Printf.printf "Looking at object %i\n" i;
@@ -89,13 +86,12 @@ let find_composition pdf =
         Hashtbl.iter (fun k () -> Printf.printf "%i " k) marked;
         Printf.printf "\n";*)
          match Hashtbl.find marked i with _ -> () | exception Not_found ->
-           embedded_files := find_composition_embedded_files pdf i obj marked @ !embedded_files;
            images := find_composition_images pdf i obj marked @ !images;
            content_streams := find_composition_content_streams pdf i obj marked @ !content_streams;
            fonts := find_composition_fonts pdf i obj marked @ !fonts)
       pdf;
     let structure_info = find_composition_structure_info pdf marked in
-    (!images, !fonts, !content_streams, structure_info, !embedded_files)
+    (!images, !fonts, !content_streams, structure_info)
 
 let size pdf i =
   String.length (Pdfwrite.string_of_pdf_including_data (Pdf.lookup_obj pdf i))
@@ -123,21 +119,21 @@ let compressed_xref_table_size pdf =
 
 let show_composition_json filesize pdf =
   let perc x = float_of_int x /. float_of_int filesize *. 100. in
-  let o_images, o_fonts, o_content_streams, o_structure_info, o_embedded_files = find_composition pdf in
-  let images, fonts, content_streams, structure_info, embedded_files, xref_table =
+  let o_images, o_fonts, o_content_streams, o_structure_info = find_composition pdf in
+  let images, fonts, content_streams, structure_info, attached_files, xref_table =
       compressed_size pdf o_images,
       compressed_size pdf o_fonts,
       compressed_size pdf o_content_streams,
       compressed_size pdf o_structure_info,
-      compressed_size pdf o_embedded_files,
+      Cpdfattach.size_attached_files pdf, (* measured by Cpdfattach, not via compressed_size like the rows above *)
       compressed_xref_table_size pdf
   in
-  let r = images + fonts + content_streams + structure_info + embedded_files + xref_table in
+  let r = images + fonts + content_streams + structure_info + attached_files + xref_table in (* classified total; filesize - r is reported as "Unclassified" *)
     `List [`Tuple [`String "Images"; `Int images; `Float (perc images)];
            `Tuple [`String "Fonts"; `Int fonts; `Float (perc fonts)];
            `Tuple [`String "Content streams"; `Int content_streams; `Float (perc content_streams)];
            `Tuple [`String "Structure Info"; `Int structure_info; `Float (perc structure_info)];
-           `Tuple [`String "Embedded Files"; `Int embedded_files; `Float (perc embedded_files)];
+           `Tuple [`String "Attached Files"; `Int attached_files; `Float (perc attached_files)];
            `Tuple [`String "XRef Table"; `Int xref_table; `Float (perc xref_table)];
            `Tuple [`String "Unclassified"; `Int (filesize - r); `Float (perc (filesize - r))]]
 
diff --git a/cpdfmanual.tex b/cpdfmanual.tex
index 8c96c41..6b82f4f 100644
--- a/cpdfmanual.tex
+++ b/cpdfmanual.tex
@@ -5,7 +5,7 @@
 %Document -list-annotations[-json] now obey page range
 %Document round-tripping of annotations, supersede -copy-annotations.
 %Document -utf for JSON and mark -clean-strings as deprecated since can fail to round-trip binary strings which begin with a BOM?
-%Document -composition[-json]
+%Document -composition[-json] - mention residue may be negative
 %Document discourage GhostScript usage, since it can strip data (-gs-malformed, embed missing fonts)
 %Document [ ] pagespecs
 %Document extensions to -info