2023-04-13 17:51:11 +02:00
|
|
|
open Pdfutil
|
|
|
|
|
2023-04-14 20:44:00 +02:00
|
|
|
let find_composition_structure_info pdf marked =
|
|
|
|
match Pdf.lookup_obj pdf pdf.Pdf.root with
|
|
|
|
| Pdf.Dictionary d ->
|
|
|
|
begin match lookup "/StructTreeRoot" d with
|
|
|
|
| Some x ->
|
2023-04-15 14:25:35 +02:00
|
|
|
let l = ref [] in
|
2023-04-14 20:44:00 +02:00
|
|
|
let objs = Pdf.objects_referenced ["/Pg"] [] pdf x in
|
|
|
|
Printf.printf "Found %i struct tree items\n" (length objs);
|
|
|
|
iter
|
|
|
|
(fun i ->
|
|
|
|
match Hashtbl.find marked i with
|
|
|
|
| () -> ()
|
2023-04-15 14:25:35 +02:00
|
|
|
| exception Not_found -> l := i::!l; Hashtbl.add marked i ())
|
2023-04-14 20:44:00 +02:00
|
|
|
objs;
|
|
|
|
!l
|
2023-04-15 14:25:35 +02:00
|
|
|
| _ -> []
|
2023-04-14 20:44:00 +02:00
|
|
|
end
|
2023-04-15 14:25:35 +02:00
|
|
|
| _ -> []
|
2023-04-14 20:44:00 +02:00
|
|
|
|
2023-04-13 17:51:11 +02:00
|
|
|
let find_composition_images pdf i obj marked =
|
2023-04-15 14:25:35 +02:00
|
|
|
match Hashtbl.find marked i with () -> [] | exception Not_found ->
|
2023-04-13 17:51:11 +02:00
|
|
|
match Pdf.lookup_direct pdf "/Subtype" obj with
|
|
|
|
| Some (Pdf.Name "/Image") ->
|
2023-04-15 14:25:35 +02:00
|
|
|
Hashtbl.add marked i (); [i]
|
|
|
|
| _ -> []
|
2023-04-13 17:51:11 +02:00
|
|
|
|
2023-04-13 20:11:29 +02:00
|
|
|
(* If it has /Font, find all objects referenced from it, and add
|
|
|
|
any not already marked to the count *)
|
|
|
|
let find_composition_fonts pdf i obj marked =
|
2023-04-15 14:25:35 +02:00
|
|
|
match Hashtbl.find marked i with () -> [] | exception Not_found ->
|
|
|
|
let l = ref [] in
|
2023-04-13 20:11:29 +02:00
|
|
|
match Pdf.lookup_direct pdf "/Type" obj with
|
|
|
|
| Some (Pdf.Name "/Font") ->
|
|
|
|
iter
|
|
|
|
(fun i ->
|
2023-04-13 21:23:59 +02:00
|
|
|
(*Printf.printf "Object %i\n%s\n" i (Pdfwrite.string_of_pdf (Pdf.lookup_obj pdf i));*)
|
2023-04-13 20:11:29 +02:00
|
|
|
match Hashtbl.find marked i with
|
|
|
|
| () -> ()
|
2023-04-15 14:25:35 +02:00
|
|
|
| exception Not_found -> l := i::!l; Hashtbl.add marked i ())
|
2023-04-13 20:11:29 +02:00
|
|
|
(Pdf.objects_referenced [] [] pdf (Pdf.Indirect i));
|
|
|
|
!l
|
2023-04-15 14:25:35 +02:00
|
|
|
| _ -> []
|
2023-04-13 17:51:11 +02:00
|
|
|
|
|
|
|
let find_composition_content_streams pdf i obj marked =
|
2023-04-15 14:25:35 +02:00
|
|
|
match Hashtbl.find marked i with () -> [] | exception Not_found ->
|
2023-04-13 17:51:11 +02:00
|
|
|
match Pdf.lookup_direct pdf "/Type" obj with
|
|
|
|
| Some (Pdf.Name "/Page") ->
|
2023-04-13 21:23:59 +02:00
|
|
|
(*Printf.printf "Found a page...%s\n" (Pdfwrite.string_of_pdf (Pdf.direct pdf obj));*)
|
2023-04-13 17:51:11 +02:00
|
|
|
let cs =
|
2023-04-13 21:23:59 +02:00
|
|
|
match obj with Pdf.Dictionary d ->
|
|
|
|
begin match lookup "/Contents" d with
|
|
|
|
| Some (Pdf.Indirect i) -> [i]
|
|
|
|
| Some (Pdf.Array is) -> option_map (function Pdf.Indirect i -> Some i | _ -> None) is
|
|
|
|
| _ -> []
|
|
|
|
end
|
2023-04-13 17:51:11 +02:00
|
|
|
| _ -> []
|
|
|
|
in
|
2023-04-13 21:23:59 +02:00
|
|
|
(*Printf.printf "Found %i content streams\n" (length cs);*)
|
2023-04-15 14:25:35 +02:00
|
|
|
let l = ref [] in
|
2023-04-13 17:51:11 +02:00
|
|
|
iter
|
|
|
|
(fun i ->
|
2023-04-13 21:23:59 +02:00
|
|
|
(*Printf.printf "Considering content stream %i\n" i;*)
|
2023-04-13 20:11:29 +02:00
|
|
|
match Hashtbl.find marked i with
|
|
|
|
| () -> ()
|
2023-04-15 14:25:35 +02:00
|
|
|
| exception Not_found -> Hashtbl.add marked i (); l := i::!l)
|
2023-04-13 17:51:11 +02:00
|
|
|
cs;
|
|
|
|
!l
|
2023-04-14 18:13:49 +02:00
|
|
|
| _ ->
|
|
|
|
match Pdf.lookup_direct pdf "/Subtype" obj with
|
|
|
|
| Some (Pdf.Name "/Form") ->
|
|
|
|
Hashtbl.add marked i ();
|
2023-04-15 14:25:35 +02:00
|
|
|
[i]
|
|
|
|
| _ -> []
|
2023-04-13 17:51:11 +02:00
|
|
|
|
2023-04-15 14:25:35 +02:00
|
|
|
let find_composition_embedded_files pdf i obj marked = []
|
2023-04-13 17:51:11 +02:00
|
|
|
|
|
|
|
let find_composition pdf =
|
|
|
|
let marked = null_hash () in
|
2023-04-15 14:25:35 +02:00
|
|
|
let images = ref [] in
|
|
|
|
let fonts = ref [] in
|
|
|
|
let content_streams = ref [] in
|
|
|
|
let embedded_files = ref [] in
|
2023-04-13 17:51:11 +02:00
|
|
|
Pdf.objiter
|
|
|
|
(fun i obj ->
|
2023-04-14 18:13:49 +02:00
|
|
|
(*Printf.printf "Looking at object %i\n" i;
|
|
|
|
Printf.printf "Which is %s\n" (Pdfwrite.string_of_pdf (Pdf.lookup_obj pdf i));
|
|
|
|
Printf.printf "Marked objects at beginning: ";
|
2023-04-13 21:23:59 +02:00
|
|
|
Hashtbl.iter (fun k () -> Printf.printf "%i " k) marked;
|
|
|
|
Printf.printf "\n";*)
|
2023-04-13 17:51:11 +02:00
|
|
|
match Hashtbl.find marked i with _ -> () | exception Not_found ->
|
2023-04-15 14:25:35 +02:00
|
|
|
embedded_files := find_composition_embedded_files pdf i obj marked @ !embedded_files;
|
|
|
|
images := find_composition_images pdf i obj marked @ !images;
|
|
|
|
content_streams := find_composition_content_streams pdf i obj marked @ !content_streams;
|
|
|
|
fonts := find_composition_fonts pdf i obj marked @ !fonts)
|
2023-04-13 17:51:11 +02:00
|
|
|
pdf;
|
2023-04-14 20:44:00 +02:00
|
|
|
let structure_info = find_composition_structure_info pdf marked in
|
|
|
|
(!images, !fonts, !content_streams, structure_info, !embedded_files)
|
2023-04-13 17:51:11 +02:00
|
|
|
|
2023-04-15 14:25:35 +02:00
|
|
|
let size pdf i =
|
|
|
|
String.length (Pdfwrite.string_of_pdf_including_data (Pdf.lookup_obj pdf i))
|
|
|
|
|
|
|
|
let compressed_size pdf objnums =
|
|
|
|
if objnums = [] then 0 else
|
|
|
|
(* If there were object streams, assume objects were in them, and compressed with FlateDecode *)
|
|
|
|
if Hashtbl.length pdf.Pdf.objects.Pdf.object_stream_ids = 0 then
|
|
|
|
sum (map (size pdf) (setify objnums))
|
|
|
|
else
|
|
|
|
let b = Buffer.create 262144 in
|
|
|
|
let streams = ref 0 in
|
|
|
|
iter
|
|
|
|
(fun i ->
|
|
|
|
match Pdf.lookup_obj pdf i with
|
|
|
|
| Pdf.Stream _ -> streams += size pdf i
|
|
|
|
| obj -> Buffer.add_string b (Pdfwrite.string_of_pdf_including_data obj))
|
|
|
|
objnums;
|
|
|
|
!streams + Pdfio.bytes_size (Pdfcodec.encode_flate (Pdfio.bytes_of_string (Buffer.contents b)))
|
|
|
|
|
2023-04-13 17:51:11 +02:00
|
|
|
(* First go: images, fonts, content streams, structure info, link annotations, embedded files *)
|
|
|
|
let show_composition_json filesize pdf =
|
|
|
|
let perc x = float_of_int x /. float_of_int filesize *. 100. in
|
2023-04-15 14:25:35 +02:00
|
|
|
let o_images, o_fonts, o_content_streams, o_structure_info, o_embedded_files = find_composition pdf in
|
|
|
|
let images, fonts, content_streams, structure_info, embedded_files =
|
|
|
|
compressed_size pdf o_images,
|
|
|
|
compressed_size pdf o_fonts,
|
|
|
|
compressed_size pdf o_content_streams,
|
|
|
|
compressed_size pdf o_structure_info,
|
|
|
|
compressed_size pdf o_embedded_files
|
|
|
|
in
|
2023-04-14 20:44:00 +02:00
|
|
|
let r = images + fonts + content_streams + structure_info + embedded_files in
|
2023-04-13 17:51:11 +02:00
|
|
|
`List [`Tuple [`String "Images"; `Int images; `Float (perc images)];
|
|
|
|
`Tuple [`String "Fonts"; `Int fonts; `Float (perc fonts)];
|
|
|
|
`Tuple [`String "Content streams"; `Int content_streams; `Float (perc content_streams)];
|
|
|
|
`Tuple [`String "Structure Info"; `Int structure_info; `Float (perc structure_info)];
|
|
|
|
`Tuple [`String "Embedded Files"; `Int embedded_files; `Float (perc embedded_files)];
|
|
|
|
`Tuple [`String "Unclassified"; `Int (filesize - r); `Float (perc (filesize - r))]]
|
|
|
|
|
|
|
|
let show_composition filesize json pdf =
|
|
|
|
let module J = Cpdfyojson.Safe in
|
|
|
|
let j = show_composition_json filesize pdf in
|
|
|
|
if json then (flprint (J.pretty_to_string j); flprint "\n") else
|
|
|
|
match j with
|
|
|
|
| `List js ->
|
2023-04-13 20:11:29 +02:00
|
|
|
iter
|
|
|
|
(function
|
|
|
|
| `Tuple [`String a; `Int b; `Float c] -> Printf.printf "%s: %i bytes (%.2f%%)\n" a b c
|
|
|
|
| _ -> ())
|
|
|
|
js
|
2023-04-13 17:51:11 +02:00
|
|
|
| _ -> ()
|