Tentative extract images-within-xobjects-within-xobjects work

This commit is contained in:
John Whitington
2024-11-21 18:47:45 +00:00
parent c63c539daa
commit 0ccd603163

View File

@ -136,25 +136,28 @@ let extract_images_inner ~raw ?path_to_p2p ?path_to_im encoding serial pdf resou
in in
iter2 (write_image ~raw ?path_to_p2p ?path_to_im pdf resources) names images iter2 (write_image ~raw ?path_to_p2p ?path_to_im pdf resources) names images
(* Extract the images referenced by a form XObject, including images held in
   form XObjects nested (to any depth) inside it.

   [form] is the form XObject to inspect. Its /Resources dictionary is looked
   up (an empty dictionary is used when there is none), and each entry of the
   /XObject dictionary inside it is classified by /Subtype:
     - /Image entries are handed to [extract_images_inner];
     - /Form entries are processed recursively, before this form's own images;
     - anything else is ignored.

   When [dedup] or [dedup_per_page] is set, the object numbers of the images
   about to be written are added to [!written]; images whose object number is
   already in [!written] are always skipped.

   The remaining arguments are passed through unchanged to
   [extract_images_inner]. *)
let extract_images_form_xobject ~raw ?path_to_p2p ?path_to_im encoding dedup dedup_per_page pdf serial stem pnum form =
  let has_subtype name o =
    Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name name)
  in
  (* [ancestors] holds the object numbers of the indirect forms on the current
     recursion path. It is used to stop a malformed file, in which a form
     refers back to itself or to an enclosing form, from recursing forever. *)
  let rec walk ancestors form =
    let resources =
      match Pdf.lookup_direct pdf "/Resources" form with
      | Some (Pdf.Dictionary d) -> Pdf.Dictionary d
      | _ -> Pdf.Dictionary []
    in
    let xobjects =
      match Pdf.lookup_direct pdf "/XObject" resources with
      | Some (Pdf.Dictionary elts) -> map snd elts
      | _ -> []
    in
    let images, others = List.partition (has_subtype "/Image") xobjects in
    (* Only genuine form XObjects can contain further images. *)
    let forms = List.filter (has_subtype "/Form") others in
    (* Drop any image already in !written. Direct (non-indirect) images have
       no object number and so can never be recognised as duplicates. *)
    let images =
      List.filter
        (function Pdf.Indirect n -> not (mem n !written) | _ -> true)
        images
    in
    (* Record the remaining images in !written when deduplicating. *)
    if dedup || dedup_per_page then
      written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written;
    let descend subform =
      match subform with
      | Pdf.Indirect n ->
          if not (mem n ancestors) then walk (n :: ancestors) subform
      | _ -> walk ancestors subform
    in
    iter descend forms;
    extract_images_inner ~raw ?path_to_p2p ?path_to_im encoding serial pdf resources stem pnum images
  in
  walk (match form with Pdf.Indirect n -> [n] | _ -> []) form
let extract_images ?(raw=false) ?path_to_p2p ?path_to_im encoding dedup dedup_per_page pdf range stem = let extract_images ?(raw=false) ?path_to_p2p ?path_to_im encoding dedup dedup_per_page pdf range stem =