Deduplication for -extract-images

This commit is contained in:
John Whitington 2020-12-20 15:41:52 +00:00
parent 2960b5f092
commit 21ef7e8f99
1 changed files with 32 additions and 4 deletions

View File

@ -451,7 +451,9 @@ type args =
mutable jsonparsecontentstreams : bool; mutable jsonparsecontentstreams : bool;
mutable jsonnostreamdata : bool; mutable jsonnostreamdata : bool;
mutable ocgrenamefrom : string; mutable ocgrenamefrom : string;
mutable ocgrenameto : string} mutable ocgrenameto : string;
mutable dedup : bool;
mutable dedup_per_page : bool}
let args = let args =
{op = None; {op = None;
@ -553,7 +555,9 @@ let args =
jsonparsecontentstreams = false; jsonparsecontentstreams = false;
jsonnostreamdata = false; jsonnostreamdata = false;
ocgrenamefrom = ""; ocgrenamefrom = "";
ocgrenameto = ""} ocgrenameto = "";
dedup = false;
dedup_per_page = false}
let reset_arguments () = let reset_arguments () =
args.op <- None; args.op <- None;
@ -640,7 +644,9 @@ let reset_arguments () =
args.jsonparsecontentstreams <- false; args.jsonparsecontentstreams <- false;
args.jsonnostreamdata <- false; args.jsonnostreamdata <- false;
args.ocgrenamefrom <- ""; args.ocgrenamefrom <- "";
args.ocgrenameto <- "" args.ocgrenameto <- "";
args.dedup <- false;
args.dedup_per_page <- false
(* Do not reset original_filename or cpdflin or was_encrypted or (* Do not reset original_filename or cpdflin or was_encrypted or
* was_decrypted_with_owner or recrypt or producer or creator or path_to_* or * was_decrypted_with_owner or recrypt or producer or creator or path_to_* or
* gs_malformed or gs_quiet, since we want these to work across ANDs. Or * gs_malformed or gs_quiet, since we want these to work across ANDs. Or
@ -1497,6 +1503,12 @@ let setsqueezepagedata () =
let setsqueezerecompress () = let setsqueezerecompress () =
args.squeeze_recompress <- false args.squeeze_recompress <- false
let set_dedup () =
args.dedup <- true
let set_dedup_per_page () =
args.dedup_per_page <- true
let whingemalformed () = let whingemalformed () =
prerr_string "Command line must be of exactly the form\ncpdf <infile> -gs <path> -gs-malformed-force -o <outfile>\n"; prerr_string "Command line must be of exactly the form\ncpdf <infile> -gs <path> -gs-malformed-force -o <outfile>\n";
exit 1 exit 1
@ -2139,6 +2151,12 @@ and specs =
("-extract-images", ("-extract-images",
Arg.Unit (setop ExtractImages), Arg.Unit (setop ExtractImages),
" Extract images to file"); " Extract images to file");
("-dedup",
Arg.Unit set_dedup,
" Deduplicate extracted images fully");
("-dedup-perpage",
Arg.Unit set_dedup_per_page,
" Deduplicate extracted images per page only");
("-squeeze", Arg.Unit setsqueeze, " Squeeze"); ("-squeeze", Arg.Unit setsqueeze, " Squeeze");
("-squeeze-log-to", Arg.String setsqueezelogto, " Squeeze log location"); ("-squeeze-log-to", Arg.String setsqueezelogto, " Squeeze log location");
("-squeeze-no-pagedata", Arg.Unit setsqueezepagedata, " Don't recompress pages"); ("-squeeze-no-pagedata", Arg.Unit setsqueezepagedata, " Don't recompress pages");
@ -2840,6 +2858,8 @@ let write_image pdf resources name image =
| _ -> | _ ->
Printf.eprintf "Unsupported image type when extracting image %s " name Printf.eprintf "Unsupported image type when extracting image %s " name
let written = ref []
let extract_images_inner serial pdf resources stem pnum images = let extract_images_inner serial pdf resources stem pnum images =
let names = map let names = map
(fun _ -> (fun _ ->
@ -2861,11 +2881,16 @@ let rec extract_images_form_xobject pdf serial stem pnum form =
| Some (Pdf.Dictionary elts) -> map snd elts | Some (Pdf.Dictionary elts) -> map snd elts
| _ -> [] | _ -> []
in in
keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects (* Remove any already in !written. Add any remaining to !written, if !args.dedup or !args.dedup_page *)
let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in
let images, already_written = List.partition (function Pdf.Indirect n -> mem n !written | _ -> false) images in
written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written;
images
in in
extract_images_inner serial pdf resources stem pnum images extract_images_inner serial pdf resources stem pnum images
let extract_images pdf range stem = let extract_images pdf range stem =
if args.dedup || args.dedup_per_page then written := [];
let pdf_pages = Pdfpage.pages_of_pagetree pdf in let pdf_pages = Pdfpage.pages_of_pagetree pdf in
let pages = let pages =
option_map option_map
@ -2875,12 +2900,15 @@ let extract_images pdf range stem =
let serial = ref 0 in let serial = ref 0 in
iter2 iter2
(fun page pnum -> (fun page pnum ->
if args.dedup_per_page then written := [];
let xobjects = let xobjects =
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
| Some (Pdf.Dictionary elts) -> map snd elts | Some (Pdf.Dictionary elts) -> map snd elts
| _ -> [] | _ -> []
in in
let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in
let images, already_written = List.partition (function Pdf.Indirect n -> mem n !written | _ -> false) images in
written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written;
let forms = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Form")) xobjects in let forms = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Form")) xobjects in
extract_images_inner serial pdf page.Pdfpage.resources stem pnum images; extract_images_inner serial pdf page.Pdfpage.resources stem pnum images;
iter (extract_images_form_xobject pdf serial stem pnum) forms) iter (extract_images_form_xobject pdf serial stem pnum) forms)