Deduplication for -extract-images
This commit is contained in:
parent
2960b5f092
commit
21ef7e8f99
|
@ -451,7 +451,9 @@ type args =
|
||||||
mutable jsonparsecontentstreams : bool;
|
mutable jsonparsecontentstreams : bool;
|
||||||
mutable jsonnostreamdata : bool;
|
mutable jsonnostreamdata : bool;
|
||||||
mutable ocgrenamefrom : string;
|
mutable ocgrenamefrom : string;
|
||||||
mutable ocgrenameto : string}
|
mutable ocgrenameto : string;
|
||||||
|
mutable dedup : bool;
|
||||||
|
mutable dedup_per_page : bool}
|
||||||
|
|
||||||
let args =
|
let args =
|
||||||
{op = None;
|
{op = None;
|
||||||
|
@ -553,7 +555,9 @@ let args =
|
||||||
jsonparsecontentstreams = false;
|
jsonparsecontentstreams = false;
|
||||||
jsonnostreamdata = false;
|
jsonnostreamdata = false;
|
||||||
ocgrenamefrom = "";
|
ocgrenamefrom = "";
|
||||||
ocgrenameto = ""}
|
ocgrenameto = "";
|
||||||
|
dedup = false;
|
||||||
|
dedup_per_page = false}
|
||||||
|
|
||||||
let reset_arguments () =
|
let reset_arguments () =
|
||||||
args.op <- None;
|
args.op <- None;
|
||||||
|
@ -640,7 +644,9 @@ let reset_arguments () =
|
||||||
args.jsonparsecontentstreams <- false;
|
args.jsonparsecontentstreams <- false;
|
||||||
args.jsonnostreamdata <- false;
|
args.jsonnostreamdata <- false;
|
||||||
args.ocgrenamefrom <- "";
|
args.ocgrenamefrom <- "";
|
||||||
args.ocgrenameto <- ""
|
args.ocgrenameto <- "";
|
||||||
|
args.dedup <- false;
|
||||||
|
args.dedup_per_page <- false
|
||||||
(* Do not reset original_filename or cpdflin or was_encrypted or
|
(* Do not reset original_filename or cpdflin or was_encrypted or
|
||||||
* was_decrypted_with_owner or recrypt or producer or creator or path_to_* or
|
* was_decrypted_with_owner or recrypt or producer or creator or path_to_* or
|
||||||
* gs_malformed or gs_quiet, since we want these to work across ANDs. Or
|
* gs_malformed or gs_quiet, since we want these to work across ANDs. Or
|
||||||
|
@ -1497,6 +1503,12 @@ let setsqueezepagedata () =
|
||||||
let setsqueezerecompress () =
|
let setsqueezerecompress () =
|
||||||
args.squeeze_recompress <- false
|
args.squeeze_recompress <- false
|
||||||
|
|
||||||
|
(* Handler for the -dedup command line flag: sets args.dedup, which the image
   extraction code reads to decide whether already-written images are
   skipped. *)
let set_dedup () =
|
||||||
|
args.dedup <- true
|
||||||
|
|
||||||
|
(* Handler for the -dedup-perpage command line flag: sets args.dedup_per_page,
   which makes extract_images clear the record of written images at the start
   of every page. *)
let set_dedup_per_page () =
|
||||||
|
args.dedup_per_page <- true
|
||||||
|
|
||||||
let whingemalformed () =
|
let whingemalformed () =
|
||||||
prerr_string "Command line must be of exactly the form\ncpdf <infile> -gs <path> -gs-malformed-force -o <outfile>\n";
|
prerr_string "Command line must be of exactly the form\ncpdf <infile> -gs <path> -gs-malformed-force -o <outfile>\n";
|
||||||
exit 1
|
exit 1
|
||||||
|
@ -2139,6 +2151,12 @@ and specs =
|
||||||
("-extract-images",
|
("-extract-images",
|
||||||
Arg.Unit (setop ExtractImages),
|
Arg.Unit (setop ExtractImages),
|
||||||
" Extract images to file");
|
" Extract images to file");
|
||||||
|
("-dedup",
|
||||||
|
Arg.Unit set_dedup,
|
||||||
|
" Deduplicate extracted images fully");
|
||||||
|
("-dedup-perpage",
|
||||||
|
Arg.Unit set_dedup_per_page,
|
||||||
|
" Deduplicate extracted images per page only");
|
||||||
("-squeeze", Arg.Unit setsqueeze, " Squeeze");
|
("-squeeze", Arg.Unit setsqueeze, " Squeeze");
|
||||||
("-squeeze-log-to", Arg.String setsqueezelogto, " Squeeze log location");
|
("-squeeze-log-to", Arg.String setsqueezelogto, " Squeeze log location");
|
||||||
("-squeeze-no-pagedata", Arg.Unit setsqueezepagedata, " Don't recompress pages");
|
("-squeeze-no-pagedata", Arg.Unit setsqueezepagedata, " Don't recompress pages");
|
||||||
|
@ -2840,6 +2858,8 @@ let write_image pdf resources name image =
|
||||||
| _ ->
|
| _ ->
|
||||||
Printf.eprintf "Unsupported image type when extracting image %s " name
|
Printf.eprintf "Unsupported image type when extracting image %s " name
|
||||||
|
|
||||||
|
(* The [n] of every [Pdf.Indirect n] image XObject already selected for
   extraction. Consulted and extended by the image extraction functions, and
   emptied by extract_images when a dedup option is set. *)
let written = ref []
|
||||||
|
|
||||||
let extract_images_inner serial pdf resources stem pnum images =
|
let extract_images_inner serial pdf resources stem pnum images =
|
||||||
let names = map
|
let names = map
|
||||||
(fun _ ->
|
(fun _ ->
|
||||||
|
@ -2861,11 +2881,16 @@ let rec extract_images_form_xobject pdf serial stem pnum form =
|
||||||
| Some (Pdf.Dictionary elts) -> map snd elts
|
| Some (Pdf.Dictionary elts) -> map snd elts
|
||||||
| _ -> []
|
| _ -> []
|
||||||
in
|
in
|
||||||
keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects
|
(* Drop any image already recorded in !written. When args.dedup or
   args.dedup_per_page is set, record the remaining images in !written so
   that later occurrences are skipped. *)
let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in
(* List.partition returns (matching, non-matching): the first component is the
   images already written, the second is the ones we still need to extract.
   Direct (non-indirect) objects can never be recorded, so they are always kept. *)
let _, images = List.partition (function Pdf.Indirect n -> mem n !written | _ -> false) images in
(* Only remember images when deduplication was requested; otherwise !written
   would grow without ever being reset and silently suppress repeats. *)
if args.dedup || args.dedup_per_page then
  written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written;
images
|
||||||
in
|
in
|
||||||
extract_images_inner serial pdf resources stem pnum images
|
extract_images_inner serial pdf resources stem pnum images
|
||||||
|
|
||||||
let extract_images pdf range stem =
|
let extract_images pdf range stem =
|
||||||
|
if args.dedup || args.dedup_per_page then written := [];
|
||||||
let pdf_pages = Pdfpage.pages_of_pagetree pdf in
|
let pdf_pages = Pdfpage.pages_of_pagetree pdf in
|
||||||
let pages =
|
let pages =
|
||||||
option_map
|
option_map
|
||||||
|
@ -2875,12 +2900,15 @@ let extract_images pdf range stem =
|
||||||
let serial = ref 0 in
|
let serial = ref 0 in
|
||||||
iter2
|
iter2
|
||||||
(fun page pnum ->
|
(fun page pnum ->
|
||||||
|
if args.dedup_per_page then written := [];
|
||||||
let xobjects =
|
let xobjects =
|
||||||
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
|
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
|
||||||
| Some (Pdf.Dictionary elts) -> map snd elts
|
| Some (Pdf.Dictionary elts) -> map snd elts
|
||||||
| _ -> []
|
| _ -> []
|
||||||
in
|
in
|
||||||
let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in
|
let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in
|
||||||
|
(* List.partition returns (matching, non-matching): discard the images already
   recorded in !written and keep the rest for extraction. *)
let _, images = List.partition (function Pdf.Indirect n -> mem n !written | _ -> false) images in
(* Record what we are about to write only when deduplication was requested, so
   that plain -extract-images still writes every occurrence. *)
if args.dedup || args.dedup_per_page then
  written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written;
|
||||||
let forms = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Form")) xobjects in
|
let forms = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Form")) xobjects in
|
||||||
extract_images_inner serial pdf page.Pdfpage.resources stem pnum images;
|
extract_images_inner serial pdf page.Pdfpage.resources stem pnum images;
|
||||||
iter (extract_images_form_xobject pdf serial stem pnum) forms)
|
iter (extract_images_form_xobject pdf serial stem pnum) forms)
|
||||||
|
|
Loading…
Reference in New Issue