Added -raw option for extract_images

This commit is contained in:
John Whitington 2023-11-10 13:46:52 +00:00
parent 1fe0c33924
commit 33c690343c
4 changed files with 17 additions and 15 deletions

View File

@@ -4160,7 +4160,7 @@ let go () =
in in
let pdf = get_single_pdf args.op true in let pdf = get_single_pdf args.op true in
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
Cpdfimage.extract_images args.path_to_p2p args.path_to_im args.encoding args.dedup args.dedup_per_page pdf range output_spec Cpdfimage.extract_images ~raw:(args.encoding = Cpdfmetadata.Raw) ?path_to_p2p:(match args.path_to_p2p with "" -> None | x -> Some x) ?path_to_im:(match args.path_to_im with "" -> None | x -> Some x) args.encoding args.dedup args.dedup_per_page pdf range output_spec
| Some (ImageResolution f) -> | Some (ImageResolution f) ->
let pdf = get_single_pdf args.op true in let pdf = get_single_pdf args.op true in
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in

View File

@@ -27,7 +27,7 @@ let write_stream name stream =
Pdfio.bytes_to_output_channel fh stream; Pdfio.bytes_to_output_channel fh stream;
close_out fh close_out fh
let write_image path_to_p2p path_to_im pdf resources name image = let write_image ~raw ?path_to_p2p ?path_to_im pdf resources name image =
match Pdfimage.get_image_24bpp pdf resources image with match Pdfimage.get_image_24bpp pdf resources image with
| Pdfimage.JPEG (stream, _) -> write_stream (name ^ ".jpg") stream | Pdfimage.JPEG (stream, _) -> write_stream (name ^ ".jpg") stream
| Pdfimage.JPEG2000 (stream, _) -> write_stream (name ^ ".jpx") stream | Pdfimage.JPEG2000 (stream, _) -> write_stream (name ^ ".jpx") stream
@@ -39,10 +39,11 @@ let write_image path_to_p2p path_to_im pdf resources name image =
pnm_to_channel_24 fh w h stream; pnm_to_channel_24 fh w h stream;
close_out fh; close_out fh;
begin match path_to_p2p with begin match path_to_p2p with
| "" -> | None ->
begin match path_to_im with begin match path_to_im with
"" -> Pdfe.log "Neither pnm2png nor imagemagick found. Specify with -p2p or -im\n" None ->
| _ -> if not raw then Pdfe.log "Neither pnm2png nor imagemagick found. Specify with -p2p or -im\n"
| Some path_to_im ->
begin match begin match
Sys.command (Filename.quote_command path_to_im [pnm; png]) Sys.command (Filename.quote_command path_to_im [pnm; png])
with with
@@ -52,7 +53,7 @@ let write_image path_to_p2p path_to_im pdf resources name image =
Sys.remove pnm Sys.remove pnm
end end
end end
| _ -> | Some path_to_p2p ->
begin match begin match
Sys.command (Filename.quote_command path_to_p2p ~stdout:png ["-gamma"; "0.45"; "-quiet"; pnm]) Sys.command (Filename.quote_command path_to_p2p ~stdout:png ["-gamma"; "0.45"; "-quiet"; pnm])
with with
@@ -67,16 +68,16 @@ let write_image path_to_p2p path_to_im pdf resources name image =
let written = ref [] let written = ref []
let extract_images_inner path_to_p2p path_to_im encoding serial pdf resources stem pnum images = let extract_images_inner ~raw ?path_to_p2p ?path_to_im encoding serial pdf resources stem pnum images =
let names = map let names = map
(fun _ -> (fun _ ->
Cpdfbookmarks.name_of_spec Cpdfbookmarks.name_of_spec
encoding [] pdf 0 (stem ^ "-p" ^ string_of_int pnum) encoding [] pdf 0 (stem ^ "-p" ^ string_of_int pnum)
(let r = !serial in serial := !serial + 1; r) "" 0 0) (indx images) (let r = !serial in serial := !serial + 1; r) "" 0 0) (indx images)
in in
iter2 (write_image path_to_p2p path_to_im pdf resources) names images iter2 (write_image ~raw ?path_to_p2p ?path_to_im pdf resources) names images
let rec extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum form = let rec extract_images_form_xobject ~raw ?path_to_p2p ?path_to_im encoding dedup dedup_per_page pdf serial stem pnum form =
let resources = let resources =
match Pdf.lookup_direct pdf "/Resources" form with match Pdf.lookup_direct pdf "/Resources" form with
Some (Pdf.Dictionary d) -> Pdf.Dictionary d Some (Pdf.Dictionary d) -> Pdf.Dictionary d
@@ -95,9 +96,9 @@ let rec extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_
written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written; written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written;
images images
in in
extract_images_inner path_to_p2p path_to_im encoding serial pdf resources stem pnum images extract_images_inner ~raw ?path_to_p2p ?path_to_im encoding serial pdf resources stem pnum images
let extract_images path_to_p2p path_to_im encoding dedup dedup_per_page pdf range stem = let extract_images ?(raw=false) ?path_to_p2p ?path_to_im encoding dedup dedup_per_page pdf range stem =
if dedup || dedup_per_page then written := []; if dedup || dedup_per_page then written := [];
let pdf_pages = Pdfpage.pages_of_pagetree pdf in let pdf_pages = Pdfpage.pages_of_pagetree pdf in
let pages = let pages =
@@ -119,8 +120,8 @@ let extract_images path_to_p2p path_to_im encoding dedup dedup_per_page pdf rang
if dedup || dedup_per_page then if dedup || dedup_per_page then
written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written; written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written;
let forms = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Form")) xobjects in let forms = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Form")) xobjects in
extract_images_inner path_to_p2p path_to_im encoding serial pdf page.Pdfpage.resources stem pnum images; extract_images_inner ~raw ?path_to_p2p ?path_to_im encoding serial pdf page.Pdfpage.resources stem pnum images;
iter (extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum) forms) iter (extract_images_form_xobject ~raw ?path_to_p2p ?path_to_im encoding dedup dedup_per_page pdf serial stem pnum) forms)
pages pages
(indx pages) (indx pages)

View File

@@ -1,8 +1,8 @@
(** Images *) (** Images *)
(** Extract images. *) (** Extract images. *)
val extract_images : string -> val extract_images : ?raw:bool -> ?path_to_p2p:string ->
string -> ?path_to_im:string ->
Cpdfmetadata.encoding -> bool -> bool -> Pdf.t -> int list -> string -> unit Cpdfmetadata.encoding -> bool -> bool -> Pdf.t -> int list -> string -> unit
(** Report image resolutions. *) (** Report image resolutions. *)

View File

@@ -3,6 +3,7 @@
%Document -info-json, -page-info-json, -page-labels-json, -list-fonts-json %Document -info-json, -page-info-json, -page-labels-json, -list-fonts-json
%Document subformat information %Document subformat information
%Document -list-images[-json], -list-image-uses[-json], -image-resolution-json %Document -list-images[-json], -list-image-uses[-json], -image-resolution-json
%Document -raw for -extract-images
\documentclass{book} \documentclass{book}
% Edit here to produce cpdfmanual.pdf, cpdflibmanual.pdf, pycpdfmanual.pdf, % Edit here to produce cpdfmanual.pdf, cpdflibmanual.pdf, pycpdfmanual.pdf,
% dotnetcpdflibmanual.pdf, jcpdflibmanual.pdf jscpdflibmanual.pdf etc. % dotnetcpdflibmanual.pdf, jcpdflibmanual.pdf jscpdflibmanual.pdf etc.