Added -raw option for extract_images

This commit is contained in:
John Whitington 2023-11-10 13:46:52 +00:00
parent 1fe0c33924
commit 33c690343c
4 changed files with 17 additions and 15 deletions

View File

@ -4160,7 +4160,7 @@ let go () =
in
let pdf = get_single_pdf args.op true in
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
Cpdfimage.extract_images args.path_to_p2p args.path_to_im args.encoding args.dedup args.dedup_per_page pdf range output_spec
Cpdfimage.extract_images ~raw:(args.encoding = Cpdfmetadata.Raw) ?path_to_p2p:(match args.path_to_p2p with "" -> None | x -> Some x) ?path_to_im:(match args.path_to_im with "" -> None | x -> Some x) args.encoding args.dedup args.dedup_per_page pdf range output_spec
| Some (ImageResolution f) ->
let pdf = get_single_pdf args.op true in
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in

View File

@ -27,7 +27,7 @@ let write_stream name stream =
Pdfio.bytes_to_output_channel fh stream;
close_out fh
let write_image path_to_p2p path_to_im pdf resources name image =
let write_image ~raw ?path_to_p2p ?path_to_im pdf resources name image =
match Pdfimage.get_image_24bpp pdf resources image with
| Pdfimage.JPEG (stream, _) -> write_stream (name ^ ".jpg") stream
| Pdfimage.JPEG2000 (stream, _) -> write_stream (name ^ ".jpx") stream
@ -39,10 +39,11 @@ let write_image path_to_p2p path_to_im pdf resources name image =
pnm_to_channel_24 fh w h stream;
close_out fh;
begin match path_to_p2p with
| "" ->
| None ->
begin match path_to_im with
"" -> Pdfe.log "Neither pnm2png nor imagemagick found. Specify with -p2p or -im\n"
| _ ->
None ->
if not raw then Pdfe.log "Neither pnm2png nor imagemagick found. Specify with -p2p or -im\n"
| Some path_to_im ->
begin match
Sys.command (Filename.quote_command path_to_im [pnm; png])
with
@ -52,7 +53,7 @@ let write_image path_to_p2p path_to_im pdf resources name image =
Sys.remove pnm
end
end
| _ ->
| Some path_to_p2p ->
begin match
Sys.command (Filename.quote_command path_to_p2p ~stdout:png ["-gamma"; "0.45"; "-quiet"; pnm])
with
@ -67,16 +68,16 @@ let write_image path_to_p2p path_to_im pdf resources name image =
let written = ref []
let extract_images_inner path_to_p2p path_to_im encoding serial pdf resources stem pnum images =
let extract_images_inner ~raw ?path_to_p2p ?path_to_im encoding serial pdf resources stem pnum images =
let names = map
(fun _ ->
Cpdfbookmarks.name_of_spec
encoding [] pdf 0 (stem ^ "-p" ^ string_of_int pnum)
(let r = !serial in serial := !serial + 1; r) "" 0 0) (indx images)
in
iter2 (write_image path_to_p2p path_to_im pdf resources) names images
iter2 (write_image ~raw ?path_to_p2p ?path_to_im pdf resources) names images
let rec extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum form =
let rec extract_images_form_xobject ~raw ?path_to_p2p ?path_to_im encoding dedup dedup_per_page pdf serial stem pnum form =
let resources =
match Pdf.lookup_direct pdf "/Resources" form with
Some (Pdf.Dictionary d) -> Pdf.Dictionary d
@ -95,9 +96,9 @@ let rec extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_
written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written;
images
in
extract_images_inner path_to_p2p path_to_im encoding serial pdf resources stem pnum images
extract_images_inner ~raw ?path_to_p2p ?path_to_im encoding serial pdf resources stem pnum images
let extract_images path_to_p2p path_to_im encoding dedup dedup_per_page pdf range stem =
let extract_images ?(raw=false) ?path_to_p2p ?path_to_im encoding dedup dedup_per_page pdf range stem =
if dedup || dedup_per_page then written := [];
let pdf_pages = Pdfpage.pages_of_pagetree pdf in
let pages =
@ -119,8 +120,8 @@ let extract_images path_to_p2p path_to_im encoding dedup dedup_per_page pdf rang
if dedup || dedup_per_page then
written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written;
let forms = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Form")) xobjects in
extract_images_inner path_to_p2p path_to_im encoding serial pdf page.Pdfpage.resources stem pnum images;
iter (extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum) forms)
extract_images_inner ~raw ?path_to_p2p ?path_to_im encoding serial pdf page.Pdfpage.resources stem pnum images;
iter (extract_images_form_xobject ~raw ?path_to_p2p ?path_to_im encoding dedup dedup_per_page pdf serial stem pnum) forms)
pages
(indx pages)

View File

@ -1,8 +1,8 @@
(** Images *)
(** Extract images. *)
val extract_images : string ->
string ->
val extract_images : ?raw:bool -> ?path_to_p2p:string ->
?path_to_im:string ->
Cpdfmetadata.encoding -> bool -> bool -> Pdf.t -> int list -> string -> unit
(** Report image resolutions. *)

View File

@ -3,6 +3,7 @@
%Document -info-json, -page-info-json, -page-labels-json, -list-fonts-json
%Document subformat information
%Document -list-images[-json], -list-image-uses[-json], -image-resolution-json
%Document -raw for -extract-images
\documentclass{book}
% Edit here to produce cpdfmanual.pdf, cpdflibmanual.pdf, pycpdfmanual.pdf,
% dotnetcpdflibmanual.pdf, jcpdflibmanual.pdf jscpdflibmanual.pdf etc.