cpdf-source/cpdfimage.ml

309 lines
13 KiB
OCaml
Raw Normal View History

2021-12-21 15:57:42 +01:00
open Pdfutil
open Pdfio
(* Extract Images. *)
let pnm_to_channel_24 channel w h s =
let white () = output_char channel ' '
and newline () = output_char channel '\n'
2022-04-09 14:13:12 +02:00
and output_string = Stdlib.output_string channel in
2021-12-21 15:57:42 +01:00
output_string "P6";
white ();
output_string (string_of_int w);
white ();
output_string (string_of_int h);
white ();
output_string "255";
newline ();
let pos = ref 0 in
for y = 1 to h do
for x = 1 to w * 3 do
output_byte channel (bget s !pos);
incr pos
done
done
2023-03-22 20:29:12 +01:00
(* FIXME do this all at once *)
2021-12-21 15:57:42 +01:00
let write_stream name stream =
let fh = open_out_bin name in
for x = 0 to bytes_size stream - 1 do
output_byte fh (bget stream x)
done;
close_out fh
2023-03-22 20:29:12 +01:00
(* Detect images we can write directly as PNGs, to avoid going through pnm2png or imagemagick.
This is when BPC = 8, colourspace = DeviceRGB or CalRGB, compression is /FlateDecode. *)
let write_image_png pdf resources name dict =
(*Printf.printf "%s\n" (Pdfwrite.string_of_pdf (Pdf.direct pdf dict));*)
match
Pdfimage.colspace pdf dict resources,
Pdfimage.bpc pdf dict,
Pdf.lookup_direct pdf "/Filter" dict
with
| (Pdfspace.DeviceRGB | Pdfspace.CalRGB _),
Some (Pdf.Integer 8),
Some (Pdf.Name "/FlateDecode" | Pdf.Array [Pdf.Name "/FlateDecode"]) ->
(*Printf.printf "Direct to png...\n";*)
Pdf.getstream (Pdf.direct pdf dict);
let ch = open_out_bin (name ^ ".png") in
let o = Pdfio.output_of_channel ch in
let width = match Pdf.lookup_direct pdf "/Width" dict with Some (Pdf.Integer x) -> x | _ -> raise Exit in
let height = match Pdf.lookup_direct pdf "/Height" dict with Some (Pdf.Integer x) -> x | _ -> raise Exit in
let idat = match Pdf.direct pdf dict with Pdf.Stream {contents = (_, Got bytes)} -> bytes | _ -> raise Exit in
(*Printf.printf "all ok...\n";*)
Cpdfpng.write_png {width; height; idat} o;
close_out ch;
true
| _ -> false
2021-12-21 15:57:42 +01:00
let write_image path_to_p2p path_to_im pdf resources name image =
2023-03-22 20:29:12 +01:00
let main () =
begin match Pdfimage.get_image_24bpp pdf resources image with
| Pdfimage.JPEG (stream, _) -> write_stream (name ^ ".jpg") stream
| Pdfimage.JPEG2000 (stream, _) -> write_stream (name ^ ".jpx") stream
| Pdfimage.JBIG2 (stream, _) -> write_stream (name ^ ".jbig2") stream
| Pdfimage.Raw (w, h, Pdfimage.BPP24, stream) ->
let pnm = name ^ ".pnm" in
let png = name ^ ".png" in
let fh = open_out_bin pnm in
pnm_to_channel_24 fh w h stream;
close_out fh;
begin match path_to_p2p with
| "" ->
begin match path_to_im with
"" -> Printf.eprintf "Neither pnm2png nor imagemagick found. Specify with -p2p or -im\n%!"
| _ ->
begin match
Sys.command (Filename.quote_command path_to_im [pnm; png])
with
0 -> Sys.remove pnm
| _ ->
Printf.eprintf "Call to imagemagick failed: did you specify -p2p or -im correctly?\n%!";
Sys.remove pnm
end
end
2021-12-21 15:57:42 +01:00
| _ ->
begin match
2023-03-22 20:29:12 +01:00
Sys.command (Filename.quote_command path_to_p2p ~stdout:png ["-gamma"; "0.45"; "-quiet"; pnm])
2021-12-21 15:57:42 +01:00
with
2023-03-22 20:29:12 +01:00
| 0 -> Sys.remove pnm
| _ ->
Printf.eprintf "Call to pnmtopng failed: did you specify -p2p correctly?\n%!";
Sys.remove pnm
2021-12-21 15:57:42 +01:00
end
end
2023-03-22 20:29:12 +01:00
| _ ->
Printf.eprintf "Unsupported image type when extracting image %s %!" name
end
in
match write_image_png pdf resources name image with
| true -> ()
| exception x -> Printf.printf "Failed to write PNG directly (%s)\n" (Printexc.to_string x); main ()
| _ -> main ()
2021-12-21 15:57:42 +01:00
let written = ref []
let extract_images_inner path_to_p2p path_to_im encoding serial pdf resources stem pnum images =
let names = map
(fun _ ->
Cpdfbookmarks.name_of_spec
encoding [] pdf 0 (stem ^ "-p" ^ string_of_int pnum)
(let r = !serial in serial := !serial + 1; r) "" 0 0) (indx images)
in
iter2 (write_image path_to_p2p path_to_im pdf resources) names images
let rec extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum form =
let resources =
match Pdf.lookup_direct pdf "/Resources" form with
Some (Pdf.Dictionary d) -> Pdf.Dictionary d
| _ -> Pdf.Dictionary []
in
let images =
let xobjects =
match Pdf.lookup_direct pdf "/XObject" resources with
| Some (Pdf.Dictionary elts) -> map snd elts
| _ -> []
in
(* Remove any already in !written. Add any remaining to !written, if !args.dedup or !args.dedup_page *)
let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in
let already_written, images = List.partition (function Pdf.Indirect n -> mem n !written | _ -> false) images in
if dedup || dedup_per_page then
written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written;
images
in
extract_images_inner path_to_p2p path_to_im encoding serial pdf resources stem pnum images
let extract_images path_to_p2p path_to_im encoding dedup dedup_per_page pdf range stem =
if dedup || dedup_per_page then written := [];
let pdf_pages = Pdfpage.pages_of_pagetree pdf in
let pages =
option_map
(function (i, pdf_pages) -> if mem i range then Some pdf_pages else None)
(combine (indx pdf_pages) pdf_pages)
in
let serial = ref 0 in
iter2
(fun page pnum ->
if dedup_per_page then written := [];
let xobjects =
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
| Some (Pdf.Dictionary elts) -> map snd elts
| _ -> []
in
let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in
let already_written, images = List.partition (function Pdf.Indirect n -> mem n !written | _ -> false) images in
if dedup || dedup_per_page then
written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written;
let forms = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Form")) xobjects in
extract_images_inner path_to_p2p path_to_im encoding serial pdf page.Pdfpage.resources stem pnum images;
iter (extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum) forms)
pages
(indx pages)
2021-12-21 16:25:59 +01:00
(* Image resolution *)
type xobj =
| Image of int * int (* width, height *)
| Form of Pdftransform.transform_matrix * Pdf.pdfobject * Pdf.pdfobject (* Will add actual data later. *)
let image_results = ref []
let add_image_result i =
image_results := i::!image_results
(* Given a page and a list of (pagenum, name, thing) *)
let rec image_resolution_page pdf page pagenum dpi (images : (int * string * xobj) list) =
try
let pageops = Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content
and transform = ref [ref Pdftransform.i_matrix] in
iter
(function
| Pdfops.Op_cm matrix ->
begin match !transform with
| [] -> raise (Failure "no transform")
| _ -> (hd !transform) := Pdftransform.matrix_compose !(hd !transform) matrix
end
| Pdfops.Op_Do xobject ->
let trans (x, y) =
match !transform with
| [] -> raise (Failure "no transform")
| _ -> Pdftransform.transform_matrix !(hd !transform) (x, y)
in
let o = trans (0., 0.)
and x = trans (1., 0.)
and y = trans (0., 1.)
in
(*i Printf.printf "o = %f, %f, x = %f, %f, y = %f, %f\n" (fst o) (snd o) (fst x) (snd x) (fst y) (snd y); i*)
let rec lookup_image k = function
| [] -> assert false
| (_, a, _) as h::_ when a = k -> h
| _::t -> lookup_image k t
in
begin match lookup_image xobject images with
| (pagenum, name, Form (xobj_matrix, content, resources)) ->
let content =
(* Add in matrix etc. *)
let total_matrix = Pdftransform.matrix_compose xobj_matrix !(hd !transform) in
let ops =
Pdfops.Op_cm total_matrix::
Pdfops.parse_operators pdf resources [content]
in
Pdfops.stream_of_ops ops
in
let page =
{Pdfpage.content = [content];
Pdfpage.mediabox = Pdfpage.rectangle_of_paper Pdfpaper.a4;
Pdfpage.resources = resources;
Pdfpage.rotate = Pdfpage.Rotate0;
Pdfpage.rest = Pdf.Dictionary []}
in
let newpdf = Pdfpage.change_pages false pdf [page] in
image_resolution newpdf [pagenum] dpi
| (pagenum, name, Image (w, h)) ->
2023-04-11 14:50:17 +02:00
let lx = Pdfunits.points (distance_between o x) Pdfunits.Inch in
let ly = Pdfunits.points (distance_between o y) Pdfunits.Inch in
2021-12-21 16:25:59 +01:00
let wdpi = float w /. lx
and hdpi = float h /. ly in
add_image_result (pagenum, xobject, w, h, wdpi, hdpi)
(*Printf.printf "%i, %s, %i, %i, %f, %f\n" pagenum xobject w h wdpi hdpi*)
(*i else
Printf.printf "S %i, %s, %i, %i, %f, %f\n" pagenum xobject (int_of_float w) (int_of_float h) wdpi hdpi i*)
end
| Pdfops.Op_q ->
begin match !transform with
| [] -> raise (Failure "Unbalanced q/Q ops")
| h::t ->
let h' = ref Pdftransform.i_matrix in
h' := !h;
transform := h'::h::t
end
| Pdfops.Op_Q ->
begin match !transform with
| [] -> raise (Failure "Unbalanced q/Q ops")
| _ -> transform := tl !transform
end
| _ -> ())
pageops
with
e -> Printf.printf "Error %s\n" (Printexc.to_string e); flprint "\n"
and image_resolution pdf range dpi =
let images = ref [] in
Cpdfpage.iter_pages
(fun pagenum page ->
(* 1. Get all image names and their native resolutions from resources as string * int * int *)
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
| Some (Pdf.Dictionary xobjects) ->
iter
(function (name, xobject) ->
match Pdf.lookup_direct pdf "/Subtype" xobject with
| Some (Pdf.Name "/Image") ->
let width =
match Pdf.lookup_direct pdf "/Width" xobject with
2022-07-14 15:06:25 +02:00
| Some x -> Pdf.getnum pdf x
2021-12-21 16:25:59 +01:00
| None -> 1.
and height =
match Pdf.lookup_direct pdf "/Height" xobject with
2022-07-14 15:06:25 +02:00
| Some x -> Pdf.getnum pdf x
2021-12-21 16:25:59 +01:00
| None -> 1.
in
images := (pagenum, name, Image (int_of_float width, int_of_float height))::!images
| Some (Pdf.Name "/Form") ->
let resources =
match Pdf.lookup_direct pdf "/Resources" xobject with
| None -> page.Pdfpage.resources (* Inherit from page or form above. *)
| Some r -> r
and contents =
xobject
and matrix =
match Pdf.lookup_direct pdf "/Matrix" xobject with
| Some (Pdf.Array [a; b; c; d; e; f]) ->
2022-07-14 15:06:25 +02:00
{Pdftransform.a = Pdf.getnum pdf a; Pdftransform.b = Pdf.getnum pdf b; Pdftransform.c = Pdf.getnum pdf c;
Pdftransform.d = Pdf.getnum pdf d; Pdftransform.e = Pdf.getnum pdf e; Pdftransform.f = Pdf.getnum pdf f}
2021-12-21 16:25:59 +01:00
| _ -> Pdftransform.i_matrix
in
images := (pagenum, name, Form (matrix, contents, resources))::!images
| _ -> ()
)
xobjects
| _ -> ())
pdf
range;
(* Now, split into differing pages, and call [image_resolution_page] on each one *)
let pagesplits =
map
(function (a, _, _)::_ as ls -> (a, ls) | _ -> assert false)
(collate (fun (a, _, _) (b, _, _) -> compare a b) (rev !images))
and pages =
Pdfpage.pages_of_pagetree pdf
in
iter
(function (pagenum, images) ->
let page = select pagenum pages in
image_resolution_page pdf page pagenum dpi images)
pagesplits
let image_resolution pdf range dpi =
image_results := [];
image_resolution pdf range dpi;
rev !image_results