2021-12-21 15:57:42 +01:00
|
|
|
open Pdfutil
|
|
|
|
open Pdfio
|
2023-12-18 16:13:40 +01:00
|
|
|
open Cpdferror
|
2021-12-21 15:57:42 +01:00
|
|
|
|
2023-12-28 17:18:25 +01:00
|
|
|
let debug_image_processing = ref false
|
|
|
|
|
2024-02-20 20:41:49 +01:00
|
|
|
let complain_jbig2enc path =
|
|
|
|
if path = "" then error "Specify jbig2enc location with -jbig2enc"
|
|
|
|
|
|
|
|
let complain_convert path =
|
2024-02-22 16:56:35 +01:00
|
|
|
if path = "" then error "Specify magick location with -im"
|
2024-02-20 20:41:49 +01:00
|
|
|
|
2023-12-31 12:13:58 +01:00
|
|
|
let remove x =
|
2024-02-20 20:41:49 +01:00
|
|
|
try (*Printf.printf "%s\n" x;*) Sys.remove x with _ -> ()
|
2023-12-31 12:13:58 +01:00
|
|
|
|
2023-12-18 23:39:33 +01:00
|
|
|
let pnm_white ch = output_char ch ' '
|
|
|
|
let pnm_newline ch = output_char ch '\n'
|
|
|
|
let pnm_output_string = Stdlib.output_string
|
|
|
|
|
|
|
|
let pnm_header ch w h =
|
|
|
|
pnm_white ch;
|
|
|
|
pnm_output_string ch (string_of_int w);
|
|
|
|
pnm_white ch;
|
|
|
|
pnm_output_string ch (string_of_int h);
|
|
|
|
pnm_white ch
|
|
|
|
|
|
|
|
let pnm_to_channel_24 ch w h s =
|
|
|
|
pnm_output_string ch "P6";
|
|
|
|
pnm_header ch w h;
|
|
|
|
pnm_output_string ch "255";
|
|
|
|
pnm_newline ch;
|
2023-12-29 18:46:54 +01:00
|
|
|
bytes_to_output_channel ch s
|
2023-12-18 23:39:33 +01:00
|
|
|
|
|
|
|
let pnm_to_channel_8 ch w h s =
|
|
|
|
pnm_output_string ch "P5";
|
|
|
|
pnm_header ch w h;
|
2023-12-19 19:11:47 +01:00
|
|
|
pnm_output_string ch "255";
|
2023-12-18 23:39:33 +01:00
|
|
|
pnm_newline ch;
|
2023-12-29 18:46:54 +01:00
|
|
|
bytes_to_output_channel ch s
|
2021-12-21 15:57:42 +01:00
|
|
|
|
2023-12-23 15:07:17 +01:00
|
|
|
let pnm_to_channel_1_inverted ch w h s =
|
2023-12-22 22:21:23 +01:00
|
|
|
pnm_output_string ch "P4";
|
|
|
|
pnm_header ch w h;
|
|
|
|
pnm_newline ch;
|
2023-12-23 15:07:17 +01:00
|
|
|
let inverted = Pdfio.copybytes s in
|
|
|
|
Pdfio.bytes_selfmap lnot inverted;
|
2023-12-29 18:46:54 +01:00
|
|
|
bytes_to_output_channel ch inverted
|
2023-12-22 22:21:23 +01:00
|
|
|
|
2023-12-19 20:12:56 +01:00
|
|
|
let cmyk_to_channel_32 ch w h s =
|
2023-12-29 18:46:54 +01:00
|
|
|
let inverted = Pdfio.copybytes s in
|
|
|
|
Pdfio.bytes_selfmap (fun x -> 255 - x) inverted;
|
|
|
|
bytes_to_output_channel ch inverted
|
2023-12-19 20:12:56 +01:00
|
|
|
|
2023-12-04 12:19:17 +01:00
|
|
|
let jbig2_serial = ref 0
|
|
|
|
|
|
|
|
let jbig2_globals = null_hash ()
|
|
|
|
|
2023-12-04 15:00:45 +01:00
|
|
|
let write_stream name stream =
|
|
|
|
let fh = open_out_bin name in
|
|
|
|
Pdfio.bytes_to_output_channel fh stream;
|
|
|
|
close_out fh
|
|
|
|
|
2023-11-10 14:46:52 +01:00
|
|
|
let write_image ~raw ?path_to_p2p ?path_to_im pdf resources name image =
|
2023-06-14 19:38:26 +02:00
|
|
|
match Pdfimage.get_image_24bpp pdf resources image with
|
|
|
|
| Pdfimage.JPEG (stream, _) -> write_stream (name ^ ".jpg") stream
|
|
|
|
| Pdfimage.JPEG2000 (stream, _) -> write_stream (name ^ ".jpx") stream
|
2023-12-04 12:19:17 +01:00
|
|
|
| Pdfimage.JBIG2 (stream, _, global) ->
|
|
|
|
begin match global with
|
|
|
|
| None ->
|
2023-12-23 18:18:13 +01:00
|
|
|
(*Printf.printf "JBIG2: No global, writing plain\n";*)
|
2023-12-04 12:19:17 +01:00
|
|
|
write_stream (name ^ ".jbig2") stream
|
|
|
|
| Some g ->
|
2023-12-23 18:18:13 +01:00
|
|
|
(*Printf.printf "JBIG2: there is a global\n";*)
|
2023-12-04 12:19:17 +01:00
|
|
|
let go () =
|
|
|
|
let serial, _ = Hashtbl.find jbig2_globals g in
|
|
|
|
write_stream (name ^ ".jbig2__" ^ string_of_int serial) stream
|
|
|
|
in
|
|
|
|
try go () with Not_found ->
|
|
|
|
jbig2_serial += 1;
|
|
|
|
let globaldata =
|
|
|
|
let obj = Pdf.lookup_obj pdf g in
|
|
|
|
Pdfcodec.decode_pdfstream_until_unknown pdf obj;
|
|
|
|
match obj with | Pdf.Stream {contents = (_, Got b)} -> Some b | _ -> None
|
|
|
|
in
|
|
|
|
match globaldata with
|
|
|
|
| Some d ->
|
|
|
|
Hashtbl.add jbig2_globals g (!jbig2_serial, d);
|
|
|
|
let filename = Filename.concat (Filename.dirname name) (string_of_int !jbig2_serial ^ ".jbig2global") in
|
|
|
|
write_stream filename d;
|
|
|
|
go ()
|
|
|
|
| None ->
|
|
|
|
Pdfe.log "Could not extract JBIG2Globals. Skipping this image."
|
|
|
|
end
|
2023-06-14 19:38:26 +02:00
|
|
|
| Pdfimage.Raw (w, h, Pdfimage.BPP24, stream) ->
|
|
|
|
let pnm = name ^ ".pnm" in
|
|
|
|
let png = name ^ ".png" in
|
|
|
|
let fh = open_out_bin pnm in
|
|
|
|
pnm_to_channel_24 fh w h stream;
|
|
|
|
close_out fh;
|
|
|
|
begin match path_to_p2p with
|
2023-11-10 14:46:52 +01:00
|
|
|
| None ->
|
2023-06-14 19:38:26 +02:00
|
|
|
begin match path_to_im with
|
2023-11-10 14:46:52 +01:00
|
|
|
None ->
|
|
|
|
if not raw then Pdfe.log "Neither pnm2png nor imagemagick found. Specify with -p2p or -im\n"
|
|
|
|
| Some path_to_im ->
|
2021-12-21 15:57:42 +01:00
|
|
|
begin match
|
2023-06-14 19:38:26 +02:00
|
|
|
Sys.command (Filename.quote_command path_to_im [pnm; png])
|
2021-12-21 15:57:42 +01:00
|
|
|
with
|
2023-12-31 12:13:58 +01:00
|
|
|
0 -> remove pnm
|
2023-06-14 19:38:26 +02:00
|
|
|
| _ ->
|
|
|
|
Pdfe.log "Call to imagemagick failed: did you specify -p2p or -im correctly?\n";
|
2023-12-31 12:13:58 +01:00
|
|
|
remove pnm
|
2021-12-21 15:57:42 +01:00
|
|
|
end
|
|
|
|
end
|
2023-11-10 14:46:52 +01:00
|
|
|
| Some path_to_p2p ->
|
2023-06-14 19:38:26 +02:00
|
|
|
begin match
|
|
|
|
Sys.command (Filename.quote_command path_to_p2p ~stdout:png ["-gamma"; "0.45"; "-quiet"; pnm])
|
|
|
|
with
|
2023-12-31 12:13:58 +01:00
|
|
|
| 0 -> remove pnm
|
2023-06-14 19:38:26 +02:00
|
|
|
| _ ->
|
|
|
|
Pdfe.log "Call to pnmtopng failed: did you specify -p2p correctly?\n";
|
2023-12-31 12:13:58 +01:00
|
|
|
remove pnm
|
2023-06-14 19:38:26 +02:00
|
|
|
end
|
|
|
|
end
|
|
|
|
| _ ->
|
|
|
|
Pdfe.log (Printf.sprintf "Unsupported image type when extracting image %s " name)
|
2021-12-21 15:57:42 +01:00
|
|
|
|
|
|
|
let written = ref []
|
|
|
|
|
2023-11-10 14:46:52 +01:00
|
|
|
let extract_images_inner ~raw ?path_to_p2p ?path_to_im encoding serial pdf resources stem pnum images =
|
2021-12-21 15:57:42 +01:00
|
|
|
let names = map
|
|
|
|
(fun _ ->
|
|
|
|
Cpdfbookmarks.name_of_spec
|
|
|
|
encoding [] pdf 0 (stem ^ "-p" ^ string_of_int pnum)
|
|
|
|
(let r = !serial in serial := !serial + 1; r) "" 0 0) (indx images)
|
|
|
|
in
|
2023-11-10 14:46:52 +01:00
|
|
|
iter2 (write_image ~raw ?path_to_p2p ?path_to_im pdf resources) names images
|
2021-12-21 15:57:42 +01:00
|
|
|
|
2024-11-21 19:47:45 +01:00
|
|
|
let rec extract_images_form_xobject ~raw ?path_to_p2p ?path_to_im encoding dedup dedup_per_page pdf serial stem pnum form =
|
2021-12-21 15:57:42 +01:00
|
|
|
let resources =
|
|
|
|
match Pdf.lookup_direct pdf "/Resources" form with
|
|
|
|
Some (Pdf.Dictionary d) -> Pdf.Dictionary d
|
|
|
|
| _ -> Pdf.Dictionary []
|
|
|
|
in
|
2024-11-21 19:47:45 +01:00
|
|
|
let images, forms =
|
2021-12-21 15:57:42 +01:00
|
|
|
let xobjects =
|
|
|
|
match Pdf.lookup_direct pdf "/XObject" resources with
|
|
|
|
| Some (Pdf.Dictionary elts) -> map snd elts
|
|
|
|
| _ -> []
|
|
|
|
in
|
|
|
|
(* Remove any already in !written. Add any remaining to !written, if !args.dedup or !args.dedup_page *)
|
2024-11-21 19:47:45 +01:00
|
|
|
let images, forms = List.partition (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in
|
2021-12-21 15:57:42 +01:00
|
|
|
let already_written, images = List.partition (function Pdf.Indirect n -> mem n !written | _ -> false) images in
|
|
|
|
if dedup || dedup_per_page then
|
|
|
|
written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written;
|
2024-11-21 19:47:45 +01:00
|
|
|
images, forms
|
2021-12-21 15:57:42 +01:00
|
|
|
in
|
2024-11-21 19:47:45 +01:00
|
|
|
iter (extract_images_form_xobject ~raw ?path_to_p2p ?path_to_im encoding dedup dedup_per_page pdf serial stem pnum) forms;
|
2023-11-10 14:46:52 +01:00
|
|
|
extract_images_inner ~raw ?path_to_p2p ?path_to_im encoding serial pdf resources stem pnum images
|
2021-12-21 15:57:42 +01:00
|
|
|
|
2023-11-10 14:46:52 +01:00
|
|
|
let extract_images ?(raw=false) ?path_to_p2p ?path_to_im encoding dedup dedup_per_page pdf range stem =
|
2023-12-04 12:19:17 +01:00
|
|
|
Hashtbl.clear jbig2_globals;
|
|
|
|
jbig2_serial := 0;
|
2021-12-21 15:57:42 +01:00
|
|
|
if dedup || dedup_per_page then written := [];
|
|
|
|
let pdf_pages = Pdfpage.pages_of_pagetree pdf in
|
|
|
|
let pages =
|
|
|
|
option_map
|
|
|
|
(function (i, pdf_pages) -> if mem i range then Some pdf_pages else None)
|
|
|
|
(combine (indx pdf_pages) pdf_pages)
|
|
|
|
in
|
|
|
|
let serial = ref 0 in
|
|
|
|
iter2
|
|
|
|
(fun page pnum ->
|
|
|
|
if dedup_per_page then written := [];
|
|
|
|
let xobjects =
|
|
|
|
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
|
|
|
|
| Some (Pdf.Dictionary elts) -> map snd elts
|
|
|
|
| _ -> []
|
|
|
|
in
|
|
|
|
let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in
|
|
|
|
let already_written, images = List.partition (function Pdf.Indirect n -> mem n !written | _ -> false) images in
|
|
|
|
if dedup || dedup_per_page then
|
|
|
|
written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written;
|
|
|
|
let forms = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Form")) xobjects in
|
2023-11-10 14:46:52 +01:00
|
|
|
extract_images_inner ~raw ?path_to_p2p ?path_to_im encoding serial pdf page.Pdfpage.resources stem pnum images;
|
|
|
|
iter (extract_images_form_xobject ~raw ?path_to_p2p ?path_to_im encoding dedup dedup_per_page pdf serial stem pnum) forms)
|
2021-12-21 15:57:42 +01:00
|
|
|
pages
|
|
|
|
(indx pages)
|
2021-12-21 16:25:59 +01:00
|
|
|
|
|
|
|
(* Image resolution *)
|
|
|
|
type xobj =
|
|
|
|
| Image of int * int (* width, height *)
|
|
|
|
| Form of Pdftransform.transform_matrix * Pdf.pdfobject * Pdf.pdfobject (* Will add actual data later. *)
|
|
|
|
|
|
|
|
let image_results = ref []
|
|
|
|
|
2024-10-10 17:38:34 +02:00
|
|
|
let rec image_resolution_page pdf page pagenum images =
|
2021-12-21 16:25:59 +01:00
|
|
|
try
|
|
|
|
let pageops = Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content
|
|
|
|
and transform = ref [ref Pdftransform.i_matrix] in
|
|
|
|
iter
|
|
|
|
(function
|
|
|
|
| Pdfops.Op_cm matrix ->
|
|
|
|
begin match !transform with
|
|
|
|
| [] -> raise (Failure "no transform")
|
|
|
|
| _ -> (hd !transform) := Pdftransform.matrix_compose !(hd !transform) matrix
|
|
|
|
end
|
|
|
|
| Pdfops.Op_Do xobject ->
|
|
|
|
let trans (x, y) =
|
|
|
|
match !transform with
|
|
|
|
| [] -> raise (Failure "no transform")
|
|
|
|
| _ -> Pdftransform.transform_matrix !(hd !transform) (x, y)
|
|
|
|
in
|
|
|
|
let o = trans (0., 0.)
|
|
|
|
and x = trans (1., 0.)
|
|
|
|
and y = trans (0., 1.)
|
|
|
|
in
|
|
|
|
(*i Printf.printf "o = %f, %f, x = %f, %f, y = %f, %f\n" (fst o) (snd o) (fst x) (snd x) (fst y) (snd y); i*)
|
|
|
|
let rec lookup_image k = function
|
|
|
|
| [] -> assert false
|
2024-01-04 18:40:15 +01:00
|
|
|
| (_, a, _, _) as h::_ when a = k -> h
|
2021-12-21 16:25:59 +01:00
|
|
|
| _::t -> lookup_image k t
|
|
|
|
in
|
|
|
|
begin match lookup_image xobject images with
|
2024-01-04 18:40:15 +01:00
|
|
|
| (pagenum, name, Form (xobj_matrix, content, resources), objnum) ->
|
2021-12-21 16:25:59 +01:00
|
|
|
let content =
|
|
|
|
(* Add in matrix etc. *)
|
|
|
|
let total_matrix = Pdftransform.matrix_compose xobj_matrix !(hd !transform) in
|
|
|
|
let ops =
|
|
|
|
Pdfops.Op_cm total_matrix::
|
|
|
|
Pdfops.parse_operators pdf resources [content]
|
|
|
|
in
|
|
|
|
Pdfops.stream_of_ops ops
|
|
|
|
in
|
|
|
|
let page =
|
|
|
|
{Pdfpage.content = [content];
|
|
|
|
Pdfpage.mediabox = Pdfpage.rectangle_of_paper Pdfpaper.a4;
|
|
|
|
Pdfpage.resources = resources;
|
|
|
|
Pdfpage.rotate = Pdfpage.Rotate0;
|
|
|
|
Pdfpage.rest = Pdf.Dictionary []}
|
|
|
|
in
|
|
|
|
let newpdf = Pdfpage.change_pages false pdf [page] in
|
2024-11-22 17:30:22 +01:00
|
|
|
image_resolution newpdf [1] pagenum
|
2024-01-04 18:40:15 +01:00
|
|
|
| (pagenum, name, Image (w, h), objnum) ->
|
2024-01-10 14:10:39 +01:00
|
|
|
let lx = Pdfunits.inches (distance_between o x) Pdfunits.PdfPoint in
|
|
|
|
let ly = Pdfunits.inches (distance_between o y) Pdfunits.PdfPoint in
|
2021-12-21 16:25:59 +01:00
|
|
|
let wdpi = float w /. lx
|
|
|
|
and hdpi = float h /. ly in
|
2024-11-22 16:57:28 +01:00
|
|
|
image_results := (pagenum, xobject, w, h, wdpi, hdpi, objnum)::!image_results;
|
|
|
|
(*Printf.printf "%i, %s, %i, %i, %f, %f\n" pagenum xobject w h wdpi hdpi;*)
|
2021-12-21 16:25:59 +01:00
|
|
|
end
|
|
|
|
| Pdfops.Op_q ->
|
|
|
|
begin match !transform with
|
|
|
|
| [] -> raise (Failure "Unbalanced q/Q ops")
|
|
|
|
| h::t ->
|
|
|
|
let h' = ref Pdftransform.i_matrix in
|
|
|
|
h' := !h;
|
|
|
|
transform := h'::h::t
|
|
|
|
end
|
|
|
|
| Pdfops.Op_Q ->
|
|
|
|
begin match !transform with
|
|
|
|
| [] -> raise (Failure "Unbalanced q/Q ops")
|
|
|
|
| _ -> transform := tl !transform
|
|
|
|
end
|
|
|
|
| _ -> ())
|
|
|
|
pageops
|
|
|
|
with
|
|
|
|
e -> Printf.printf "Error %s\n" (Printexc.to_string e); flprint "\n"
|
|
|
|
|
2024-11-22 17:30:22 +01:00
|
|
|
and image_resolution pdf range real_pagenum =
|
2021-12-21 16:25:59 +01:00
|
|
|
let images = ref [] in
|
|
|
|
Cpdfpage.iter_pages
|
|
|
|
(fun pagenum page ->
|
2024-11-22 17:30:22 +01:00
|
|
|
let pagenum = if real_pagenum > 0 then real_pagenum else pagenum in
|
2021-12-21 16:25:59 +01:00
|
|
|
(* 1. Get all image names and their native resolutions from resources as string * int * int *)
|
|
|
|
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
|
|
|
|
| Some (Pdf.Dictionary xobjects) ->
|
|
|
|
iter
|
|
|
|
(function (name, xobject) ->
|
2024-01-04 18:40:15 +01:00
|
|
|
let objnum = match xobject with Pdf.Indirect i -> i | _ -> 0 in
|
2021-12-21 16:25:59 +01:00
|
|
|
match Pdf.lookup_direct pdf "/Subtype" xobject with
|
|
|
|
| Some (Pdf.Name "/Image") ->
|
|
|
|
let width =
|
|
|
|
match Pdf.lookup_direct pdf "/Width" xobject with
|
2022-07-14 15:06:25 +02:00
|
|
|
| Some x -> Pdf.getnum pdf x
|
2021-12-21 16:25:59 +01:00
|
|
|
| None -> 1.
|
|
|
|
and height =
|
|
|
|
match Pdf.lookup_direct pdf "/Height" xobject with
|
2022-07-14 15:06:25 +02:00
|
|
|
| Some x -> Pdf.getnum pdf x
|
2021-12-21 16:25:59 +01:00
|
|
|
| None -> 1.
|
|
|
|
in
|
2024-01-04 18:40:15 +01:00
|
|
|
images := (pagenum, name, Image (int_of_float width, int_of_float height), objnum)::!images
|
2021-12-21 16:25:59 +01:00
|
|
|
| Some (Pdf.Name "/Form") ->
|
|
|
|
let resources =
|
|
|
|
match Pdf.lookup_direct pdf "/Resources" xobject with
|
|
|
|
| None -> page.Pdfpage.resources (* Inherit from page or form above. *)
|
|
|
|
| Some r -> r
|
|
|
|
and contents =
|
|
|
|
xobject
|
|
|
|
and matrix =
|
|
|
|
match Pdf.lookup_direct pdf "/Matrix" xobject with
|
|
|
|
| Some (Pdf.Array [a; b; c; d; e; f]) ->
|
2022-07-14 15:06:25 +02:00
|
|
|
{Pdftransform.a = Pdf.getnum pdf a; Pdftransform.b = Pdf.getnum pdf b; Pdftransform.c = Pdf.getnum pdf c;
|
|
|
|
Pdftransform.d = Pdf.getnum pdf d; Pdftransform.e = Pdf.getnum pdf e; Pdftransform.f = Pdf.getnum pdf f}
|
2021-12-21 16:25:59 +01:00
|
|
|
| _ -> Pdftransform.i_matrix
|
|
|
|
in
|
2024-01-04 18:40:15 +01:00
|
|
|
images := (pagenum, name, Form (matrix, contents, resources), objnum)::!images
|
2021-12-21 16:25:59 +01:00
|
|
|
| _ -> ()
|
|
|
|
)
|
|
|
|
xobjects
|
|
|
|
| _ -> ())
|
|
|
|
pdf
|
2024-11-22 17:30:22 +01:00
|
|
|
(if real_pagenum = 0 then range else [1]);
|
2021-12-21 16:25:59 +01:00
|
|
|
(* Now, split into differing pages, and call [image_resolution_page] on each one *)
|
|
|
|
let pagesplits =
|
|
|
|
map
|
2024-01-04 18:40:15 +01:00
|
|
|
(function (a, _, _, _)::_ as ls -> (a, ls) | _ -> assert false)
|
|
|
|
(collate (fun (a, _, _, _) (b, _, _, _) -> compare a b) (rev !images))
|
2021-12-21 16:25:59 +01:00
|
|
|
and pages =
|
|
|
|
Pdfpage.pages_of_pagetree pdf
|
|
|
|
in
|
|
|
|
iter
|
|
|
|
(function (pagenum, images) ->
|
2024-11-22 17:30:22 +01:00
|
|
|
let pagenum = if real_pagenum > 0 then 1 else pagenum in
|
2021-12-21 16:25:59 +01:00
|
|
|
let page = select pagenum pages in
|
2024-10-10 17:38:34 +02:00
|
|
|
image_resolution_page pdf page pagenum images)
|
2021-12-21 16:25:59 +01:00
|
|
|
pagesplits
|
|
|
|
|
2024-10-10 17:38:34 +02:00
|
|
|
let is_below_dpi dpi (_, _, _, _, wdpi, hdpi, _) =
|
|
|
|
wdpi < dpi || hdpi < dpi
|
|
|
|
|
2021-12-21 16:25:59 +01:00
|
|
|
let image_resolution pdf range dpi =
|
|
|
|
image_results := [];
|
2024-11-22 17:30:22 +01:00
|
|
|
image_resolution pdf range 0;
|
|
|
|
sort compare (rev (keep (is_below_dpi dpi) !image_results))
|
2021-12-21 16:25:59 +01:00
|
|
|
|
2024-02-07 15:42:38 +01:00
|
|
|
let image_resolution_json pdf range dpi =
|
|
|
|
let images = image_resolution pdf range dpi in
|
|
|
|
Pdfio.bytes_of_string
|
|
|
|
(Cpdfyojson.Safe.pretty_to_string
|
|
|
|
(`List (map (fun (pagenum, xobject, w, h, wdpi, hdpi, objnum) ->
|
|
|
|
`Assoc [("Object", `Int objnum); ("Page", `Int pagenum); ("XObject", `String xobject);
|
|
|
|
("W", `Int w); ("H", `Int h); ("Xdpi", `Float wdpi); ("Ydpi", `Float hdpi)]) images)))
|
|
|
|
|
2023-11-14 17:45:49 +01:00
|
|
|
(* All the images in file referenced at least once from the given range of pages. *)
|
2023-11-14 17:19:47 +01:00
|
|
|
let images pdf range =
|
|
|
|
let images = null_hash () in
|
2023-11-14 18:47:44 +01:00
|
|
|
let formnums = null_hash () in
|
|
|
|
let rec process_xobject resources pagenum page (name, xobject) =
|
|
|
|
match Pdf.lookup_direct pdf "/Subtype" xobject with
|
|
|
|
| Some (Pdf.Name "/Image") ->
|
|
|
|
begin match xobject with
|
|
|
|
| Pdf.Indirect i ->
|
|
|
|
begin match Hashtbl.find images i with
|
2023-12-28 16:48:30 +01:00
|
|
|
| (pagenums, n, w, h, s, bpc, cs, f) ->
|
|
|
|
Hashtbl.replace images i (pagenum::pagenums, n, w, h, s, bpc, cs, f)
|
2023-11-14 18:47:44 +01:00
|
|
|
| exception Not_found ->
|
|
|
|
let width =
|
|
|
|
match Pdf.lookup_direct pdf "/Width" xobject with
|
|
|
|
| Some x -> Pdf.getnum pdf x
|
|
|
|
| None -> 1.
|
|
|
|
and height =
|
|
|
|
match Pdf.lookup_direct pdf "/Height" xobject with
|
|
|
|
| Some x -> Pdf.getnum pdf x
|
|
|
|
| None -> 1.
|
2023-12-28 16:48:30 +01:00
|
|
|
and size =
|
|
|
|
match Pdf.lookup_direct pdf "/Length" xobject with
|
|
|
|
| Some (Pdf.Integer x) -> x
|
|
|
|
| _ -> 0
|
|
|
|
and bpc =
|
|
|
|
match Pdf.lookup_direct pdf "/BitsPerComponent" xobject with
|
2024-03-22 16:33:08 +01:00
|
|
|
| Some (Pdf.Integer x) -> Some x
|
|
|
|
| _ -> None
|
2023-11-14 18:47:44 +01:00
|
|
|
and colourspace =
|
|
|
|
match Pdf.lookup_direct pdf "/ColorSpace" xobject with
|
|
|
|
| Some x -> Some (Pdfspace.string_of_colourspace (Pdfspace.read_colourspace pdf resources x))
|
|
|
|
| None -> None
|
2023-12-28 16:48:30 +01:00
|
|
|
and filter =
|
|
|
|
match Pdf.lookup_direct pdf "/Filter" xobject with
|
|
|
|
| Some (Pdf.Array [x]) | Some x -> Some (Pdfwrite.string_of_pdf x)
|
|
|
|
| None -> None
|
2023-11-14 18:47:44 +01:00
|
|
|
in
|
2023-12-28 16:48:30 +01:00
|
|
|
Hashtbl.replace images i ([pagenum], name, int_of_float width, int_of_float height, size, bpc, colourspace, filter)
|
2023-11-14 18:47:44 +01:00
|
|
|
end
|
|
|
|
| _ -> ()
|
|
|
|
end
|
|
|
|
| Some (Pdf.Name "/Form") ->
|
|
|
|
begin match xobject with
|
|
|
|
| Pdf.Indirect i ->
|
|
|
|
begin match Hashtbl.find formnums i with
|
|
|
|
| () -> ()
|
|
|
|
| exception Not_found ->
|
|
|
|
Hashtbl.add formnums i ();
|
|
|
|
begin match Pdf.lookup_direct pdf "/Resources" xobject with
|
|
|
|
| Some r ->
|
|
|
|
begin match Pdf.lookup_direct pdf "/XObject" r with
|
|
|
|
| Some (Pdf.Dictionary xobjects) -> iter (process_xobject r pagenum page) xobjects
|
|
|
|
| _ -> ()
|
|
|
|
end
|
|
|
|
| None -> ()
|
|
|
|
end
|
|
|
|
end
|
|
|
|
| _ -> ()
|
|
|
|
end
|
|
|
|
| _ -> ()
|
|
|
|
in
|
|
|
|
Cpdfpage.iter_pages
|
|
|
|
(fun pagenum page ->
|
|
|
|
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
|
|
|
|
| Some (Pdf.Dictionary xobjects) ->
|
|
|
|
iter (process_xobject page.Pdfpage.resources pagenum page) xobjects
|
|
|
|
| _ -> ())
|
|
|
|
pdf
|
|
|
|
range;
|
|
|
|
let images = list_of_hashtbl images in
|
2023-12-28 16:48:30 +01:00
|
|
|
let images = map (fun (i, (pnums, n, w, h, s, bpc, c, filter)) -> (i, (setify (sort compare pnums), n, w, h, s, bpc, c, filter))) images in
|
|
|
|
let images = sort (fun (_, (pnums, _, _, _, _, _, _, _)) (_, (pnums', _, _, _, _, _, _, _)) -> compare (hd pnums) (hd pnums')) images in
|
2023-11-14 18:47:44 +01:00
|
|
|
`List
|
|
|
|
(map
|
2023-12-28 16:48:30 +01:00
|
|
|
(fun (i, (pnums, n, w, h, size, bpc, cs, filter)) ->
|
2023-11-14 18:47:44 +01:00
|
|
|
`Assoc [("Object", `Int i);
|
|
|
|
("Pages", `List (map (fun x -> `Int x) pnums));
|
|
|
|
("Name", `String n);
|
|
|
|
("Width", `Int w);
|
|
|
|
("Height", `Int h);
|
2023-12-28 16:48:30 +01:00
|
|
|
("Bytes", `Int size);
|
2024-03-22 16:33:08 +01:00
|
|
|
("BitsPerComponent", match bpc with None -> `Null | Some bpc -> `Int bpc);
|
2023-12-28 16:48:30 +01:00
|
|
|
("Colourspace", match cs with None -> `Null | Some s -> `String s);
|
|
|
|
("Filter", match filter with None -> `Null | Some s -> `String s)])
|
2023-11-14 18:47:44 +01:00
|
|
|
images)
|
2023-11-13 18:55:59 +01:00
|
|
|
|
2023-08-14 17:15:11 +02:00
|
|
|
let obj_of_jpeg_data data =
|
|
|
|
let w, h = Cpdfjpeg.jpeg_dimensions data in
|
|
|
|
let d =
|
|
|
|
["/Length", Pdf.Integer (Pdfio.bytes_size data);
|
|
|
|
"/Filter", Pdf.Name "/DCTDecode";
|
|
|
|
"/BitsPerComponent", Pdf.Integer 8;
|
|
|
|
"/ColorSpace", Pdf.Name "/DeviceRGB";
|
|
|
|
"/Subtype", Pdf.Name "/Image";
|
|
|
|
"/Width", Pdf.Integer w;
|
|
|
|
"/Height", Pdf.Integer h]
|
|
|
|
in
|
2023-12-04 17:32:12 +01:00
|
|
|
Pdf.Stream {contents = (Pdf.Dictionary d, Pdf.Got data)}, []
|
2023-08-14 17:15:11 +02:00
|
|
|
|
|
|
|
let obj_of_png_data data =
|
|
|
|
let png = Cpdfpng.read_png (Pdfio.input_of_bytes data) in
|
2024-01-01 12:46:32 +01:00
|
|
|
let d =
|
|
|
|
["/Length", Pdf.Integer (Pdfio.bytes_size png.idat);
|
|
|
|
"/Filter", Pdf.Name "/FlateDecode";
|
|
|
|
"/Subtype", Pdf.Name "/Image";
|
|
|
|
"/BitsPerComponent", Pdf.Integer png.bitdepth;
|
|
|
|
"/ColorSpace", Pdf.Name (match png.colortype with 0 -> "/DeviceGray" | 2 -> "/DeviceRGB" | _ -> error "obj_of_png_data 1");
|
|
|
|
"/DecodeParms", Pdf.Dictionary
|
|
|
|
["/BitsPerComponent", Pdf.Integer png.bitdepth;
|
|
|
|
"/Colors", Pdf.Integer (match png.colortype with 0 -> 1 | 2 -> 3 | _ -> error "obj_of_png_data 2");
|
|
|
|
"/Columns", Pdf.Integer png.width;
|
|
|
|
"/Predictor", Pdf.Integer 15];
|
|
|
|
"/Width", Pdf.Integer png.width;
|
|
|
|
"/Height", Pdf.Integer png.height]
|
|
|
|
in
|
|
|
|
Pdf.Stream {contents = (Pdf.Dictionary d, Pdf.Got png.idat)}, []
|
2023-12-04 14:39:56 +01:00
|
|
|
|
2024-03-22 14:57:04 +01:00
|
|
|
let obj_of_jpeg2000_data data =
|
|
|
|
let w, h = Cpdfjpeg2000.jpeg2000_dimensions data in
|
|
|
|
let d =
|
|
|
|
["/Length", Pdf.Integer (Pdfio.bytes_size data);
|
|
|
|
"/Filter", Pdf.Name "/JPXDecode";
|
|
|
|
"/Subtype", Pdf.Name "/Image";
|
|
|
|
"/Width", Pdf.Integer w;
|
|
|
|
"/Height", Pdf.Integer h]
|
|
|
|
in
|
|
|
|
Pdf.Stream {contents = (Pdf.Dictionary d, Pdf.Got data)}, []
|
|
|
|
|
2023-12-04 15:00:45 +01:00
|
|
|
let jbig2_dimensions data =
|
|
|
|
(bget data 11 * 256 * 256 * 256 + bget data 12 * 256 * 256 + bget data 13 * 256 + bget data 14,
|
|
|
|
bget data 15 * 256 * 256 * 256 + bget data 16 * 256 * 256 + bget data 17 * 256 + bget data 18)
|
|
|
|
|
2023-12-04 17:58:13 +01:00
|
|
|
let obj_of_jbig2_data ?global data =
|
|
|
|
let d, extra =
|
|
|
|
let decodeparms, extra =
|
|
|
|
match global with
|
|
|
|
| Some data ->
|
|
|
|
[("/DecodeParms", Pdf.Dictionary [("/JBIG2Globals", Pdf.Indirect 10000)])],
|
2023-12-04 18:15:15 +01:00
|
|
|
[(10000, Pdf.Stream {contents = (Pdf.Dictionary [("/Length", Pdf.Integer (bytes_size data))], Pdf.Got data)})]
|
2023-12-04 17:58:13 +01:00
|
|
|
| None ->
|
|
|
|
[], []
|
|
|
|
in
|
2023-12-04 15:00:45 +01:00
|
|
|
let w, h = jbig2_dimensions data in
|
2023-12-04 14:39:56 +01:00
|
|
|
[("/Length", Pdf.Integer (Pdfio.bytes_size data));
|
|
|
|
("/Filter", Pdf.Name "/JBIG2Decode");
|
|
|
|
("/Subtype", Pdf.Name "/Image");
|
|
|
|
("/BitsPerComponent", Pdf.Integer 1);
|
|
|
|
("/ColorSpace", Pdf.Name "/DeviceGray");
|
|
|
|
("/Width", Pdf.Integer w);
|
|
|
|
("/Height", Pdf.Integer h)]
|
2023-12-04 17:58:13 +01:00
|
|
|
@ decodeparms, extra
|
2023-12-04 14:39:56 +01:00
|
|
|
in
|
2023-12-04 17:58:13 +01:00
|
|
|
Pdf.Stream {contents = (Pdf.Dictionary d, Pdf.Got data)}, extra
|
2023-08-14 17:15:11 +02:00
|
|
|
|
2024-09-30 19:26:39 +02:00
|
|
|
let image_of_input ?subformat ?title ~process_struct_tree fobj i =
|
2024-09-30 19:24:10 +02:00
|
|
|
let pdf, title =
|
2024-09-30 16:13:56 +02:00
|
|
|
match subformat with
|
2024-09-30 19:24:10 +02:00
|
|
|
| None -> Pdf.empty (), begin match title with Some x -> x | None -> "" end
|
2024-09-30 16:13:56 +02:00
|
|
|
| Some Cpdfua.PDFUA1 ->
|
|
|
|
begin match title with
|
|
|
|
| None -> error "no -title given"
|
2024-09-30 19:24:10 +02:00
|
|
|
| Some title -> Cpdfua.create_pdfua1 title Pdfpaper.a4 1, title
|
2024-09-30 16:13:56 +02:00
|
|
|
end
|
|
|
|
| Some Cpdfua.PDFUA2 ->
|
|
|
|
begin match title with
|
|
|
|
| None -> error "no -title given"
|
2024-09-30 19:24:10 +02:00
|
|
|
| Some title -> Cpdfua.create_pdfua2 title Pdfpaper.a4 1, title
|
2024-09-30 16:13:56 +02:00
|
|
|
end
|
|
|
|
in
|
2023-08-14 17:15:11 +02:00
|
|
|
let data = Pdfio.bytes_of_input i 0 i.Pdfio.in_channel_length in
|
2023-12-04 18:15:15 +01:00
|
|
|
let obj, extras = fobj () data in
|
2023-12-04 17:58:13 +01:00
|
|
|
iter (Pdf.addobj_given_num pdf) extras;
|
2023-08-14 17:15:11 +02:00
|
|
|
let w = match Pdf.lookup_direct pdf "/Width" obj with Some x -> Pdf.getnum pdf x | _ -> assert false in
|
|
|
|
let h = match Pdf.lookup_direct pdf "/Height" obj with Some x -> Pdf.getnum pdf x | _ -> assert false in
|
2024-10-01 16:33:53 +02:00
|
|
|
let structinfo =
|
|
|
|
match process_struct_tree, subformat with
|
|
|
|
| _, (Some Cpdfua.PDFUA1 | Some Cpdfua.PDFUA2) | true, _ -> true
|
|
|
|
| _ -> false
|
|
|
|
in
|
2024-10-03 15:47:57 +02:00
|
|
|
if subformat = Some Cpdfua.PDFUA2 then
|
2024-10-01 17:44:44 +02:00
|
|
|
begin
|
|
|
|
let str = Pdf.addobj pdf Pdf.Null in
|
|
|
|
let figure = Pdf.addobj pdf Pdf.Null in
|
|
|
|
let parent_tree = Pdf.addobj pdf Pdf.Null in
|
|
|
|
let namespace = Pdf.addobj pdf (Pdf.Dictionary [("/NS", Pdf.String "http://iso.org/pdf2/ssn")]) in
|
|
|
|
let document = Pdf.addobj pdf Pdf.Null in
|
|
|
|
Pdf.addobj_given_num pdf (document, Pdf.Dictionary [("/K", Pdf.Array [Pdf.Indirect figure]); ("/P", Pdf.Indirect str); ("/S", Pdf.Name "/Document"); ("/NS", Pdf.Indirect namespace)]);
|
|
|
|
Pdf.addobj_given_num pdf (parent_tree, Pdf.Dictionary [("/Nums", Pdf.Array [Pdf.Integer 1; Pdf.Array [Pdf.Indirect figure]])]);
|
|
|
|
Pdf.addobj_given_num pdf (figure, Pdf.Dictionary [("/K", Pdf.Array [Pdf.Integer 0]); ("/P", Pdf.Indirect document); ("/S", Pdf.Name "/Figure"); ("/Alt", Pdf.String title)]);
|
|
|
|
Pdf.addobj_given_num pdf (str, Pdf.Dictionary [("/Namespaces", Pdf.Array [Pdf.Indirect namespace]); ("/Type", Pdf.Name "/StructTreeRoot");
|
|
|
|
("/K", Pdf.Array [Pdf.Indirect document]); ("/ParentTree", Pdf.Indirect parent_tree)]);
|
2024-10-23 14:44:31 +02:00
|
|
|
Pdf.replace_chain pdf ["/Root"; "/StructTreeRoot"] (Pdf.Indirect str)
|
2024-10-01 17:44:44 +02:00
|
|
|
end
|
2024-10-03 15:47:57 +02:00
|
|
|
else if process_struct_tree || subformat = Some Cpdfua.PDFUA1 then
|
2024-10-01 16:33:53 +02:00
|
|
|
begin
|
|
|
|
let str = Pdf.addobj pdf Pdf.Null in
|
|
|
|
let figure = Pdf.addobj pdf Pdf.Null in
|
|
|
|
let parent_tree = Pdf.addobj pdf Pdf.Null in
|
|
|
|
Pdf.addobj_given_num pdf (parent_tree, Pdf.Dictionary [("/Nums", Pdf.Array [Pdf.Integer 1; Pdf.Array [Pdf.Indirect figure]])]);
|
|
|
|
Pdf.addobj_given_num pdf (figure, Pdf.Dictionary [("/K", Pdf.Array [Pdf.Integer 0]); ("/P", Pdf.Indirect str); ("/S", Pdf.Name "/Figure"); ("/Alt", Pdf.String title)]);
|
|
|
|
Pdf.addobj_given_num pdf (str, Pdf.Dictionary [("/Type", Pdf.Name "/StructTreeRoot"); ("/K", Pdf.Array [Pdf.Indirect figure]); ("/ParentTree", Pdf.Indirect parent_tree)]);
|
2024-10-23 14:44:31 +02:00
|
|
|
Pdf.replace_chain pdf ["/Root"; "/StructTreeRoot"] (Pdf.Indirect str)
|
2024-10-01 16:33:53 +02:00
|
|
|
end;
|
|
|
|
let ops =
|
|
|
|
(if structinfo then [Pdfops.Op_BDC ("/Figure", Pdf.Dictionary [("/MCID", Pdf.Integer 0)])] else [])
|
|
|
|
@ [Pdfops.Op_cm (Pdftransform.matrix_of_transform [Pdftransform.Translate (0., 0.);
|
|
|
|
Pdftransform.Scale ((0., 0.), w, h)]);
|
|
|
|
Pdfops.Op_Do "/I0"]
|
|
|
|
@ (if structinfo then [Pdfops.Op_EMC] else [])
|
|
|
|
in
|
2023-08-14 17:15:11 +02:00
|
|
|
let page =
|
2024-10-01 16:33:53 +02:00
|
|
|
{Pdfpage.content = [Pdfops.stream_of_ops ops];
|
2023-08-14 17:15:11 +02:00
|
|
|
Pdfpage.mediabox = Pdf.Array [Pdf.Real 0.; Pdf.Real 0.; Pdf.Real w; Pdf.Real h];
|
2024-10-01 16:33:53 +02:00
|
|
|
Pdfpage.resources = Pdf.Dictionary ["/XObject", Pdf.Dictionary ["/I0", Pdf.Indirect (Pdf.addobj pdf obj)]];
|
2023-08-14 17:15:11 +02:00
|
|
|
Pdfpage.rotate = Pdfpage.Rotate0;
|
2024-10-01 16:33:53 +02:00
|
|
|
Pdfpage.rest = if structinfo then Pdf.Dictionary [("/StructParents", Pdf.Integer 1)] else Pdf.Dictionary []}
|
2023-08-14 17:15:11 +02:00
|
|
|
in
|
|
|
|
let pdf, pageroot = Pdfpage.add_pagetree [page] pdf in
|
|
|
|
Pdfpage.add_root pageroot [] pdf
|
2023-12-06 13:20:27 +01:00
|
|
|
|
2024-11-06 17:18:57 +01:00
|
|
|
let jpeg_to_jpeg pdf ~pixel_threshold ~length_threshold ~percentage_threshold ~jpeg_to_jpeg_scale ~interpolate ~q ~path_to_convert s dict reference =
|
2024-02-20 20:41:49 +01:00
|
|
|
if q < 0. || q > 100. then error "Out of range quality";
|
|
|
|
complain_convert path_to_convert;
|
2023-12-24 14:54:21 +01:00
|
|
|
let w = match Pdf.lookup_direct pdf "/Width" dict with Some (Pdf.Integer i) -> i | _ -> error "bad width" in
|
|
|
|
let h = match Pdf.lookup_direct pdf "/Height" dict with Some (Pdf.Integer i) -> i | _ -> error "bad height" in
|
2023-12-29 22:49:56 +01:00
|
|
|
if w * h < pixel_threshold then (if !debug_image_processing then Printf.printf "pixel threshold not met\n%!") else
|
2023-12-20 13:11:55 +01:00
|
|
|
Pdf.getstream s;
|
2023-12-28 12:32:43 +01:00
|
|
|
let size = match Pdf.lookup_direct pdf "/Length" dict with Some (Pdf.Integer i) -> i | _ -> 0 in
|
2023-12-29 22:49:56 +01:00
|
|
|
if size < length_threshold then (if !debug_image_processing then Printf.printf "length threshold not met\n%!") else
|
2024-02-21 19:41:26 +01:00
|
|
|
let out = Filename.temp_file "cpdf" "convertin.jpg" in
|
|
|
|
let out2 = Filename.temp_file "cpdf" "convertout.jpg" in
|
2023-12-20 13:11:55 +01:00
|
|
|
let fh = open_out_bin out in
|
2023-12-28 12:32:43 +01:00
|
|
|
begin match s with Pdf.Stream {contents = _, Pdf.Got d} -> Pdfio.bytes_to_output_channel fh d | _ -> () end;
|
2023-12-20 13:11:55 +01:00
|
|
|
close_out fh;
|
|
|
|
let retcode =
|
2024-11-06 16:15:32 +01:00
|
|
|
let scaling =
|
|
|
|
if jpeg_to_jpeg_scale <> 100. then
|
|
|
|
[(if interpolate then "-sample" else "-resize"); string_of_float jpeg_to_jpeg_scale ^ "%"]
|
|
|
|
else
|
|
|
|
[]
|
|
|
|
in
|
2023-12-20 13:11:55 +01:00
|
|
|
let command =
|
2024-11-06 16:15:32 +01:00
|
|
|
Filename.quote_command path_to_convert ([out] @ scaling @ ["-quality"; string_of_float q ^ "%"; out2])
|
2023-12-20 13:11:55 +01:00
|
|
|
in
|
2024-11-06 15:54:02 +01:00
|
|
|
Printf.printf "%S\n" command; Sys.command command
|
2023-12-20 13:11:55 +01:00
|
|
|
in
|
|
|
|
if retcode = 0 then
|
|
|
|
begin
|
2024-02-20 20:41:49 +01:00
|
|
|
try
|
|
|
|
let result = open_in_bin out2 in
|
|
|
|
let newsize = in_channel_length result in
|
|
|
|
let perc_ok = float newsize /. float size < percentage_threshold /. 100. in
|
|
|
|
if newsize < size && perc_ok then
|
|
|
|
begin
|
2024-11-06 15:54:02 +01:00
|
|
|
let data = Pdfio.bytes_of_input_channel result in
|
|
|
|
let w, h = Cpdfjpeg.jpeg_dimensions data in
|
2024-02-20 20:41:49 +01:00
|
|
|
if !debug_image_processing then Printf.printf "JPEG to JPEG %i -> %i (%i%%)\n%!" size newsize (int_of_float (float newsize /. float size *. 100.));
|
2024-11-06 15:54:02 +01:00
|
|
|
reference :=
|
|
|
|
Pdf.add_dict_entry (Pdf.add_dict_entry (Pdf.add_dict_entry dict "/Length" (Pdf.Integer newsize)) "/Width" (Pdf.Integer w)) "/Height" (Pdf.Integer h),
|
2024-11-06 16:15:32 +01:00
|
|
|
Pdf.Got data
|
2024-02-20 20:41:49 +01:00
|
|
|
end
|
|
|
|
else
|
|
|
|
begin
|
|
|
|
if !debug_image_processing then Printf.printf "no size reduction\n%!"
|
|
|
|
end;
|
|
|
|
close_in result
|
|
|
|
with _ ->
|
|
|
|
remove out;
|
|
|
|
remove out2
|
2023-12-31 12:13:58 +01:00
|
|
|
end
|
|
|
|
else
|
2024-01-03 18:43:51 +01:00
|
|
|
begin Printf.printf "external process failed\n%!" end;
|
2023-12-31 12:13:58 +01:00
|
|
|
remove out;
|
|
|
|
remove out2
|
2023-12-20 13:11:55 +01:00
|
|
|
|
|
|
|
let suitable_num pdf dict =
|
|
|
|
match Pdf.lookup_direct pdf "/ColorSpace" dict with
|
2024-02-19 18:56:35 +01:00
|
|
|
| Some (Pdf.Name ("/DeviceRGB" | "/CalRGB")) -> 3
|
|
|
|
| Some (Pdf.Name ("/DeviceGray" | "/CalGray")) -> 1
|
2023-12-20 13:11:55 +01:00
|
|
|
| Some (Pdf.Name "/DeviceCMYK") -> 4
|
2024-02-19 18:56:35 +01:00
|
|
|
| Some (Pdf.Array [Pdf.Name "/Lab"; _]) -> 3
|
2023-12-20 13:11:55 +01:00
|
|
|
| Some (Pdf.Array [Pdf.Name "/ICCBased"; stream]) ->
|
|
|
|
begin match Pdf.lookup_direct pdf "/N" stream with
|
|
|
|
| Some (Pdf.Integer 3) -> 3
|
|
|
|
| Some (Pdf.Integer 1) -> 1
|
|
|
|
| Some (Pdf.Integer 4) -> 4
|
|
|
|
| _ -> 0
|
|
|
|
end
|
2024-01-02 18:54:16 +01:00
|
|
|
| Some (Pdf.Array (Pdf.Name ("/Separation")::_)) -> ~-1
|
|
|
|
| Some (Pdf.Array (Pdf.Name ("/Indexed")::_)) -> ~-2
|
2023-12-20 13:11:55 +01:00
|
|
|
| _ -> 0
|
|
|
|
|
2023-12-29 18:22:02 +01:00
|
|
|
let lossless_out pdf ~pixel_threshold ~length_threshold extension s dict reference =
|
2024-01-03 18:43:51 +01:00
|
|
|
let old = !reference in
|
|
|
|
let restore () = reference := old in
|
2023-12-20 13:11:55 +01:00
|
|
|
let bpc = Pdf.lookup_direct pdf "/BitsPerComponent" dict in
|
|
|
|
let components = suitable_num pdf dict in
|
|
|
|
match components, bpc with
|
2024-01-02 18:54:16 +01:00
|
|
|
| (1 | 3 | 4 | -1 | -2), Some (Pdf.Integer 8) ->
|
2023-12-24 14:54:21 +01:00
|
|
|
let w = match Pdf.lookup_direct pdf "/Width" dict with Some (Pdf.Integer i) -> i | _ -> error "bad width" in
|
|
|
|
let h = match Pdf.lookup_direct pdf "/Height" dict with Some (Pdf.Integer i) -> i | _ -> error "bad height" in
|
2023-12-29 22:49:56 +01:00
|
|
|
if w * h < pixel_threshold then (if !debug_image_processing then Printf.printf "pixel threshold not met\n%!"; None) else
|
2023-12-20 13:11:55 +01:00
|
|
|
let size = match Pdf.lookup_direct pdf "/Length" dict with Some (Pdf.Integer i) -> i | _ -> 0 in
|
2023-12-29 22:49:56 +01:00
|
|
|
if size < length_threshold then (if !debug_image_processing then Printf.printf "length threshold not met\n%!"; None) else
|
2023-12-29 18:22:02 +01:00
|
|
|
begin
|
|
|
|
Pdfcodec.decode_pdfstream_until_unknown pdf s;
|
2024-01-03 18:43:51 +01:00
|
|
|
match Pdf.lookup_direct pdf "/Filter" (fst !reference) with Some _ -> restore (); None | None ->
|
2024-02-21 19:41:26 +01:00
|
|
|
let out = Filename.temp_file "cpdf" ("convertin" ^ (if suitable_num pdf dict < 4 then ".pnm" else ".cmyk")) in
|
|
|
|
let out2 = Filename.temp_file "cpdf" ("convertout" ^ extension) in
|
2023-12-20 13:11:55 +01:00
|
|
|
let fh = open_out_bin out in
|
|
|
|
let data = match s with Pdf.Stream {contents = _, Pdf.Got d} -> d | _ -> assert false in
|
|
|
|
(if components = 3 then pnm_to_channel_24 else
|
|
|
|
if components = 4 then cmyk_to_channel_32 else pnm_to_channel_8) fh w h data;
|
|
|
|
close_out fh;
|
2023-12-29 18:22:02 +01:00
|
|
|
Some (out, out2, size, components, w, h)
|
2023-12-20 13:11:55 +01:00
|
|
|
end
|
|
|
|
| colspace, bpc ->
|
2024-02-20 20:41:49 +01:00
|
|
|
(*let colspace = Pdf.lookup_direct pdf "/ColorSpace" dict in
|
2023-12-20 13:11:55 +01:00
|
|
|
let colspace, bpc, filter =
|
|
|
|
(match colspace with None -> "none" | Some x -> Pdfwrite.string_of_pdf x),
|
|
|
|
(match bpc with None -> "none" | Some x -> Pdfwrite.string_of_pdf x),
|
|
|
|
(match Pdf.lookup_direct pdf "/Filter" dict with None -> "none" | Some x -> Pdfwrite.string_of_pdf x)
|
|
|
|
in
|
2023-12-22 17:12:19 +01:00
|
|
|
print_string (Pdfwrite.string_of_pdf dict);
|
2024-02-20 20:41:49 +01:00
|
|
|
print_string (Printf.sprintf "%s (%s) [%s]\n" colspace bpc filter);*)
|
2023-12-31 12:13:58 +01:00
|
|
|
if !debug_image_processing then Printf.printf "colourspace not suitable\n%!";
|
2024-01-03 18:43:51 +01:00
|
|
|
restore ();
|
2023-12-29 18:22:02 +01:00
|
|
|
None (* an image we cannot or do not handle *)
|
|
|
|
|
|
|
|
let lossless_to_jpeg pdf ~pixel_threshold ~length_threshold ~percentage_threshold ~qlossless ~path_to_convert s dict reference =
|
2024-02-20 20:41:49 +01:00
|
|
|
complain_convert path_to_convert;
|
2024-02-21 18:03:22 +01:00
|
|
|
match lossless_out pdf ~pixel_threshold ~length_threshold ".jpg" s dict reference with
|
|
|
|
| None -> ()
|
|
|
|
| Some (_, _, _, -2, _, _) ->
|
|
|
|
if !debug_image_processing then Printf.printf "skipping indexed colorspace\n%!"
|
|
|
|
| Some (out, out2, size, components, w, h) ->
|
2023-12-29 18:22:02 +01:00
|
|
|
let retcode =
|
|
|
|
let command =
|
|
|
|
(Filename.quote_command path_to_convert
|
|
|
|
((if components = 4 then ["-depth"; "8"; "-size"; string_of_int w ^ "x" ^ string_of_int h] else []) @
|
2024-02-01 17:38:07 +01:00
|
|
|
[out; "-quality"; string_of_float qlossless ^ "%"] @
|
2023-12-29 18:22:02 +01:00
|
|
|
(if components = 1 then ["-colorspace"; "Gray"] else if components = 4 then ["-colorspace"; "CMYK"] else []) @
|
|
|
|
[out2]))
|
|
|
|
in
|
|
|
|
(*Printf.printf "%S\n" command;*) Sys.command command
|
|
|
|
in
|
|
|
|
if retcode = 0 then
|
|
|
|
begin
|
2024-02-20 20:41:49 +01:00
|
|
|
try
|
|
|
|
let result = open_in_bin out2 in
|
|
|
|
let newsize = in_channel_length result in
|
|
|
|
let perc_ok = float newsize /. float size < percentage_threshold /. 100. in
|
|
|
|
if newsize < size && perc_ok then
|
|
|
|
begin
|
|
|
|
if !debug_image_processing then Printf.printf "lossless to JPEG %i -> %i (%i%%)\n%!" size newsize (int_of_float (float newsize /. float size *. 100.));
|
|
|
|
reference :=
|
|
|
|
(Pdf.add_dict_entry
|
|
|
|
(Pdf.add_dict_entry dict "/Length" (Pdf.Integer newsize))
|
|
|
|
"/Filter"
|
|
|
|
(Pdf.Name "/DCTDecode")),
|
|
|
|
Pdf.Got (Pdfio.bytes_of_input_channel result)
|
|
|
|
end
|
|
|
|
else
|
|
|
|
begin
|
|
|
|
if !debug_image_processing then Printf.printf "no size reduction\n%!"
|
|
|
|
end;
|
|
|
|
close_in result
|
|
|
|
with
|
|
|
|
_ ->
|
|
|
|
remove out;
|
|
|
|
remove out2
|
2023-12-29 18:22:02 +01:00
|
|
|
end;
|
2023-12-31 12:13:58 +01:00
|
|
|
remove out;
|
|
|
|
remove out2
|
2023-12-29 18:22:02 +01:00
|
|
|
|
2024-02-20 15:59:14 +01:00
|
|
|
let test_components pdf dict =
|
|
|
|
match suitable_num pdf dict with -1 | -2 -> 1 | x -> x
|
|
|
|
|
|
|
|
let test_bpc pdf dict =
|
|
|
|
match Pdf.lookup_direct pdf "/BitsPerComponent" dict with
|
|
|
|
| Some (Pdf.Integer i) -> i
|
|
|
|
| _ -> 0
|
|
|
|
|
2024-02-01 14:29:20 +01:00
|
|
|
let lossless_resample pdf ~pixel_threshold ~length_threshold ~factor ~interpolate ~path_to_convert s dict reference =
|
2024-02-20 20:41:49 +01:00
|
|
|
complain_convert path_to_convert;
|
2024-02-20 15:59:14 +01:00
|
|
|
let in_components = test_components pdf dict in
|
|
|
|
let in_bpc = test_bpc pdf dict in
|
2024-02-20 20:41:49 +01:00
|
|
|
(*Printf.printf "***lossless_resample IN dictionary: %S\n" (Pdfwrite.string_of_pdf dict); *)
|
2024-02-20 17:02:56 +01:00
|
|
|
(*Printf.printf "\n***IN components = %i, bpc = %i\n" in_components in_bpc;*)
|
2024-01-02 15:58:35 +01:00
|
|
|
match lossless_out pdf ~pixel_threshold ~length_threshold ".png" s dict reference with
|
|
|
|
| None -> ()
|
|
|
|
| Some (_, _, _, 4, _, _) -> Printf.printf "lossless resampling for CMYK not supported yet\n%!"
|
|
|
|
| Some (out, out2, size, components, w, h) ->
|
|
|
|
let retcode =
|
|
|
|
let command =
|
|
|
|
Filename.quote_command path_to_convert
|
2024-02-22 16:56:35 +01:00
|
|
|
([out] @ (if components = 4 then ["-depth"; "8"; "-size"; string_of_int w ^ "x" ^ string_of_int h] else []) @
|
2024-01-02 15:58:35 +01:00
|
|
|
(if components = 1 then ["-define"; "png:color-type=0"; "-colorspace"; "Gray"] else if components = 3 then ["-define"; "-png:color-type=2"; "-colorspace"; "RGB"] else if components = 4 then ["-colorspace"; "CMYK"] else []) @
|
2024-02-22 16:56:35 +01:00
|
|
|
[if interpolate && components > -2 then "-resize" else "-sample"; string_of_float factor ^ "%"; out2])
|
2024-01-02 15:15:06 +01:00
|
|
|
in
|
2024-01-02 18:54:16 +01:00
|
|
|
(*Printf.printf "%S\n" command;*)
|
2024-01-02 15:58:35 +01:00
|
|
|
Sys.command command
|
2023-12-29 18:22:02 +01:00
|
|
|
in
|
2024-01-02 15:58:35 +01:00
|
|
|
try
|
2023-12-29 18:22:02 +01:00
|
|
|
if retcode = 0 then
|
|
|
|
begin
|
2024-01-02 15:58:35 +01:00
|
|
|
let result = open_in_bin out2 in
|
2023-12-29 18:22:02 +01:00
|
|
|
let newsize = in_channel_length result in
|
|
|
|
if newsize < size then
|
|
|
|
begin
|
|
|
|
reference :=
|
2024-01-01 20:09:40 +01:00
|
|
|
(match fst (obj_of_png_data (Pdfio.bytes_of_input_channel result)) with
|
2024-02-20 15:59:14 +01:00
|
|
|
| Pdf.Stream {contents = Pdf.Dictionary d, data} as s ->
|
|
|
|
let out_components = test_components pdf s in
|
|
|
|
let out_bpc = test_bpc pdf s in
|
2024-02-20 17:02:56 +01:00
|
|
|
(*Printf.printf "***OUT components = %i, bpc = %i\n" out_components out_bpc;*)
|
|
|
|
let rgb_to_grey_special =
|
|
|
|
let was_rgb =
|
|
|
|
match Pdf.lookup_direct pdf "/ColorSpace" dict with
|
|
|
|
| Some (Pdf.Name ("/DeviceRGB" | "/CalRGB")) -> true
|
|
|
|
| _ -> false
|
|
|
|
in
|
|
|
|
in_bpc = out_bpc && in_components = 3 && out_components = 1 && was_rgb
|
|
|
|
in
|
|
|
|
(*Printf.printf "***rgb_to_grey_special = %b\n" rgb_to_grey_special;*)
|
|
|
|
if (out_components <> in_components || in_bpc <> out_bpc) && not rgb_to_grey_special then
|
2024-02-20 15:59:14 +01:00
|
|
|
begin
|
|
|
|
if !debug_image_processing then Printf.printf "wrong bpc / components returned. Skipping.\n%!";
|
|
|
|
!reference
|
|
|
|
end
|
|
|
|
else
|
|
|
|
begin
|
|
|
|
if !debug_image_processing then Printf.printf "lossless resample %i -> %i (%i%%)\n%!" size newsize (int_of_float (float newsize /. float size *. 100.));
|
2024-02-20 17:02:56 +01:00
|
|
|
let d' = fold_right (fun (k, v) d -> if k <> "/ColorSpace" || rgb_to_grey_special then add k v d else d) d (match dict with Pdf.Dictionary x -> x | _ -> []) in
|
2024-02-20 15:59:14 +01:00
|
|
|
(*Printf.printf "***lossless_resample OUT dictionary: %S\n" (Pdfwrite.string_of_pdf (Pdf.Dictionary d')); *)
|
|
|
|
(Pdf.Dictionary d', data)
|
|
|
|
end
|
2024-01-01 20:09:40 +01:00
|
|
|
| _ -> assert false)
|
2023-12-29 22:49:56 +01:00
|
|
|
end
|
|
|
|
else
|
|
|
|
begin
|
|
|
|
if !debug_image_processing then Printf.printf "no size reduction\n%!"
|
2023-12-29 18:22:02 +01:00
|
|
|
end;
|
|
|
|
close_in result
|
2024-02-01 20:31:39 +01:00
|
|
|
end;
|
|
|
|
remove out;
|
|
|
|
remove out2
|
2024-02-20 20:41:49 +01:00
|
|
|
with _ ->
|
|
|
|
remove out;
|
|
|
|
remove out2
|
2023-12-20 13:11:55 +01:00
|
|
|
|
2024-02-01 16:41:27 +01:00
|
|
|
let lossless_resample_target_dpi objnum pdf ~pixel_threshold ~length_threshold ~factor ~target_dpi_info ~interpolate ~path_to_convert s dict reference =
|
2024-11-06 21:03:43 +01:00
|
|
|
try
|
|
|
|
let real_factor = factor /. Hashtbl.find target_dpi_info objnum *. 100. in
|
|
|
|
if real_factor < 100. then
|
|
|
|
lossless_resample pdf ~pixel_threshold ~length_threshold ~factor:real_factor ~interpolate ~path_to_convert s dict reference
|
|
|
|
else
|
|
|
|
if !debug_image_processing then Printf.printf "failed to meet dpi target\n%!"
|
|
|
|
with
|
2024-11-06 21:13:57 +01:00
|
|
|
Not_found -> Printf.eprintf "Warning: orphaned image, skipping\n" (* Could not find DPI data - an orphan image. *)
|
2024-02-01 15:29:41 +01:00
|
|
|
|
2024-11-06 17:18:57 +01:00
|
|
|
let jpeg_to_jpeg_wrapper objnum pdf ~target_dpi_info ~pixel_threshold ~length_threshold ~percentage_threshold ~jpeg_to_jpeg_scale ~jpeg_to_jpeg_dpi ~interpolate ~q ~path_to_convert s dict reference =
|
|
|
|
if jpeg_to_jpeg_dpi = 0. then
|
|
|
|
jpeg_to_jpeg pdf ~pixel_threshold ~length_threshold ~percentage_threshold ~jpeg_to_jpeg_scale ~interpolate ~q ~path_to_convert s dict reference
|
|
|
|
else
|
2024-11-06 21:03:43 +01:00
|
|
|
try
|
|
|
|
let factor = jpeg_to_jpeg_dpi in
|
|
|
|
let real_factor = factor /. Hashtbl.find target_dpi_info objnum *. 100. in
|
|
|
|
if real_factor < 100. then
|
|
|
|
jpeg_to_jpeg pdf ~pixel_threshold ~length_threshold ~percentage_threshold ~jpeg_to_jpeg_scale:real_factor ~interpolate ~q ~path_to_convert s dict reference
|
|
|
|
else
|
|
|
|
if !debug_image_processing then Printf.printf "failed to meet dpi target\n%!"
|
|
|
|
with
|
2024-11-06 21:13:57 +01:00
|
|
|
Not_found -> Printf.eprintf "Warning: orphaned image, skipping\n" (* Could not find DPI data - an orphan image. *)
|
2024-11-06 17:18:57 +01:00
|
|
|
|
2023-12-28 12:32:43 +01:00
|
|
|
let recompress_1bpp_jbig2_lossless ~pixel_threshold ~length_threshold ~path_to_jbig2enc pdf s dict reference =
|
2024-02-20 19:55:20 +01:00
|
|
|
complain_jbig2enc path_to_jbig2enc;
|
2024-01-03 18:43:51 +01:00
|
|
|
let old = !reference in
|
|
|
|
let restore () = reference := old in
|
2023-12-24 14:54:21 +01:00
|
|
|
let w = match Pdf.lookup_direct pdf "/Width" dict with Some (Pdf.Integer i) -> i | _ -> error "bad width" in
|
|
|
|
let h = match Pdf.lookup_direct pdf "/Height" dict with Some (Pdf.Integer i) -> i | _ -> error "bad height" in
|
2023-12-29 22:49:56 +01:00
|
|
|
if w * h < pixel_threshold then (if !debug_image_processing then Printf.printf "pixel threshold not met\n%!") else (* (but also, jbig2enc fails on tiny images) *)
|
2023-12-22 22:21:23 +01:00
|
|
|
let size = match Pdf.lookup_direct pdf "/Length" dict with Some (Pdf.Integer i) -> i | _ -> 0 in
|
2023-12-31 12:13:58 +01:00
|
|
|
if size < length_threshold then (if !debug_image_processing then Printf.printf "length threshold not met\n%!") else
|
|
|
|
begin
|
|
|
|
Pdfcodec.decode_pdfstream_until_unknown pdf s;
|
|
|
|
match Pdf.lookup_direct pdf "/Filter" (fst !reference) with
|
2024-01-03 18:43:51 +01:00
|
|
|
| Some x ->
|
|
|
|
if !debug_image_processing then Printf.printf "could not decode - skipping %s length %i\n%!" (Pdfwrite.string_of_pdf x) size;
|
|
|
|
restore ()
|
2023-12-31 12:13:58 +01:00
|
|
|
| None ->
|
2024-02-21 19:41:26 +01:00
|
|
|
let out = Filename.temp_file "cpdf" "convertin.pnm" in
|
|
|
|
let out2 = Filename.temp_file "cpdf" "convertout.jbig2" in
|
2023-12-31 12:13:58 +01:00
|
|
|
let fh = open_out_bin out in
|
|
|
|
let data = match s with Pdf.Stream {contents = _, Pdf.Got d} -> d | _ -> assert false in
|
|
|
|
pnm_to_channel_1_inverted fh w h data;
|
|
|
|
close_out fh;
|
|
|
|
let retcode =
|
2024-01-12 16:00:28 +01:00
|
|
|
let command = Filename.quote_command ~stdout:out2 path_to_jbig2enc ["-d"; "-p"; out] in
|
2023-12-31 12:13:58 +01:00
|
|
|
(*Printf.printf "%S\n" command;*) Sys.command command
|
|
|
|
in
|
2024-02-20 19:55:20 +01:00
|
|
|
if retcode <> 0 then
|
|
|
|
restore ()
|
|
|
|
else
|
2023-12-31 12:13:58 +01:00
|
|
|
begin
|
|
|
|
let result = open_in_bin out2 in
|
|
|
|
let newsize = in_channel_length result in
|
|
|
|
if newsize < size then
|
|
|
|
begin
|
|
|
|
if !debug_image_processing then Printf.printf "1bpp to JBIG2 %i -> %i (%i%%)\n%!" size newsize (int_of_float (float newsize /. float size *. 100.));
|
|
|
|
reference :=
|
|
|
|
(Pdf.remove_dict_entry
|
|
|
|
(Pdf.add_dict_entry
|
|
|
|
(Pdf.add_dict_entry dict "/Length" (Pdf.Integer newsize))
|
|
|
|
"/Filter"
|
|
|
|
(Pdf.Name "/JBIG2Decode")) "/DecodeParms"),
|
|
|
|
Pdf.Got (Pdfio.bytes_of_input_channel result)
|
|
|
|
end
|
|
|
|
else
|
|
|
|
begin
|
|
|
|
if !debug_image_processing then Printf.printf "no size reduction\n%!"
|
|
|
|
end;
|
|
|
|
close_in result
|
|
|
|
end;
|
|
|
|
remove out;
|
|
|
|
remove out2
|
|
|
|
end
|
2023-12-22 17:45:53 +01:00
|
|
|
|
2024-01-12 13:45:35 +01:00
|
|
|
(* Recompress 1bpp images (except existing JBIG2 compressed ones) to lossy jbig2 *)
|
2024-01-12 16:00:28 +01:00
|
|
|
let preprocess_jbig2_lossy ~path_to_jbig2enc ~jbig2_lossy_threshold ~length_threshold ~pixel_threshold ~dpi_threshold inrange highdpi pdf =
|
2024-02-20 19:55:20 +01:00
|
|
|
complain_jbig2enc path_to_jbig2enc;
|
2024-01-11 17:48:34 +01:00
|
|
|
let objnum_name_pairs = ref [] in
|
|
|
|
let process_obj objnum s =
|
2024-01-12 13:45:35 +01:00
|
|
|
match s with
|
|
|
|
| Pdf.Stream ({contents = dict, _} as reference) ->
|
|
|
|
let old = !reference in
|
|
|
|
let restore () = reference := old in
|
2024-02-01 17:38:07 +01:00
|
|
|
if Hashtbl.mem inrange objnum && (dpi_threshold = 0. || Hashtbl.mem highdpi objnum) then begin match
|
2024-01-12 13:45:35 +01:00
|
|
|
Pdf.lookup_direct pdf "/Subtype" dict,
|
|
|
|
Pdf.lookup_direct pdf "/BitsPerComponent" dict,
|
|
|
|
Pdf.lookup_direct pdf "/ImageMask" dict
|
|
|
|
with
|
|
|
|
| Some (Pdf.Name "/Image"), Some (Pdf.Integer 1), _
|
|
|
|
| Some (Pdf.Name "/Image"), _, Some (Pdf.Boolean true) ->
|
|
|
|
let w = match Pdf.lookup_direct pdf "/Width" dict with Some (Pdf.Integer i) -> i | _ -> error "bad width" in
|
|
|
|
let h = match Pdf.lookup_direct pdf "/Height" dict with Some (Pdf.Integer i) -> i | _ -> error "bad height" in
|
|
|
|
if w * h < pixel_threshold then (if !debug_image_processing then Printf.printf "pixel threshold not met\n%!") else (* (but also, jbig2enc fails on tiny images) *)
|
|
|
|
let size = match Pdf.lookup_direct pdf "/Length" dict with Some (Pdf.Integer i) -> i | _ -> 0 in
|
|
|
|
if size < length_threshold then (if !debug_image_processing then Printf.printf "length threshold not met\n%!") else
|
|
|
|
begin
|
|
|
|
Pdfcodec.decode_pdfstream_until_unknown pdf s;
|
|
|
|
match Pdf.lookup_direct pdf "/Filter" (fst !reference) with
|
|
|
|
| Some x ->
|
|
|
|
if !debug_image_processing then Printf.printf "could not decode - skipping %s length %i\n%!" (Pdfwrite.string_of_pdf x) size;
|
|
|
|
restore ()
|
|
|
|
| None ->
|
2024-02-21 19:41:26 +01:00
|
|
|
let out = Filename.temp_file "cpdf" "convertin.pnm" in
|
2024-01-12 13:45:35 +01:00
|
|
|
let fh = open_out_bin out in
|
|
|
|
let data = match s with Pdf.Stream {contents = _, Pdf.Got d} -> d | _ -> assert false in
|
|
|
|
pnm_to_channel_1_inverted fh w h data;
|
|
|
|
close_out fh;
|
2024-02-20 19:55:20 +01:00
|
|
|
if !debug_image_processing then Printf.printf "JBIG2Lossy: obj %i is suitable\n%!" objnum;
|
2024-01-12 13:45:35 +01:00
|
|
|
objnum_name_pairs := (objnum, out)::!objnum_name_pairs
|
|
|
|
end
|
|
|
|
| _ -> () (* not a 1bpp image *)
|
|
|
|
end
|
|
|
|
| _ -> () (* not a stream *)
|
2024-01-11 17:48:34 +01:00
|
|
|
in
|
2024-01-11 19:24:42 +01:00
|
|
|
Pdf.objiter process_obj pdf;
|
|
|
|
if length !objnum_name_pairs > 10000 then Pdfe.log "Too many jbig2 streams" else
|
2024-01-11 21:18:27 +01:00
|
|
|
if length !objnum_name_pairs = 0 then () else
|
2024-01-11 19:24:42 +01:00
|
|
|
let jbig2out = Filename.temp_file "cpdf" "jbig2" in
|
|
|
|
let retcode =
|
2024-01-12 14:06:29 +01:00
|
|
|
let command =
|
|
|
|
Filename.quote_command
|
|
|
|
path_to_jbig2enc
|
|
|
|
?stderr:(if !debug_image_processing then None else Some Filename.null)
|
2024-01-12 16:00:28 +01:00
|
|
|
(["-p"; "-s"; "-d"; "-t"; string_of_float jbig2_lossy_threshold; "-b"; jbig2out] @ map snd !objnum_name_pairs)
|
2024-01-12 14:06:29 +01:00
|
|
|
in
|
2024-01-11 19:24:42 +01:00
|
|
|
(*Printf.printf "%S\n" command;*) Sys.command command
|
|
|
|
in
|
2024-01-12 15:16:11 +01:00
|
|
|
iter remove (map snd !objnum_name_pairs);
|
2024-01-11 19:24:42 +01:00
|
|
|
if retcode = 0 then
|
|
|
|
begin
|
|
|
|
let globals = bytes_of_string (contents_of_file (jbig2out ^ ".sym")) in
|
|
|
|
let globalobj =
|
|
|
|
Pdf.addobj pdf (Pdf.Stream {contents = Pdf.Dictionary [("/Length", Pdf.Integer (bytes_size globals))], Pdf.Got globals})
|
|
|
|
in
|
2024-01-11 19:57:11 +01:00
|
|
|
iter2
|
|
|
|
(fun (objnum, _) i ->
|
|
|
|
let data = bytes_of_string (contents_of_file (jbig2out ^ Printf.sprintf ".%04i" i)) in
|
|
|
|
let basic_obj =
|
|
|
|
Pdf.Stream
|
|
|
|
{contents =
|
|
|
|
Pdf.Dictionary [("/Length", Pdf.Integer (bytes_size data));
|
|
|
|
("/Filter", Pdf.Name "/JBIG2Decode");
|
2024-01-11 20:36:54 +01:00
|
|
|
("/DecodeParms", Pdf.Dictionary [("/JBIG2Globals", Pdf.Indirect globalobj)])],
|
2024-01-11 19:57:11 +01:00
|
|
|
Pdf.Got data}
|
|
|
|
in
|
|
|
|
let dict = match Pdf.lookup_obj pdf objnum with Pdf.Stream {contents = d, _} -> d | _ -> Pdf.Dictionary [] in
|
|
|
|
Pdf.addobj_given_num pdf
|
|
|
|
(objnum,
|
|
|
|
(match basic_obj with
|
|
|
|
| Pdf.Stream {contents = Pdf.Dictionary d, data} ->
|
|
|
|
let d' = fold_right (fun (k, v) d -> add k v d) d (match dict with Pdf.Dictionary x -> x | _ -> []) in
|
|
|
|
Pdf.Stream {contents = Pdf.Dictionary d', data}
|
|
|
|
| _ -> assert false)))
|
|
|
|
!objnum_name_pairs
|
|
|
|
(indx0 !objnum_name_pairs)
|
2024-01-11 19:24:42 +01:00
|
|
|
end
|
2024-01-12 15:16:11 +01:00
|
|
|
else
|
|
|
|
begin
|
|
|
|
Pdfe.log "Call to jbig2enc failed"
|
|
|
|
end;
|
|
|
|
iter (fun i -> remove (jbig2out ^ Printf.sprintf ".%04i" i)) (indx0 !objnum_name_pairs);
|
|
|
|
remove (jbig2out ^ ".sym")
|
2024-01-10 18:09:49 +01:00
|
|
|
|
2023-12-27 20:53:02 +01:00
|
|
|
let process
|
2024-02-01 17:38:07 +01:00
|
|
|
~q ~qlossless ~onebppmethod ~jbig2_lossy_threshold ~length_threshold ~percentage_threshold ~pixel_threshold ~dpi_threshold
|
2024-11-05 15:02:57 +01:00
|
|
|
~factor ~interpolate ~jpeg_to_jpeg_scale ~jpeg_to_jpeg_dpi ~path_to_jbig2enc ~path_to_convert range pdf
|
2023-12-27 20:53:02 +01:00
|
|
|
=
|
2024-01-04 12:33:17 +01:00
|
|
|
let inrange =
|
|
|
|
match images pdf range with
|
|
|
|
| `List l -> hashset_of_list (map (function `Assoc (("Object", `Int i)::_) -> i | _ -> assert false) l)
|
|
|
|
| _ -> assert false
|
|
|
|
in
|
2024-02-01 16:22:19 +01:00
|
|
|
let highdpi, target_dpi_info =
|
|
|
|
let objnums, dpi =
|
2024-11-06 21:03:43 +01:00
|
|
|
if dpi_threshold = 0. && factor > 0. && jpeg_to_jpeg_dpi = 0. then ([], []) else
|
2024-01-04 20:31:21 +01:00
|
|
|
let results = image_resolution pdf range max_float in
|
2024-01-10 14:55:40 +01:00
|
|
|
(*iter (fun (_, _, _, _, wdpi, hdpi, objnum) -> Printf.printf "From image_resolution %f %f %i\n" wdpi hdpi objnum) results;*)
|
2024-01-04 20:31:21 +01:00
|
|
|
let cmp (_, _, _, _, _, _, a) (_, _, _, _, _, _, b) = compare a b in
|
|
|
|
let sets = collate cmp (sort cmp results) in
|
|
|
|
let heads = map hd (map (sort (fun (_, _, _, _, a, b, _) (_, _, _, _, c, d, _) -> compare (fmin a b) (fmin c d))) sets) in
|
2024-01-10 14:55:40 +01:00
|
|
|
(*iter (fun (_, _, _, _, wdpi, hdpi, objnum) -> Printf.printf "Lowest resolution exemplar %f %f %i\n" wdpi hdpi objnum) heads;*)
|
2024-02-01 17:38:07 +01:00
|
|
|
let needed = keep (fun (_, _, _, _, wdpi, hdpi, objnum) -> fmin wdpi hdpi > dpi_threshold) heads in
|
2024-01-10 14:55:40 +01:00
|
|
|
(*iter (fun (_, _, _, _, wdpi, hdpi, objnum) -> Printf.printf "keep %f %f %i\n" wdpi hdpi objnum) needed;*)
|
2024-02-01 16:22:19 +01:00
|
|
|
map (fun (_, _, _, _, _, _, objnum) -> objnum) needed,
|
|
|
|
map (fun (_, _, _, _, wdpi, hdpi, objnum) -> (objnum, fmin wdpi hdpi)) heads
|
2024-02-01 20:31:39 +01:00
|
|
|
(*iter (fun (x, d) -> Printf.printf "obj %i at %f dpi\n" x d) r; r*)
|
2024-01-04 20:31:21 +01:00
|
|
|
in
|
2024-02-01 16:41:27 +01:00
|
|
|
hashset_of_list objnums, hashtable_of_dictionary dpi
|
2024-01-04 20:31:21 +01:00
|
|
|
in
|
2024-02-01 17:38:07 +01:00
|
|
|
begin match onebppmethod with "JBIG2Lossy" -> preprocess_jbig2_lossy ~path_to_jbig2enc ~jbig2_lossy_threshold ~dpi_threshold ~length_threshold ~pixel_threshold inrange highdpi pdf | _ -> () end;
|
2023-12-29 21:09:50 +01:00
|
|
|
let nobjects = Pdf.objcard pdf in
|
|
|
|
let ndone = ref 0 in
|
2023-12-28 17:18:25 +01:00
|
|
|
let process_obj objnum s =
|
2023-12-17 13:58:32 +01:00
|
|
|
match s with
|
|
|
|
| Pdf.Stream ({contents = dict, _} as reference) ->
|
2023-12-29 21:09:50 +01:00
|
|
|
ndone += 1;
|
2024-02-01 17:38:07 +01:00
|
|
|
if Hashtbl.mem inrange objnum && (dpi_threshold = 0. || Hashtbl.mem highdpi objnum) then begin match
|
2023-12-22 17:12:19 +01:00
|
|
|
Pdf.lookup_direct pdf "/Subtype" dict,
|
|
|
|
Pdf.lookup_direct pdf "/Filter" dict,
|
|
|
|
Pdf.lookup_direct pdf "/BitsPerComponent" dict,
|
|
|
|
Pdf.lookup_direct pdf "/ImageMask" dict
|
|
|
|
with
|
|
|
|
| Some (Pdf.Name "/Image"), Some (Pdf.Name "/DCTDecode" | Pdf.Array [Pdf.Name "/DCTDecode"]), _, _ ->
|
2024-11-06 16:15:32 +01:00
|
|
|
if q < 100. || jpeg_to_jpeg_scale <> 100. || jpeg_to_jpeg_dpi <> 0. then
|
2024-02-01 17:38:07 +01:00
|
|
|
begin
|
|
|
|
if !debug_image_processing then Printf.printf "(%i/%i) Object %i (JPEG)... %!" !ndone nobjects objnum;
|
2024-11-06 17:18:57 +01:00
|
|
|
jpeg_to_jpeg_wrapper objnum pdf ~target_dpi_info ~pixel_threshold ~length_threshold ~percentage_threshold ~jpeg_to_jpeg_scale ~jpeg_to_jpeg_dpi ~interpolate ~q ~path_to_convert s dict reference
|
2024-02-01 17:38:07 +01:00
|
|
|
end
|
2023-12-22 17:12:19 +01:00
|
|
|
| Some (Pdf.Name "/Image"), _, Some (Pdf.Integer 1), _
|
|
|
|
| Some (Pdf.Name "/Image"), _, _, Some (Pdf.Boolean true) ->
|
2023-12-28 17:18:25 +01:00
|
|
|
begin match onebppmethod with
|
2024-02-01 17:38:07 +01:00
|
|
|
| "JBIG2" ->
|
2023-12-29 22:49:56 +01:00
|
|
|
begin
|
|
|
|
if !debug_image_processing then Printf.printf "(%i/%i) object %i (1bpp)... %!" !ndone nobjects objnum;
|
|
|
|
recompress_1bpp_jbig2_lossless ~pixel_threshold ~length_threshold ~path_to_jbig2enc pdf s dict reference
|
|
|
|
end
|
2023-12-28 17:18:25 +01:00
|
|
|
| _ -> ()
|
|
|
|
end
|
2023-12-22 17:12:19 +01:00
|
|
|
| Some (Pdf.Name "/Image"), _, _, _ ->
|
2024-02-01 17:38:07 +01:00
|
|
|
if qlossless < 101. then
|
|
|
|
begin
|
|
|
|
if !debug_image_processing then Printf.printf "(%i/%i) object %i (lossless)... %!" !ndone nobjects objnum;
|
|
|
|
lossless_to_jpeg pdf ~pixel_threshold ~length_threshold ~percentage_threshold ~qlossless ~path_to_convert s dict reference
|
|
|
|
end
|
|
|
|
else
|
|
|
|
begin
|
|
|
|
if factor < 101. then
|
2023-12-29 22:49:56 +01:00
|
|
|
begin
|
|
|
|
if !debug_image_processing then Printf.printf "(%i/%i) object %i (lossless)... %!" !ndone nobjects objnum;
|
2024-02-01 17:38:07 +01:00
|
|
|
if factor < 0. then
|
|
|
|
lossless_resample_target_dpi objnum pdf ~pixel_threshold ~length_threshold ~factor:~-.factor ~target_dpi_info ~interpolate ~path_to_convert s dict reference
|
|
|
|
else
|
|
|
|
lossless_resample pdf ~pixel_threshold ~length_threshold ~factor ~interpolate ~path_to_convert s dict reference
|
2023-12-29 22:49:56 +01:00
|
|
|
end
|
2024-02-01 17:38:07 +01:00
|
|
|
end
|
2023-12-17 13:58:32 +01:00
|
|
|
| _ -> () (* not an image *)
|
|
|
|
end
|
2023-12-29 21:09:50 +01:00
|
|
|
| _ -> ndone += 1 (* not a stream *)
|
2023-12-07 15:54:47 +01:00
|
|
|
in
|
|
|
|
Pdf.objiter process_obj pdf
|