From 6f5c20e4090a0a4a4b0c504b3efa0758cf6fb10f Mon Sep 17 00:00:00 2001 From: John Whitington Date: Fri, 12 Jan 2024 12:45:35 +0000 Subject: [PATCH] Begin cleaning up jbig2globals work --- cpdfimage.ml | 80 +++++++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 45 deletions(-) diff --git a/cpdfimage.ml b/cpdfimage.ml index 82013f2..6c4b2a0 100644 --- a/cpdfimage.ml +++ b/cpdfimage.ml @@ -710,69 +710,59 @@ let recompress_1bpp_jbig2_lossless ~pixel_threshold ~length_threshold ~path_to_j remove out2 end -(* Recompress 1bpp images, compressed any way (none, flate, ccitt, jbig2 lossless) to lossy jbig2 *) -(* For now, just example 6406.pdf, which is CCITT, so simple decompression will do. For lossless -to lossy JBIG2, we will need to write out and convert to PNM with jbig2dec *) -(* FIXME: Need interface for jbig2 lossy parameters *) +(* Recompress 1bpp images (except existing JBIG2 compressed ones) to lossy jbig2 *) let preprocess_jbig2_lossy ~path_to_jbig2enc ~length_threshold ~pixel_threshold ~dpi_threshold inrange highdpi pdf = let objnum_name_pairs = ref [] in let process_obj objnum s = - (* Write out each stream as a *.pnm, if we choose to process it, restoring if not. *) - match s with - | Pdf.Stream ({contents = dict, _} as reference) -> - let old = !reference in - let restore () = reference := old in - if Hashtbl.mem inrange objnum && (dpi_threshold = 0 || Hashtbl.mem highdpi objnum) then begin match - Pdf.lookup_direct pdf "/Subtype" dict, - Pdf.lookup_direct pdf "/BitsPerComponent" dict, - Pdf.lookup_direct pdf "/ImageMask" dict - with - | Some (Pdf.Name "/Image"), Some (Pdf.Integer 1), _ - | Some (Pdf.Name "/Image"), _, Some (Pdf.Boolean true) -> - let w = match Pdf.lookup_direct pdf "/Width" dict with Some (Pdf.Integer i) -> i | _ -> error "bad width" in - let h = match Pdf.lookup_direct pdf "/Height" dict with Some (Pdf.Integer i) -> i | _ -> error "bad height" in - if w * h < pixel_threshold then (if !debug_image_processing then Printf.printf "pixel threshold not met\n%!") else (* (but also, jbig2enc fails on tiny images) *) - let size = match Pdf.lookup_direct pdf "/Length" dict with Some (Pdf.Integer i) -> i | _ -> 0 in - if size < length_threshold then (if !debug_image_processing then Printf.printf "length threshold not met\n%!") else - begin - Pdfcodec.decode_pdfstream_until_unknown pdf s; - match Pdf.lookup_direct pdf "/Filter" (fst !reference) with - | Some x -> - if !debug_image_processing then Printf.printf "could not decode - skipping %s length %i\n%!" (Pdfwrite.string_of_pdf x) size; - restore () - | None -> - let out = Filename.temp_file "cpdf" "convertin" ^ ".pnm" in - let fh = open_out_bin out in - let data = match s with Pdf.Stream {contents = _, Pdf.Got d} -> d | _ -> assert false in - pnm_to_channel_1_inverted fh w h data; - close_out fh; - Printf.printf "obj %i = %s\n%!" objnum out; - objnum_name_pairs := (objnum, out)::!objnum_name_pairs - end - | _ -> () (* not a 1bpp image *) - end - | _ -> () (* not a stream *) + match s with + | Pdf.Stream ({contents = dict, _} as reference) -> + let old = !reference in + let restore () = reference := old in + if Hashtbl.mem inrange objnum && (dpi_threshold = 0 || Hashtbl.mem highdpi objnum) then begin match + Pdf.lookup_direct pdf "/Subtype" dict, + Pdf.lookup_direct pdf "/BitsPerComponent" dict, + Pdf.lookup_direct pdf "/ImageMask" dict + with + | Some (Pdf.Name "/Image"), Some (Pdf.Integer 1), _ + | Some (Pdf.Name "/Image"), _, Some (Pdf.Boolean true) -> + let w = match Pdf.lookup_direct pdf "/Width" dict with Some (Pdf.Integer i) -> i | _ -> error "bad width" in + let h = match Pdf.lookup_direct pdf "/Height" dict with Some (Pdf.Integer i) -> i | _ -> error "bad height" in + if w * h < pixel_threshold then (if !debug_image_processing then Printf.printf "pixel threshold not met\n%!") else (* (but also, jbig2enc fails on tiny images) *) + let size = match Pdf.lookup_direct pdf "/Length" dict with Some (Pdf.Integer i) -> i | _ -> 0 in + if size < length_threshold then (if !debug_image_processing then Printf.printf "length threshold not met\n%!") else + begin + Pdfcodec.decode_pdfstream_until_unknown pdf s; + match Pdf.lookup_direct pdf "/Filter" (fst !reference) with + | Some x -> + if !debug_image_processing then Printf.printf "could not decode - skipping %s length %i\n%!" (Pdfwrite.string_of_pdf x) size; + restore () + | None -> + let out = Filename.temp_file "cpdf" "convertin" ^ ".pnm" in + let fh = open_out_bin out in + let data = match s with Pdf.Stream {contents = _, Pdf.Got d} -> d | _ -> assert false in + pnm_to_channel_1_inverted fh w h data; + close_out fh; + (*Printf.printf "obj %i = %s\n%!" objnum out;*) + objnum_name_pairs := (objnum, out)::!objnum_name_pairs + end + | _ -> () (* not a 1bpp image *) + end + | _ -> () (* not a stream *) in Pdf.objiter process_obj pdf; if length !objnum_name_pairs > 10000 then Pdfe.log "Too many jbig2 streams" else if length !objnum_name_pairs = 0 then () else - (* Call jbig2 to generate one *.jbig2 for each, and a *.jbig2globals *) let jbig2out = Filename.temp_file "cpdf" "jbig2" in - Printf.printf "jbig2out: %s" jbig2out; - (* FIXME: redirect stdout or sterr from jbig2 to remove summary report. *) let retcode = let command = Filename.quote_command path_to_jbig2enc (["-p"; "-s"; "-b"; jbig2out] @ map snd !objnum_name_pairs) in (*Printf.printf "%S\n" command;*) Sys.command command in if retcode = 0 then begin - (* Build the JBIG2Globals stream for the file *) let globals = bytes_of_string (contents_of_file (jbig2out ^ ".sym")) in let globalobj = Pdf.addobj pdf (Pdf.Stream {contents = Pdf.Dictionary [("/Length", Pdf.Integer (bytes_size globals))], Pdf.Got globals}) in - (* For each file, read in the new JBIG2 data, and build each new image stream to replace the old one, using the same - overwriting technique as elsewhere. *) iter2 (fun (objnum, _) i -> let data = bytes_of_string (contents_of_file (jbig2out ^ Printf.sprintf ".%04i" i)) in