Begin cleaning up jbig2globals work

This commit is contained in:
John Whitington 2024-01-12 12:45:35 +00:00
parent 68a4e31af2
commit 6f5c20e409
1 changed files with 35 additions and 45 deletions

View File

@ -710,69 +710,59 @@ let recompress_1bpp_jbig2_lossless ~pixel_threshold ~length_threshold ~path_to_j
remove out2 remove out2
end end
(* Recompress 1bpp images, compressed any way (none, flate, ccitt, jbig2 lossless) to lossy jbig2 *) (* Recompress 1bpp images (except existing JBIG2 compressed ones) to lossy jbig2 *)
(* For now, just example 6406.pdf, which is CCITT, so simple decompression will do. For lossless
to lossy JBIG2, we will need to write out and convert to PNM with jbig2dec *)
(* FIXME: Need interface for jbig2 lossy parameters *)
let preprocess_jbig2_lossy ~path_to_jbig2enc ~length_threshold ~pixel_threshold ~dpi_threshold inrange highdpi pdf = let preprocess_jbig2_lossy ~path_to_jbig2enc ~length_threshold ~pixel_threshold ~dpi_threshold inrange highdpi pdf =
let objnum_name_pairs = ref [] in let objnum_name_pairs = ref [] in
let process_obj objnum s = let process_obj objnum s =
(* Write out each stream as a *.pnm, if we choose to process it, restoring if not. *) match s with
match s with | Pdf.Stream ({contents = dict, _} as reference) ->
| Pdf.Stream ({contents = dict, _} as reference) -> let old = !reference in
let old = !reference in let restore () = reference := old in
let restore () = reference := old in if Hashtbl.mem inrange objnum && (dpi_threshold = 0 || Hashtbl.mem highdpi objnum) then begin match
if Hashtbl.mem inrange objnum && (dpi_threshold = 0 || Hashtbl.mem highdpi objnum) then begin match Pdf.lookup_direct pdf "/Subtype" dict,
Pdf.lookup_direct pdf "/Subtype" dict, Pdf.lookup_direct pdf "/BitsPerComponent" dict,
Pdf.lookup_direct pdf "/BitsPerComponent" dict, Pdf.lookup_direct pdf "/ImageMask" dict
Pdf.lookup_direct pdf "/ImageMask" dict with
with | Some (Pdf.Name "/Image"), Some (Pdf.Integer 1), _
| Some (Pdf.Name "/Image"), Some (Pdf.Integer 1), _ | Some (Pdf.Name "/Image"), _, Some (Pdf.Boolean true) ->
| Some (Pdf.Name "/Image"), _, Some (Pdf.Boolean true) -> let w = match Pdf.lookup_direct pdf "/Width" dict with Some (Pdf.Integer i) -> i | _ -> error "bad width" in
let w = match Pdf.lookup_direct pdf "/Width" dict with Some (Pdf.Integer i) -> i | _ -> error "bad width" in let h = match Pdf.lookup_direct pdf "/Height" dict with Some (Pdf.Integer i) -> i | _ -> error "bad height" in
let h = match Pdf.lookup_direct pdf "/Height" dict with Some (Pdf.Integer i) -> i | _ -> error "bad height" in if w * h < pixel_threshold then (if !debug_image_processing then Printf.printf "pixel threshold not met\n%!") else (* (but also, jbig2enc fails on tiny images) *)
if w * h < pixel_threshold then (if !debug_image_processing then Printf.printf "pixel threshold not met\n%!") else (* (but also, jbig2enc fails on tiny images) *) let size = match Pdf.lookup_direct pdf "/Length" dict with Some (Pdf.Integer i) -> i | _ -> 0 in
let size = match Pdf.lookup_direct pdf "/Length" dict with Some (Pdf.Integer i) -> i | _ -> 0 in if size < length_threshold then (if !debug_image_processing then Printf.printf "length threshold not met\n%!") else
if size < length_threshold then (if !debug_image_processing then Printf.printf "length threshold not met\n%!") else begin
begin Pdfcodec.decode_pdfstream_until_unknown pdf s;
Pdfcodec.decode_pdfstream_until_unknown pdf s; match Pdf.lookup_direct pdf "/Filter" (fst !reference) with
match Pdf.lookup_direct pdf "/Filter" (fst !reference) with | Some x ->
| Some x -> if !debug_image_processing then Printf.printf "could not decode - skipping %s length %i\n%!" (Pdfwrite.string_of_pdf x) size;
if !debug_image_processing then Printf.printf "could not decode - skipping %s length %i\n%!" (Pdfwrite.string_of_pdf x) size; restore ()
restore () | None ->
| None -> let out = Filename.temp_file "cpdf" "convertin" ^ ".pnm" in
let out = Filename.temp_file "cpdf" "convertin" ^ ".pnm" in let fh = open_out_bin out in
let fh = open_out_bin out in let data = match s with Pdf.Stream {contents = _, Pdf.Got d} -> d | _ -> assert false in
let data = match s with Pdf.Stream {contents = _, Pdf.Got d} -> d | _ -> assert false in pnm_to_channel_1_inverted fh w h data;
pnm_to_channel_1_inverted fh w h data; close_out fh;
close_out fh; (*Printf.printf "obj %i = %s\n%!" objnum out;*)
Printf.printf "obj %i = %s\n%!" objnum out; objnum_name_pairs := (objnum, out)::!objnum_name_pairs
objnum_name_pairs := (objnum, out)::!objnum_name_pairs end
end | _ -> () (* not a 1bpp image *)
| _ -> () (* not a 1bpp image *) end
end | _ -> () (* not a stream *)
| _ -> () (* not a stream *)
in in
Pdf.objiter process_obj pdf; Pdf.objiter process_obj pdf;
if length !objnum_name_pairs > 10000 then Pdfe.log "Too many jbig2 streams" else if length !objnum_name_pairs > 10000 then Pdfe.log "Too many jbig2 streams" else
if length !objnum_name_pairs = 0 then () else if length !objnum_name_pairs = 0 then () else
(* Call jbig2 to generate one *.jbig2 for each, and a *.jbig2globals *)
let jbig2out = Filename.temp_file "cpdf" "jbig2" in let jbig2out = Filename.temp_file "cpdf" "jbig2" in
Printf.printf "jbig2out: %s" jbig2out;
(* FIXME: redirect stdout or stderr from jbig2 to remove summary report. *)
let retcode = let retcode =
let command = Filename.quote_command path_to_jbig2enc (["-p"; "-s"; "-b"; jbig2out] @ map snd !objnum_name_pairs) in let command = Filename.quote_command path_to_jbig2enc (["-p"; "-s"; "-b"; jbig2out] @ map snd !objnum_name_pairs) in
(*Printf.printf "%S\n" command;*) Sys.command command (*Printf.printf "%S\n" command;*) Sys.command command
in in
if retcode = 0 then if retcode = 0 then
begin begin
(* Build the JBIG2Globals stream for the file *)
let globals = bytes_of_string (contents_of_file (jbig2out ^ ".sym")) in let globals = bytes_of_string (contents_of_file (jbig2out ^ ".sym")) in
let globalobj = let globalobj =
Pdf.addobj pdf (Pdf.Stream {contents = Pdf.Dictionary [("/Length", Pdf.Integer (bytes_size globals))], Pdf.Got globals}) Pdf.addobj pdf (Pdf.Stream {contents = Pdf.Dictionary [("/Length", Pdf.Integer (bytes_size globals))], Pdf.Got globals})
in in
(* For each file, read in the new JBIG2 data, and build each new image stream to replace the old one, using the same
overwriting technique as elsewhere. *)
iter2 iter2
(fun (objnum, _) i -> (fun (objnum, _) i ->
let data = bytes_of_string (contents_of_file (jbig2out ^ Printf.sprintf ".%04i" i)) in let data = bytes_of_string (contents_of_file (jbig2out ^ Printf.sprintf ".%04i" i)) in