towards better -process-images-info

This commit is contained in:
John Whitington 2023-12-29 21:49:56 +00:00
parent c953d0f2a9
commit 70e9092e01
1 changed file with 35 additions and 15 deletions

View File

@ -494,10 +494,10 @@ let image_of_input fobj i =
let jpeg_to_jpeg pdf ~pixel_threshold ~length_threshold ~percentage_threshold ~q ~path_to_convert s dict reference = let jpeg_to_jpeg pdf ~pixel_threshold ~length_threshold ~percentage_threshold ~q ~path_to_convert s dict reference =
let w = match Pdf.lookup_direct pdf "/Width" dict with Some (Pdf.Integer i) -> i | _ -> error "bad width" in let w = match Pdf.lookup_direct pdf "/Width" dict with Some (Pdf.Integer i) -> i | _ -> error "bad width" in
let h = match Pdf.lookup_direct pdf "/Height" dict with Some (Pdf.Integer i) -> i | _ -> error "bad height" in let h = match Pdf.lookup_direct pdf "/Height" dict with Some (Pdf.Integer i) -> i | _ -> error "bad height" in
if w * h < pixel_threshold then () else if w * h < pixel_threshold then (if !debug_image_processing then Printf.printf "pixel threshold not met\n%!") else
Pdf.getstream s; Pdf.getstream s;
let size = match Pdf.lookup_direct pdf "/Length" dict with Some (Pdf.Integer i) -> i | _ -> 0 in let size = match Pdf.lookup_direct pdf "/Length" dict with Some (Pdf.Integer i) -> i | _ -> 0 in
if size < length_threshold then () else if size < length_threshold then (if !debug_image_processing then Printf.printf "length threshold not met\n%!") else
let out = Filename.temp_file "cpdf" "convertin" ^ ".jpg" in let out = Filename.temp_file "cpdf" "convertin" ^ ".jpg" in
let out2 = Filename.temp_file "cpdf" "convertout" ^ ".jpg" in let out2 = Filename.temp_file "cpdf" "convertout" ^ ".jpg" in
let fh = open_out_bin out in let fh = open_out_bin out in
@ -542,9 +542,9 @@ let lossless_out pdf ~pixel_threshold ~length_threshold extension s dict referen
| (1 | 3 | 4 | -1), Some (Pdf.Integer 8) -> | (1 | 3 | 4 | -1), Some (Pdf.Integer 8) ->
let w = match Pdf.lookup_direct pdf "/Width" dict with Some (Pdf.Integer i) -> i | _ -> error "bad width" in let w = match Pdf.lookup_direct pdf "/Width" dict with Some (Pdf.Integer i) -> i | _ -> error "bad width" in
let h = match Pdf.lookup_direct pdf "/Height" dict with Some (Pdf.Integer i) -> i | _ -> error "bad height" in let h = match Pdf.lookup_direct pdf "/Height" dict with Some (Pdf.Integer i) -> i | _ -> error "bad height" in
if w * h < pixel_threshold then None else if w * h < pixel_threshold then (if !debug_image_processing then Printf.printf "pixel threshold not met\n%!"; None) else
let size = match Pdf.lookup_direct pdf "/Length" dict with Some (Pdf.Integer i) -> i | _ -> 0 in let size = match Pdf.lookup_direct pdf "/Length" dict with Some (Pdf.Integer i) -> i | _ -> 0 in
if size < length_threshold then None else if size < length_threshold then (if !debug_image_processing then Printf.printf "length threshold not met\n%!"; None) else
begin begin
Pdfcodec.decode_pdfstream_until_unknown pdf s; Pdfcodec.decode_pdfstream_until_unknown pdf s;
match Pdf.lookup_direct pdf "/Filter" (fst !reference) with Some _ -> None | None -> match Pdf.lookup_direct pdf "/Filter" (fst !reference) with Some _ -> None | None ->
@ -586,13 +586,17 @@ let lossless_to_jpeg pdf ~pixel_threshold ~length_threshold ~percentage_threshol
let newsize = in_channel_length result in let newsize = in_channel_length result in
if newsize < size then if newsize < size then
begin begin
if !debug_image_processing then Printf.printf "Lossless to JPEG %i -> %i (%i%%)\n%!" size newsize (int_of_float (float newsize /. float size *. 100.)); if !debug_image_processing then Printf.printf "lossless to JPEG %i -> %i (%i%%)\n%!" size newsize (int_of_float (float newsize /. float size *. 100.));
reference := reference :=
(Pdf.add_dict_entry (Pdf.add_dict_entry
(Pdf.add_dict_entry dict "/Length" (Pdf.Integer newsize)) (Pdf.add_dict_entry dict "/Length" (Pdf.Integer newsize))
"/Filter" "/Filter"
(Pdf.Name "/DCTDecode")), (Pdf.Name "/DCTDecode")),
Pdf.Got (Pdfio.bytes_of_input_channel result) Pdf.Got (Pdfio.bytes_of_input_channel result)
end
else
begin
if !debug_image_processing then Printf.printf "no size reduction\n%!"
end; end;
close_in result close_in result
end; end;
@ -617,13 +621,17 @@ let lossless_resample pdf ~pixel_threshold ~length_threshold ~percentage_thresho
let newsize = in_channel_length result in let newsize = in_channel_length result in
if newsize < size then if newsize < size then
begin begin
if !debug_image_processing then Printf.printf "Lossless to JPEG %i -> %i (%i%%)\n%!" size newsize (int_of_float (float newsize /. float size *. 100.)); if !debug_image_processing then Printf.printf "lossless resample %i -> %i (%i%%)\n%!" size newsize (int_of_float (float newsize /. float size *. 100.));
reference := reference :=
(Pdf.add_dict_entry (Pdf.add_dict_entry
(Pdf.add_dict_entry dict "/Length" (Pdf.Integer newsize)) (Pdf.add_dict_entry dict "/Length" (Pdf.Integer newsize))
"/Filter" "/Filter"
(Pdf.Name "/DCTDecode")), (Pdf.Name "/DCTDecode")),
Pdf.Got (Pdfio.bytes_of_input_channel result) Pdf.Got (Pdfio.bytes_of_input_channel result)
end
else
begin
if !debug_image_processing then Printf.printf "no size reduction\n%!"
end; end;
close_in result close_in result
end; end;
@ -633,11 +641,11 @@ let lossless_resample pdf ~pixel_threshold ~length_threshold ~percentage_thresho
let recompress_1bpp_jbig2_lossless ~pixel_threshold ~length_threshold ~path_to_jbig2enc pdf s dict reference = let recompress_1bpp_jbig2_lossless ~pixel_threshold ~length_threshold ~path_to_jbig2enc pdf s dict reference =
let w = match Pdf.lookup_direct pdf "/Width" dict with Some (Pdf.Integer i) -> i | _ -> error "bad width" in let w = match Pdf.lookup_direct pdf "/Width" dict with Some (Pdf.Integer i) -> i | _ -> error "bad width" in
let h = match Pdf.lookup_direct pdf "/Height" dict with Some (Pdf.Integer i) -> i | _ -> error "bad height" in let h = match Pdf.lookup_direct pdf "/Height" dict with Some (Pdf.Integer i) -> i | _ -> error "bad height" in
if w * h < pixel_threshold then () else (* pixel_threshold (but also, jbig2enc fails on tiny images) *) if w * h < pixel_threshold then (if !debug_image_processing then Printf.printf "pixel threshold not met\n%!") else (* (but also, jbig2enc fails on tiny images) *)
let size = match Pdf.lookup_direct pdf "/Length" dict with Some (Pdf.Integer i) -> i | _ -> 0 in let size = match Pdf.lookup_direct pdf "/Length" dict with Some (Pdf.Integer i) -> i | _ -> 0 in
if size < length_threshold then () else if size < length_threshold then if !debug_image_processing then Printf.printf "length threshold not met\n%!" else
Pdfcodec.decode_pdfstream_until_unknown pdf s; Pdfcodec.decode_pdfstream_until_unknown pdf s;
match Pdf.lookup_direct pdf "/Filter" (fst !reference) with Some _ -> () | None -> match Pdf.lookup_direct pdf "/Filter" (fst !reference) with Some _ -> if !debug_image_processing then Printf.printf "could not decode - skipping\n%!" | None ->
let out = Filename.temp_file "cpdf" "convertin" ^ ".pnm" in let out = Filename.temp_file "cpdf" "convertin" ^ ".pnm" in
let out2 = Filename.temp_file "cpdf" "convertout" ^ ".jbig2" in let out2 = Filename.temp_file "cpdf" "convertout" ^ ".jbig2" in
let fh = open_out_bin out in let fh = open_out_bin out in
@ -662,6 +670,10 @@ let recompress_1bpp_jbig2_lossless ~pixel_threshold ~length_threshold ~path_to_j
"/Filter" "/Filter"
(Pdf.Name "/JBIG2Decode")) "/DecodeParms"), (Pdf.Name "/JBIG2Decode")) "/DecodeParms"),
Pdf.Got (Pdfio.bytes_of_input_channel result) Pdf.Got (Pdfio.bytes_of_input_channel result)
end
else
begin
if !debug_image_processing then Printf.printf "no size reduction\n%!"
end; end;
close_in result close_in result
end; end;
@ -688,25 +700,33 @@ let process
Pdf.lookup_direct pdf "/ImageMask" dict Pdf.lookup_direct pdf "/ImageMask" dict
with with
| Some (Pdf.Name "/Image"), Some (Pdf.Name "/DCTDecode" | Pdf.Array [Pdf.Name "/DCTDecode"]), _, _ -> | Some (Pdf.Name "/Image"), Some (Pdf.Name "/DCTDecode" | Pdf.Array [Pdf.Name "/DCTDecode"]), _, _ ->
if !debug_image_processing then Printf.printf "(%i/%i) Object %i (JPEG)...\n%!" !ndone nobjects objnum;
begin match q with begin match q with
| Some q -> | Some q ->
if q < 100 then jpeg_to_jpeg pdf ~pixel_threshold ~length_threshold ~percentage_threshold ~q ~path_to_convert s dict reference if q < 100 then
begin
if !debug_image_processing then Printf.printf "(%i/%i) Object %i (JPEG)... %!" !ndone nobjects objnum;
jpeg_to_jpeg pdf ~pixel_threshold ~length_threshold ~percentage_threshold ~q ~path_to_convert s dict reference
end
| None -> () | None -> ()
end end
| Some (Pdf.Name "/Image"), _, Some (Pdf.Integer 1), _ | Some (Pdf.Name "/Image"), _, Some (Pdf.Integer 1), _
| Some (Pdf.Name "/Image"), _, _, Some (Pdf.Boolean true) -> | Some (Pdf.Name "/Image"), _, _, Some (Pdf.Boolean true) ->
if !debug_image_processing then Printf.printf "(%i/%i) object %i (1bpp)...\n%!" !ndone nobjects objnum;
begin match onebppmethod with begin match onebppmethod with
| Some "JBIG2" -> | Some "JBIG2" ->
begin
if !debug_image_processing then Printf.printf "(%i/%i) object %i (1bpp)... %!" !ndone nobjects objnum;
recompress_1bpp_jbig2_lossless ~pixel_threshold ~length_threshold ~path_to_jbig2enc pdf s dict reference recompress_1bpp_jbig2_lossless ~pixel_threshold ~length_threshold ~path_to_jbig2enc pdf s dict reference
end
| _ -> () | _ -> ()
end end
| Some (Pdf.Name "/Image"), _, _, _ -> | Some (Pdf.Name "/Image"), _, _, _ ->
if !debug_image_processing then Printf.printf "(%i/%i) object %i (lossless)...\n%!" !ndone nobjects objnum;
begin match qlossless with begin match qlossless with
| Some qlossless -> | Some qlossless ->
if qlossless < 101 then lossless_to_jpeg pdf ~pixel_threshold ~length_threshold ~percentage_threshold ~qlossless ~path_to_convert s dict reference if qlossless < 101 then
begin
if !debug_image_processing then Printf.printf "(%i/%i) object %i (lossless)... %!" !ndone nobjects objnum;
lossless_to_jpeg pdf ~pixel_threshold ~length_threshold ~percentage_threshold ~qlossless ~path_to_convert s dict reference
end
| None -> () | None -> ()
end end
| _ -> () (* not an image *) | _ -> () (* not an image *)