First go at /DeviceGray reprocessing

This commit is contained in:
John Whitington 2023-12-18 22:39:33 +00:00
parent d96b6aabbb
commit 78dbe17b67
2 changed files with 60 additions and 35 deletions

View File

@ -13,6 +13,7 @@ o Extract font files from a document
o List images on a page with -list-images[-json] o List images on a page with -list-images[-json]
o Chop pages up into sections with -chop o Chop pages up into sections with -chop
o Build PDF files from JBIG2 streams, including globals o Build PDF files from JBIG2 streams, including globals
o Reprocess images within PDFs to further compress them
Extended features: Extended features:

View File

@ -2,26 +2,42 @@ open Pdfutil
open Pdfio open Pdfio
open Cpdferror open Cpdferror
(* Extract Images. *) let pnm_white ch = output_char ch ' '
let pnm_to_channel_24 channel w h s = let pnm_newline ch = output_char ch '\n'
let white () = output_char channel ' ' let pnm_output_string = Stdlib.output_string
and newline () = output_char channel '\n'
and output_string = Stdlib.output_string channel in let pnm_header ch w h =
output_string "P6"; pnm_white ch;
white (); pnm_output_string ch (string_of_int w);
output_string (string_of_int w); pnm_white ch;
white (); pnm_output_string ch (string_of_int h);
output_string (string_of_int h); pnm_white ch
white ();
output_string "255"; let pnm_to_channel_24 ch w h s =
newline (); pnm_output_string ch "P6";
let pos = ref 0 in pnm_header ch w h;
for y = 1 to h do pnm_output_string ch "255";
for x = 1 to w * 3 do pnm_newline ch;
output_byte channel (bget s !pos); let pos = ref 0 in
incr pos for y = 1 to h do
done for x = 1 to w * 3 do
output_byte ch (bget s !pos);
incr pos
done done
done
(* Write the samples in [s] to channel [ch] as a binary PGM ("P5") image of
   [w] by [h] pixels, emitting exactly one byte per sample.

   The only caller visible here selects this writer for single-component
   images after matching /BitsPerComponent = 8, so samples span 0..255 and
   the header must declare a maxval of 255. A maxval of 15 would put every
   sample above 15 out of range for whatever reads the file. *)
let pnm_to_channel_8 ch w h s =
  pnm_output_string ch "P5";
  pnm_header ch w h;
  pnm_output_string ch "255";
  pnm_newline ch;
  (* [pos] walks the buffer linearly; w * h bytes are written in total. *)
  let pos = ref 0 in
  for _y = 1 to h do
    for _x = 1 to w do
      output_byte ch (bget s !pos);
      incr pos
    done
  done
let jbig2_serial = ref 0 let jbig2_serial = ref 0
@ -466,6 +482,7 @@ let image_of_input fobj i =
(* For each image xobject, process it through convert to reduce size. *) (* For each image xobject, process it through convert to reduce size. *)
(* FIXME What about predictors? Audit to see if files get smaller. *) (* FIXME What about predictors? Audit to see if files get smaller. *)
(* FIXME if lossy only 5% smaller, ignore? Set this parameter... *) (* FIXME if lossy only 5% smaller, ignore? Set this parameter... *)
(* FIXME error handling for Sys.remove, others *)
let process pdf ~q ~qlossless ~path_to_convert = let process pdf ~q ~qlossless ~path_to_convert =
let process_obj _ s = let process_obj _ s =
match s with match s with
@ -501,18 +518,20 @@ let process pdf ~q ~qlossless ~path_to_convert =
| Some (Pdf.Name "/Image"), _ -> | Some (Pdf.Name "/Image"), _ ->
(* 0. Test if this is one we can do - for now just Colourspace=RGB, BPC=8 *) (* 0. Test if this is one we can do - for now just Colourspace=RGB, BPC=8 *)
let bpc = Pdf.lookup_direct pdf "/BitsPerComponent" dict in let bpc = Pdf.lookup_direct pdf "/BitsPerComponent" dict in
let is_rgb = let suitable_num =
match Pdf.lookup_direct pdf "/ColorSpace" dict with match Pdf.lookup_direct pdf "/ColorSpace" dict with
| Some (Pdf.Name "/DeviceRGB") -> true | Some (Pdf.Name "/DeviceRGB") -> 3
| Some (Pdf.Name "/DeviceGray") -> 1
| Some (Pdf.Array [Pdf.Name "/ICCBased"; stream]) -> | Some (Pdf.Array [Pdf.Name "/ICCBased"; stream]) ->
begin match Pdf.lookup_direct pdf "/N" stream with begin match Pdf.lookup_direct pdf "/N" stream with
| Some (Pdf.Integer 3) -> true | Some (Pdf.Integer 3) -> 3
| _ -> false | Some (Pdf.Integer 1) -> 1
| _ -> 0
end end
| _ -> false | _ -> 0
in in
begin match is_rgb, bpc with begin match suitable_num, bpc with
| true, Some (Pdf.Integer 8) -> | (1 | 3), Some (Pdf.Integer 8) ->
let size = match Pdf.lookup_direct pdf "/Length" dict with Some (Pdf.Integer i) -> i | _ -> 0 in let size = match Pdf.lookup_direct pdf "/Length" dict with Some (Pdf.Integer i) -> i | _ -> 0 in
Pdfcodec.decode_pdfstream_until_unknown pdf s; Pdfcodec.decode_pdfstream_until_unknown pdf s;
begin match Pdf.lookup_direct pdf "/Filter" (fst !reference) with Some _ -> () | None -> begin match Pdf.lookup_direct pdf "/Filter" (fst !reference) with Some _ -> () | None ->
@ -522,12 +541,14 @@ let process pdf ~q ~qlossless ~path_to_convert =
let w = match Pdf.lookup_direct pdf "/Width" dict with Some (Pdf.Integer i) -> i | _ -> error "bad width" in let w = match Pdf.lookup_direct pdf "/Width" dict with Some (Pdf.Integer i) -> i | _ -> error "bad width" in
let h = match Pdf.lookup_direct pdf "/Height" dict with Some (Pdf.Integer i) -> i | _ -> error "bad height" in let h = match Pdf.lookup_direct pdf "/Height" dict with Some (Pdf.Integer i) -> i | _ -> error "bad height" in
let data = match s with Pdf.Stream {contents = _, Pdf.Got d} -> d | _ -> assert false in let data = match s with Pdf.Stream {contents = _, Pdf.Got d} -> d | _ -> assert false in
pnm_to_channel_24 fh w h data; (if suitable_num = 3 then pnm_to_channel_24 else pnm_to_channel_8) fh w h data;
close_out fh; close_out fh;
let retcode = let retcode =
let command = let command =
(Filename.quote_command path_to_convert (Filename.quote_command path_to_convert
[out; "-quality"; string_of_int qlossless ^ "%"; out2]) ([out; "-quality"; string_of_int qlossless ^ "%"] @
(if suitable_num = 1 then ["-colorspace"; "Gray"] else []) @
[out2]))
in in
(*Printf.printf "%S\n" command;*) (*Printf.printf "%S\n" command;*)
Sys.command command Sys.command command
@ -537,13 +558,16 @@ let process pdf ~q ~qlossless ~path_to_convert =
let result = open_in_bin out2 in let result = open_in_bin out2 in
let newsize = in_channel_length result in let newsize = in_channel_length result in
if newsize < size then if newsize < size then
Printf.printf "Lossless to JPEG %i -> %i\n" size newsize; begin
reference := Printf.printf "Lossless to JPEG %i -> %i (components %i) \n" size newsize suitable_num;
(Pdf.add_dict_entry reference :=
(Pdf.add_dict_entry dict "/Length" (Pdf.Integer newsize)) (Pdf.add_dict_entry
"/Filter" (Pdf.add_dict_entry dict "/Length" (Pdf.Integer newsize))
(Pdf.Name "/DCTDecode")), "/Filter"
Pdf.Got (Pdfio.bytes_of_input_channel result) (Pdf.Name "/DCTDecode")),
Pdf.Got (Pdfio.bytes_of_input_channel result)
end;
close_in result
end; end;
Sys.remove out; Sys.remove out;
Sys.remove out2 Sys.remove out2