From 78dbe17b6759f47134d5f6b322c197f108eaa0df Mon Sep 17 00:00:00 2001 From: John Whitington Date: Mon, 18 Dec 2023 22:39:33 +0000 Subject: [PATCH] First go at /DeviceGray reprocessing --- Changes | 1 + cpdfimage.ml | 94 +++++++++++++++++++++++++++++++++------------------- 2 files changed, 60 insertions(+), 35 deletions(-) diff --git a/Changes b/Changes index dea0e12..a816759 100644 --- a/Changes +++ b/Changes @@ -13,6 +13,7 @@ o Extract font files from a document o List images on a page with -list-images[-json] o Chop pages up into sections with -chop o Build PDF files from JBIG2 streams, including globals +o Reprocess images within PDFs to further compress them Extended features: diff --git a/cpdfimage.ml b/cpdfimage.ml index cd12ed3..5021a37 100644 --- a/cpdfimage.ml +++ b/cpdfimage.ml @@ -2,26 +2,42 @@ open Pdfutil open Pdfio open Cpdferror -(* Extract Images. *) -let pnm_to_channel_24 channel w h s = - let white () = output_char channel ' ' - and newline () = output_char channel '\n' - and output_string = Stdlib.output_string channel in - output_string "P6"; - white (); - output_string (string_of_int w); - white (); - output_string (string_of_int h); - white (); - output_string "255"; - newline (); - let pos = ref 0 in - for y = 1 to h do - for x = 1 to w * 3 do - output_byte channel (bget s !pos); - incr pos - done +let pnm_white ch = output_char ch ' ' +let pnm_newline ch = output_char ch '\n' +let pnm_output_string = Stdlib.output_string + +let pnm_header ch w h = + pnm_white ch; + pnm_output_string ch (string_of_int w); + pnm_white ch; + pnm_output_string ch (string_of_int h); + pnm_white ch + +let pnm_to_channel_24 ch w h s = + pnm_output_string ch "P6"; + pnm_header ch w h; + pnm_output_string ch "255"; + pnm_newline ch; + let pos = ref 0 in + for y = 1 to h do + for x = 1 to w * 3 do + output_byte ch (bget s !pos); + incr pos done + done + +let pnm_to_channel_8 ch w h s = + pnm_output_string ch "P5"; + pnm_header ch w h; + pnm_output_string ch "15"; + pnm_newline ch; + let pos = ref 0 in + for y = 1 to h do + for x = 1 to w do + output_byte ch (bget s !pos); + incr pos + done + done let jbig2_serial = ref 0 @@ -466,6 +482,7 @@ let image_of_input fobj i = (* For each image xobject, process it through convert to reduce size. *) (* FIXME What about predictors? Audit to see if files get smaller. *) (* FIXME if lossy only 5% smaller, ignore? Set this parameter... *) +(* FIXME error handling for Sys.remove, others *) let process pdf ~q ~qlossless ~path_to_convert = let process_obj _ s = match s with @@ -501,18 +518,20 @@ let process pdf ~q ~qlossless ~path_to_convert = | Some (Pdf.Name "/Image"), _ -> (* 0. Test if this is one we can do - for now just Colourspace=RGB, BPC=8 *) let bpc = Pdf.lookup_direct pdf "/BitsPerComponent" dict in - let is_rgb = + let suitable_num = match Pdf.lookup_direct pdf "/ColorSpace" dict with - | Some (Pdf.Name "/DeviceRGB") -> true + | Some (Pdf.Name "/DeviceRGB") -> 3 + | Some (Pdf.Name "/DeviceGray") -> 1 | Some (Pdf.Array [Pdf.Name "/ICCBased"; stream]) -> begin match Pdf.lookup_direct pdf "/N" stream with - | Some (Pdf.Integer 3) -> true - | _ -> false + | Some (Pdf.Integer 3) -> 3 + | Some (Pdf.Integer 1) -> 1 + | _ -> 0 end - | _ -> false + | _ -> 0 in - begin match is_rgb, bpc with - | true, Some (Pdf.Integer 8) -> + begin match suitable_num, bpc with + | (1 | 3), Some (Pdf.Integer 8) -> let size = match Pdf.lookup_direct pdf "/Length" dict with Some (Pdf.Integer i) -> i | _ -> 0 in Pdfcodec.decode_pdfstream_until_unknown pdf s; begin match Pdf.lookup_direct pdf "/Filter" (fst !reference) with Some _ -> () | None -> @@ -522,12 +541,14 @@ let process pdf ~q ~qlossless ~path_to_convert = let w = match Pdf.lookup_direct pdf "/Width" dict with Some (Pdf.Integer i) -> i | _ -> error "bad width" in let h = match Pdf.lookup_direct pdf "/Height" dict with Some (Pdf.Integer i) -> i | _ -> error "bad height" in let data = match s with Pdf.Stream {contents = _, Pdf.Got d} -> d | _ -> assert false in - pnm_to_channel_24 fh w h data; + (if suitable_num = 3 then pnm_to_channel_24 else pnm_to_channel_8) fh w h data; close_out fh; let retcode = let command = (Filename.quote_command path_to_convert - [out; "-quality"; string_of_int qlossless ^ "%"; out2]) + ([out; "-quality"; string_of_int qlossless ^ "%"] @ + (if suitable_num = 1 then ["-colorspace"; "Gray"] else []) @ + [out2])) in (*Printf.printf "%S\n" command;*) Sys.command command @@ -537,13 +558,16 @@ let process pdf ~q ~qlossless ~path_to_convert = let result = open_in_bin out2 in let newsize = in_channel_length result in if newsize < size then - Printf.printf "Lossless to JPEG %i -> %i\n" size newsize; - reference := - (Pdf.add_dict_entry - (Pdf.add_dict_entry dict "/Length" (Pdf.Integer newsize)) - "/Filter" - (Pdf.Name "/DCTDecode")), - Pdf.Got (Pdfio.bytes_of_input_channel result) + begin + Printf.printf "Lossless to JPEG %i -> %i (components %i) \n" size newsize suitable_num; + reference := + (Pdf.add_dict_entry + (Pdf.add_dict_entry dict "/Length" (Pdf.Integer newsize)) + "/Filter" + (Pdf.Name "/DCTDecode")), + Pdf.Got (Pdfio.bytes_of_input_channel result) + end; + close_in result end; Sys.remove out; Sys.remove out2