From c7dc1cd4247f7f8e6ac30c7f3e4f65afca0474a6 Mon Sep 17 00:00:00 2001
From: John Whitington
Date: Thu, 7 Dec 2023 14:54:47 +0000
Subject: [PATCH] First successful round-trip of JPEGs

---
 cpdfcommand.ml |  2 +-
 cpdfimage.ml   | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 cpdfimage.mli  |  2 +-
 3 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/cpdfcommand.ml b/cpdfcommand.ml
index 46e14bc..ce7efc3 100644
--- a/cpdfcommand.ml
+++ b/cpdfcommand.ml
@@ -4447,7 +4447,7 @@ let go () =
       write_pdf false (Cpdfchop.chop ~x ~y ~columns:args.impose_columns ~btt:args.impose_btt ~rtl:args.impose_rtl pdf range)
   | Some ProcessImages ->
       let pdf = get_single_pdf args.op false in
-        Cpdfimage.process pdf;
+        Cpdfimage.process pdf ~q:args.jpegquality ~qlossless:args.jpegqualitylossless ~path_to_convert:args.path_to_convert;
         write_pdf false pdf
 
 (* Advise the user if a combination of command line flags makes little sense,
diff --git a/cpdfimage.ml b/cpdfimage.ml
index f6f464a..7f83782 100644
--- a/cpdfimage.ml
+++ b/cpdfimage.ml
@@ -457,4 +457,53 @@ let image_of_input fobj i =
   let pdf, pageroot = Pdfpage.add_pagetree [page] pdf in
     Pdfpage.add_root pageroot [] pdf
 
-let process pdf = ()
+(* For each image xobject stored as a single /DCTDecode (JPEG) stream, run it
+   through the external convert program at quality [q] percent and keep the
+   result only if it is non-empty and smaller than the original data.
+   [qlossless] is accepted but not yet used. Temporary files and channels are
+   released even if an exception is raised part-way through. *)
+let process pdf ~q ~qlossless ~path_to_convert =
+  let remove f = try Sys.remove f with Sys_error _ -> () in
+  let process_obj _ s =
+    match s with
+    | Pdf.Stream ({contents = dict, _} as reference) ->
+        begin match Pdf.lookup_direct pdf "/Subtype" dict, Pdf.lookup_direct pdf "/Filter" dict with
+        | Some (Pdf.Name "/Image"), Some (Pdf.Name "/DCTDecode" | Pdf.Array [Pdf.Name "/DCTDecode"]) ->
+            Pdf.getstream s;
+            (* The suffix is given to temp_file itself: appending ".jpg" to its
+               result would name a second file and leak the one it created. *)
+            let out = Filename.temp_file "cpdf" "convertin.jpg" in
+            let out2 = Filename.temp_file "cpdf" "convertout.jpg" in
+            Fun.protect ~finally:(fun () -> remove out; remove out2) (* 5. Clean up *)
+              (fun () ->
+                 (* 1. Write data to temp file *)
+                 let fh = open_out_bin out in
+                 let size =
+                   Fun.protect ~finally:(fun () -> close_out_noerr fh)
+                     (fun () ->
+                        match s with
+                        | Pdf.Stream {contents = _, Pdf.Got d} -> Pdfio.bytes_to_output_channel fh d; flush fh; bytes_size d
+                        | _ -> 0)
+                 in
+                 (* 2. Process with convert. *)
+                 let command =
+                   Filename.quote_command path_to_convert
+                     [out; "-quality"; string_of_int q ^ "%"; out2]
+                 in
+                 Printf.printf "%S\n" command;
+                 if Sys.command command = 0 then
+                   begin
+                     (* 3. If return code 0, read the output file in *)
+                     let result = open_in_bin out2 in
+                     Fun.protect ~finally:(fun () -> close_in_noerr result)
+                       (fun () ->
+                          let newsize = in_channel_length result in
+                          (* 4. Replace the data, but only if it is non-empty and smaller *)
+                          if newsize > 0 && newsize < size then
+                            reference := (Pdf.add_dict_entry dict "/Length" (Pdf.Integer newsize), Pdf.Got (Pdfio.bytes_of_input_channel result)))
+                   end)
+        | _ -> ()
+        end
+    | _ -> ()
+  in
+    Pdf.objiter process_obj pdf
diff --git a/cpdfimage.mli b/cpdfimage.mli
index b5f79c9..3c0eba4 100644
--- a/cpdfimage.mli
+++ b/cpdfimage.mli
@@ -11,7 +11,7 @@ val image_resolution : Pdf.t -> int list -> float -> (int * string * int * int *
 (** List images in JSON format *)
 val images : Pdf.t -> int list -> Cpdfyojson.Safe.t
 
-val process : Pdf.t -> unit
+val process : Pdf.t -> q:int -> qlossless:int -> path_to_convert:string -> unit
 
 (**/**)
 val image_of_input : (unit -> Pdfio.bytes -> Pdf.pdfobject * (int * Pdf.pdfobject) list) -> Pdfio.input -> Pdf.t