Detection for ICCBased colour spaces

This commit is contained in:
John Whitington 2023-12-18 20:50:52 +00:00
parent 16278b03f6
commit d96b6aabbb
1 changed file with 21 additions and 18 deletions

View File

@ -500,13 +500,22 @@ let process pdf ~q ~qlossless ~path_to_convert =
Sys.remove out2 Sys.remove out2
| Some (Pdf.Name "/Image"), _ -> | Some (Pdf.Name "/Image"), _ ->
(* 0. Test if this is one we can do - for now just Colourspace=RGB, BPC=8 *) (* 0. Test if this is one we can do - for now just Colourspace=RGB, BPC=8 *)
begin match Pdf.lookup_direct pdf "/ColorSpace" dict, Pdf.lookup_direct pdf "/BitsPerComponent" dict with let bpc = Pdf.lookup_direct pdf "/BitsPerComponent" dict in
| Some (Pdf.Name "/DeviceRGB"), Some (Pdf.Integer 8) -> let is_rgb =
match Pdf.lookup_direct pdf "/ColorSpace" dict with
| Some (Pdf.Name "/DeviceRGB") -> true
| Some (Pdf.Array [Pdf.Name "/ICCBased"; stream]) ->
begin match Pdf.lookup_direct pdf "/N" stream with
| Some (Pdf.Integer 3) -> true
| _ -> false
end
| _ -> false
in
begin match is_rgb, bpc with
| true, Some (Pdf.Integer 8) ->
let size = match Pdf.lookup_direct pdf "/Length" dict with Some (Pdf.Integer i) -> i | _ -> 0 in let size = match Pdf.lookup_direct pdf "/Length" dict with Some (Pdf.Integer i) -> i | _ -> 0 in
(* 1. Decompress it - check we succeeded, bail if not *)
Pdfcodec.decode_pdfstream_until_unknown pdf s; Pdfcodec.decode_pdfstream_until_unknown pdf s;
begin match Pdf.lookup_direct pdf "/Filter" (fst !reference) with Some _ -> () | None -> begin match Pdf.lookup_direct pdf "/Filter" (fst !reference) with Some _ -> () | None ->
(* 1. Output to pnm *)
let out = Filename.temp_file "cpdf" "convertin" ^ ".pnm" in let out = Filename.temp_file "cpdf" "convertin" ^ ".pnm" in
let out2 = Filename.temp_file "cpdf" "convertout" ^ ".jpg" in let out2 = Filename.temp_file "cpdf" "convertout" ^ ".jpg" in
let fh = open_out_bin out in let fh = open_out_bin out in
@ -515,7 +524,6 @@ let process pdf ~q ~qlossless ~path_to_convert =
let data = match s with Pdf.Stream {contents = _, Pdf.Got d} -> d | _ -> assert false in let data = match s with Pdf.Stream {contents = _, Pdf.Got d} -> d | _ -> assert false in
pnm_to_channel_24 fh w h data; pnm_to_channel_24 fh w h data;
close_out fh; close_out fh;
(* 2. Convert to JPEG with convert *)
let retcode = let retcode =
let command = let command =
(Filename.quote_command path_to_convert (Filename.quote_command path_to_convert
@ -524,35 +532,30 @@ let process pdf ~q ~qlossless ~path_to_convert =
(*Printf.printf "%S\n" command;*) (*Printf.printf "%S\n" command;*)
Sys.command command Sys.command command
in in
(* 3. Check smaller, Read file, and build new dictionary - removing ColorSpace, BitsPerComponent replacing Filter *)
if retcode = 0 then if retcode = 0 then
begin begin
let result = open_in_bin out2 in let result = open_in_bin out2 in
let newsize = in_channel_length result in let newsize = in_channel_length result in
Printf.printf "Lossless to JPEG %i -> %i\n" size newsize;
if newsize < size then if newsize < size then
Printf.printf "Lossless to JPEG %i -> %i\n" size newsize;
reference := reference :=
(Pdf.remove_dict_entry
(Pdf.remove_dict_entry
(Pdf.add_dict_entry (Pdf.add_dict_entry
(Pdf.add_dict_entry dict "/Length" (Pdf.Integer newsize)) (Pdf.add_dict_entry dict "/Length" (Pdf.Integer newsize))
"/Filter" "/Filter"
(Pdf.Name "/DCTDecode")) (Pdf.Name "/DCTDecode")),
"/ColorSpace")
"/BitsPerComponent"),
Pdf.Got (Pdfio.bytes_of_input_channel result) Pdf.Got (Pdfio.bytes_of_input_channel result)
end; end;
(* 4. Clean up. *)
Sys.remove out; Sys.remove out;
Sys.remove out2 Sys.remove out2
end end
| colspace, bpc -> | colspace, bpc ->
(*let colspace, bpc, filter = let colspace = Pdf.lookup_direct pdf "/ColorSpace" dict in
let colspace, bpc, filter =
(match colspace with None -> "none" | Some x -> Pdfwrite.string_of_pdf x), (match colspace with None -> "none" | Some x -> Pdfwrite.string_of_pdf x),
(match bpc with None -> "none" | Some x -> Pdfwrite.string_of_pdf x), (match bpc with None -> "none" | Some x -> Pdfwrite.string_of_pdf x),
(match Pdf.lookup_direct pdf "/Filter" dict with None -> "none" | Some x -> Pdfwrite.string_of_pdf x) (match Pdf.lookup_direct pdf "/Filter" dict with None -> "none" | Some x -> Pdfwrite.string_of_pdf x)
in in
print_string (Printf.sprintf "%s (%s) [%s]\n" colspace bpc filter);*) print_string (Printf.sprintf "%s (%s) [%s]\n" colspace bpc filter);
() (* an image we cannot or do not handle *) () (* an image we cannot or do not handle *)
end end
| _ -> () (* not an image *) | _ -> () (* not an image *)