Detecting 1bpp images for processing

This commit is contained in:
John Whitington 2023-12-22 16:12:19 +00:00
parent d4b571d089
commit 2955a30c80
2 changed files with 27 additions and 10 deletions

View File

@ -483,7 +483,6 @@ let image_of_input fobj i =
Pdfpage.add_root pageroot [] pdf Pdfpage.add_root pageroot [] pdf
(* NOTE: ./cpdf -convert convert -recrypt -process-images -lossless-to-jpeg 65 ~/repos/pdfs/PDFTests/main128fail.pdf -o out.pdf *) (* NOTE: ./cpdf -convert convert -recrypt -process-images -lossless-to-jpeg 65 ~/repos/pdfs/PDFTests/main128fail.pdf -o out.pdf *)
(* FIXME Make sure this process is ok for masks too - do we get them, is it allowed etc. *)
(* FIXME Only do if quality < 100 *) (* FIXME Only do if quality < 100 *)
(* FIXME Error when path_to_convert not defined *) (* FIXME Error when path_to_convert not defined *)
(* FIXME Need the "is it smaller" check from Pdfcodec.encode here too? *) (* FIXME Need the "is it smaller" check from Pdfcodec.encode here too? *)
@ -581,24 +580,42 @@ let lossless_to_jpeg pdf ~qlossless ~path_to_convert s dict reference =
Sys.remove out2 Sys.remove out2
end end
| colspace, bpc -> | colspace, bpc ->
(*let colspace = Pdf.lookup_direct pdf "/ColorSpace" dict in let colspace = Pdf.lookup_direct pdf "/ColorSpace" dict in
let colspace, bpc, filter = let colspace, bpc, filter =
(match colspace with None -> "none" | Some x -> Pdfwrite.string_of_pdf x), (match colspace with None -> "none" | Some x -> Pdfwrite.string_of_pdf x),
(match bpc with None -> "none" | Some x -> Pdfwrite.string_of_pdf x), (match bpc with None -> "none" | Some x -> Pdfwrite.string_of_pdf x),
(match Pdf.lookup_direct pdf "/Filter" dict with None -> "none" | Some x -> Pdfwrite.string_of_pdf x) (match Pdf.lookup_direct pdf "/Filter" dict with None -> "none" | Some x -> Pdfwrite.string_of_pdf x)
in in
print_string (Printf.sprintf "%s (%s) [%s]\n" colspace bpc filter);*) print_string (Pdfwrite.string_of_pdf dict);
print_string (Printf.sprintf "%s (%s) [%s]\n" colspace bpc filter);
() (* an image we cannot or do not handle *) () (* an image we cannot or do not handle *)
let process pdf ~q ~qlossless ~path_to_convert = (* JPEG to JPEG: RGB and CMYK JPEGS *)
(* Lossless to JPEG: 8bpp Grey, 8bpp RGB, 8bpp CMYK including separation and ICCBased colourspaces *)
(* 1 bit: anything to CCITT; anything to JBIG2 lossless (no globals yet) *)
let process ?q ?qlossless ?jbig2 pdf ~path_to_convert =
let process_obj _ s = let process_obj _ s =
match s with match s with
| Pdf.Stream ({contents = dict, _} as reference) -> | Pdf.Stream ({contents = dict, _} as reference) ->
begin match Pdf.lookup_direct pdf "/Subtype" dict, Pdf.lookup_direct pdf "/Filter" dict with begin match
| Some (Pdf.Name "/Image"), Some (Pdf.Name "/DCTDecode" | Pdf.Array [Pdf.Name "/DCTDecode"]) -> Pdf.lookup_direct pdf "/Subtype" dict,
jpeg_to_jpeg pdf ~q ~path_to_convert s dict reference Pdf.lookup_direct pdf "/Filter" dict,
| Some (Pdf.Name "/Image"), _ -> Pdf.lookup_direct pdf "/BitsPerComponent" dict,
lossless_to_jpeg pdf ~qlossless ~path_to_convert s dict reference Pdf.lookup_direct pdf "/ImageMask" dict
with
| Some (Pdf.Name "/Image"), Some (Pdf.Name "/DCTDecode" | Pdf.Array [Pdf.Name "/DCTDecode"]), _, _ ->
begin match q with
| Some q -> jpeg_to_jpeg pdf ~q ~path_to_convert s dict reference
| None -> ()
end
| Some (Pdf.Name "/Image"), _, Some (Pdf.Integer 1), _
| Some (Pdf.Name "/Image"), _, _, Some (Pdf.Boolean true) ->
Printf.printf "1bpp\n"
| Some (Pdf.Name "/Image"), _, _, _ ->
begin match qlossless with
| Some qlossless -> lossless_to_jpeg pdf ~qlossless ~path_to_convert s dict reference
| None -> ()
end
| _ -> () (* not an image *) | _ -> () (* not an image *)
end end
| _ -> () (* not a stream *) | _ -> () (* not a stream *)

View File

@ -11,7 +11,7 @@ val image_resolution : Pdf.t -> int list -> float -> (int * string * int * int *
(** List images in JSON format *) (** List images in JSON format *)
val images : Pdf.t -> int list -> Cpdfyojson.Safe.t val images : Pdf.t -> int list -> Cpdfyojson.Safe.t
val process : Pdf.t -> q:int -> qlossless:int -> path_to_convert:string -> unit val process : ?q:int -> ?qlossless:int -> ?jbig2:int -> Pdf.t -> path_to_convert:string -> unit
(**/**) (**/**)
val image_of_input : (unit -> Pdfio.bytes -> Pdf.pdfobject * (int * Pdf.pdfobject) list) -> Pdfio.input -> Pdf.t val image_of_input : (unit -> Pdfio.bytes -> Pdf.pdfobject * (int * Pdf.pdfobject) list) -> Pdfio.input -> Pdf.t