First go at /DeviceGray reprocessing
This commit is contained in:
parent
d96b6aabbb
commit
78dbe17b67
1
Changes
1
Changes
|
@ -13,6 +13,7 @@ o Extract font files from a document
|
||||||
o List images on a page with -list-images[-json]
|
o List images on a page with -list-images[-json]
|
||||||
o Chop pages up into sections with -chop
|
o Chop pages up into sections with -chop
|
||||||
o Build PDF files from JBIG2 streams, including globals
|
o Build PDF files from JBIG2 streams, including globals
|
||||||
|
o Reprocess images within PDFs to further compress them
|
||||||
|
|
||||||
Extended features:
|
Extended features:
|
||||||
|
|
||||||
|
|
94
cpdfimage.ml
94
cpdfimage.ml
|
@ -2,26 +2,42 @@ open Pdfutil
|
||||||
open Pdfio
|
open Pdfio
|
||||||
open Cpdferror
|
open Cpdferror
|
||||||
|
|
||||||
(* Extract Images. *)
|
let pnm_white ch = output_char ch ' '
|
||||||
let pnm_to_channel_24 channel w h s =
|
let pnm_newline ch = output_char ch '\n'
|
||||||
let white () = output_char channel ' '
|
let pnm_output_string = Stdlib.output_string
|
||||||
and newline () = output_char channel '\n'
|
|
||||||
and output_string = Stdlib.output_string channel in
|
let pnm_header ch w h =
|
||||||
output_string "P6";
|
pnm_white ch;
|
||||||
white ();
|
pnm_output_string ch (string_of_int w);
|
||||||
output_string (string_of_int w);
|
pnm_white ch;
|
||||||
white ();
|
pnm_output_string ch (string_of_int h);
|
||||||
output_string (string_of_int h);
|
pnm_white ch
|
||||||
white ();
|
|
||||||
output_string "255";
|
let pnm_to_channel_24 ch w h s =
|
||||||
newline ();
|
pnm_output_string ch "P6";
|
||||||
let pos = ref 0 in
|
pnm_header ch w h;
|
||||||
for y = 1 to h do
|
pnm_output_string ch "255";
|
||||||
for x = 1 to w * 3 do
|
pnm_newline ch;
|
||||||
output_byte channel (bget s !pos);
|
let pos = ref 0 in
|
||||||
incr pos
|
for y = 1 to h do
|
||||||
done
|
for x = 1 to w * 3 do
|
||||||
|
output_byte ch (bget s !pos);
|
||||||
|
incr pos
|
||||||
done
|
done
|
||||||
|
done
|
||||||
|
|
||||||
|
(* Write [s] to channel [ch] as a binary greyscale PNM (PGM, magic number
   "P5"): the magic number, the width [w] and height [h] via [pnm_header],
   the maximum sample value, a newline, and then [w * h] bytes read from [s]
   with [bget], starting at position 0.

   The maximum sample value is 255, matching [pnm_to_channel_24]. Each sample
   is written as one whole byte, and the only caller visible here passes data
   whose /BitsPerComponent is 8. The previous value of 15 declared every
   sample above 15 to be out of range. *)
let pnm_to_channel_8 ch w h s =
  pnm_output_string ch "P5";
  pnm_header ch w h;
  pnm_output_string ch "255";
  pnm_newline ch;
  (* [pos] walks linearly through [s]; rows are stored one after another. *)
  let pos = ref 0 in
    for _y = 1 to h do
      for _x = 1 to w do
        output_byte ch (bget s !pos);
        incr pos
      done
    done
|
||||||
|
|
||||||
let jbig2_serial = ref 0
|
let jbig2_serial = ref 0
|
||||||
|
|
||||||
|
@ -466,6 +482,7 @@ let image_of_input fobj i =
|
||||||
(* For each image xobject, process it through convert to reduce size. *)
|
(* For each image xobject, process it through convert to reduce size. *)
|
||||||
(* FIXME What about predictors? Audit to see if files get smaller. *)
|
(* FIXME What about predictors? Audit to see if files get smaller. *)
|
||||||
(* FIXME if lossy only 5% smaller, ignore? Set this parameter... *)
|
(* FIXME if lossy only 5% smaller, ignore? Set this parameter... *)
|
||||||
|
(* FIXME error handling for Sys.remove, others *)
|
||||||
let process pdf ~q ~qlossless ~path_to_convert =
|
let process pdf ~q ~qlossless ~path_to_convert =
|
||||||
let process_obj _ s =
|
let process_obj _ s =
|
||||||
match s with
|
match s with
|
||||||
|
@ -501,18 +518,20 @@ let process pdf ~q ~qlossless ~path_to_convert =
|
||||||
| Some (Pdf.Name "/Image"), _ ->
|
| Some (Pdf.Name "/Image"), _ ->
|
||||||
(* 0. Test if this is one we can do - for now just Colourspace=RGB, BPC=8 *)
|
(* 0. Test if this is one we can do - for now just Colourspace=RGB or Gray (direct or ICCBased), BPC=8 *)
|
||||||
let bpc = Pdf.lookup_direct pdf "/BitsPerComponent" dict in
|
let bpc = Pdf.lookup_direct pdf "/BitsPerComponent" dict in
|
||||||
let is_rgb =
|
let suitable_num =
|
||||||
match Pdf.lookup_direct pdf "/ColorSpace" dict with
|
match Pdf.lookup_direct pdf "/ColorSpace" dict with
|
||||||
| Some (Pdf.Name "/DeviceRGB") -> true
|
| Some (Pdf.Name "/DeviceRGB") -> 3
|
||||||
|
| Some (Pdf.Name "/DeviceGray") -> 1
|
||||||
| Some (Pdf.Array [Pdf.Name "/ICCBased"; stream]) ->
|
| Some (Pdf.Array [Pdf.Name "/ICCBased"; stream]) ->
|
||||||
begin match Pdf.lookup_direct pdf "/N" stream with
|
begin match Pdf.lookup_direct pdf "/N" stream with
|
||||||
| Some (Pdf.Integer 3) -> true
|
| Some (Pdf.Integer 3) -> 3
|
||||||
| _ -> false
|
| Some (Pdf.Integer 1) -> 1
|
||||||
|
| _ -> 0
|
||||||
end
|
end
|
||||||
| _ -> false
|
| _ -> 0
|
||||||
in
|
in
|
||||||
begin match is_rgb, bpc with
|
begin match suitable_num, bpc with
|
||||||
| true, Some (Pdf.Integer 8) ->
|
| (1 | 3), Some (Pdf.Integer 8) ->
|
||||||
let size = match Pdf.lookup_direct pdf "/Length" dict with Some (Pdf.Integer i) -> i | _ -> 0 in
|
let size = match Pdf.lookup_direct pdf "/Length" dict with Some (Pdf.Integer i) -> i | _ -> 0 in
|
||||||
Pdfcodec.decode_pdfstream_until_unknown pdf s;
|
Pdfcodec.decode_pdfstream_until_unknown pdf s;
|
||||||
begin match Pdf.lookup_direct pdf "/Filter" (fst !reference) with Some _ -> () | None ->
|
begin match Pdf.lookup_direct pdf "/Filter" (fst !reference) with Some _ -> () | None ->
|
||||||
|
@ -522,12 +541,14 @@ let process pdf ~q ~qlossless ~path_to_convert =
|
||||||
let w = match Pdf.lookup_direct pdf "/Width" dict with Some (Pdf.Integer i) -> i | _ -> error "bad width" in
|
let w = match Pdf.lookup_direct pdf "/Width" dict with Some (Pdf.Integer i) -> i | _ -> error "bad width" in
|
||||||
let h = match Pdf.lookup_direct pdf "/Height" dict with Some (Pdf.Integer i) -> i | _ -> error "bad height" in
|
let h = match Pdf.lookup_direct pdf "/Height" dict with Some (Pdf.Integer i) -> i | _ -> error "bad height" in
|
||||||
let data = match s with Pdf.Stream {contents = _, Pdf.Got d} -> d | _ -> assert false in
|
let data = match s with Pdf.Stream {contents = _, Pdf.Got d} -> d | _ -> assert false in
|
||||||
pnm_to_channel_24 fh w h data;
|
(if suitable_num = 3 then pnm_to_channel_24 else pnm_to_channel_8) fh w h data;
|
||||||
close_out fh;
|
close_out fh;
|
||||||
let retcode =
|
let retcode =
|
||||||
let command =
|
let command =
|
||||||
(Filename.quote_command path_to_convert
|
(Filename.quote_command path_to_convert
|
||||||
[out; "-quality"; string_of_int qlossless ^ "%"; out2])
|
([out; "-quality"; string_of_int qlossless ^ "%"] @
|
||||||
|
(if suitable_num = 1 then ["-colorspace"; "Gray"] else []) @
|
||||||
|
[out2]))
|
||||||
in
|
in
|
||||||
(*Printf.printf "%S\n" command;*)
|
(*Printf.printf "%S\n" command;*)
|
||||||
Sys.command command
|
Sys.command command
|
||||||
|
@ -537,13 +558,16 @@ let process pdf ~q ~qlossless ~path_to_convert =
|
||||||
let result = open_in_bin out2 in
|
let result = open_in_bin out2 in
|
||||||
let newsize = in_channel_length result in
|
let newsize = in_channel_length result in
|
||||||
if newsize < size then
|
if newsize < size then
|
||||||
Printf.printf "Lossless to JPEG %i -> %i\n" size newsize;
|
begin
|
||||||
reference :=
|
Printf.printf "Lossless to JPEG %i -> %i (components %i) \n" size newsize suitable_num;
|
||||||
(Pdf.add_dict_entry
|
reference :=
|
||||||
(Pdf.add_dict_entry dict "/Length" (Pdf.Integer newsize))
|
(Pdf.add_dict_entry
|
||||||
"/Filter"
|
(Pdf.add_dict_entry dict "/Length" (Pdf.Integer newsize))
|
||||||
(Pdf.Name "/DCTDecode")),
|
"/Filter"
|
||||||
Pdf.Got (Pdfio.bytes_of_input_channel result)
|
(Pdf.Name "/DCTDecode")),
|
||||||
|
Pdf.Got (Pdfio.bytes_of_input_channel result)
|
||||||
|
end;
|
||||||
|
close_in result
|
||||||
end;
|
end;
|
||||||
Sys.remove out;
|
Sys.remove out;
|
||||||
Sys.remove out2
|
Sys.remove out2
|
||||||
|
|
Loading…
Reference in New Issue