more
This commit is contained in:
parent
1048bdf1df
commit
8726d3dee0
2
Makefile
2
Makefile
|
@ -2,7 +2,7 @@
|
||||||
MODS = cpdfyojson cpdfxmlm \
|
MODS = cpdfyojson cpdfxmlm \
|
||||||
cpdfunicodedata cpdferror cpdfjson cpdfstrftime cpdfcoord cpdfattach \
|
cpdfunicodedata cpdferror cpdfjson cpdfstrftime cpdfcoord cpdfattach \
|
||||||
cpdfpagespec cpdfposition cpdf cpdfpresent cpdffont cpdftype \
|
cpdfpagespec cpdfposition cpdf cpdfpresent cpdffont cpdftype \
|
||||||
cpdftexttopdf cpdftoc cpdfpad cpdfocg cpdfcommand
|
cpdftexttopdf cpdftoc cpdfpad cpdfocg cpdfsqueeze cpdfcommand
|
||||||
|
|
||||||
SOURCES = $(foreach x,$(MODS),$(x).ml $(x).mli) cpdfcommandrun.ml
|
SOURCES = $(foreach x,$(MODS),$(x).ml $(x).mli) cpdfcommandrun.ml
|
||||||
|
|
||||||
|
|
244
cpdf.ml
244
cpdf.ml
|
@ -49,250 +49,6 @@ let xmp_template =
|
||||||
|
|
||||||
<?xpacket end='r'?>|}
|
<?xpacket end='r'?>|}
|
||||||
|
|
||||||
(* For debugging *)
|
|
||||||
let report_pdf_size pdf =
|
|
||||||
Pdf.remove_unreferenced pdf;
|
|
||||||
Pdfwrite.pdf_to_file_options ~preserve_objstm:false ~generate_objstm:false
|
|
||||||
~compress_objstm:false false None false pdf "temp.pdf";
|
|
||||||
let fh = open_in_bin "temp.pdf" in
|
|
||||||
Printf.printf "Size %i bytes\n" (in_channel_length fh);
|
|
||||||
flush stdout;
|
|
||||||
close_in fh
|
|
||||||
|
|
||||||
|
|
||||||
(* Recompress anything which isn't compressed, unless it's metadata. *)
|
|
||||||
let recompress_stream pdf = function
|
|
||||||
(* If there is no compression, compress with /FlateDecode *)
|
|
||||||
| Pdf.Stream {contents = (dict, _)} as stream ->
|
|
||||||
begin match
|
|
||||||
Pdf.lookup_direct pdf "/Filter" dict,
|
|
||||||
Pdf.lookup_direct pdf "/Type" dict
|
|
||||||
with
|
|
||||||
| _, Some (Pdf.Name "/Metadata") -> ()
|
|
||||||
| (None | Some (Pdf.Array [])), _ ->
|
|
||||||
Pdfcodec.encode_pdfstream pdf Pdfcodec.Flate stream
|
|
||||||
| _ -> ()
|
|
||||||
end
|
|
||||||
| _ -> assert false
|
|
||||||
|
|
||||||
let recompress_pdf pdf =
|
|
||||||
if not (Pdfcrypt.is_encrypted pdf) then
|
|
||||||
Pdf.iter_stream (recompress_stream pdf) pdf;
|
|
||||||
pdf
|
|
||||||
|
|
||||||
let decompress_pdf pdf =
|
|
||||||
if not (Pdfcrypt.is_encrypted pdf) then
|
|
||||||
(Pdf.iter_stream (Pdfcodec.decode_pdfstream_until_unknown pdf) pdf);
|
|
||||||
pdf
|
|
||||||
|
|
||||||
|
|
||||||
(* Equality on PDF objects *)
|
|
||||||
let pdfobjeq pdf x y =
|
|
||||||
let x = Pdf.lookup_obj pdf x
|
|
||||||
and y = Pdf.lookup_obj pdf y in
|
|
||||||
begin match x with Pdf.Stream _ -> Pdf.getstream x | _ -> () end;
|
|
||||||
begin match y with Pdf.Stream _ -> Pdf.getstream y | _ -> () end;
|
|
||||||
compare x y
|
|
||||||
|
|
||||||
let really_squeeze pdf =
|
|
||||||
let objs = ref [] in
|
|
||||||
Pdf.objiter (fun objnum _ -> objs := objnum :: !objs) pdf;
|
|
||||||
let toprocess =
|
|
||||||
keep
|
|
||||||
(fun x -> length x > 1)
|
|
||||||
(collate (pdfobjeq pdf) (sort (pdfobjeq pdf) !objs))
|
|
||||||
in
|
|
||||||
(* Remove any pools of objects which are page objects, since Adobe Reader
|
|
||||||
* gets confused when there are duplicate page objects. *)
|
|
||||||
let toprocess =
|
|
||||||
option_map
|
|
||||||
(function
|
|
||||||
[] -> assert false
|
|
||||||
| h::_ as l ->
|
|
||||||
match Pdf.lookup_direct pdf "/Type" (Pdf.lookup_obj pdf h) with
|
|
||||||
Some (Pdf.Name "/Page") -> None
|
|
||||||
| _ -> Some l)
|
|
||||||
toprocess
|
|
||||||
in
|
|
||||||
let pdfr = ref pdf in
|
|
||||||
let changetable = Hashtbl.create 100 in
|
|
||||||
iter
|
|
||||||
(function [] -> assert false | h::t ->
|
|
||||||
iter (fun e -> Hashtbl.add changetable e h) t)
|
|
||||||
toprocess;
|
|
||||||
(* For an unknown reason, the output file is much smaller if
|
|
||||||
Pdf.renumber is run twice. This is bizarre, since Pdf.renumber is
|
|
||||||
an old, well-understood function in use for years -- what is
|
|
||||||
going on? Furthermore, if we run it 3 times, it gets bigger again! *)
|
|
||||||
pdfr := Pdf.renumber changetable !pdfr;
|
|
||||||
pdfr := Pdf.renumber changetable !pdfr;
|
|
||||||
Pdf.remove_unreferenced !pdfr;
|
|
||||||
pdf.Pdf.root <- !pdfr.Pdf.root;
|
|
||||||
pdf.Pdf.objects <- !pdfr.Pdf.objects;
|
|
||||||
pdf.Pdf.trailerdict <- !pdfr.Pdf.trailerdict
|
|
||||||
|
|
||||||
(* Squeeze the form xobject at objnum.
|
|
||||||
|
|
||||||
FIXME: For old PDFs (< v1.2) any resources from the page (or its ancestors in
|
|
||||||
the page tree!) are also needed - we must merge them with the ones from the
|
|
||||||
xobject itself. However, it is safe for now -- in the unlikely event that the
|
|
||||||
resources actually need to be available, the parse will fail, the squeeze of
|
|
||||||
this object will fail, and we bail out. *)
|
|
||||||
let xobjects_done = ref []
|
|
||||||
|
|
||||||
let squeeze_form_xobject pdf objnum =
|
|
||||||
if mem objnum !xobjects_done then () else
|
|
||||||
xobjects_done := objnum :: !xobjects_done;
|
|
||||||
let obj = Pdf.lookup_obj pdf objnum in
|
|
||||||
match Pdf.lookup_direct pdf "/Subtype" obj with
|
|
||||||
Some (Pdf.Name "/Form") ->
|
|
||||||
let resources =
|
|
||||||
match Pdf.lookup_direct pdf "/Resources" obj with
|
|
||||||
Some d -> d
|
|
||||||
| None -> Pdf.Dictionary []
|
|
||||||
in
|
|
||||||
begin match
|
|
||||||
Pdfops.stream_of_ops
|
|
||||||
(Pdfops.parse_operators pdf resources [Pdf.Indirect objnum])
|
|
||||||
with
|
|
||||||
Pdf.Stream {contents = (_, Pdf.Got data)} ->
|
|
||||||
(* Put replacement data in original stream, and overwrite /Length *)
|
|
||||||
begin match obj with
|
|
||||||
Pdf.Stream ({contents = (d, _)} as str) ->
|
|
||||||
str :=
|
|
||||||
(Pdf.add_dict_entry d "/Length" (Pdf.Integer (bytes_size data)),
|
|
||||||
Pdf.Got data)
|
|
||||||
| _ -> failwith "squeeze_form_xobject"
|
|
||||||
end
|
|
||||||
| _ -> failwith "squeeze_form_xobject"
|
|
||||||
end
|
|
||||||
| _ -> ()
|
|
||||||
|
|
||||||
(* For a list of indirects representing content streams, make sure that none of
|
|
||||||
them are duplicated in the PDF. This indicates sharing, which parsing and
|
|
||||||
rewriting the streams might destroy, thus making the file bigger. FIXME: The
|
|
||||||
correct thing to do is to preserve the multiple content streams. *)
|
|
||||||
let no_duplicates content_stream_numbers stream_numbers =
|
|
||||||
not
|
|
||||||
(mem false
|
|
||||||
(map
|
|
||||||
(fun n -> length (keep (eq n) content_stream_numbers) < 2)
|
|
||||||
stream_numbers))
|
|
||||||
|
|
||||||
(* Give a list of content stream numbers, given a page reference number *)
|
|
||||||
let content_streams_of_page pdf refnum =
|
|
||||||
match Pdf.direct pdf (Pdf.lookup_obj pdf refnum) with
|
|
||||||
Pdf.Dictionary dict ->
|
|
||||||
begin match lookup "/Contents" dict with
|
|
||||||
Some (Pdf.Indirect i) -> [i]
|
|
||||||
| Some (Pdf.Array x) ->
|
|
||||||
option_map (function Pdf.Indirect i -> Some i | _ -> None) x
|
|
||||||
| _ -> []
|
|
||||||
end
|
|
||||||
| _ -> []
|
|
||||||
|
|
||||||
(* For each object in the PDF marked with /Type /Page, for each /Contents
|
|
||||||
indirect reference or array of such, decode and recode that content stream. *)
|
|
||||||
let squeeze_all_content_streams pdf =
|
|
||||||
let page_reference_numbers = Pdf.page_reference_numbers pdf in
|
|
||||||
let all_content_streams_in_doc =
|
|
||||||
flatten (map (content_streams_of_page pdf) page_reference_numbers)
|
|
||||||
in
|
|
||||||
xobjects_done := [];
|
|
||||||
Pdf.objiter
|
|
||||||
(fun objnum _ ->
|
|
||||||
match Pdf.lookup_obj pdf objnum with
|
|
||||||
Pdf.Dictionary dict as d
|
|
||||||
when
|
|
||||||
Pdf.lookup_direct pdf "/Type" d = Some (Pdf.Name "/Page")
|
|
||||||
->
|
|
||||||
let resources =
|
|
||||||
match Pdf.lookup_direct pdf "/Resources" d with
|
|
||||||
Some d -> d
|
|
||||||
| None -> Pdf.Dictionary []
|
|
||||||
in
|
|
||||||
begin try
|
|
||||||
let content_streams =
|
|
||||||
match lookup "/Contents" dict with
|
|
||||||
Some (Pdf.Indirect i) ->
|
|
||||||
begin match Pdf.direct pdf (Pdf.Indirect i) with
|
|
||||||
Pdf.Array x -> x
|
|
||||||
| _ -> [Pdf.Indirect i]
|
|
||||||
end
|
|
||||||
| Some (Pdf.Array x) -> x
|
|
||||||
| _ -> raise Not_found
|
|
||||||
in
|
|
||||||
if
|
|
||||||
no_duplicates
|
|
||||||
all_content_streams_in_doc
|
|
||||||
(map (function Pdf.Indirect i -> i | _ -> assert false) content_streams)
|
|
||||||
then
|
|
||||||
let newstream =
|
|
||||||
Pdfops.stream_of_ops
|
|
||||||
(Pdfops.parse_operators pdf resources content_streams)
|
|
||||||
in
|
|
||||||
let newdict =
|
|
||||||
Pdf.add_dict_entry
|
|
||||||
d "/Contents" (Pdf.Indirect (Pdf.addobj pdf newstream))
|
|
||||||
in
|
|
||||||
Pdf.addobj_given_num pdf (objnum, newdict);
|
|
||||||
(* Now process all xobjects related to this page *)
|
|
||||||
begin match Pdf.lookup_direct pdf "/XObject" resources with
|
|
||||||
Some (Pdf.Dictionary xobjs) ->
|
|
||||||
iter
|
|
||||||
(function
|
|
||||||
(_, Pdf.Indirect i) -> squeeze_form_xobject pdf i
|
|
||||||
| _ -> failwith "squeeze_xobject")
|
|
||||||
xobjs
|
|
||||||
| _ -> ()
|
|
||||||
end
|
|
||||||
with
|
|
||||||
(* No /Contents, which is ok. Or a parsing failure due to
|
|
||||||
uninherited resources. FIXME: Add support for inherited
|
|
||||||
resources. *)
|
|
||||||
Not_found -> ()
|
|
||||||
end
|
|
||||||
| _ -> ())
|
|
||||||
pdf
|
|
||||||
|
|
||||||
(* We run squeeze enough times for the number of objects to not change *)
|
|
||||||
let squeeze ?logto ?(pagedata=true) ?(recompress=true) pdf =
|
|
||||||
let log x =
|
|
||||||
match logto with
|
|
||||||
None -> print_string x; flush stdout
|
|
||||||
| Some "nolog" -> ()
|
|
||||||
| Some s ->
|
|
||||||
let fh = open_out_gen [Open_wronly; Open_creat] 0o666 s in
|
|
||||||
seek_out fh (out_channel_length fh);
|
|
||||||
output_string fh x;
|
|
||||||
close_out fh
|
|
||||||
in
|
|
||||||
try
|
|
||||||
let n = ref (Pdf.objcard pdf) in
|
|
||||||
log (Printf.sprintf "Beginning squeeze: %i objects\n" (Pdf.objcard pdf));
|
|
||||||
while !n > (ignore (really_squeeze pdf); Pdf.objcard pdf) do
|
|
||||||
n := Pdf.objcard pdf;
|
|
||||||
log (Printf.sprintf "Squeezing... Down to %i objects\n" (Pdf.objcard pdf));
|
|
||||||
done;
|
|
||||||
if pagedata then
|
|
||||||
begin
|
|
||||||
log (Printf.sprintf "Squeezing page data and xobjects\n");
|
|
||||||
squeeze_all_content_streams pdf;
|
|
||||||
end;
|
|
||||||
if recompress then
|
|
||||||
begin
|
|
||||||
log (Printf.sprintf "Recompressing document\n");
|
|
||||||
Pdfcodec.flate_level := 9;
|
|
||||||
ignore (recompress_pdf pdf)
|
|
||||||
end
|
|
||||||
with
|
|
||||||
e ->
|
|
||||||
raise
|
|
||||||
(Pdf.PDFError
|
|
||||||
(Printf.sprintf
|
|
||||||
"Squeeze failed. No output written.\n Proximate error was:\n %s"
|
|
||||||
(Printexc.to_string e)))
|
|
||||||
|
|
||||||
type encoding =
|
type encoding =
|
||||||
| Raw
|
| Raw
|
||||||
| UTF8
|
| UTF8
|
||||||
|
|
10
cpdf.mli
10
cpdf.mli
|
@ -36,14 +36,6 @@ val iter_pages : (int -> Pdfpage.t -> unit) -> Pdf.t -> int list -> unit
|
||||||
(** Same as [process_pages] but return the list of outputs of the map function. *)
|
(** Same as [process_pages] but return the list of outputs of the map function. *)
|
||||||
val map_pages : (int -> Pdfpage.t -> 'a) -> Pdf.t -> int list -> 'a list
|
val map_pages : (int -> Pdfpage.t -> 'a) -> Pdf.t -> int list -> 'a list
|
||||||
|
|
||||||
|
|
||||||
(** Compresses all streams in the PDF document which are uncompressed, using
|
|
||||||
/FlateDecode, leaving out metadata. If the PDF is encrypted, does nothing. *)
|
|
||||||
val recompress_pdf : Pdf.t -> Pdf.t
|
|
||||||
|
|
||||||
(** Decompresses all streams in a PDF document, assuming it isn't encrypted. *)
|
|
||||||
val decompress_pdf : Pdf.t -> Pdf.t
|
|
||||||
|
|
||||||
val copy_cropbox_to_mediabox : Pdf.t -> int list -> Pdf.t
|
val copy_cropbox_to_mediabox : Pdf.t -> int list -> Pdf.t
|
||||||
|
|
||||||
(** {2 Metadata and settings} *)
|
(** {2 Metadata and settings} *)
|
||||||
|
@ -344,8 +336,6 @@ val blackfills : color -> int list -> Pdf.t -> Pdf.t
|
||||||
(** Remove images from a PDF, optionally adding crossed boxes. *)
|
(** Remove images from a PDF, optionally adding crossed boxes. *)
|
||||||
val draft : string option -> bool -> int list -> Pdf.t -> Pdf.t
|
val draft : string option -> bool -> int list -> Pdf.t -> Pdf.t
|
||||||
|
|
||||||
(** Squeeze a PDF *)
|
|
||||||
val squeeze : ?logto:string -> ?pagedata:bool -> ?recompress:bool -> Pdf.t -> unit
|
|
||||||
|
|
||||||
val remove_all_text : int list -> Pdf.t -> Pdf.t
|
val remove_all_text : int list -> Pdf.t -> Pdf.t
|
||||||
|
|
||||||
|
|
|
@ -2834,8 +2834,8 @@ let write_pdf ?(encryption = None) ?(is_decompress=false) mk_id pdf =
|
||||||
None ->
|
None ->
|
||||||
if not is_decompress then
|
if not is_decompress then
|
||||||
begin
|
begin
|
||||||
ignore (Cpdf.recompress_pdf pdf);
|
ignore (Cpdfsqueeze.recompress_pdf pdf);
|
||||||
if args.squeeze then Cpdf.squeeze ~pagedata:args.squeeze_pagedata ~recompress:args.squeeze_recompress ?logto:!logto pdf;
|
if args.squeeze then Cpdfsqueeze.squeeze ~pagedata:args.squeeze_pagedata ~recompress:args.squeeze_recompress ?logto:!logto pdf;
|
||||||
end;
|
end;
|
||||||
Pdf.remove_unreferenced pdf;
|
Pdf.remove_unreferenced pdf;
|
||||||
really_write_pdf ~is_decompress mk_id pdf outname
|
really_write_pdf ~is_decompress mk_id pdf outname
|
||||||
|
@ -2849,8 +2849,8 @@ let write_pdf ?(encryption = None) ?(is_decompress=false) mk_id pdf =
|
||||||
None ->
|
None ->
|
||||||
if not is_decompress then
|
if not is_decompress then
|
||||||
begin
|
begin
|
||||||
ignore (Cpdf.recompress_pdf pdf);
|
ignore (Cpdfsqueeze.recompress_pdf pdf);
|
||||||
if args.squeeze then Cpdf.squeeze ~pagedata:args.squeeze_pagedata ~recompress:args.squeeze_recompress ?logto:!logto pdf;
|
if args.squeeze then Cpdfsqueeze.squeeze ~pagedata:args.squeeze_pagedata ~recompress:args.squeeze_recompress ?logto:!logto pdf;
|
||||||
Pdf.remove_unreferenced pdf
|
Pdf.remove_unreferenced pdf
|
||||||
end;
|
end;
|
||||||
really_write_pdf ~encryption ~is_decompress mk_id pdf temp;
|
really_write_pdf ~encryption ~is_decompress mk_id pdf temp;
|
||||||
|
@ -2889,7 +2889,7 @@ let fast_write_split_pdfs
|
||||||
(stem original_filename) startpage endpage
|
(stem original_filename) startpage endpage
|
||||||
in
|
in
|
||||||
Pdf.remove_unreferenced pdf;
|
Pdf.remove_unreferenced pdf;
|
||||||
if sq then Cpdf.squeeze ~pagedata:args.squeeze_pagedata ~recompress:args.squeeze_recompress ?logto:!logto pdf;
|
if sq then Cpdfsqueeze.squeeze ~pagedata:args.squeeze_pagedata ~recompress:args.squeeze_recompress ?logto:!logto pdf;
|
||||||
really_write_pdf ~encryption:enc (not (enc = None)) pdf name)
|
really_write_pdf ~encryption:enc (not (enc = None)) pdf name)
|
||||||
(indx pagenums)
|
(indx pagenums)
|
||||||
pagenums
|
pagenums
|
||||||
|
@ -3261,7 +3261,7 @@ let go () =
|
||||||
let pdf = get_single_pdf (Some Compress) false in
|
let pdf = get_single_pdf (Some Compress) false in
|
||||||
if args.remove_duplicate_streams then
|
if args.remove_duplicate_streams then
|
||||||
Pdfmerge.remove_duplicate_fonts pdf;
|
Pdfmerge.remove_duplicate_fonts pdf;
|
||||||
write_pdf false (Cpdf.recompress_pdf pdf)
|
write_pdf false (Cpdfsqueeze.recompress_pdf pdf)
|
||||||
| Some RemoveCrop ->
|
| Some RemoveCrop ->
|
||||||
begin match args.inputs, args.out with
|
begin match args.inputs, args.out with
|
||||||
| (_, pagespec, _, _, _, _)::_, _ ->
|
| (_, pagespec, _, _, _, _)::_, _ ->
|
||||||
|
@ -3736,7 +3736,7 @@ let go () =
|
||||||
(Cpdf.combine_pages args.fast (get_single_pdf args.op false) (pdfread_pdf_of_file None None over) false false true)
|
(Cpdf.combine_pages args.fast (get_single_pdf args.op false) (pdfread_pdf_of_file None None over) false false true)
|
||||||
| Some Encrypt ->
|
| Some Encrypt ->
|
||||||
let pdf = get_single_pdf args.op false in
|
let pdf = get_single_pdf args.op false in
|
||||||
let pdf = Cpdf.recompress_pdf pdf
|
let pdf = Cpdfsqueeze.recompress_pdf pdf
|
||||||
and encryption =
|
and encryption =
|
||||||
{Pdfwrite.encryption_method =
|
{Pdfwrite.encryption_method =
|
||||||
(match args.crypt_method with
|
(match args.crypt_method with
|
||||||
|
|
|
@ -0,0 +1,246 @@
|
||||||
|
open Pdfutil
|
||||||
|
open Pdfio
|
||||||
|
|
||||||
|
(* Debugging aid. Removes unreferenced objects from [pdf], writes it to
   "temp.pdf" (relative to the current directory) with object streams neither
   preserved, generated nor compressed, then prints the length in bytes of the
   file that was written. Note that [pdf] itself is modified by the call to
   [Pdf.remove_unreferenced]. *)
let report_pdf_size pdf =
  let scratch = "temp.pdf" in
  Pdf.remove_unreferenced pdf;
  Pdfwrite.pdf_to_file_options
    ~preserve_objstm:false
    ~generate_objstm:false
    ~compress_objstm:false
    false None false pdf scratch;
  let ic = open_in_bin scratch in
  let size = in_channel_length ic in
  Printf.printf "Size %i bytes\n" size;
  flush stdout;
  close_in ic
|
||||||
|
|
||||||
|
(* Compress a single stream object with /FlateDecode when it currently has no
   filter (no /Filter entry, or an empty /Filter array). Streams whose /Type is
   /Metadata are always left alone, as are streams which already have a
   filter. Must only be called on stream objects: anything else trips the
   assertion. *)
let recompress_stream pdf obj =
  match obj with
  | Pdf.Stream {contents = (dict, _)} ->
      let is_metadata =
        Pdf.lookup_direct pdf "/Type" dict = Some (Pdf.Name "/Metadata")
      in
      let is_unfiltered =
        match Pdf.lookup_direct pdf "/Filter" dict with
        | None | Some (Pdf.Array []) -> true
        | Some _ -> false
      in
      if is_unfiltered && not is_metadata then
        Pdfcodec.encode_pdfstream pdf Pdfcodec.Flate obj
  | _ -> assert false
|
||||||
|
|
||||||
|
(* Run [recompress_stream] over every stream of [pdf], unless the document is
   encrypted, in which case nothing is touched. The streams are updated in
   place; the same [pdf] value is returned for convenience. *)
let recompress_pdf pdf =
  begin match Pdfcrypt.is_encrypted pdf with
  | true -> ()
  | false -> Pdf.iter_stream (recompress_stream pdf) pdf
  end;
  pdf
|
||||||
|
|
||||||
|
(* Apply [Pdfcodec.decode_pdfstream_until_unknown] to every stream of [pdf],
   unless the document is encrypted, in which case nothing is touched. The
   same [pdf] value is returned for convenience. *)
let decompress_pdf pdf =
  begin match Pdfcrypt.is_encrypted pdf with
  | true -> ()
  | false -> Pdf.iter_stream (Pdfcodec.decode_pdfstream_until_unknown pdf) pdf
  end;
  pdf
|
||||||
|
|
||||||
|
|
||||||
|
(* Ordering on PDF objects, given by object number. Despite the name this is a
   three-way comparison in the style of [compare] (zero means equal), which is
   what lets it be passed to both [sort] and [collate]. [Pdf.getstream] is
   called on stream objects before comparing -- presumably so that the stream
   data is loaded and takes part in the comparison. *)
let pdfobjeq pdf x y =
  let force = function
    | Pdf.Stream _ as s -> Pdf.getstream s
    | _ -> ()
  in
  let a = Pdf.lookup_obj pdf x in
  let b = Pdf.lookup_obj pdf y in
  force a;
  force b;
  compare a b
|
||||||
|
|
||||||
|
(* One pass of duplicate-object removal. Objects which compare equal under
   [pdfobjeq] are grouped; within each group every object is redirected to the
   first one by renumbering, and the then-unreferenced copies are removed. The
   root, objects and trailer dictionary of [pdf] are overwritten in place with
   the result. *)
let really_squeeze pdf =
  let objnums = ref [] in
  Pdf.objiter (fun n _ -> objnums := n :: !objnums) pdf;
  let by_content = pdfobjeq pdf in
  let duplicate_groups =
    keep
      (fun group -> length group > 1)
      (collate by_content (sort by_content !objnums))
  in
  (* Remove any pools of objects which are page objects, since Adobe Reader
     gets confused when there are duplicate page objects. *)
  let mergeable =
    option_map
      (function
        | [] -> assert false
        | first :: _ as group ->
            begin match
              Pdf.lookup_direct pdf "/Type" (Pdf.lookup_obj pdf first)
            with
            | Some (Pdf.Name "/Page") -> None
            | _ -> Some group
            end)
      duplicate_groups
  in
  (* Map every duplicate to the first member of its group. *)
  let changetable = Hashtbl.create 100 in
  iter
    (function
      | [] -> assert false
      | canonical :: rest ->
          iter (fun dup -> Hashtbl.add changetable dup canonical) rest)
    mergeable;
  (* For an unknown reason, the output file is much smaller if Pdf.renumber is
     run twice. This is bizarre, since Pdf.renumber is an old, well-understood
     function in use for years -- what is going on? Furthermore, if we run it
     3 times, it gets bigger again! *)
  let renumbered = Pdf.renumber changetable (Pdf.renumber changetable pdf) in
  Pdf.remove_unreferenced renumbered;
  pdf.Pdf.root <- renumbered.Pdf.root;
  pdf.Pdf.objects <- renumbered.Pdf.objects;
  pdf.Pdf.trailerdict <- renumbered.Pdf.trailerdict
|
||||||
|
|
||||||
|
(* Squeeze the form xobject at objnum.

   FIXME: For old PDFs (< v1.2) any resources from the page (or its ancestors in
   the page tree!) are also needed - we must merge them with the ones from the
   xobject itself. However, it is safe for now -- in the unlikely event that the
   resources actually need to be available, the parse will fail, the squeeze of
   this object will fail, and we bail out. *)

(* Object numbers of the xobjects already handled by [squeeze_form_xobject]. *)
let xobjects_done = ref []

(* [squeeze_form_xobject pdf objnum] parses the content of the object numbered
   [objnum] into operators and writes the regenerated data back into the same
   stream, updating /Length. Objects whose /Subtype is not /Form are ignored.
   An object number already recorded in [xobjects_done] is skipped entirely.

   Raises [Failure "squeeze_form_xobject"] if the object is marked as a form
   but is not a stream, or if the regenerated operators do not come back as a
   stream holding its data.

   The whole body sits inside the conditional on purpose: written as
   [if ... then () else a; b], only [a] belongs to the [else] branch and [b]
   always runs, so the "already done" check would never skip anything. *)
let squeeze_form_xobject pdf objnum =
  if not (mem objnum !xobjects_done) then begin
    xobjects_done := objnum :: !xobjects_done;
    let obj = Pdf.lookup_obj pdf objnum in
    match Pdf.lookup_direct pdf "/Subtype" obj with
    | Some (Pdf.Name "/Form") ->
        let resources =
          match Pdf.lookup_direct pdf "/Resources" obj with
          | Some d -> d
          | None -> Pdf.Dictionary []
        in
        begin match
          Pdfops.stream_of_ops
            (Pdfops.parse_operators pdf resources [Pdf.Indirect objnum])
        with
        | Pdf.Stream {contents = (_, Pdf.Got data)} ->
            (* Put replacement data in original stream, and overwrite /Length *)
            begin match obj with
            | Pdf.Stream ({contents = (d, _)} as str) ->
                str :=
                  (Pdf.add_dict_entry
                     d "/Length" (Pdf.Integer (bytes_size data)),
                   Pdf.Got data)
            | _ -> failwith "squeeze_form_xobject"
            end
        | _ -> failwith "squeeze_form_xobject"
        end
    | _ -> ()
  end
|
||||||
|
|
||||||
|
(* For a list of indirects representing content streams, make sure that none of
   them are duplicated in the PDF. This indicates sharing, which parsing and
   rewriting the streams might destroy, thus making the file bigger. FIXME: The
   correct thing to do is to preserve the multiple content streams.

   [no_duplicates content_stream_numbers stream_numbers] is [true] exactly when
   every element of [stream_numbers] occurs fewer than two times in
   [content_stream_numbers]. An empty [stream_numbers] therefore gives [true].
   The check stops at the first offending number. *)
let no_duplicates content_stream_numbers stream_numbers =
  let occurrences n =
    List.fold_left
      (fun count m -> if m = n then count + 1 else count)
      0 content_stream_numbers
  in
  List.for_all (fun n -> occurrences n < 2) stream_numbers
|
||||||
|
|
||||||
|
(* Object numbers of the content streams of the page whose object number is
   [refnum]. A /Contents entry which is a single indirect reference gives a
   one-element list; an array gives the numbers of its indirect members, any
   other members being dropped. Everything else -- no /Contents, or an object
   which is not a dictionary -- gives the empty list. *)
let content_streams_of_page pdf refnum =
  let indirect_number = function
    | Pdf.Indirect i -> Some i
    | _ -> None
  in
  match Pdf.direct pdf (Pdf.lookup_obj pdf refnum) with
  | Pdf.Dictionary entries ->
      begin match lookup "/Contents" entries with
      | Some (Pdf.Indirect i) -> [i]
      | Some (Pdf.Array items) -> option_map indirect_number items
      | _ -> []
      end
  | _ -> []
|
||||||
|
|
||||||
|
(* For each object in the PDF marked with /Type /Page, parse its content
   stream(s) into operators and replace /Contents with a single freshly
   generated stream, then squeeze the form xobjects listed in the page's own
   /Resources. A page is left untouched when any of its content streams is
   referenced more than once among the document's pages (see [no_duplicates]),
   when it has no /Contents, or when [Not_found] is raised while processing
   it. [xobjects_done] is reset before the pages are visited. *)
let squeeze_all_content_streams pdf =
  let all_content_streams_in_doc =
    flatten
      (map (content_streams_of_page pdf) (Pdf.page_reference_numbers pdf))
  in
  (* Content stream entries are expected to be indirect references. *)
  let number_of = function
    | Pdf.Indirect i -> i
    | _ -> assert false
  in
  (* The content stream objects of a page dictionary. /Contents may be a
     reference to a stream, a reference to an array, or a direct array. *)
  let contents_of entries =
    match lookup "/Contents" entries with
    | Some (Pdf.Indirect _ as reference) ->
        begin match Pdf.direct pdf reference with
        | Pdf.Array items -> items
        | _ -> [reference]
        end
    | Some (Pdf.Array items) -> items
    | _ -> raise Not_found
  in
  (* Process all xobjects named in the given resources dictionary. *)
  let squeeze_xobjects resources =
    match Pdf.lookup_direct pdf "/XObject" resources with
    | Some (Pdf.Dictionary xobjs) ->
        iter
          (function
            | (_, Pdf.Indirect i) -> squeeze_form_xobject pdf i
            | _ -> failwith "squeeze_xobject")
          xobjs
    | _ -> ()
  in
  let squeeze_page objnum entries page =
    let resources =
      match Pdf.lookup_direct pdf "/Resources" page with
      | Some r -> r
      | None -> Pdf.Dictionary []
    in
    try
      let content_streams = contents_of entries in
      if
        no_duplicates
          all_content_streams_in_doc (map number_of content_streams)
      then begin
        let newstream =
          Pdfops.stream_of_ops
            (Pdfops.parse_operators pdf resources content_streams)
        in
        let newdict =
          Pdf.add_dict_entry
            page "/Contents" (Pdf.Indirect (Pdf.addobj pdf newstream))
        in
        Pdf.addobj_given_num pdf (objnum, newdict);
        squeeze_xobjects resources
      end
    with
      (* No /Contents, which is ok. Or a parsing failure due to uninherited
         resources. FIXME: Add support for inherited resources. *)
      Not_found -> ()
  in
  xobjects_done := [];
  Pdf.objiter
    (fun objnum _ ->
      match Pdf.lookup_obj pdf objnum with
      | Pdf.Dictionary entries as page
        when Pdf.lookup_direct pdf "/Type" page = Some (Pdf.Name "/Page") ->
          squeeze_page objnum entries page
      | _ -> ())
    pdf
|
||||||
|
|
||||||
|
(* Squeeze [pdf] in place. [really_squeeze] is repeated for as long as it
   reduces the number of objects. Then, if [pagedata] is set, page content
   streams and xobjects are squeezed, and if [recompress] is set, the flate
   level is raised to 9 and the document is recompressed.

   Progress messages go to standard output by default. With [~logto:"nolog"]
   they are discarded; with any other [~logto] they are appended to the file
   of that name, which is created if need be.

   Any exception raised on the way is re-raised as [Pdf.PDFError] carrying the
   text of the original exception. *)
let squeeze ?logto ?(pagedata=true) ?(recompress=true) pdf =
  (* Open without truncating, move to the end, write, close. *)
  let append_to_file path text =
    let oc = open_out_gen [Open_wronly; Open_creat] 0o666 path in
    seek_out oc (out_channel_length oc);
    output_string oc text;
    close_out oc
  in
  let log text =
    match logto with
    | Some "nolog" -> ()
    | Some path -> append_to_file path text
    | None -> print_string text; flush stdout
  in
  try
    let previous = ref (Pdf.objcard pdf) in
    log (Printf.sprintf "Beginning squeeze: %i objects\n" (Pdf.objcard pdf));
    (* One squeezing pass, giving the object count afterwards. *)
    let pass () =
      really_squeeze pdf;
      Pdf.objcard pdf
    in
    while !previous > pass () do
      previous := Pdf.objcard pdf;
      log
        (Printf.sprintf "Squeezing... Down to %i objects\n" (Pdf.objcard pdf))
    done;
    if pagedata then begin
      log "Squeezing page data and xobjects\n";
      squeeze_all_content_streams pdf
    end;
    if recompress then begin
      log "Recompressing document\n";
      Pdfcodec.flate_level := 9;
      ignore (recompress_pdf pdf)
    end
  with e ->
    let message =
      Printf.sprintf
        "Squeeze failed. No output written.\n Proximate error was:\n %s"
        (Printexc.to_string e)
    in
    raise (Pdf.PDFError message)
|
||||||
|
|
|
@ -0,0 +1,9 @@
|
||||||
|
(** Compresses all streams in the PDF document which are uncompressed, using
|
||||||
|
/FlateDecode, leaving out metadata. If the PDF is encrypted, does nothing. *)
|
||||||
|
val recompress_pdf : Pdf.t -> Pdf.t
|
||||||
|
|
||||||
|
(** Decompresses all streams in a PDF document, assuming it isn't encrypted. *)
|
||||||
|
val decompress_pdf : Pdf.t -> Pdf.t
|
||||||
|
|
||||||
|
(** Squeeze a PDF *)
|
||||||
|
val squeeze : ?logto:string -> ?pagedata:bool -> ?recompress:bool -> Pdf.t -> unit
|
Loading…
Reference in New Issue