more
This commit is contained in:
parent
1048bdf1df
commit
8726d3dee0
2
Makefile
2
Makefile
|
@ -2,7 +2,7 @@
|
|||
MODS = cpdfyojson cpdfxmlm \
|
||||
cpdfunicodedata cpdferror cpdfjson cpdfstrftime cpdfcoord cpdfattach \
|
||||
cpdfpagespec cpdfposition cpdf cpdfpresent cpdffont cpdftype \
|
||||
cpdftexttopdf cpdftoc cpdfpad cpdfocg cpdfcommand
|
||||
cpdftexttopdf cpdftoc cpdfpad cpdfocg cpdfsqueeze cpdfcommand
|
||||
|
||||
SOURCES = $(foreach x,$(MODS),$(x).ml $(x).mli) cpdfcommandrun.ml
|
||||
|
||||
|
|
244
cpdf.ml
244
cpdf.ml
|
@ -49,250 +49,6 @@ let xmp_template =
|
|||
|
||||
<?xpacket end='r'?>|}
|
||||
|
||||
(* For debugging *)
|
||||
let report_pdf_size pdf =
|
||||
Pdf.remove_unreferenced pdf;
|
||||
Pdfwrite.pdf_to_file_options ~preserve_objstm:false ~generate_objstm:false
|
||||
~compress_objstm:false false None false pdf "temp.pdf";
|
||||
let fh = open_in_bin "temp.pdf" in
|
||||
Printf.printf "Size %i bytes\n" (in_channel_length fh);
|
||||
flush stdout;
|
||||
close_in fh
|
||||
|
||||
|
||||
(* Recompress anything which isn't compressed, unless it's metadata. *)
|
||||
let recompress_stream pdf = function
|
||||
(* If there is no compression, compress with /FlateDecode *)
|
||||
| Pdf.Stream {contents = (dict, _)} as stream ->
|
||||
begin match
|
||||
Pdf.lookup_direct pdf "/Filter" dict,
|
||||
Pdf.lookup_direct pdf "/Type" dict
|
||||
with
|
||||
| _, Some (Pdf.Name "/Metadata") -> ()
|
||||
| (None | Some (Pdf.Array [])), _ ->
|
||||
Pdfcodec.encode_pdfstream pdf Pdfcodec.Flate stream
|
||||
| _ -> ()
|
||||
end
|
||||
| _ -> assert false
|
||||
|
||||
let recompress_pdf pdf =
|
||||
if not (Pdfcrypt.is_encrypted pdf) then
|
||||
Pdf.iter_stream (recompress_stream pdf) pdf;
|
||||
pdf
|
||||
|
||||
let decompress_pdf pdf =
|
||||
if not (Pdfcrypt.is_encrypted pdf) then
|
||||
(Pdf.iter_stream (Pdfcodec.decode_pdfstream_until_unknown pdf) pdf);
|
||||
pdf
|
||||
|
||||
|
||||
(* Equality on PDF objects *)
|
||||
let pdfobjeq pdf x y =
|
||||
let x = Pdf.lookup_obj pdf x
|
||||
and y = Pdf.lookup_obj pdf y in
|
||||
begin match x with Pdf.Stream _ -> Pdf.getstream x | _ -> () end;
|
||||
begin match y with Pdf.Stream _ -> Pdf.getstream y | _ -> () end;
|
||||
compare x y
|
||||
|
||||
let really_squeeze pdf =
|
||||
let objs = ref [] in
|
||||
Pdf.objiter (fun objnum _ -> objs := objnum :: !objs) pdf;
|
||||
let toprocess =
|
||||
keep
|
||||
(fun x -> length x > 1)
|
||||
(collate (pdfobjeq pdf) (sort (pdfobjeq pdf) !objs))
|
||||
in
|
||||
(* Remove any pools of objects which are page objects, since Adobe Reader
|
||||
* gets confused when there are duplicate page objects. *)
|
||||
let toprocess =
|
||||
option_map
|
||||
(function
|
||||
[] -> assert false
|
||||
| h::_ as l ->
|
||||
match Pdf.lookup_direct pdf "/Type" (Pdf.lookup_obj pdf h) with
|
||||
Some (Pdf.Name "/Page") -> None
|
||||
| _ -> Some l)
|
||||
toprocess
|
||||
in
|
||||
let pdfr = ref pdf in
|
||||
let changetable = Hashtbl.create 100 in
|
||||
iter
|
||||
(function [] -> assert false | h::t ->
|
||||
iter (fun e -> Hashtbl.add changetable e h) t)
|
||||
toprocess;
|
||||
(* For an unknown reason, the output file is much smaller if
|
||||
Pdf.renumber is run twice. This is bizarre, since Pdf.renumber is
|
||||
an old, well-understood function in use for years -- what is
|
||||
going on? Furthermore, if we run it 3 times, it gets bigger again! *)
|
||||
pdfr := Pdf.renumber changetable !pdfr;
|
||||
pdfr := Pdf.renumber changetable !pdfr;
|
||||
Pdf.remove_unreferenced !pdfr;
|
||||
pdf.Pdf.root <- !pdfr.Pdf.root;
|
||||
pdf.Pdf.objects <- !pdfr.Pdf.objects;
|
||||
pdf.Pdf.trailerdict <- !pdfr.Pdf.trailerdict
|
||||
|
||||
(* Squeeze the form xobject at objnum.
|
||||
|
||||
FIXME: For old PDFs (< v1.2) any resources from the page (or its ancestors in
|
||||
the page tree!) are also needed - we must merge them with the ones from the
|
||||
xobject itself. However, it is safe for now -- in the unlikely event that the
|
||||
resources actually need to be available, the parse will fail, the squeeze of
|
||||
this object will fail, and we bail out. *)
|
||||
let xobjects_done = ref []
|
||||
|
||||
let squeeze_form_xobject pdf objnum =
|
||||
if mem objnum !xobjects_done then () else
|
||||
xobjects_done := objnum :: !xobjects_done;
|
||||
let obj = Pdf.lookup_obj pdf objnum in
|
||||
match Pdf.lookup_direct pdf "/Subtype" obj with
|
||||
Some (Pdf.Name "/Form") ->
|
||||
let resources =
|
||||
match Pdf.lookup_direct pdf "/Resources" obj with
|
||||
Some d -> d
|
||||
| None -> Pdf.Dictionary []
|
||||
in
|
||||
begin match
|
||||
Pdfops.stream_of_ops
|
||||
(Pdfops.parse_operators pdf resources [Pdf.Indirect objnum])
|
||||
with
|
||||
Pdf.Stream {contents = (_, Pdf.Got data)} ->
|
||||
(* Put replacement data in original stream, and overwrite /Length *)
|
||||
begin match obj with
|
||||
Pdf.Stream ({contents = (d, _)} as str) ->
|
||||
str :=
|
||||
(Pdf.add_dict_entry d "/Length" (Pdf.Integer (bytes_size data)),
|
||||
Pdf.Got data)
|
||||
| _ -> failwith "squeeze_form_xobject"
|
||||
end
|
||||
| _ -> failwith "squeeze_form_xobject"
|
||||
end
|
||||
| _ -> ()
|
||||
|
||||
(* For a list of indirects representing content streams, make sure that none of
|
||||
them are duplicated in the PDF. This indicates sharing, which parsing and
|
||||
rewriting the streams might destroy, thus making the file bigger. FIXME: The
|
||||
correct thing to do is to preserve the multiple content streams. *)
|
||||
let no_duplicates content_stream_numbers stream_numbers =
|
||||
not
|
||||
(mem false
|
||||
(map
|
||||
(fun n -> length (keep (eq n) content_stream_numbers) < 2)
|
||||
stream_numbers))
|
||||
|
||||
(* Give a list of content stream numbers, given a page reference number *)
|
||||
let content_streams_of_page pdf refnum =
|
||||
match Pdf.direct pdf (Pdf.lookup_obj pdf refnum) with
|
||||
Pdf.Dictionary dict ->
|
||||
begin match lookup "/Contents" dict with
|
||||
Some (Pdf.Indirect i) -> [i]
|
||||
| Some (Pdf.Array x) ->
|
||||
option_map (function Pdf.Indirect i -> Some i | _ -> None) x
|
||||
| _ -> []
|
||||
end
|
||||
| _ -> []
|
||||
|
||||
(* For each object in the PDF marked with /Type /Page, for each /Contents
|
||||
indirect reference or array of such, decode and recode that content stream. *)
|
||||
let squeeze_all_content_streams pdf =
|
||||
let page_reference_numbers = Pdf.page_reference_numbers pdf in
|
||||
let all_content_streams_in_doc =
|
||||
flatten (map (content_streams_of_page pdf) page_reference_numbers)
|
||||
in
|
||||
xobjects_done := [];
|
||||
Pdf.objiter
|
||||
(fun objnum _ ->
|
||||
match Pdf.lookup_obj pdf objnum with
|
||||
Pdf.Dictionary dict as d
|
||||
when
|
||||
Pdf.lookup_direct pdf "/Type" d = Some (Pdf.Name "/Page")
|
||||
->
|
||||
let resources =
|
||||
match Pdf.lookup_direct pdf "/Resources" d with
|
||||
Some d -> d
|
||||
| None -> Pdf.Dictionary []
|
||||
in
|
||||
begin try
|
||||
let content_streams =
|
||||
match lookup "/Contents" dict with
|
||||
Some (Pdf.Indirect i) ->
|
||||
begin match Pdf.direct pdf (Pdf.Indirect i) with
|
||||
Pdf.Array x -> x
|
||||
| _ -> [Pdf.Indirect i]
|
||||
end
|
||||
| Some (Pdf.Array x) -> x
|
||||
| _ -> raise Not_found
|
||||
in
|
||||
if
|
||||
no_duplicates
|
||||
all_content_streams_in_doc
|
||||
(map (function Pdf.Indirect i -> i | _ -> assert false) content_streams)
|
||||
then
|
||||
let newstream =
|
||||
Pdfops.stream_of_ops
|
||||
(Pdfops.parse_operators pdf resources content_streams)
|
||||
in
|
||||
let newdict =
|
||||
Pdf.add_dict_entry
|
||||
d "/Contents" (Pdf.Indirect (Pdf.addobj pdf newstream))
|
||||
in
|
||||
Pdf.addobj_given_num pdf (objnum, newdict);
|
||||
(* Now process all xobjects related to this page *)
|
||||
begin match Pdf.lookup_direct pdf "/XObject" resources with
|
||||
Some (Pdf.Dictionary xobjs) ->
|
||||
iter
|
||||
(function
|
||||
(_, Pdf.Indirect i) -> squeeze_form_xobject pdf i
|
||||
| _ -> failwith "squeeze_xobject")
|
||||
xobjs
|
||||
| _ -> ()
|
||||
end
|
||||
with
|
||||
(* No /Contents, which is ok. Or a parsing failure due to
|
||||
uninherited resources. FIXME: Add support for inherited
|
||||
resources. *)
|
||||
Not_found -> ()
|
||||
end
|
||||
| _ -> ())
|
||||
pdf
|
||||
|
||||
(* We run squeeze enough times for the number of objects to not change *)
|
||||
let squeeze ?logto ?(pagedata=true) ?(recompress=true) pdf =
|
||||
let log x =
|
||||
match logto with
|
||||
None -> print_string x; flush stdout
|
||||
| Some "nolog" -> ()
|
||||
| Some s ->
|
||||
let fh = open_out_gen [Open_wronly; Open_creat] 0o666 s in
|
||||
seek_out fh (out_channel_length fh);
|
||||
output_string fh x;
|
||||
close_out fh
|
||||
in
|
||||
try
|
||||
let n = ref (Pdf.objcard pdf) in
|
||||
log (Printf.sprintf "Beginning squeeze: %i objects\n" (Pdf.objcard pdf));
|
||||
while !n > (ignore (really_squeeze pdf); Pdf.objcard pdf) do
|
||||
n := Pdf.objcard pdf;
|
||||
log (Printf.sprintf "Squeezing... Down to %i objects\n" (Pdf.objcard pdf));
|
||||
done;
|
||||
if pagedata then
|
||||
begin
|
||||
log (Printf.sprintf "Squeezing page data and xobjects\n");
|
||||
squeeze_all_content_streams pdf;
|
||||
end;
|
||||
if recompress then
|
||||
begin
|
||||
log (Printf.sprintf "Recompressing document\n");
|
||||
Pdfcodec.flate_level := 9;
|
||||
ignore (recompress_pdf pdf)
|
||||
end
|
||||
with
|
||||
e ->
|
||||
raise
|
||||
(Pdf.PDFError
|
||||
(Printf.sprintf
|
||||
"Squeeze failed. No output written.\n Proximate error was:\n %s"
|
||||
(Printexc.to_string e)))
|
||||
|
||||
type encoding =
|
||||
| Raw
|
||||
| UTF8
|
||||
|
|
10
cpdf.mli
10
cpdf.mli
|
@ -36,14 +36,6 @@ val iter_pages : (int -> Pdfpage.t -> unit) -> Pdf.t -> int list -> unit
|
|||
(** Same as [process_pages] but return the list of outputs of the map function. *)
|
||||
val map_pages : (int -> Pdfpage.t -> 'a) -> Pdf.t -> int list -> 'a list
|
||||
|
||||
|
||||
(** Compresses all streams in the PDF document which are uncompressed, using
|
||||
/FlateDecode, leaving out metadata. If the PDF is encrypted, does nothing. *)
|
||||
val recompress_pdf : Pdf.t -> Pdf.t
|
||||
|
||||
(** Decompresses all streams in a PDF document, assuming it isn't encrypted. *)
|
||||
val decompress_pdf : Pdf.t -> Pdf.t
|
||||
|
||||
val copy_cropbox_to_mediabox : Pdf.t -> int list -> Pdf.t
|
||||
|
||||
(** {2 Metadata and settings} *)
|
||||
|
@ -344,8 +336,6 @@ val blackfills : color -> int list -> Pdf.t -> Pdf.t
|
|||
(** Remove images from a PDF, optionally adding crossed boxes. *)
|
||||
val draft : string option -> bool -> int list -> Pdf.t -> Pdf.t
|
||||
|
||||
(** Squeeze a PDF *)
|
||||
val squeeze : ?logto:string -> ?pagedata:bool -> ?recompress:bool -> Pdf.t -> unit
|
||||
|
||||
val remove_all_text : int list -> Pdf.t -> Pdf.t
|
||||
|
||||
|
|
|
@ -2834,8 +2834,8 @@ let write_pdf ?(encryption = None) ?(is_decompress=false) mk_id pdf =
|
|||
None ->
|
||||
if not is_decompress then
|
||||
begin
|
||||
ignore (Cpdf.recompress_pdf pdf);
|
||||
if args.squeeze then Cpdf.squeeze ~pagedata:args.squeeze_pagedata ~recompress:args.squeeze_recompress ?logto:!logto pdf;
|
||||
ignore (Cpdfsqueeze.recompress_pdf pdf);
|
||||
if args.squeeze then Cpdfsqueeze.squeeze ~pagedata:args.squeeze_pagedata ~recompress:args.squeeze_recompress ?logto:!logto pdf;
|
||||
end;
|
||||
Pdf.remove_unreferenced pdf;
|
||||
really_write_pdf ~is_decompress mk_id pdf outname
|
||||
|
@ -2849,8 +2849,8 @@ let write_pdf ?(encryption = None) ?(is_decompress=false) mk_id pdf =
|
|||
None ->
|
||||
if not is_decompress then
|
||||
begin
|
||||
ignore (Cpdf.recompress_pdf pdf);
|
||||
if args.squeeze then Cpdf.squeeze ~pagedata:args.squeeze_pagedata ~recompress:args.squeeze_recompress ?logto:!logto pdf;
|
||||
ignore (Cpdfsqueeze.recompress_pdf pdf);
|
||||
if args.squeeze then Cpdfsqueeze.squeeze ~pagedata:args.squeeze_pagedata ~recompress:args.squeeze_recompress ?logto:!logto pdf;
|
||||
Pdf.remove_unreferenced pdf
|
||||
end;
|
||||
really_write_pdf ~encryption ~is_decompress mk_id pdf temp;
|
||||
|
@ -2889,7 +2889,7 @@ let fast_write_split_pdfs
|
|||
(stem original_filename) startpage endpage
|
||||
in
|
||||
Pdf.remove_unreferenced pdf;
|
||||
if sq then Cpdf.squeeze ~pagedata:args.squeeze_pagedata ~recompress:args.squeeze_recompress ?logto:!logto pdf;
|
||||
if sq then Cpdfsqueeze.squeeze ~pagedata:args.squeeze_pagedata ~recompress:args.squeeze_recompress ?logto:!logto pdf;
|
||||
really_write_pdf ~encryption:enc (not (enc = None)) pdf name)
|
||||
(indx pagenums)
|
||||
pagenums
|
||||
|
@ -3261,7 +3261,7 @@ let go () =
|
|||
let pdf = get_single_pdf (Some Compress) false in
|
||||
if args.remove_duplicate_streams then
|
||||
Pdfmerge.remove_duplicate_fonts pdf;
|
||||
write_pdf false (Cpdf.recompress_pdf pdf)
|
||||
write_pdf false (Cpdfsqueeze.recompress_pdf pdf)
|
||||
| Some RemoveCrop ->
|
||||
begin match args.inputs, args.out with
|
||||
| (_, pagespec, _, _, _, _)::_, _ ->
|
||||
|
@ -3736,7 +3736,7 @@ let go () =
|
|||
(Cpdf.combine_pages args.fast (get_single_pdf args.op false) (pdfread_pdf_of_file None None over) false false true)
|
||||
| Some Encrypt ->
|
||||
let pdf = get_single_pdf args.op false in
|
||||
let pdf = Cpdf.recompress_pdf pdf
|
||||
let pdf = Cpdfsqueeze.recompress_pdf pdf
|
||||
and encryption =
|
||||
{Pdfwrite.encryption_method =
|
||||
(match args.crypt_method with
|
||||
|
|
|
@ -0,0 +1,246 @@
|
|||
open Pdfutil
|
||||
open Pdfio
|
||||
|
||||
(* For debugging *)
|
||||
let report_pdf_size pdf =
|
||||
Pdf.remove_unreferenced pdf;
|
||||
Pdfwrite.pdf_to_file_options ~preserve_objstm:false ~generate_objstm:false
|
||||
~compress_objstm:false false None false pdf "temp.pdf";
|
||||
let fh = open_in_bin "temp.pdf" in
|
||||
Printf.printf "Size %i bytes\n" (in_channel_length fh);
|
||||
flush stdout;
|
||||
close_in fh
|
||||
|
||||
(* Recompress anything which isn't compressed, unless it's metadata. *)
|
||||
let recompress_stream pdf = function
|
||||
(* If there is no compression, compress with /FlateDecode *)
|
||||
| Pdf.Stream {contents = (dict, _)} as stream ->
|
||||
begin match
|
||||
Pdf.lookup_direct pdf "/Filter" dict,
|
||||
Pdf.lookup_direct pdf "/Type" dict
|
||||
with
|
||||
| _, Some (Pdf.Name "/Metadata") -> ()
|
||||
| (None | Some (Pdf.Array [])), _ ->
|
||||
Pdfcodec.encode_pdfstream pdf Pdfcodec.Flate stream
|
||||
| _ -> ()
|
||||
end
|
||||
| _ -> assert false
|
||||
|
||||
let recompress_pdf pdf =
|
||||
if not (Pdfcrypt.is_encrypted pdf) then
|
||||
Pdf.iter_stream (recompress_stream pdf) pdf;
|
||||
pdf
|
||||
|
||||
let decompress_pdf pdf =
|
||||
if not (Pdfcrypt.is_encrypted pdf) then
|
||||
(Pdf.iter_stream (Pdfcodec.decode_pdfstream_until_unknown pdf) pdf);
|
||||
pdf
|
||||
|
||||
|
||||
(* Equality on PDF objects *)
|
||||
let pdfobjeq pdf x y =
|
||||
let x = Pdf.lookup_obj pdf x
|
||||
and y = Pdf.lookup_obj pdf y in
|
||||
begin match x with Pdf.Stream _ -> Pdf.getstream x | _ -> () end;
|
||||
begin match y with Pdf.Stream _ -> Pdf.getstream y | _ -> () end;
|
||||
compare x y
|
||||
|
||||
let really_squeeze pdf =
|
||||
let objs = ref [] in
|
||||
Pdf.objiter (fun objnum _ -> objs := objnum :: !objs) pdf;
|
||||
let toprocess =
|
||||
keep
|
||||
(fun x -> length x > 1)
|
||||
(collate (pdfobjeq pdf) (sort (pdfobjeq pdf) !objs))
|
||||
in
|
||||
(* Remove any pools of objects which are page objects, since Adobe Reader
|
||||
* gets confused when there are duplicate page objects. *)
|
||||
let toprocess =
|
||||
option_map
|
||||
(function
|
||||
[] -> assert false
|
||||
| h::_ as l ->
|
||||
match Pdf.lookup_direct pdf "/Type" (Pdf.lookup_obj pdf h) with
|
||||
Some (Pdf.Name "/Page") -> None
|
||||
| _ -> Some l)
|
||||
toprocess
|
||||
in
|
||||
let pdfr = ref pdf in
|
||||
let changetable = Hashtbl.create 100 in
|
||||
iter
|
||||
(function [] -> assert false | h::t ->
|
||||
iter (fun e -> Hashtbl.add changetable e h) t)
|
||||
toprocess;
|
||||
(* For an unknown reason, the output file is much smaller if
|
||||
Pdf.renumber is run twice. This is bizarre, since Pdf.renumber is
|
||||
an old, well-understood function in use for years -- what is
|
||||
going on? Furthermore, if we run it 3 times, it gets bigger again! *)
|
||||
pdfr := Pdf.renumber changetable !pdfr;
|
||||
pdfr := Pdf.renumber changetable !pdfr;
|
||||
Pdf.remove_unreferenced !pdfr;
|
||||
pdf.Pdf.root <- !pdfr.Pdf.root;
|
||||
pdf.Pdf.objects <- !pdfr.Pdf.objects;
|
||||
pdf.Pdf.trailerdict <- !pdfr.Pdf.trailerdict
|
||||
|
||||
(* Squeeze the form xobject at objnum.
|
||||
|
||||
FIXME: For old PDFs (< v1.2) any resources from the page (or its ancestors in
|
||||
the page tree!) are also needed - we must merge them with the ones from the
|
||||
xobject itself. However, it is safe for now -- in the unlikely event that the
|
||||
resources actually need to be available, the parse will fail, the squeeze of
|
||||
this object will fail, and we bail out. *)
|
||||
let xobjects_done = ref []
|
||||
|
||||
let squeeze_form_xobject pdf objnum =
|
||||
if mem objnum !xobjects_done then () else
|
||||
xobjects_done := objnum :: !xobjects_done;
|
||||
let obj = Pdf.lookup_obj pdf objnum in
|
||||
match Pdf.lookup_direct pdf "/Subtype" obj with
|
||||
Some (Pdf.Name "/Form") ->
|
||||
let resources =
|
||||
match Pdf.lookup_direct pdf "/Resources" obj with
|
||||
Some d -> d
|
||||
| None -> Pdf.Dictionary []
|
||||
in
|
||||
begin match
|
||||
Pdfops.stream_of_ops
|
||||
(Pdfops.parse_operators pdf resources [Pdf.Indirect objnum])
|
||||
with
|
||||
Pdf.Stream {contents = (_, Pdf.Got data)} ->
|
||||
(* Put replacement data in original stream, and overwrite /Length *)
|
||||
begin match obj with
|
||||
Pdf.Stream ({contents = (d, _)} as str) ->
|
||||
str :=
|
||||
(Pdf.add_dict_entry d "/Length" (Pdf.Integer (bytes_size data)),
|
||||
Pdf.Got data)
|
||||
| _ -> failwith "squeeze_form_xobject"
|
||||
end
|
||||
| _ -> failwith "squeeze_form_xobject"
|
||||
end
|
||||
| _ -> ()
|
||||
|
||||
(* For a list of indirects representing content streams, make sure that none of
|
||||
them are duplicated in the PDF. This indicates sharing, which parsing and
|
||||
rewriting the streams might destroy, thus making the file bigger. FIXME: The
|
||||
correct thing to do is to preserve the multiple content streams. *)
|
||||
let no_duplicates content_stream_numbers stream_numbers =
|
||||
not
|
||||
(mem false
|
||||
(map
|
||||
(fun n -> length (keep (eq n) content_stream_numbers) < 2)
|
||||
stream_numbers))
|
||||
|
||||
(* Give a list of content stream numbers, given a page reference number *)
|
||||
let content_streams_of_page pdf refnum =
|
||||
match Pdf.direct pdf (Pdf.lookup_obj pdf refnum) with
|
||||
Pdf.Dictionary dict ->
|
||||
begin match lookup "/Contents" dict with
|
||||
Some (Pdf.Indirect i) -> [i]
|
||||
| Some (Pdf.Array x) ->
|
||||
option_map (function Pdf.Indirect i -> Some i | _ -> None) x
|
||||
| _ -> []
|
||||
end
|
||||
| _ -> []
|
||||
|
||||
(* For each object in the PDF marked with /Type /Page, for each /Contents
|
||||
indirect reference or array of such, decode and recode that content stream. *)
|
||||
let squeeze_all_content_streams pdf =
|
||||
let page_reference_numbers = Pdf.page_reference_numbers pdf in
|
||||
let all_content_streams_in_doc =
|
||||
flatten (map (content_streams_of_page pdf) page_reference_numbers)
|
||||
in
|
||||
xobjects_done := [];
|
||||
Pdf.objiter
|
||||
(fun objnum _ ->
|
||||
match Pdf.lookup_obj pdf objnum with
|
||||
Pdf.Dictionary dict as d
|
||||
when
|
||||
Pdf.lookup_direct pdf "/Type" d = Some (Pdf.Name "/Page")
|
||||
->
|
||||
let resources =
|
||||
match Pdf.lookup_direct pdf "/Resources" d with
|
||||
Some d -> d
|
||||
| None -> Pdf.Dictionary []
|
||||
in
|
||||
begin try
|
||||
let content_streams =
|
||||
match lookup "/Contents" dict with
|
||||
Some (Pdf.Indirect i) ->
|
||||
begin match Pdf.direct pdf (Pdf.Indirect i) with
|
||||
Pdf.Array x -> x
|
||||
| _ -> [Pdf.Indirect i]
|
||||
end
|
||||
| Some (Pdf.Array x) -> x
|
||||
| _ -> raise Not_found
|
||||
in
|
||||
if
|
||||
no_duplicates
|
||||
all_content_streams_in_doc
|
||||
(map (function Pdf.Indirect i -> i | _ -> assert false) content_streams)
|
||||
then
|
||||
let newstream =
|
||||
Pdfops.stream_of_ops
|
||||
(Pdfops.parse_operators pdf resources content_streams)
|
||||
in
|
||||
let newdict =
|
||||
Pdf.add_dict_entry
|
||||
d "/Contents" (Pdf.Indirect (Pdf.addobj pdf newstream))
|
||||
in
|
||||
Pdf.addobj_given_num pdf (objnum, newdict);
|
||||
(* Now process all xobjects related to this page *)
|
||||
begin match Pdf.lookup_direct pdf "/XObject" resources with
|
||||
Some (Pdf.Dictionary xobjs) ->
|
||||
iter
|
||||
(function
|
||||
(_, Pdf.Indirect i) -> squeeze_form_xobject pdf i
|
||||
| _ -> failwith "squeeze_xobject")
|
||||
xobjs
|
||||
| _ -> ()
|
||||
end
|
||||
with
|
||||
(* No /Contents, which is ok. Or a parsing failure due to
|
||||
uninherited resources. FIXME: Add support for inherited
|
||||
resources. *)
|
||||
Not_found -> ()
|
||||
end
|
||||
| _ -> ())
|
||||
pdf
|
||||
|
||||
(* We run squeeze enough times for the number of objects to not change *)
|
||||
let squeeze ?logto ?(pagedata=true) ?(recompress=true) pdf =
|
||||
let log x =
|
||||
match logto with
|
||||
None -> print_string x; flush stdout
|
||||
| Some "nolog" -> ()
|
||||
| Some s ->
|
||||
let fh = open_out_gen [Open_wronly; Open_creat] 0o666 s in
|
||||
seek_out fh (out_channel_length fh);
|
||||
output_string fh x;
|
||||
close_out fh
|
||||
in
|
||||
try
|
||||
let n = ref (Pdf.objcard pdf) in
|
||||
log (Printf.sprintf "Beginning squeeze: %i objects\n" (Pdf.objcard pdf));
|
||||
while !n > (ignore (really_squeeze pdf); Pdf.objcard pdf) do
|
||||
n := Pdf.objcard pdf;
|
||||
log (Printf.sprintf "Squeezing... Down to %i objects\n" (Pdf.objcard pdf));
|
||||
done;
|
||||
if pagedata then
|
||||
begin
|
||||
log (Printf.sprintf "Squeezing page data and xobjects\n");
|
||||
squeeze_all_content_streams pdf;
|
||||
end;
|
||||
if recompress then
|
||||
begin
|
||||
log (Printf.sprintf "Recompressing document\n");
|
||||
Pdfcodec.flate_level := 9;
|
||||
ignore (recompress_pdf pdf)
|
||||
end
|
||||
with
|
||||
e ->
|
||||
raise
|
||||
(Pdf.PDFError
|
||||
(Printf.sprintf
|
||||
"Squeeze failed. No output written.\n Proximate error was:\n %s"
|
||||
(Printexc.to_string e)))
|
||||
|
|
@ -0,0 +1,9 @@
|
|||
(** Compresses all streams in the PDF document which are uncompressed, using
|
||||
/FlateDecode, leaving out metadata. If the PDF is encrypted, does nothing. *)
|
||||
val recompress_pdf : Pdf.t -> Pdf.t
|
||||
|
||||
(** Decompresses all streams in a PDF document, assuming it isn't encrypted. *)
|
||||
val decompress_pdf : Pdf.t -> Pdf.t
|
||||
|
||||
(** Squeeze a PDF *)
|
||||
val squeeze : ?logto:string -> ?pagedata:bool -> ?recompress:bool -> Pdf.t -> unit
|
Loading…
Reference in New Issue