Squeeze basics done. Encryption, xobjects to do.

This commit is contained in:
John Whitington 2014-09-24 12:07:14 +01:00
parent 274d580f98
commit 27a40e84b4
2 changed files with 44 additions and 37 deletions

View File

@ -1767,19 +1767,13 @@ let rec writing_ok outname =
else
outname
(* Equality on PDF streams *)
(* Equality on PDF objects *)
(* [pdfobjeq pdf x y] is an ordering on the objects of [pdf] numbered [x] and
   [y]: both numbers are resolved with Pdf.lookup_obj and the resulting
   objects are handed to the polymorphic [compare], whose int result is
   returned. really_squeeze passes it to both [sort] and [collate] to group
   identical objects. *)
let pdfobjeq pdf x y =
let x = Pdf.lookup_obj pdf x
and y = Pdf.lookup_obj pdf y in
(* Pdf.getstream is called on stream objects before comparing. Presumably it
   forces the stream data to be loaded so that [compare] sees the actual
   contents -- TODO confirm against the Pdf module. *)
begin match x with Pdf.Stream _ -> Pdf.getstream x | _ -> () end;
begin match y with Pdf.Stream _ -> Pdf.getstream y | _ -> () end;
(* NOTE(review): this diff view interleaves removed and added lines. The
   [match] below, with its commented-out special case for /Type /Page
   dictionaries, appears to be the pre-change body; the bare [compare x y]
   after it is the replacement. *)
match x with
(*Pdf.Dictionary _
when
Pdf.lookup_direct pdf "/Type" x = Some (Pdf.Name "/Page")
->
(-1)*) (* FIXME *)
| _ -> compare x y
compare x y
(* FIXME: We need to be able to squeeze encrypted files. At the moment,
 * squeeze thinks it has a permissions problem with them. *)
@ -1791,25 +1785,34 @@ let really_squeeze pdf =
(fun x -> length x > 1)
(collate (pdfobjeq pdf) (sort (pdfobjeq pdf) !objs))
in
(*Printf.printf "Found %i pools of duplicate objects to coalesce\n" (length toprocess);*)
(*List.iter
(fun x -> Printf.printf "\n\nPool: "; List.iter (Printf.printf "%i ") x)
toprocess;*)
let pdfr = ref pdf in
let changetable = Hashtbl.create 100 in
iter
(function [] -> assert false | h::t ->
iter (fun e -> Hashtbl.add changetable e h) t)
toprocess;
(* For an unknown reason, the output file is much smaller if
Pdf.renumber is run twice. This is bizarre, since Pdf.renumber is
an old, well-understood function in use for years -- what is
going on? *)
pdfr := Pdf.renumber changetable !pdfr;
pdfr := Pdf.renumber changetable !pdfr;
pdf.Pdf.root <- !pdfr.Pdf.root;
pdf.Pdf.objects <- !pdfr.Pdf.objects;
pdf.Pdf.trailerdict <- !pdfr.Pdf.trailerdict
(* Remove any pools of objects which are page objects, since Adobe Reader
* gets confused when there are duplicate page objects. *)
let toprocess =
option_map
(function
[] -> assert false
| h::_ as l ->
match Pdf.lookup_direct pdf "/Type" (Pdf.lookup_obj pdf h) with
Some (Pdf.Name "/Page") -> None
| _ -> Some l)
toprocess
in
let pdfr = ref pdf in
let changetable = Hashtbl.create 100 in
iter
(function [] -> assert false | h::t ->
iter (fun e -> Hashtbl.add changetable e h) t)
toprocess;
(* For an unknown reason, the output file is much smaller if
Pdf.renumber is run twice. This is bizarre, since Pdf.renumber is
an old, well-understood function in use for years -- what is
going on? *)
pdfr := Pdf.renumber changetable !pdfr;
pdfr := Pdf.renumber changetable !pdfr;
Pdf.remove_unreferenced !pdfr;
pdf.Pdf.root <- !pdfr.Pdf.root;
pdf.Pdf.objects <- !pdfr.Pdf.objects;
pdf.Pdf.trailerdict <- !pdfr.Pdf.trailerdict
(* For each object in the PDF marked with /Type /Page, for each /Contents
indirect reference or array of such, decode and recode that content stream. *)
@ -1856,16 +1859,19 @@ let squeeze_all_content_streams pdf =
(* We run squeeze enough times to reach a fixed point in the cardinality of the
 * object map.
 *
 * [squeeze pdf] calls [really_squeeze pdf] repeatedly for as long as each pass
 * lowers [Pdf.objcard pdf], then runs [squeeze_all_content_streams pdf] and
 * [Cpdf.recompress_pdf pdf] (result ignored). Progress messages are printed
 * with Printf.printf and flushed immediately via [%!]. *)
let squeeze pdf =
(* NOTE(review): this diff view interleaves removed and added lines. The
   statements from here down to the first [ignore (Cpdf.recompress_pdf pdf)]
   appear to be the pre-change body; the [try ... with] form after it is the
   replacement and performs the same steps inside an exception handler. *)
let n = ref (Pdf.objcard pdf) in
Printf.printf "Beginning squeeze: %i objects\n%!" (Pdf.objcard pdf);
while !n > (ignore (really_squeeze pdf); Pdf.objcard pdf) do
n := Pdf.objcard pdf;
Printf.printf "Squeezing... Down to %i objects\n%!" (Pdf.objcard pdf);
done;
Printf.printf "Squeezing page data\n%!";
squeeze_all_content_streams pdf;
Printf.printf "Recompressing document\n%!";
ignore (Cpdf.recompress_pdf pdf)
try
(* [n] holds the object count seen before the most recent pass. *)
let n = ref (Pdf.objcard pdf) in
Printf.printf "Beginning squeeze: %i objects\n%!" (Pdf.objcard pdf);
(* The loop condition itself runs one really_squeeze pass as a side effect and
   then reads the new object count. The loop continues only while that pass
   strictly reduced the count. *)
while !n > (ignore (really_squeeze pdf); Pdf.objcard pdf) do
n := Pdf.objcard pdf;
Printf.printf "Squeezing... Down to %i objects\n%!" (Pdf.objcard pdf);
done;
Printf.printf "Squeezing page data\n%!";
squeeze_all_content_streams pdf;
Printf.printf "Recompressing document\n%!";
ignore (Cpdf.recompress_pdf pdf)
with
(* NOTE(review): catch-all handler. Every exception raised above, whatever its
   cause, is replaced by this Pdf.PDFError and the original exception [e] is
   neither reported nor re-raised. *)
e -> raise (Pdf.PDFError "Squeeze failed. No output written")
let write_pdf mk_id pdf =
if args.create_objstm && not args.keepversion

View File

@ -9,3 +9,4 @@ val go_withargv : string array -> unit
(**/**)
val demo : bool
(* [pdfobjeq pdf x y] compares the objects of [pdf] with numbers [x] and [y].
   The int returned is the result of the polymorphic [compare] applied to the
   two looked-up objects. *)
val pdfobjeq : Pdf.t -> int -> int -> int