Avoid squeezing shared content streams

This commit is contained in:
John Whitington 2014-10-11 14:17:24 +01:00
parent ef5f6b13e4
commit 2f8beb0e0d
1 changed files with 89 additions and 53 deletions

142
cpdf.ml
View File

@ -109,9 +109,13 @@ let really_squeeze pdf =
pdf.Pdf.objects <- !pdfr.Pdf.objects; pdf.Pdf.objects <- !pdfr.Pdf.objects;
pdf.Pdf.trailerdict <- !pdfr.Pdf.trailerdict pdf.Pdf.trailerdict <- !pdfr.Pdf.trailerdict
(* Squeeze the form xobject at objnum. FIXME: For old PDFs (< v1.2) any (* Squeeze the form xobject at objnum.
resources from the page (or its ancestors in the page tree!) are also needed -
we must merge them with the ones from the xobject itself. *) FIXME: For old PDFs (< v1.2) any resources from the page (or its ancestors in
the page tree!) are also needed - we must merge them with the ones from the
xobject itself. However, it it safe for now -- in the unlikely event that the
resources actually need to be available, the parse will fail, the squeeze of
this object will fail, and we bail out. *)
let xobjects_done = ref [] let xobjects_done = ref []
let squeeze_form_xobject pdf objnum = let squeeze_form_xobject pdf objnum =
@ -142,60 +146,92 @@ let squeeze_form_xobject pdf objnum =
end end
| _ -> () | _ -> ()
(* For a list of indirects representing content streams, make sure that none of
them are duplicated in the PDF. This indicates sharing, which parsing and
rewriting the streams might destroy, thus making the file bigger. FIXME: The
correct thing to do is to preserve the multiple content streams. *)
let no_duplicates content_stream_numbers stream_numbers =
not
(mem false
(map
(fun n -> length (keep (eq n) content_stream_numbers) < 2)
stream_numbers))
(* Give a list of content stream numbers, given a page reference number *)
let content_streams_of_page pdf refnum =
match Pdf.direct pdf (Pdf.lookup_obj pdf refnum) with
Pdf.Dictionary dict ->
begin match lookup "/Contents" dict with
Some (Pdf.Indirect i) -> [i]
| Some (Pdf.Array x) ->
option_map (function Pdf.Indirect i -> Some i | _ -> None) x
| _ -> []
end
| _ -> []
(* For each object in the PDF marked with /Type /Page, for each /Contents (* For each object in the PDF marked with /Type /Page, for each /Contents
indirect reference or array of such, decode and recode that content stream. *) indirect reference or array of such, decode and recode that content stream. *)
let squeeze_all_content_streams pdf = let squeeze_all_content_streams pdf =
xobjects_done := []; let page_reference_numbers = Pdf.page_reference_numbers pdf in
Pdf.objiter let all_content_streams_in_doc =
(fun objnum _ -> flatten (map (content_streams_of_page pdf) page_reference_numbers)
match Pdf.lookup_obj pdf objnum with in
Pdf.Dictionary dict as d xobjects_done := [];
when Pdf.objiter
Pdf.lookup_direct pdf "/Type" d = Some (Pdf.Name "/Page") (fun objnum _ ->
-> match Pdf.lookup_obj pdf objnum with
let resources = Pdf.Dictionary dict as d
match Pdf.lookup_direct pdf "/Resources" d with when
Some d -> d Pdf.lookup_direct pdf "/Type" d = Some (Pdf.Name "/Page")
| None -> Pdf.Dictionary [] ->
in let resources =
begin try match Pdf.lookup_direct pdf "/Resources" d with
let newstream = Some d -> d
let content_streams = | None -> Pdf.Dictionary []
match lookup "/Contents" dict with
Some (Pdf.Indirect i) ->
begin match Pdf.direct pdf (Pdf.Indirect i) with
Pdf.Array x -> x
| _ -> [Pdf.Indirect i]
end
| Some (Pdf.Array x) -> x
| _ -> raise Not_found
in
Pdfops.stream_of_ops
(Pdfops.parse_operators pdf resources content_streams)
in in
let newdict = begin try
Pdf.add_dict_entry let content_streams =
d "/Contents" (Pdf.Indirect (Pdf.addobj pdf newstream)) match lookup "/Contents" dict with
in Some (Pdf.Indirect i) ->
Pdf.addobj_given_num pdf (objnum, newdict); begin match Pdf.direct pdf (Pdf.Indirect i) with
(* Now process all xobjects related to this page *) Pdf.Array x -> x
begin match Pdf.lookup_direct pdf "/XObject" resources with | _ -> [Pdf.Indirect i]
Some (Pdf.Dictionary xobjs) -> end
iter | Some (Pdf.Array x) -> x
(function | _ -> raise Not_found
(_, Pdf.Indirect i) -> squeeze_form_xobject pdf i in
| _ -> failwith "squeeze_xobject") if
xobjs no_duplicates
| _ -> () all_content_streams_in_doc
end (map (function Pdf.Indirect i -> i | _ -> assert false) content_streams)
with then
(* No /Contents, which is ok. Or a parsing failure due to let newstream =
uninherited resources. FIXME: Add support for inherited Pdfops.stream_of_ops
resources. *) (Pdfops.parse_operators pdf resources content_streams)
Not_found -> () in
end let newdict =
| _ -> ()) Pdf.add_dict_entry
pdf d "/Contents" (Pdf.Indirect (Pdf.addobj pdf newstream))
in
Pdf.addobj_given_num pdf (objnum, newdict);
(* Now process all xobjects related to this page *)
begin match Pdf.lookup_direct pdf "/XObject" resources with
Some (Pdf.Dictionary xobjs) ->
iter
(function
(_, Pdf.Indirect i) -> squeeze_form_xobject pdf i
| _ -> failwith "squeeze_xobject")
xobjs
| _ -> ()
end
with
(* No /Contents, which is ok. Or a parsing failure due to
uninherited resources. FIXME: Add support for inherited
resources. *)
Not_found -> ()
end
| _ -> ())
pdf
(* We run squeeze enough times to reach a fixed point in the cardinality of the (* We run squeeze enough times to reach a fixed point in the cardinality of the
* object map *) * object map *)