Avoid squeezing shared content streams

This commit is contained in:
John Whitington 2014-10-11 14:17:24 +01:00
parent ef5f6b13e4
commit 2f8beb0e0d
1 changed files with 89 additions and 53 deletions

44
cpdf.ml
View File

@ -109,9 +109,13 @@ let really_squeeze pdf =
pdf.Pdf.objects <- !pdfr.Pdf.objects;
pdf.Pdf.trailerdict <- !pdfr.Pdf.trailerdict
(* Squeeze the form xobject at objnum. FIXME: For old PDFs (< v1.2) any
resources from the page (or its ancestors in the page tree!) are also needed -
we must merge them with the ones from the xobject itself. *)
(* Squeeze the form xobject at objnum.
FIXME: For old PDFs (< v1.2) any resources from the page (or its ancestors in
the page tree!) are also needed - we must merge them with the ones from the
xobject itself. However, it it safe for now -- in the unlikely event that the
resources actually need to be available, the parse will fail, the squeeze of
this object will fail, and we bail out. *)
let xobjects_done = ref []
let squeeze_form_xobject pdf objnum =
@ -142,9 +146,36 @@ let squeeze_form_xobject pdf objnum =
end
| _ -> ()
(* For a list of indirects representing content streams, make sure that none of
them are duplicated in the PDF. This indicates sharing, which parsing and
rewriting the streams might destroy, thus making the file bigger. FIXME: The
correct thing to do is to preserve the multiple content streams. *)
let no_duplicates content_stream_numbers stream_numbers =
not
(mem false
(map
(fun n -> length (keep (eq n) content_stream_numbers) < 2)
stream_numbers))
(* Give a list of content stream numbers, given a page reference number *)
let content_streams_of_page pdf refnum =
match Pdf.direct pdf (Pdf.lookup_obj pdf refnum) with
Pdf.Dictionary dict ->
begin match lookup "/Contents" dict with
Some (Pdf.Indirect i) -> [i]
| Some (Pdf.Array x) ->
option_map (function Pdf.Indirect i -> Some i | _ -> None) x
| _ -> []
end
| _ -> []
(* For each object in the PDF marked with /Type /Page, for each /Contents
indirect reference or array of such, decode and recode that content stream. *)
let squeeze_all_content_streams pdf =
let page_reference_numbers = Pdf.page_reference_numbers pdf in
let all_content_streams_in_doc =
flatten (map (content_streams_of_page pdf) page_reference_numbers)
in
xobjects_done := [];
Pdf.objiter
(fun objnum _ ->
@ -159,7 +190,6 @@ let squeeze_all_content_streams pdf =
| None -> Pdf.Dictionary []
in
begin try
let newstream =
let content_streams =
match lookup "/Contents" dict with
Some (Pdf.Indirect i) ->
@ -170,6 +200,12 @@ let squeeze_all_content_streams pdf =
| Some (Pdf.Array x) -> x
| _ -> raise Not_found
in
if
no_duplicates
all_content_streams_in_doc
(map (function Pdf.Indirect i -> i | _ -> assert false) content_streams)
then
let newstream =
Pdfops.stream_of_ops
(Pdfops.parse_operators pdf resources content_streams)
in