Avoid squeezing shared content streams

This commit is contained in:
John Whitington 2014-10-11 14:17:24 +01:00
parent ef5f6b13e4
commit 2f8beb0e0d
1 changed files with 89 additions and 53 deletions

44
cpdf.ml
View File

@ -109,9 +109,13 @@ let really_squeeze pdf =
pdf.Pdf.objects <- !pdfr.Pdf.objects; pdf.Pdf.objects <- !pdfr.Pdf.objects;
pdf.Pdf.trailerdict <- !pdfr.Pdf.trailerdict pdf.Pdf.trailerdict <- !pdfr.Pdf.trailerdict
(* Squeeze the form xobject at objnum. FIXME: For old PDFs (< v1.2) any (* Squeeze the form xobject at objnum.
resources from the page (or its ancestors in the page tree!) are also needed -
we must merge them with the ones from the xobject itself. *) FIXME: For old PDFs (< v1.2) any resources from the page (or its ancestors in
the page tree!) are also needed - we must merge them with the ones from the
xobject itself. However, it is safe for now -- in the unlikely event that the
resources actually need to be available, the parse will fail, the squeeze of
this object will fail, and we bail out. *)
let xobjects_done = ref [] let xobjects_done = ref []
let squeeze_form_xobject pdf objnum = let squeeze_form_xobject pdf objnum =
@ -142,9 +146,36 @@ let squeeze_form_xobject pdf objnum =
end end
| _ -> () | _ -> ()
(* [no_duplicates content_stream_numbers stream_numbers] is [true] when every
number in [stream_numbers] occurs fewer than two times in
[content_stream_numbers], and [false] as soon as one of them occurs twice or
more. The caller passes the content stream object numbers of every page in the
document as [content_stream_numbers] and those of a single page as
[stream_numbers], so a [false] result means that page shares a content stream.
Sharing matters because parsing and rewriting the streams might destroy it,
thus making the file bigger. An empty [stream_numbers] gives [true].
FIXME: The correct thing to do is to preserve the multiple content streams. *)
let no_duplicates content_stream_numbers stream_numbers =
  (* Count occurrences directly rather than building a filtered list. *)
  let occurrences n =
    List.fold_left
      (fun count m -> if m = n then count + 1 else count)
      0 content_stream_numbers
  in
  (* [List.for_all] stops at the first shared stream number. *)
  List.for_all (fun n -> occurrences n < 2) stream_numbers
(* Return the object numbers named by the /Contents entry of the page object
numbered [refnum]. A /Contents that is a single indirect reference yields a
one-element list. A /Contents that is an array yields the numbers of its
indirect elements; elements which are not indirect references map to [None]
and are presumably discarded by [option_map]. If the object is not a
dictionary, or /Contents is missing or of any other form, the result is the
empty list. *)
let content_streams_of_page pdf refnum =
  let indirect_number = function
    | Pdf.Indirect i -> Some i
    | _ -> None
  in
  let page = Pdf.direct pdf (Pdf.lookup_obj pdf refnum) in
  match page with
  | Pdf.Dictionary entries ->
      (match lookup "/Contents" entries with
       | Some (Pdf.Indirect i) -> [i]
       | Some (Pdf.Array items) -> option_map indirect_number items
       | _ -> [])
  | _ -> []
(* For each object in the PDF marked with /Type /Page, for each /Contents (* For each object in the PDF marked with /Type /Page, for each /Contents
indirect reference or array of such, decode and recode that content stream. *) indirect reference or array of such, decode and recode that content stream. *)
let squeeze_all_content_streams pdf = let squeeze_all_content_streams pdf =
let page_reference_numbers = Pdf.page_reference_numbers pdf in
let all_content_streams_in_doc =
flatten (map (content_streams_of_page pdf) page_reference_numbers)
in
xobjects_done := []; xobjects_done := [];
Pdf.objiter Pdf.objiter
(fun objnum _ -> (fun objnum _ ->
@ -159,7 +190,6 @@ let squeeze_all_content_streams pdf =
| None -> Pdf.Dictionary [] | None -> Pdf.Dictionary []
in in
begin try begin try
let newstream =
let content_streams = let content_streams =
match lookup "/Contents" dict with match lookup "/Contents" dict with
Some (Pdf.Indirect i) -> Some (Pdf.Indirect i) ->
@ -170,6 +200,12 @@ let squeeze_all_content_streams pdf =
| Some (Pdf.Array x) -> x | Some (Pdf.Array x) -> x
| _ -> raise Not_found | _ -> raise Not_found
in in
if
no_duplicates
all_content_streams_in_doc
(map (function Pdf.Indirect i -> i | _ -> assert false) content_streams)
then
let newstream =
Pdfops.stream_of_ops Pdfops.stream_of_ops
(Pdfops.parse_operators pdf resources content_streams) (Pdfops.parse_operators pdf resources content_streams)
in in