Squeeze now works on fast_write_split_pdfs, i.e. for -split and -split-bookmarks
This commit is contained in:
John Whitington 2014-09-30 17:43:04 +01:00
parent 5a2081d077
commit e94b01605b
4 changed files with 148 additions and 141 deletions

166
cpdf.ml
View File

@ -2,6 +2,137 @@
open Pdfutil open Pdfutil
open Pdfio open Pdfio
(* Flate-compress a single stream object in place, but only when it currently
   has no filter (no /Filter entry, or an empty /Filter array) and is not a
   /Metadata stream. Streams that already carry a filter, and metadata
   streams, are left untouched. Must only be called on Pdf.Stream values;
   anything else trips the assertion. *)
let recompress_stream pdf stream =
  match stream with
  | Pdf.Stream {contents = (dict, _)} ->
      let is_metadata =
        match Pdf.lookup_direct pdf "/Type" dict with
        | Some (Pdf.Name "/Metadata") -> true
        | _ -> false
      in
      let is_unfiltered =
        match Pdf.lookup_direct pdf "/Filter" dict with
        | None | Some (Pdf.Array []) -> true
        | _ -> false
      in
      (* Metadata takes priority: it is never compressed, filtered or not. *)
      if is_unfiltered && not is_metadata then
        Pdfcodec.encode_pdfstream pdf Pdfcodec.Flate stream
  | _ -> assert false
(* Run recompress_stream over every stream of [pdf], unless the document is
   encrypted, in which case nothing is touched. The streams are modified in
   place; the same [pdf] value is returned for convenient chaining. *)
let recompress_pdf pdf =
  begin match Pdfcrypt.is_encrypted pdf with
  | true -> ()
  | false -> Pdf.iter_stream (recompress_stream pdf) pdf
  end;
  pdf
(* Apply Pdfcodec.decode_pdfstream_until_unknown to every stream of [pdf],
   unless the document is encrypted, in which case it is returned untouched.
   The same [pdf] value is returned in both cases. *)
let decompress_pdf pdf =
  if Pdfcrypt.is_encrypted pdf then pdf
  else begin
    Pdf.iter_stream (Pdfcodec.decode_pdfstream_until_unknown pdf) pdf;
    pdf
  end
(* Ordering on PDF objects, given by object number. Looks up objects [x] and
   [y] in [pdf], calls Pdf.getstream on whichever of them are streams so that
   the stream data is loaded before comparison, and returns the result of the
   polymorphic [compare] on the two objects (0 when they are structurally
   equal). Suitable both as a sort comparator and as an equality test. *)
let pdfobjeq pdf x y =
  let load_if_stream = function
    | Pdf.Stream _ as s -> Pdf.getstream s
    | _ -> ()
  in
  let obj_x = Pdf.lookup_obj pdf x
  and obj_y = Pdf.lookup_obj pdf y in
  load_if_stream obj_x;
  load_if_stream obj_y;
  compare obj_x obj_y
(* One pass of duplicate-object removal, performed in place on [pdf].

   Object numbers are sorted and grouped with pdfobjeq, so that each group
   ("pool") holds the numbers of objects which compare equal. Every pool of
   two or more is collapsed onto its first member by renumbering, and the
   objects which are no longer referenced are then removed.

   FIXME: We need to be able to do squeeze on encrypted files, which at the
   moment thinks it has a permissions problem. *)
let really_squeeze pdf =
  (* Gather every object number in the document. *)
  let objnums = ref [] in
  Pdf.objiter (fun n _ -> objnums := n :: !objnums) pdf;
  let cmp = pdfobjeq pdf in
  let sorted = sort cmp !objnums in
  (* Only pools with at least two members contain anything to merge. *)
  let pools = keep (fun pool -> length pool > 1) (collate cmp sorted) in
  (* Remove any pools of objects which are page objects, since Adobe Reader
     gets confused when there are duplicate page objects. *)
  let is_page objnum =
    match Pdf.lookup_direct pdf "/Type" (Pdf.lookup_obj pdf objnum) with
    | Some (Pdf.Name "/Page") -> true
    | _ -> false
  in
  let pools =
    option_map
      (function
        | [] -> assert false
        | representative :: _ as pool ->
            if is_page representative then None else Some pool)
      pools
  in
  (* Map each duplicate's number to the number of its pool's first member. *)
  let changetable = Hashtbl.create 100 in
  iter
    (function
      | [] -> assert false
      | representative :: duplicates ->
          iter
            (fun dup -> Hashtbl.add changetable dup representative)
            duplicates)
    pools;
  (* For an unknown reason, the output file is much smaller if Pdf.renumber
     is run twice. This is bizarre, since Pdf.renumber is an old,
     well-understood function in use for years -- what is going on? *)
  let renumbered = Pdf.renumber changetable (Pdf.renumber changetable pdf) in
  Pdf.remove_unreferenced renumbered;
  (* Copy the result back so that the caller's [pdf] is updated in place. *)
  pdf.Pdf.root <- renumbered.Pdf.root;
  pdf.Pdf.objects <- renumbered.Pdf.objects;
  pdf.Pdf.trailerdict <- renumbered.Pdf.trailerdict
(* For every object of [pdf] which is a dictionary with /Type /Page, parse
   the page's /Contents (an indirect reference, an indirect reference to an
   array, or a direct array) into operators, write those operators back out
   as one fresh stream object, and point the page's /Contents at it. Pages
   with no usable /Contents entry are left alone. Works in place. *)
let squeeze_all_content_streams pdf =
  (* The list of content stream objects for a page dictionary's entries.
     Raises Not_found when there is no /Contents of a recognised shape. *)
  let contents_of entries =
    match lookup "/Contents" entries with
    | Some (Pdf.Array streams) -> streams
    | Some (Pdf.Indirect i) ->
        begin match Pdf.direct pdf (Pdf.Indirect i) with
        | Pdf.Array streams -> streams
        | _ -> [Pdf.Indirect i]
        end
    | _ -> raise Not_found
  in
  let squeeze_page objnum page entries =
    let resources =
      match Pdf.lookup_direct pdf "/Resources" page with
      | Some r -> r
      | None -> Pdf.Dictionary []
    in
    (* The handler covers parsing and rewriting as well as the lookup, so a
       Not_found escaping from any of those steps also leaves the page as
       it was. *)
    try
      let ops = Pdfops.parse_operators pdf resources (contents_of entries) in
      let newstream = Pdfops.stream_of_ops ops in
      let streamnum = Pdf.addobj pdf newstream in
      let newpage =
        Pdf.add_dict_entry page "/Contents" (Pdf.Indirect streamnum)
      in
      (* Overwrite the page object under its existing number. *)
      Pdf.addobj_given_num pdf (objnum, newpage)
    with
      (* No /Contents, which is ok. *)
      Not_found -> ()
  in
  Pdf.objiter
    (fun objnum _ ->
       match Pdf.lookup_obj pdf objnum with
       | Pdf.Dictionary entries as page
         when Pdf.lookup_direct pdf "/Type" page = Some (Pdf.Name "/Page") ->
           squeeze_page objnum page entries
       | _ -> ())
    pdf
(* Squeeze [pdf] in place.

   really_squeeze is run repeatedly until a pass no longer reduces the number
   of objects in the object map (a fixed point in its cardinality). Page
   content streams are then re-encoded and the document is recompressed.
   Progress is reported on standard output.

   Raises Pdf.PDFError if any stage raises. The message includes the text of
   the underlying exception so that the real cause is not lost.

   NOTE(review): progress goes to stdout; if a caller also writes the PDF
   itself to stdout these messages would be interleaved with it -- confirm. *)
let squeeze pdf =
  try
    let initial = Pdf.objcard pdf in
    Printf.printf "Beginning squeeze: %i objects\n%!" initial;
    (* Keep squeezing for as long as a pass strictly shrinks the object map. *)
    let rec shrink previous =
      really_squeeze pdf;
      let current = Pdf.objcard pdf in
      if current < previous then begin
        Printf.printf "Squeezing... Down to %i objects\n%!" current;
        shrink current
      end
    in
    shrink initial;
    Printf.printf "Squeezing page data\n%!";
    squeeze_all_content_streams pdf;
    Printf.printf "Recompressing document\n%!";
    ignore (recompress_pdf pdf)
  with e ->
    (* Same exception type as before, but keep the cause for diagnosis. *)
    raise
      (Pdf.PDFError
         ("Squeeze failed. No output written: " ^ Printexc.to_string e))
(* Printf implementation *) (* Printf implementation *)
exception PrintfFailure of string exception PrintfFailure of string
@ -429,30 +560,7 @@ let print_pdf_objs pdf =
Printf.printf "%s\n" (Pdfwrite.string_of_pdf obj)) Printf.printf "%s\n" (Pdfwrite.string_of_pdf obj))
pdf pdf
(* Flate-compress a single stream object in place, but only when it currently
   has no filter (no /Filter entry, or an empty /Filter array) and is not a
   /Metadata stream. Streams that already carry a filter, and metadata
   streams, are left untouched. Must only be called on Pdf.Stream values;
   anything else trips the assertion. *)
let recompress_stream pdf stream =
  match stream with
  | Pdf.Stream {contents = (dict, _)} ->
      let is_metadata =
        match Pdf.lookup_direct pdf "/Type" dict with
        | Some (Pdf.Name "/Metadata") -> true
        | _ -> false
      in
      let is_unfiltered =
        match Pdf.lookup_direct pdf "/Filter" dict with
        | None | Some (Pdf.Array []) -> true
        | _ -> false
      in
      (* Metadata takes priority: it is never compressed, filtered or not. *)
      if is_unfiltered && not is_metadata then
        Pdfcodec.encode_pdfstream pdf Pdfcodec.Flate stream
  | _ -> assert false
(* Run recompress_stream over every stream of [pdf], unless the document is
   encrypted, in which case nothing is touched. The streams are modified in
   place; the same [pdf] value is returned for convenient chaining. *)
let recompress_pdf pdf =
  begin match Pdfcrypt.is_encrypted pdf with
  | true -> ()
  | false -> Pdf.iter_stream (recompress_stream pdf) pdf
  end;
  pdf
(* Apply Pdfcodec.decode_pdfstream_until_unknown to every stream of [pdf],
   unless the document is encrypted, in which case it is returned untouched.
   The same [pdf] value is returned in both cases. *)
let decompress_pdf pdf =
  if Pdfcrypt.is_encrypted pdf then pdf
  else begin
    Pdf.iter_stream (Pdfcodec.decode_pdfstream_until_unknown pdf) pdf;
    pdf
  end
(* Return page label at pdf page num, or page number in arabic if no label *) (* Return page label at pdf page num, or page number in arabic if no label *)
let pagelabel pdf num = let pagelabel pdf num =
@ -1274,7 +1382,7 @@ let name_of_spec printf marks (pdf : Pdf.t) splitlevel spec n filename startpage
let stem s = let stem s =
implode (rev (tail_no_fail (dropwhile (neq '.') (rev (explode (Filename.basename s)))))) implode (rev (tail_no_fail (dropwhile (neq '.') (rev (explode (Filename.basename s))))))
let fast_write_split_pdfs enc printf splitlevel original_filename linearize preserve_objstm create_objstm nobble spec main_pdf pagenums pdf_pages = let fast_write_split_pdfs enc printf splitlevel original_filename linearize preserve_objstm create_objstm sq nobble spec main_pdf pagenums pdf_pages =
let marks = Pdfmarks.read_bookmarks main_pdf in let marks = Pdfmarks.read_bookmarks main_pdf in
iter2 iter2
(fun number pagenums -> (fun number pagenums ->
@ -1282,14 +1390,15 @@ let fast_write_split_pdfs enc printf splitlevel original_filename linearize pres
let startpage, endpage = extremes pagenums in let startpage, endpage = extremes pagenums in
let name = name_of_spec printf marks main_pdf splitlevel spec number (stem original_filename) startpage endpage in let name = name_of_spec printf marks main_pdf splitlevel spec number (stem original_filename) startpage endpage in
Pdf.remove_unreferenced pdf; Pdf.remove_unreferenced pdf;
if sq then squeeze pdf;
Pdfwrite.pdf_to_file_options ~preserve_objstm ~generate_objstm:create_objstm linearize enc (not (enc = None)) pdf name) Pdfwrite.pdf_to_file_options ~preserve_objstm ~generate_objstm:create_objstm linearize enc (not (enc = None)) pdf name)
(indx pagenums) (indx pagenums)
pagenums pagenums
let split_pdf enc printf original_filename chunksize linearize ~preserve_objstm ~create_objstm nobble spec pdf = let split_pdf enc printf original_filename chunksize linearize ~preserve_objstm ~create_objstm ~squeeze nobble spec pdf =
let pdf_pages = Pdfpage.pages_of_pagetree pdf in let pdf_pages = Pdfpage.pages_of_pagetree pdf in
fast_write_split_pdfs enc printf 0 original_filename linearize preserve_objstm fast_write_split_pdfs enc printf 0 original_filename linearize preserve_objstm
create_objstm nobble spec pdf (splitinto chunksize (indx pdf_pages)) pdf_pages create_objstm squeeze nobble spec pdf (splitinto chunksize (indx pdf_pages)) pdf_pages
(* Return list, in order, a *set* of page numbers of bookmarks at a given level *) (* Return list, in order, a *set* of page numbers of bookmarks at a given level *)
let bookmark_pages level pdf = let bookmark_pages level pdf =
@ -1298,7 +1407,7 @@ let bookmark_pages level pdf =
(function l when l.Pdfmarks.level = level -> Some (Pdfpage.pagenumber_of_target pdf l.Pdfmarks.target) | _ -> None) (function l when l.Pdfmarks.level = level -> Some (Pdfpage.pagenumber_of_target pdf l.Pdfmarks.target) | _ -> None)
(Pdfmarks.read_bookmarks pdf)) (Pdfmarks.read_bookmarks pdf))
let split_at_bookmarks original_filename linearize ~preserve_objstm ~create_objstm nobble level spec pdf = let split_at_bookmarks original_filename linearize ~preserve_objstm ~create_objstm ~squeeze nobble level spec pdf =
let pdf_pages = Pdfpage.pages_of_pagetree pdf in let pdf_pages = Pdfpage.pages_of_pagetree pdf in
let points = bookmark_pages level pdf in let points = bookmark_pages level pdf in
let points = let points =
@ -1306,7 +1415,7 @@ let split_at_bookmarks original_filename linearize ~preserve_objstm ~create_objs
in in
let pts = splitat points (indx pdf_pages) in let pts = splitat points (indx pdf_pages) in
fast_write_split_pdfs None false level fast_write_split_pdfs None false level
original_filename linearize preserve_objstm create_objstm nobble spec pdf pts pdf_pages original_filename linearize preserve_objstm create_objstm squeeze nobble spec pdf pts pdf_pages
(* Called from cpdflib.ml - different from above *) (* Called from cpdflib.ml - different from above *)
let split_on_bookmarks pdf level = let split_on_bookmarks pdf level =
@ -3290,3 +3399,4 @@ let add_page_labels pdf style prefix startval range =
iter (fun x -> flprint (Pdfpagelabels.string_of_pagelabel x)) !labels;*) iter (fun x -> flprint (Pdfpagelabels.string_of_pagelabel x)) !labels;*)
Pdfpagelabels.write pdf !labels Pdfpagelabels.write pdf !labels

View File

@ -189,7 +189,7 @@ val stamp : bool -> bool -> bool -> int list -> Pdf.t -> Pdf.t -> Pdf.t
the given level, writing to files with names given by [spec] (see the given level, writing to files with names given by [spec] (see
cpdfmanual.pdf). [nobble] is undocumented and should be false. If [linearize] cpdfmanual.pdf). [nobble] is undocumented and should be false. If [linearize]
is true, the files will be linearized. *) is true, the files will be linearized. *)
val split_at_bookmarks : string -> bool -> preserve_objstm:bool -> create_objstm:bool -> (Pdf.t -> Pdf.t) -> int -> string -> Pdf.t -> unit val split_at_bookmarks : string -> bool -> preserve_objstm:bool -> create_objstm:bool -> squeeze:bool -> (Pdf.t -> Pdf.t) -> int -> string -> Pdf.t -> unit
(** Split a PDF on bookmarks of a given level or below. Level 0 is top level. *) (** Split a PDF on bookmarks of a given level or below. Level 0 is top level. *)
val split_on_bookmarks : Pdf.t -> int -> Pdf.t list val split_on_bookmarks : Pdf.t -> int -> Pdf.t list
@ -202,7 +202,7 @@ be used if the input file had them. If [create_objstm] is true, object
streams will be created in any event. [printf] and [nobble] are streams will be created in any event. [printf] and [nobble] are
undocumented and should be false. *) undocumented and should be false. *)
val split_pdf : Pdfwrite.encryption option -> bool -> string -> int -> bool -> val split_pdf : Pdfwrite.encryption option -> bool -> string -> int -> bool ->
preserve_objstm:bool -> create_objstm:bool -> (Pdf.t -> Pdf.t) -> string -> Pdf.t -> unit preserve_objstm:bool -> create_objstm:bool -> squeeze:bool -> (Pdf.t -> Pdf.t) -> string -> Pdf.t -> unit
(** {2 Listing fonts} *) (** {2 Listing fonts} *)
@ -426,6 +426,9 @@ val blackfills : int list -> Pdf.t -> Pdf.t
(** Remove images from a PDF, optionally adding crossed boxes. *) (** Remove images from a PDF, optionally adding crossed boxes. *)
val draft : bool -> int list -> Pdf.t -> Pdf.t val draft : bool -> int list -> Pdf.t -> Pdf.t
(** Squeeze a PDF in place: objects which compare equal are repeatedly merged
    until the object count stops falling, page content streams are re-encoded,
    and the document is recompressed. Progress messages are printed to
    standard output.
    @raise Pdf.PDFError if squeezing fails. *)
val squeeze : Pdf.t -> unit
(**/**) (**/**)
(** Custom CSP1 *) (** Custom CSP1 *)

View File

@ -1772,112 +1772,6 @@ let rec writing_ok outname =
else else
outname outname
(* Ordering on PDF objects, given by object number. Looks up objects [x] and
   [y] in [pdf], calls Pdf.getstream on whichever of them are streams so that
   the stream data is loaded before comparison, and returns the result of the
   polymorphic [compare] on the two objects (0 when they are structurally
   equal). Suitable both as a sort comparator and as an equality test. *)
let pdfobjeq pdf x y =
  let load_if_stream = function
    | Pdf.Stream _ as s -> Pdf.getstream s
    | _ -> ()
  in
  let obj_x = Pdf.lookup_obj pdf x
  and obj_y = Pdf.lookup_obj pdf y in
  load_if_stream obj_x;
  load_if_stream obj_y;
  compare obj_x obj_y
(* One pass of duplicate-object removal, performed in place on [pdf].

   Object numbers are sorted and grouped with pdfobjeq, so that each group
   ("pool") holds the numbers of objects which compare equal. Every pool of
   two or more is collapsed onto its first member by renumbering, and the
   objects which are no longer referenced are then removed.

   FIXME: We need to be able to do squeeze on encrypted files, which at the
   moment thinks it has a permissions problem. *)
let really_squeeze pdf =
  (* Gather every object number in the document. *)
  let objnums = ref [] in
  Pdf.objiter (fun n _ -> objnums := n :: !objnums) pdf;
  let cmp = pdfobjeq pdf in
  let sorted = sort cmp !objnums in
  (* Only pools with at least two members contain anything to merge. *)
  let pools = keep (fun pool -> length pool > 1) (collate cmp sorted) in
  (* Remove any pools of objects which are page objects, since Adobe Reader
     gets confused when there are duplicate page objects. *)
  let is_page objnum =
    match Pdf.lookup_direct pdf "/Type" (Pdf.lookup_obj pdf objnum) with
    | Some (Pdf.Name "/Page") -> true
    | _ -> false
  in
  let pools =
    option_map
      (function
        | [] -> assert false
        | representative :: _ as pool ->
            if is_page representative then None else Some pool)
      pools
  in
  (* Map each duplicate's number to the number of its pool's first member. *)
  let changetable = Hashtbl.create 100 in
  iter
    (function
      | [] -> assert false
      | representative :: duplicates ->
          iter
            (fun dup -> Hashtbl.add changetable dup representative)
            duplicates)
    pools;
  (* For an unknown reason, the output file is much smaller if Pdf.renumber
     is run twice. This is bizarre, since Pdf.renumber is an old,
     well-understood function in use for years -- what is going on? *)
  let renumbered = Pdf.renumber changetable (Pdf.renumber changetable pdf) in
  Pdf.remove_unreferenced renumbered;
  (* Copy the result back so that the caller's [pdf] is updated in place. *)
  pdf.Pdf.root <- renumbered.Pdf.root;
  pdf.Pdf.objects <- renumbered.Pdf.objects;
  pdf.Pdf.trailerdict <- renumbered.Pdf.trailerdict
(* For every object of [pdf] which is a dictionary with /Type /Page, parse
   the page's /Contents (an indirect reference, an indirect reference to an
   array, or a direct array) into operators, write those operators back out
   as one fresh stream object, and point the page's /Contents at it. Pages
   with no usable /Contents entry are left alone. Works in place. *)
let squeeze_all_content_streams pdf =
  (* The list of content stream objects for a page dictionary's entries.
     Raises Not_found when there is no /Contents of a recognised shape. *)
  let contents_of entries =
    match lookup "/Contents" entries with
    | Some (Pdf.Array streams) -> streams
    | Some (Pdf.Indirect i) ->
        begin match Pdf.direct pdf (Pdf.Indirect i) with
        | Pdf.Array streams -> streams
        | _ -> [Pdf.Indirect i]
        end
    | _ -> raise Not_found
  in
  let squeeze_page objnum page entries =
    let resources =
      match Pdf.lookup_direct pdf "/Resources" page with
      | Some r -> r
      | None -> Pdf.Dictionary []
    in
    (* The handler covers parsing and rewriting as well as the lookup, so a
       Not_found escaping from any of those steps also leaves the page as
       it was. *)
    try
      let ops = Pdfops.parse_operators pdf resources (contents_of entries) in
      let newstream = Pdfops.stream_of_ops ops in
      let streamnum = Pdf.addobj pdf newstream in
      let newpage =
        Pdf.add_dict_entry page "/Contents" (Pdf.Indirect streamnum)
      in
      (* Overwrite the page object under its existing number. *)
      Pdf.addobj_given_num pdf (objnum, newpage)
    with
      (* No /Contents, which is ok. *)
      Not_found -> ()
  in
  Pdf.objiter
    (fun objnum _ ->
       match Pdf.lookup_obj pdf objnum with
       | Pdf.Dictionary entries as page
         when Pdf.lookup_direct pdf "/Type" page = Some (Pdf.Name "/Page") ->
           squeeze_page objnum page entries
       | _ -> ())
    pdf
(* Squeeze [pdf] in place.

   really_squeeze is run repeatedly until a pass no longer reduces the number
   of objects in the object map (a fixed point in its cardinality). Page
   content streams are then re-encoded and the document is recompressed.
   Progress is reported on standard output.

   Raises Pdf.PDFError if any stage raises. The message includes the text of
   the underlying exception so that the real cause is not lost.

   NOTE(review): progress goes to stdout; if the PDF itself is also written
   to stdout these messages would be interleaved with it -- confirm. *)
let squeeze pdf =
  try
    let initial = Pdf.objcard pdf in
    Printf.printf "Beginning squeeze: %i objects\n%!" initial;
    (* Keep squeezing for as long as a pass strictly shrinks the object map. *)
    let rec shrink previous =
      really_squeeze pdf;
      let current = Pdf.objcard pdf in
      if current < previous then begin
        Printf.printf "Squeezing... Down to %i objects\n%!" current;
        shrink current
      end
    in
    shrink initial;
    Printf.printf "Squeezing page data\n%!";
    squeeze_all_content_streams pdf;
    Printf.printf "Recompressing document\n%!";
    ignore (Cpdf.recompress_pdf pdf)
  with e ->
    (* Same exception type as before, but keep the cause for diagnosis. *)
    raise
      (Pdf.PDFError
         ("Squeeze failed. No output written: " ^ Printexc.to_string e))
let write_pdf mk_id pdf = let write_pdf mk_id pdf =
if args.create_objstm && not args.keepversion if args.create_objstm && not args.keepversion
then pdf.Pdf.minor <- max pdf.Pdf.minor 5; then pdf.Pdf.minor <- max pdf.Pdf.minor 5;
@ -1888,7 +1782,7 @@ let write_pdf mk_id pdf =
| File outname -> | File outname ->
let outname = writing_ok outname in let outname = writing_ok outname in
let pdf = Cpdf.recompress_pdf <| nobble pdf in let pdf = Cpdf.recompress_pdf <| nobble pdf in
if args.squeeze then squeeze pdf; if args.squeeze then Cpdf.squeeze pdf;
Pdf.remove_unreferenced pdf; Pdf.remove_unreferenced pdf;
Pdfwrite.pdf_to_file_options Pdfwrite.pdf_to_file_options
~preserve_objstm:args.preserve_objstm ~preserve_objstm:args.preserve_objstm
@ -1896,7 +1790,7 @@ let write_pdf mk_id pdf =
args.linearize None mk_id pdf outname args.linearize None mk_id pdf outname
| Stdout -> | Stdout ->
let pdf = Cpdf.recompress_pdf <| nobble pdf in let pdf = Cpdf.recompress_pdf <| nobble pdf in
if args.squeeze then squeeze pdf; if args.squeeze then Cpdf.squeeze pdf;
Pdf.remove_unreferenced pdf; Pdf.remove_unreferenced pdf;
Pdfwrite.pdf_to_channel Pdfwrite.pdf_to_channel
~preserve_objstm:args.preserve_objstm ~preserve_objstm:args.preserve_objstm
@ -3180,7 +3074,8 @@ let go () =
Printf.printf "original filename: %s\n" args.original_filename; Printf.printf "original filename: %s\n" args.original_filename;
Cpdf.split_pdf Cpdf.split_pdf
enc args.printf_format args.original_filename args.chunksize args.linearize enc args.printf_format args.original_filename args.chunksize args.linearize
args.preserve_objstm args.preserve_objstm (*yes--always create if preserving *) nobble output_spec pdf args.preserve_objstm args.preserve_objstm (*yes--always create if preserving *)
args.squeeze nobble output_spec pdf
| _, Stdout -> error "Can't split to standard output" | _, Stdout -> error "Can't split to standard output"
| _, NoOutputSpecified -> error "Split: No output format specified" | _, NoOutputSpecified -> error "Split: No output format specified"
| _ -> error "Split: bad parameters" | _ -> error "Split: bad parameters"
@ -3331,7 +3226,7 @@ let go () =
| _ -> "" | _ -> ""
in in
Cpdf.split_at_bookmarks filename args.linearize args.preserve_objstm Cpdf.split_at_bookmarks filename args.linearize args.preserve_objstm
(* Yes *)args.preserve_objstm nobble level output_spec pdf (* Yes *)args.preserve_objstm args.squeeze nobble level output_spec pdf
| Stdout -> error "Can't split to standard output" | Stdout -> error "Can't split to standard output"
| NoOutputSpecified -> error "Split: No output format specified" | NoOutputSpecified -> error "Split: No output format specified"
end end

View File

@ -9,4 +9,3 @@ val go_withargv : string array -> unit
(**/**) (**/**)
val demo : bool val demo : bool
(** [pdfobjeq pdf x y] compares the objects numbered [x] and [y] in [pdf],
    loading stream data first, and returns the result of [compare] on them
    (0 when the two objects are structurally equal). *)
val pdfobjeq : Pdf.t -> int -> int -> int