From 8726d3dee017969860c4ac0c33db5103ddc7a62d Mon Sep 17 00:00:00 2001 From: John Whitington Date: Sat, 18 Dec 2021 16:26:33 +0000 Subject: [PATCH] more --- Makefile | 2 +- cpdf.ml | 244 ----------------------------------------------- cpdf.mli | 10 -- cpdfcommand.ml | 14 +-- cpdfsqueeze.ml | 246 ++++++++++++++++++++++++++++++++++++++++++++++++ cpdfsqueeze.mli | 9 ++ 6 files changed, 263 insertions(+), 262 deletions(-) create mode 100644 cpdfsqueeze.ml create mode 100644 cpdfsqueeze.mli diff --git a/Makefile b/Makefile index 2c60f7b..20e857f 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ MODS = cpdfyojson cpdfxmlm \ cpdfunicodedata cpdferror cpdfjson cpdfstrftime cpdfcoord cpdfattach \ cpdfpagespec cpdfposition cpdf cpdfpresent cpdffont cpdftype \ - cpdftexttopdf cpdftoc cpdfpad cpdfocg cpdfcommand + cpdftexttopdf cpdftoc cpdfpad cpdfocg cpdfsqueeze cpdfcommand SOURCES = $(foreach x,$(MODS),$(x).ml $(x).mli) cpdfcommandrun.ml diff --git a/cpdf.ml b/cpdf.ml index 8da0b53..b01d95f 100644 --- a/cpdf.ml +++ b/cpdf.ml @@ -49,250 +49,6 @@ let xmp_template = |} -(* For debugging *) -let report_pdf_size pdf = - Pdf.remove_unreferenced pdf; - Pdfwrite.pdf_to_file_options ~preserve_objstm:false ~generate_objstm:false - ~compress_objstm:false false None false pdf "temp.pdf"; - let fh = open_in_bin "temp.pdf" in - Printf.printf "Size %i bytes\n" (in_channel_length fh); - flush stdout; - close_in fh - - -(* Recompress anything which isn't compressed, unless it's metadata. *) -let recompress_stream pdf = function - (* If there is no compression, compress with /FlateDecode *) - | Pdf.Stream {contents = (dict, _)} as stream -> - begin match - Pdf.lookup_direct pdf "/Filter" dict, - Pdf.lookup_direct pdf "/Type" dict - with - | _, Some (Pdf.Name "/Metadata") -> () - | (None | Some (Pdf.Array [])), _ -> - Pdfcodec.encode_pdfstream pdf Pdfcodec.Flate stream - | _ -> () - end - | _ -> assert false - -let recompress_pdf pdf = - if not (Pdfcrypt.is_encrypted pdf) then - Pdf.iter_stream (recompress_stream pdf) pdf; - pdf - -let decompress_pdf pdf = - if not (Pdfcrypt.is_encrypted pdf) then - (Pdf.iter_stream (Pdfcodec.decode_pdfstream_until_unknown pdf) pdf); - pdf - - -(* Equality on PDF objects *) -let pdfobjeq pdf x y = - let x = Pdf.lookup_obj pdf x - and y = Pdf.lookup_obj pdf y in - begin match x with Pdf.Stream _ -> Pdf.getstream x | _ -> () end; - begin match y with Pdf.Stream _ -> Pdf.getstream y | _ -> () end; - compare x y - -let really_squeeze pdf = - let objs = ref [] in - Pdf.objiter (fun objnum _ -> objs := objnum :: !objs) pdf; - let toprocess = - keep - (fun x -> length x > 1) - (collate (pdfobjeq pdf) (sort (pdfobjeq pdf) !objs)) - in - (* Remove any pools of objects which are page objects, since Adobe Reader - * gets confused when there are duplicate page objects. *) - let toprocess = - option_map - (function - [] -> assert false - | h::_ as l -> - match Pdf.lookup_direct pdf "/Type" (Pdf.lookup_obj pdf h) with - Some (Pdf.Name "/Page") -> None - | _ -> Some l) - toprocess - in - let pdfr = ref pdf in - let changetable = Hashtbl.create 100 in - iter - (function [] -> assert false | h::t -> - iter (fun e -> Hashtbl.add changetable e h) t) - toprocess; - (* For a unknown reason, the output file is much smaller if - Pdf.renumber is run twice. This is bizarre, since Pdf.renumber is - an old, well-understood function in use for years -- what is - going on? Furthermore, if we run it 3 times, it gets bigger again! *) - pdfr := Pdf.renumber changetable !pdfr; - pdfr := Pdf.renumber changetable !pdfr; - Pdf.remove_unreferenced !pdfr; - pdf.Pdf.root <- !pdfr.Pdf.root; - pdf.Pdf.objects <- !pdfr.Pdf.objects; - pdf.Pdf.trailerdict <- !pdfr.Pdf.trailerdict - -(* Squeeze the form xobject at objnum. - -FIXME: For old PDFs (< v1.2) any resources from the page (or its ancestors in -the page tree!) are also needed - we must merge them with the ones from the -xobject itself. However, it it safe for now -- in the unlikely event that the -resources actually need to be available, the parse will fail, the squeeze of -this object will fail, and we bail out. *) -let xobjects_done = ref [] - -let squeeze_form_xobject pdf objnum = - if mem objnum !xobjects_done then () else - xobjects_done := objnum :: !xobjects_done; - let obj = Pdf.lookup_obj pdf objnum in - match Pdf.lookup_direct pdf "/Subtype" obj with - Some (Pdf.Name "/Form") -> - let resources = - match Pdf.lookup_direct pdf "/Resources" obj with - Some d -> d - | None -> Pdf.Dictionary [] - in - begin match - Pdfops.stream_of_ops - (Pdfops.parse_operators pdf resources [Pdf.Indirect objnum]) - with - Pdf.Stream {contents = (_, Pdf.Got data)} -> - (* Put replacement data in original stream, and overwrite /Length *) - begin match obj with - Pdf.Stream ({contents = (d, _)} as str) -> - str := - (Pdf.add_dict_entry d "/Length" (Pdf.Integer (bytes_size data)), - Pdf.Got data) - | _ -> failwith "squeeze_form_xobject" - end - | _ -> failwith "squeeze_form_xobject" - end - | _ -> () - -(* For a list of indirects representing content streams, make sure that none of -them are duplicated in the PDF. This indicates sharing, which parsing and -rewriting the streams might destroy, thus making the file bigger. FIXME: The -correct thing to do is to preserve the multiple content streams. *) -let no_duplicates content_stream_numbers stream_numbers = - not - (mem false - (map - (fun n -> length (keep (eq n) content_stream_numbers) < 2) - stream_numbers)) - -(* Give a list of content stream numbers, given a page reference number *) -let content_streams_of_page pdf refnum = - match Pdf.direct pdf (Pdf.lookup_obj pdf refnum) with - Pdf.Dictionary dict -> - begin match lookup "/Contents" dict with - Some (Pdf.Indirect i) -> [i] - | Some (Pdf.Array x) -> - option_map (function Pdf.Indirect i -> Some i | _ -> None) x - | _ -> [] - end - | _ -> [] - -(* For each object in the PDF marked with /Type /Page, for each /Contents -indirect reference or array of such, decode and recode that content stream. *) -let squeeze_all_content_streams pdf = - let page_reference_numbers = Pdf.page_reference_numbers pdf in - let all_content_streams_in_doc = - flatten (map (content_streams_of_page pdf) page_reference_numbers) - in - xobjects_done := []; - Pdf.objiter - (fun objnum _ -> - match Pdf.lookup_obj pdf objnum with - Pdf.Dictionary dict as d - when - Pdf.lookup_direct pdf "/Type" d = Some (Pdf.Name "/Page") - -> - let resources = - match Pdf.lookup_direct pdf "/Resources" d with - Some d -> d - | None -> Pdf.Dictionary [] - in - begin try - let content_streams = - match lookup "/Contents" dict with - Some (Pdf.Indirect i) -> - begin match Pdf.direct pdf (Pdf.Indirect i) with - Pdf.Array x -> x - | _ -> [Pdf.Indirect i] - end - | Some (Pdf.Array x) -> x - | _ -> raise Not_found - in - if - no_duplicates - all_content_streams_in_doc - (map (function Pdf.Indirect i -> i | _ -> assert false) content_streams) - then - let newstream = - Pdfops.stream_of_ops - (Pdfops.parse_operators pdf resources content_streams) - in - let newdict = - Pdf.add_dict_entry - d "/Contents" (Pdf.Indirect (Pdf.addobj pdf newstream)) - in - Pdf.addobj_given_num pdf (objnum, newdict); - (* Now process all xobjects related to this page *) - begin match Pdf.lookup_direct pdf "/XObject" resources with - Some (Pdf.Dictionary xobjs) -> - iter - (function - (_, Pdf.Indirect i) -> squeeze_form_xobject pdf i - | _ -> failwith "squeeze_xobject") - xobjs - | _ -> () - end - with - (* No /Contents, which is ok. Or a parsing failure due to - uninherited resources. FIXME: Add support for inherited - resources. *) - Not_found -> () - end - | _ -> ()) - pdf - -(* We run squeeze enough times for the number of objects to not change *) -let squeeze ?logto ?(pagedata=true) ?(recompress=true) pdf = - let log x = - match logto with - None -> print_string x; flush stdout - | Some "nolog" -> () - | Some s -> - let fh = open_out_gen [Open_wronly; Open_creat] 0o666 s in - seek_out fh (out_channel_length fh); - output_string fh x; - close_out fh - in - try - let n = ref (Pdf.objcard pdf) in - log (Printf.sprintf "Beginning squeeze: %i objects\n" (Pdf.objcard pdf)); - while !n > (ignore (really_squeeze pdf); Pdf.objcard pdf) do - n := Pdf.objcard pdf; - log (Printf.sprintf "Squeezing... Down to %i objects\n" (Pdf.objcard pdf)); - done; - if pagedata then - begin - log (Printf.sprintf "Squeezing page data and xobjects\n"); - squeeze_all_content_streams pdf; - end; - if recompress then - begin - log (Printf.sprintf "Recompressing document\n"); - Pdfcodec.flate_level := 9; - ignore (recompress_pdf pdf) - end - with - e -> - raise - (Pdf.PDFError - (Printf.sprintf - "Squeeze failed. No output written.\n Proximate error was:\n %s" - (Printexc.to_string e))) - type encoding = | Raw | UTF8 diff --git a/cpdf.mli b/cpdf.mli index 3f4efc4..c02a444 100644 --- a/cpdf.mli +++ b/cpdf.mli @@ -36,14 +36,6 @@ val iter_pages : (int -> Pdfpage.t -> unit) -> Pdf.t -> int list -> unit (** Same as [process_pages] but return the list of outputs of the map function. *) val map_pages : (int -> Pdfpage.t -> 'a) -> Pdf.t -> int list -> 'a list - -(** Compresses all streams in the PDF document which are uncompressed, using -/FlateDecode, leaving out metadata. If the PDF is encrypted, does nothing. *) -val recompress_pdf : Pdf.t -> Pdf.t - -(** Decompresses all streams in a PDF document, assuming it isn't encrypted. *) -val decompress_pdf : Pdf.t -> Pdf.t - val copy_cropbox_to_mediabox : Pdf.t -> int list -> Pdf.t (** {2 Metadata and settings} *) @@ -344,8 +336,6 @@ val blackfills : color -> int list -> Pdf.t -> Pdf.t (** Remove images from a PDF, optionally adding crossed boxes. *) val draft : string option -> bool -> int list -> Pdf.t -> Pdf.t -(** Squeeze a PDF *) -val squeeze : ?logto:string -> ?pagedata:bool -> ?recompress:bool -> Pdf.t -> unit val remove_all_text : int list -> Pdf.t -> Pdf.t diff --git a/cpdfcommand.ml b/cpdfcommand.ml index 07f0bf6..3a3fce4 100644 --- a/cpdfcommand.ml +++ b/cpdfcommand.ml @@ -2834,8 +2834,8 @@ let write_pdf ?(encryption = None) ?(is_decompress=false) mk_id pdf = None -> if not is_decompress then begin - ignore (Cpdf.recompress_pdf pdf); - if args.squeeze then Cpdf.squeeze ~pagedata:args.squeeze_pagedata ~recompress:args.squeeze_recompress ?logto:!logto pdf; + ignore (Cpdfsqueeze.recompress_pdf pdf); + if args.squeeze then Cpdfsqueeze.squeeze ~pagedata:args.squeeze_pagedata ~recompress:args.squeeze_recompress ?logto:!logto pdf; end; Pdf.remove_unreferenced pdf; really_write_pdf ~is_decompress mk_id pdf outname @@ -2849,8 +2849,8 @@ let write_pdf ?(encryption = None) ?(is_decompress=false) mk_id pdf = None -> if not is_decompress then begin - ignore (Cpdf.recompress_pdf pdf); - if args.squeeze then Cpdf.squeeze ~pagedata:args.squeeze_pagedata ~recompress:args.squeeze_recompress ?logto:!logto pdf; + ignore (Cpdfsqueeze.recompress_pdf pdf); + if args.squeeze then Cpdfsqueeze.squeeze ~pagedata:args.squeeze_pagedata ~recompress:args.squeeze_recompress ?logto:!logto pdf; Pdf.remove_unreferenced pdf end; really_write_pdf ~encryption ~is_decompress mk_id pdf temp; @@ -2889,7 +2889,7 @@ let fast_write_split_pdfs (stem original_filename) startpage endpage in Pdf.remove_unreferenced pdf; - if sq then Cpdf.squeeze ~pagedata:args.squeeze_pagedata ~recompress:args.squeeze_recompress ?logto:!logto pdf; + if sq then Cpdfsqueeze.squeeze ~pagedata:args.squeeze_pagedata ~recompress:args.squeeze_recompress ?logto:!logto pdf; really_write_pdf ~encryption:enc (not (enc = None)) pdf name) (indx pagenums) pagenums @@ -3261,7 +3261,7 @@ let go () = let pdf = get_single_pdf (Some Compress) false in if args.remove_duplicate_streams then Pdfmerge.remove_duplicate_fonts pdf; - write_pdf false (Cpdf.recompress_pdf pdf) + write_pdf false (Cpdfsqueeze.recompress_pdf pdf) | Some RemoveCrop -> begin match args.inputs, args.out with | (_, pagespec, _, _, _, _)::_, _ -> @@ -3736,7 +3736,7 @@ let go () = (Cpdf.combine_pages args.fast (get_single_pdf args.op false) (pdfread_pdf_of_file None None over) false false true) | Some Encrypt -> let pdf = get_single_pdf args.op false in - let pdf = Cpdf.recompress_pdf pdf + let pdf = Cpdfsqueeze.recompress_pdf pdf and encryption = {Pdfwrite.encryption_method = (match args.crypt_method with diff --git a/cpdfsqueeze.ml b/cpdfsqueeze.ml new file mode 100644 index 0000000..8d1e4d6 --- /dev/null +++ b/cpdfsqueeze.ml @@ -0,0 +1,246 @@ +open Pdfutil +open Pdfio + +(* For debugging *) +let report_pdf_size pdf = + Pdf.remove_unreferenced pdf; + Pdfwrite.pdf_to_file_options ~preserve_objstm:false ~generate_objstm:false + ~compress_objstm:false false None false pdf "temp.pdf"; + let fh = open_in_bin "temp.pdf" in + Printf.printf "Size %i bytes\n" (in_channel_length fh); + flush stdout; + close_in fh + +(* Recompress anything which isn't compressed, unless it's metadata. *) +let recompress_stream pdf = function + (* If there is no compression, compress with /FlateDecode *) + | Pdf.Stream {contents = (dict, _)} as stream -> + begin match + Pdf.lookup_direct pdf "/Filter" dict, + Pdf.lookup_direct pdf "/Type" dict + with + | _, Some (Pdf.Name "/Metadata") -> () + | (None | Some (Pdf.Array [])), _ -> + Pdfcodec.encode_pdfstream pdf Pdfcodec.Flate stream + | _ -> () + end + | _ -> assert false + +let recompress_pdf pdf = + if not (Pdfcrypt.is_encrypted pdf) then + Pdf.iter_stream (recompress_stream pdf) pdf; + pdf + +let decompress_pdf pdf = + if not (Pdfcrypt.is_encrypted pdf) then + (Pdf.iter_stream (Pdfcodec.decode_pdfstream_until_unknown pdf) pdf); + pdf + + +(* Equality on PDF objects *) +let pdfobjeq pdf x y = + let x = Pdf.lookup_obj pdf x + and y = Pdf.lookup_obj pdf y in + begin match x with Pdf.Stream _ -> Pdf.getstream x | _ -> () end; + begin match y with Pdf.Stream _ -> Pdf.getstream y | _ -> () end; + compare x y + +let really_squeeze pdf = + let objs = ref [] in + Pdf.objiter (fun objnum _ -> objs := objnum :: !objs) pdf; + let toprocess = + keep + (fun x -> length x > 1) + (collate (pdfobjeq pdf) (sort (pdfobjeq pdf) !objs)) + in + (* Remove any pools of objects which are page objects, since Adobe Reader + * gets confused when there are duplicate page objects. *) + let toprocess = + option_map + (function + [] -> assert false + | h::_ as l -> + match Pdf.lookup_direct pdf "/Type" (Pdf.lookup_obj pdf h) with + Some (Pdf.Name "/Page") -> None + | _ -> Some l) + toprocess + in + let pdfr = ref pdf in + let changetable = Hashtbl.create 100 in + iter + (function [] -> assert false | h::t -> + iter (fun e -> Hashtbl.add changetable e h) t) + toprocess; + (* For a unknown reason, the output file is much smaller if + Pdf.renumber is run twice. This is bizarre, since Pdf.renumber is + an old, well-understood function in use for years -- what is + going on? Furthermore, if we run it 3 times, it gets bigger again! *) + pdfr := Pdf.renumber changetable !pdfr; + pdfr := Pdf.renumber changetable !pdfr; + Pdf.remove_unreferenced !pdfr; + pdf.Pdf.root <- !pdfr.Pdf.root; + pdf.Pdf.objects <- !pdfr.Pdf.objects; + pdf.Pdf.trailerdict <- !pdfr.Pdf.trailerdict + +(* Squeeze the form xobject at objnum. + +FIXME: For old PDFs (< v1.2) any resources from the page (or its ancestors in +the page tree!) are also needed - we must merge them with the ones from the +xobject itself. However, it it safe for now -- in the unlikely event that the +resources actually need to be available, the parse will fail, the squeeze of +this object will fail, and we bail out. *) +let xobjects_done = ref [] + +let squeeze_form_xobject pdf objnum = + if mem objnum !xobjects_done then () else + xobjects_done := objnum :: !xobjects_done; + let obj = Pdf.lookup_obj pdf objnum in + match Pdf.lookup_direct pdf "/Subtype" obj with + Some (Pdf.Name "/Form") -> + let resources = + match Pdf.lookup_direct pdf "/Resources" obj with + Some d -> d + | None -> Pdf.Dictionary [] + in + begin match + Pdfops.stream_of_ops + (Pdfops.parse_operators pdf resources [Pdf.Indirect objnum]) + with + Pdf.Stream {contents = (_, Pdf.Got data)} -> + (* Put replacement data in original stream, and overwrite /Length *) + begin match obj with + Pdf.Stream ({contents = (d, _)} as str) -> + str := + (Pdf.add_dict_entry d "/Length" (Pdf.Integer (bytes_size data)), + Pdf.Got data) + | _ -> failwith "squeeze_form_xobject" + end + | _ -> failwith "squeeze_form_xobject" + end + | _ -> () + +(* For a list of indirects representing content streams, make sure that none of +them are duplicated in the PDF. This indicates sharing, which parsing and +rewriting the streams might destroy, thus making the file bigger. FIXME: The +correct thing to do is to preserve the multiple content streams. *) +let no_duplicates content_stream_numbers stream_numbers = + not + (mem false + (map + (fun n -> length (keep (eq n) content_stream_numbers) < 2) + stream_numbers)) + +(* Give a list of content stream numbers, given a page reference number *) +let content_streams_of_page pdf refnum = + match Pdf.direct pdf (Pdf.lookup_obj pdf refnum) with + Pdf.Dictionary dict -> + begin match lookup "/Contents" dict with + Some (Pdf.Indirect i) -> [i] + | Some (Pdf.Array x) -> + option_map (function Pdf.Indirect i -> Some i | _ -> None) x + | _ -> [] + end + | _ -> [] + +(* For each object in the PDF marked with /Type /Page, for each /Contents +indirect reference or array of such, decode and recode that content stream. *) +let squeeze_all_content_streams pdf = + let page_reference_numbers = Pdf.page_reference_numbers pdf in + let all_content_streams_in_doc = + flatten (map (content_streams_of_page pdf) page_reference_numbers) + in + xobjects_done := []; + Pdf.objiter + (fun objnum _ -> + match Pdf.lookup_obj pdf objnum with + Pdf.Dictionary dict as d + when + Pdf.lookup_direct pdf "/Type" d = Some (Pdf.Name "/Page") + -> + let resources = + match Pdf.lookup_direct pdf "/Resources" d with + Some d -> d + | None -> Pdf.Dictionary [] + in + begin try + let content_streams = + match lookup "/Contents" dict with + Some (Pdf.Indirect i) -> + begin match Pdf.direct pdf (Pdf.Indirect i) with + Pdf.Array x -> x + | _ -> [Pdf.Indirect i] + end + | Some (Pdf.Array x) -> x + | _ -> raise Not_found + in + if + no_duplicates + all_content_streams_in_doc + (map (function Pdf.Indirect i -> i | _ -> assert false) content_streams) + then + let newstream = + Pdfops.stream_of_ops + (Pdfops.parse_operators pdf resources content_streams) + in + let newdict = + Pdf.add_dict_entry + d "/Contents" (Pdf.Indirect (Pdf.addobj pdf newstream)) + in + Pdf.addobj_given_num pdf (objnum, newdict); + (* Now process all xobjects related to this page *) + begin match Pdf.lookup_direct pdf "/XObject" resources with + Some (Pdf.Dictionary xobjs) -> + iter + (function + (_, Pdf.Indirect i) -> squeeze_form_xobject pdf i + | _ -> failwith "squeeze_xobject") + xobjs + | _ -> () + end + with + (* No /Contents, which is ok. Or a parsing failure due to + uninherited resources. FIXME: Add support for inherited + resources. *) + Not_found -> () + end + | _ -> ()) + pdf + +(* We run squeeze enough times for the number of objects to not change *) +let squeeze ?logto ?(pagedata=true) ?(recompress=true) pdf = + let log x = + match logto with + None -> print_string x; flush stdout + | Some "nolog" -> () + | Some s -> + let fh = open_out_gen [Open_wronly; Open_creat] 0o666 s in + seek_out fh (out_channel_length fh); + output_string fh x; + close_out fh + in + try + let n = ref (Pdf.objcard pdf) in + log (Printf.sprintf "Beginning squeeze: %i objects\n" (Pdf.objcard pdf)); + while !n > (ignore (really_squeeze pdf); Pdf.objcard pdf) do + n := Pdf.objcard pdf; + log (Printf.sprintf "Squeezing... Down to %i objects\n" (Pdf.objcard pdf)); + done; + if pagedata then + begin + log (Printf.sprintf "Squeezing page data and xobjects\n"); + squeeze_all_content_streams pdf; + end; + if recompress then + begin + log (Printf.sprintf "Recompressing document\n"); + Pdfcodec.flate_level := 9; + ignore (recompress_pdf pdf) + end + with + e -> + raise + (Pdf.PDFError + (Printf.sprintf + "Squeeze failed. No output written.\n Proximate error was:\n %s" + (Printexc.to_string e))) + diff --git a/cpdfsqueeze.mli b/cpdfsqueeze.mli new file mode 100644 index 0000000..b2d42d4 --- /dev/null +++ b/cpdfsqueeze.mli @@ -0,0 +1,9 @@ +(** Compresses all streams in the PDF document which are uncompressed, using +/FlateDecode, leaving out metadata. If the PDF is encrypted, does nothing. *) +val recompress_pdf : Pdf.t -> Pdf.t + +(** Decompresses all streams in a PDF document, assuming it isn't encrypted. *) +val decompress_pdf : Pdf.t -> Pdf.t + +(** Squeeze a PDF *) +val squeeze : ?logto:string -> ?pagedata:bool -> ?recompress:bool -> Pdf.t -> unit