From 8726d3dee017969860c4ac0c33db5103ddc7a62d Mon Sep 17 00:00:00 2001
From: John Whitington <john@coherentgraphics.co.uk>
Date: Sat, 18 Dec 2021 16:26:33 +0000
Subject: [PATCH] more

---
 Makefile        |   2 +-
 cpdf.ml         | 244 -----------------------------------------------
 cpdf.mli        |  10 --
 cpdfcommand.ml  |  14 +--
 cpdfsqueeze.ml  | 246 ++++++++++++++++++++++++++++++++++++++++++++++++
 cpdfsqueeze.mli |   9 ++
 6 files changed, 263 insertions(+), 262 deletions(-)
 create mode 100644 cpdfsqueeze.ml
 create mode 100644 cpdfsqueeze.mli

diff --git a/Makefile b/Makefile
index 2c60f7b..20e857f 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 MODS = cpdfyojson cpdfxmlm \
        cpdfunicodedata cpdferror cpdfjson cpdfstrftime cpdfcoord cpdfattach \
        cpdfpagespec cpdfposition cpdf cpdfpresent cpdffont cpdftype \
-       cpdftexttopdf cpdftoc cpdfpad cpdfocg cpdfcommand
+       cpdftexttopdf cpdftoc cpdfpad cpdfocg cpdfsqueeze cpdfcommand
 
 SOURCES = $(foreach x,$(MODS),$(x).ml $(x).mli) cpdfcommandrun.ml
 
diff --git a/cpdf.ml b/cpdf.ml
index 8da0b53..b01d95f 100644
--- a/cpdf.ml
+++ b/cpdf.ml
@@ -49,250 +49,6 @@ let xmp_template =
 
 <?xpacket end='r'?>|}
 
-(* For debugging *)
-let report_pdf_size pdf =
-  Pdf.remove_unreferenced pdf;
-  Pdfwrite.pdf_to_file_options ~preserve_objstm:false ~generate_objstm:false
-  ~compress_objstm:false false None false pdf "temp.pdf";
-  let fh = open_in_bin "temp.pdf" in
-    Printf.printf "Size %i bytes\n" (in_channel_length fh);
-    flush stdout;
-    close_in fh
-
-
-(* Recompress anything which isn't compressed, unless it's metadata. *)
-let recompress_stream pdf = function
-  (* If there is no compression, compress with /FlateDecode *)
-  | Pdf.Stream {contents = (dict, _)} as stream ->
-      begin match
-        Pdf.lookup_direct pdf "/Filter" dict, 
-        Pdf.lookup_direct pdf "/Type" dict
-      with
-      | _, Some (Pdf.Name "/Metadata") -> ()
-      | (None | Some (Pdf.Array [])), _ ->
-          Pdfcodec.encode_pdfstream pdf Pdfcodec.Flate stream
-      | _ -> ()
-      end
-  | _ -> assert false
-
-let recompress_pdf pdf =
-  if not (Pdfcrypt.is_encrypted pdf) then
-    Pdf.iter_stream (recompress_stream pdf) pdf;
-    pdf
-
-let decompress_pdf pdf =
-  if not (Pdfcrypt.is_encrypted pdf) then
-    (Pdf.iter_stream (Pdfcodec.decode_pdfstream_until_unknown pdf) pdf);
-    pdf
-
-
-(* Equality on PDF objects *)
-let pdfobjeq pdf x y =
-  let x = Pdf.lookup_obj pdf x 
-  and y = Pdf.lookup_obj pdf y in
-    begin match x with Pdf.Stream _ -> Pdf.getstream x | _ -> () end;
-    begin match y with Pdf.Stream _ -> Pdf.getstream y | _ -> () end;
-    compare x y
-
-let really_squeeze pdf =
-  let objs = ref [] in
-    Pdf.objiter (fun objnum _ -> objs := objnum :: !objs) pdf;
-    let toprocess =
-      keep
-        (fun x -> length x > 1)
-        (collate (pdfobjeq pdf) (sort (pdfobjeq pdf) !objs))
-    in
-      (* Remove any pools of objects which are page objects, since Adobe Reader
-       * gets confused when there are duplicate page objects. *)
-      let toprocess =
-        option_map
-          (function
-             [] -> assert false
-           | h::_ as l ->
-               match Pdf.lookup_direct pdf "/Type" (Pdf.lookup_obj pdf h) with
-                 Some (Pdf.Name "/Page") -> None
-               | _ -> Some l)
-          toprocess
-      in
-        let pdfr = ref pdf in
-        let changetable = Hashtbl.create 100 in
-          iter
-            (function [] -> assert false | h::t ->
-               iter (fun e -> Hashtbl.add changetable e h) t)
-            toprocess;
-          (* For a unknown reason, the output file is much smaller if
-             Pdf.renumber is run twice. This is bizarre, since Pdf.renumber is
-             an old, well-understood function in use for years -- what is
-             going on? Furthermore, if we run it 3 times, it gets bigger again! *)
-          pdfr := Pdf.renumber changetable !pdfr;
-          pdfr := Pdf.renumber changetable !pdfr;
-          Pdf.remove_unreferenced !pdfr;
-          pdf.Pdf.root <- !pdfr.Pdf.root;
-          pdf.Pdf.objects <- !pdfr.Pdf.objects;
-          pdf.Pdf.trailerdict <- !pdfr.Pdf.trailerdict
-
-(* Squeeze the form xobject at objnum.
-
-FIXME: For old PDFs (< v1.2) any resources from the page (or its ancestors in
-the page tree!) are also needed - we must merge them with the ones from the
-xobject itself. However, it it safe for now -- in the unlikely event that the
-resources actually need to be available, the parse will fail, the squeeze of
-this object will fail, and we bail out. *)
-let xobjects_done = ref []
-
-let squeeze_form_xobject pdf objnum =
-  if mem objnum !xobjects_done then () else
-    xobjects_done := objnum :: !xobjects_done;
-    let obj = Pdf.lookup_obj pdf objnum in
-      match Pdf.lookup_direct pdf "/Subtype" obj with
-        Some (Pdf.Name "/Form") ->
-          let resources =
-            match Pdf.lookup_direct pdf "/Resources" obj with
-              Some d -> d
-            | None -> Pdf.Dictionary []
-          in
-            begin match
-              Pdfops.stream_of_ops
-                (Pdfops.parse_operators pdf resources [Pdf.Indirect objnum])
-            with
-              Pdf.Stream {contents = (_, Pdf.Got data)} ->
-                (* Put replacement data in original stream, and overwrite /Length *)
-                begin match obj with
-                  Pdf.Stream ({contents = (d, _)} as str) ->
-                    str :=
-                      (Pdf.add_dict_entry d "/Length" (Pdf.Integer (bytes_size data)),
-                       Pdf.Got data)
-                | _ -> failwith "squeeze_form_xobject"
-                end
-            | _ -> failwith "squeeze_form_xobject"
-            end
-      | _ -> ()
-
-(* For a list of indirects representing content streams, make sure that none of
-them are duplicated in the PDF. This indicates sharing, which parsing and
-rewriting the streams might destroy, thus making the file bigger. FIXME: The
-correct thing to do is to preserve the multiple content streams. *)
-let no_duplicates content_stream_numbers stream_numbers =
-  not
-    (mem false
-       (map
-         (fun n -> length (keep (eq n) content_stream_numbers) < 2)
-         stream_numbers))
-
-(* Give a list of content stream numbers, given a page reference number *)
-let content_streams_of_page pdf refnum =
-  match Pdf.direct pdf (Pdf.lookup_obj pdf refnum) with
-    Pdf.Dictionary dict ->
-      begin match lookup "/Contents" dict with
-        Some (Pdf.Indirect i) -> [i]
-      | Some (Pdf.Array x) ->
-          option_map (function Pdf.Indirect i -> Some i | _ -> None) x
-      | _ -> []
-      end
-  | _ -> []
-
-(* For each object in the PDF marked with /Type /Page, for each /Contents
-indirect reference or array of such, decode and recode that content stream. *)
-let squeeze_all_content_streams pdf =
-  let page_reference_numbers = Pdf.page_reference_numbers pdf in
-    let all_content_streams_in_doc =
-      flatten (map (content_streams_of_page pdf) page_reference_numbers)
-    in
-      xobjects_done := [];
-      Pdf.objiter
-        (fun objnum _ ->
-          match Pdf.lookup_obj pdf objnum with
-            Pdf.Dictionary dict as d
-              when
-                Pdf.lookup_direct pdf "/Type" d = Some (Pdf.Name "/Page")
-              ->
-                let resources =
-                  match Pdf.lookup_direct pdf "/Resources" d with
-                    Some d -> d
-                  | None -> Pdf.Dictionary []
-                in
-                  begin try
-                    let content_streams =
-                      match lookup "/Contents" dict with
-                        Some (Pdf.Indirect i) ->
-                          begin match Pdf.direct pdf (Pdf.Indirect i) with
-                            Pdf.Array x -> x
-                          | _ -> [Pdf.Indirect i]
-                          end
-                      | Some (Pdf.Array x) -> x
-                      | _ -> raise Not_found
-                    in
-                      if
-                        no_duplicates
-                          all_content_streams_in_doc
-                          (map (function Pdf.Indirect i -> i | _ -> assert false) content_streams)
-                      then
-                        let newstream =
-                          Pdfops.stream_of_ops
-                            (Pdfops.parse_operators pdf resources content_streams)
-                        in
-                          let newdict =
-                            Pdf.add_dict_entry
-                              d "/Contents" (Pdf.Indirect (Pdf.addobj pdf newstream))
-                          in
-                            Pdf.addobj_given_num pdf (objnum, newdict);
-                            (* Now process all xobjects related to this page *)
-                            begin match Pdf.lookup_direct pdf "/XObject" resources with
-                              Some (Pdf.Dictionary xobjs) ->
-                                iter
-                                  (function
-                                     (_, Pdf.Indirect i) -> squeeze_form_xobject pdf i
-                                    | _ -> failwith "squeeze_xobject")
-                                  xobjs
-                            | _ -> ()
-                            end
-                  with
-                    (* No /Contents, which is ok. Or a parsing failure due to
-                     uninherited resources. FIXME: Add support for inherited
-                     resources. *)
-                    Not_found -> ()
-                  end
-            | _ -> ())
-        pdf
-
-(* We run squeeze enough times for the number of objects to not change *)
-let squeeze ?logto ?(pagedata=true) ?(recompress=true) pdf =
-  let log x =
-    match logto with
-      None -> print_string x; flush stdout
-    | Some "nolog" -> ()
-    | Some s ->
-        let fh = open_out_gen [Open_wronly; Open_creat] 0o666 s in
-          seek_out fh (out_channel_length fh);
-          output_string fh x;
-          close_out fh
-  in
-    try
-      let n = ref (Pdf.objcard pdf) in
-      log (Printf.sprintf "Beginning squeeze: %i objects\n" (Pdf.objcard pdf));
-      while !n > (ignore (really_squeeze pdf); Pdf.objcard pdf) do
-        n := Pdf.objcard pdf;
-        log (Printf.sprintf "Squeezing... Down to %i objects\n" (Pdf.objcard pdf));
-      done;
-      if pagedata then
-        begin
-          log (Printf.sprintf "Squeezing page data and xobjects\n");
-          squeeze_all_content_streams pdf;
-        end;
-      if recompress then
-        begin
-          log (Printf.sprintf "Recompressing document\n");
-          Pdfcodec.flate_level := 9;
-          ignore (recompress_pdf pdf)
-        end
-    with
-      e ->
-        raise
-          (Pdf.PDFError
-             (Printf.sprintf
-                "Squeeze failed. No output written.\n Proximate error was:\n %s"
-                (Printexc.to_string e)))
-
 type encoding =
   | Raw
   | UTF8
diff --git a/cpdf.mli b/cpdf.mli
index 3f4efc4..c02a444 100644
--- a/cpdf.mli
+++ b/cpdf.mli
@@ -36,14 +36,6 @@ val iter_pages : (int -> Pdfpage.t -> unit) -> Pdf.t -> int list -> unit
 (** Same as [process_pages] but return the list of outputs of the map function. *)
 val map_pages : (int -> Pdfpage.t -> 'a) -> Pdf.t -> int list -> 'a list
 
-
-(** Compresses all streams in the PDF document which are uncompressed, using
-/FlateDecode, leaving out metadata.  If the PDF is encrypted, does nothing. *)
-val recompress_pdf : Pdf.t -> Pdf.t
-
-(** Decompresses all streams in a PDF document, assuming it isn't encrypted. *)
-val decompress_pdf : Pdf.t -> Pdf.t
-
 val copy_cropbox_to_mediabox : Pdf.t -> int list -> Pdf.t
 
 (** {2 Metadata and settings} *)
@@ -344,8 +336,6 @@ val blackfills : color -> int list -> Pdf.t -> Pdf.t
 (** Remove images from a PDF, optionally adding crossed boxes. *)
 val draft : string option -> bool -> int list -> Pdf.t -> Pdf.t
 
-(** Squeeze a PDF *)
-val squeeze : ?logto:string -> ?pagedata:bool -> ?recompress:bool -> Pdf.t -> unit
 
 val remove_all_text : int list -> Pdf.t -> Pdf.t 
 
diff --git a/cpdfcommand.ml b/cpdfcommand.ml
index 07f0bf6..3a3fce4 100644
--- a/cpdfcommand.ml
+++ b/cpdfcommand.ml
@@ -2834,8 +2834,8 @@ let write_pdf ?(encryption = None) ?(is_decompress=false) mk_id pdf =
           None ->
             if not is_decompress then
               begin
-                ignore (Cpdf.recompress_pdf pdf);
-                if args.squeeze then Cpdf.squeeze ~pagedata:args.squeeze_pagedata ~recompress:args.squeeze_recompress ?logto:!logto pdf;
+                ignore (Cpdfsqueeze.recompress_pdf pdf);
+                if args.squeeze then Cpdfsqueeze.squeeze ~pagedata:args.squeeze_pagedata ~recompress:args.squeeze_recompress ?logto:!logto pdf;
               end;
             Pdf.remove_unreferenced pdf;
             really_write_pdf ~is_decompress mk_id pdf outname
@@ -2849,8 +2849,8 @@ let write_pdf ?(encryption = None) ?(is_decompress=false) mk_id pdf =
             None -> 
               if not is_decompress then
                 begin
-                  ignore (Cpdf.recompress_pdf pdf);
-                  if args.squeeze then Cpdf.squeeze ~pagedata:args.squeeze_pagedata ~recompress:args.squeeze_recompress ?logto:!logto pdf;
+                  ignore (Cpdfsqueeze.recompress_pdf pdf);
+                  if args.squeeze then Cpdfsqueeze.squeeze ~pagedata:args.squeeze_pagedata ~recompress:args.squeeze_recompress ?logto:!logto pdf;
                   Pdf.remove_unreferenced pdf
                 end;
                 really_write_pdf ~encryption ~is_decompress mk_id pdf temp;
@@ -2889,7 +2889,7 @@ let fast_write_split_pdfs
                  (stem original_filename) startpage endpage
              in
                Pdf.remove_unreferenced pdf;
-               if sq then Cpdf.squeeze ~pagedata:args.squeeze_pagedata ~recompress:args.squeeze_recompress ?logto:!logto pdf;
+               if sq then Cpdfsqueeze.squeeze ~pagedata:args.squeeze_pagedata ~recompress:args.squeeze_recompress ?logto:!logto pdf;
                really_write_pdf ~encryption:enc (not (enc = None)) pdf name)
       (indx pagenums)
       pagenums
@@ -3261,7 +3261,7 @@ let go () =
       let pdf = get_single_pdf (Some Compress) false in
         if args.remove_duplicate_streams then
           Pdfmerge.remove_duplicate_fonts pdf;
-        write_pdf false (Cpdf.recompress_pdf pdf)
+        write_pdf false (Cpdfsqueeze.recompress_pdf pdf)
   | Some RemoveCrop ->
       begin match args.inputs, args.out with
       | (_, pagespec, _, _, _, _)::_, _ ->
@@ -3736,7 +3736,7 @@ let go () =
         (Cpdf.combine_pages args.fast (get_single_pdf args.op false) (pdfread_pdf_of_file None None over) false false true)
   | Some Encrypt ->
       let pdf = get_single_pdf args.op false in
-        let pdf = Cpdf.recompress_pdf pdf
+        let pdf = Cpdfsqueeze.recompress_pdf pdf
         and encryption =
           {Pdfwrite.encryption_method =
              (match args.crypt_method with
diff --git a/cpdfsqueeze.ml b/cpdfsqueeze.ml
new file mode 100644
index 0000000..8d1e4d6
--- /dev/null
+++ b/cpdfsqueeze.ml
@@ -0,0 +1,246 @@
+open Pdfutil
+open Pdfio
+
+(* For debugging *)
+let report_pdf_size pdf =
+  Pdf.remove_unreferenced pdf;
+  Pdfwrite.pdf_to_file_options ~preserve_objstm:false ~generate_objstm:false
+  ~compress_objstm:false false None false pdf "temp.pdf";
+  let fh = open_in_bin "temp.pdf" in
+    Printf.printf "Size %i bytes\n" (in_channel_length fh);
+    flush stdout;
+    close_in fh
+
+(* Recompress anything which isn't compressed, unless it's metadata. *)
+let recompress_stream pdf = function
+  (* If there is no compression, compress with /FlateDecode *)
+  | Pdf.Stream {contents = (dict, _)} as stream ->
+      begin match
+        Pdf.lookup_direct pdf "/Filter" dict, 
+        Pdf.lookup_direct pdf "/Type" dict
+      with
+      | _, Some (Pdf.Name "/Metadata") -> ()
+      | (None | Some (Pdf.Array [])), _ ->
+          Pdfcodec.encode_pdfstream pdf Pdfcodec.Flate stream
+      | _ -> ()
+      end
+  | _ -> assert false
+
+let recompress_pdf pdf =
+  if not (Pdfcrypt.is_encrypted pdf) then
+    Pdf.iter_stream (recompress_stream pdf) pdf;
+    pdf
+
+let decompress_pdf pdf =
+  if not (Pdfcrypt.is_encrypted pdf) then
+    (Pdf.iter_stream (Pdfcodec.decode_pdfstream_until_unknown pdf) pdf);
+    pdf
+
+
+(* Equality on PDF objects *)
+let pdfobjeq pdf x y =
+  let x = Pdf.lookup_obj pdf x 
+  and y = Pdf.lookup_obj pdf y in
+    begin match x with Pdf.Stream _ -> Pdf.getstream x | _ -> () end;
+    begin match y with Pdf.Stream _ -> Pdf.getstream y | _ -> () end;
+    compare x y
+
+let really_squeeze pdf =
+  let objs = ref [] in
+    Pdf.objiter (fun objnum _ -> objs := objnum :: !objs) pdf;
+    let toprocess =
+      keep
+        (fun x -> length x > 1)
+        (collate (pdfobjeq pdf) (sort (pdfobjeq pdf) !objs))
+    in
+      (* Remove any pools of objects which are page objects, since Adobe Reader
+       * gets confused when there are duplicate page objects. *)
+      let toprocess =
+        option_map
+          (function
+             [] -> assert false
+           | h::_ as l ->
+               match Pdf.lookup_direct pdf "/Type" (Pdf.lookup_obj pdf h) with
+                 Some (Pdf.Name "/Page") -> None
+               | _ -> Some l)
+          toprocess
+      in
+        let pdfr = ref pdf in
+        let changetable = Hashtbl.create 100 in
+          iter
+            (function [] -> assert false | h::t ->
+               iter (fun e -> Hashtbl.add changetable e h) t)
+            toprocess;
+          (* For a unknown reason, the output file is much smaller if
+             Pdf.renumber is run twice. This is bizarre, since Pdf.renumber is
+             an old, well-understood function in use for years -- what is
+             going on? Furthermore, if we run it 3 times, it gets bigger again! *)
+          pdfr := Pdf.renumber changetable !pdfr;
+          pdfr := Pdf.renumber changetable !pdfr;
+          Pdf.remove_unreferenced !pdfr;
+          pdf.Pdf.root <- !pdfr.Pdf.root;
+          pdf.Pdf.objects <- !pdfr.Pdf.objects;
+          pdf.Pdf.trailerdict <- !pdfr.Pdf.trailerdict
+
+(* Squeeze the form xobject at objnum.
+
+FIXME: For old PDFs (< v1.2) any resources from the page (or its ancestors in
+the page tree!) are also needed - we must merge them with the ones from the
+xobject itself. However, it it safe for now -- in the unlikely event that the
+resources actually need to be available, the parse will fail, the squeeze of
+this object will fail, and we bail out. *)
+let xobjects_done = ref []
+
+let squeeze_form_xobject pdf objnum =
+  if mem objnum !xobjects_done then () else
+    xobjects_done := objnum :: !xobjects_done;
+    let obj = Pdf.lookup_obj pdf objnum in
+      match Pdf.lookup_direct pdf "/Subtype" obj with
+        Some (Pdf.Name "/Form") ->
+          let resources =
+            match Pdf.lookup_direct pdf "/Resources" obj with
+              Some d -> d
+            | None -> Pdf.Dictionary []
+          in
+            begin match
+              Pdfops.stream_of_ops
+                (Pdfops.parse_operators pdf resources [Pdf.Indirect objnum])
+            with
+              Pdf.Stream {contents = (_, Pdf.Got data)} ->
+                (* Put replacement data in original stream, and overwrite /Length *)
+                begin match obj with
+                  Pdf.Stream ({contents = (d, _)} as str) ->
+                    str :=
+                      (Pdf.add_dict_entry d "/Length" (Pdf.Integer (bytes_size data)),
+                       Pdf.Got data)
+                | _ -> failwith "squeeze_form_xobject"
+                end
+            | _ -> failwith "squeeze_form_xobject"
+            end
+      | _ -> ()
+
+(* For a list of indirects representing content streams, make sure that none of
+them are duplicated in the PDF. This indicates sharing, which parsing and
+rewriting the streams might destroy, thus making the file bigger. FIXME: The
+correct thing to do is to preserve the multiple content streams. *)
+let no_duplicates content_stream_numbers stream_numbers =
+  not
+    (mem false
+       (map
+         (fun n -> length (keep (eq n) content_stream_numbers) < 2)
+         stream_numbers))
+
+(* Give a list of content stream numbers, given a page reference number *)
+let content_streams_of_page pdf refnum =
+  match Pdf.direct pdf (Pdf.lookup_obj pdf refnum) with
+    Pdf.Dictionary dict ->
+      begin match lookup "/Contents" dict with
+        Some (Pdf.Indirect i) -> [i]
+      | Some (Pdf.Array x) ->
+          option_map (function Pdf.Indirect i -> Some i | _ -> None) x
+      | _ -> []
+      end
+  | _ -> []
+
+(* For each object in the PDF marked with /Type /Page, for each /Contents
+indirect reference or array of such, decode and recode that content stream. *)
+let squeeze_all_content_streams pdf =
+  let page_reference_numbers = Pdf.page_reference_numbers pdf in
+    let all_content_streams_in_doc =
+      flatten (map (content_streams_of_page pdf) page_reference_numbers)
+    in
+      xobjects_done := [];
+      Pdf.objiter
+        (fun objnum _ ->
+          match Pdf.lookup_obj pdf objnum with
+            Pdf.Dictionary dict as d
+              when
+                Pdf.lookup_direct pdf "/Type" d = Some (Pdf.Name "/Page")
+              ->
+                let resources =
+                  match Pdf.lookup_direct pdf "/Resources" d with
+                    Some d -> d
+                  | None -> Pdf.Dictionary []
+                in
+                  begin try
+                    let content_streams =
+                      match lookup "/Contents" dict with
+                        Some (Pdf.Indirect i) ->
+                          begin match Pdf.direct pdf (Pdf.Indirect i) with
+                            Pdf.Array x -> x
+                          | _ -> [Pdf.Indirect i]
+                          end
+                      | Some (Pdf.Array x) -> x
+                      | _ -> raise Not_found
+                    in
+                      if
+                        no_duplicates
+                          all_content_streams_in_doc
+                          (map (function Pdf.Indirect i -> i | _ -> assert false) content_streams)
+                      then
+                        let newstream =
+                          Pdfops.stream_of_ops
+                            (Pdfops.parse_operators pdf resources content_streams)
+                        in
+                          let newdict =
+                            Pdf.add_dict_entry
+                              d "/Contents" (Pdf.Indirect (Pdf.addobj pdf newstream))
+                          in
+                            Pdf.addobj_given_num pdf (objnum, newdict);
+                            (* Now process all xobjects related to this page *)
+                            begin match Pdf.lookup_direct pdf "/XObject" resources with
+                              Some (Pdf.Dictionary xobjs) ->
+                                iter
+                                  (function
+                                     (_, Pdf.Indirect i) -> squeeze_form_xobject pdf i
+                                    | _ -> failwith "squeeze_xobject")
+                                  xobjs
+                            | _ -> ()
+                            end
+                  with
+                    (* No /Contents, which is ok. Or a parsing failure due to
+                     uninherited resources. FIXME: Add support for inherited
+                     resources. *)
+                    Not_found -> ()
+                  end
+            | _ -> ())
+        pdf
+
+(* We run squeeze enough times for the number of objects to not change *)
+let squeeze ?logto ?(pagedata=true) ?(recompress=true) pdf =
+  let log x =
+    match logto with
+      None -> print_string x; flush stdout
+    | Some "nolog" -> ()
+    | Some s ->
+        let fh = open_out_gen [Open_wronly; Open_creat] 0o666 s in
+          seek_out fh (out_channel_length fh);
+          output_string fh x;
+          close_out fh
+  in
+    try
+      let n = ref (Pdf.objcard pdf) in
+      log (Printf.sprintf "Beginning squeeze: %i objects\n" (Pdf.objcard pdf));
+      while !n > (ignore (really_squeeze pdf); Pdf.objcard pdf) do
+        n := Pdf.objcard pdf;
+        log (Printf.sprintf "Squeezing... Down to %i objects\n" (Pdf.objcard pdf));
+      done;
+      if pagedata then
+        begin
+          log (Printf.sprintf "Squeezing page data and xobjects\n");
+          squeeze_all_content_streams pdf;
+        end;
+      if recompress then
+        begin
+          log (Printf.sprintf "Recompressing document\n");
+          Pdfcodec.flate_level := 9;
+          ignore (recompress_pdf pdf)
+        end
+    with
+      e ->
+        raise
+          (Pdf.PDFError
+             (Printf.sprintf
+                "Squeeze failed. No output written.\n Proximate error was:\n %s"
+                (Printexc.to_string e)))
+
diff --git a/cpdfsqueeze.mli b/cpdfsqueeze.mli
new file mode 100644
index 0000000..b2d42d4
--- /dev/null
+++ b/cpdfsqueeze.mli
@@ -0,0 +1,9 @@
+(** Compresses all streams in the PDF document which are uncompressed, using
+/FlateDecode, leaving out metadata.  If the PDF is encrypted, does nothing. *)
+val recompress_pdf : Pdf.t -> Pdf.t
+
+(** Decompresses all streams in a PDF document, assuming it isn't encrypted. *)
+val decompress_pdf : Pdf.t -> Pdf.t
+
+(** Squeeze a PDF *)
+val squeeze : ?logto:string -> ?pagedata:bool -> ?recompress:bool -> Pdf.t -> unit