diff --git a/Makefile b/Makefile index 14df99e..ac1c11d 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ SOURCES = $(foreach x,$(MODS),$(x).ml $(x).mli) cpdfcommandrun.ml RESULT = cpdf ANNOTATE = true -PACKS = camlpdf xml-light +PACKS = camlpdf OCAMLNCFLAGS = -g OCAMLBCFLAGS = -g diff --git a/cpdf.ml b/cpdf.ml index 55b936a..f699c79 100644 --- a/cpdf.ml +++ b/cpdf.ml @@ -2688,9 +2688,9 @@ let output_xml_info pdf = match get_metadata pdf with None -> () | Some metadata -> - print_string (string_of_bytes metadata); - let parsed = Xml.parse_string (string_of_bytes metadata) in - print_string (Xml.to_string parsed) + print_string (string_of_bytes metadata) + (*let parsed = Xml.parse_string (string_of_bytes metadata) in + print_string (Xml.to_string parsed)*) let output_info encoding pdf = let getstring = diff --git a/cpdfcommand.ml b/cpdfcommand.ml index c1c5fb6..fc116d9 100644 --- a/cpdfcommand.ml +++ b/cpdfcommand.ml @@ -257,7 +257,8 @@ type args = mutable uprightstamp : bool; mutable labelstyle : Pdfpagelabels.labelstyle option; mutable labelprefix : string option; - mutable labeloffset : int option} + mutable labeloffset : int option; + mutable squeeze : bool} (* List of all filenames in any AND stage - this is used to check that we don't overwrite any input file when -dont-overwrite-existing-files is used. *) @@ -339,7 +340,8 @@ let args = uprightstamp = false; labelstyle = None; labelprefix = None; - labeloffset = None} + labeloffset = None; + squeeze = false} let reset_arguments () = args.op <- None; @@ -412,7 +414,8 @@ let reset_arguments () = args.uprightstamp <- false; args.labelstyle <- None; args.labelprefix <- None; - args.labeloffset <- None + args.labeloffset <- None; + args.squeeze <- false (* We don't reset args.do_ask and args.verbose, because they operate on all parts of the AND-ed command line sent from cpdftk. 
*) @@ -1010,6 +1013,9 @@ let setscalecontents f = args.op <- Some (ScaleContents f); args.position <- Cpdf.Diagonal (* Will be center *) +let setsqueeze () = + args.squeeze <- true + (* Parsing the control file *) let rec getuntilendquote prev = function | [] -> implode (rev prev), [] @@ -1374,6 +1380,9 @@ and specs = ("-remove-duplicate-streams", Arg.Unit setremoveduplicatestreams, ""); + ("-squeeze", + Arg.Unit setsqueeze, + " Slow, lossless compression of a PDF file"); ("-list-bookmarks", Arg.Unit (setop ListBookmarks), " List Bookmarks"); @@ -1756,6 +1765,42 @@ let rec writing_ok outname = else outname +(* Comparison of two PDF objects, given by object number. Stream objects are passed to Pdf.getstream before comparing. *) +let pdfobjeq pdf x y = + let x = Pdf.lookup_obj pdf x + and y = Pdf.lookup_obj pdf y in + begin match x with Pdf.Stream _ -> Pdf.getstream x | _ -> () end; + begin match y with Pdf.Stream _ -> Pdf.getstream y | _ -> () end; + compare x y + +let squeeze pdf = + let objs = ref [] in + Pdf.objiter (fun objnum _ -> objs := objnum :: !objs) pdf; + let toprocess = + keep + (fun x -> length x > 1) + (collate (pdfobjeq pdf) (sort (pdfobjeq pdf) !objs)) + in + Printf.printf "Found %i pools of duplicate objects to coalesce\n" (length toprocess); + (*List.iter + (fun x -> Printf.printf "\n\nPool: "; List.iter (Printf.printf "%i ") x) + toprocess;*) + let pdfr = ref pdf in + let changetable = Hashtbl.create 100 in + iter + (function [] -> assert false | h::t -> + iter (fun e -> Hashtbl.add changetable e h) t) + toprocess; + (* For an unknown reason, the output file is much smaller if + Pdf.renumber is run twice. This is bizarre, since Pdf.renumber is + an old, well-understood function in use for years -- what is + going on?
*) + pdfr := Pdf.renumber changetable !pdfr; + pdfr := Pdf.renumber changetable !pdfr; + pdf.Pdf.root <- !pdfr.Pdf.root; + pdf.Pdf.objects <- !pdfr.Pdf.objects; + pdf.Pdf.trailerdict <- !pdfr.Pdf.trailerdict + let write_pdf mk_id pdf = if args.create_objstm && not args.keepversion then pdf.Pdf.minor <- max pdf.Pdf.minor 5; @@ -1766,6 +1811,7 @@ let write_pdf mk_id pdf = | File outname -> let outname = writing_ok outname in let pdf = Cpdf.recompress_pdf <| nobble pdf in + if args.squeeze then squeeze pdf; Pdf.remove_unreferenced pdf; Pdfwrite.pdf_to_file_options ~preserve_objstm:args.preserve_objstm @@ -1773,6 +1819,7 @@ let write_pdf mk_id pdf = args.linearize None mk_id pdf outname | Stdout -> let pdf = Cpdf.recompress_pdf <| nobble pdf in + if args.squeeze then squeeze pdf; Pdf.remove_unreferenced pdf; Pdfwrite.pdf_to_channel ~preserve_objstm:args.preserve_objstm diff --git a/cpdfmanual.pdf b/cpdfmanual.pdf index 92b0dbc..cabc903 100644 Binary files a/cpdfmanual.pdf and b/cpdfmanual.pdf differ diff --git a/cpdfmanual.tex b/cpdfmanual.tex index d376e37..c628657 100644 --- a/cpdfmanual.tex +++ b/cpdfmanual.tex @@ -30,7 +30,7 @@ Command Line Toolkit} \vspace{12mm} {\Huge User Manual}\\ -Version 1.7 (August 2013) +Version 1.8 (October 2014) \vspace{25mm} @@ -911,8 +911,11 @@ metadata. Add \texttt{-no-encrypt-metadata} to the command line. \vspace{1.5mm} \noindent\verb!cpdf -compress in.pdf -o out.pdf! - \end{framed} - \cpdf\ provides basic facilities for decompressing and compressing PDF streams. + + \vspace{1.5mm} + \noindent\verb!cpdf -squeeze in.pdf -o out.pdf! + \end{framed} + \cpdf\ provides basic facilities for decompressing and compressing PDF streams, and a more advanced PDF \textit{squeezer}. 
\section{Decompressing a Document} \index{decompressing} To decompress the streams in a PDF file, for instance to manually inspect the @@ -930,7 +933,15 @@ PDF, use: \noindent\cpdf\ compresses any streams which have no compression using the \textbf{Flate\-Decode} method, with the exception of Metadata streams, which are left uncompressed. - + + \section{Squeezing a Document} +\index{squeeze} + To \textit{squeeze} a PDF file, use: + \begin{framed} + \small\verb!cpdf -squeeze in.pdf -o out.pdf! + \end{framed} + \noindent Adding \texttt{-squeeze} to the command line will \textit{squeeze} the file upon output. Squeezing applies several lossless techniques which attempt to reduce the file size. The squeezing process is slow. + \chapter{Bookmarks} \begin{framed} \small\noindent\verb!cpdf -list-bookmarks [-utf8 | -raw] in.pdf! diff --git a/empty.pdf b/empty.pdf deleted file mode 100644 index e497ec6..0000000 Binary files a/empty.pdf and /dev/null differ