Added squeezing functionality

This commit is contained in:
John Whitington 2014-09-11 14:05:13 +01:00
parent 0876413f32
commit 42028ff80a
6 changed files with 69 additions and 11 deletions

View File

@ -5,7 +5,7 @@ SOURCES = $(foreach x,$(MODS),$(x).ml $(x).mli) cpdfcommandrun.ml
RESULT = cpdf RESULT = cpdf
ANNOTATE = true ANNOTATE = true
PACKS = camlpdf xml-light PACKS = camlpdf
OCAMLNCFLAGS = -g OCAMLNCFLAGS = -g
OCAMLBCFLAGS = -g OCAMLBCFLAGS = -g

View File

@ -2688,9 +2688,9 @@ let output_xml_info pdf =
match get_metadata pdf with match get_metadata pdf with
None -> () None -> ()
| Some metadata -> | Some metadata ->
print_string (string_of_bytes metadata); print_string (string_of_bytes metadata)
let parsed = Xml.parse_string (string_of_bytes metadata) in (*let parsed = Xml.parse_string (string_of_bytes metadata) in
print_string (Xml.to_string parsed) print_string (Xml.to_string parsed)*)
let output_info encoding pdf = let output_info encoding pdf =
let getstring = let getstring =

View File

@ -257,7 +257,8 @@ type args =
mutable uprightstamp : bool; mutable uprightstamp : bool;
mutable labelstyle : Pdfpagelabels.labelstyle option; mutable labelstyle : Pdfpagelabels.labelstyle option;
mutable labelprefix : string option; mutable labelprefix : string option;
mutable labeloffset : int option} mutable labeloffset : int option;
mutable squeeze : bool}
(* List of all filenames in any AND stage - this is used to check that we don't (* List of all filenames in any AND stage - this is used to check that we don't
overwrite any input file when -dont-overwrite-existing-files is used. *) overwrite any input file when -dont-overwrite-existing-files is used. *)
@ -339,7 +340,8 @@ let args =
uprightstamp = false; uprightstamp = false;
labelstyle = None; labelstyle = None;
labelprefix = None; labelprefix = None;
labeloffset = None} labeloffset = None;
squeeze = false}
let reset_arguments () = let reset_arguments () =
args.op <- None; args.op <- None;
@ -412,7 +414,8 @@ let reset_arguments () =
args.uprightstamp <- false; args.uprightstamp <- false;
args.labelstyle <- None; args.labelstyle <- None;
args.labelprefix <- None; args.labelprefix <- None;
args.labeloffset <- None args.labeloffset <- None;
args.squeeze <- false
(* We don't reset args.do_ask and args.verbose, because they operate on all (* We don't reset args.do_ask and args.verbose, because they operate on all
parts of the AND-ed command line sent from cpdftk. *) parts of the AND-ed command line sent from cpdftk. *)
@ -1010,6 +1013,9 @@ let setscalecontents f =
args.op <- Some (ScaleContents f); args.op <- Some (ScaleContents f);
args.position <- Cpdf.Diagonal (* Will be center *) args.position <- Cpdf.Diagonal (* Will be center *)
(* Handler registered for the -squeeze command line flag: records on the
   shared [args] record that squeezing was requested. *)
let setsqueeze () = args.squeeze <- true
(* Parsing the control file *) (* Parsing the control file *)
let rec getuntilendquote prev = function let rec getuntilendquote prev = function
| [] -> implode (rev prev), [] | [] -> implode (rev prev), []
@ -1374,6 +1380,9 @@ and specs =
("-remove-duplicate-streams", ("-remove-duplicate-streams",
Arg.Unit setremoveduplicatestreams, Arg.Unit setremoveduplicatestreams,
""); "");
("-squeeze",
Arg.Unit setsqueeze,
" Slow, lossless compression of a PDF file");
("-list-bookmarks", ("-list-bookmarks",
Arg.Unit (setop ListBookmarks), Arg.Unit (setop ListBookmarks),
" List Bookmarks"); " List Bookmarks");
@ -1756,6 +1765,42 @@ let rec writing_ok outname =
else else
outname outname
(* Ordering on the objects of [pdf], addressed by object number.
   [pdfobjeq pdf x y] looks up objects number [x] and [y] and returns the
   result of the polymorphic [compare] on them (zero when they are
   structurally equal), so it can serve both as a sorting function and as an
   equality test. Despite the name, the result is an [int], not a [bool].

   Any object which is a [Pdf.Stream] has [Pdf.getstream] called on it before
   the comparison -- presumably so that the stream data is actually loaded
   and takes part in the comparison; confirm against the Pdf module. *)
let pdfobjeq pdf x y =
  let load_if_stream obj =
    match obj with
    | Pdf.Stream _ -> Pdf.getstream obj
    | _ -> ()
  in
  let objx = Pdf.lookup_obj pdf x
  and objy = Pdf.lookup_obj pdf y in
  load_if_stream objx;
  load_if_stream objy;
  compare objx objy
(* Squeeze [pdf] in place by coalescing duplicate objects.

   All object numbers are collected, sorted with [pdfobjeq] and passed to
   [collate] with the same function; every resulting group ("pool") holding
   more than one object number is treated as a set of duplicates. For each
   pool, the first object number is kept and every other member is mapped to
   it in a change table, which is then handed to [Pdf.renumber]. The root,
   objects and trailer dictionary of the renumbered document are written back
   into [pdf]. Returns unit.

   The progress message is written to standard error rather than standard
   output: [write_pdf] also calls this function when the PDF itself is being
   sent to standard output, where a message on stdout would be mixed into
   the PDF data. *)
let squeeze pdf =
  (* Gather every object number in the document. *)
  let objnums = ref [] in
  Pdf.objiter (fun objnum _ -> objnums := objnum :: !objnums) pdf;
  (* Keep only the pools which really contain duplicates. *)
  let pools =
    keep
      (fun pool -> length pool > 1)
      (collate (pdfobjeq pdf) (sort (pdfobjeq pdf) !objnums))
  in
  Printf.eprintf "Found %i pools of duplicate objects to coalesce\n" (length pools);
  (*List.iter
    (fun x -> Printf.eprintf "\n\nPool: "; List.iter (Printf.eprintf "%i ") x)
    pools;*)
  (* Map every duplicate onto the first member of its pool. *)
  let changetable = Hashtbl.create 100 in
  iter
    (function
      | [] -> assert false (* pools have more than one element by construction *)
      | keeper :: duplicates ->
          iter (fun dup -> Hashtbl.add changetable dup keeper) duplicates)
    pools;
  (* For an unknown reason, the output file is much smaller if
     Pdf.renumber is run twice. This is bizarre, since Pdf.renumber is
     an old, well-understood function in use for years -- what is
     going on? *)
  let renumbered = Pdf.renumber changetable (Pdf.renumber changetable pdf) in
  pdf.Pdf.root <- renumbered.Pdf.root;
  pdf.Pdf.objects <- renumbered.Pdf.objects;
  pdf.Pdf.trailerdict <- renumbered.Pdf.trailerdict
let write_pdf mk_id pdf = let write_pdf mk_id pdf =
if args.create_objstm && not args.keepversion if args.create_objstm && not args.keepversion
then pdf.Pdf.minor <- max pdf.Pdf.minor 5; then pdf.Pdf.minor <- max pdf.Pdf.minor 5;
@ -1766,6 +1811,7 @@ let write_pdf mk_id pdf =
| File outname -> | File outname ->
let outname = writing_ok outname in let outname = writing_ok outname in
let pdf = Cpdf.recompress_pdf <| nobble pdf in let pdf = Cpdf.recompress_pdf <| nobble pdf in
if args.squeeze then squeeze pdf;
Pdf.remove_unreferenced pdf; Pdf.remove_unreferenced pdf;
Pdfwrite.pdf_to_file_options Pdfwrite.pdf_to_file_options
~preserve_objstm:args.preserve_objstm ~preserve_objstm:args.preserve_objstm
@ -1773,6 +1819,7 @@ let write_pdf mk_id pdf =
args.linearize None mk_id pdf outname args.linearize None mk_id pdf outname
| Stdout -> | Stdout ->
let pdf = Cpdf.recompress_pdf <| nobble pdf in let pdf = Cpdf.recompress_pdf <| nobble pdf in
if args.squeeze then squeeze pdf;
Pdf.remove_unreferenced pdf; Pdf.remove_unreferenced pdf;
Pdfwrite.pdf_to_channel Pdfwrite.pdf_to_channel
~preserve_objstm:args.preserve_objstm ~preserve_objstm:args.preserve_objstm

Binary file not shown.

View File

@ -30,7 +30,7 @@ Command Line Toolkit}
\vspace{12mm} \vspace{12mm}
{\Huge User Manual}\\ {\Huge User Manual}\\
Version 1.7 (August 2013) Version 1.8 (October 2014)
\vspace{25mm} \vspace{25mm}
@ -911,8 +911,11 @@ metadata. Add \texttt{-no-encrypt-metadata} to the command line.
\vspace{1.5mm} \vspace{1.5mm}
\noindent\verb!cpdf -compress in.pdf -o out.pdf! \noindent\verb!cpdf -compress in.pdf -o out.pdf!
\end{framed}
\cpdf\ provides basic facilities for decompressing and compressing PDF streams. \vspace{1.5mm}
\noindent\verb!cpdf -squeeze in.pdf -o out.pdf!
\end{framed}
\cpdf\ provides basic facilities for decompressing and compressing PDF streams, and a more advanced PDF \textit{squeezer}.
\section{Decompressing a Document} \section{Decompressing a Document}
\index{decompressing} \index{decompressing}
To decompress the streams in a PDF file, for instance to manually inspect the To decompress the streams in a PDF file, for instance to manually inspect the
@ -931,6 +934,14 @@ PDF, use:
\textbf{Flate\-Decode} method, with the exception of Metadata streams, which \textbf{Flate\-Decode} method, with the exception of Metadata streams, which
are left uncompressed. are left uncompressed.
\section{Squeezing a Document}
\index{squeeze}
To \textit{squeeze} a PDF file, use:
\begin{framed}
\small\verb!cpdf -squeeze in.pdf -o out.pdf!
\end{framed}
\noindent Adding \texttt{-squeeze} to the command line will \textit{squeeze} the file upon output. Squeezing applies several lossless techniques which attempt to reduce the file size. Squeezing is slow.
\chapter{Bookmarks} \chapter{Bookmarks}
\begin{framed} \begin{framed}
\small\noindent\verb!cpdf -list-bookmarks [-utf8 | -raw] in.pdf! \small\noindent\verb!cpdf -list-bookmarks [-utf8 | -raw] in.pdf!

BIN
empty.pdf

Binary file not shown.