Added squeezing functionality
This commit is contained in:
parent
0876413f32
commit
42028ff80a
2
Makefile
2
Makefile
|
@ -5,7 +5,7 @@ SOURCES = $(foreach x,$(MODS),$(x).ml $(x).mli) cpdfcommandrun.ml
|
||||||
|
|
||||||
RESULT = cpdf
|
RESULT = cpdf
|
||||||
ANNOTATE = true
|
ANNOTATE = true
|
||||||
PACKS = camlpdf xml-light
|
PACKS = camlpdf
|
||||||
|
|
||||||
OCAMLNCFLAGS = -g
|
OCAMLNCFLAGS = -g
|
||||||
OCAMLBCFLAGS = -g
|
OCAMLBCFLAGS = -g
|
||||||
|
|
6
cpdf.ml
6
cpdf.ml
|
@ -2688,9 +2688,9 @@ let output_xml_info pdf =
|
||||||
match get_metadata pdf with
|
match get_metadata pdf with
|
||||||
None -> ()
|
None -> ()
|
||||||
| Some metadata ->
|
| Some metadata ->
|
||||||
print_string (string_of_bytes metadata);
|
print_string (string_of_bytes metadata)
|
||||||
let parsed = Xml.parse_string (string_of_bytes metadata) in
|
(*let parsed = Xml.parse_string (string_of_bytes metadata) in
|
||||||
print_string (Xml.to_string parsed)
|
print_string (Xml.to_string parsed)*)
|
||||||
|
|
||||||
let output_info encoding pdf =
|
let output_info encoding pdf =
|
||||||
let getstring =
|
let getstring =
|
||||||
|
|
|
@ -257,7 +257,8 @@ type args =
|
||||||
mutable uprightstamp : bool;
|
mutable uprightstamp : bool;
|
||||||
mutable labelstyle : Pdfpagelabels.labelstyle option;
|
mutable labelstyle : Pdfpagelabels.labelstyle option;
|
||||||
mutable labelprefix : string option;
|
mutable labelprefix : string option;
|
||||||
mutable labeloffset : int option}
|
mutable labeloffset : int option;
|
||||||
|
mutable squeeze : bool}
|
||||||
|
|
||||||
(* List of all filenames in any AND stage - this is used to check that we don't
|
(* List of all filenames in any AND stage - this is used to check that we don't
|
||||||
overwrite any input file when -dont-overwrite-existing-files is used. *)
|
overwrite any input file when -dont-overwrite-existing-files is used. *)
|
||||||
|
@ -339,7 +340,8 @@ let args =
|
||||||
uprightstamp = false;
|
uprightstamp = false;
|
||||||
labelstyle = None;
|
labelstyle = None;
|
||||||
labelprefix = None;
|
labelprefix = None;
|
||||||
labeloffset = None}
|
labeloffset = None;
|
||||||
|
squeeze = false}
|
||||||
|
|
||||||
let reset_arguments () =
|
let reset_arguments () =
|
||||||
args.op <- None;
|
args.op <- None;
|
||||||
|
@ -412,7 +414,8 @@ let reset_arguments () =
|
||||||
args.uprightstamp <- false;
|
args.uprightstamp <- false;
|
||||||
args.labelstyle <- None;
|
args.labelstyle <- None;
|
||||||
args.labelprefix <- None;
|
args.labelprefix <- None;
|
||||||
args.labeloffset <- None
|
args.labeloffset <- None;
|
||||||
|
args.squeeze <- false
|
||||||
(* We don't reset args.do_ask and args.verbose, because they operate on all
|
(* We don't reset args.do_ask and args.verbose, because they operate on all
|
||||||
parts of the AND-ed command line sent from cpdftk. *)
|
parts of the AND-ed command line sent from cpdftk. *)
|
||||||
|
|
||||||
|
@ -1010,6 +1013,9 @@ let setscalecontents f =
|
||||||
args.op <- Some (ScaleContents f);
|
args.op <- Some (ScaleContents f);
|
||||||
args.position <- Cpdf.Diagonal (* Will be center *)
|
args.position <- Cpdf.Diagonal (* Will be center *)
|
||||||
|
|
||||||
|
(* Command-line handler: record that squeezing has been requested by setting
   the squeeze flag in the global argument record. *)
let setsqueeze () = args.squeeze <- true
|
||||||
|
|
||||||
(* Parsing the control file *)
|
(* Parsing the control file *)
|
||||||
let rec getuntilendquote prev = function
|
let rec getuntilendquote prev = function
|
||||||
| [] -> implode (rev prev), []
|
| [] -> implode (rev prev), []
|
||||||
|
@ -1374,6 +1380,9 @@ and specs =
|
||||||
("-remove-duplicate-streams",
|
("-remove-duplicate-streams",
|
||||||
Arg.Unit setremoveduplicatestreams,
|
Arg.Unit setremoveduplicatestreams,
|
||||||
"");
|
"");
|
||||||
|
("-squeeze",
|
||||||
|
Arg.Unit setsqueeze,
|
||||||
|
" Slow, lossless compression of a PDF file");
|
||||||
("-list-bookmarks",
|
("-list-bookmarks",
|
||||||
Arg.Unit (setop ListBookmarks),
|
Arg.Unit (setop ListBookmarks),
|
||||||
" List Bookmarks");
|
" List Bookmarks");
|
||||||
|
@ -1756,6 +1765,42 @@ let rec writing_ok outname =
|
||||||
else
|
else
|
||||||
outname
|
outname
|
||||||
|
|
||||||
|
(* Comparison function on PDF objects, identified by object number. Both
   objects are looked up in [pdf]; any which is a stream is first passed to
   Pdf.getstream (presumably so that the stream data is loaded and takes part
   in the comparison -- confirm against camlpdf). The result is that of the
   polymorphic [compare], so zero means the two objects are identical. *)
let pdfobjeq pdf x y =
  let prepare obj =
    match obj with
    | Pdf.Stream _ -> Pdf.getstream obj
    | _ -> ()
  in
  let x = Pdf.lookup_obj pdf x
  and y = Pdf.lookup_obj pdf y in
  prepare x;
  prepare y;
  compare x y
|
||||||
|
|
||||||
|
(* Squeeze [pdf] in place by coalescing duplicate objects.

   All object numbers are collected, sorted and collated with [pdfobjeq], and
   every pool of more than one identical object is kept. For each pool a
   change table entry maps every member other than the first to the first,
   and the table is handed to Pdf.renumber. The root, objects and trailer
   dictionary of the renumbered document are then written back into [pdf].

   The diagnostic is written to stderr rather than stdout, because write_pdf
   calls this function immediately before emitting the PDF, and the PDF itself
   may be going to stdout. *)
let squeeze pdf =
  let objs = ref [] in
  Pdf.objiter (fun objnum _ -> objs := objnum :: !objs) pdf;
  (* Hoisted so the comparator closure is built only once. *)
  let order = pdfobjeq pdf in
  let toprocess =
    keep
      (fun pool -> length pool > 1)
      (collate order (sort order !objs))
  in
  Printf.eprintf "Found %i pools of duplicate objects to coalesce\n" (length toprocess);
  (*List.iter
    (fun x -> Printf.eprintf "\n\nPool: "; List.iter (Printf.eprintf "%i ") x)
    toprocess;*)
  let changetable = Hashtbl.create 100 in
  iter
    (function
      | [] -> assert false (* pools always hold at least two members *)
      | h::t -> iter (fun e -> Hashtbl.add changetable e h) t)
    toprocess;
  (* For an unknown reason, the output file is much smaller if
     Pdf.renumber is run twice. This is bizarre, since Pdf.renumber is
     an old, well-understood function in use for years -- what is
     going on? *)
  let once = Pdf.renumber changetable pdf in
  let twice = Pdf.renumber changetable once in
  pdf.Pdf.root <- twice.Pdf.root;
  pdf.Pdf.objects <- twice.Pdf.objects;
  pdf.Pdf.trailerdict <- twice.Pdf.trailerdict
|
||||||
|
|
||||||
let write_pdf mk_id pdf =
|
let write_pdf mk_id pdf =
|
||||||
if args.create_objstm && not args.keepversion
|
if args.create_objstm && not args.keepversion
|
||||||
then pdf.Pdf.minor <- max pdf.Pdf.minor 5;
|
then pdf.Pdf.minor <- max pdf.Pdf.minor 5;
|
||||||
|
@ -1766,6 +1811,7 @@ let write_pdf mk_id pdf =
|
||||||
| File outname ->
|
| File outname ->
|
||||||
let outname = writing_ok outname in
|
let outname = writing_ok outname in
|
||||||
let pdf = Cpdf.recompress_pdf <| nobble pdf in
|
let pdf = Cpdf.recompress_pdf <| nobble pdf in
|
||||||
|
if args.squeeze then squeeze pdf;
|
||||||
Pdf.remove_unreferenced pdf;
|
Pdf.remove_unreferenced pdf;
|
||||||
Pdfwrite.pdf_to_file_options
|
Pdfwrite.pdf_to_file_options
|
||||||
~preserve_objstm:args.preserve_objstm
|
~preserve_objstm:args.preserve_objstm
|
||||||
|
@ -1773,6 +1819,7 @@ let write_pdf mk_id pdf =
|
||||||
args.linearize None mk_id pdf outname
|
args.linearize None mk_id pdf outname
|
||||||
| Stdout ->
|
| Stdout ->
|
||||||
let pdf = Cpdf.recompress_pdf <| nobble pdf in
|
let pdf = Cpdf.recompress_pdf <| nobble pdf in
|
||||||
|
if args.squeeze then squeeze pdf;
|
||||||
Pdf.remove_unreferenced pdf;
|
Pdf.remove_unreferenced pdf;
|
||||||
Pdfwrite.pdf_to_channel
|
Pdfwrite.pdf_to_channel
|
||||||
~preserve_objstm:args.preserve_objstm
|
~preserve_objstm:args.preserve_objstm
|
||||||
|
|
BIN
cpdfmanual.pdf
BIN
cpdfmanual.pdf
Binary file not shown.
|
@ -30,7 +30,7 @@ Command Line Toolkit}
|
||||||
\vspace{12mm}
|
\vspace{12mm}
|
||||||
|
|
||||||
{\Huge User Manual}\\
|
{\Huge User Manual}\\
|
||||||
Version 1.7 (August 2013)
|
Version 1.8 (October 2014)
|
||||||
|
|
||||||
\vspace{25mm}
|
\vspace{25mm}
|
||||||
|
|
||||||
|
@ -911,8 +911,11 @@ metadata. Add \texttt{-no-encrypt-metadata} to the command line.
|
||||||
|
|
||||||
\vspace{1.5mm}
|
\vspace{1.5mm}
|
||||||
\noindent\verb!cpdf -compress in.pdf -o out.pdf!
|
\noindent\verb!cpdf -compress in.pdf -o out.pdf!
|
||||||
|
|
||||||
|
\vspace{1.5mm}
|
||||||
|
\noindent\verb!cpdf -squeeze in.pdf -o out.pdf!
|
||||||
\end{framed}
|
\end{framed}
|
||||||
\cpdf\ provides basic facilities for decompressing and compressing PDF streams.
|
\cpdf\ provides basic facilities for decompressing and compressing PDF streams, and a more advanced PDF \textit{squeezer}.
|
||||||
\section{Decompressing a Document}
|
\section{Decompressing a Document}
|
||||||
\index{decompressing}
|
\index{decompressing}
|
||||||
To decompress the streams in a PDF file, for instance to manually inspect the
|
To decompress the streams in a PDF file, for instance to manually inspect the
|
||||||
|
@ -931,6 +934,14 @@ PDF, use:
|
||||||
\textbf{Flate\-Decode} method, with the exception of Metadata streams, which
|
\textbf{Flate\-Decode} method, with the exception of Metadata streams, which
|
||||||
are left uncompressed.
|
are left uncompressed.
|
||||||
|
|
||||||
|
\section{Squeezing a Document}
|
||||||
|
\index{squeeze}
|
||||||
|
To \textit{squeeze} a PDF file, use:
|
||||||
|
\begin{framed}
|
||||||
|
\small\verb!cpdf -squeeze in.pdf -o out.pdf!
|
||||||
|
\end{framed}
|
||||||
|
\noindent Adding \texttt{-squeeze} to the command line will \textit{squeeze} the file upon output. Squeezing applies several lossless techniques which attempt to reduce the file size. The squeezing process is slow.
|
||||||
|
|
||||||
\chapter{Bookmarks}
|
\chapter{Bookmarks}
|
||||||
\begin{framed}
|
\begin{framed}
|
||||||
\small\noindent\verb!cpdf -list-bookmarks [-utf8 | -raw] in.pdf!
|
\small\noindent\verb!cpdf -list-bookmarks [-utf8 | -raw] in.pdf!
|
||||||
|
|
Loading…
Reference in New Issue