Began to excise cpdftk support

This commit is contained in:
John Whitington 2014-10-14 19:36:57 +01:00
parent f42cb56ea0
commit ded4c5f22d
3 changed files with 89 additions and 508 deletions

212
cpdf.ml
View File

@ -251,196 +251,6 @@ let squeeze pdf =
with with
e -> raise (Pdf.PDFError "Squeeze failed. No output written") e -> raise (Pdf.PDFError "Squeeze failed. No output written")
(* Printf implementation *)
exception PrintfFailure of string
type convop = Ca | CA | Cc | Cd | Ce | CE | Cf | Cg | CG | Ci | Cn | Co | Cp | Cs | Cu | Cx | CX | CPercent
type sizespec = Sll | Sl | SL | Sh | Shh | Sj | Sz | St
type pformat =
{leftjustify : bool;
padzero : bool;
signalways : bool;
space : bool;
variant : bool;
minwidth : int option;
precision : int option;
sizespec : sizespec option;
convop : convop}
let string_of_options f =
(if f.leftjustify then "-" else "") ^
(if f.padzero then "0" else "") ^
(if f.signalways then "+" else "") ^
(if f.space then " " else "") ^
(if f.variant then "#" else "")
let string_of_minwidth = function
| None -> ""
| Some x -> string_of_int x
let string_of_precision = function
| None -> ""
| Some x -> "." ^ string_of_int x
let string_of_sizespec = function
| None -> ""
| Some s->
match s with
| Sll -> "ll" | Sl -> "l" | SL -> "L" | Sh -> "h"
| Shh -> "hh" | Sj -> "j" | Sz -> "z" | St -> "t"
let string_of_convop = function
| Ca -> "a" | CA -> "A" | Cc -> "c" | Cd -> "d"
| Ce -> "e" | CE -> "E" | Cf -> "f" | Cg -> "g"
| CG -> "G" | Ci -> "i" | Cn -> "n" | Co -> "o"
| Cp -> "p" | Cs -> "s" | Cu -> "u" | Cx -> "x"
| CX -> "X" | CPercent -> "%"
let string_of_format f =
"%" ^
string_of_options f ^
string_of_minwidth f.minwidth ^
string_of_precision f.precision ^
string_of_sizespec f.sizespec ^
string_of_convop f.convop
type section =
| Format of pformat
| String of string
let sec_of_format cs =
(* 1. Read zero or more flags -, +, 0, #, <space> *)
let cs = ref cs in
let lj, pz, sa, sp, va, fin =
ref false, ref false, ref false, ref false, ref false, ref false
in
while not !fin do
match !cs with
| '-'::_ -> set lj; cs := tl !cs
| '+'::_ -> set sa; cs := tl !cs
| '0'::_ -> set pz; cs := tl !cs
| ' '::_ -> set sp; cs := tl !cs
| '#'::_ -> set va; cs := tl !cs
| _ -> set fin
done;
(* 2. Read a possible minimum field width *)
let minwidth =
let fwchars, rest = cleavewhile isdigit !cs in
cs := rest;
if fwchars = [] then None else Some (int_of_string (implode fwchars))
in
(* 3. Read an optional precision specification *)
let precision =
match !cs with
| '.'::more ->
cs := more;
let pchars, rest = cleavewhile isdigit !cs in
cs := rest;
if pchars = [] then None else Some (int_of_string (implode pchars))
| _ -> None
in
(* 4. Read an optional size specification *)
let sizespec =
match !cs with
| 'l'::'l'::r -> cs := r; Some Sll
| 'l'::r -> cs := r; Some Sl
| 'L'::r -> cs := r; Some SL
| 'h'::'h'::r -> cs := r; Some Shh
| 'h'::r -> cs := r; Some Sh
| 'j'::r -> cs := r; Some Sj
| 'z'::r -> cs := r; Some Sz
| 't'::r -> cs := r; Some St
| _ -> None
in
(* 5. Read the conversion operation *)
let convop =
match !cs with
| 'a'::r -> cs := r; Ca
| 'A'::r -> cs := r; CA
| 'c'::r -> cs := r; Cc
| 'd'::r -> cs := r; Cd
| 'e'::r -> cs := r; Ce
| 'E'::r -> cs := r; CE
| 'f'::r -> cs := r; Cf
| 'g'::r -> cs := r; Cg
| 'G'::r -> cs := r; CG
| 'i'::r -> cs := r; Ci
| 'n'::r -> cs := r; Cn
| 'o'::r -> cs := r; Co
| 'p'::r -> cs := r; Cp
| 's'::r -> cs := r; Cs
| 'u'::r -> cs := r; Cu
| 'x'::r -> cs := r; Cx
| 'X'::r -> cs := r; CX
| '%'::r -> cs := r; CPercent
| _ -> raise (Failure "sec_of_format")
in
{leftjustify = !lj;
padzero = !pz;
signalways = !sa;
space = !sp;
variant = !va;
minwidth = minwidth;
precision = precision;
sizespec = sizespec;
convop = convop},
!cs
let rec sections_of_string_inner secs currstr = function
| '%'::m ->
let sec, rest = sec_of_format m in
if currstr = []
then sections_of_string_inner (Format sec::secs) currstr rest
else sections_of_string_inner (Format sec::String (implode (rev currstr))::secs) [] rest
| x::xs ->
sections_of_string_inner secs (x::currstr) xs
| [] ->
if currstr = [] then rev secs else rev (String (implode (rev currstr))::secs)
(* Take a format string, and split it into sections *)
let sections_of_string s =
try
sections_of_string_inner [] [] (explode s)
with
_ -> raise (PrintfFailure "Couldn't parse Printf format")
(* Substitute an integer into a format, returning the empty string if the format is not suitable. *)
(* For now, just 'd', 'u', 'i' *)
let sub_int i f =
(*i Printf.printf "Substituting format |%s|\n" (string_of_format f); i*)
let str = string_of_int i
in let padding = if f.padzero then '0' else ' ' in
if f.minwidth <> None && String.length str < unopt f.minwidth then
let padding = many padding (unopt f.minwidth - String.length str) in
if f.leftjustify then str ^ implode padding else implode padding ^ str
else
str
(* Given a list of integers, substitute into integer formats *)
let rec substitute_inner donesections sections = function
| [] -> rev donesections @ sections
| i::is ->
match sections with
| [] -> rev donesections @ sections
| String s::more -> substitute_inner (String s::donesections) more (i::is)
| Format f::more -> substitute_inner (String (sub_int i f)::donesections) more is
let substitute x =
try substitute_inner [] x with
_ -> raise (PrintfFailure "Failed to substitute integer")
(* Flatten a set of sections to a string *)
let string_of_section = function
| String s -> s
| Format f -> string_of_format f
let string_of_sections sections =
try fold_left ( ^ ) "" (map string_of_section sections) with
_ -> raise (PrintfFailure "Failed to build string from Printf sections")
type encoding = type encoding =
| Raw | Raw
| UTF8 | UTF8
@ -1324,13 +1134,7 @@ let process_others marks pdf splitlevel filename sequence startpage endpage s =
in in
implode (procss [] (explode s)) implode (procss [] (explode s))
let name_of_spec printf marks (pdf : Pdf.t) splitlevel spec n filename startpage endpage = let name_of_spec marks (pdf : Pdf.t) splitlevel spec n filename startpage endpage =
if printf then
let spec =
string_of_sections (substitute (sections_of_string spec) [n])
in
process_others marks pdf splitlevel filename n startpage endpage spec
else
let fill l n = let fill l n =
let chars = explode (string_of_int n) in let chars = explode (string_of_int n) in
if length chars > l if length chars > l
@ -1383,22 +1187,22 @@ let really_write_pdf ~preserve_objstm ~create_objstm ?(encryption = None) ?(cpdf
raise (Pdf.PDFError "linearizer failed") raise (Pdf.PDFError "linearizer failed")
end end
let fast_write_split_pdfs enc printf splitlevel original_filename linearize ?(cpdflin = None) preserve_objstm create_objstm sq nobble spec main_pdf pagenums pdf_pages = let fast_write_split_pdfs enc splitlevel original_filename linearize ?(cpdflin = None) preserve_objstm create_objstm sq nobble spec main_pdf pagenums pdf_pages =
let marks = Pdfmarks.read_bookmarks main_pdf in let marks = Pdfmarks.read_bookmarks main_pdf in
iter2 iter2
(fun number pagenums -> (fun number pagenums ->
let pdf = nobble (Pdfpage.pdf_of_pages main_pdf pagenums) in let pdf = nobble (Pdfpage.pdf_of_pages main_pdf pagenums) in
let startpage, endpage = extremes pagenums in let startpage, endpage = extremes pagenums in
let name = name_of_spec printf marks main_pdf splitlevel spec number (stem original_filename) startpage endpage in let name = name_of_spec marks main_pdf splitlevel spec number (stem original_filename) startpage endpage in
Pdf.remove_unreferenced pdf; Pdf.remove_unreferenced pdf;
if sq then squeeze pdf; if sq then squeeze pdf;
really_write_pdf ~preserve_objstm ~create_objstm ~encryption:enc linearize (not (enc = None)) pdf name) really_write_pdf ~preserve_objstm ~create_objstm ~encryption:enc linearize (not (enc = None)) pdf name)
(indx pagenums) (indx pagenums)
pagenums pagenums
let split_pdf enc printf original_filename chunksize linearize ~cpdflin ~preserve_objstm ~create_objstm ~squeeze nobble spec pdf = let split_pdf enc original_filename chunksize linearize ~cpdflin ~preserve_objstm ~create_objstm ~squeeze nobble spec pdf =
let pdf_pages = Pdfpage.pages_of_pagetree pdf in let pdf_pages = Pdfpage.pages_of_pagetree pdf in
fast_write_split_pdfs enc printf 0 original_filename linearize preserve_objstm fast_write_split_pdfs enc 0 original_filename linearize preserve_objstm
create_objstm squeeze nobble spec pdf (splitinto chunksize (indx pdf_pages)) pdf_pages create_objstm squeeze nobble spec pdf (splitinto chunksize (indx pdf_pages)) pdf_pages
(* Return list, in order, a *set* of page numbers of bookmarks at a given level *) (* Return list, in order, a *set* of page numbers of bookmarks at a given level *)
@ -1415,7 +1219,7 @@ let split_at_bookmarks original_filename linearize ~cpdflin ~preserve_objstm ~cr
lose (fun x -> x <= 0 || x > Pdfpage.endpage pdf) (map pred points) lose (fun x -> x <= 0 || x > Pdfpage.endpage pdf) (map pred points)
in in
let pts = splitat points (indx pdf_pages) in let pts = splitat points (indx pdf_pages) in
fast_write_split_pdfs None false level fast_write_split_pdfs None level
original_filename linearize preserve_objstm create_objstm squeeze nobble spec pdf pts pdf_pages original_filename linearize preserve_objstm create_objstm squeeze nobble spec pdf pts pdf_pages
(* Called from cpdflib.ml - different from above *) (* Called from cpdflib.ml - different from above *)
@ -3045,7 +2849,7 @@ let xmltree_of_bytes b =
and data d = D d in and data d = D d in
Xmlm.input_doc_tree ~el ~data i Xmlm.input_doc_tree ~el ~data i
let rec string_of_xmltree = function (*let rec string_of_xmltree = function
D d -> D d ->
Printf.sprintf "DATA **%s**" d Printf.sprintf "DATA **%s**" d
| E (tag, trees) -> | E (tag, trees) ->
@ -3067,7 +2871,7 @@ and string_of_attributes attrs =
and string_of_xmltrees trees = and string_of_xmltrees trees =
fold_left fold_left
(fun a b -> a ^ " " ^ b) "" (map string_of_xmltree trees) (fun a b -> a ^ " " ^ b) "" (map string_of_xmltree trees)*)
let rec get_data_for namespace name = function let rec get_data_for namespace name = function
D _ -> None D _ -> None

View File

@ -90,7 +90,7 @@ endpage] makes format substitutions in [spec] to make an output file name:
} }
[printf] is undocumented and should be set to [false]. [printf] is undocumented and should be set to [false].
*) *)
val name_of_spec : bool -> Pdfmarks.t list -> Pdf.t -> int -> string -> int -> string -> int -> int -> string val name_of_spec : Pdfmarks.t list -> Pdf.t -> int -> string -> int -> string -> int -> int -> string
(** {2 Compress and Decompress} *) (** {2 Compress and Decompress} *)
@ -220,7 +220,7 @@ each output is linearized. If [preserve_objstm] is true, object streams will
be used if the input file had them. If [create_objstm] is true, object be used if the input file had them. If [create_objstm] is true, object
streams will be created in any event. [printf] and [nobble] are streams will be created in any event. [printf] and [nobble] are
undocumented and should be false. *) undocumented and should be false. *)
val split_pdf : Pdfwrite.encryption option -> bool -> string -> int -> bool -> cpdflin:string option -> val split_pdf : Pdfwrite.encryption option -> string -> int -> bool -> cpdflin:string option ->
preserve_objstm:bool -> create_objstm:bool -> squeeze:bool -> (Pdf.t -> Pdf.t) -> string -> Pdf.t -> unit preserve_objstm:bool -> create_objstm:bool -> squeeze:bool -> (Pdf.t -> Pdf.t) -> string -> Pdf.t -> unit
(** {2 Listing fonts} *) (** {2 Listing fonts} *)

View File

@ -8,6 +8,8 @@ let version_date = "(unreleased, 16th September 2014)"
open Pdfutil open Pdfutil
open Pdfio open Pdfio
let initial_file_size = ref 0
(* Wrap up the file reading functions to exit with code 1 when an encryption (* Wrap up the file reading functions to exit with code 1 when an encryption
problem occurs. This happens when object streams are in an encrypted document problem occurs. This happens when object streams are in an encrypted document
and so it can't be read without the right password... The existing error and so it can't be read without the right password... The existing error
@ -144,8 +146,6 @@ type op =
| ExtractImages | ExtractImages
| ImageResolution of float | ImageResolution of float
| MissingFonts | MissingFonts
| DumpData
| UpdateInfo of string
| RemoveUnusedResources | RemoveUnusedResources
| ExtractFontFile | ExtractFontFile
| ExtractText | ExtractText
@ -461,25 +461,9 @@ let rec decrypt_if_necessary (a, b, c, user_pw, owner_pw) op pdf =
| Some pdf, permissions -> | Some pdf, permissions ->
if operation_allowed permissions op if operation_allowed permissions op
then pdf then pdf
else if args.do_ask
then decrypt_if_necessary_ask (a, b, c, user_pw, owner_pw) op pdf
else soft_error "User password cannot give permission for this operation" else soft_error "User password cannot give permission for this operation"
| _ -> | _ ->
if args.do_ask soft_error "Failed to decrypt file: wrong password?"
then decrypt_if_necessary_ask (a, b, c, user_pw, owner_pw) op pdf
else soft_error "Failed to decrypt file: wrong password?"
and decrypt_if_necessary_ask (a, b, c, user_pw, owner_pw) op pdf =
let name = match a with InFile x -> x | StdIn -> "Standard input" | AlreadyInMemory _ -> "PDF" in
flprint "The password supplied for input PDF:\n";
flprint (" " ^ name);
flprint "\n did not work. The PDF is encrypted, so you must supply the\n";
flprint " owner password to open it. To quit, enter a blank password\n";
flprint "Please enter the password to use on the input PDF:\n";
flprint (" " ^ name ^ ".\n");
match Pervasives.read_line () with
| "" -> soft_error "Failed to decrypt file: wrong password?"
| x -> decrypt_if_necessary (a, b, c, user_pw, x) op pdf
let nobble pdf = let nobble pdf =
if not demo then pdf else if not demo then pdf else
@ -523,19 +507,9 @@ let setmethod s =
| _ -> error "Unsupported encryption method" | _ -> error "Unsupported encryption method"
let setowner s = let setowner s =
match s with
| "PROMPT" ->
flprint "Enter owner password to use on the output PDF.\n";
args.owner <- Pervasives.read_line ()
| s ->
args.owner <- s args.owner <- s
let setuser s = let setuser s =
match s with
| "PROMPT" ->
flprint "Enter user password to use on the output PDF.\n";
args.user <- Pervasives.read_line ()
| s ->
args.user <- s args.user <- s
let anon_fun s = let anon_fun s =
@ -872,14 +846,6 @@ let setstampunder f =
setop (StampUnder f) (); setop (StampUnder f) ();
if args.position = Cpdf.TopLeft 100. then args.position <- Cpdf.BottomLeft 0. if args.position = Cpdf.TopLeft 100. then args.position <- Cpdf.BottomLeft 0.
let setstamponmulti f =
setop (StampOn f) ();
args.ismulti <- true
let setstampundermulti f =
setop (StampUnder f) ();
args.ismulti <- true
let setcombinepages f = let setcombinepages f =
setop (CombinePages f) () setop (CombinePages f) ()
@ -1150,39 +1116,12 @@ let setkeepthisid () =
| (InFile s, _, _, _, _)::_ -> args.keep_this_id <- Some s | (InFile s, _, _, _, _)::_ -> args.keep_this_id <- Some s
| _ -> () | _ -> ()
let setupdateinfo s =
args.op <- Some (UpdateInfo s)
let setdoask () = let setdoask () =
args.do_ask <- true args.do_ask <- true
let setverbose () = let setverbose () =
args.verbose <- true args.verbose <- true
let promptinputs () =
flprint "Please enter a filename for an input PDF:\n";
set_input (Pervasives.read_line ())
let promptinputpasswords () =
flprint "Please enter the open password to use on the input PDF:\n ";
match args.inputs with
| (InFile s, b, c, d, _)::more ->
flprint s;
flprint ".\n It can be empty, or have a maximum of 32 characters:\n";
let pw = Pervasives.read_line () in
args.inputs <- (InFile s, b, c, d, pw)::more
| _ -> ()
let promptoutput () =
flprint "Please enter a name for the output:\n";
args.out <- File (Pervasives.read_line ())
let setdontoverwriteexistingfiles () =
args.dont_overwrite_existing_files <- true
let setdontoverwriteinputs () =
args.dont_overwrite_inputs <- true
let setmakenewid () = let setmakenewid () =
args.makenewid <- true args.makenewid <- true
@ -1736,7 +1675,7 @@ and specs =
Arg.Int setlabelstartval, Arg.Int setlabelstartval,
" Set label start value (default 1)"); " Set label start value (default 1)");
(* These items are for cpdftk *) (* These items are for cpdftk *)
("-update-info", Arg.String setupdateinfo, ""); (*("-update-info", Arg.String setupdateinfo, "");
("-printf-format", Arg.Unit setprintfformat, ""); ("-printf-format", Arg.Unit setprintfformat, "");
("-dump-data", Arg.Unit (setop DumpData), ""); ("-dump-data", Arg.Unit (setop DumpData), "");
@ -1753,7 +1692,7 @@ and specs =
("-remove-unused-resources", Arg.Unit (setop RemoveUnusedResources), ""); ("-remove-unused-resources", Arg.Unit (setop RemoveUnusedResources), "");
("-stamp-under-multi", Arg.String setstampundermulti, ""); ("-stamp-under-multi", Arg.String setstampundermulti, "");
("-stamp-on-multi", Arg.String setstamponmulti, ""); ("-stamp-on-multi", Arg.String setstamponmulti, "");
("-list-annotations-more", Arg.Unit (setop ListAnnotationsMore), ""); ("-list-annotations-more", Arg.Unit (setop ListAnnotationsMore), "");*)
(*These items are undocumented *) (*These items are undocumented *)
("-extract-fontfile", Arg.Unit (setop ExtractFontFile), ""); ("-extract-fontfile", Arg.Unit (setop ExtractFontFile), "");
("-extract-images", Arg.Unit setextractimages, ""); ("-extract-images", Arg.Unit setextractimages, "");
@ -1780,20 +1719,14 @@ or \"1-6,9-end\" or \"even\" or \"odd\" or \"reverse\".\n\nOperations (See \
manual for full details):\n" manual for full details):\n"
(* Reading and writing *) (* Reading and writing *)
let rec writing_ok outname = let filesize name =
if args.dont_overwrite_inputs && mem outname !all_inputs then try
error ("Error: The output filename: " ^ outname ^"\n is the same as an input filename.\n"); let x = open_in_bin name in
if args.dont_overwrite_existing_files && Sys.file_exists outname then let r = in_channel_length x in
begin close_in x;
flprint ("Output file: " ^ outname ^ " already exists. Overwrite? (y/n)\n"); r
match explode (Pervasives.read_line ()) with with
| ('y' | 'Y')::_ -> outname _ -> 0
| _ ->
flprint "Enter a name for the output:\n";
writing_ok (Pervasives.read_line ())
end
else
outname
let really_write_pdf ?(encryption = None) mk_id pdf outname = let really_write_pdf ?(encryption = None) mk_id pdf outname =
let outname' = let outname' =
@ -1805,6 +1738,7 @@ let really_write_pdf ?(encryption = None) mk_id pdf outname =
~preserve_objstm:args.preserve_objstm ~preserve_objstm:args.preserve_objstm
~generate_objstm:args.create_objstm ~generate_objstm:args.create_objstm
false encryption mk_id pdf outname'; false encryption mk_id pdf outname';
begin
if args.linearize then if args.linearize then
let cpdflin = let cpdflin =
match Cpdf.find_cpdflin args.cpdflin with match Cpdf.find_cpdflin args.cpdflin with
@ -1819,6 +1753,13 @@ let really_write_pdf ?(encryption = None) mk_id pdf outname =
begin try Sys.remove outname with _ -> () end; begin try Sys.remove outname with _ -> () end;
raise (Pdf.PDFError "linearizer failed") raise (Pdf.PDFError "linearizer failed")
end end
end;
if args.squeeze then
let s = filesize outname in
Printf.printf
"Final file size is %i bytes, %.2f%% of original.\n"
s
((float s /. float !initial_file_size) *. 100.)
let write_pdf ?(encryption = None) ?(is_decompress=false) mk_id pdf = let write_pdf ?(encryption = None) ?(is_decompress=false) mk_id pdf =
if args.create_objstm && not args.keepversion if args.create_objstm && not args.keepversion
@ -1828,7 +1769,6 @@ let write_pdf ?(encryption = None) ?(is_decompress=false) mk_id pdf =
| NoOutputSpecified -> | NoOutputSpecified ->
output_pdfs =| pdf output_pdfs =| pdf
| File outname -> | File outname ->
let outname = writing_ok outname in
begin match encryption with begin match encryption with
None -> None ->
ignore (nobble pdf); ignore (nobble pdf);
@ -1880,6 +1820,8 @@ let pdf_of_stdin user_pw owner_pw =
let get_single_pdf op read_lazy = let get_single_pdf op read_lazy =
match args.inputs with match args.inputs with
| (InFile inname, _, _, u, o) as input::_ -> | (InFile inname, _, _, u, o) as input::_ ->
if args.squeeze then
Printf.printf "Initial file size is %i bytes\n" (filesize inname);
let pdf = let pdf =
if read_lazy then if read_lazy then
pdfread_pdf_of_channel_lazy (optstring u) (optstring o) (open_in_bin inname) pdfread_pdf_of_channel_lazy (optstring u) (optstring o) (open_in_bin inname)
@ -1905,6 +1847,12 @@ file once *)
let get_pdf_from_input_kind ((_, _, _, u, o) as input) op = function let get_pdf_from_input_kind ((_, _, _, u, o) as input) op = function
| AlreadyInMemory pdf -> pdf | AlreadyInMemory pdf -> pdf
| InFile s -> | InFile s ->
if args.squeeze then
begin
let size = filesize s in
initial_file_size := size;
Printf.printf "Initial file size is %i bytes\n" size
end;
begin try Hashtbl.find filenames s with begin try Hashtbl.find filenames s with
Not_found -> Not_found ->
let pdf = decrypt_if_necessary input op (pdfread_pdf_of_file (optstring u) (optstring o) s) in let pdf = decrypt_if_necessary input op (pdfread_pdf_of_file (optstring u) (optstring o) s) in
@ -2047,7 +1995,7 @@ let extract_images pdf range stem =
if images <> [] then if images <> [] then
(let names = (let names =
map map
(function n -> let r = Cpdf.name_of_spec false [] (*FIXME *) pdf 0 stem n "" 0 0 in (*i flprint r; flprint "\n"; i*) r) (function n -> let r = Cpdf.name_of_spec [] (*FIXME *) pdf 0 stem n "" 0 0 in (*i flprint r; flprint "\n"; i*) r)
(ilist 1 (length images)) (ilist 1 (length images))
in in
iter2 (write_image pdf page.Pdfpage.resources) names images)) iter2 (write_image pdf page.Pdfpage.resources) names images))
@ -2492,170 +2440,6 @@ let dump_attached_files pdf out =
with with
_ -> error "Couldn't dump attached files" _ -> error "Couldn't dump attached files"
(* Prerotate a pdf *)
let prerotate_pdf pdf r =
let setto angle = Cpdf.rotate_pdf angle pdf (ilist 1 (Pdfpage.endpage pdf))
and setby angle = Cpdf.rotate_pdf_by angle pdf (ilist 1 (Pdfpage.endpage pdf)) in
match r with
| Pdfmerge.DNR -> pdf
| Pdfmerge.N -> setto 0
| Pdfmerge.S -> setto 180
| Pdfmerge.E -> setto 90
| Pdfmerge.W -> setto 270
| Pdfmerge.L -> setby ~-90
| Pdfmerge.R -> setby 90
| Pdfmerge.D -> setby 180
(* Convert from unicode or PDFDocencoded to ASCII string with HTML entities in it. *)
let html_of_unicode s =
implode
(flatten
(map
(function 60 -> explode "&lt;"
| 62 -> explode "&gt;"
| 38 -> explode "&amp;"
| 34 -> explode "&quot;"
| x when x >= 0x20 && x <= 0x7e -> [char_of_int x]
| x -> ['&';'#'] @ explode (string_of_int x) @ [';'])
(Pdftext.codepoints_of_pdfdocstring s)))
(* Convert from HTML entities to a PDF string which is unicode-encoded (if there are any non-ASCII chars, or PDFDocEncoded if there aren't) . *)
let unicode_of_html s =
let rec codepoints_of_html ps = function
| '&'::'l'::'t'::';'::r -> codepoints_of_html (60::ps) r
| '&'::'g'::'t'::';'::r -> codepoints_of_html (62::ps) r
| '&'::'a'::'m'::'p'::';'::r -> codepoints_of_html (38::ps) r
| '&'::'q'::'u'::'o'::'t'::';'::r -> codepoints_of_html (34::ps) r
| '&'::'#'::r ->
begin match cleavewhile (function '0'..'9' -> true | _ -> false) r with
| [], r -> codepoints_of_html ps r
| cs, (';'::r) ->
let i = try int_of_string (implode cs) with _ -> error "bad HTML literal in update_info" in
codepoints_of_html (i::ps) r
| _ -> error "bad HTML literal in update_info 2"
end
| x::r when int_of_char x >= 0x20 && int_of_char x <= 0x7e -> codepoints_of_html (int_of_char x::ps) r
| _::r -> codepoints_of_html ps r
| [] -> rev ps
in
Pdftext.pdfdocstring_of_codepoints (codepoints_of_html [] (explode s))
let dump_data pdf out =
let channel =
match out with
| NoOutputSpecified -> stdout
| Stdout -> stdout
| File f -> open_out_bin f
in
let prs s = Pervasives.output_string channel s in
(* 1. Info keys *)
begin match Pdf.lookup_direct pdf "/Info" pdf.Pdf.trailerdict with
| Some (Pdf.Dictionary d) ->
iter
(function (name, pdfobj) ->
match pdfobj with
| Pdf.String s ->
begin match s with "" -> () | _ ->
begin match explode name with
| [] -> ()
| h::t -> prs (Printf.sprintf "InfoKey: %s\nInfoValue: %s\n" (implode t) (html_of_unicode s))
end
end
| _ -> ())
d
| _ -> flprint "Warning: no info dictionary found\n"; ()
end;
let hex s =
fold_left ( ^ ) "" (map (Printf.sprintf "%02x") (map int_of_char (explode s)))
in
(* 2. IDs *)
begin match Pdf.lookup_direct pdf "/ID" pdf.Pdf.trailerdict with
| Some (Pdf.Array [Pdf.String s; Pdf.String t]) -> prs (Printf.sprintf "PdfID0: %s\nPdfID1: %s\n" (hex s) (hex t))
| _ -> ()
end;
(* 3. No of pages *)
prs (Printf.sprintf "NumberOfPages: %i\n" (Pdfpage.endpage pdf));
(* 4. Outlines *)
iter
(function m ->
prs (Printf.sprintf "BookmarkTitle: %s\n" (html_of_unicode m.Pdfmarks.text));
prs (Printf.sprintf "BookmarkLevel: %i\n" (m.Pdfmarks.level + 1));
prs (Printf.sprintf "BookmarkPageNumber: %i\n" (Pdfpage.pagenumber_of_target pdf m.Pdfmarks.target)))
(Pdfmarks.read_bookmarks pdf);
(* 5. Close and finish *)
match out with File _ -> close_out channel | _ -> flush stdout
(* Parse and update info *)
let update_info pdf source =
let channel =
match source with
| "use-stdin" -> stdin
| x -> open_in_bin x
in
let rec read_lines prev channel =
try read_lines (input_line channel::prev) channel with End_of_file -> rev prev
in
let lines = read_lines [] channel in
let kvpairs =
map
(function l -> let k, v = cleavewhile (neq ':') (explode l) in implode k, implode (tail_no_fail (tail_no_fail v)))
lines
in
(*i iter
(function (k, v) -> Printf.printf "(%s,%s)\n" k v)
kvpairs; i*)
(* Split into 1) info keys / values 2) PdfIDs, Bookmarks *)
let infolines =
keep (function (("InfoKey" | "InfoValue"), _) -> true | _ -> false) kvpairs;
and pdfidlines =
keep (function (("PdfID0" | "PdfID1"), _) -> true | _ -> false) kvpairs
and bookmarklines =
keep (function (("BookmarkTitle" | "BookmarkLevel" | "BookmarkPageNumber"), _) -> true | _ -> false) kvpairs
in
(* 1. Add/Replace info keys *)
let kvpairs =
map
(function [(_, k); (_, v)] -> k, v | _ -> error "Mismatched info Key/Value pairs")
(splitinto 2 infolines)
in
let pdf =
{pdf with Pdf.trailerdict =
Pdf.add_dict_entry pdf.Pdf.trailerdict "/Info"
(Pdf.Dictionary
(fold_left
(fun d (k, v) -> add k v d)
(match Pdf.lookup_direct pdf "/Info" pdf.Pdf.trailerdict with | Some (Pdf.Dictionary d) -> d | _ -> [])
(map (function (k, v) -> "/" ^ k, Pdf.String (unicode_of_html v)) kvpairs)))}
in
(* 2. Add/Replace PDF Id *)
let pdf =
let unhex s =
match Pdfread.lex_hexstring (Pdfio.input_of_string ("<" ^ s ^ ">")) with
| Pdfgenlex.LexString s -> s
| _ -> error "PDFId wrongly formed in update_info file"
in
match pdfidlines with
| ["PdfID0", a; "PdfID1", b] ->
{pdf with Pdf.trailerdict =
Pdf.add_dict_entry pdf.Pdf.trailerdict "/ID" (Pdf.Array [Pdf.String (unhex a); Pdf.String (unhex b)])}
| _ -> pdf
in
(* 3. Replace Bookmarks *)
let marks =
map
(function
| [("BookmarkTitle", a); ("BookmarkLevel", b); ("BookmarkPageNumber", c)] ->
{Pdfmarks.level = int_of_string b - 1;
Pdfmarks.text = unicode_of_html a;
Pdfmarks.target = Pdfpage.target_of_pagenumber pdf (int_of_string c);
Pdfmarks.isopen = false}
| _ -> error "Bookmark entries malformed in update_info file")
(splitinto 3 bookmarklines)
in
let pdf = Pdfmarks.add_bookmarks marks pdf in
begin match source with "use-stdin" -> () | _ -> close_in channel end;
pdf
(* If pages in stamp < pages in main, extend stamp by repeating its last page. If pages in stamp more, chop stamp *) (* If pages in stamp < pages in main, extend stamp by repeating its last page. If pages in stamp more, chop stamp *)
let equalize_pages_extend main stamp = let equalize_pages_extend main stamp =
let length_stamp = Pdfpage.endpage stamp let length_stamp = Pdfpage.endpage stamp
@ -2817,7 +2601,6 @@ let go () =
input file, and we're just extracting pages, might we use a input file, and we're just extracting pages, might we use a
lazy read? *) lazy read? *)
if hd ranges <> "all" || hd rotations <> Pdfmerge.DNR || !Pdfpage.flat_pagetrees then if hd ranges <> "all" || hd rotations <> Pdfmerge.DNR || !Pdfpage.flat_pagetrees then
let pdf = if hd rotations <> Pdfmerge.DNR then prerotate_pdf pdf (hd rotations) else pdf in
let range = parse_pagespec pdf (hd ranges) in let range = parse_pagespec pdf (hd ranges) in
let newpdf = Pdfpage.pdf_of_pages ~retain_numbering:args.retain_numbering pdf range in let newpdf = Pdfpage.pdf_of_pages ~retain_numbering:args.retain_numbering pdf range in
write_pdf false newpdf write_pdf false newpdf
@ -3125,7 +2908,7 @@ let go () =
Pdfwrite.permissions = banlist_of_args ()} Pdfwrite.permissions = banlist_of_args ()}
in in
Cpdf.split_pdf Cpdf.split_pdf
enc args.printf_format args.original_filename args.chunksize args.linearize args.cpdflin enc args.original_filename args.chunksize args.linearize args.cpdflin
args.preserve_objstm args.preserve_objstm (*yes--always create if preserving *) args.preserve_objstm args.preserve_objstm (*yes--always create if preserving *)
args.squeeze nobble output_spec pdf args.squeeze nobble output_spec pdf
| _, Stdout -> error "Can't split to standard output" | _, Stdout -> error "Can't split to standard output"
@ -3438,12 +3221,6 @@ let go () =
| _ -> | _ ->
Printf.eprintf "CSP3: Too many input files or input not a file" Printf.eprintf "CSP3: Too many input files or input not a file"
end end
| Some DumpData ->
let pdf = get_single_pdf args.op true in
dump_data pdf args.out
| Some (UpdateInfo source) ->
let pdf = get_single_pdf args.op false in
write_pdf false (update_info pdf source)
| Some ExtractText -> | Some ExtractText ->
let pdf = get_single_pdf args.op true in let pdf = get_single_pdf args.op true in
let range = parse_pagespec pdf (get_pagespec ()) in let range = parse_pagespec pdf (get_pagespec ()) in