From ded4c5f22d48785c4b8d32eaaa631bc27deeb4d1 Mon Sep 17 00:00:00 2001 From: John Whitington Date: Tue, 14 Oct 2014 19:36:57 +0100 Subject: [PATCH] Began to excise cpdftk support --- cpdf.ml | 242 ++++------------------------------ cpdf.mli | 4 +- cpdfcommand.ml | 351 +++++++++---------------------------------------- 3 files changed, 89 insertions(+), 508 deletions(-) diff --git a/cpdf.ml b/cpdf.ml index f944357..6f54dbd 100644 --- a/cpdf.ml +++ b/cpdf.ml @@ -251,196 +251,6 @@ let squeeze pdf = with e -> raise (Pdf.PDFError "Squeeze failed. No output written") -(* Printf implementation *) -exception PrintfFailure of string - -type convop = Ca | CA | Cc | Cd | Ce | CE | Cf | Cg | CG | Ci | Cn | Co | Cp | Cs | Cu | Cx | CX | CPercent - -type sizespec = Sll | Sl | SL | Sh | Shh | Sj | Sz | St - -type pformat = - {leftjustify : bool; - padzero : bool; - signalways : bool; - space : bool; - variant : bool; - minwidth : int option; - precision : int option; - sizespec : sizespec option; - convop : convop} - -let string_of_options f = - (if f.leftjustify then "-" else "") ^ - (if f.padzero then "0" else "") ^ - (if f.signalways then "+" else "") ^ - (if f.space then " " else "") ^ - (if f.variant then "#" else "") - -let string_of_minwidth = function - | None -> "" - | Some x -> string_of_int x - -let string_of_precision = function - | None -> "" - | Some x -> "." ^ string_of_int x - -let string_of_sizespec = function - | None -> "" - | Some s-> - match s with - | Sll -> "ll" | Sl -> "l" | SL -> "L" | Sh -> "h" - | Shh -> "hh" | Sj -> "j" | Sz -> "z" | St -> "t" - -let string_of_convop = function - | Ca -> "a" | CA -> "A" | Cc -> "c" | Cd -> "d" - | Ce -> "e" | CE -> "E" | Cf -> "f" | Cg -> "g" - | CG -> "G" | Ci -> "i" | Cn -> "n" | Co -> "o" - | Cp -> "p" | Cs -> "s" | Cu -> "u" | Cx -> "x" - | CX -> "X" | CPercent -> "%" - -let string_of_format f = - "%" ^ - string_of_options f ^ - string_of_minwidth f.minwidth ^ - string_of_precision f.precision ^ - string_of_sizespec f.sizespec ^ - string_of_convop f.convop - -type section = - | Format of pformat - | String of string - -let sec_of_format cs = - (* 1. Read zero or more flags -, +, 0, #, *) - let cs = ref cs in - let lj, pz, sa, sp, va, fin = - ref false, ref false, ref false, ref false, ref false, ref false - in - while not !fin do - match !cs with - | '-'::_ -> set lj; cs := tl !cs - | '+'::_ -> set sa; cs := tl !cs - | '0'::_ -> set pz; cs := tl !cs - | ' '::_ -> set sp; cs := tl !cs - | '#'::_ -> set va; cs := tl !cs - | _ -> set fin - done; - (* 2. Read a possible minimum field width *) - let minwidth = - let fwchars, rest = cleavewhile isdigit !cs in - cs := rest; - if fwchars = [] then None else Some (int_of_string (implode fwchars)) - in - (* 3. Read an optional precision specification *) - let precision = - match !cs with - | '.'::more -> - cs := more; - let pchars, rest = cleavewhile isdigit !cs in - cs := rest; - if pchars = [] then None else Some (int_of_string (implode pchars)) - | _ -> None - in - (* 4. Read an optional size specification *) - let sizespec = - match !cs with - | 'l'::'l'::r -> cs := r; Some Sll - | 'l'::r -> cs := r; Some Sl - | 'L'::r -> cs := r; Some SL - | 'h'::'h'::r -> cs := r; Some Shh - | 'h'::r -> cs := r; Some Sh - | 'j'::r -> cs := r; Some Sj - | 'z'::r -> cs := r; Some Sz - | 't'::r -> cs := r; Some St - | _ -> None - in - (* 5. Read the conversion operation *) - let convop = - match !cs with - | 'a'::r -> cs := r; Ca - | 'A'::r -> cs := r; CA - | 'c'::r -> cs := r; Cc - | 'd'::r -> cs := r; Cd - | 'e'::r -> cs := r; Ce - | 'E'::r -> cs := r; CE - | 'f'::r -> cs := r; Cf - | 'g'::r -> cs := r; Cg - | 'G'::r -> cs := r; CG - | 'i'::r -> cs := r; Ci - | 'n'::r -> cs := r; Cn - | 'o'::r -> cs := r; Co - | 'p'::r -> cs := r; Cp - | 's'::r -> cs := r; Cs - | 'u'::r -> cs := r; Cu - | 'x'::r -> cs := r; Cx - | 'X'::r -> cs := r; CX - | '%'::r -> cs := r; CPercent - | _ -> raise (Failure "sec_of_format") - in - {leftjustify = !lj; - padzero = !pz; - signalways = !sa; - space = !sp; - variant = !va; - minwidth = minwidth; - precision = precision; - sizespec = sizespec; - convop = convop}, - !cs - -let rec sections_of_string_inner secs currstr = function - | '%'::m -> - let sec, rest = sec_of_format m in - if currstr = [] - then sections_of_string_inner (Format sec::secs) currstr rest - else sections_of_string_inner (Format sec::String (implode (rev currstr))::secs) [] rest - | x::xs -> - sections_of_string_inner secs (x::currstr) xs - | [] -> - if currstr = [] then rev secs else rev (String (implode (rev currstr))::secs) - -(* Take a format string, and split it into sections *) -let sections_of_string s = - try - sections_of_string_inner [] [] (explode s) - with - _ -> raise (PrintfFailure "Couldn't parse Printf format") - -(* Substitute an integer into a format, returning the empty string if the format is not suitable. *) - -(* For now, just 'd', 'u', 'i' *) -let sub_int i f = - (*i Printf.printf "Substituting format |%s|\n" (string_of_format f); i*) - let str = string_of_int i - in let padding = if f.padzero then '0' else ' ' in - if f.minwidth <> None && String.length str < unopt f.minwidth then - let padding = many padding (unopt f.minwidth - String.length str) in - if f.leftjustify then str ^ implode padding else implode padding ^ str - else - str - -(* Given a list of integers, substitute into integer formats *) -let rec substitute_inner donesections sections = function - | [] -> rev donesections @ sections - | i::is -> - match sections with - | [] -> rev donesections @ sections - | String s::more -> substitute_inner (String s::donesections) more (i::is) - | Format f::more -> substitute_inner (String (sub_int i f)::donesections) more is - -let substitute x = - try substitute_inner [] x with - _ -> raise (PrintfFailure "Failed to substitute integer") - -(* Flatten a set of sections to a string *) -let string_of_section = function - | String s -> s - | Format f -> string_of_format f - -let string_of_sections sections = - try fold_left ( ^ ) "" (map string_of_section sections) with - _ -> raise (PrintfFailure "Failed to build string from Printf sections") - type encoding = | Raw | UTF8 @@ -1324,28 +1134,22 @@ let process_others marks pdf splitlevel filename sequence startpage endpage s = in implode (procss [] (explode s)) -let name_of_spec printf marks (pdf : Pdf.t) splitlevel spec n filename startpage endpage = - if printf then - let spec = - string_of_sections (substitute (sections_of_string spec) [n]) - in - process_others marks pdf splitlevel filename n startpage endpage spec - else - let fill l n = - let chars = explode (string_of_int n) in - if length chars > l - then implode (drop chars (length chars - l)) - else implode ((many '0' (l - length chars)) @ chars) - in - let chars = explode spec in - let before, including = cleavewhile (neq '%') chars in - let percents, after = cleavewhile (eq '%') including in - if percents = [] - then - process_others marks pdf splitlevel filename n startpage endpage spec - else - process_others marks pdf splitlevel filename n startpage endpage - (implode before ^ fill (length percents) n ^ implode after) +let name_of_spec marks (pdf : Pdf.t) splitlevel spec n filename startpage endpage = + let fill l n = + let chars = explode (string_of_int n) in + if length chars > l + then implode (drop chars (length chars - l)) + else implode ((many '0' (l - length chars)) @ chars) + in + let chars = explode spec in + let before, including = cleavewhile (neq '%') chars in + let percents, after = cleavewhile (eq '%') including in + if percents = [] + then + process_others marks pdf splitlevel filename n startpage endpage spec + else + process_others marks pdf splitlevel filename n startpage endpage + (implode before ^ fill (length percents) n ^ implode after) (* Find the stem of a filename *) let stem s = @@ -1383,22 +1187,22 @@ let really_write_pdf ~preserve_objstm ~create_objstm ?(encryption = None) ?(cpdf raise (Pdf.PDFError "linearizer failed") end -let fast_write_split_pdfs enc printf splitlevel original_filename linearize ?(cpdflin = None) preserve_objstm create_objstm sq nobble spec main_pdf pagenums pdf_pages = +let fast_write_split_pdfs enc splitlevel original_filename linearize ?(cpdflin = None) preserve_objstm create_objstm sq nobble spec main_pdf pagenums pdf_pages = let marks = Pdfmarks.read_bookmarks main_pdf in iter2 (fun number pagenums -> let pdf = nobble (Pdfpage.pdf_of_pages main_pdf pagenums) in let startpage, endpage = extremes pagenums in - let name = name_of_spec printf marks main_pdf splitlevel spec number (stem original_filename) startpage endpage in + let name = name_of_spec marks main_pdf splitlevel spec number (stem original_filename) startpage endpage in Pdf.remove_unreferenced pdf; if sq then squeeze pdf; really_write_pdf ~preserve_objstm ~create_objstm ~encryption:enc linearize (not (enc = None)) pdf name) (indx pagenums) pagenums -let split_pdf enc printf original_filename chunksize linearize ~cpdflin ~preserve_objstm ~create_objstm ~squeeze nobble spec pdf = +let split_pdf enc original_filename chunksize linearize ~cpdflin ~preserve_objstm ~create_objstm ~squeeze nobble spec pdf = let pdf_pages = Pdfpage.pages_of_pagetree pdf in - fast_write_split_pdfs enc printf 0 original_filename linearize preserve_objstm + fast_write_split_pdfs enc 0 original_filename linearize preserve_objstm create_objstm squeeze nobble spec pdf (splitinto chunksize (indx pdf_pages)) pdf_pages (* Return list, in order, a *set* of page numbers of bookmarks at a given level *) @@ -1415,7 +1219,7 @@ let split_at_bookmarks original_filename linearize ~cpdflin ~preserve_objstm ~cr lose (fun x -> x <= 0 || x > Pdfpage.endpage pdf) (map pred points) in let pts = splitat points (indx pdf_pages) in - fast_write_split_pdfs None false level + fast_write_split_pdfs None level original_filename linearize preserve_objstm create_objstm squeeze nobble spec pdf pts pdf_pages (* Called from cpdflib.ml - different from above *) @@ -3045,7 +2849,7 @@ let xmltree_of_bytes b = and data d = D d in Xmlm.input_doc_tree ~el ~data i -let rec string_of_xmltree = function +(*let rec string_of_xmltree = function D d -> Printf.sprintf "DATA **%s**" d | E (tag, trees) -> @@ -3067,7 +2871,7 @@ and string_of_attributes attrs = and string_of_xmltrees trees = fold_left - (fun a b -> a ^ " " ^ b) "" (map string_of_xmltree trees) + (fun a b -> a ^ " " ^ b) "" (map string_of_xmltree trees)*) let rec get_data_for namespace name = function D _ -> None diff --git a/cpdf.mli b/cpdf.mli index b90ad76..cf6e9dc 100644 --- a/cpdf.mli +++ b/cpdf.mli @@ -90,7 +90,7 @@ endpage] makes format substitutions in [spec] to make an output file name: } [printf] is undocumented and should be set to [false]. *) -val name_of_spec : bool -> Pdfmarks.t list -> Pdf.t -> int -> string -> int -> string -> int -> int -> string +val name_of_spec : Pdfmarks.t list -> Pdf.t -> int -> string -> int -> string -> int -> int -> string (** {2 Compress and Decompress} *) @@ -220,7 +220,7 @@ each output is linearized. If [preserve_objstm] is true, object streams will be used if the input file had them. If [create_objstm] is true, object streams will be created in any event. [printf] and [nobble] are undocumented and should be false. *) -val split_pdf : Pdfwrite.encryption option -> bool -> string -> int -> bool -> cpdflin:string option -> +val split_pdf : Pdfwrite.encryption option -> string -> int -> bool -> cpdflin:string option -> preserve_objstm:bool -> create_objstm:bool -> squeeze:bool -> (Pdf.t -> Pdf.t) -> string -> Pdf.t -> unit (** {2 Listing fonts} *) diff --git a/cpdfcommand.ml b/cpdfcommand.ml index cadc6e1..4ab5efe 100644 --- a/cpdfcommand.ml +++ b/cpdfcommand.ml @@ -8,6 +8,8 @@ let version_date = "(unreleased, 16th September 2014)" open Pdfutil open Pdfio +let initial_file_size = ref 0 + (* Wrap up the file reading functions to exit with code 1 when an encryption problem occurs. This happens when object streams are in an encrypted document and so it can't be read without the right password... The existing error @@ -144,8 +146,6 @@ type op = | ExtractImages | ImageResolution of float | MissingFonts - | DumpData - | UpdateInfo of string | RemoveUnusedResources | ExtractFontFile | ExtractText @@ -461,25 +461,9 @@ let rec decrypt_if_necessary (a, b, c, user_pw, owner_pw) op pdf = | Some pdf, permissions -> if operation_allowed permissions op then pdf - else if args.do_ask - then decrypt_if_necessary_ask (a, b, c, user_pw, owner_pw) op pdf - else soft_error "User password cannot give permission for this operation" + else soft_error "User password cannot give permission for this operation" | _ -> - if args.do_ask - then decrypt_if_necessary_ask (a, b, c, user_pw, owner_pw) op pdf - else soft_error "Failed to decrypt file: wrong password?" - -and decrypt_if_necessary_ask (a, b, c, user_pw, owner_pw) op pdf = - let name = match a with InFile x -> x | StdIn -> "Standard input" | AlreadyInMemory _ -> "PDF" in - flprint "The password supplied for input PDF:\n"; - flprint (" " ^ name); - flprint "\n did not work. The PDF is encrypted, so you must supply the\n"; - flprint " owner password to open it. To quit, enter a blank password\n"; - flprint "Please enter the password to use on the input PDF:\n"; - flprint (" " ^ name ^ ".\n"); - match Pervasives.read_line () with - | "" -> soft_error "Failed to decrypt file: wrong password?" - | x -> decrypt_if_necessary (a, b, c, user_pw, x) op pdf + soft_error "Failed to decrypt file: wrong password?" let nobble pdf = if not demo then pdf else @@ -523,20 +507,10 @@ let setmethod s = | _ -> error "Unsupported encryption method" let setowner s = - match s with - | "PROMPT" -> - flprint "Enter owner password to use on the output PDF.\n"; - args.owner <- Pervasives.read_line () - | s -> - args.owner <- s + args.owner <- s let setuser s = - match s with - | "PROMPT" -> - flprint "Enter user password to use on the output PDF.\n"; - args.user <- Pervasives.read_line () - | s -> - args.user <- s + args.user <- s let anon_fun s = try @@ -872,14 +846,6 @@ let setstampunder f = setop (StampUnder f) (); if args.position = Cpdf.TopLeft 100. then args.position <- Cpdf.BottomLeft 0. -let setstamponmulti f = - setop (StampOn f) (); - args.ismulti <- true - -let setstampundermulti f = - setop (StampUnder f) (); - args.ismulti <- true - let setcombinepages f = setop (CombinePages f) () @@ -1150,39 +1116,12 @@ let setkeepthisid () = | (InFile s, _, _, _, _)::_ -> args.keep_this_id <- Some s | _ -> () -let setupdateinfo s = - args.op <- Some (UpdateInfo s) - let setdoask () = args.do_ask <- true let setverbose () = args.verbose <- true -let promptinputs () = - flprint "Please enter a filename for an input PDF:\n"; - set_input (Pervasives.read_line ()) - -let promptinputpasswords () = - flprint "Please enter the open password to use on the input PDF:\n "; - match args.inputs with - | (InFile s, b, c, d, _)::more -> - flprint s; - flprint ".\n It can be empty, or have a maximum of 32 characters:\n"; - let pw = Pervasives.read_line () in - args.inputs <- (InFile s, b, c, d, pw)::more - | _ -> () - -let promptoutput () = - flprint "Please enter a name for the output:\n"; - args.out <- File (Pervasives.read_line ()) - -let setdontoverwriteexistingfiles () = - args.dont_overwrite_existing_files <- true - -let setdontoverwriteinputs () = - args.dont_overwrite_inputs <- true - let setmakenewid () = args.makenewid <- true @@ -1736,7 +1675,7 @@ and specs = Arg.Int setlabelstartval, " Set label start value (default 1)"); (* These items are for cpdftk *) - ("-update-info", Arg.String setupdateinfo, ""); + (*("-update-info", Arg.String setupdateinfo, ""); ("-printf-format", Arg.Unit setprintfformat, ""); ("-dump-data", Arg.Unit (setop DumpData), ""); @@ -1753,7 +1692,7 @@ and specs = ("-remove-unused-resources", Arg.Unit (setop RemoveUnusedResources), ""); ("-stamp-under-multi", Arg.String setstampundermulti, ""); ("-stamp-on-multi", Arg.String setstamponmulti, ""); - ("-list-annotations-more", Arg.Unit (setop ListAnnotationsMore), ""); + ("-list-annotations-more", Arg.Unit (setop ListAnnotationsMore), "");*) (*These items are undocumented *) ("-extract-fontfile", Arg.Unit (setop ExtractFontFile), ""); ("-extract-images", Arg.Unit setextractimages, ""); @@ -1780,20 +1719,14 @@ or \"1-6,9-end\" or \"even\" or \"odd\" or \"reverse\".\n\nOperations (See \ manual for full details):\n" (* Reading and writing *) -let rec writing_ok outname = - if args.dont_overwrite_inputs && mem outname !all_inputs then - error ("Error: The output filename: " ^ outname ^"\n is the same as an input filename.\n"); - if args.dont_overwrite_existing_files && Sys.file_exists outname then - begin - flprint ("Output file: " ^ outname ^ " already exists. Overwrite? (y/n)\n"); - match explode (Pervasives.read_line ()) with - | ('y' | 'Y')::_ -> outname - | _ -> - flprint "Enter a name for the output:\n"; - writing_ok (Pervasives.read_line ()) - end - else - outname +let filesize name = + try + let x = open_in_bin name in + let r = in_channel_length x in + close_in x; + r + with + _ -> 0 let really_write_pdf ?(encryption = None) mk_id pdf outname = let outname' = @@ -1805,20 +1738,28 @@ let really_write_pdf ?(encryption = None) mk_id pdf outname = ~preserve_objstm:args.preserve_objstm ~generate_objstm:args.create_objstm false encryption mk_id pdf outname'; - if args.linearize then - let cpdflin = - match Cpdf.find_cpdflin args.cpdflin with - Some x -> x - | None -> raise (Pdf.PDFError "Could not find cpdflin") - in - let best_password = if args.owner <> "" then args.owner else args.user in - let code = Cpdf.call_cpdflin cpdflin outname' outname best_password in - begin try Sys.remove outname' with _ -> () end; - if code > 0 then - begin - begin try Sys.remove outname with _ -> () end; - raise (Pdf.PDFError "linearizer failed") - end + begin + if args.linearize then + let cpdflin = + match Cpdf.find_cpdflin args.cpdflin with + Some x -> x + | None -> raise (Pdf.PDFError "Could not find cpdflin") + in + let best_password = if args.owner <> "" then args.owner else args.user in + let code = Cpdf.call_cpdflin cpdflin outname' outname best_password in + begin try Sys.remove outname' with _ -> () end; + if code > 0 then + begin + begin try Sys.remove outname with _ -> () end; + raise (Pdf.PDFError "linearizer failed") + end + end; + if args.squeeze then + let s = filesize outname in + Printf.printf + "Final file size is %i bytes, %.2f%% of original.\n" + s + ((float s /. float !initial_file_size) *. 100.) let write_pdf ?(encryption = None) ?(is_decompress=false) mk_id pdf = if args.create_objstm && not args.keepversion @@ -1828,20 +1769,19 @@ let write_pdf ?(encryption = None) ?(is_decompress=false) mk_id pdf = | NoOutputSpecified -> output_pdfs =| pdf | File outname -> - let outname = writing_ok outname in - begin match encryption with - None -> - ignore (nobble pdf); - if not is_decompress then - begin - ignore (Cpdf.recompress_pdf pdf); - if args.squeeze then Cpdf.squeeze pdf; - Pdf.remove_unreferenced pdf - end; - really_write_pdf mk_id pdf outname - | Some _ -> - really_write_pdf ~encryption mk_id pdf outname - end + begin match encryption with + None -> + ignore (nobble pdf); + if not is_decompress then + begin + ignore (Cpdf.recompress_pdf pdf); + if args.squeeze then Cpdf.squeeze pdf; + Pdf.remove_unreferenced pdf + end; + really_write_pdf mk_id pdf outname + | Some _ -> + really_write_pdf ~encryption mk_id pdf outname + end | Stdout -> let temp = Filename.temp_file "cpdflin" ".pdf" in begin match encryption with @@ -1880,6 +1820,8 @@ let pdf_of_stdin user_pw owner_pw = let get_single_pdf op read_lazy = match args.inputs with | (InFile inname, _, _, u, o) as input::_ -> + if args.squeeze then + Printf.printf "Initial file size is %i bytes\n" (filesize inname); let pdf = if read_lazy then pdfread_pdf_of_channel_lazy (optstring u) (optstring o) (open_in_bin inname) @@ -1905,6 +1847,12 @@ file once *) let get_pdf_from_input_kind ((_, _, _, u, o) as input) op = function | AlreadyInMemory pdf -> pdf | InFile s -> + if args.squeeze then + begin + let size = filesize s in + initial_file_size := size; + Printf.printf "Initial file size is %i bytes\n" size + end; begin try Hashtbl.find filenames s with Not_found -> let pdf = decrypt_if_necessary input op (pdfread_pdf_of_file (optstring u) (optstring o) s) in @@ -2047,7 +1995,7 @@ let extract_images pdf range stem = if images <> [] then (let names = map - (function n -> let r = Cpdf.name_of_spec false [] (*FIXME *) pdf 0 stem n "" 0 0 in (*i flprint r; flprint "\n"; i*) r) + (function n -> let r = Cpdf.name_of_spec [] (*FIXME *) pdf 0 stem n "" 0 0 in (*i flprint r; flprint "\n"; i*) r) (ilist 1 (length images)) in iter2 (write_image pdf page.Pdfpage.resources) names images)) @@ -2492,170 +2440,6 @@ let dump_attached_files pdf out = with _ -> error "Couldn't dump attached files" -(* Prerotate a pdf *) -let prerotate_pdf pdf r = - let setto angle = Cpdf.rotate_pdf angle pdf (ilist 1 (Pdfpage.endpage pdf)) - and setby angle = Cpdf.rotate_pdf_by angle pdf (ilist 1 (Pdfpage.endpage pdf)) in - match r with - | Pdfmerge.DNR -> pdf - | Pdfmerge.N -> setto 0 - | Pdfmerge.S -> setto 180 - | Pdfmerge.E -> setto 90 - | Pdfmerge.W -> setto 270 - | Pdfmerge.L -> setby ~-90 - | Pdfmerge.R -> setby 90 - | Pdfmerge.D -> setby 180 - -(* Convert from unicode or PDFDocencoded to ASCII string with HTML entities in it. *) -let html_of_unicode s = - implode - (flatten - (map - (function 60 -> explode "<" - | 62 -> explode ">" - | 38 -> explode "&" - | 34 -> explode """ - | x when x >= 0x20 && x <= 0x7e -> [char_of_int x] - | x -> ['&';'#'] @ explode (string_of_int x) @ [';']) - (Pdftext.codepoints_of_pdfdocstring s))) - -(* Convert from HTML entities to a PDF string which is unicode-encoded (if there are any non-ASCII chars, or PDFDocEncoded if there aren't) . *) -let unicode_of_html s = - let rec codepoints_of_html ps = function - | '&'::'l'::'t'::';'::r -> codepoints_of_html (60::ps) r - | '&'::'g'::'t'::';'::r -> codepoints_of_html (62::ps) r - | '&'::'a'::'m'::'p'::';'::r -> codepoints_of_html (38::ps) r - | '&'::'q'::'u'::'o'::'t'::';'::r -> codepoints_of_html (34::ps) r - | '&'::'#'::r -> - begin match cleavewhile (function '0'..'9' -> true | _ -> false) r with - | [], r -> codepoints_of_html ps r - | cs, (';'::r) -> - let i = try int_of_string (implode cs) with _ -> error "bad HTML literal in update_info" in - codepoints_of_html (i::ps) r - | _ -> error "bad HTML literal in update_info 2" - end - | x::r when int_of_char x >= 0x20 && int_of_char x <= 0x7e -> codepoints_of_html (int_of_char x::ps) r - | _::r -> codepoints_of_html ps r - | [] -> rev ps - in - Pdftext.pdfdocstring_of_codepoints (codepoints_of_html [] (explode s)) - -let dump_data pdf out = - let channel = - match out with - | NoOutputSpecified -> stdout - | Stdout -> stdout - | File f -> open_out_bin f - in - let prs s = Pervasives.output_string channel s in - (* 1. Info keys *) - begin match Pdf.lookup_direct pdf "/Info" pdf.Pdf.trailerdict with - | Some (Pdf.Dictionary d) -> - iter - (function (name, pdfobj) -> - match pdfobj with - | Pdf.String s -> - begin match s with "" -> () | _ -> - begin match explode name with - | [] -> () - | h::t -> prs (Printf.sprintf "InfoKey: %s\nInfoValue: %s\n" (implode t) (html_of_unicode s)) - end - end - | _ -> ()) - d - | _ -> flprint "Warning: no info dictionary found\n"; () - end; - let hex s = - fold_left ( ^ ) "" (map (Printf.sprintf "%02x") (map int_of_char (explode s))) - in - (* 2. IDs *) - begin match Pdf.lookup_direct pdf "/ID" pdf.Pdf.trailerdict with - | Some (Pdf.Array [Pdf.String s; Pdf.String t]) -> prs (Printf.sprintf "PdfID0: %s\nPdfID1: %s\n" (hex s) (hex t)) - | _ -> () - end; - (* 3. No of pages *) - prs (Printf.sprintf "NumberOfPages: %i\n" (Pdfpage.endpage pdf)); - (* 4. Outlines *) - iter - (function m -> - prs (Printf.sprintf "BookmarkTitle: %s\n" (html_of_unicode m.Pdfmarks.text)); - prs (Printf.sprintf "BookmarkLevel: %i\n" (m.Pdfmarks.level + 1)); - prs (Printf.sprintf "BookmarkPageNumber: %i\n" (Pdfpage.pagenumber_of_target pdf m.Pdfmarks.target))) - (Pdfmarks.read_bookmarks pdf); - (* 5. Close and finish *) - match out with File _ -> close_out channel | _ -> flush stdout - -(* Parse and update info *) -let update_info pdf source = - let channel = - match source with - | "use-stdin" -> stdin - | x -> open_in_bin x - in - let rec read_lines prev channel = - try read_lines (input_line channel::prev) channel with End_of_file -> rev prev - in - let lines = read_lines [] channel in - let kvpairs = - map - (function l -> let k, v = cleavewhile (neq ':') (explode l) in implode k, implode (tail_no_fail (tail_no_fail v))) - lines - in - (*i iter - (function (k, v) -> Printf.printf "(%s,%s)\n" k v) - kvpairs; i*) - (* Split into 1) info keys / values 2) PdfIDs, Bookmarks *) - let infolines = - keep (function (("InfoKey" | "InfoValue"), _) -> true | _ -> false) kvpairs; - and pdfidlines = - keep (function (("PdfID0" | "PdfID1"), _) -> true | _ -> false) kvpairs - and bookmarklines = - keep (function (("BookmarkTitle" | "BookmarkLevel" | "BookmarkPageNumber"), _) -> true | _ -> false) kvpairs - in - (* 1. Add/Replace info keys *) - let kvpairs = - map - (function [(_, k); (_, v)] -> k, v | _ -> error "Mismatched info Key/Value pairs") - (splitinto 2 infolines) - in - let pdf = - {pdf with Pdf.trailerdict = - Pdf.add_dict_entry pdf.Pdf.trailerdict "/Info" - (Pdf.Dictionary - (fold_left - (fun d (k, v) -> add k v d) - (match Pdf.lookup_direct pdf "/Info" pdf.Pdf.trailerdict with | Some (Pdf.Dictionary d) -> d | _ -> []) - (map (function (k, v) -> "/" ^ k, Pdf.String (unicode_of_html v)) kvpairs)))} - in - (* 2. Add/Replace PDF Id *) - let pdf = - let unhex s = - match Pdfread.lex_hexstring (Pdfio.input_of_string ("<" ^ s ^ ">")) with - | Pdfgenlex.LexString s -> s - | _ -> error "PDFId wrongly formed in update_info file" - in - match pdfidlines with - | ["PdfID0", a; "PdfID1", b] -> - {pdf with Pdf.trailerdict = - Pdf.add_dict_entry pdf.Pdf.trailerdict "/ID" (Pdf.Array [Pdf.String (unhex a); Pdf.String (unhex b)])} - | _ -> pdf - in - (* 3. Replace Bookmarks *) - let marks = - map - (function - | [("BookmarkTitle", a); ("BookmarkLevel", b); ("BookmarkPageNumber", c)] -> - {Pdfmarks.level = int_of_string b - 1; - Pdfmarks.text = unicode_of_html a; - Pdfmarks.target = Pdfpage.target_of_pagenumber pdf (int_of_string c); - Pdfmarks.isopen = false} - | _ -> error "Bookmark entries malformed in update_info file") - (splitinto 3 bookmarklines) - in - let pdf = Pdfmarks.add_bookmarks marks pdf in - begin match source with "use-stdin" -> () | _ -> close_in channel end; - pdf - (* If pages in stamp < pages in main, extend stamp by repeating its last page. If pages in stamp more, chop stamp *) let equalize_pages_extend main stamp = let length_stamp = Pdfpage.endpage stamp @@ -2817,10 +2601,9 @@ let go () = input file, and we're just extracting pages, might we use a lazy read? *) if hd ranges <> "all" || hd rotations <> Pdfmerge.DNR || !Pdfpage.flat_pagetrees then - let pdf = if hd rotations <> Pdfmerge.DNR then prerotate_pdf pdf (hd rotations) else pdf in - let range = parse_pagespec pdf (hd ranges) in - let newpdf = Pdfpage.pdf_of_pages ~retain_numbering:args.retain_numbering pdf range in - write_pdf false newpdf + let range = parse_pagespec pdf (hd ranges) in + let newpdf = Pdfpage.pdf_of_pages ~retain_numbering:args.retain_numbering pdf range in + write_pdf false newpdf else write_pdf false pdf | _ -> @@ -3125,7 +2908,7 @@ let go () = Pdfwrite.permissions = banlist_of_args ()} in Cpdf.split_pdf - enc args.printf_format args.original_filename args.chunksize args.linearize args.cpdflin + enc args.original_filename args.chunksize args.linearize args.cpdflin args.preserve_objstm args.preserve_objstm (*yes--always create if preserving *) args.squeeze nobble output_spec pdf | _, Stdout -> error "Can't split to standard output" @@ -3438,12 +3221,6 @@ let go () = | _ -> Printf.eprintf "CSP3: Too many input files or input not a file" end - | Some DumpData -> - let pdf = get_single_pdf args.op true in - dump_data pdf args.out - | Some (UpdateInfo source) -> - let pdf = get_single_pdf args.op false in - write_pdf false (update_info pdf source) | Some ExtractText -> let pdf = get_single_pdf args.op true in let range = parse_pagespec pdf (get_pagespec ()) in