From b908e5f57d038c1cdb65a8c9e53445deef8211db Mon Sep 17 00:00:00 2001 From: John Whitington Date: Mon, 15 Nov 2021 09:30:11 -0800 Subject: [PATCH] more --- cpdf.ml | 208 +++++++++++++++++++++++++++++++++++++++++++++++ cpdf.mli | 7 ++ cpdfcommand.ml | 216 +------------------------------------------------ 3 files changed, 219 insertions(+), 212 deletions(-) diff --git a/cpdf.ml b/cpdf.ml index e3931f9..3e967aa 100644 --- a/cpdf.ml +++ b/cpdf.ml @@ -4649,3 +4649,211 @@ let create_pdf pages pagesize = in let pdf, pageroot = Pdfpage.add_pagetree (many page pages) (Pdf.empty ()) in Pdfpage.add_root pageroot [] pdf + +(* Remove characters which might not make good filenames. *) +let remove_unsafe_characters encoding s = + if encoding = Raw then s else + let chars = + lose + (function x -> + match x with + '/' | '?' | '<' | '>' | '\\' | ':' | '*' | '|' | '\"' | '^' | '+' | '=' -> true + | x when int_of_char x < 32 || (int_of_char x > 126 && encoding <> Stripped) -> true + | _ -> false) + (explode s) + in + match chars with + | '.'::more -> implode more + | chars -> implode chars + +let get_bookmark_name encoding pdf marks splitlevel n _ = + let refnums = Pdf.page_reference_numbers pdf in + let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in + match keep (function m -> n = Pdfpage.pagenumber_of_target ~fastrefnums pdf m.Pdfmarks.target && m.Pdfmarks.level <= splitlevel) marks with + | {Pdfmarks.text = title}::_ -> remove_unsafe_characters encoding title + | _ -> "" + +(* @F means filename without extension *) +(* @N means sequence number with no padding *) +(* @S means start page of this section *) +(* @E means end page of this section *) +(* @B means bookmark name at start page *) +let process_others encoding marks pdf splitlevel filename sequence startpage endpage s = + let rec find_ats p = function + '@'::r -> find_ats (p + 1) r + | r -> (p, r) + in + let string_of_int_width w i = + if w < 0 then raise (Pdf.PDFError "width of field too narrow") + else if w > 8 then raise (Pdf.PDFError "width of field too broad") else + let formats = + [|format_of_string "%i"; + format_of_string "%i"; + format_of_string "%02i"; + format_of_string "%03i"; + format_of_string "%04i"; + format_of_string "%05i"; + format_of_string "%06i"; + format_of_string "%07i"; + format_of_string "%08i"|] + in + Printf.sprintf formats.(w) i + in + let rec procss prev = function + | [] -> rev prev + | '@'::'F'::t -> procss (rev (explode filename) @ prev) t + | '@'::'N'::t -> + let width, rest = find_ats 0 t in + procss (rev (explode (string_of_int_width width sequence)) @ prev) rest + | '@'::'S'::t -> + let width, rest = find_ats 0 t in + procss (rev (explode (string_of_int_width width startpage)) @ prev) rest + | '@'::'E'::t -> + let width, rest = find_ats 0 t in + procss (rev (explode (string_of_int_width width endpage)) @ prev) rest + | '@'::'B'::t -> procss (rev (explode (get_bookmark_name encoding pdf marks splitlevel startpage pdf)) @ prev) t + | h::t -> procss (h::prev) t + in + implode (procss [] (explode s)) + +let name_of_spec encoding marks (pdf : Pdf.t) splitlevel spec n filename startpage endpage = + let fill l n = + let chars = explode (string_of_int n) in + if length chars > l + then implode (drop chars (length chars - l)) + else implode ((many '0' (l - length chars)) @ chars) + in + let chars = explode spec in + let before, including = cleavewhile (neq '%') chars in + let percents, after = cleavewhile (eq '%') including in + if percents = [] + then + process_others encoding marks pdf splitlevel filename n startpage endpage spec + else + process_others encoding marks pdf splitlevel filename n startpage endpage + (implode before ^ fill (length percents) n ^ implode after) + +(* Extract Images. *) +let pnm_to_channel_24 channel w h s = + let white () = output_char channel ' ' + and newline () = output_char channel '\n' + and output_string = Pervasives.output_string channel in + output_string "P6"; + white (); + output_string (string_of_int w); + white (); + output_string (string_of_int h); + white (); + output_string "255"; + newline (); + let pos = ref 0 in + for y = 1 to h do + for x = 1 to w * 3 do + output_byte channel (bget s !pos); + incr pos + done + done + +let write_stream name stream = + let fh = open_out_bin name in + for x = 0 to bytes_size stream - 1 do + output_byte fh (bget stream x) + done; + close_out fh + +let write_image path_to_p2p path_to_im pdf resources name image = + match Pdfimage.get_image_24bpp pdf resources image with + | Pdfimage.JPEG (stream, _) -> write_stream (name ^ ".jpg") stream + | Pdfimage.JPEG2000 (stream, _) -> write_stream (name ^ ".jpx") stream + | Pdfimage.JBIG2 (stream, _) -> write_stream (name ^ ".jbig2") stream + | Pdfimage.Raw (w, h, Pdfimage.BPP24, stream) -> + let pnm = name ^ ".pnm" in + let png = name ^ ".png" in + let fh = open_out_bin pnm in + pnm_to_channel_24 fh w h stream; + close_out fh; + begin match path_to_p2p with + | "" -> + begin match path_to_im with + "" -> Printf.eprintf "Neither pnm2png nor imagemagick found. Specify with -p2p or -im\n%!" + | _ -> + begin match + Sys.command (Filename.quote_command path_to_im [pnm; png]) + with + 0 -> Sys.remove pnm + | _ -> + Printf.eprintf "Call to imagemagick failed: did you specify -p2p correctly?\n%!"; + Sys.remove pnm + end + end + | _ -> + begin match + Sys.command (Filename.quote_command path_to_p2p ~stdout:png ["-gamma"; "0.45"; "-quiet"; pnm]) + with + | 0 -> Sys.remove pnm + | _ -> + Printf.eprintf "Call to pnmtopng failed: did you specify -p2p correctly?\n%!"; + Sys.remove pnm + end + end + | _ -> + Printf.eprintf "Unsupported image type when extracting image %s %!" name + +let written = ref [] + +let extract_images_inner path_to_p2p path_to_im encoding serial pdf resources stem pnum images = + let names = map + (fun _ -> + name_of_spec + encoding [] pdf 0 (stem ^ "-p" ^ string_of_int pnum) + (let r = !serial in serial := !serial + 1; r) "" 0 0) (indx images) + in + iter2 (write_image path_to_p2p path_to_im pdf resources) names images + +let rec extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum form = + let resources = + match Pdf.lookup_direct pdf "/Resources" form with + Some (Pdf.Dictionary d) -> Pdf.Dictionary d + | _ -> Pdf.Dictionary [] + in + let images = + let xobjects = + match Pdf.lookup_direct pdf "/XObject" resources with + | Some (Pdf.Dictionary elts) -> map snd elts + | _ -> [] + in + (* Remove any already in !written. Add any remaining to !written, if !args.dedup or !args.dedup_page *) + let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in + let already_written, images = List.partition (function Pdf.Indirect n -> mem n !written | _ -> false) images in + if dedup || dedup_per_page then + written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written; + images + in + extract_images_inner path_to_p2p path_to_im encoding serial pdf resources stem pnum images + +let extract_images path_to_p2p path_to_im encoding dedup dedup_per_page pdf range stem = + if dedup || dedup_per_page then written := []; + let pdf_pages = Pdfpage.pages_of_pagetree pdf in + let pages = + option_map + (function (i, pdf_pages) -> if mem i range then Some pdf_pages else None) + (combine (indx pdf_pages) pdf_pages) + in + let serial = ref 0 in + iter2 + (fun page pnum -> + if dedup_per_page then written := []; + let xobjects = + match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with + | Some (Pdf.Dictionary elts) -> map snd elts + | _ -> [] + in + let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in + let already_written, images = List.partition (function Pdf.Indirect n -> mem n !written | _ -> false) images in + if dedup || dedup_per_page then + written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written; + let forms = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Form")) xobjects in + extract_images_inner path_to_p2p path_to_im encoding serial pdf page.Pdfpage.resources stem pnum images; + iter (extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum) forms) + pages + (indx pages) diff --git a/cpdf.mli b/cpdf.mli index 636d555..3be20fa 100644 --- a/cpdf.mli +++ b/cpdf.mli @@ -412,3 +412,10 @@ val bookmarks_open_to_level : int -> Pdf.t -> Pdf.t val create_pdf : int -> Pdfpaper.t -> Pdf.t +val name_of_spec : encoding -> + Pdfmarks.t list -> + Pdf.t -> int -> string -> int -> string -> int -> int -> string + +val extract_images : string -> + string -> + encoding -> bool -> bool -> Pdf.t -> int list -> string -> unit diff --git a/cpdfcommand.ml b/cpdfcommand.ml index 01c0982..b0d7393 100644 --- a/cpdfcommand.ml +++ b/cpdfcommand.ml @@ -3,7 +3,7 @@ let demo = false let noncomp = false let major_version = 2 let minor_version = 5 -let version_date = "(devel, 28th Sept 2021)" +let version_date = "(devel, 15th Nov 2021)" open Pdfutil open Pdfio @@ -2773,89 +2773,6 @@ let write_pdf ?(encryption = None) ?(is_decompress=false) mk_id pdf = end; flush stdout (*r For Windows *) -(* Remove characters which might not make good filenames. *) -let remove_unsafe_characters s = - if args.encoding = Cpdf.Raw then s else - let chars = - lose - (function x -> - match x with - '/' | '?' | '<' | '>' | '\\' | ':' | '*' | '|' | '\"' | '^' | '+' | '=' -> true - | x when int_of_char x < 32 || (int_of_char x > 126 && args.encoding <> Cpdf.Stripped) -> true - | _ -> false) - (explode s) - in - match chars with - | '.'::more -> implode more - | chars -> implode chars - -let get_bookmark_name pdf marks splitlevel n _ = - let refnums = Pdf.page_reference_numbers pdf in - let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in - match keep (function m -> n = Pdfpage.pagenumber_of_target ~fastrefnums pdf m.Pdfmarks.target && m.Pdfmarks.level <= splitlevel) marks with - | {Pdfmarks.text = title}::_ -> remove_unsafe_characters title - | _ -> "" - -(* @F means filename without extension *) -(* @N means sequence number with no padding *) -(* @S means start page of this section *) -(* @E means end page of this section *) -(* @B means bookmark name at start page *) -let process_others marks pdf splitlevel filename sequence startpage endpage s = - let rec find_ats p = function - '@'::r -> find_ats (p + 1) r - | r -> (p, r) - in - let string_of_int_width w i = - if w < 0 then raise (Pdf.PDFError "width of field too narrow") - else if w > 8 then raise (Pdf.PDFError "width of field too broad") else - let formats = - [|format_of_string "%i"; - format_of_string "%i"; - format_of_string "%02i"; - format_of_string "%03i"; - format_of_string "%04i"; - format_of_string "%05i"; - format_of_string "%06i"; - format_of_string "%07i"; - format_of_string "%08i"|] - in - Printf.sprintf formats.(w) i - in - let rec procss prev = function - | [] -> rev prev - | '@'::'F'::t -> procss (rev (explode filename) @ prev) t - | '@'::'N'::t -> - let width, rest = find_ats 0 t in - procss (rev (explode (string_of_int_width width sequence)) @ prev) rest - | '@'::'S'::t -> - let width, rest = find_ats 0 t in - procss (rev (explode (string_of_int_width width startpage)) @ prev) rest - | '@'::'E'::t -> - let width, rest = find_ats 0 t in - procss (rev (explode (string_of_int_width width endpage)) @ prev) rest - | '@'::'B'::t -> procss (rev (explode (get_bookmark_name pdf marks splitlevel startpage pdf)) @ prev) t - | h::t -> procss (h::prev) t - in - implode (procss [] (explode s)) - -let name_of_spec marks (pdf : Pdf.t) splitlevel spec n filename startpage endpage = - let fill l n = - let chars = explode (string_of_int n) in - if length chars > l - then implode (drop chars (length chars - l)) - else implode ((many '0' (l - length chars)) @ chars) - in - let chars = explode spec in - let before, including = cleavewhile (neq '%') chars in - let percents, after = cleavewhile (eq '%') including in - if percents = [] - then - process_others marks pdf splitlevel filename n startpage endpage spec - else - process_others marks pdf splitlevel filename n startpage endpage - (implode before ^ fill (length percents) n ^ implode after) - (* Find the stem of a filename *) let stem s = implode @@ -2872,8 +2789,8 @@ let fast_write_split_pdfs let pdf = Pdfpage.pdf_of_pages main_pdf pagenums in let startpage, endpage = extremes pagenums in let name = - name_of_spec - marks main_pdf splitlevel spec number + Cpdf.name_of_spec + args.encoding marks main_pdf splitlevel spec number (stem original_filename) startpage endpage in Pdf.remove_unreferenced pdf; @@ -2916,131 +2833,6 @@ let split_pdf enc 0 original_filename squeeze spec pdf (splitinto chunksize (indx pdf_pages)) pdf_pages -(* Extract Images. *) -let pnm_to_channel_24 channel w h s = - let white () = output_char channel ' ' - and newline () = output_char channel '\n' - and output_string = Pervasives.output_string channel in - output_string "P6"; - white (); - output_string (string_of_int w); - white (); - output_string (string_of_int h); - white (); - output_string "255"; - newline (); - let pos = ref 0 in - for y = 1 to h do - for x = 1 to w * 3 do - output_byte channel (bget s !pos); - incr pos - done - done - -let write_stream name stream = - let fh = open_out_bin name in - for x = 0 to bytes_size stream - 1 do - output_byte fh (bget stream x) - done; - close_out fh - -let write_image pdf resources name image = - match Pdfimage.get_image_24bpp pdf resources image with - | Pdfimage.JPEG (stream, _) -> write_stream (name ^ ".jpg") stream - | Pdfimage.JPEG2000 (stream, _) -> write_stream (name ^ ".jpx") stream - | Pdfimage.JBIG2 (stream, _) -> write_stream (name ^ ".jbig2") stream - | Pdfimage.Raw (w, h, Pdfimage.BPP24, stream) -> - let pnm = name ^ ".pnm" in - let png = name ^ ".png" in - let fh = open_out_bin pnm in - pnm_to_channel_24 fh w h stream; - close_out fh; - begin match args.path_to_p2p with - | "" -> - begin match args.path_to_im with - "" -> Printf.eprintf "Neither pnm2png nor imagemagick found. Specify with -p2p or -im\n%!" - | _ -> - begin match - Sys.command (Filename.quote_command args.path_to_im [pnm; png]) - with - 0 -> Sys.remove pnm - | _ -> - Printf.eprintf "Call to imagemagick failed: did you specify -p2p correctly?\n%!"; - Sys.remove pnm - end - end - | _ -> - begin match - Sys.command (Filename.quote_command args.path_to_p2p ~stdout:png ["-gamma"; "0.45"; "-quiet"; pnm]) - with - | 0 -> Sys.remove pnm - | _ -> - Printf.eprintf "Call to pnmtopng failed: did you specify -p2p correctly?\n%!"; - Sys.remove pnm - end - end - | _ -> - Printf.eprintf "Unsupported image type when extracting image %s %!" name - -let written = ref [] - -let extract_images_inner serial pdf resources stem pnum images = - let names = map - (fun _ -> - name_of_spec - [] pdf 0 (stem ^ "-p" ^ string_of_int pnum) - (let r = !serial in serial := !serial + 1; r) "" 0 0) (indx images) - in - iter2 (write_image pdf resources) names images - -let rec extract_images_form_xobject pdf serial stem pnum form = - let resources = - match Pdf.lookup_direct pdf "/Resources" form with - Some (Pdf.Dictionary d) -> Pdf.Dictionary d - | _ -> Pdf.Dictionary [] - in - let images = - let xobjects = - match Pdf.lookup_direct pdf "/XObject" resources with - | Some (Pdf.Dictionary elts) -> map snd elts - | _ -> [] - in - (* Remove any already in !written. Add any remaining to !written, if !args.dedup or !args.dedup_page *) - let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in - let already_written, images = List.partition (function Pdf.Indirect n -> mem n !written | _ -> false) images in - if args.dedup || args.dedup_per_page then - written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written; - images - in - extract_images_inner serial pdf resources stem pnum images - -let extract_images pdf range stem = - if args.dedup || args.dedup_per_page then written := []; - let pdf_pages = Pdfpage.pages_of_pagetree pdf in - let pages = - option_map - (function (i, pdf_pages) -> if mem i range then Some pdf_pages else None) - (combine (indx pdf_pages) pdf_pages) - in - let serial = ref 0 in - iter2 - (fun page pnum -> - if args.dedup_per_page then written := []; - let xobjects = - match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with - | Some (Pdf.Dictionary elts) -> map snd elts - | _ -> [] - in - let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in - let already_written, images = List.partition (function Pdf.Indirect n -> mem n !written | _ -> false) images in - if args.dedup || args.dedup_per_page then - written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written; - let forms = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Form")) xobjects in - extract_images_inner serial pdf page.Pdfpage.resources stem pnum images; - iter (extract_images_form_xobject pdf serial stem pnum) forms) - pages - (indx pages) - let getencryption pdf = match Pdfread.what_encryption pdf with | None | Some Pdfwrite.AlreadyEncrypted -> "Not encrypted" @@ -3888,7 +3680,7 @@ let go () = in let pdf = get_single_pdf args.op true in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in - extract_images pdf range output_spec + Cpdf.extract_images args.path_to_p2p args.path_to_im args.encoding args.dedup args.dedup_per_page pdf range output_spec | Some (ImageResolution f) -> let pdf = get_single_pdf args.op true in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in