From 4c886d2b13eef3bbfc94e5102459ff307f786e8b Mon Sep 17 00:00:00 2001 From: John Whitington Date: Tue, 21 Dec 2021 14:57:42 +0000 Subject: [PATCH] more --- Makefile | 2 +- cpdf.ml | 190 ---------------------------------------------- cpdf.mli | 8 -- cpdfbookmarks.ml | 67 ++++++++++++++++ cpdfbookmarks.mli | 4 + cpdfcommand.ml | 4 +- cpdfimage.ml | 127 +++++++++++++++++++++++++++++++ cpdfimage.mli | 3 + 8 files changed, 204 insertions(+), 201 deletions(-) create mode 100644 cpdfimage.ml create mode 100644 cpdfimage.mli diff --git a/Makefile b/Makefile index ab252fd..17394d2 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Build the cpdf command line tools and top level MODS = cpdfyojson cpdfxmlm \ cpdfunicodedata cpdferror cpdfdebug cpdfjson cpdfstrftime cpdfcoord cpdfattach \ - cpdfpagespec cpdfposition cpdfpresent cpdfmetadata cpdfbookmarks cpdfpage cpdfaddtext cpdf cpdffont cpdftype \ + cpdfpagespec cpdfposition cpdfpresent cpdfmetadata cpdfbookmarks cpdfpage cpdfaddtext cpdf cpdfimage cpdffont cpdftype \ cpdftexttopdf cpdftoc cpdfpad cpdfocg cpdfsqueeze cpdfspot cpdfpagelabels cpdfcreate cpdfannot cpdfcommand SOURCES = $(foreach x,$(MODS),$(x).ml $(x).mli) cpdfcommandrun.ml diff --git a/cpdf.ml b/cpdf.ml index f87d720..516d2f9 100644 --- a/cpdf.ml +++ b/cpdf.ml @@ -2007,194 +2007,4 @@ let create_pdf pages pagesize = let pdf, pageroot = Pdfpage.add_pagetree (many page pages) (Pdf.empty ()) in Pdfpage.add_root pageroot [] pdf -let get_bookmark_name encoding pdf marks splitlevel n _ = - let refnums = Pdf.page_reference_numbers pdf in - let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in - match keep (function m -> n = Pdfpage.pagenumber_of_target ~fastrefnums pdf m.Pdfmarks.target && m.Pdfmarks.level <= splitlevel) marks with - | {Pdfmarks.text = title}::_ -> Cpdfattach.remove_unsafe_characters encoding title - | _ -> "" -(* @F means filename without extension *) -(* @N means sequence number with no padding *) -(* @S means start page of this section *) -(* @E means end page of this section *) -(* @B means bookmark name at start page *) -let process_others encoding marks pdf splitlevel filename sequence startpage endpage s = - let rec find_ats p = function - '@'::r -> find_ats (p + 1) r - | r -> (p, r) - in - let string_of_int_width w i = - if w < 0 then raise (Pdf.PDFError "width of field too narrow") - else if w > 8 then raise (Pdf.PDFError "width of field too broad") else - let formats = - [|format_of_string "%i"; - format_of_string "%i"; - format_of_string "%02i"; - format_of_string "%03i"; - format_of_string "%04i"; - format_of_string "%05i"; - format_of_string "%06i"; - format_of_string "%07i"; - format_of_string "%08i"|] - in - Printf.sprintf formats.(w) i - in - let rec procss prev = function - | [] -> rev prev - | '@'::'F'::t -> procss (rev (explode filename) @ prev) t - | '@'::'N'::t -> - let width, rest = find_ats 0 t in - procss (rev (explode (string_of_int_width width sequence)) @ prev) rest - | '@'::'S'::t -> - let width, rest = find_ats 0 t in - procss (rev (explode (string_of_int_width width startpage)) @ prev) rest - | '@'::'E'::t -> - let width, rest = find_ats 0 t in - procss (rev (explode (string_of_int_width width endpage)) @ prev) rest - | '@'::'B'::t -> procss (rev (explode (get_bookmark_name encoding pdf marks splitlevel startpage pdf)) @ prev) t - | h::t -> procss (h::prev) t - in - implode (procss [] (explode s)) - -let name_of_spec encoding marks (pdf : Pdf.t) splitlevel spec n filename startpage endpage = - let fill l n = - let chars = explode (string_of_int n) in - if length chars > l - then implode (drop chars (length chars - l)) - else implode ((many '0' (l - length chars)) @ chars) - in - let chars = explode spec in - let before, including = cleavewhile (neq '%') chars in - let percents, after = cleavewhile (eq '%') including in - if percents = [] - then - process_others encoding marks pdf splitlevel filename n startpage endpage spec - else - process_others encoding marks pdf splitlevel filename n startpage endpage - (implode before ^ fill (length percents) n ^ implode after) - -(* Extract Images. *) -let pnm_to_channel_24 channel w h s = - let white () = output_char channel ' ' - and newline () = output_char channel '\n' - and output_string = Pervasives.output_string channel in - output_string "P6"; - white (); - output_string (string_of_int w); - white (); - output_string (string_of_int h); - white (); - output_string "255"; - newline (); - let pos = ref 0 in - for y = 1 to h do - for x = 1 to w * 3 do - output_byte channel (bget s !pos); - incr pos - done - done - -let write_stream name stream = - let fh = open_out_bin name in - for x = 0 to bytes_size stream - 1 do - output_byte fh (bget stream x) - done; - close_out fh - -let write_image path_to_p2p path_to_im pdf resources name image = - match Pdfimage.get_image_24bpp pdf resources image with - | Pdfimage.JPEG (stream, _) -> write_stream (name ^ ".jpg") stream - | Pdfimage.JPEG2000 (stream, _) -> write_stream (name ^ ".jpx") stream - | Pdfimage.JBIG2 (stream, _) -> write_stream (name ^ ".jbig2") stream - | Pdfimage.Raw (w, h, Pdfimage.BPP24, stream) -> - let pnm = name ^ ".pnm" in - let png = name ^ ".png" in - let fh = open_out_bin pnm in - pnm_to_channel_24 fh w h stream; - close_out fh; - begin match path_to_p2p with - | "" -> - begin match path_to_im with - "" -> Printf.eprintf "Neither pnm2png nor imagemagick found. Specify with -p2p or -im\n%!" - | _ -> - begin match - Sys.command (Filename.quote_command path_to_im [pnm; png]) - with - 0 -> Sys.remove pnm - | _ -> - Printf.eprintf "Call to imagemagick failed: did you specify -p2p correctly?\n%!"; - Sys.remove pnm - end - end - | _ -> - begin match - Sys.command (Filename.quote_command path_to_p2p ~stdout:png ["-gamma"; "0.45"; "-quiet"; pnm]) - with - | 0 -> Sys.remove pnm - | _ -> - Printf.eprintf "Call to pnmtopng failed: did you specify -p2p correctly?\n%!"; - Sys.remove pnm - end - end - | _ -> - Printf.eprintf "Unsupported image type when extracting image %s %!" name - -let written = ref [] - -let extract_images_inner path_to_p2p path_to_im encoding serial pdf resources stem pnum images = - let names = map - (fun _ -> - name_of_spec - encoding [] pdf 0 (stem ^ "-p" ^ string_of_int pnum) - (let r = !serial in serial := !serial + 1; r) "" 0 0) (indx images) - in - iter2 (write_image path_to_p2p path_to_im pdf resources) names images - -let rec extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum form = - let resources = - match Pdf.lookup_direct pdf "/Resources" form with - Some (Pdf.Dictionary d) -> Pdf.Dictionary d - | _ -> Pdf.Dictionary [] - in - let images = - let xobjects = - match Pdf.lookup_direct pdf "/XObject" resources with - | Some (Pdf.Dictionary elts) -> map snd elts - | _ -> [] - in - (* Remove any already in !written. Add any remaining to !written, if !args.dedup or !args.dedup_page *) - let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in - let already_written, images = List.partition (function Pdf.Indirect n -> mem n !written | _ -> false) images in - if dedup || dedup_per_page then - written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written; - images - in - extract_images_inner path_to_p2p path_to_im encoding serial pdf resources stem pnum images - -let extract_images path_to_p2p path_to_im encoding dedup dedup_per_page pdf range stem = - if dedup || dedup_per_page then written := []; - let pdf_pages = Pdfpage.pages_of_pagetree pdf in - let pages = - option_map - (function (i, pdf_pages) -> if mem i range then Some pdf_pages else None) - (combine (indx pdf_pages) pdf_pages) - in - let serial = ref 0 in - iter2 - (fun page pnum -> - if dedup_per_page then written := []; - let xobjects = - match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with - | Some (Pdf.Dictionary elts) -> map snd elts - | _ -> [] - in - let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in - let already_written, images = List.partition (function Pdf.Indirect n -> mem n !written | _ -> false) images in - if dedup || dedup_per_page then - written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written; - let forms = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Form")) xobjects in - extract_images_inner path_to_p2p path_to_im encoding serial pdf page.Pdfpage.resources stem pnum images; - iter (extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum) forms) - pages - (indx pages) diff --git a/cpdf.mli b/cpdf.mli index 9a810fb..6c0d03e 100644 --- a/cpdf.mli +++ b/cpdf.mli @@ -145,11 +145,3 @@ val remove_unused_resources : Pdf.t -> Pdf.t val bookmarks_open_to_level : int -> Pdf.t -> Pdf.t val create_pdf : int -> Pdfpaper.t -> Pdf.t - -val name_of_spec : Cpdfmetadata.encoding -> - Pdfmarks.t list -> - Pdf.t -> int -> string -> int -> string -> int -> int -> string - -val extract_images : string -> - string -> - Cpdfmetadata.encoding -> bool -> bool -> Pdf.t -> int list -> string -> unit diff --git a/cpdfbookmarks.ml b/cpdfbookmarks.ml index d10f5b9..dc7fd92 100644 --- a/cpdfbookmarks.ml +++ b/cpdfbookmarks.ml @@ -292,3 +292,70 @@ let split_on_bookmarks pdf level = in let pdf_pages = Pdfpage.pages_of_pagetree pdf in let ranges = splitat points (indx pdf_pages) in map (fun rs -> Pdfpage.pdf_of_pages pdf rs) ranges + +let get_bookmark_name encoding pdf marks splitlevel n _ = + let refnums = Pdf.page_reference_numbers pdf in + let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in + match keep (function m -> n = Pdfpage.pagenumber_of_target ~fastrefnums pdf m.Pdfmarks.target && m.Pdfmarks.level <= splitlevel) marks with + | {Pdfmarks.text = title}::_ -> Cpdfattach.remove_unsafe_characters encoding title + | _ -> "" + +(* @F means filename without extension *) +(* @N means sequence number with no padding *) +(* @S means start page of this section *) +(* @E means end page of this section *) +(* @B means bookmark name at start page *) +let process_others encoding marks pdf splitlevel filename sequence startpage endpage s = + let rec find_ats p = function + '@'::r -> find_ats (p + 1) r + | r -> (p, r) + in + let string_of_int_width w i = + if w < 0 then raise (Pdf.PDFError "width of field too narrow") + else if w > 8 then raise (Pdf.PDFError "width of field too broad") else + let formats = + [|format_of_string "%i"; + format_of_string "%i"; + format_of_string "%02i"; + format_of_string "%03i"; + format_of_string "%04i"; + format_of_string "%05i"; + format_of_string "%06i"; + format_of_string "%07i"; + format_of_string "%08i"|] + in + Printf.sprintf formats.(w) i + in + let rec procss prev = function + | [] -> rev prev + | '@'::'F'::t -> procss (rev (explode filename) @ prev) t + | '@'::'N'::t -> + let width, rest = find_ats 0 t in + procss (rev (explode (string_of_int_width width sequence)) @ prev) rest + | '@'::'S'::t -> + let width, rest = find_ats 0 t in + procss (rev (explode (string_of_int_width width startpage)) @ prev) rest + | '@'::'E'::t -> + let width, rest = find_ats 0 t in + procss (rev (explode (string_of_int_width width endpage)) @ prev) rest + | '@'::'B'::t -> procss (rev (explode (get_bookmark_name encoding pdf marks splitlevel startpage pdf)) @ prev) t + | h::t -> procss (h::prev) t + in + implode (procss [] (explode s)) + +let name_of_spec encoding marks (pdf : Pdf.t) splitlevel spec n filename startpage endpage = + let fill l n = + let chars = explode (string_of_int n) in + if length chars > l + then implode (drop chars (length chars - l)) + else implode ((many '0' (l - length chars)) @ chars) + in + let chars = explode spec in + let before, including = cleavewhile (neq '%') chars in + let percents, after = cleavewhile (eq '%') including in + if percents = [] + then + process_others encoding marks pdf splitlevel filename n startpage endpage spec + else + process_others encoding marks pdf splitlevel filename n startpage endpage + (implode before ^ fill (length percents) n ^ implode after) diff --git a/cpdfbookmarks.mli b/cpdfbookmarks.mli index 7cd3842..f4a8a35 100644 --- a/cpdfbookmarks.mli +++ b/cpdfbookmarks.mli @@ -12,3 +12,7 @@ val add_bookmarks : json:bool -> bool -> Pdfio.input -> Pdf.t -> Pdf.t (** [list_bookmarks encoding range pdf output] lists the bookmarks to the given output in the format specified in cpdfmanual.pdf *) val list_bookmarks : json:bool -> Cpdfmetadata.encoding -> int list -> Pdf.t -> Pdfio.output -> unit + +val name_of_spec : Cpdfmetadata.encoding -> + Pdfmarks.t list -> + Pdf.t -> int -> string -> int -> string -> int -> int -> string diff --git a/cpdfcommand.ml b/cpdfcommand.ml index 6d7b019..e7d11ab 100644 --- a/cpdfcommand.ml +++ b/cpdfcommand.ml @@ -2883,7 +2883,7 @@ let fast_write_split_pdfs let pdf = Pdfpage.pdf_of_pages main_pdf pagenums in let startpage, endpage = extremes pagenums in let name = - Cpdf.name_of_spec + Cpdfbookmarks.name_of_spec args.encoding marks main_pdf splitlevel spec number (stem original_filename) startpage endpage in @@ -3774,7 +3774,7 @@ let go () = in let pdf = get_single_pdf args.op true in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in - Cpdf.extract_images args.path_to_p2p args.path_to_im args.encoding args.dedup args.dedup_per_page pdf range output_spec + Cpdfimage.extract_images args.path_to_p2p args.path_to_im args.encoding args.dedup args.dedup_per_page pdf range output_spec | Some (ImageResolution f) -> let pdf = get_single_pdf args.op true in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in diff --git a/cpdfimage.ml b/cpdfimage.ml new file mode 100644 index 0000000..7dd09bb --- /dev/null +++ b/cpdfimage.ml @@ -0,0 +1,127 @@ +open Pdfutil +open Pdfio + +(* Extract Images. *) +let pnm_to_channel_24 channel w h s = + let white () = output_char channel ' ' + and newline () = output_char channel '\n' + and output_string = Pervasives.output_string channel in + output_string "P6"; + white (); + output_string (string_of_int w); + white (); + output_string (string_of_int h); + white (); + output_string "255"; + newline (); + let pos = ref 0 in + for y = 1 to h do + for x = 1 to w * 3 do + output_byte channel (bget s !pos); + incr pos + done + done + +let write_stream name stream = + let fh = open_out_bin name in + for x = 0 to bytes_size stream - 1 do + output_byte fh (bget stream x) + done; + close_out fh + +let write_image path_to_p2p path_to_im pdf resources name image = + match Pdfimage.get_image_24bpp pdf resources image with + | Pdfimage.JPEG (stream, _) -> write_stream (name ^ ".jpg") stream + | Pdfimage.JPEG2000 (stream, _) -> write_stream (name ^ ".jpx") stream + | Pdfimage.JBIG2 (stream, _) -> write_stream (name ^ ".jbig2") stream + | Pdfimage.Raw (w, h, Pdfimage.BPP24, stream) -> + let pnm = name ^ ".pnm" in + let png = name ^ ".png" in + let fh = open_out_bin pnm in + pnm_to_channel_24 fh w h stream; + close_out fh; + begin match path_to_p2p with + | "" -> + begin match path_to_im with + "" -> Printf.eprintf "Neither pnm2png nor imagemagick found. Specify with -p2p or -im\n%!" + | _ -> + begin match + Sys.command (Filename.quote_command path_to_im [pnm; png]) + with + 0 -> Sys.remove pnm + | _ -> + Printf.eprintf "Call to imagemagick failed: did you specify -p2p correctly?\n%!"; + Sys.remove pnm + end + end + | _ -> + begin match + Sys.command (Filename.quote_command path_to_p2p ~stdout:png ["-gamma"; "0.45"; "-quiet"; pnm]) + with + | 0 -> Sys.remove pnm + | _ -> + Printf.eprintf "Call to pnmtopng failed: did you specify -p2p correctly?\n%!"; + Sys.remove pnm + end + end + | _ -> + Printf.eprintf "Unsupported image type when extracting image %s %!" name + +let written = ref [] + +let extract_images_inner path_to_p2p path_to_im encoding serial pdf resources stem pnum images = + let names = map + (fun _ -> + Cpdfbookmarks.name_of_spec + encoding [] pdf 0 (stem ^ "-p" ^ string_of_int pnum) + (let r = !serial in serial := !serial + 1; r) "" 0 0) (indx images) + in + iter2 (write_image path_to_p2p path_to_im pdf resources) names images + +let rec extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum form = + let resources = + match Pdf.lookup_direct pdf "/Resources" form with + Some (Pdf.Dictionary d) -> Pdf.Dictionary d + | _ -> Pdf.Dictionary [] + in + let images = + let xobjects = + match Pdf.lookup_direct pdf "/XObject" resources with + | Some (Pdf.Dictionary elts) -> map snd elts + | _ -> [] + in + (* Remove any already in !written. Add any remaining to !written, if !args.dedup or !args.dedup_page *) + let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in + let already_written, images = List.partition (function Pdf.Indirect n -> mem n !written | _ -> false) images in + if dedup || dedup_per_page then + written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written; + images + in + extract_images_inner path_to_p2p path_to_im encoding serial pdf resources stem pnum images + +let extract_images path_to_p2p path_to_im encoding dedup dedup_per_page pdf range stem = + if dedup || dedup_per_page then written := []; + let pdf_pages = Pdfpage.pages_of_pagetree pdf in + let pages = + option_map + (function (i, pdf_pages) -> if mem i range then Some pdf_pages else None) + (combine (indx pdf_pages) pdf_pages) + in + let serial = ref 0 in + iter2 + (fun page pnum -> + if dedup_per_page then written := []; + let xobjects = + match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with + | Some (Pdf.Dictionary elts) -> map snd elts + | _ -> [] + in + let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in + let already_written, images = List.partition (function Pdf.Indirect n -> mem n !written | _ -> false) images in + if dedup || dedup_per_page then + written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written; + let forms = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Form")) xobjects in + extract_images_inner path_to_p2p path_to_im encoding serial pdf page.Pdfpage.resources stem pnum images; + iter (extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum) forms) + pages + (indx pages) diff --git a/cpdfimage.mli b/cpdfimage.mli new file mode 100644 index 0000000..9e39778 --- /dev/null +++ b/cpdfimage.mli @@ -0,0 +1,3 @@ +val extract_images : string -> + string -> + Cpdfmetadata.encoding -> bool -> bool -> Pdf.t -> int list -> string -> unit