From 4c886d2b13eef3bbfc94e5102459ff307f786e8b Mon Sep 17 00:00:00 2001
From: John Whitington <john@coherentgraphics.co.uk>
Date: Tue, 21 Dec 2021 14:57:42 +0000
Subject: [PATCH] More module splitting: move name_of_spec to Cpdfbookmarks and image extraction to new Cpdfimage

---
 Makefile          |   2 +-
 cpdf.ml           | 190 ----------------------------------------------
 cpdf.mli          |   8 --
 cpdfbookmarks.ml  |  67 ++++++++++++++++
 cpdfbookmarks.mli |   4 +
 cpdfcommand.ml    |   4 +-
 cpdfimage.ml      | 127 +++++++++++++++++++++++++++++++
 cpdfimage.mli     |   3 +
 8 files changed, 204 insertions(+), 201 deletions(-)
 create mode 100644 cpdfimage.ml
 create mode 100644 cpdfimage.mli

diff --git a/Makefile b/Makefile
index ab252fd..17394d2 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 # Build the cpdf command line tools and top level
 MODS = cpdfyojson cpdfxmlm \
        cpdfunicodedata cpdferror cpdfdebug cpdfjson cpdfstrftime cpdfcoord cpdfattach \
-       cpdfpagespec cpdfposition cpdfpresent cpdfmetadata cpdfbookmarks cpdfpage cpdfaddtext cpdf cpdffont cpdftype \
+       cpdfpagespec cpdfposition cpdfpresent cpdfmetadata cpdfbookmarks cpdfpage cpdfaddtext cpdf cpdfimage cpdffont cpdftype \
        cpdftexttopdf cpdftoc cpdfpad cpdfocg cpdfsqueeze cpdfspot cpdfpagelabels cpdfcreate cpdfannot cpdfcommand
 
 SOURCES = $(foreach x,$(MODS),$(x).ml $(x).mli) cpdfcommandrun.ml
diff --git a/cpdf.ml b/cpdf.ml
index f87d720..516d2f9 100644
--- a/cpdf.ml
+++ b/cpdf.ml
@@ -2007,194 +2007,4 @@ let create_pdf pages pagesize =
     let pdf, pageroot = Pdfpage.add_pagetree (many page pages) (Pdf.empty ()) in
       Pdfpage.add_root pageroot [] pdf
 
-let get_bookmark_name encoding pdf marks splitlevel n _ =
-  let refnums = Pdf.page_reference_numbers pdf in
-  let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
-  match keep (function m -> n = Pdfpage.pagenumber_of_target ~fastrefnums pdf m.Pdfmarks.target && m.Pdfmarks.level <= splitlevel) marks with
-  | {Pdfmarks.text = title}::_ -> Cpdfattach.remove_unsafe_characters encoding title
-  | _ -> ""
 
-(* @F means filename without extension *)
-(* @N means sequence number with no padding *)
-(* @S means start page of this section *)
-(* @E means end page of this section *)
-(* @B means bookmark name at start page *)
-let process_others encoding marks pdf splitlevel filename sequence startpage endpage s =
-  let rec find_ats p = function
-    '@'::r -> find_ats (p + 1) r
-  | r -> (p, r)
-  in
-  let string_of_int_width w i =
-    if w < 0 then raise (Pdf.PDFError "width of field too narrow")
-    else if w > 8 then raise (Pdf.PDFError "width of field too broad") else
-      let formats =
-        [|format_of_string "%i";
-          format_of_string "%i";
-          format_of_string "%02i";
-          format_of_string "%03i";
-          format_of_string "%04i";
-          format_of_string "%05i";
-          format_of_string "%06i";
-          format_of_string "%07i";
-          format_of_string "%08i"|]
-      in
-        Printf.sprintf formats.(w) i
-  in
-    let rec procss prev = function
-      | [] -> rev prev
-      | '@'::'F'::t -> procss (rev (explode filename) @ prev) t
-      | '@'::'N'::t ->
-          let width, rest = find_ats 0 t in
-            procss (rev (explode (string_of_int_width width sequence)) @ prev) rest
-      | '@'::'S'::t ->
-          let width, rest = find_ats 0 t in
-            procss (rev (explode (string_of_int_width width startpage)) @ prev) rest
-      | '@'::'E'::t ->
-          let width, rest = find_ats 0 t in
-            procss (rev (explode (string_of_int_width width endpage)) @ prev) rest
-      | '@'::'B'::t -> procss (rev (explode (get_bookmark_name encoding pdf marks splitlevel startpage pdf)) @ prev) t
-      | h::t -> procss (h::prev) t
-    in
-       implode (procss [] (explode s))
-
-let name_of_spec encoding marks (pdf : Pdf.t) splitlevel spec n filename startpage endpage =
-  let fill l n =
-    let chars = explode (string_of_int n) in
-      if length chars > l
-        then implode (drop chars (length chars - l))
-        else implode ((many '0' (l - length chars)) @ chars)
-  in
-    let chars = explode spec in
-      let before, including = cleavewhile (neq '%') chars in
-        let percents, after = cleavewhile (eq '%') including in
-          if percents = []
-            then
-              process_others encoding marks pdf splitlevel filename n startpage endpage spec
-            else
-              process_others encoding marks pdf splitlevel filename n startpage endpage
-              (implode before ^ fill (length percents) n ^ implode after)
-
-(* Extract Images. *)
-let pnm_to_channel_24 channel w h s =
-  let white () = output_char channel ' ' 
-  and newline () = output_char channel '\n'
-  and output_string = Pervasives.output_string channel in
-    output_string "P6";
-    white ();
-    output_string (string_of_int w);
-    white ();
-    output_string (string_of_int h);
-    white ();
-    output_string "255";
-    newline ();
-    let pos = ref 0 in
-      for y = 1 to h do
-        for x = 1 to w * 3 do
-          output_byte channel (bget s !pos);
-          incr pos
-        done
-      done
-
-let write_stream name stream =
-  let fh = open_out_bin name in
-    for x = 0 to bytes_size stream - 1 do
-      output_byte fh (bget stream x)
-    done;
-    close_out fh
-
-let write_image path_to_p2p path_to_im pdf resources name image =
-  match Pdfimage.get_image_24bpp pdf resources image with
-  | Pdfimage.JPEG (stream, _) -> write_stream (name ^ ".jpg") stream
-  | Pdfimage.JPEG2000 (stream, _) -> write_stream (name ^ ".jpx") stream
-  | Pdfimage.JBIG2 (stream, _) -> write_stream (name ^ ".jbig2") stream
-  | Pdfimage.Raw (w, h, Pdfimage.BPP24, stream) ->
-      let pnm = name ^ ".pnm" in
-      let png = name ^ ".png" in
-      let fh = open_out_bin pnm in
-        pnm_to_channel_24 fh w h stream;
-        close_out fh;
-        begin match path_to_p2p with
-        | "" ->
-          begin match path_to_im with
-            "" -> Printf.eprintf "Neither pnm2png nor imagemagick found. Specify with -p2p or -im\n%!"
-          | _ ->
-            begin match
-              Sys.command (Filename.quote_command path_to_im [pnm; png])
-            with
-              0 -> Sys.remove pnm
-            | _ -> 
-              Printf.eprintf "Call to imagemagick failed: did you specify -p2p correctly?\n%!";
-              Sys.remove pnm
-            end
-          end
-        | _ ->
-          begin match
-            Sys.command (Filename.quote_command path_to_p2p ~stdout:png ["-gamma"; "0.45"; "-quiet"; pnm])
-          with
-          | 0 -> Sys.remove pnm
-          | _ ->
-              Printf.eprintf "Call to pnmtopng failed: did you specify -p2p correctly?\n%!";
-              Sys.remove pnm
-          end
-        end
-  | _ ->
-      Printf.eprintf "Unsupported image type when extracting image %s %!" name
-
-let written = ref []
-
-let extract_images_inner path_to_p2p path_to_im encoding serial pdf resources stem pnum images =
-  let names = map
-    (fun _ ->
-       name_of_spec
-         encoding [] pdf 0 (stem ^ "-p" ^ string_of_int pnum)
-         (let r = !serial in serial := !serial + 1; r) "" 0 0) (indx images)
-  in
-    iter2 (write_image path_to_p2p path_to_im pdf resources) names images
-
-let rec extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum form =
-  let resources =
-    match Pdf.lookup_direct pdf "/Resources" form with
-      Some (Pdf.Dictionary d) -> Pdf.Dictionary d
-    | _ -> Pdf.Dictionary []
-  in
-    let images =
-      let xobjects =
-        match Pdf.lookup_direct pdf "/XObject" resources with
-        | Some (Pdf.Dictionary elts) -> map snd elts
-        | _ -> []
-      in
-        (* Remove any already in !written. Add any remaining to !written, if !args.dedup or !args.dedup_page *)
-        let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in
-        let already_written, images = List.partition (function Pdf.Indirect n -> mem n !written | _ -> false) images in
-          if dedup || dedup_per_page then
-            written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written;
-          images
-    in
-      extract_images_inner path_to_p2p path_to_im encoding serial pdf resources stem pnum images
-
-let extract_images path_to_p2p path_to_im encoding dedup dedup_per_page pdf range stem =
-  if dedup || dedup_per_page then written := [];
-  let pdf_pages = Pdfpage.pages_of_pagetree pdf in
-    let pages =
-      option_map
-        (function (i, pdf_pages) -> if mem i range then Some pdf_pages else None)
-        (combine (indx pdf_pages) pdf_pages)
-    in
-      let serial = ref 0 in
-        iter2
-          (fun page pnum ->
-             if dedup_per_page then written := [];
-             let xobjects =
-               match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
-               | Some (Pdf.Dictionary elts) -> map snd elts
-               | _ -> []
-             in
-               let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in
-               let already_written, images = List.partition (function Pdf.Indirect n -> mem n !written | _ -> false) images in
-               if dedup || dedup_per_page then
-                 written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written;
-               let forms = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Form")) xobjects in
-                 extract_images_inner path_to_p2p path_to_im encoding serial pdf page.Pdfpage.resources stem pnum images;
-                 iter (extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum) forms)
-          pages
-          (indx pages)
diff --git a/cpdf.mli b/cpdf.mli
index 9a810fb..6c0d03e 100644
--- a/cpdf.mli
+++ b/cpdf.mli
@@ -145,11 +145,3 @@ val remove_unused_resources : Pdf.t -> Pdf.t
 val bookmarks_open_to_level : int -> Pdf.t -> Pdf.t
 
 val create_pdf : int -> Pdfpaper.t -> Pdf.t
-
-val name_of_spec : Cpdfmetadata.encoding ->
-           Pdfmarks.t list ->
-           Pdf.t -> int -> string -> int -> string -> int -> int -> string
-
-val extract_images : string ->
-           string ->
-           Cpdfmetadata.encoding -> bool -> bool -> Pdf.t -> int list -> string -> unit
diff --git a/cpdfbookmarks.ml b/cpdfbookmarks.ml
index d10f5b9..dc7fd92 100644
--- a/cpdfbookmarks.ml
+++ b/cpdfbookmarks.ml
@@ -292,3 +292,70 @@ let split_on_bookmarks pdf level =
   in let pdf_pages = Pdfpage.pages_of_pagetree pdf in
     let ranges = splitat points (indx pdf_pages) in
       map (fun rs -> Pdfpage.pdf_of_pages pdf rs) ranges
+
+(* First bookmark at page [n] with level <= [splitlevel]: its title after Cpdfattach.remove_unsafe_characters, or "". *)
+let get_bookmark_name encoding pdf marks splitlevel n _ =
+  let page_refs = Pdf.page_reference_numbers pdf in
+  let fastrefnums = hashtable_of_dictionary (combine page_refs (indx page_refs)) in
+  let wanted m = n = Pdfpage.pagenumber_of_target ~fastrefnums pdf m.Pdfmarks.target && m.Pdfmarks.level <= splitlevel in
+  match keep wanted marks with [] -> "" | m::_ -> Cpdfattach.remove_unsafe_characters encoding m.Pdfmarks.text
+
+(* Expand the @-codes in [s]. @F means [filename] (filename without extension). *)
+(* @N means sequence number, @S start page and @E end page of this section; the *)
+(* count k of @ characters directly after the code pads with zeros to width k *)
+(* (none for k < 2, Pdf.PDFError for k > 8). NOTE(review): this also consumes the @ *)
+(* of a code placed directly after, e.g. in "@N@S". @B means bookmark name at start page. *)
+let process_others encoding marks pdf splitlevel filename sequence startpage endpage s =
+  let rec find_ats p = function (* count and strip the @ characters at the head of a list *)
+    '@'::r -> find_ats (p + 1) r
+  | r -> (p, r)
+  in
+  let string_of_int_width w i =
+    if w < 0 then raise (Pdf.PDFError "width of field too narrow")
+    else if w > 8 then raise (Pdf.PDFError "width of field too broad") else
+      let formats =
+        [|format_of_string "%i";
+          format_of_string "%i";
+          format_of_string "%02i";
+          format_of_string "%03i";
+          format_of_string "%04i";
+          format_of_string "%05i";
+          format_of_string "%06i";
+          format_of_string "%07i";
+          format_of_string "%08i"|]
+      in
+        Printf.sprintf formats.(w) i
+  in
+    let rec procss prev = function (* prev holds the output so far, reversed *)
+      | [] -> rev prev
+      | '@'::'F'::t -> procss (rev (explode filename) @ prev) t
+      | '@'::'N'::t ->
+          let width, rest = find_ats 0 t in
+            procss (rev (explode (string_of_int_width width sequence)) @ prev) rest
+      | '@'::'S'::t ->
+          let width, rest = find_ats 0 t in
+            procss (rev (explode (string_of_int_width width startpage)) @ prev) rest
+      | '@'::'E'::t ->
+          let width, rest = find_ats 0 t in
+            procss (rev (explode (string_of_int_width width endpage)) @ prev) rest
+      | '@'::'B'::t -> procss (rev (explode (get_bookmark_name encoding pdf marks splitlevel startpage pdf)) @ prev) t
+      | h::t -> procss (h::prev) t
+    in
+       implode (procss [] (explode s))
+
+(* Build an output name from [spec]: the first run of % characters becomes [n], *)
+(* zero-padded (or cut down to its last digits) to the length of that run; the *)
+(* result then has its @-codes expanded by [process_others]. *)
+let name_of_spec encoding marks (pdf : Pdf.t) splitlevel spec n filename startpage endpage =
+  let expand = process_others encoding marks pdf splitlevel filename n startpage endpage in
+  let fixed_width width number =
+    let digits = explode (string_of_int number) in
+    let surplus = length digits - width in
+      implode (if surplus > 0 then drop digits surplus else many '0' (0 - surplus) @ digits)
+  in
+  let prefix, from_percent = cleavewhile (neq '%') (explode spec) in
+  let run, suffix = cleavewhile (eq '%') from_percent in
+    match run with
+    | [] -> expand spec
+    | _ ->
+        expand (implode prefix ^ fixed_width (length run) n ^ implode suffix)
diff --git a/cpdfbookmarks.mli b/cpdfbookmarks.mli
index 7cd3842..f4a8a35 100644
--- a/cpdfbookmarks.mli
+++ b/cpdfbookmarks.mli
@@ -12,3 +12,7 @@ val add_bookmarks : json:bool -> bool -> Pdfio.input -> Pdf.t -> Pdf.t
 (** [list_bookmarks encoding range pdf output] lists the bookmarks to the given
 output in the format specified in cpdfmanual.pdf *)
 val list_bookmarks : json:bool -> Cpdfmetadata.encoding -> int list -> Pdf.t -> Pdfio.output -> unit
+
+val name_of_spec : Cpdfmetadata.encoding ->
+           Pdfmarks.t list ->
+           Pdf.t -> int -> string -> int -> string -> int -> int -> string (** [name_of_spec encoding marks pdf splitlevel spec n filename startpage endpage] builds a name from [spec], substituting [n] for its run of % characters and expanding its codes for [filename], [n], the page numbers and the bookmark name. *)
diff --git a/cpdfcommand.ml b/cpdfcommand.ml
index 6d7b019..e7d11ab 100644
--- a/cpdfcommand.ml
+++ b/cpdfcommand.ml
@@ -2883,7 +2883,7 @@ let fast_write_split_pdfs
          let pdf = Pdfpage.pdf_of_pages main_pdf pagenums in
            let startpage, endpage = extremes pagenums in
              let name =
-               Cpdf.name_of_spec
+               Cpdfbookmarks.name_of_spec
                  args.encoding marks main_pdf splitlevel spec number
                  (stem original_filename) startpage endpage
              in
@@ -3774,7 +3774,7 @@ let go () =
       in
         let pdf = get_single_pdf args.op true in
           let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
-            Cpdf.extract_images args.path_to_p2p args.path_to_im args.encoding args.dedup args.dedup_per_page pdf range output_spec
+            Cpdfimage.extract_images args.path_to_p2p args.path_to_im args.encoding args.dedup args.dedup_per_page pdf range output_spec
   | Some (ImageResolution f) ->
       let pdf = get_single_pdf args.op true in
         let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
diff --git a/cpdfimage.ml b/cpdfimage.ml
new file mode 100644
index 0000000..7dd09bb
--- /dev/null
+++ b/cpdfimage.ml
@@ -0,0 +1,127 @@
+open Pdfutil
+open Pdfio
+
+(* Extract Images. *)
+
+(* Write a binary PPM to [channel]: the header "P6 w h 255\n" followed by the *)
+(* first [w] * 3 * [h] bytes of [s] (three bytes per pixel), read with [bget]. *)
+let pnm_to_channel_24 channel w h s =
+  let white () = output_char channel ' '
+  and newline () = output_char channel '\n'
+  (* Stdlib, not the deprecated Pervasives alias (removed in OCaml 5.0). *)
+  and output_string = Stdlib.output_string channel in
+    output_string "P6"; white ();
+    output_string (string_of_int w); white ();
+    output_string (string_of_int h); white ();
+    output_string "255"; newline ();
+    let pos = ref 0 in
+      for _row = 1 to h do
+        for _sample = 1 to w * 3 do
+          output_byte channel (bget s !pos);
+          incr pos
+        done
+      done
+
+(* Write every byte of [stream] to a new binary file [name]; the file is closed even if a write raises. *)
+let write_stream name stream =
+  let fh = open_out_bin name in
+  Fun.protect ~finally:(fun () -> close_out_noerr fh) (fun () ->
+    for i = 0 to bytes_size stream - 1 do output_byte fh (bget stream i) done;
+    close_out fh)
+
+(* Write [image] to a file called [name] plus an extension. JPEG, JPEG2000 and JBIG2 *)
+(* streams are written byte for byte. A raw 24bpp image is written to [name].pnm and *)
+(* converted to [name].png by the program at [path_to_p2p] or, when that is "", by the *)
+(* one at [path_to_im]; the .pnm is removed afterwards. If both are "", the .pnm stays. *)
+let write_image path_to_p2p path_to_im pdf resources name image =
+  match Pdfimage.get_image_24bpp pdf resources image with
+  | Pdfimage.JPEG (stream, _) -> write_stream (name ^ ".jpg") stream
+  | Pdfimage.JPEG2000 (stream, _) -> write_stream (name ^ ".jpx") stream
+  | Pdfimage.JBIG2 (stream, _) -> write_stream (name ^ ".jbig2") stream
+  | Pdfimage.Raw (w, h, Pdfimage.BPP24, stream) ->
+      let pnm = name ^ ".pnm" in
+      let png = name ^ ".png" in
+      let fh = open_out_bin pnm in
+        pnm_to_channel_24 fh w h stream;
+        close_out fh;
+        (* Run the converter, then delete the intermediate .pnm whether or not it worked. *)
+        let convert command failure =
+          if Sys.command command <> 0 then Printf.eprintf "%s%!" failure;
+          Sys.remove pnm
+        in
+        begin match path_to_p2p, path_to_im with
+        | "", "" ->
+            Printf.eprintf "Neither pnm2png nor imagemagick found. Specify with -p2p or -im\n%!"
+        | "", _ ->
+            (* The imagemagick path is the one given with -im, so that is the option *)
+            (* the failure message has to point at (it used to say -p2p). *)
+            convert
+              (Filename.quote_command path_to_im [pnm; png])
+              "Call to imagemagick failed: did you specify -im correctly?\n"
+        | _, _ ->
+            convert
+              (Filename.quote_command path_to_p2p ~stdout:png ["-gamma"; "0.45"; "-quiet"; pnm])
+              "Call to pnmtopng failed: did you specify -p2p correctly?\n"
+        end
+  | _ ->
+      (* Any other result of get_image_24bpp is only reported, nothing is written. *)
+      Printf.eprintf "Unsupported image type when extracting image %s %!" name
+
+let written = ref [] (* indirect object numbers of images already extracted; consulted to skip duplicates *)
+
+(* Name each of [images] from [stem], the page number [pnum] and the next value of *)
+(* the shared [serial] counter, then write them all out with [write_image]. *)
+let extract_images_inner path_to_p2p path_to_im encoding serial pdf resources stem pnum images =
+  let spec = stem ^ "-p" ^ string_of_int pnum in
+  let fresh_name _ =
+    let this = !serial in
+      incr serial; Cpdfbookmarks.name_of_spec encoding [] pdf 0 spec this "" 0 0
+  in iter2 (write_image path_to_p2p path_to_im pdf resources) (map fresh_name (indx images)) images
+
+(* Extract the images listed in the /XObject dictionary of the /Resources of the form *)
+(* xobject [form]. Images whose object number is already in [written] are skipped; *)
+(* with [dedup] or [dedup_per_page] the ones extracted now are recorded there. *)
+let extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum form =
+  let resources =
+    match Pdf.lookup_direct pdf "/Resources" form with
+    | Some (Pdf.Dictionary _ as dict) -> dict
+    | _ -> Pdf.Dictionary []
+  in
+  let xobjects =
+    match Pdf.lookup_direct pdf "/XObject" resources with
+    | Some (Pdf.Dictionary entries) -> map snd entries
+    | _ -> []
+  in
+  let is_image o = Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image") in
+  let seen = function Pdf.Indirect n -> mem n !written | _ -> false in
+  let _, fresh = List.partition seen (keep is_image xobjects) in
+    if dedup || dedup_per_page then
+      written := option_map (function Pdf.Indirect n -> Some n | _ -> None) fresh @ !written;
+    extract_images_inner path_to_p2p path_to_im encoding serial pdf resources stem pnum fresh
+
+(* Extract the images of the pages of [pdf] whose numbers are in [range] to files named *)
+(* from [stem]; the page part of a name counts selected pages only (see [indx pages]). *)
+let extract_images path_to_p2p path_to_im encoding dedup dedup_per_page pdf range stem =
+  (* Reset unconditionally: numbers left by an earlier deduplicating call would make *)
+  (* a later call without dedup silently skip those images. *)
+  written := [];
+  let pdf_pages = Pdfpage.pages_of_pagetree pdf in
+  let numbered = combine (indx pdf_pages) pdf_pages in
+  let pages = option_map (fun (i, page) -> if mem i range then Some page else None) numbered in
+  let serial = ref 0 in
+  let subtype_is name o = Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name name) in
+    iter2
+      (fun page pnum ->
+         if dedup_per_page then written := [];
+         let xobjects =
+           match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
+           | Some (Pdf.Dictionary elts) -> map snd elts
+           | _ -> []
+         in
+         let seen = function Pdf.Indirect n -> mem n !written | _ -> false in
+         let _, images = List.partition seen (keep (subtype_is "/Image") xobjects) in
+         if dedup || dedup_per_page then
+           written := option_map (function Pdf.Indirect n -> Some n | _ -> None) images @ !written;
+         extract_images_inner path_to_p2p path_to_im encoding serial pdf page.Pdfpage.resources stem pnum images;
+         iter (extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum) (keep (subtype_is "/Form") xobjects))
+      pages (indx pages)
diff --git a/cpdfimage.mli b/cpdfimage.mli
new file mode 100644
index 0000000..9e39778
--- /dev/null
+++ b/cpdfimage.mli
@@ -0,0 +1,3 @@
+val extract_images : string ->
+           string ->
+           Cpdfmetadata.encoding -> bool -> bool -> Pdf.t -> int list -> string -> unit (** [extract_images path_to_p2p path_to_im encoding dedup dedup_per_page pdf range stem] writes the images of the pages in [range] to files named from [stem]. *)