more

2025-06-05 22:09:39 +02:00 · 2021-11-15 09:30:11 -08:00
parent 4e12e01848
commit b908e5f57d
3 changed files with 219 additions and 212 deletions
--- a/cpdf.ml
+++ b/cpdf.ml
@@ -4649,3 +4649,211 @@ let create_pdf pages pagesize =
  in
    let pdf, pageroot = Pdfpage.add_pagetree (many page pages) (Pdf.empty ()) in
      Pdfpage.add_root pageroot [] pdf
+
+(* Remove characters which might not make good filenames. *)
+let remove_unsafe_characters encoding s =
+  if encoding = Raw then s else
+    let chars =
+      lose
+        (function x ->
+           match x with
+           '/' | '?' | '<' | '>' | '\\' | ':' | '*' | '|' | '\"' | '^' | '+' | '=' -> true
+           | x when int_of_char x < 32 || (int_of_char x > 126 && encoding <> Stripped) -> true
+           | _ -> false)
+        (explode s)
+    in
+      match chars with
+      | '.'::more -> implode more
+      | chars -> implode chars
+
+let get_bookmark_name encoding pdf marks splitlevel n _ =
+  let refnums = Pdf.page_reference_numbers pdf in
+  let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
+  match keep (function m -> n = Pdfpage.pagenumber_of_target ~fastrefnums pdf m.Pdfmarks.target && m.Pdfmarks.level <= splitlevel) marks with
+  | {Pdfmarks.text = title}::_ -> remove_unsafe_characters encoding title
+  | _ -> ""
+
+(* @F means filename without extension *)
+(* @N means sequence number with no padding *)
+(* @S means start page of this section *)
+(* @E means end page of this section *)
+(* @B means bookmark name at start page *)
+let process_others encoding marks pdf splitlevel filename sequence startpage endpage s =
+  let rec find_ats p = function
+    '@'::r -> find_ats (p + 1) r
+  | r -> (p, r)
+  in
+  let string_of_int_width w i =
+    if w < 0 then raise (Pdf.PDFError "width of field too narrow")
+    else if w > 8 then raise (Pdf.PDFError "width of field too broad") else
+      let formats =
+        [|format_of_string "%i";
+          format_of_string "%i";
+          format_of_string "%02i";
+          format_of_string "%03i";
+          format_of_string "%04i";
+          format_of_string "%05i";
+          format_of_string "%06i";
+          format_of_string "%07i";
+          format_of_string "%08i"|]
+      in
+        Printf.sprintf formats.(w) i
+  in
+    let rec procss prev = function
+      | [] -> rev prev
+      | '@'::'F'::t -> procss (rev (explode filename) @ prev) t
+      | '@'::'N'::t ->
+          let width, rest = find_ats 0 t in
+            procss (rev (explode (string_of_int_width width sequence)) @ prev) rest
+      | '@'::'S'::t ->
+          let width, rest = find_ats 0 t in
+            procss (rev (explode (string_of_int_width width startpage)) @ prev) rest
+      | '@'::'E'::t ->
+          let width, rest = find_ats 0 t in
+            procss (rev (explode (string_of_int_width width endpage)) @ prev) rest
+      | '@'::'B'::t -> procss (rev (explode (get_bookmark_name encoding pdf marks splitlevel startpage pdf)) @ prev) t
+      | h::t -> procss (h::prev) t
+    in
+       implode (procss [] (explode s))
+
+let name_of_spec encoding marks (pdf : Pdf.t) splitlevel spec n filename startpage endpage =
+  let fill l n =
+    let chars = explode (string_of_int n) in
+      if length chars > l
+        then implode (drop chars (length chars - l))
+        else implode ((many '0' (l - length chars)) @ chars)
+  in
+    let chars = explode spec in
+      let before, including = cleavewhile (neq '%') chars in
+        let percents, after = cleavewhile (eq '%') including in
+          if percents = []
+            then
+              process_others encoding marks pdf splitlevel filename n startpage endpage spec
+            else
+              process_others encoding marks pdf splitlevel filename n startpage endpage
+              (implode before ^ fill (length percents) n ^ implode after)
+
+(* Extract Images. *)
+let pnm_to_channel_24 channel w h s =
+  let white () = output_char channel ' ' 
+  and newline () = output_char channel '\n'
+  and output_string = Pervasives.output_string channel in
+    output_string "P6";
+    white ();
+    output_string (string_of_int w);
+    white ();
+    output_string (string_of_int h);
+    white ();
+    output_string "255";
+    newline ();
+    let pos = ref 0 in
+      for y = 1 to h do
+        for x = 1 to w * 3 do
+          output_byte channel (bget s !pos);
+          incr pos
+        done
+      done
+
+let write_stream name stream =
+  let fh = open_out_bin name in
+    for x = 0 to bytes_size stream - 1 do
+      output_byte fh (bget stream x)
+    done;
+    close_out fh
+
+let write_image path_to_p2p path_to_im pdf resources name image =
+  match Pdfimage.get_image_24bpp pdf resources image with
+  | Pdfimage.JPEG (stream, _) -> write_stream (name ^ ".jpg") stream
+  | Pdfimage.JPEG2000 (stream, _) -> write_stream (name ^ ".jpx") stream
+  | Pdfimage.JBIG2 (stream, _) -> write_stream (name ^ ".jbig2") stream
+  | Pdfimage.Raw (w, h, Pdfimage.BPP24, stream) ->
+      let pnm = name ^ ".pnm" in
+      let png = name ^ ".png" in
+      let fh = open_out_bin pnm in
+        pnm_to_channel_24 fh w h stream;
+        close_out fh;
+        begin match path_to_p2p with
+        | "" ->
+          begin match path_to_im with
+            "" -> Printf.eprintf "Neither pnm2png nor imagemagick found. Specify with -p2p or -im\n%!"
+          | _ ->
+            begin match
+              Sys.command (Filename.quote_command path_to_im [pnm; png])
+            with
+              0 -> Sys.remove pnm
+            | _ -> 
+              Printf.eprintf "Call to imagemagick failed: did you specify -p2p correctly?\n%!";
+              Sys.remove pnm
+            end
+          end
+        | _ ->
+          begin match
+            Sys.command (Filename.quote_command path_to_p2p ~stdout:png ["-gamma"; "0.45"; "-quiet"; pnm])
+          with
+          | 0 -> Sys.remove pnm
+          | _ ->
+              Printf.eprintf "Call to pnmtopng failed: did you specify -p2p correctly?\n%!";
+              Sys.remove pnm
+          end
+        end
+  | _ ->
+      Printf.eprintf "Unsupported image type when extracting image %s %!" name
+
+let written = ref []
+
+let extract_images_inner path_to_p2p path_to_im encoding serial pdf resources stem pnum images =
+  let names = map
+    (fun _ ->
+       name_of_spec
+         encoding [] pdf 0 (stem ^ "-p" ^ string_of_int pnum)
+         (let r = !serial in serial := !serial + 1; r) "" 0 0) (indx images)
+  in
+    iter2 (write_image path_to_p2p path_to_im pdf resources) names images
+
+let rec extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum form =
+  let resources =
+    match Pdf.lookup_direct pdf "/Resources" form with
+      Some (Pdf.Dictionary d) -> Pdf.Dictionary d
+    | _ -> Pdf.Dictionary []
+  in
+    let images =
+      let xobjects =
+        match Pdf.lookup_direct pdf "/XObject" resources with
+        | Some (Pdf.Dictionary elts) -> map snd elts
+        | _ -> []
+      in
+        (* Remove any already in !written. Add any remaining to !written, if !args.dedup or !args.dedup_page *)
+        let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in
+        let already_written, images = List.partition (function Pdf.Indirect n -> mem n !written | _ -> false) images in
+          if dedup || dedup_per_page then
+            written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written;
+          images
+    in
+      extract_images_inner path_to_p2p path_to_im encoding serial pdf resources stem pnum images
+
+let extract_images path_to_p2p path_to_im encoding dedup dedup_per_page pdf range stem =
+  if dedup || dedup_per_page then written := [];
+  let pdf_pages = Pdfpage.pages_of_pagetree pdf in
+    let pages =
+      option_map
+        (function (i, pdf_pages) -> if mem i range then Some pdf_pages else None)
+        (combine (indx pdf_pages) pdf_pages)
+    in
+      let serial = ref 0 in
+        iter2
+          (fun page pnum ->
+             if dedup_per_page then written := [];
+             let xobjects =
+               match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
+               | Some (Pdf.Dictionary elts) -> map snd elts
+               | _ -> []
+             in
+               let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in
+               let already_written, images = List.partition (function Pdf.Indirect n -> mem n !written | _ -> false) images in
+               if dedup || dedup_per_page then
+                 written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written;
+               let forms = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Form")) xobjects in
+                 extract_images_inner path_to_p2p path_to_im encoding serial pdf page.Pdfpage.resources stem pnum images;
+                 iter (extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum) forms)
+          pages
+          (indx pages)
--- a/cpdf.mli
+++ b/cpdf.mli
@@ -412,3 +412,10 @@ val bookmarks_open_to_level : int -> Pdf.t -> Pdf.t

 val create_pdf : int -> Pdfpaper.t -> Pdf.t

+val name_of_spec : encoding ->
+           Pdfmarks.t list ->
+           Pdf.t -> int -> string -> int -> string -> int -> int -> string
+
+val extract_images : string ->
+           string ->
+           encoding -> bool -> bool -> Pdf.t -> int list -> string -> unit
--- a/cpdfcommand.ml
+++ b/cpdfcommand.ml
@@ -3,7 +3,7 @@ let demo = false
 let noncomp = false
 let major_version = 2
 let minor_version = 5
-let version_date = "(devel, 28th Sept 2021)"
+let version_date = "(devel, 15th Nov 2021)"

 open Pdfutil
 open Pdfio
@@ -2773,89 +2773,6 @@ let write_pdf ?(encryption = None) ?(is_decompress=false) mk_id pdf =
                end;
                flush stdout (*r For Windows *)

-(* Remove characters which might not make good filenames. *)
-let remove_unsafe_characters s =
-  if args.encoding = Cpdf.Raw then s else
-    let chars =
-      lose
-        (function x ->
-           match x with
-           '/' | '?' | '<' | '>' | '\\' | ':' | '*' | '|' | '\"' | '^' | '+' | '=' -> true
-           | x when int_of_char x < 32 || (int_of_char x > 126 && args.encoding <> Cpdf.Stripped) -> true
-           | _ -> false)
-        (explode s)
-    in
-      match chars with
-      | '.'::more -> implode more
-      | chars -> implode chars
-
-let get_bookmark_name pdf marks splitlevel n _ =
-  let refnums = Pdf.page_reference_numbers pdf in
-  let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
-  match keep (function m -> n = Pdfpage.pagenumber_of_target ~fastrefnums pdf m.Pdfmarks.target && m.Pdfmarks.level <= splitlevel) marks with
-  | {Pdfmarks.text = title}::_ -> remove_unsafe_characters title
-  | _ -> ""
-
-(* @F means filename without extension *)
-(* @N means sequence number with no padding *)
-(* @S means start page of this section *)
-(* @E means end page of this section *)
-(* @B means bookmark name at start page *)
-let process_others marks pdf splitlevel filename sequence startpage endpage s =
-  let rec find_ats p = function
-    '@'::r -> find_ats (p + 1) r
-  | r -> (p, r)
-  in
-  let string_of_int_width w i =
-    if w < 0 then raise (Pdf.PDFError "width of field too narrow")
-    else if w > 8 then raise (Pdf.PDFError "width of field too broad") else
-      let formats =
-        [|format_of_string "%i";
-          format_of_string "%i";
-          format_of_string "%02i";
-          format_of_string "%03i";
-          format_of_string "%04i";
-          format_of_string "%05i";
-          format_of_string "%06i";
-          format_of_string "%07i";
-          format_of_string "%08i"|]
-      in
-        Printf.sprintf formats.(w) i
-  in
-    let rec procss prev = function
-      | [] -> rev prev
-      | '@'::'F'::t -> procss (rev (explode filename) @ prev) t
-      | '@'::'N'::t ->
-          let width, rest = find_ats 0 t in
-            procss (rev (explode (string_of_int_width width sequence)) @ prev) rest
-      | '@'::'S'::t ->
-          let width, rest = find_ats 0 t in
-            procss (rev (explode (string_of_int_width width startpage)) @ prev) rest
-      | '@'::'E'::t ->
-          let width, rest = find_ats 0 t in
-            procss (rev (explode (string_of_int_width width endpage)) @ prev) rest
-      | '@'::'B'::t -> procss (rev (explode (get_bookmark_name pdf marks splitlevel startpage pdf)) @ prev) t
-      | h::t -> procss (h::prev) t
-    in
-       implode (procss [] (explode s))
-
-let name_of_spec marks (pdf : Pdf.t) splitlevel spec n filename startpage endpage =
-  let fill l n =
-    let chars = explode (string_of_int n) in
-      if length chars > l
-        then implode (drop chars (length chars - l))
-        else implode ((many '0' (l - length chars)) @ chars)
-  in
-    let chars = explode spec in
-      let before, including = cleavewhile (neq '%') chars in
-        let percents, after = cleavewhile (eq '%') including in
-          if percents = []
-            then
-              process_others marks pdf splitlevel filename n startpage endpage spec
-            else
-              process_others marks pdf splitlevel filename n startpage endpage
-              (implode before ^ fill (length percents) n ^ implode after)
-
 (* Find the stem of a filename *)
 let stem s =
  implode
@@ -2872,8 +2789,8 @@ let fast_write_split_pdfs
         let pdf = Pdfpage.pdf_of_pages main_pdf pagenums in
           let startpage, endpage = extremes pagenums in
             let name =
-               name_of_spec
-                 marks main_pdf splitlevel spec number
+               Cpdf.name_of_spec
+                 args.encoding marks main_pdf splitlevel spec number
                 (stem original_filename) startpage endpage
             in
               Pdf.remove_unreferenced pdf;
@@ -2916,131 +2833,6 @@ let split_pdf
      enc 0 original_filename squeeze spec pdf
      (splitinto chunksize (indx pdf_pages)) pdf_pages

-(* Extract Images. *)
-let pnm_to_channel_24 channel w h s =
-  let white () = output_char channel ' ' 
-  and newline () = output_char channel '\n'
-  and output_string = Pervasives.output_string channel in
-    output_string "P6";
-    white ();
-    output_string (string_of_int w);
-    white ();
-    output_string (string_of_int h);
-    white ();
-    output_string "255";
-    newline ();
-    let pos = ref 0 in
-      for y = 1 to h do
-        for x = 1 to w * 3 do
-          output_byte channel (bget s !pos);
-          incr pos
-        done
-      done
-
-let write_stream name stream =
-  let fh = open_out_bin name in
-    for x = 0 to bytes_size stream - 1 do
-      output_byte fh (bget stream x)
-    done;
-    close_out fh
-
-let write_image pdf resources name image =
-  match Pdfimage.get_image_24bpp pdf resources image with
-  | Pdfimage.JPEG (stream, _) -> write_stream (name ^ ".jpg") stream
-  | Pdfimage.JPEG2000 (stream, _) -> write_stream (name ^ ".jpx") stream
-  | Pdfimage.JBIG2 (stream, _) -> write_stream (name ^ ".jbig2") stream
-  | Pdfimage.Raw (w, h, Pdfimage.BPP24, stream) ->
-      let pnm = name ^ ".pnm" in
-      let png = name ^ ".png" in
-      let fh = open_out_bin pnm in
-        pnm_to_channel_24 fh w h stream;
-        close_out fh;
-        begin match args.path_to_p2p with
-        | "" ->
-          begin match args.path_to_im with
-            "" -> Printf.eprintf "Neither pnm2png nor imagemagick found. Specify with -p2p or -im\n%!"
-          | _ ->
-            begin match
-              Sys.command (Filename.quote_command args.path_to_im [pnm; png])
-            with
-              0 -> Sys.remove pnm
-            | _ -> 
-              Printf.eprintf "Call to imagemagick failed: did you specify -p2p correctly?\n%!";
-              Sys.remove pnm
-            end
-          end
-        | _ ->
-          begin match
-            Sys.command (Filename.quote_command args.path_to_p2p ~stdout:png ["-gamma"; "0.45"; "-quiet"; pnm])
-          with
-          | 0 -> Sys.remove pnm
-          | _ ->
-              Printf.eprintf "Call to pnmtopng failed: did you specify -p2p correctly?\n%!";
-              Sys.remove pnm
-          end
-        end
-  | _ ->
-      Printf.eprintf "Unsupported image type when extracting image %s %!" name
-
-let written = ref []
-
-let extract_images_inner serial pdf resources stem pnum images =
-  let names = map
-    (fun _ ->
-       name_of_spec
-         [] pdf 0 (stem ^ "-p" ^ string_of_int pnum)
-         (let r = !serial in serial := !serial + 1; r) "" 0 0) (indx images)
-  in
-    iter2 (write_image pdf resources) names images
-
-let rec extract_images_form_xobject pdf serial stem pnum form =
-  let resources =
-    match Pdf.lookup_direct pdf "/Resources" form with
-      Some (Pdf.Dictionary d) -> Pdf.Dictionary d
-    | _ -> Pdf.Dictionary []
-  in
-    let images =
-      let xobjects =
-        match Pdf.lookup_direct pdf "/XObject" resources with
-        | Some (Pdf.Dictionary elts) -> map snd elts
-        | _ -> []
-      in
-        (* Remove any already in !written. Add any remaining to !written, if !args.dedup or !args.dedup_page *)
-        let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in
-        let already_written, images = List.partition (function Pdf.Indirect n -> mem n !written | _ -> false) images in
-          if args.dedup || args.dedup_per_page then
-            written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written;
-          images
-    in
-      extract_images_inner serial pdf resources stem pnum images
-
-let extract_images pdf range stem =
-  if args.dedup || args.dedup_per_page then written := [];
-  let pdf_pages = Pdfpage.pages_of_pagetree pdf in
-    let pages =
-      option_map
-        (function (i, pdf_pages) -> if mem i range then Some pdf_pages else None)
-        (combine (indx pdf_pages) pdf_pages)
-    in
-      let serial = ref 0 in
-        iter2
-          (fun page pnum ->
-             if args.dedup_per_page then written := [];
-             let xobjects =
-               match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
-               | Some (Pdf.Dictionary elts) -> map snd elts
-               | _ -> []
-             in
-               let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in
-               let already_written, images = List.partition (function Pdf.Indirect n -> mem n !written | _ -> false) images in
-               if args.dedup || args.dedup_per_page then
-                 written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written;
-               let forms = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Form")) xobjects in
-                 extract_images_inner serial pdf page.Pdfpage.resources stem pnum images;
-                 iter (extract_images_form_xobject pdf serial stem pnum) forms)
-          pages
-          (indx pages)
-
 let getencryption pdf =
  match Pdfread.what_encryption pdf with
  | None | Some Pdfwrite.AlreadyEncrypted -> "Not encrypted"
@@ -3888,7 +3680,7 @@ let go () =
      in
        let pdf = get_single_pdf args.op true in
          let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
-            extract_images pdf range output_spec
+            Cpdf.extract_images args.path_to_p2p args.path_to_im args.encoding args.dedup args.dedup_per_page pdf range output_spec
  | Some (ImageResolution f) ->
      let pdf = get_single_pdf args.op true in
        let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in