From c822e100db72cb3fa8cfa57a2f0ba536d712b2db Mon Sep 17 00:00:00 2001 From: John Whitington Date: Sun, 19 Dec 2021 13:38:27 +0000 Subject: [PATCH] more --- cpdf.ml | 98 ++------------------------------------------------ cpdf.mli | 2 +- cpdfattach.ml | 78 ++++++++++++++++++++++++++++++++++++++++ cpdfattach.mli | 4 +++ cpdfcommand.ml | 4 +-- 5 files changed, 87 insertions(+), 99 deletions(-) diff --git a/cpdf.ml b/cpdf.ml index 4a2345c..962e3be 100644 --- a/cpdf.ml +++ b/cpdf.ml @@ -408,27 +408,11 @@ let list_bookmarks ~json encoding range pdf output = (* \section{Split at bookmarks} *) -(* Returns empty string on failure. Should only be used in conjunction with -split at bookmarks code, so should never fail, by definiton. *) -let remove_unsafe_characters s = - let chars = - lose - (function x -> - match x with - '/' | '?' | '<' | '>' | '\\' | ':' | '*' | '|' | '\"' | '^' | '+' | '=' -> true - | x when int_of_char x < 32 || int_of_char x > 126 -> true - | _ -> false) - (explode s) - in - match chars with - | '.'::more -> implode more - | chars -> implode chars - let get_bookmark_name pdf marks splitlevel n _ = let refnums = Pdf.page_reference_numbers pdf in let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in match keep (function m -> n = Pdfpage.pagenumber_of_target ~fastrefnums pdf m.Pdfmarks.target && m.Pdfmarks.level <= splitlevel) marks with - | {Pdfmarks.text = title}::_ -> remove_unsafe_characters title + | {Pdfmarks.text = title}::_ -> Cpdfattach.remove_unsafe_characters Cpdfmetadata.UTF8 title | _ -> "" (* Find the stem of a filename *) @@ -3266,68 +3250,6 @@ let copy_box f t mediabox_if_missing pdf range = pdf range -let dump_attachment out pdf (_, embeddedfile) = - match Pdf.lookup_direct pdf "/F" embeddedfile with - | Some (Pdf.String s) -> - let efdata = - begin match Pdf.lookup_direct pdf "/EF" embeddedfile with - | Some d -> - let stream = - match Pdf.lookup_direct pdf "/F" d with - | Some s -> s - | None -> error "Bad embedded file stream" - in - Pdfcodec.decode_pdfstream_until_unknown pdf stream; - begin match stream with Pdf.Stream {contents = (_, Pdf.Got b)} -> b | _ -> error "Bad embedded file stream" end - | _ -> error "Bad embedded file stream" - end - in - let s = remove_unsafe_characters s in - let filename = if out = "" then s else out ^ Filename.dir_sep ^ s in - begin try - let fh = open_out_bin filename in - for x = 0 to bytes_size efdata - 1 do output_byte fh (bget efdata x) done; - close_out fh - with - e -> Printf.eprintf "Failed to write attachment to %s\n%!" filename; - end - | _ -> () - -let dump_attached_document pdf out = - let root = Pdf.lookup_obj pdf pdf.Pdf.root in - let names = - match Pdf.lookup_direct pdf "/Names" root with Some n -> n | _ -> Pdf.Dictionary [] - in - match Pdf.lookup_direct pdf "/EmbeddedFiles" names with - | Some x -> - iter (dump_attachment out pdf) (Pdf.contents_of_nametree pdf x) - | None -> () - -let dump_attached_page pdf out page = - let annots = - match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with - | Some (Pdf.Array l) -> l - | _ -> [] - in - let efannots = - keep - (fun annot -> - match Pdf.lookup_direct pdf "/Subtype" annot with - | Some (Pdf.Name "/FileAttachment") -> true - | _ -> false) - annots - in - let fsannots = option_map (Pdf.lookup_direct pdf "/FS") efannots in - iter (dump_attachment out pdf) (map (fun x -> 0, x) fsannots) - -(* Dump both document-level and page-level attached files to file, using their file names *) -let dump_attached_files pdf out = - try - dump_attached_document pdf out; - iter (dump_attached_page pdf out) (Pdfpage.pages_of_pagetree pdf) - with - e -> error (Printf.sprintf "Couldn't dump attached files: %s\n" (Printexc.to_string e)) - let remove_unused_resources_page pdf n page = let xobjects, all_names = match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with @@ -3388,27 +3310,11 @@ let create_pdf pages pagesize = let pdf, pageroot = Pdfpage.add_pagetree (many page pages) (Pdf.empty ()) in Pdfpage.add_root pageroot [] pdf -(* Remove characters which might not make good filenames. *) -let remove_unsafe_characters encoding s = - if encoding = Cpdfmetadata.Raw then s else - let chars = - lose - (function x -> - match x with - '/' | '?' | '<' | '>' | '\\' | ':' | '*' | '|' | '\"' | '^' | '+' | '=' -> true - | x when int_of_char x < 32 || (int_of_char x > 126 && encoding <> Cpdfmetadata.Stripped) -> true - | _ -> false) - (explode s) - in - match chars with - | '.'::more -> implode more - | chars -> implode chars - let get_bookmark_name encoding pdf marks splitlevel n _ = let refnums = Pdf.page_reference_numbers pdf in let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in match keep (function m -> n = Pdfpage.pagenumber_of_target ~fastrefnums pdf m.Pdfmarks.target && m.Pdfmarks.level <= splitlevel) marks with - | {Pdfmarks.text = title}::_ -> remove_unsafe_characters encoding title + | {Pdfmarks.text = title}::_ -> Cpdfattach.remove_unsafe_characters encoding title | _ -> "" (* @F means filename without extension *) diff --git a/cpdf.mli b/cpdf.mli index e65f73a..624f9a6 100644 --- a/cpdf.mli +++ b/cpdf.mli @@ -293,7 +293,7 @@ val image_resolution : Pdf.t -> int list -> float -> (int * string * int * int * val copy_box : string -> string -> bool -> Pdf.t -> int list -> Pdf.t -val dump_attached_files : Pdf.t -> string -> unit + val add_bookmark_title : string -> bool -> Pdf.t -> Pdf.t diff --git a/cpdfattach.ml b/cpdfattach.ml index f9c790c..ce83799 100644 --- a/cpdfattach.ml +++ b/cpdfattach.ml @@ -2,6 +2,22 @@ open Pdfutil open Pdfio open Cpdferror +(* Remove characters which might not make good filenames. *) +let remove_unsafe_characters encoding s = + if encoding = Cpdfmetadata.Raw then s else + let chars = + lose + (function x -> + match x with + '/' | '?' | '<' | '>' | '\\' | ':' | '*' | '|' | '\"' | '^' | '+' | '=' -> true + | x when int_of_char x < 32 || (int_of_char x > 126 && encoding <> Cpdfmetadata.Stripped) -> true + | _ -> false) + (explode s) + in + match chars with + | '.'::more -> implode more + | chars -> implode chars + (* Attaching files *) let attach_file ?memory keepversion topage pdf file = let data = @@ -224,3 +240,65 @@ let remove_attached_files pdf = Pdf.trailerdict = Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect rootdict'num)} +let dump_attachment out pdf (_, embeddedfile) = + match Pdf.lookup_direct pdf "/F" embeddedfile with + | Some (Pdf.String s) -> + let efdata = + begin match Pdf.lookup_direct pdf "/EF" embeddedfile with + | Some d -> + let stream = + match Pdf.lookup_direct pdf "/F" d with + | Some s -> s + | None -> error "Bad embedded file stream" + in + Pdfcodec.decode_pdfstream_until_unknown pdf stream; + begin match stream with Pdf.Stream {contents = (_, Pdf.Got b)} -> b | _ -> error "Bad embedded file stream" end + | _ -> error "Bad embedded file stream" + end + in + let s = remove_unsafe_characters Cpdfmetadata.UTF8 s in + let filename = if out = "" then s else out ^ Filename.dir_sep ^ s in + begin try + let fh = open_out_bin filename in + for x = 0 to bytes_size efdata - 1 do output_byte fh (bget efdata x) done; + close_out fh + with + e -> Printf.eprintf "Failed to write attachment to %s\n%!" filename; + end + | _ -> () + +let dump_attached_document pdf out = + let root = Pdf.lookup_obj pdf pdf.Pdf.root in + let names = + match Pdf.lookup_direct pdf "/Names" root with Some n -> n | _ -> Pdf.Dictionary [] + in + match Pdf.lookup_direct pdf "/EmbeddedFiles" names with + | Some x -> + iter (dump_attachment out pdf) (Pdf.contents_of_nametree pdf x) + | None -> () + +let dump_attached_page pdf out page = + let annots = + match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with + | Some (Pdf.Array l) -> l + | _ -> [] + in + let efannots = + keep + (fun annot -> + match Pdf.lookup_direct pdf "/Subtype" annot with + | Some (Pdf.Name "/FileAttachment") -> true + | _ -> false) + annots + in + let fsannots = option_map (Pdf.lookup_direct pdf "/FS") efannots in + iter (dump_attachment out pdf) (map (fun x -> 0, x) fsannots) + +(* Dump both document-level and page-level attached files to file, using their file names *) +let dump_attached_files pdf out = + try + dump_attached_document pdf out; + iter (dump_attached_page pdf out) (Pdfpage.pages_of_pagetree pdf) + with + e -> error (Printf.sprintf "Couldn't dump attached files: %s\n" (Printexc.to_string e)) + diff --git a/cpdfattach.mli b/cpdfattach.mli index c9ae94e..58b6c7b 100644 --- a/cpdfattach.mli +++ b/cpdfattach.mli @@ -1,4 +1,6 @@ (** {2 File Attachments} *) +val remove_unsafe_characters : Cpdfmetadata.encoding -> string -> string + (** [attach_file keepversion topage pdf filename] attaches the file in [filename] to the pdf, optionally to a page (rather than document-level). If keepversion is true, the PDF version number won't be altered. *) val attach_file : ?memory:Pdfio.bytes -> bool -> int option -> Pdf.t -> string -> Pdf.t @@ -12,3 +14,5 @@ type attachment = (** List attached files. Attachment name and page number. Page 0 is document level. *) val list_attached_files : Pdf.t -> attachment list + +val dump_attached_files : Pdf.t -> string -> unit diff --git a/cpdfcommand.ml b/cpdfcommand.ml index e1ec797..80432be 100644 --- a/cpdfcommand.ml +++ b/cpdfcommand.ml @@ -3588,8 +3588,8 @@ let go () = | Some DumpAttachedFiles -> let pdf = get_single_pdf args.op false in begin match args.out with - | NoOutputSpecified -> Cpdf.dump_attached_files pdf "" - | File n -> Cpdf.dump_attached_files pdf n + | NoOutputSpecified -> Cpdfattach.dump_attached_files pdf "" + | File n -> Cpdfattach.dump_attached_files pdf n | Stdout -> error "Can't dump attachments to stdout" end | Some RemoveAttachedFiles ->