This commit is contained in:
John Whitington 2021-12-19 13:38:27 +00:00
parent 0e9f35ab69
commit c822e100db
5 changed files with 87 additions and 99 deletions

98
cpdf.ml
View File

@ -408,27 +408,11 @@ let list_bookmarks ~json encoding range pdf output =
(* \section{Split at bookmarks} *)
(* Returns empty string on failure. Should only be used in conjunction with
split at bookmarks code, so should never fail, by definiton. *)
let remove_unsafe_characters s =
let chars =
lose
(function x ->
match x with
'/' | '?' | '<' | '>' | '\\' | ':' | '*' | '|' | '\"' | '^' | '+' | '=' -> true
| x when int_of_char x < 32 || int_of_char x > 126 -> true
| _ -> false)
(explode s)
in
match chars with
| '.'::more -> implode more
| chars -> implode chars
let get_bookmark_name pdf marks splitlevel n _ =
let refnums = Pdf.page_reference_numbers pdf in
let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
match keep (function m -> n = Pdfpage.pagenumber_of_target ~fastrefnums pdf m.Pdfmarks.target && m.Pdfmarks.level <= splitlevel) marks with
| {Pdfmarks.text = title}::_ -> remove_unsafe_characters title
| {Pdfmarks.text = title}::_ -> Cpdfattach.remove_unsafe_characters Cpdfmetadata.UTF8 title
| _ -> ""
(* Find the stem of a filename *)
@ -3266,68 +3250,6 @@ let copy_box f t mediabox_if_missing pdf range =
pdf
range
let dump_attachment out pdf (_, embeddedfile) =
match Pdf.lookup_direct pdf "/F" embeddedfile with
| Some (Pdf.String s) ->
let efdata =
begin match Pdf.lookup_direct pdf "/EF" embeddedfile with
| Some d ->
let stream =
match Pdf.lookup_direct pdf "/F" d with
| Some s -> s
| None -> error "Bad embedded file stream"
in
Pdfcodec.decode_pdfstream_until_unknown pdf stream;
begin match stream with Pdf.Stream {contents = (_, Pdf.Got b)} -> b | _ -> error "Bad embedded file stream" end
| _ -> error "Bad embedded file stream"
end
in
let s = remove_unsafe_characters s in
let filename = if out = "" then s else out ^ Filename.dir_sep ^ s in
begin try
let fh = open_out_bin filename in
for x = 0 to bytes_size efdata - 1 do output_byte fh (bget efdata x) done;
close_out fh
with
e -> Printf.eprintf "Failed to write attachment to %s\n%!" filename;
end
| _ -> ()
let dump_attached_document pdf out =
let root = Pdf.lookup_obj pdf pdf.Pdf.root in
let names =
match Pdf.lookup_direct pdf "/Names" root with Some n -> n | _ -> Pdf.Dictionary []
in
match Pdf.lookup_direct pdf "/EmbeddedFiles" names with
| Some x ->
iter (dump_attachment out pdf) (Pdf.contents_of_nametree pdf x)
| None -> ()
let dump_attached_page pdf out page =
let annots =
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
| Some (Pdf.Array l) -> l
| _ -> []
in
let efannots =
keep
(fun annot ->
match Pdf.lookup_direct pdf "/Subtype" annot with
| Some (Pdf.Name "/FileAttachment") -> true
| _ -> false)
annots
in
let fsannots = option_map (Pdf.lookup_direct pdf "/FS") efannots in
iter (dump_attachment out pdf) (map (fun x -> 0, x) fsannots)
(* Dump both document-level and page-level attached files to file, using their file names *)
let dump_attached_files pdf out =
try
dump_attached_document pdf out;
iter (dump_attached_page pdf out) (Pdfpage.pages_of_pagetree pdf)
with
e -> error (Printf.sprintf "Couldn't dump attached files: %s\n" (Printexc.to_string e))
let remove_unused_resources_page pdf n page =
let xobjects, all_names =
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
@ -3388,27 +3310,11 @@ let create_pdf pages pagesize =
let pdf, pageroot = Pdfpage.add_pagetree (many page pages) (Pdf.empty ()) in
Pdfpage.add_root pageroot [] pdf
(* Remove characters which might not make good filenames. *)
let remove_unsafe_characters encoding s =
if encoding = Cpdfmetadata.Raw then s else
let chars =
lose
(function x ->
match x with
'/' | '?' | '<' | '>' | '\\' | ':' | '*' | '|' | '\"' | '^' | '+' | '=' -> true
| x when int_of_char x < 32 || (int_of_char x > 126 && encoding <> Cpdfmetadata.Stripped) -> true
| _ -> false)
(explode s)
in
match chars with
| '.'::more -> implode more
| chars -> implode chars
let get_bookmark_name encoding pdf marks splitlevel n _ =
let refnums = Pdf.page_reference_numbers pdf in
let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
match keep (function m -> n = Pdfpage.pagenumber_of_target ~fastrefnums pdf m.Pdfmarks.target && m.Pdfmarks.level <= splitlevel) marks with
| {Pdfmarks.text = title}::_ -> remove_unsafe_characters encoding title
| {Pdfmarks.text = title}::_ -> Cpdfattach.remove_unsafe_characters encoding title
| _ -> ""
(* @F means filename without extension *)

View File

@ -293,7 +293,7 @@ val image_resolution : Pdf.t -> int list -> float -> (int * string * int * int *
val copy_box : string -> string -> bool -> Pdf.t -> int list -> Pdf.t
val dump_attached_files : Pdf.t -> string -> unit
val add_bookmark_title : string -> bool -> Pdf.t -> Pdf.t

View File

@ -2,6 +2,22 @@ open Pdfutil
open Pdfio
open Cpdferror
(* Remove characters which might not make good filenames. *)
let remove_unsafe_characters encoding s =
if encoding = Cpdfmetadata.Raw then s else
let chars =
lose
(function x ->
match x with
'/' | '?' | '<' | '>' | '\\' | ':' | '*' | '|' | '\"' | '^' | '+' | '=' -> true
| x when int_of_char x < 32 || (int_of_char x > 126 && encoding <> Cpdfmetadata.Stripped) -> true
| _ -> false)
(explode s)
in
match chars with
| '.'::more -> implode more
| chars -> implode chars
(* Attaching files *)
let attach_file ?memory keepversion topage pdf file =
let data =
@ -224,3 +240,65 @@ let remove_attached_files pdf =
Pdf.trailerdict =
Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect rootdict'num)}
let dump_attachment out pdf (_, embeddedfile) =
match Pdf.lookup_direct pdf "/F" embeddedfile with
| Some (Pdf.String s) ->
let efdata =
begin match Pdf.lookup_direct pdf "/EF" embeddedfile with
| Some d ->
let stream =
match Pdf.lookup_direct pdf "/F" d with
| Some s -> s
| None -> error "Bad embedded file stream"
in
Pdfcodec.decode_pdfstream_until_unknown pdf stream;
begin match stream with Pdf.Stream {contents = (_, Pdf.Got b)} -> b | _ -> error "Bad embedded file stream" end
| _ -> error "Bad embedded file stream"
end
in
let s = remove_unsafe_characters Cpdfmetadata.UTF8 s in
let filename = if out = "" then s else out ^ Filename.dir_sep ^ s in
begin try
let fh = open_out_bin filename in
for x = 0 to bytes_size efdata - 1 do output_byte fh (bget efdata x) done;
close_out fh
with
e -> Printf.eprintf "Failed to write attachment to %s\n%!" filename;
end
| _ -> ()
let dump_attached_document pdf out =
let root = Pdf.lookup_obj pdf pdf.Pdf.root in
let names =
match Pdf.lookup_direct pdf "/Names" root with Some n -> n | _ -> Pdf.Dictionary []
in
match Pdf.lookup_direct pdf "/EmbeddedFiles" names with
| Some x ->
iter (dump_attachment out pdf) (Pdf.contents_of_nametree pdf x)
| None -> ()
let dump_attached_page pdf out page =
let annots =
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
| Some (Pdf.Array l) -> l
| _ -> []
in
let efannots =
keep
(fun annot ->
match Pdf.lookup_direct pdf "/Subtype" annot with
| Some (Pdf.Name "/FileAttachment") -> true
| _ -> false)
annots
in
let fsannots = option_map (Pdf.lookup_direct pdf "/FS") efannots in
iter (dump_attachment out pdf) (map (fun x -> 0, x) fsannots)
(* Dump both document-level and page-level attached files to file, using their file names *)
let dump_attached_files pdf out =
try
dump_attached_document pdf out;
iter (dump_attached_page pdf out) (Pdfpage.pages_of_pagetree pdf)
with
e -> error (Printf.sprintf "Couldn't dump attached files: %s\n" (Printexc.to_string e))

View File

@ -1,4 +1,6 @@
(** {2 File Attachments} *)
val remove_unsafe_characters : Cpdfmetadata.encoding -> string -> string
(** [attach_file keepversion topage pdf filename] attaches the file in [filename] to the pdf, optionally to a page (rather than document-level). If keepversion is true, the PDF version number won't be altered. *)
val attach_file : ?memory:Pdfio.bytes -> bool -> int option -> Pdf.t -> string -> Pdf.t
@ -12,3 +14,5 @@ type attachment =
(** List attached files. Attachment name and page number. Page 0 is document level. *)
val list_attached_files : Pdf.t -> attachment list
val dump_attached_files : Pdf.t -> string -> unit

View File

@ -3588,8 +3588,8 @@ let go () =
| Some DumpAttachedFiles ->
let pdf = get_single_pdf args.op false in
begin match args.out with
| NoOutputSpecified -> Cpdf.dump_attached_files pdf ""
| File n -> Cpdf.dump_attached_files pdf n
| NoOutputSpecified -> Cpdfattach.dump_attached_files pdf ""
| File n -> Cpdfattach.dump_attached_files pdf n
| Stdout -> error "Can't dump attachments to stdout"
end
| Some RemoveAttachedFiles ->