mirror of
https://github.com/johnwhitington/cpdf-source.git
synced 2025-01-20 12:48:40 +01:00
359 lines
15 KiB
OCaml
359 lines
15 KiB
OCaml
open Pdfutil
|
|
open Pdfio
|
|
open Cpdferror
|
|
|
|
(* Remove characters which might not make good filenames. In, UTF8, out UTF8. *)
|
|
let remove_unsafe_characters s =
|
|
let codepoints = Pdftext.codepoints_of_utf8 s in
|
|
let codepoints =
|
|
lose
|
|
(function x ->
|
|
x = int_of_char '/'
|
|
|| x = int_of_char '?'
|
|
|| x = int_of_char '<'
|
|
|| x = int_of_char '>'
|
|
|| x = int_of_char '\\'
|
|
|| x = int_of_char ':'
|
|
|| x = int_of_char '*'
|
|
|| x = int_of_char '|'
|
|
|| x = int_of_char '\"'
|
|
|| x = int_of_char '^'
|
|
|| x = int_of_char '+'
|
|
|| x = int_of_char '='
|
|
|| x < 32
|
|
|| x = 127)
|
|
codepoints
|
|
in
|
|
match codepoints with
|
|
| 46::more -> Pdftext.utf8_of_codepoints codepoints (* Don't produce a dotfile *)
|
|
| chars -> Pdftext.utf8_of_codepoints codepoints
|
|
|
|
(* Attaching files *)
|
|
let attach_file ?memory keepversion topage pdf file =
|
|
let data =
|
|
match memory with
|
|
Some data -> data
|
|
| None ->
|
|
let ch = open_in_bin file in
|
|
let len = in_channel_length ch in
|
|
let stream = mkbytes len in
|
|
let i = input_of_channel ch in
|
|
setinit i stream 0 len;
|
|
close_in ch;
|
|
stream
|
|
in
|
|
let filestream =
|
|
Pdf.Stream
|
|
(ref (Pdf.Dictionary
|
|
[("/Length", Pdf.Integer (bytes_size data));
|
|
("/Type", Pdf.Name "/EmbeddedFile");
|
|
("/Params",
|
|
Pdf.Dictionary
|
|
[("/Size", Pdf.Integer (bytes_size data));
|
|
("/CheckSum", Pdf.String (Digest.string (string_of_bytes data)))
|
|
])],
|
|
Pdf.Got data))
|
|
in
|
|
let filestream_num = Pdf.addobj pdf filestream in
|
|
let basename = Pdftext.pdfdocstring_of_utf8 (Filename.basename file) in
|
|
let filespec =
|
|
Pdf.Dictionary
|
|
[("/EF", Pdf.Dictionary ["/F", Pdf.Indirect filestream_num]);
|
|
("/F", Pdf.String basename);
|
|
("/Type", Pdf.Name "/Filespec");
|
|
("/Desc", Pdf.String "");
|
|
("/UF", Pdf.String basename)]
|
|
in
|
|
match topage with
|
|
| None ->
|
|
(* Look up /Names and /EmbeddedFiles and /Names. *)
|
|
let rootdict = Pdf.lookup_obj pdf pdf.Pdf.root in
|
|
let namedict =
|
|
match Pdf.lookup_direct pdf "/Names" rootdict with
|
|
| None -> Pdf.Dictionary []
|
|
| Some namedict -> namedict
|
|
in
|
|
let embeddednamedict =
|
|
match Pdf.lookup_direct pdf "/EmbeddedFiles" namedict with
|
|
| None -> Pdf.Dictionary []
|
|
| Some embeddednamedict -> embeddednamedict
|
|
in
|
|
let elts =
|
|
match Pdf.lookup_direct pdf "/Names" embeddednamedict with
|
|
| Some (Pdf.Array elts) -> elts
|
|
| _ -> []
|
|
in
|
|
let filespecobj = Pdf.addobj pdf filespec in
|
|
let names' = Pdf.Array (elts @ [Pdf.String basename; Pdf.Indirect filespecobj]) in
|
|
let embeddednamedict' = Pdf.add_dict_entry embeddednamedict "/Names" names' in
|
|
let namedict' = Pdf.add_dict_entry namedict "/EmbeddedFiles" embeddednamedict' in
|
|
let rootdict' = Pdf.add_dict_entry rootdict "/Names" namedict' in
|
|
let rootnum = Pdf.addobj pdf rootdict' in
|
|
{pdf with
|
|
Pdf.minor = if keepversion || pdf.Pdf.major > 1 then pdf.Pdf.minor else max pdf.Pdf.minor 4;
|
|
Pdf.root = rootnum;
|
|
Pdf.trailerdict =
|
|
Pdf.add_dict_entry
|
|
pdf.Pdf.trailerdict "/Root" (Pdf.Indirect rootnum)}
|
|
| Some pagenumber ->
|
|
let pages = Pdfpage.pages_of_pagetree pdf in
|
|
if pagenumber < 0 || pagenumber > length pages then error "attach_file: Page not found" else
|
|
let page = select pagenumber pages in
|
|
let annots =
|
|
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
|
|
| Some (Pdf.Array annots) -> annots
|
|
| _ -> []
|
|
in
|
|
let rect =
|
|
let minx, miny, maxx, maxy = Pdf.parse_rectangle pdf page.Pdfpage.mediabox in
|
|
Pdf.Array [Pdf.Real 18.; Pdf.Real (maxy -. 45.); Pdf.Real 45.; Pdf.Real (maxy -. 18.)]
|
|
in
|
|
let filespecobj = Pdf.addobj pdf filespec in
|
|
let annot =
|
|
Pdf.Dictionary
|
|
[("/FS", Pdf.Indirect filespecobj);
|
|
("/Subtype", Pdf.Name "/FileAttachment");
|
|
("/Contents", Pdf.String basename);
|
|
("/Rect", rect)]
|
|
in
|
|
let annots' = Pdf.Array (annot::annots) in
|
|
let page' =
|
|
{page with Pdfpage.rest = Pdf.add_dict_entry page.Pdfpage.rest "/Annots" annots'}
|
|
in
|
|
let pages' = replace_number pagenumber page' pages in
|
|
let pdf = Pdfpage.change_pages true pdf pages' in
|
|
{pdf with
|
|
Pdf.minor = if keepversion || pdf.Pdf.major > 1 then pdf.Pdf.minor else max pdf.Pdf.minor 4}
|
|
|
|
type attachment =
|
|
{name : string;
|
|
pagenumber : int;
|
|
data : unit -> Pdfio.bytes}
|
|
|
|
let list_attached_files pdf =
|
|
let toplevel =
|
|
match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with
|
|
| None -> []
|
|
| Some rootdict ->
|
|
match Pdf.lookup_direct pdf "/Names" rootdict with
|
|
| None -> []
|
|
| Some namedict ->
|
|
match Pdf.lookup_direct pdf "/EmbeddedFiles" namedict with
|
|
| Some nametree ->
|
|
map
|
|
(function (x, ef) ->
|
|
match Pdf.lookup_direct pdf "/EF" ef with
|
|
| Some ((Pdf.Dictionary _) as d) ->
|
|
begin match Pdf.lookup_direct pdf "/F" d with
|
|
| Some stream ->
|
|
{name = Pdftext.utf8_of_pdfdocstring x;
|
|
pagenumber = 0;
|
|
data =
|
|
(fun () ->
|
|
try
|
|
Pdf.getstream stream;
|
|
Pdfcodec.decode_pdfstream pdf stream;
|
|
match stream with
|
|
Pdf.Stream {contents = (_, Pdf.Got data)} -> data
|
|
| _ -> raise Not_found
|
|
with
|
|
_ -> raise (Pdf.PDFError "could not retreive attachment data"))}
|
|
| None -> raise (Pdf.PDFError "/F not found")
|
|
end
|
|
| _ -> raise (Pdf.PDFError "/EF not found"))
|
|
(option_map
|
|
(function (Pdf.String s, ef) -> Some (s, ef) | _ -> None)
|
|
(Pdf.contents_of_nametree pdf nametree))
|
|
| _ -> []
|
|
in let pagelevel =
|
|
let pages = Pdfpage.pages_of_pagetree pdf in
|
|
flatten
|
|
(map2
|
|
(fun page pagenumber ->
|
|
option_map
|
|
(function annot ->
|
|
match Pdf.lookup_direct pdf "/Subtype" annot with
|
|
| Some (Pdf.Name "/FileAttachment") ->
|
|
(match Pdf.lookup_direct pdf "/Contents" annot with
|
|
| Some (Pdf.String s) ->
|
|
begin match Pdf.lookup_direct pdf "/FS" annot with
|
|
| Some ((Pdf.Dictionary _) as d) ->
|
|
(*Pdfe.log (Printf.sprintf "%s\n" (Pdfwrite.string_of_pdf d));*)
|
|
begin match Pdf.lookup_direct pdf "/EF" d with
|
|
| Some ((Pdf.Dictionary _) as d) ->
|
|
begin match Pdf.lookup_direct pdf "/F" d with
|
|
| Some stream ->
|
|
Some
|
|
{name = Pdftext.utf8_of_pdfdocstring s;
|
|
pagenumber = pagenumber;
|
|
data =
|
|
(fun () ->
|
|
try
|
|
Pdf.getstream stream;
|
|
Pdfcodec.decode_pdfstream pdf stream;
|
|
match stream with
|
|
Pdf.Stream {contents = (_, Pdf.Got data)} -> data
|
|
| _ -> raise Not_found
|
|
with
|
|
_ -> raise (Pdf.PDFError "could not retreive attachment data"))}
|
|
| _ -> raise (Pdf.PDFError "no /F found in attachment")
|
|
end
|
|
| _ ->
|
|
Some
|
|
{name = Pdftext.utf8_of_pdfdocstring s;
|
|
pagenumber = pagenumber;
|
|
data = (fun () -> raise (Pdf.PDFError "no attachment data"))}
|
|
end
|
|
| _ -> None
|
|
end
|
|
| _ -> None)
|
|
| _ -> None)
|
|
(match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
|
|
| Some (Pdf.Array annots) -> annots
|
|
| _ -> []))
|
|
pages
|
|
(indx pages))
|
|
in
|
|
toplevel @ pagelevel
|
|
|
|
(* Remove Attached files *)
|
|
let remove_attached_files_on_pages pdf =
|
|
let remove_from_page page =
|
|
{page with Pdfpage.rest =
|
|
Pdf.add_dict_entry page.Pdfpage.rest "/Annots"
|
|
(Pdf.Array
|
|
(option_map
|
|
(function annot ->
|
|
match Pdf.lookup_direct pdf "/Subtype" annot with
|
|
| Some (Pdf.Name "/FileAttachment") -> None
|
|
| _ -> Some annot)
|
|
(match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
|
|
| Some (Pdf.Array annots) -> annots
|
|
| _ -> [])))}
|
|
in
|
|
Pdfpage.change_pages true pdf (map remove_from_page (Pdfpage.pages_of_pagetree pdf))
|
|
|
|
let remove_attached_files pdf =
|
|
let pdf = remove_attached_files_on_pages pdf in
|
|
match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with
|
|
| None -> pdf
|
|
| Some rootdict ->
|
|
match Pdf.lookup_direct pdf "/Names" rootdict with
|
|
| None -> pdf
|
|
| Some namedict ->
|
|
let namedict' = Pdf.remove_dict_entry namedict "/EmbeddedFiles" in
|
|
let rootdict' = Pdf.add_dict_entry rootdict "/Names" namedict' in
|
|
let rootdict'num = Pdf.addobj pdf rootdict' in
|
|
{pdf with
|
|
Pdf.root =
|
|
rootdict'num;
|
|
Pdf.trailerdict =
|
|
Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect rootdict'num)}
|
|
|
|
let dump_attachment out pdf (_, embeddedfile) =
|
|
match Pdf.lookup_direct pdf "/F" embeddedfile with
|
|
| Some (Pdf.String s) ->
|
|
let efdata =
|
|
begin match Pdf.lookup_direct pdf "/EF" embeddedfile with
|
|
| Some d ->
|
|
let stream =
|
|
match Pdf.lookup_direct pdf "/F" d with
|
|
| Some s -> s
|
|
| None -> error "Bad embedded file stream"
|
|
in
|
|
Pdfcodec.decode_pdfstream_until_unknown pdf stream;
|
|
begin match stream with Pdf.Stream {contents = (_, Pdf.Got b)} -> b | _ -> error "Bad embedded file stream" end
|
|
| _ -> error "Bad embedded file stream"
|
|
end
|
|
in
|
|
let s = remove_unsafe_characters (Pdftext.utf8_of_pdfdocstring s) in
|
|
let filename = if out = "" then s else out ^ Filename.dir_sep ^ s in
|
|
begin try
|
|
let fh = open_out_bin filename in
|
|
for x = 0 to bytes_size efdata - 1 do output_byte fh (bget efdata x) done;
|
|
close_out fh
|
|
with
|
|
e -> Pdfe.log (Printf.sprintf "Failed to write attachment to %s\n" filename);
|
|
end
|
|
| _ -> ()
|
|
|
|
let dump_attached_document pdf out =
|
|
let root = Pdf.lookup_obj pdf pdf.Pdf.root in
|
|
let names =
|
|
match Pdf.lookup_direct pdf "/Names" root with Some n -> n | _ -> Pdf.Dictionary []
|
|
in
|
|
match Pdf.lookup_direct pdf "/EmbeddedFiles" names with
|
|
| Some x ->
|
|
iter (dump_attachment out pdf) (Pdf.contents_of_nametree pdf x)
|
|
| None -> ()
|
|
|
|
let dump_attached_page pdf out page =
|
|
let annots =
|
|
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
|
|
| Some (Pdf.Array l) -> l
|
|
| _ -> []
|
|
in
|
|
let efannots =
|
|
keep
|
|
(fun annot ->
|
|
match Pdf.lookup_direct pdf "/Subtype" annot with
|
|
| Some (Pdf.Name "/FileAttachment") -> true
|
|
| _ -> false)
|
|
annots
|
|
in
|
|
let fsannots = option_map (Pdf.lookup_direct pdf "/FS") efannots in
|
|
iter (dump_attachment out pdf) (map (fun x -> 0, x) fsannots)
|
|
|
|
(* Dump both document-level and page-level attached files to file, using their file names *)
|
|
let dump_attached_files pdf out =
|
|
try
|
|
dump_attached_document pdf out;
|
|
iter (dump_attached_page pdf out) (Pdfpage.pages_of_pagetree pdf)
|
|
with
|
|
e -> error (Printf.sprintf "Couldn't dump attached files: %s\n" (Printexc.to_string e))
|
|
|
|
let size_attachment pdf (_, embeddedfile) =
|
|
match Pdf.lookup_direct pdf "/F" embeddedfile with
|
|
| Some (Pdf.String s) ->
|
|
begin match Pdf.lookup_direct pdf "/EF" embeddedfile with
|
|
| Some d ->
|
|
let stream =
|
|
match Pdf.lookup_direct pdf "/F" d with
|
|
| Some s -> s
|
|
| None -> error "Bad embedded file stream"
|
|
in
|
|
begin match stream with Pdf.Stream {contents = (_, Pdf.Got b)} -> bytes_size b | _ -> error "Bad embedded file stream" end
|
|
| _ -> error "Bad embedded file stream"
|
|
end
|
|
| _ -> 0
|
|
|
|
let size_page_files pdf page =
|
|
let annots =
|
|
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
|
|
| Some (Pdf.Array l) -> l
|
|
| _ -> []
|
|
in
|
|
let efannots =
|
|
keep
|
|
(fun annot ->
|
|
match Pdf.lookup_direct pdf "/Subtype" annot with
|
|
| Some (Pdf.Name "/FileAttachment") -> true
|
|
| _ -> false)
|
|
annots
|
|
in
|
|
let fsannots = option_map (Pdf.lookup_direct pdf "/FS") efannots in
|
|
map (size_attachment pdf) (map (fun x -> 0, x) fsannots)
|
|
|
|
let size_document_files pdf =
|
|
let root = Pdf.lookup_obj pdf pdf.Pdf.root in
|
|
let names =
|
|
match Pdf.lookup_direct pdf "/Names" root with Some n -> n | _ -> Pdf.Dictionary []
|
|
in
|
|
match Pdf.lookup_direct pdf "/EmbeddedFiles" names with
|
|
| Some x ->
|
|
sum (map (size_attachment pdf) (Pdf.contents_of_nametree pdf x))
|
|
| None -> 0
|
|
|
|
let size_attached_files pdf =
|
|
size_document_files pdf + sum (flatten (map (size_page_files pdf) (Pdfpage.pages_of_pagetree pdf)))
|