2023-05-31 15:15:51 +02:00
|
|
|
(** A loose JSON equivalent of XFDF. *)
|
2021-12-21 14:44:46 +01:00
|
|
|
open Pdfutil
|
2023-04-18 17:39:51 +02:00
|
|
|
open Cpdferror
|
2021-12-21 14:44:46 +01:00
|
|
|
|
2023-04-22 17:58:05 +02:00
|
|
|
(* List annotations, simple old style. *)
|
2021-12-21 14:44:46 +01:00
|
|
|
let get_annotation_string encoding pdf annot =
|
|
|
|
match Pdf.lookup_direct pdf "/Contents" annot with
|
|
|
|
| Some (Pdf.String s) -> Cpdfmetadata.encode_output encoding s
|
|
|
|
| _ -> ""
|
|
|
|
|
|
|
|
let print_annotation encoding pdf num s =
|
|
|
|
let s = get_annotation_string encoding pdf s in
|
|
|
|
match s with
|
|
|
|
| "" -> ()
|
|
|
|
| s ->
|
|
|
|
flprint (Printf.sprintf "Page %d: " num);
|
|
|
|
flprint s;
|
|
|
|
flprint "\n"
|
|
|
|
|
|
|
|
let list_page_annotations encoding pdf num page =
|
|
|
|
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
|
|
|
|
| Some (Pdf.Array annots) ->
|
|
|
|
iter (print_annotation encoding pdf num) (map (Pdf.direct pdf) annots)
|
|
|
|
| _ -> ()
|
|
|
|
|
2023-04-22 17:58:05 +02:00
|
|
|
let list_annotations range encoding pdf =
|
|
|
|
Cpdfpage.iter_pages (list_page_annotations encoding pdf) pdf range
|
|
|
|
|
|
|
|
(* New, JSON style *)
|
2023-04-21 16:13:46 +02:00
|
|
|
let rewrite_destination f d =
|
2021-12-31 20:28:42 +01:00
|
|
|
match d with
|
|
|
|
| Pdf.Array (Pdf.Indirect i::r) ->
|
2023-04-21 16:13:46 +02:00
|
|
|
Pdf.Array (Pdf.Integer (f i)::r)
|
2021-12-31 20:28:42 +01:00
|
|
|
| x -> x
|
|
|
|
|
2023-04-21 16:13:46 +02:00
|
|
|
let rewrite_destinations f pdf annot =
|
|
|
|
(* Deal with /P in annotation *)
|
|
|
|
let annot =
|
|
|
|
match Pdf.indirect_number pdf "/P" annot with
|
|
|
|
| Some i -> Pdf.add_dict_entry annot "/P" (Pdf.Integer (f i))
|
|
|
|
| None -> annot
|
|
|
|
in
|
|
|
|
(* Deal with /Dest in annotation *)
|
|
|
|
match Pdf.lookup_direct pdf "/Dest" annot with
|
|
|
|
| Some d -> Pdf.add_dict_entry annot "/Dest" (rewrite_destination f d)
|
|
|
|
| None ->
|
|
|
|
(* Deal with /A --> /D dest when /A --> /S = /GoTo *)
|
|
|
|
match Pdf.lookup_direct pdf "/A" annot with
|
|
|
|
| Some action ->
|
|
|
|
begin match Pdf.lookup_direct pdf "/D" action with
|
|
|
|
| Some d ->
|
|
|
|
Pdf.add_dict_entry
|
|
|
|
annot "/A" (Pdf.add_dict_entry action "/D" (rewrite_destination f d))
|
|
|
|
| None -> annot
|
|
|
|
end
|
|
|
|
| None -> annot
|
2021-12-31 20:28:42 +01:00
|
|
|
|
2023-05-29 21:07:42 +02:00
|
|
|
(* We exclude the same annotations as the XFDF spec does *)
|
|
|
|
let excluded pdf annot =
|
|
|
|
match Pdf.lookup_direct pdf "/Subtype" annot with
|
|
|
|
| Some (Pdf.Name ("/Movie" | "/Widget" | "/Screen" | "/PrinterMark" | "/TrapNet")) -> true
|
|
|
|
| _ -> false
|
2023-05-30 15:55:29 +02:00
|
|
|
(* Printf.printf "Skipping %s\n" n; true
|
|
|
|
| Some (Pdf.Name x) ->
|
|
|
|
Printf.printf "Including %s\n" x; false*)
|
2023-05-29 21:07:42 +02:00
|
|
|
|
2023-01-13 06:51:43 +01:00
|
|
|
let extra = ref []
|
|
|
|
|
2023-04-21 16:13:46 +02:00
|
|
|
let annotations_json_page calculate_pagenumber pdf page pagenum =
|
2021-12-21 14:44:46 +01:00
|
|
|
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
|
|
|
|
| Some (Pdf.Array annots) ->
|
2023-04-21 15:38:03 +02:00
|
|
|
option_map
|
2021-12-21 14:44:46 +01:00
|
|
|
(fun annot ->
|
2023-04-21 15:38:03 +02:00
|
|
|
begin match annot with
|
|
|
|
| Pdf.Indirect objnum ->
|
|
|
|
let annot = Pdf.direct pdf annot in
|
2023-05-29 21:07:42 +02:00
|
|
|
if excluded pdf annot then None else
|
2023-04-21 16:13:46 +02:00
|
|
|
let annot =
|
|
|
|
rewrite_destinations
|
|
|
|
(fun i -> calculate_pagenumber (Pdfdest.Fit (Pdfdest.PageObject i)))
|
|
|
|
pdf annot
|
|
|
|
in
|
2023-04-21 15:38:03 +02:00
|
|
|
extra := annot::!extra;
|
|
|
|
Some (`List
|
|
|
|
[`Int pagenum;
|
|
|
|
`Int objnum;
|
|
|
|
Cpdfjson.json_of_object ~utf8:true ~clean_strings:true pdf (fun _ -> ())
|
|
|
|
~no_stream_data:false ~parse_content:false annot])
|
2023-04-25 14:45:56 +02:00
|
|
|
| _ -> Pdfe.log "Warning: annotations must be indirect\n"; None
|
2023-04-21 15:38:03 +02:00
|
|
|
end)
|
2023-01-16 03:06:59 +01:00
|
|
|
annots
|
2021-12-21 14:44:46 +01:00
|
|
|
| _ -> []
|
|
|
|
|
2023-04-22 17:58:05 +02:00
|
|
|
let get_annotations_json pdf range =
|
2023-04-21 16:13:46 +02:00
|
|
|
let refnums = Pdf.page_reference_numbers pdf in
|
|
|
|
let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
|
|
|
|
let calculate_pagenumber = Pdfpage.pagenumber_of_target ~fastrefnums pdf in
|
2023-01-13 06:51:43 +01:00
|
|
|
extra := [];
|
2021-12-21 14:44:46 +01:00
|
|
|
let pages = Pdfpage.pages_of_pagetree pdf in
|
|
|
|
let pagenums = indx pages in
|
2023-01-11 07:55:50 +01:00
|
|
|
let pairs = combine pages pagenums in
|
|
|
|
let pairs = option_map (fun (p, n) -> if mem n range then Some (p, n) else None) pairs in
|
|
|
|
let pages, pagenums = split pairs in
|
2023-04-21 16:13:46 +02:00
|
|
|
let json = flatten (map2 (annotations_json_page calculate_pagenumber pdf) pages pagenums) in
|
2023-05-29 16:18:43 +02:00
|
|
|
let jsonobjnums : int list = map (function `List [_; `Int n; _] -> n | _ -> assert false) json in
|
2023-05-30 15:55:29 +02:00
|
|
|
(*Printf.eprintf "%i extra roots to explore\n" (length !extra);
|
|
|
|
iter (fun x -> Pdfe.log (Printf.sprintf "%s\n\n" (Pdfwrite.string_of_pdf x))) !extra;*)
|
2023-01-14 05:59:54 +01:00
|
|
|
let extra =
|
|
|
|
map
|
|
|
|
(fun n ->
|
2023-01-19 04:34:07 +01:00
|
|
|
`List
|
2023-04-21 15:38:03 +02:00
|
|
|
[`Int n;
|
2023-01-19 04:34:07 +01:00
|
|
|
Cpdfjson.json_of_object ~utf8:true ~clean_strings:true pdf (fun _ -> ())
|
|
|
|
~no_stream_data:false ~parse_content:false (Pdf.lookup_obj pdf n)])
|
|
|
|
(setify
|
|
|
|
(flatten
|
|
|
|
(map
|
|
|
|
(fun x ->
|
2023-05-30 15:55:29 +02:00
|
|
|
let x = Pdf.remove_dict_entry x "/Popup" in
|
|
|
|
let x = Pdf.remove_dict_entry x "/Parent" in
|
2023-01-19 04:34:07 +01:00
|
|
|
let r = Pdf.objects_referenced [] [] pdf x in
|
2023-05-30 15:55:29 +02:00
|
|
|
(*Printf.eprintf "%i extra for annot %s\n" (length r) (Pdfwrite.string_of_pdf x);*)
|
|
|
|
r)
|
2023-04-21 15:38:03 +02:00
|
|
|
!extra)))
|
2023-01-14 05:59:54 +01:00
|
|
|
in
|
2023-05-29 16:18:43 +02:00
|
|
|
let extra =
|
|
|
|
option_map
|
|
|
|
(function `List [`Int n; _] as json -> if mem n jsonobjnums then None else Some json | _ -> assert false)
|
|
|
|
extra
|
|
|
|
in
|
2023-01-16 07:07:11 +01:00
|
|
|
let header =
|
|
|
|
`List
|
|
|
|
[`Int 0;
|
2023-01-19 04:34:07 +01:00
|
|
|
Cpdfjson.json_of_object ~utf8:true ~clean_strings:true pdf (fun _ -> ())
|
|
|
|
~no_stream_data:false ~parse_content:false
|
2023-01-16 07:07:11 +01:00
|
|
|
(Pdf.Dictionary ["/CPDFJSONannotformatversion", Pdf.Integer 1])]
|
|
|
|
in
|
|
|
|
let json = `List ([header] @ json @ extra) in
|
2023-06-01 16:07:55 +02:00
|
|
|
Pdfio.bytes_of_string (Cpdfyojson.Safe.pretty_to_string json)
|
2021-12-21 14:44:46 +01:00
|
|
|
|
2021-12-31 17:47:28 +01:00
|
|
|
(* Return annotations *)
|
2021-12-21 14:44:46 +01:00
|
|
|
let get_annotations encoding pdf =
|
|
|
|
let pages = Pdfpage.pages_of_pagetree pdf in
|
|
|
|
flatten
|
|
|
|
(map2
|
|
|
|
(fun page pagenumber ->
|
|
|
|
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
|
|
|
|
| Some (Pdf.Array annots) ->
|
|
|
|
let strings =
|
|
|
|
map (get_annotation_string encoding pdf) (map (Pdf.direct pdf) annots)
|
|
|
|
in
|
|
|
|
combine (many pagenumber (length strings)) strings
|
|
|
|
| _ -> [])
|
|
|
|
pages
|
|
|
|
(ilist 1 (length pages)))
|
|
|
|
|
2023-04-18 17:39:51 +02:00
|
|
|
(** Set annotations from JSON, keeping any existing ones. *)
|
|
|
|
let set_annotations_json pdf i =
|
2023-06-01 16:06:05 +02:00
|
|
|
match Cpdfyojson.Safe.from_string (Pdfio.string_of_input i) with
|
2023-04-18 17:39:51 +02:00
|
|
|
| `List entries ->
|
2023-04-18 18:09:21 +02:00
|
|
|
(* Renumber the PDF so everything has bigger object numbers than that. *)
|
2023-04-18 17:39:51 +02:00
|
|
|
let maxobjnum =
|
2023-04-21 16:15:06 +02:00
|
|
|
fold_left max min_int
|
|
|
|
(map
|
|
|
|
(function
|
2023-05-22 17:38:43 +02:00
|
|
|
| `List [_; `Int i; _] | `List [`Int i; _] -> i
|
2023-04-21 16:15:06 +02:00
|
|
|
| _ -> error "Bad annots entry")
|
|
|
|
entries)
|
2023-04-18 18:09:21 +02:00
|
|
|
in
|
2023-04-21 16:15:06 +02:00
|
|
|
let pdf_objnums = map fst (list_of_hashtbl pdf.Pdf.objects.Pdf.pdfobjects) in
|
2023-04-18 18:09:21 +02:00
|
|
|
let change_table =
|
|
|
|
hashtable_of_dictionary (map2 (fun f t -> (f, t)) pdf_objnums (ilist (maxobjnum + 1) (maxobjnum + length pdf_objnums)))
|
|
|
|
in
|
2023-04-21 17:22:57 +02:00
|
|
|
let pdf' = Pdf.renumber change_table pdf in
|
|
|
|
pdf.root <- pdf'.root;
|
|
|
|
pdf.objects <- pdf'.objects;
|
|
|
|
pdf.trailerdict <- pdf'.trailerdict;
|
2023-05-22 18:25:47 +02:00
|
|
|
(* Add the extra objects back in and build the annotations. *)
|
2023-05-22 17:38:43 +02:00
|
|
|
let extras = option_map (function `List [`Int i; o] -> Some (i, o) | _ -> None) entries in
|
|
|
|
let annots = option_map (function `List [`Int pagenum; `Int i; o] -> Some (pagenum, i, o) | _ -> None) entries in
|
|
|
|
iter (fun (i, o) -> Pdf.addobj_given_num pdf (i, Cpdfjson.object_of_json o)) extras;
|
|
|
|
let pageobjnummap =
|
|
|
|
let refnums = Pdf.page_reference_numbers pdf in
|
|
|
|
combine (indx refnums) refnums
|
|
|
|
in
|
|
|
|
let pages = Pdfpage.pages_of_pagetree pdf in
|
2023-05-22 18:25:47 +02:00
|
|
|
let annotsforeachpage = collate compare (sort compare annots) in
|
2023-05-22 17:38:43 +02:00
|
|
|
let newpages =
|
|
|
|
map2
|
|
|
|
(fun pagenum page ->
|
|
|
|
let forthispage = flatten (keep (function (p, _, _)::t when p = pagenum -> true | _ -> false) annotsforeachpage) in
|
|
|
|
iter
|
|
|
|
(fun (pnum, i, o) ->
|
|
|
|
let pageobjnum = match lookup pnum pageobjnummap with Some x -> x | None -> 0 in
|
|
|
|
let f = fun pnum -> if pageobjnum = 0 then pnum else pageobjnum in
|
2023-05-22 18:25:47 +02:00
|
|
|
Pdf.addobj_given_num pdf (i, rewrite_destinations f pdf (Cpdfjson.object_of_json o)))
|
2023-05-22 17:38:43 +02:00
|
|
|
forthispage;
|
|
|
|
if forthispage = [] then page else
|
|
|
|
let annots =
|
2023-05-22 18:25:47 +02:00
|
|
|
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with | Some (Pdf.Array annots) -> annots | _ -> []
|
2023-05-22 17:38:43 +02:00
|
|
|
in
|
|
|
|
let newannots = map (fun (_, i, _) -> Pdf.Indirect i) forthispage in
|
|
|
|
{page with Pdfpage.rest = Pdf.add_dict_entry page.Pdfpage.rest "/Annots" (Pdf.Array (annots @ newannots))})
|
|
|
|
(indx pages)
|
|
|
|
pages
|
|
|
|
in
|
|
|
|
let pdf' = Pdfpage.change_pages true pdf newpages in
|
|
|
|
pdf.root <- pdf'.root;
|
|
|
|
pdf.objects <- pdf'.objects;
|
|
|
|
pdf.trailerdict <- pdf'.trailerdict
|
2023-04-18 17:39:51 +02:00
|
|
|
| _ -> error "Bad Annotations JSON file"
|
2023-01-13 06:51:43 +01:00
|
|
|
|
2023-04-22 16:28:11 +02:00
|
|
|
let copy_annotations range frompdf topdf =
|
|
|
|
set_annotations_json topdf (Pdfio.input_of_bytes (get_annotations_json frompdf range))
|
2021-12-21 14:44:46 +01:00
|
|
|
|
2021-12-31 17:47:28 +01:00
|
|
|
(* Remove annotations *)
|
2021-12-21 14:44:46 +01:00
|
|
|
let remove_annotations range pdf =
|
|
|
|
let remove_annotations_page pagenum page =
|
|
|
|
if mem pagenum range then
|
|
|
|
let rest' =
|
|
|
|
Pdf.remove_dict_entry page.Pdfpage.rest "/Annots"
|
|
|
|
in
|
|
|
|
{page with Pdfpage.rest = rest'}
|
|
|
|
else
|
|
|
|
page
|
|
|
|
in
|
2023-03-23 20:24:50 +01:00
|
|
|
Cpdfpage.process_pages (Pdfpage.ppstub remove_annotations_page) pdf range
|