cpdf-source/cpdfannot.ml

331 lines
12 KiB
OCaml
Raw Normal View History

2021-12-21 14:44:46 +01:00
open Pdfutil
2021-12-31 17:47:28 +01:00
(* List annotations *)
2021-12-21 14:44:46 +01:00
let get_annotation_string encoding pdf annot =
match Pdf.lookup_direct pdf "/Contents" annot with
| Some (Pdf.String s) -> Cpdfmetadata.encode_output encoding s
| _ -> ""
let print_annotation encoding pdf num s =
let s = get_annotation_string encoding pdf s in
match s with
| "" -> ()
| s ->
flprint (Printf.sprintf "Page %d: " num);
flprint s;
flprint "\n"
let list_page_annotations encoding pdf num page =
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
| Some (Pdf.Array annots) ->
iter (print_annotation encoding pdf num) (map (Pdf.direct pdf) annots)
| _ -> ()
2021-12-31 20:28:42 +01:00
let rewrite_destination calculate_pagenumber d =
match d with
| Pdf.Array (Pdf.Indirect i::r) ->
2023-01-13 08:40:37 +01:00
Pdf.Array (Pdf.Integer (calculate_pagenumber (Pdfdest.Fit (Pdfdest.PageObject i)))::r)
2021-12-31 20:28:42 +01:00
| x -> x
let rewrite_destinations pdf annot =
let refnums = Pdf.page_reference_numbers pdf in
let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
let calculate_pagenumber = Pdfpage.pagenumber_of_target ~fastrefnums pdf in
2023-01-13 08:40:37 +01:00
(* Deal with /P in annotation *)
let annot =
match Pdf.indirect_number pdf "/P" annot with
| Some i -> Pdf.add_dict_entry annot "/P" (Pdf.Integer (calculate_pagenumber (Pdfdest.Fit (Pdfdest.PageObject i))))
| None -> annot
in
2021-12-31 20:28:42 +01:00
(* Deal with /Dest in annotation *)
match Pdf.lookup_direct pdf "/Dest" annot with
| Some d -> Pdf.add_dict_entry annot "/Dest" (rewrite_destination calculate_pagenumber d)
| None ->
(* Deal with /A --> /D dest when /A --> /S = /GoTo *)
match Pdf.lookup_direct pdf "/A" annot with
| Some action ->
begin match Pdf.lookup_direct pdf "/D" action with
| Some d ->
Pdf.add_dict_entry
annot "/A" (Pdf.add_dict_entry action "/D" (rewrite_destination calculate_pagenumber d))
| None -> annot
end
| None -> annot
2023-01-13 06:51:43 +01:00
let extra = ref []
2023-01-14 05:59:54 +01:00
let serial = ref ~-1
let getserial () =
serial +=1; !serial
2023-01-16 03:06:59 +01:00
let objnum_to_serial_map = ref []
2021-12-21 14:44:46 +01:00
let annotations_json_page pdf page pagenum =
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
| Some (Pdf.Array annots) ->
map
(fun annot ->
2023-01-16 03:06:59 +01:00
let serial = getserial () in
begin match annot with
| Pdf.Indirect i -> objnum_to_serial_map := (i, serial)::!objnum_to_serial_map
| _ -> Printf.eprintf "annotations must be indirect\n"
end;
let annot = Pdf.direct pdf annot in
let annot = rewrite_destinations pdf annot in
2023-01-16 03:19:48 +01:00
extra := annot::!extra;
2023-01-16 08:03:34 +01:00
`List [`Int pagenum; `Int serial; Cpdfjson.json_of_object ~utf8:true ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false annot])
2023-01-16 03:06:59 +01:00
annots
2021-12-21 14:44:46 +01:00
| _ -> []
2023-01-16 04:37:25 +01:00
(* Rewrite any /Parent entries in /Popup annotations to have annot serial number, not object number, and all /Popup entries in parent annotations similarly. *)
2023-01-16 03:06:59 +01:00
let postprocess_json_pdf objnum_to_serial_map pdf obj =
2023-01-16 06:35:40 +01:00
let obj =
match obj with
| Pdf.Dictionary d ->
(* These things seem to be to do with digital signatures, which aren't
going to survive round-tripping of annotations anyway, and drag in
all sorts of extra objects we don't want, so we remove them. *)
let d = remove "/Lock" (remove "/V" d) in Pdf.Dictionary d
| _ -> obj
in
2023-01-16 03:06:59 +01:00
match obj with
| Pdf.Dictionary d ->
2023-01-16 04:37:25 +01:00
let obj =
2023-01-16 06:35:40 +01:00
begin match lookup "/Subtype" d, lookup "/Parent" d with
| Some (Pdf.Name "/Popup"), Some (Pdf.Indirect i) ->
begin match lookup i objnum_to_serial_map with
| Some s -> Pdf.add_dict_entry obj "/Parent" (Pdf.Integer s)
| None -> Printf.eprintf "Warning: Cpdfannot.process_extra_object: could not find serial number\n"; obj
end
2023-02-20 20:21:59 +01:00
| _ -> ()
(*(* If not a popup annotation, remove /Parent. It drags in lots of
2023-01-16 06:35:40 +01:00
extra objects (the whole page tree!) with a widget
annotation, and we are unlikely to be able to round-trip them
anyway. One day, if we can match FDF properly, it might be
possible, but not now. *)
2023-02-20 20:21:59 +01:00
Pdf.remove_dict_entry obj "/Parent"*)
2023-01-16 06:35:40 +01:00
end
2023-01-16 04:37:25 +01:00
in
begin match obj with
| Pdf.Dictionary d ->
begin match lookup "/Popup" d with
| Some (Pdf.Indirect i) ->
begin match lookup i objnum_to_serial_map with
| Some s -> Pdf.add_dict_entry obj "/Popup" (Pdf.Integer s)
| None -> Printf.eprintf "Warning: Cpdfannot.process_extra_object: could not find serial number 2\n"; obj
end
| _ -> obj
end
| _ -> obj
end
2023-01-16 03:06:59 +01:00
| x -> x
let postprocess_json pdf objnum_to_serial_map json =
map
(function
| `List [`Int pagenum; `Int serial; jo] ->
2023-02-17 16:30:39 +01:00
let pdfobj = Cpdfjson.object_of_json jo in
2023-01-16 03:06:59 +01:00
let fixed = postprocess_json_pdf objnum_to_serial_map pdf pdfobj in
2023-01-16 08:03:34 +01:00
`List [`Int pagenum; `Int serial; Cpdfjson.json_of_object ~utf8:true ~clean_strings:true pdf (fun _ -> ()) ~no_stream_data:false ~parse_content:false fixed]
2023-01-16 03:06:59 +01:00
| _ -> assert false)
json
2023-01-14 05:59:54 +01:00
2023-01-11 07:55:50 +01:00
let list_annotations_json range pdf =
2023-01-13 06:51:43 +01:00
extra := [];
2023-01-14 05:59:54 +01:00
serial := ~-1;
2023-01-16 03:06:59 +01:00
objnum_to_serial_map := [];
2021-12-21 14:44:46 +01:00
let module J = Cpdfyojson.Safe in
let pages = Pdfpage.pages_of_pagetree pdf in
let pagenums = indx pages in
2023-01-11 07:55:50 +01:00
let pairs = combine pages pagenums in
let pairs = option_map (fun (p, n) -> if mem n range then Some (p, n) else None) pairs in
let pages, pagenums = split pairs in
2023-01-14 05:59:54 +01:00
let json = flatten (map2 (annotations_json_page pdf) pages pagenums) in
2023-01-16 03:06:59 +01:00
let json = postprocess_json pdf !objnum_to_serial_map json in
2023-01-16 04:37:25 +01:00
let extra = map (postprocess_json_pdf !objnum_to_serial_map pdf) !extra in
(*Printf.printf "%i extra roots to explore\n" (length extra);
iter (fun x -> Printf.eprintf "%s\n\n" (Pdfwrite.string_of_pdf x)) extra;*)
2023-01-14 05:59:54 +01:00
let extra =
map
(fun n ->
2023-01-19 04:34:07 +01:00
`List
[`Int ~-n;
Cpdfjson.json_of_object ~utf8:true ~clean_strings:true pdf (fun _ -> ())
~no_stream_data:false ~parse_content:false (Pdf.lookup_obj pdf n)])
(setify
(flatten
(map
(fun x ->
let r = Pdf.objects_referenced [] [] pdf x in
(*Printf.printf "%i extra for annot %s\n" (length r)
(Pdfwrite.string_of_pdf x);*) r)
extra)))
2023-01-14 05:59:54 +01:00
in
2023-01-16 07:07:11 +01:00
let header =
`List
[`Int 0;
2023-01-19 04:34:07 +01:00
Cpdfjson.json_of_object ~utf8:true ~clean_strings:true pdf (fun _ -> ())
~no_stream_data:false ~parse_content:false
2023-01-16 07:07:11 +01:00
(Pdf.Dictionary ["/CPDFJSONannotformatversion", Pdf.Integer 1])]
in
let json = `List ([header] @ json @ extra) in
2021-12-21 14:44:46 +01:00
J.pretty_to_channel stdout json
2023-01-11 07:55:50 +01:00
let list_annotations ~json range encoding pdf =
2021-12-21 14:44:46 +01:00
if json
2023-01-11 07:55:50 +01:00
then list_annotations_json range pdf
2021-12-21 14:44:46 +01:00
else Cpdfpage.iter_pages (list_page_annotations encoding pdf) pdf range
2021-12-31 17:47:28 +01:00
(* Return annotations *)
2021-12-21 14:44:46 +01:00
let get_annotations encoding pdf =
let pages = Pdfpage.pages_of_pagetree pdf in
flatten
(map2
(fun page pagenumber ->
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
| Some (Pdf.Array annots) ->
let strings =
map (get_annotation_string encoding pdf) (map (Pdf.direct pdf) annots)
in
combine (many pagenumber (length strings)) strings
| _ -> [])
pages
(ilist 1 (length pages)))
let get_annotations_json pdf =
let module J = Cpdfyojson.Safe in
let pages = Pdfpage.pages_of_pagetree pdf in
let pagenums = indx pages in
let json = `List (flatten (map2 (annotations_json_page pdf) pages pagenums)) in
Pdfio.bytes_of_string (J.to_string json)
2023-02-20 13:56:59 +01:00
(** Set annotations from JSON. Existing annotations? *)
2023-01-13 06:51:43 +01:00
let set_annotations_json pdf json = ()
2021-12-21 14:44:46 +01:00
(* Equalise the page lengths of two PDFs by chopping or extending the first one.
*)
let equalise_lengths a b =
let a' =
if Pdfpage.endpage a < Pdfpage.endpage b then
Pdfpage.change_pages false a
(Pdfpage.pages_of_pagetree a @
many (Pdfpage.blankpage Pdfpaper.a4) (Pdfpage.endpage b - Pdfpage.endpage a))
else if Pdfpage.endpage a > Pdfpage.endpage b then
Pdfpage.change_pages false a
(take (Pdfpage.pages_of_pagetree a) (Pdfpage.endpage b))
else a
in
a', b
(* Copy annotations *)
(* FIXME: Why does this chop the files to the same length? Should be able to
apply annotations from a longer file to a shorter? *)
(* Rewrite any annotation destinations to point to pages in the
destination file. This prevents pages being copied, and ensures the links are
correct Any Indirect link inside a /Dest is rewritten if in the table. If not
inside a /Dest, nothing is rewritten. *)
let rec renumber_in_dest table indest = function
Pdf.Indirect i ->
begin
try Pdf.Indirect (Hashtbl.find table i) with _ -> Pdf.Indirect i
end
| Pdf.Array a ->
Pdf.recurse_array (renumber_in_dest table indest) a
| Pdf.Dictionary d ->
Pdf.Dictionary
(map
(function
("/Dest", v) -> ("/Dest", renumber_in_dest table true v)
| (k, v) -> (k, renumber_in_dest table indest v))
d)
| x -> x
let renumber_in_object pdf objnum table =
Pdf.addobj_given_num
pdf (objnum, (renumber_in_dest table false (Pdf.lookup_obj pdf objnum)))
let copy_annotations_page topdf frompdf frompage topage =
match Pdf.lookup_direct frompdf "/Annots" frompage.Pdfpage.rest with
Some (Pdf.Array frompage_annots as annots) ->
let table =
hashtable_of_dictionary
(combine
(Pdf.page_reference_numbers frompdf)
(Pdf.page_reference_numbers topdf))
in
iter
(function
(* FIXME: We assume they are indirects. Must also do direct, though rare.*)
Pdf.Indirect x ->
(*Printf.printf "Copying annotation %s which is\n%s\n"
(Pdfwrite.string_of_pdf (Pdf.Indirect x))
(Pdfwrite.string_of_pdf (Pdf.direct frompdf (Pdf.Indirect
x)));*)
renumber_in_object frompdf x table
| _ -> ())
frompage_annots;
let objects_to_copy = Pdf.objects_referenced [] [] frompdf annots in
iter
(fun n ->
ignore (Pdf.addobj_given_num topdf (n, Pdf.lookup_obj frompdf n)))
objects_to_copy;
let topage_annots =
match Pdf.lookup_direct frompdf "/Annots" topage.Pdfpage.rest with
| Some (Pdf.Array annots) -> annots
| _ -> []
in
let merged_dict = Pdf.Array (frompage_annots @ topage_annots) in
let topage' =
{topage with Pdfpage.rest =
Pdf.add_dict_entry topage.Pdfpage.rest "/Annots" merged_dict}
in
topdf, topage'
| Some x -> topdf, topage
| None -> topdf, topage
let copy_annotations range frompdf topdf =
let frompdf, topdf = equalise_lengths frompdf topdf in
match Pdf.renumber_pdfs [frompdf; topdf] with
| [frompdf; topdf] ->
let frompdf_pages = Pdfpage.pages_of_pagetree frompdf in
let topdf_pages = Pdfpage.pages_of_pagetree topdf in
let pdf = ref topdf
and pages = ref []
and pnum = ref 1
and frompdf_pages = ref frompdf_pages
and topdf_pages = ref topdf_pages in
(* Go through, updating pdf and collecting new pages. *)
while not (isnull !frompdf_pages) do
let frompdf_page = hd !frompdf_pages
and topdf_page = hd !topdf_pages in
let pdf', page =
if mem !pnum range
then copy_annotations_page !pdf frompdf frompdf_page topdf_page
else !pdf, topdf_page
in
pdf := pdf';
pages =| page;
incr pnum;
frompdf_pages := tl !frompdf_pages;
topdf_pages := tl !topdf_pages
done;
Pdfpage.change_pages true !pdf (rev !pages)
| _ -> assert false
2021-12-31 17:47:28 +01:00
(* Remove annotations *)
2021-12-21 14:44:46 +01:00
let remove_annotations range pdf =
let remove_annotations_page pagenum page =
if mem pagenum range then
let rest' =
Pdf.remove_dict_entry page.Pdfpage.rest "/Annots"
in
{page with Pdfpage.rest = rest'}
else
page
in
2021-12-27 16:20:07 +01:00
Cpdfpage.process_pages (Cpdfutil.ppstub remove_annotations_page) pdf range