cpdf-source/cpdfannot.ml

230 lines
8.5 KiB
OCaml
Raw Normal View History

2021-12-21 14:44:46 +01:00
open Pdfutil
2021-12-31 17:47:28 +01:00
(* List annotations *)
2021-12-21 14:44:46 +01:00
let get_annotation_string encoding pdf annot =
match Pdf.lookup_direct pdf "/Contents" annot with
| Some (Pdf.String s) -> Cpdfmetadata.encode_output encoding s
| _ -> ""
let print_annotation encoding pdf num s =
let s = get_annotation_string encoding pdf s in
match s with
| "" -> ()
| s ->
flprint (Printf.sprintf "Page %d: " num);
flprint s;
flprint "\n"
let list_page_annotations encoding pdf num page =
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
| Some (Pdf.Array annots) ->
iter (print_annotation encoding pdf num) (map (Pdf.direct pdf) annots)
| _ -> ()
2021-12-31 17:47:28 +01:00
(* In the future, we will allow round-tripping of JSON annotations, but this
will be complicated. For now, we just turn some indirect things into direct
things, so that the output contains all the pertinent information, not for
round-tripping, but for mere extraction. *)
let make_direct pdf annot =
match Pdf.lookup_direct pdf "/A" annot with
| None -> annot
| Some d -> Pdf.add_dict_entry annot "/A" d
2021-12-31 20:28:42 +01:00
let rewrite_destination calculate_pagenumber d =
match d with
| Pdf.Array (Pdf.Indirect i::r) ->
Pdf.Array (Pdf.Indirect (calculate_pagenumber (Pdfdest.Fit (Pdfdest.PageObject i)))::r)
| x -> x
let rewrite_destinations pdf annot =
let refnums = Pdf.page_reference_numbers pdf in
let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
let calculate_pagenumber = Pdfpage.pagenumber_of_target ~fastrefnums pdf in
(* Deal with /Dest in annotation *)
match Pdf.lookup_direct pdf "/Dest" annot with
| Some d -> Pdf.add_dict_entry annot "/Dest" (rewrite_destination calculate_pagenumber d)
| None ->
(* Deal with /A --> /D dest when /A --> /S = /GoTo *)
match Pdf.lookup_direct pdf "/A" annot with
| Some action ->
begin match Pdf.lookup_direct pdf "/D" action with
| Some d ->
Pdf.add_dict_entry
annot "/A" (Pdf.add_dict_entry action "/D" (rewrite_destination calculate_pagenumber d))
| None -> annot
end
| None -> annot
2021-12-21 14:44:46 +01:00
let annotations_json_page pdf page pagenum =
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
| Some (Pdf.Array annots) ->
map
(fun annot ->
2021-12-31 17:47:28 +01:00
let annot = make_direct pdf annot in
`List [`Int pagenum; Cpdfjson.json_of_object ~clean_strings:true pdf (fun _ -> ()) false false annot])
2021-12-21 14:44:46 +01:00
(map (Pdf.direct pdf) annots)
| _ -> []
2023-01-11 07:55:50 +01:00
let list_annotations_json range pdf =
2021-12-21 14:44:46 +01:00
let module J = Cpdfyojson.Safe in
let pages = Pdfpage.pages_of_pagetree pdf in
let pagenums = indx pages in
2023-01-11 07:55:50 +01:00
let pairs = combine pages pagenums in
let pairs = option_map (fun (p, n) -> if mem n range then Some (p, n) else None) pairs in
let pages, pagenums = split pairs in
2021-12-21 14:44:46 +01:00
let json = `List (flatten (map2 (annotations_json_page pdf) pages pagenums)) in
J.pretty_to_channel stdout json
2023-01-11 07:55:50 +01:00
let list_annotations ~json range encoding pdf =
2021-12-21 14:44:46 +01:00
if json
2023-01-11 07:55:50 +01:00
then list_annotations_json range pdf
2021-12-21 14:44:46 +01:00
else Cpdfpage.iter_pages (list_page_annotations encoding pdf) pdf range
2021-12-31 17:47:28 +01:00
(* Return annotations *)
2021-12-21 14:44:46 +01:00
let get_annotations encoding pdf =
let pages = Pdfpage.pages_of_pagetree pdf in
flatten
(map2
(fun page pagenumber ->
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
| Some (Pdf.Array annots) ->
let strings =
map (get_annotation_string encoding pdf) (map (Pdf.direct pdf) annots)
in
combine (many pagenumber (length strings)) strings
| _ -> [])
pages
(ilist 1 (length pages)))
let get_annotations_json pdf =
let module J = Cpdfyojson.Safe in
let pages = Pdfpage.pages_of_pagetree pdf in
let pagenums = indx pages in
let json = `List (flatten (map2 (annotations_json_page pdf) pages pagenums)) in
Pdfio.bytes_of_string (J.to_string json)
2021-12-21 14:44:46 +01:00
(* Equalise the page lengths of two PDFs by chopping or extending the first one.
*)
let equalise_lengths a b =
let a' =
if Pdfpage.endpage a < Pdfpage.endpage b then
Pdfpage.change_pages false a
(Pdfpage.pages_of_pagetree a @
many (Pdfpage.blankpage Pdfpaper.a4) (Pdfpage.endpage b - Pdfpage.endpage a))
else if Pdfpage.endpage a > Pdfpage.endpage b then
Pdfpage.change_pages false a
(take (Pdfpage.pages_of_pagetree a) (Pdfpage.endpage b))
else a
in
a', b
(* Copy annotations *)
(* FIXME: Why does this chop the files to the same length? Should be able to
apply annotations from a longer file to a shorter? *)
(* Rewrite any annotation destinations to point to pages in the
destination file. This prevents pages being copied, and ensures the links are
correct Any Indirect link inside a /Dest is rewritten if in the table. If not
inside a /Dest, nothing is rewritten. *)
let rec renumber_in_dest table indest = function
Pdf.Indirect i ->
begin
try Pdf.Indirect (Hashtbl.find table i) with _ -> Pdf.Indirect i
end
| Pdf.Array a ->
Pdf.recurse_array (renumber_in_dest table indest) a
| Pdf.Dictionary d ->
Pdf.Dictionary
(map
(function
("/Dest", v) -> ("/Dest", renumber_in_dest table true v)
| (k, v) -> (k, renumber_in_dest table indest v))
d)
| x -> x
let renumber_in_object pdf objnum table =
Pdf.addobj_given_num
pdf (objnum, (renumber_in_dest table false (Pdf.lookup_obj pdf objnum)))
let copy_annotations_page topdf frompdf frompage topage =
match Pdf.lookup_direct frompdf "/Annots" frompage.Pdfpage.rest with
Some (Pdf.Array frompage_annots as annots) ->
let table =
hashtable_of_dictionary
(combine
(Pdf.page_reference_numbers frompdf)
(Pdf.page_reference_numbers topdf))
in
iter
(function
(* FIXME: We assume they are indirects. Must also do direct, though rare.*)
Pdf.Indirect x ->
(*Printf.printf "Copying annotation %s which is\n%s\n"
(Pdfwrite.string_of_pdf (Pdf.Indirect x))
(Pdfwrite.string_of_pdf (Pdf.direct frompdf (Pdf.Indirect
x)));*)
renumber_in_object frompdf x table
| _ -> ())
frompage_annots;
let objects_to_copy = Pdf.objects_referenced [] [] frompdf annots in
iter
(fun n ->
ignore (Pdf.addobj_given_num topdf (n, Pdf.lookup_obj frompdf n)))
objects_to_copy;
let topage_annots =
match Pdf.lookup_direct frompdf "/Annots" topage.Pdfpage.rest with
| Some (Pdf.Array annots) -> annots
| _ -> []
in
let merged_dict = Pdf.Array (frompage_annots @ topage_annots) in
let topage' =
{topage with Pdfpage.rest =
Pdf.add_dict_entry topage.Pdfpage.rest "/Annots" merged_dict}
in
topdf, topage'
| Some x -> topdf, topage
| None -> topdf, topage
let copy_annotations range frompdf topdf =
let frompdf, topdf = equalise_lengths frompdf topdf in
match Pdf.renumber_pdfs [frompdf; topdf] with
| [frompdf; topdf] ->
let frompdf_pages = Pdfpage.pages_of_pagetree frompdf in
let topdf_pages = Pdfpage.pages_of_pagetree topdf in
let pdf = ref topdf
and pages = ref []
and pnum = ref 1
and frompdf_pages = ref frompdf_pages
and topdf_pages = ref topdf_pages in
(* Go through, updating pdf and collecting new pages. *)
while not (isnull !frompdf_pages) do
let frompdf_page = hd !frompdf_pages
and topdf_page = hd !topdf_pages in
let pdf', page =
if mem !pnum range
then copy_annotations_page !pdf frompdf frompdf_page topdf_page
else !pdf, topdf_page
in
pdf := pdf';
pages =| page;
incr pnum;
frompdf_pages := tl !frompdf_pages;
topdf_pages := tl !topdf_pages
done;
Pdfpage.change_pages true !pdf (rev !pages)
| _ -> assert false
2021-12-31 17:47:28 +01:00
(* Remove annotations *)
2021-12-21 14:44:46 +01:00
let remove_annotations range pdf =
let remove_annotations_page pagenum page =
if mem pagenum range then
let rest' =
Pdf.remove_dict_entry page.Pdfpage.rest "/Annots"
in
{page with Pdfpage.rest = rest'}
else
page
in
2021-12-27 16:20:07 +01:00
Cpdfpage.process_pages (Cpdfutil.ppstub remove_annotations_page) pdf range