2021-12-21 14:44:46 +01:00
open Pdfutil
2023-04-18 17:39:51 +02:00
open Cpdferror
2021-12-21 14:44:46 +01:00
2021-12-31 17:47:28 +01:00
(* List annotations *)
2021-12-21 14:44:46 +01:00
let get_annotation_string encoding pdf annot =
match Pdf . lookup_direct pdf " /Contents " annot with
| Some ( Pdf . String s ) -> Cpdfmetadata . encode_output encoding s
| _ -> " "
let print_annotation encoding pdf num s =
let s = get_annotation_string encoding pdf s in
match s with
| " " -> ()
| s ->
flprint ( Printf . sprintf " Page %d: " num ) ;
flprint s ;
flprint " \n "
let list_page_annotations encoding pdf num page =
match Pdf . lookup_direct pdf " /Annots " page . Pdfpage . rest with
| Some ( Pdf . Array annots ) ->
iter ( print_annotation encoding pdf num ) ( map ( Pdf . direct pdf ) annots )
| _ -> ()
2021-12-31 20:28:42 +01:00
let rewrite_destination calculate_pagenumber d =
match d with
| Pdf . Array ( Pdf . Indirect i :: r ) ->
2023-01-13 08:40:37 +01:00
Pdf . Array ( Pdf . Integer ( calculate_pagenumber ( Pdfdest . Fit ( Pdfdest . PageObject i ) ) ) :: r )
2021-12-31 20:28:42 +01:00
| x -> x
let rewrite_destinations pdf annot =
let refnums = Pdf . page_reference_numbers pdf in
let fastrefnums = hashtable_of_dictionary ( combine refnums ( indx refnums ) ) in
let calculate_pagenumber = Pdfpage . pagenumber_of_target ~ fastrefnums pdf in
2023-01-13 08:40:37 +01:00
(* Deal with /P in annotation *)
let annot =
match Pdf . indirect_number pdf " /P " annot with
| Some i -> Pdf . add_dict_entry annot " /P " ( Pdf . Integer ( calculate_pagenumber ( Pdfdest . Fit ( Pdfdest . PageObject i ) ) ) )
| None -> annot
in
2021-12-31 20:28:42 +01:00
(* Deal with /Dest in annotation *)
match Pdf . lookup_direct pdf " /Dest " annot with
| Some d -> Pdf . add_dict_entry annot " /Dest " ( rewrite_destination calculate_pagenumber d )
| None ->
(* Deal with /A --> /D dest when /A --> /S = /GoTo *)
match Pdf . lookup_direct pdf " /A " annot with
| Some action ->
begin match Pdf . lookup_direct pdf " /D " action with
| Some d ->
Pdf . add_dict_entry
annot " /A " ( Pdf . add_dict_entry action " /D " ( rewrite_destination calculate_pagenumber d ) )
| None -> annot
end
| None -> annot
2023-01-13 06:51:43 +01:00
let extra = ref []
2023-01-14 05:59:54 +01:00
let serial = ref ~ - 1
let getserial () =
serial + = 1 ; ! serial
2023-01-16 03:06:59 +01:00
let objnum_to_serial_map = ref []
2021-12-21 14:44:46 +01:00
let annotations_json_page pdf page pagenum =
match Pdf . lookup_direct pdf " /Annots " page . Pdfpage . rest with
| Some ( Pdf . Array annots ) ->
map
( fun annot ->
2023-01-16 03:06:59 +01:00
let serial = getserial () in
begin match annot with
| Pdf . Indirect i -> objnum_to_serial_map := ( i , serial ) :: ! objnum_to_serial_map
| _ -> Printf . eprintf " annotations must be indirect \n "
end ;
let annot = Pdf . direct pdf annot in
let annot = rewrite_destinations pdf annot in
2023-01-16 03:19:48 +01:00
extra := annot :: ! extra ;
2023-01-16 08:03:34 +01:00
` List [ ` Int pagenum ; ` Int serial ; Cpdfjson . json_of_object ~ utf8 : true ~ clean_strings : true pdf ( fun _ -> () ) ~ no_stream_data : false ~ parse_content : false annot ] )
2023-01-16 03:06:59 +01:00
annots
2021-12-21 14:44:46 +01:00
| _ -> []
2023-01-16 04:37:25 +01:00
(* Rewrite any /Parent entries in /Popup annotations to have annot serial number, not object number, and all /Popup entries in parent annotations similarly. *)
2023-01-16 03:06:59 +01:00
let postprocess_json_pdf objnum_to_serial_map pdf obj =
2023-01-16 06:35:40 +01:00
let obj =
match obj with
| Pdf . Dictionary d ->
(* These things seem to be to do with digital signatures, which aren't
going to survive round - tripping of annotations anyway , and drag in
all sorts of extra objects we don't want , so we remove them . * )
let d = remove " /Lock " ( remove " /V " d ) in Pdf . Dictionary d
| _ -> obj
in
2023-01-16 03:06:59 +01:00
match obj with
| Pdf . Dictionary d ->
2023-01-16 04:37:25 +01:00
let obj =
2023-01-16 06:35:40 +01:00
begin match lookup " /Subtype " d , lookup " /Parent " d with
| Some ( Pdf . Name " /Popup " ) , Some ( Pdf . Indirect i ) ->
begin match lookup i objnum_to_serial_map with
| Some s -> Pdf . add_dict_entry obj " /Parent " ( Pdf . Integer s )
| None -> Printf . eprintf " Warning: Cpdfannot.process_extra_object: could not find serial number \n " ; obj
end
2023-02-20 20:39:42 +01:00
| _ ->
(* If not a popup annotation, remove /Parent. It drags in lots of
2023-01-16 06:35:40 +01:00
extra objects ( the whole page tree ! ) with a widget
annotation , and we are unlikely to be able to round - trip them
anyway . One day , if we can match FDF properly , it might be
possible , but not now . * )
2023-02-20 20:39:42 +01:00
Pdf . remove_dict_entry obj " /Parent "
2023-01-16 06:35:40 +01:00
end
2023-01-16 04:37:25 +01:00
in
begin match obj with
| Pdf . Dictionary d ->
begin match lookup " /Popup " d with
| Some ( Pdf . Indirect i ) ->
begin match lookup i objnum_to_serial_map with
| Some s -> Pdf . add_dict_entry obj " /Popup " ( Pdf . Integer s )
| None -> Printf . eprintf " Warning: Cpdfannot.process_extra_object: could not find serial number 2 \n " ; obj
end
| _ -> obj
end
| _ -> obj
end
2023-01-16 03:06:59 +01:00
| x -> x
let postprocess_json pdf objnum_to_serial_map json =
map
( function
| ` List [ ` Int pagenum ; ` Int serial ; jo ] ->
2023-02-17 16:30:39 +01:00
let pdfobj = Cpdfjson . object_of_json jo in
2023-01-16 03:06:59 +01:00
let fixed = postprocess_json_pdf objnum_to_serial_map pdf pdfobj in
2023-01-16 08:03:34 +01:00
` List [ ` Int pagenum ; ` Int serial ; Cpdfjson . json_of_object ~ utf8 : true ~ clean_strings : true pdf ( fun _ -> () ) ~ no_stream_data : false ~ parse_content : false fixed ]
2023-01-16 03:06:59 +01:00
| _ -> assert false )
json
2023-01-14 05:59:54 +01:00
2023-01-11 07:55:50 +01:00
let list_annotations_json range pdf =
2023-01-13 06:51:43 +01:00
extra := [] ;
2023-01-14 05:59:54 +01:00
serial := ~ - 1 ;
2023-01-16 03:06:59 +01:00
objnum_to_serial_map := [] ;
2021-12-21 14:44:46 +01:00
let module J = Cpdfyojson . Safe in
let pages = Pdfpage . pages_of_pagetree pdf in
let pagenums = indx pages in
2023-01-11 07:55:50 +01:00
let pairs = combine pages pagenums in
let pairs = option_map ( fun ( p , n ) -> if mem n range then Some ( p , n ) else None ) pairs in
let pages , pagenums = split pairs in
2023-01-14 05:59:54 +01:00
let json = flatten ( map2 ( annotations_json_page pdf ) pages pagenums ) in
2023-01-16 03:06:59 +01:00
let json = postprocess_json pdf ! objnum_to_serial_map json in
2023-01-16 04:37:25 +01:00
let extra = map ( postprocess_json_pdf ! objnum_to_serial_map pdf ) ! extra in
(* Printf.printf "%i extra roots to explore\n" ( length extra ) ;
iter ( fun x -> Printf . eprintf " %s \n \n " ( Pdfwrite . string_of_pdf x ) ) extra ; * )
2023-01-14 05:59:54 +01:00
let extra =
map
( fun n ->
2023-01-19 04:34:07 +01:00
` List
[ ` Int ~ - n ;
Cpdfjson . json_of_object ~ utf8 : true ~ clean_strings : true pdf ( fun _ -> () )
~ no_stream_data : false ~ parse_content : false ( Pdf . lookup_obj pdf n ) ] )
( setify
( flatten
( map
( fun x ->
let r = Pdf . objects_referenced [] [] pdf x in
(* Printf.printf "%i extra for annot %s\n" ( length r )
( Pdfwrite . string_of_pdf x ) ; * ) r )
extra ) ) )
2023-01-14 05:59:54 +01:00
in
2023-01-16 07:07:11 +01:00
let header =
` List
[ ` Int 0 ;
2023-01-19 04:34:07 +01:00
Cpdfjson . json_of_object ~ utf8 : true ~ clean_strings : true pdf ( fun _ -> () )
~ no_stream_data : false ~ parse_content : false
2023-01-16 07:07:11 +01:00
( Pdf . Dictionary [ " /CPDFJSONannotformatversion " , Pdf . Integer 1 ] ) ]
in
let json = ` List ( [ header ] @ json @ extra ) in
2021-12-21 14:44:46 +01:00
J . pretty_to_channel stdout json
2023-01-11 07:55:50 +01:00
let list_annotations ~ json range encoding pdf =
2021-12-21 14:44:46 +01:00
if json
2023-01-11 07:55:50 +01:00
then list_annotations_json range pdf
2021-12-21 14:44:46 +01:00
else Cpdfpage . iter_pages ( list_page_annotations encoding pdf ) pdf range
2021-12-31 17:47:28 +01:00
(* Return annotations *)
2021-12-21 14:44:46 +01:00
let get_annotations encoding pdf =
let pages = Pdfpage . pages_of_pagetree pdf in
flatten
( map2
( fun page pagenumber ->
match Pdf . lookup_direct pdf " /Annots " page . Pdfpage . rest with
| Some ( Pdf . Array annots ) ->
let strings =
map ( get_annotation_string encoding pdf ) ( map ( Pdf . direct pdf ) annots )
in
combine ( many pagenumber ( length strings ) ) strings
| _ -> [] )
pages
( ilist 1 ( length pages ) ) )
2022-01-16 17:30:36 +01:00
let get_annotations_json pdf =
let module J = Cpdfyojson . Safe in
let pages = Pdfpage . pages_of_pagetree pdf in
let pagenums = indx pages in
let json = ` List ( flatten ( map2 ( annotations_json_page pdf ) pages pagenums ) ) in
Pdfio . bytes_of_string ( J . to_string json )
2023-04-18 17:39:51 +02:00
(* * Set annotations from JSON, keeping any existing ones. *)
let set_annotations_json pdf i =
let module J = Cpdfyojson . Safe in
let content = Pdfio . string_of_bytes ( Pdfio . bytes_of_input i 0 i . Pdfio . in_channel_length ) in
let json = J . from_string content in
(* Find largest negative objnumber. Then add number of annot objects. *)
match json with
| ` List entries ->
2023-04-18 18:09:21 +02:00
(* Renumber the PDF so everything has bigger object numbers than that. *)
2023-04-18 17:39:51 +02:00
let maxobjnum =
2023-04-18 18:09:21 +02:00
fold_left
max
min_int
( map ( fun e -> match e with ` List ( ` Int i :: _ ) -> abs i | _ -> error " Bad annots entry " ) entries )
2023-04-18 17:39:51 +02:00
in
2023-04-18 18:09:21 +02:00
let pdf_objnums =
map fst ( list_of_hashtbl pdf . Pdf . objects . Pdf . pdfobjects )
in
let change_table =
hashtable_of_dictionary ( map2 ( fun f t -> ( f , t ) ) pdf_objnums ( ilist ( maxobjnum + 1 ) ( maxobjnum + length pdf_objnums ) ) )
in
let pdf = Pdf . renumber change_table pdf in
2023-04-18 17:39:51 +02:00
()
| _ -> error " Bad Annotations JSON file "
2023-01-13 06:51:43 +01:00
2021-12-21 14:44:46 +01:00
(* Equalise the page lengths of two PDFs by chopping or extending the first one.
* )
let equalise_lengths a b =
let a' =
if Pdfpage . endpage a < Pdfpage . endpage b then
Pdfpage . change_pages false a
( Pdfpage . pages_of_pagetree a @
many ( Pdfpage . blankpage Pdfpaper . a4 ) ( Pdfpage . endpage b - Pdfpage . endpage a ) )
else if Pdfpage . endpage a > Pdfpage . endpage b then
Pdfpage . change_pages false a
( take ( Pdfpage . pages_of_pagetree a ) ( Pdfpage . endpage b ) )
else a
in
a' , b
(* Copy annotations *)
(* FIXME: Why does this chop the files to the same length? Should be able to
apply annotations from a longer file to a shorter ? * )
(* Rewrite any annotation destinations to point to pages in the
destination file . This prevents pages being copied , and ensures the links are
correct Any Indirect link inside a / Dest is rewritten if in the table . If not
inside a / Dest , nothing is rewritten . * )
let rec renumber_in_dest table indest = function
Pdf . Indirect i ->
begin
try Pdf . Indirect ( Hashtbl . find table i ) with _ -> Pdf . Indirect i
end
| Pdf . Array a ->
Pdf . recurse_array ( renumber_in_dest table indest ) a
| Pdf . Dictionary d ->
Pdf . Dictionary
( map
( function
( " /Dest " , v ) -> ( " /Dest " , renumber_in_dest table true v )
| ( k , v ) -> ( k , renumber_in_dest table indest v ) )
d )
| x -> x
let renumber_in_object pdf objnum table =
Pdf . addobj_given_num
pdf ( objnum , ( renumber_in_dest table false ( Pdf . lookup_obj pdf objnum ) ) )
let copy_annotations_page topdf frompdf frompage topage =
match Pdf . lookup_direct frompdf " /Annots " frompage . Pdfpage . rest with
Some ( Pdf . Array frompage_annots as annots ) ->
let table =
hashtable_of_dictionary
( combine
( Pdf . page_reference_numbers frompdf )
( Pdf . page_reference_numbers topdf ) )
in
iter
( function
(* FIXME: We assume they are indirects. Must also do direct, though rare. *)
Pdf . Indirect x ->
(* Printf.printf "Copying annotation %s which is\n%s\n"
( Pdfwrite . string_of_pdf ( Pdf . Indirect x ) )
( Pdfwrite . string_of_pdf ( Pdf . direct frompdf ( Pdf . Indirect
x ) ) ) ; * )
renumber_in_object frompdf x table
| _ -> () )
frompage_annots ;
let objects_to_copy = Pdf . objects_referenced [] [] frompdf annots in
iter
( fun n ->
ignore ( Pdf . addobj_given_num topdf ( n , Pdf . lookup_obj frompdf n ) ) )
objects_to_copy ;
let topage_annots =
match Pdf . lookup_direct frompdf " /Annots " topage . Pdfpage . rest with
| Some ( Pdf . Array annots ) -> annots
| _ -> []
in
let merged_dict = Pdf . Array ( frompage_annots @ topage_annots ) in
let topage' =
{ topage with Pdfpage . rest =
Pdf . add_dict_entry topage . Pdfpage . rest " /Annots " merged_dict }
in
topdf , topage'
| Some x -> topdf , topage
| None -> topdf , topage
let copy_annotations range frompdf topdf =
let frompdf , topdf = equalise_lengths frompdf topdf in
match Pdf . renumber_pdfs [ frompdf ; topdf ] with
| [ frompdf ; topdf ] ->
let frompdf_pages = Pdfpage . pages_of_pagetree frompdf in
let topdf_pages = Pdfpage . pages_of_pagetree topdf in
let pdf = ref topdf
and pages = ref []
and pnum = ref 1
and frompdf_pages = ref frompdf_pages
and topdf_pages = ref topdf_pages in
(* Go through, updating pdf and collecting new pages. *)
while not ( isnull ! frompdf_pages ) do
let frompdf_page = hd ! frompdf_pages
and topdf_page = hd ! topdf_pages in
let pdf' , page =
if mem ! pnum range
then copy_annotations_page ! pdf frompdf frompdf_page topdf_page
else ! pdf , topdf_page
in
pdf := pdf' ;
pages = | page ;
incr pnum ;
frompdf_pages := tl ! frompdf_pages ;
topdf_pages := tl ! topdf_pages
done ;
Pdfpage . change_pages true ! pdf ( rev ! pages )
| _ -> assert false
2021-12-31 17:47:28 +01:00
(* Remove annotations *)
2021-12-21 14:44:46 +01:00
let remove_annotations range pdf =
let remove_annotations_page pagenum page =
if mem pagenum range then
let rest' =
Pdf . remove_dict_entry page . Pdfpage . rest " /Annots "
in
{ page with Pdfpage . rest = rest' }
else
page
in
2023-03-23 20:24:50 +01:00
Cpdfpage . process_pages ( Pdfpage . ppstub remove_annotations_page ) pdf range