This commit is contained in:
John Whitington 2023-04-21 14:38:03 +01:00
parent e0192789c0
commit ff1f78bb00
1 changed files with 15 additions and 87 deletions

View File

@ -55,98 +55,28 @@ let rewrite_destinations pdf annot =
let extra = ref [] let extra = ref []
let serial = ref 0
let getserial () =
serial +=1; !serial
let objnum_to_serial_map = ref []
let annotations_json_page pdf page pagenum = let annotations_json_page pdf page pagenum =
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
| Some (Pdf.Array annots) -> | Some (Pdf.Array annots) ->
map option_map
(fun annot -> (fun annot ->
let serial = getserial () in begin match annot with
begin match annot with | Pdf.Indirect objnum ->
| Pdf.Indirect i -> objnum_to_serial_map := (i, serial)::!objnum_to_serial_map let annot = Pdf.direct pdf annot in
| _ -> Printf.eprintf "annotations must be indirect\n" let annot = rewrite_destinations pdf annot in
end; extra := annot::!extra;
let annot = Pdf.direct pdf annot in Some (`List
let annot = rewrite_destinations pdf annot in [`Int pagenum;
extra := annot::!extra; `Int objnum;
`List Cpdfjson.json_of_object ~utf8:true ~clean_strings:true pdf (fun _ -> ())
[`Int pagenum; ~no_stream_data:false ~parse_content:false annot])
`Int serial; | _ -> Printf.eprintf "Warning: annotations must be indirect\n"; None
Cpdfjson.json_of_object ~utf8:true ~clean_strings:true pdf (fun _ -> ()) end)
~no_stream_data:false ~parse_content:false annot])
annots annots
| _ -> [] | _ -> []
(* Rewrite any /Parent entries in /Popup annotations to have annot serial
number, not object number, and all /Popup entries in parent annotations
similarly. *)
let postprocess_json_pdf objnum_to_serial_map pdf obj =
let obj =
match obj with
| Pdf.Dictionary d ->
(* These things seem to be to do with digital signatures, which aren't
going to survive round-tripping of annotations anyway, and drag in
all sorts of extra objects we don't want, so we remove them. *)
let d = remove "/Lock" (remove "/V" d) in Pdf.Dictionary d
| _ -> obj
in
match obj with
| Pdf.Dictionary d ->
let obj =
begin match lookup "/Subtype" d, lookup "/Parent" d with
| Some (Pdf.Name "/Popup"), Some (Pdf.Indirect i) ->
begin match lookup i objnum_to_serial_map with
| Some s -> Pdf.add_dict_entry obj "/Parent" (Pdf.Integer s)
| None -> Printf.eprintf "Warning: Cpdfannot.process_extra_object: could not find serial number\n"; obj
end
| _ ->
(* If not a popup annotation, remove /Parent. It drags in lots of
extra objects (the whole page tree!) with a widget
annotation, and we are unlikely to be able to round-trip them
anyway. One day, if we can match FDF properly, it might be
possible, but not now. *)
Pdf.remove_dict_entry obj "/Parent"
end
in
begin match obj with
| Pdf.Dictionary d ->
begin match lookup "/Popup" d with
| Some (Pdf.Indirect i) ->
begin match lookup i objnum_to_serial_map with
| Some s -> Pdf.add_dict_entry obj "/Popup" (Pdf.Integer s)
| None -> Printf.eprintf "Warning: Cpdfannot.process_extra_object: could not find serial number 2\n"; obj
end
| _ -> obj
end
| _ -> obj
end
| x -> x
let postprocess_json pdf objnum_to_serial_map json =
map
(function
| `List [`Int pagenum; `Int serial; jo] ->
let pdfobj = Cpdfjson.object_of_json jo in
let fixed = postprocess_json_pdf objnum_to_serial_map pdf pdfobj in
`List
[`Int pagenum;
`Int serial;
Cpdfjson.json_of_object
~utf8:true ~clean_strings:true pdf (fun _ -> ())
~no_stream_data:false ~parse_content:false fixed]
| _ -> assert false)
json
let list_annotations_json range pdf = let list_annotations_json range pdf =
extra := []; extra := [];
serial := 0;
objnum_to_serial_map := [];
let module J = Cpdfyojson.Safe in let module J = Cpdfyojson.Safe in
let pages = Pdfpage.pages_of_pagetree pdf in let pages = Pdfpage.pages_of_pagetree pdf in
let pagenums = indx pages in let pagenums = indx pages in
@ -154,15 +84,13 @@ let list_annotations_json range pdf =
let pairs = option_map (fun (p, n) -> if mem n range then Some (p, n) else None) pairs in let pairs = option_map (fun (p, n) -> if mem n range then Some (p, n) else None) pairs in
let pages, pagenums = split pairs in let pages, pagenums = split pairs in
let json = flatten (map2 (annotations_json_page pdf) pages pagenums) in let json = flatten (map2 (annotations_json_page pdf) pages pagenums) in
let json = postprocess_json pdf !objnum_to_serial_map json in
let extra = map (postprocess_json_pdf !objnum_to_serial_map pdf) !extra in
(*Printf.printf "%i extra roots to explore\n" (length extra); (*Printf.printf "%i extra roots to explore\n" (length extra);
iter (fun x -> Printf.eprintf "%s\n\n" (Pdfwrite.string_of_pdf x)) extra;*) iter (fun x -> Printf.eprintf "%s\n\n" (Pdfwrite.string_of_pdf x)) extra;*)
let extra = let extra =
map map
(fun n -> (fun n ->
`List `List
[`Int ~-n; [`Int n;
Cpdfjson.json_of_object ~utf8:true ~clean_strings:true pdf (fun _ -> ()) Cpdfjson.json_of_object ~utf8:true ~clean_strings:true pdf (fun _ -> ())
~no_stream_data:false ~parse_content:false (Pdf.lookup_obj pdf n)]) ~no_stream_data:false ~parse_content:false (Pdf.lookup_obj pdf n)])
(setify (setify
@ -172,7 +100,7 @@ let list_annotations_json range pdf =
let r = Pdf.objects_referenced [] [] pdf x in let r = Pdf.objects_referenced [] [] pdf x in
(*Printf.printf "%i extra for annot %s\n" (length r) (*Printf.printf "%i extra for annot %s\n" (length r)
(Pdfwrite.string_of_pdf x);*) r) (Pdfwrite.string_of_pdf x);*) r)
extra))) !extra)))
in in
let header = let header =
`List `List