From 59178af85c1ed995dd07c644f2a5fe2da4ed9a1c Mon Sep 17 00:00:00 2001
From: John Whitington
Date: Mon, 3 Jun 2024 16:29:51 +0100
Subject: [PATCH] First plausible struct tree extraction

---
Reviewer notes (this area is not part of the commit message):

What the hunk does: replaces the stub extract_struct_tree, which returned
the JSON value `String "", with a first real implementation.

  * The object at pdf.Pdf.root is fetched with Pdf.lookup_obj. If it is
    not a Pdf.Dictionary, the function calls
    error "extract_struct_tree: no root".
  * If the root dictionary has no "/StructTreeRoot" entry, the result is
    the empty JSON list `List [].
  * Otherwise the object numbers returned by
    Pdf.objects_referenced ["/Pg"] [] pdf x are each looked up and
    converted with Cpdfjson.json_of_object (~utf8:true,
    ~no_stream_data:false, ~parse_content:false, and a callback that
    ignores its argument). The result is a `List holding one
    `Tuple [`Int objnum; json] per object number.

NOTE(review): the ["/Pg"] argument presumably stops the traversal from
following /Pg entries, so that page objects are not pulled into the
output. Confirm against the Pdf.objects_referenced documentation.

NOTE(review): `Tuple is a different constructor from `List. Confirm that
the code consuming this value serialises `Tuple the way callers expect.

NOTE(review): the line breaks and indentation below were reconstructed
from a copy of this patch whose whitespace had been collapsed. The
context lines may not match cpdfua.ml byte-for-byte. Apply with
whitespace tolerance or regenerate the patch from the commit.

 cpdfua.ml | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/cpdfua.ml b/cpdfua.ml
index ad571d9..f5c571d 100644
--- a/cpdfua.ml
+++ b/cpdfua.ml
@@ -331,4 +331,20 @@ let mark pdf =
       pdf.Pdf.root <- pdf3.Pdf.root
   | None -> assert false
 
-let extract_struct_tree pdf = `String ""
+let extract_struct_tree pdf =
+  match Pdf.lookup_obj pdf pdf.Pdf.root with
+  | Pdf.Dictionary d ->
+      begin match lookup "/StructTreeRoot" d with
+      | None -> `List []
+      | Some x ->
+          let objs = Pdf.objects_referenced ["/Pg"] [] pdf x in
+            `List
+              (map
+                 (fun objnum ->
+                    let jsonobj =
+                      Cpdfjson.json_of_object ~utf8:true ~no_stream_data:false ~parse_content:false pdf (function _ -> ()) (Pdf.lookup_obj pdf objnum)
+                    in
+                      `Tuple [`Int objnum; jsonobj])
+                 objs)
+      end
+  | _ -> error "extract_struct_tree: no root"