Adds read_st

This commit is contained in:
John Whitington 2024-06-13 16:52:23 +01:00
parent 9e3c8f4666
commit bdbc503b27
1 changed files with 38 additions and 1 deletions

View File

@ -12,6 +12,39 @@ let todo () = ()
let not_fully_implemented () = ()
let covered_elsewhere () = ()
(* A simple type for structure trees, for doing structure checks. For now just
the element name, and its children. *)
type st = E of string * st list
let rec read_st_inner pdf stnode =
let s =
match Pdf.lookup_direct pdf "/S" stnode with
| Some (Pdf.Name s) -> s
| _ -> ""
in
match Pdf.lookup_direct pdf "/K" stnode with
| None -> E (s, [])
| Some (Pdf.Dictionary d) -> E (s, [read_st_inner pdf (Pdf.Dictionary d)])
| Some (Pdf.Integer mcd) -> E (s, []) (* marked content identifier, we drop. *)
| Some (Pdf.Array a) -> E (s, read_st_inner_array pdf a)
| _ -> error "malformed st node"
and read_st_inner_array pdf nodes =
map (read_st_inner pdf) nodes
let read_st pdf =
match Pdf.lookup_obj pdf pdf.Pdf.root with
| Pdf.Dictionary d ->
begin match lookup "/StructTreeRoot" d with
| None -> E ("/StructTreeRoot", [])
| Some st -> E ("/StructTreeRoot", [read_st_inner pdf st])
end
| _ -> error "read_st no root"
let string_of_st st =
let rec convert (E (s, ks)) = `Tuple [`String s; `List (map convert ks)] in
Cpdfyojson.Safe.pretty_to_string (convert st)
(* Content marked as Artifact is present inside tagged content. *)
let matterhorn_01_003 pdf = todo ()
@ -145,7 +178,10 @@ let matterhorn_09_004 pdf = todo ()
(* A list-related structure element is used in a way that does not conform to
Table 336 in ISO 32000-1. *)
let matterhorn_09_005 pdf = todo ()
let matterhorn_09_005 pdf =
let st = read_st pdf in
flprint (string_of_st st);
()
(* A TOC-related structure element is used in a way that does not conform to
Table 333 in ISO 32000-1. *)
@ -1002,6 +1038,7 @@ let extract_struct_tree pdf =
end
| _ -> error "extract_struct_tree: no root"
(* Use JSON data to replace objects in a file. Negative objects are new ones,
we make them positive and renumber them not to clash. Everything else must
remain unrenumbered. *)