diff --git a/cpdfua.ml b/cpdfua.ml index 360f19c..a486ecb 100644 --- a/cpdfua.ml +++ b/cpdfua.ml @@ -1,6 +1,11 @@ open Pdfutil open Cpdferror +(* Implements all Matterhorn checks except for: + + a) Those which require looking deep inside font files; and + b) Those which require reading inside the graphics stream. *) + exception MatterhornError of Cpdfyojson.Safe.t exception MatterhornUnimplemented @@ -32,7 +37,7 @@ let rec read_st_inner pdf stnode = and read_st_inner_array pdf nodes = map (read_st_inner pdf) nodes -let read_st pdf = +let read_st_basic pdf = match Pdf.lookup_obj pdf pdf.Pdf.root with | Pdf.Dictionary d -> begin match lookup "/StructTreeRoot" d with @@ -41,6 +46,29 @@ let read_st pdf = end | _ -> error "read_st no root" +(* Rewrite a tree according to a rolemap. FIXME Better do the non-circularity + check when reading a rolemap, or this won't terminate... *) +let rec rewrite_st rolemap (E (n, cs)) = + let rec rewrite_st_name rolemap n = + match List.assoc_opt n rolemap with + | Some n' -> rewrite_st_name rolemap n' + | None -> n + in + E (rewrite_st_name rolemap n, map (rewrite_st rolemap) cs) + +let read_rolemap pdf = function + | Pdf.Dictionary d -> + option_map (function (k, Pdf.Name v) -> Some (k, v) | _ -> None) d + | _ -> error "read_rolemap: not a rolemap" + +let read_st pdf = + let rolemap = + match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/StructTreeRoot"; "/RoleMap"] with + | Some rm -> read_rolemap pdf rm + | None -> [] + in + rewrite_st rolemap (read_st_basic pdf) + let rec st_mem p = function | E (s, _) when p s -> true | E (_, cs) -> List.exists (st_mem p) cs @@ -77,11 +105,6 @@ let standard_structure_types = "/Art"; "/BlockQuote"; "/TOC"; "/TOCI"; "/Index"; "/Private"; "/Quote"; "/Note"; "/Reference"; "/Code"] -let read_rolemap pdf = function - | Pdf.Dictionary d -> - option_map (function (k, Pdf.Name v) -> Some (k, v) | _ -> None) d - | _ -> error "read_rolemap: not a rolemap" - let rec follow_standard rm n = match List.assoc_opt n rm with | None -> raise Exit