First example requiring attributes

This commit is contained in:
John Whitington 2024-06-19 15:19:32 +01:00
parent dcb4dfdadd
commit 38528dcbd2
1 changed files with 47 additions and 10 deletions

View File

@ -21,6 +21,30 @@ let covered_elsewhere () = ()
the element name, and its children. *) the element name, and its children. *)
type st = E of string * st list type st = E of string * st list
(* Now one which contains the attributes too. *)
type st2 = E2 of string * string list * st2 list
(* FIXME What about /C? *)
(* FIXME What about class map? *)
(* Read attributes. *)
let read_a pdf stnode =
let rec read_single d =
match d with
| Pdf.Dictionary d -> map fst d
| Pdf.Stream s -> read_single (fst !s)
| _ -> error "read_single"
in
match Pdf.lookup_direct pdf "/A" stnode with
| Some (Pdf.Array attrs) ->
let attrs = keep (function Pdf.Integer _ -> false | _ -> true) attrs in
flatten (map read_single attrs)
| Some (Pdf.Dictionary d) ->
read_single (Pdf.Dictionary d)
| Some (Pdf.Stream s) ->
read_single (Pdf.Stream s)
| Some _ -> []
| None -> []
let rec read_st_inner pdf stnode = let rec read_st_inner pdf stnode =
let s = let s =
match Pdf.lookup_direct pdf "/S" stnode with match Pdf.lookup_direct pdf "/S" stnode with
@ -28,10 +52,10 @@ let rec read_st_inner pdf stnode =
| _ -> "" | _ -> ""
in in
match Pdf.lookup_direct pdf "/K" stnode with match Pdf.lookup_direct pdf "/K" stnode with
| None -> E (s, []) | None -> E2 (s, read_a pdf stnode, [])
| Some (Pdf.Dictionary d) -> E (s, [read_st_inner pdf (Pdf.Dictionary d)]) | Some (Pdf.Dictionary d) -> E2 (s, read_a pdf stnode, [read_st_inner pdf (Pdf.Dictionary d)])
| Some (Pdf.Integer mcd) -> E (s, []) (* marked content identifier, we drop. *) | Some (Pdf.Integer mcd) -> E2 (s, read_a pdf stnode, []) (* marked content identifier, we drop. *)
| Some (Pdf.Array a) -> E (s, read_st_inner_array pdf a) | Some (Pdf.Array a) -> E2 (s, read_a pdf stnode, read_st_inner_array pdf a)
| _ -> error "malformed st node" | _ -> error "malformed st node"
and read_st_inner_array pdf nodes = and read_st_inner_array pdf nodes =
@ -41,27 +65,27 @@ let read_st_basic pdf =
match Pdf.lookup_obj pdf pdf.Pdf.root with match Pdf.lookup_obj pdf pdf.Pdf.root with
| Pdf.Dictionary d -> | Pdf.Dictionary d ->
begin match lookup "/StructTreeRoot" d with begin match lookup "/StructTreeRoot" d with
| None -> E ("/StructTreeRoot", []) | None -> E2 ("/StructTreeRoot", [], [])
| Some st -> E ("/StructTreeRoot", [read_st_inner pdf st]) | Some st -> E2 ("/StructTreeRoot", [], [read_st_inner pdf st])
end end
| _ -> error "read_st no root" | _ -> error "read_st no root"
(* Rewrite a tree according to a rolemap. FIXME Better do the non-circularity (* Rewrite a tree according to a rolemap. FIXME Better do the non-circularity
check when reading a rolemap, or this won't terminate... *) check when reading a rolemap, or this won't terminate... *)
let rec rewrite_st rolemap (E (n, cs)) = let rec rewrite_st rolemap (E2 (n, attr, cs)) =
let rec rewrite_st_name rolemap n = let rec rewrite_st_name rolemap n =
match List.assoc_opt n rolemap with match List.assoc_opt n rolemap with
| Some n' -> rewrite_st_name rolemap n' | Some n' -> rewrite_st_name rolemap n'
| None -> n | None -> n
in in
E (rewrite_st_name rolemap n, map (rewrite_st rolemap) cs) E2 (rewrite_st_name rolemap n, attr, map (rewrite_st rolemap) cs)
let read_rolemap pdf = function let read_rolemap pdf = function
| Pdf.Dictionary d -> | Pdf.Dictionary d ->
option_map (function (k, Pdf.Name v) -> Some (k, v) | _ -> None) d option_map (function (k, Pdf.Name v) -> Some (k, v) | _ -> None) d
| _ -> error "read_rolemap: not a rolemap" | _ -> error "read_rolemap: not a rolemap"
let read_st pdf = let read_st2 pdf =
let rolemap = let rolemap =
match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/StructTreeRoot"; "/RoleMap"] with match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/StructTreeRoot"; "/RoleMap"] with
| Some rm -> read_rolemap pdf rm | Some rm -> read_rolemap pdf rm
@ -69,6 +93,11 @@ let read_st pdf =
in in
rewrite_st rolemap (read_st_basic pdf) rewrite_st rolemap (read_st_basic pdf)
let rec st_of_st2 = function E2 (a, _, cs) -> E (a, map st_of_st2 cs)
let read_st pdf =
st_of_st2 (read_st2 pdf)
let rec st_mem p = function let rec st_mem p = function
| E (s, _) when p s -> true | E (s, _) when p s -> true
| E (_, cs) -> List.exists (st_mem p) cs | E (_, cs) -> List.exists (st_mem p) cs
@ -408,7 +437,15 @@ let matterhorn_14_007 pdf =
(* In a table not organized with Headers attributes and IDs, a <TH> cell does (* In a table not organized with Headers attributes and IDs, a <TH> cell does
not contain a Scope attribute. *) not contain a Scope attribute. *)
let matterhorn_15_003 pdf = todo () let matterhorn_15_003 pdf =
(* For now, we complain any time a <TH> does not have a scope. The 2008 PDF
spec, 2014 PDF/UA spec and Matterhorn protocol combined do not quite give
enough information to know what is required. To be returned to. *)
let rec check_th = function
| E2 ("/TH", attr, _) -> if not (List.mem "/Scope" attr) then merror ()
| E2 (_, _, cs) -> iter check_th cs
in
check_th (read_st2 pdf)
(* <Formula> tag is missing an Alt attribute. *) (* <Formula> tag is missing an Alt attribute. *)
let matterhorn_17_002 pdf = todo () let matterhorn_17_002 pdf = todo ()