First example requiring attributes
This commit is contained in:
parent
dcb4dfdadd
commit
38528dcbd2
57
cpdfua.ml
57
cpdfua.ml
|
@ -21,6 +21,30 @@ let covered_elsewhere () = ()
|
||||||
the element name, and its children. *)
|
the element name, and its children. *)
|
||||||
type st = E of string * st list
|
type st = E of string * st list
|
||||||
|
|
||||||
|
(* Now one which contains the attributes too. *)
|
||||||
|
type st2 = E2 of string * string list * st2 list
|
||||||
|
|
||||||
|
(* FIXME What about /C? *)
|
||||||
|
(* FIXME What about class map? *)
|
||||||
|
(* Read attributes. *)
|
||||||
|
let read_a pdf stnode =
|
||||||
|
let rec read_single d =
|
||||||
|
match d with
|
||||||
|
| Pdf.Dictionary d -> map fst d
|
||||||
|
| Pdf.Stream s -> read_single (fst !s)
|
||||||
|
| _ -> error "read_single"
|
||||||
|
in
|
||||||
|
match Pdf.lookup_direct pdf "/A" stnode with
|
||||||
|
| Some (Pdf.Array attrs) ->
|
||||||
|
let attrs = keep (function Pdf.Integer _ -> false | _ -> true) attrs in
|
||||||
|
flatten (map read_single attrs)
|
||||||
|
| Some (Pdf.Dictionary d) ->
|
||||||
|
read_single (Pdf.Dictionary d)
|
||||||
|
| Some (Pdf.Stream s) ->
|
||||||
|
read_single (Pdf.Stream s)
|
||||||
|
| Some _ -> []
|
||||||
|
| None -> []
|
||||||
|
|
||||||
let rec read_st_inner pdf stnode =
|
let rec read_st_inner pdf stnode =
|
||||||
let s =
|
let s =
|
||||||
match Pdf.lookup_direct pdf "/S" stnode with
|
match Pdf.lookup_direct pdf "/S" stnode with
|
||||||
|
@ -28,10 +52,10 @@ let rec read_st_inner pdf stnode =
|
||||||
| _ -> ""
|
| _ -> ""
|
||||||
in
|
in
|
||||||
match Pdf.lookup_direct pdf "/K" stnode with
|
match Pdf.lookup_direct pdf "/K" stnode with
|
||||||
| None -> E (s, [])
|
| None -> E2 (s, read_a pdf stnode, [])
|
||||||
| Some (Pdf.Dictionary d) -> E (s, [read_st_inner pdf (Pdf.Dictionary d)])
|
| Some (Pdf.Dictionary d) -> E2 (s, read_a pdf stnode, [read_st_inner pdf (Pdf.Dictionary d)])
|
||||||
| Some (Pdf.Integer mcd) -> E (s, []) (* marked content identifier, we drop. *)
|
| Some (Pdf.Integer mcd) -> E2 (s, read_a pdf stnode, []) (* marked content identifier, we drop. *)
|
||||||
| Some (Pdf.Array a) -> E (s, read_st_inner_array pdf a)
|
| Some (Pdf.Array a) -> E2 (s, read_a pdf stnode, read_st_inner_array pdf a)
|
||||||
| _ -> error "malformed st node"
|
| _ -> error "malformed st node"
|
||||||
|
|
||||||
and read_st_inner_array pdf nodes =
|
and read_st_inner_array pdf nodes =
|
||||||
|
@ -41,27 +65,27 @@ let read_st_basic pdf =
|
||||||
match Pdf.lookup_obj pdf pdf.Pdf.root with
|
match Pdf.lookup_obj pdf pdf.Pdf.root with
|
||||||
| Pdf.Dictionary d ->
|
| Pdf.Dictionary d ->
|
||||||
begin match lookup "/StructTreeRoot" d with
|
begin match lookup "/StructTreeRoot" d with
|
||||||
| None -> E ("/StructTreeRoot", [])
|
| None -> E2 ("/StructTreeRoot", [], [])
|
||||||
| Some st -> E ("/StructTreeRoot", [read_st_inner pdf st])
|
| Some st -> E2 ("/StructTreeRoot", [], [read_st_inner pdf st])
|
||||||
end
|
end
|
||||||
| _ -> error "read_st no root"
|
| _ -> error "read_st no root"
|
||||||
|
|
||||||
(* Rewrite a tree according to a rolemap. FIXME Better do the non-circularity
|
(* Rewrite a tree according to a rolemap. FIXME Better do the non-circularity
|
||||||
check when reading a rolemap, or this won't terminate... *)
|
check when reading a rolemap, or this won't terminate... *)
|
||||||
let rec rewrite_st rolemap (E (n, cs)) =
|
let rec rewrite_st rolemap (E2 (n, attr, cs)) =
|
||||||
let rec rewrite_st_name rolemap n =
|
let rec rewrite_st_name rolemap n =
|
||||||
match List.assoc_opt n rolemap with
|
match List.assoc_opt n rolemap with
|
||||||
| Some n' -> rewrite_st_name rolemap n'
|
| Some n' -> rewrite_st_name rolemap n'
|
||||||
| None -> n
|
| None -> n
|
||||||
in
|
in
|
||||||
E (rewrite_st_name rolemap n, map (rewrite_st rolemap) cs)
|
E2 (rewrite_st_name rolemap n, attr, map (rewrite_st rolemap) cs)
|
||||||
|
|
||||||
let read_rolemap pdf = function
|
let read_rolemap pdf = function
|
||||||
| Pdf.Dictionary d ->
|
| Pdf.Dictionary d ->
|
||||||
option_map (function (k, Pdf.Name v) -> Some (k, v) | _ -> None) d
|
option_map (function (k, Pdf.Name v) -> Some (k, v) | _ -> None) d
|
||||||
| _ -> error "read_rolemap: not a rolemap"
|
| _ -> error "read_rolemap: not a rolemap"
|
||||||
|
|
||||||
let read_st pdf =
|
let read_st2 pdf =
|
||||||
let rolemap =
|
let rolemap =
|
||||||
match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/StructTreeRoot"; "/RoleMap"] with
|
match Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/StructTreeRoot"; "/RoleMap"] with
|
||||||
| Some rm -> read_rolemap pdf rm
|
| Some rm -> read_rolemap pdf rm
|
||||||
|
@ -69,6 +93,11 @@ let read_st pdf =
|
||||||
in
|
in
|
||||||
rewrite_st rolemap (read_st_basic pdf)
|
rewrite_st rolemap (read_st_basic pdf)
|
||||||
|
|
||||||
|
let rec st_of_st2 = function E2 (a, _, cs) -> E (a, map st_of_st2 cs)
|
||||||
|
|
||||||
|
let read_st pdf =
|
||||||
|
st_of_st2 (read_st2 pdf)
|
||||||
|
|
||||||
let rec st_mem p = function
|
let rec st_mem p = function
|
||||||
| E (s, _) when p s -> true
|
| E (s, _) when p s -> true
|
||||||
| E (_, cs) -> List.exists (st_mem p) cs
|
| E (_, cs) -> List.exists (st_mem p) cs
|
||||||
|
@ -408,7 +437,15 @@ let matterhorn_14_007 pdf =
|
||||||
|
|
||||||
(* In a table not organized with Headers attributes and IDs, a <TH> cell does
|
(* In a table not organized with Headers attributes and IDs, a <TH> cell does
|
||||||
not contain a Scope attribute. *)
|
not contain a Scope attribute. *)
|
||||||
let matterhorn_15_003 pdf = todo ()
|
let matterhorn_15_003 pdf =
|
||||||
|
(* For now, we complain any time a <TH> does not have a scope. The 2008 PDF
|
||||||
|
spec, 2014 PDF/UA spec and Matterhorn protocol combined do not quite give
|
||||||
|
enough information to know what is required. To be returned to. *)
|
||||||
|
let rec check_th = function
|
||||||
|
| E2 ("/TH", attr, _) -> if not (List.mem "/Scope" attr) then merror ()
|
||||||
|
| E2 (_, _, cs) -> iter check_th cs
|
||||||
|
in
|
||||||
|
check_th (read_st2 pdf)
|
||||||
|
|
||||||
(* <Formula> tag is missing an Alt attribute. *)
|
(* <Formula> tag is missing an Alt attribute. *)
|
||||||
let matterhorn_17_002 pdf = todo ()
|
let matterhorn_17_002 pdf = todo ()
|
||||||
|
|
Loading…
Reference in New Issue