/Reconcile /C and /A

This commit is contained in:
John Whitington 2024-07-02 17:45:35 +01:00
parent facb9d3c75
commit cda49a1cd8
1 changed file with 47 additions and 45 deletions

View File

@ -27,49 +27,51 @@ let print_children (E (n, cs)) =
iter (fun (E (n, _)) -> Printf.printf "%S " n) cs; iter (fun (E (n, _)) -> Printf.printf "%S " n) cs;
flprint "\n" flprint "\n"
(* FIXME What about /C? *)
(* FIXME What about class map? *)
(* Read attributes. *) (* Read attributes. *)
let read_a pdf stnode = let rec read_single d =
let rec read_single d = match d with
match d with | Pdf.Dictionary d -> map fst d
| Pdf.Dictionary d -> map fst d | Pdf.Stream s -> read_single (fst !s)
| Pdf.Stream s -> read_single (fst !s) | _ -> error "read_single"
| _ -> error "read_single"
(* [read_a pdf n stnode] returns the attribute keys found under entry [n] of
   the structure tree node [stnode]; it is called with "/A" and "/C".

   The entry may be a single dictionary or stream, or an array of them. Any
   other value, or a missing entry, yields the empty list. *)
let read_a pdf n stnode =
  match Pdf.lookup_direct pdf n stnode with
  | Some (Pdf.Array attrs) ->
      (* Resolve indirect elements first, then keep only the elements
         read_single accepts. Everything else (integers, names, ...) is
         skipped, as a lone non-dictionary value is below, rather than making
         read_single raise. *)
      let attrs =
        keep
          (function Pdf.Dictionary _ | Pdf.Stream _ -> true | _ -> false)
          (map (Pdf.direct pdf) attrs)
      in
        flatten (map read_single attrs)
  | Some (Pdf.Dictionary _ | Pdf.Stream _ as attr) ->
      read_single attr
  | Some _ | None -> []
let read_attributes pdf stnode =
let from_a = read_a pdf "/A" stnode in
let from_c = read_a pdf "/C" stnode in
(* Prefer entries from a, but we are just testing for presence, so merely setify *)
let attrs = setify (from_a @ from_c) in
(* For now, stick /ID, /Alt, /ActualText in here too. Eventually, move to prevent crashes. *)
let alt =
match Pdf.lookup_direct pdf "/Alt" stnode with | Some _ -> ["/Alt"] | None -> []
in in
let from_a = let id =
match Pdf.lookup_direct pdf "/A" stnode with match Pdf.lookup_direct pdf "/ID" stnode with | Some _ -> ["/ID"] | None -> []
| Some (Pdf.Array attrs) -> in
let attrs = keep (function Pdf.Integer _ -> false | _ -> true) attrs in let at =
flatten (map read_single attrs) match Pdf.lookup_direct pdf "/ActualText" stnode with | Some _ -> ["/ActualText"] | None -> []
| Some (Pdf.Dictionary d) -> in
read_single (Pdf.Dictionary d) let pageref =
| Some (Pdf.Stream s) -> match Pdf.direct pdf stnode with
read_single (Pdf.Stream s) | Pdf.Dictionary d ->
| Some _ -> [] begin match lookup "/Pg" d with
| None -> [] | Some (Pdf.Indirect i) ->
in ["_" ^ string_of_int i]
(* For now, stick /ID, /Alt, /ActualText in here too. Eventually, move to prevent crashes. *) | _ -> []
let alt = end
match Pdf.lookup_direct pdf "/Alt" stnode with | Some _ -> ["/Alt"] | None -> [] | _ -> []
in in
let id = attrs @ id @ at @ alt @ pageref
match Pdf.lookup_direct pdf "/ID" stnode with | Some _ -> ["/ID"] | None -> []
in
let at =
match Pdf.lookup_direct pdf "/ActualText" stnode with | Some _ -> ["/ActualText"] | None -> []
in
let pageref =
match Pdf.direct pdf stnode with
| Pdf.Dictionary d ->
begin match lookup "/Pg" d with
| Some (Pdf.Indirect i) ->
["_" ^ string_of_int i]
| _ -> []
end
| _ -> []
in
from_a @ id @ at @ alt @ pageref
let rec read_st_inner pdf stnode = let rec read_st_inner pdf stnode =
let s = let s =
@ -78,10 +80,10 @@ let rec read_st_inner pdf stnode =
| _ -> "" | _ -> ""
in in
match Pdf.lookup_direct pdf "/K" stnode with match Pdf.lookup_direct pdf "/K" stnode with
| None -> E2 (s, read_a pdf stnode, []) | None -> E2 (s, read_attributes pdf stnode, [])
| Some (Pdf.Dictionary d) -> E2 (s, read_a pdf stnode, [read_st_inner pdf (Pdf.Dictionary d)]) | Some (Pdf.Dictionary d) -> E2 (s, read_attributes pdf stnode, [read_st_inner pdf (Pdf.Dictionary d)])
| Some (Pdf.Integer mcd) -> E2 (s, read_a pdf stnode, []) (* marked content identifier, we drop. *) | Some (Pdf.Integer mcd) -> E2 (s, read_attributes pdf stnode, []) (* marked content identifier, we drop. *)
| Some (Pdf.Array a) -> E2 (s, read_a pdf stnode, read_st_inner_array pdf a) | Some (Pdf.Array a) -> E2 (s, read_attributes pdf stnode, read_st_inner_array pdf a)
| _ -> error "malformed st node" | _ -> error "malformed st node"
and read_st_inner_array pdf nodes = and read_st_inner_array pdf nodes =