Rework autotagging

This commit is contained in:
John Whitington 2024-09-27 12:40:36 +01:00
parent 505969d6cb
commit d0ef176c68
3 changed files with 16 additions and 8 deletions

View File

@ -69,8 +69,9 @@ type drawops =
| Namespace of string | Namespace of string
| EltInfo of string * Pdf.pdfobject | EltInfo of string * Pdf.pdfobject
| EndEltInfo of string | EndEltInfo of string
| AutoTag of bool
(*let rec string_of_drawop = function let rec string_of_drawop = function
| Qq o -> "Qq (" ^ string_of_drawops o ^ ")" | Qq o -> "Qq (" ^ string_of_drawops o ^ ")"
| FormXObject (_, _, _, _, _, o) -> "FormXObject (" ^ string_of_drawops o ^ ")" | FormXObject (_, _, _, _, _, o) -> "FormXObject (" ^ string_of_drawops o ^ ")"
| TextSection o -> "TextSection (" ^ string_of_drawops o ^ ")" | TextSection o -> "TextSection (" ^ string_of_drawops o ^ ")"
@ -89,9 +90,14 @@ type drawops =
| Newline -> "Newline" | Leading _ -> "Leading" | CharSpace _ -> "CharSpace" | Newline -> "Newline" | Leading _ -> "Leading" | CharSpace _ -> "CharSpace"
| WordSpace _ -> "WordSpace" | TextScale _ -> "TextScale" | WordSpace _ -> "WordSpace" | TextScale _ -> "TextScale"
| RenderMode _ -> "RenderMode" | Rise _ -> "Rise" | RenderMode _ -> "RenderMode" | Rise _ -> "Rise"
| EndTag -> "EndTag" | Tag s -> "Tag " ^ s | EndSTag -> "EndSTag" | STag s -> "Tag " ^ s
| BeginArtifact -> "BeginArtifact" | EndArtifact -> "EndArtifact"
| Para (_, _, _, _) -> "Para" | Namespace s -> "Namespace " ^ s
| EltInfo (_, _) -> "EltInfo" | EndEltInfo _ -> "EndEltInfo"
| AutoTag _ -> "AutoTag"
and string_of_drawops l = and string_of_drawops l =
fold_left (fun x y -> x ^ " " ^ y) "" (map string_of_drawop l)*) fold_left (fun x y -> x ^ " " ^ y) "" (map string_of_drawop l)
(* Per page / xobject resources *) (* Per page / xobject resources *)
type res = type res =
@ -437,7 +443,7 @@ let rec ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num
[] []
| TextSection ops -> | TextSection ops ->
let m = mcid () in let m = mcid () in
if not dryrun then structdata := StDataMCID ("/P", m)::!structdata; if not dryrun && !do_auto_tag then structdata := StDataMCID ("/P", m)::!structdata;
(if struct_tree && !do_auto_tag then [Pdfops.Op_BDC ("/P", Pdf.Dictionary ["/MCID", Pdf.Integer m])] else []) (if struct_tree && !do_auto_tag then [Pdfops.Op_BDC ("/P", Pdf.Dictionary ["/MCID", Pdf.Integer m])] else [])
@ [Pdfops.Op_BT] @ [Pdfops.Op_BT]
@ ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page ops @ ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page ops
@ -487,6 +493,9 @@ let rec ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num
| EndEltInfo s -> | EndEltInfo s ->
if not dryrun then structdata =| StEndEltInfo s; if not dryrun then structdata =| StEndEltInfo s;
[] []
| AutoTag b ->
do_auto_tag := b;
[]
and ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page drawops = and ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page drawops =
flatten (map (ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num page) drawops) flatten (map (ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num page) drawops)
@ -653,12 +662,12 @@ let rec find_tree_contents a level = function
if level = 1 then (rev a, t) else find_tree_contents a (level - 1) t if level = 1 then (rev a, t) else find_tree_contents a (level - 1) t
| h::t -> find_tree_contents (h::a) level t | h::t -> find_tree_contents (h::a) level t
let mstdebug = ref false let mstdebug = ref true
let rec make_structure_tree pageobjnums (pn, ns, ei) pdf = function let rec make_structure_tree pageobjnums (pn, ns, ei) pdf = function
| [] -> [] | [] -> []
| StDataMCID (n, mcid)::t -> | StDataMCID (n, mcid)::t ->
if !mstdebug then Printf.printf "StDataMCID, pagenum = %i, pageobjnum = %i\n" !pn (unopt (lookup !pn pageobjnums)); if !mstdebug then Printf.printf "StDataMCID, type = %s pagenum = %i, pageobjnum = %i\n" n !pn (unopt (lookup !pn pageobjnums));
let item = let item =
StItem {kind = n; namespace = !ns; alt = list_of_hashtbl ei; pageobjnum = lookup !pn pageobjnums; children = [StMCID mcid]} StItem {kind = n; namespace = !ns; alt = list_of_hashtbl ei; pageobjnum = lookup !pn pageobjnums; children = [StMCID mcid]}
in in

View File

@ -62,11 +62,10 @@ type drawops =
| Namespace of string | Namespace of string
| EltInfo of string * Pdf.pdfobject | EltInfo of string * Pdf.pdfobject
| EndEltInfo of string | EndEltInfo of string
| AutoTag of bool
val do_add_artifacts : bool ref val do_add_artifacts : bool ref
val do_auto_tag : bool ref
val rolemap : string ref val rolemap : string ref
(** Calling [draw fast underneath filename bates batespad range pdf drawops] draws on (** Calling [draw fast underneath filename bates batespad range pdf drawops] draws on

View File

@ -109,7 +109,7 @@ let endstag () =
addop Cpdfdraw.EndSTag addop Cpdfdraw.EndSTag
let autotags b = let autotags b =
Cpdfdraw.do_auto_tag := b addop (Cpdfdraw.AutoTag b)
let autoartifacts b = let autoartifacts b =
Cpdfdraw.do_add_artifacts := b Cpdfdraw.do_add_artifacts := b