From d0ef176c68eb7f8887db05ef99454994f7929772 Mon Sep 17 00:00:00 2001 From: John Whitington Date: Fri, 27 Sep 2024 12:40:36 +0100 Subject: [PATCH] Rework autotagging --- cpdfdraw.ml | 19 ++++++++++++++----- cpdfdraw.mli | 3 +-- cpdfdrawcontrol.ml | 2 +- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/cpdfdraw.ml b/cpdfdraw.ml index 7c80448..8718f2d 100644 --- a/cpdfdraw.ml +++ b/cpdfdraw.ml @@ -69,8 +69,9 @@ type drawops = | Namespace of string | EltInfo of string * Pdf.pdfobject | EndEltInfo of string + | AutoTag of bool -(*let rec string_of_drawop = function +let rec string_of_drawop = function | Qq o -> "Qq (" ^ string_of_drawops o ^ ")" | FormXObject (_, _, _, _, _, o) -> "FormXObject (" ^ string_of_drawops o ^ ")" | TextSection o -> "TextSection (" ^ string_of_drawops o ^ ")" @@ -89,9 +90,14 @@ type drawops = | Newline -> "Newline" | Leading _ -> "Leading" | CharSpace _ -> "CharSpace" | WordSpace _ -> "WordSpace" | TextScale _ -> "TextScale" | RenderMode _ -> "RenderMode" | Rise _ -> "Rise" + | EndTag -> "EndTag" | Tag s -> "Tag " ^ s | EndSTag -> "EndSTag" | STag s -> "Tag " ^ s + | BeginArtifact -> "BeginArtifact" | EndArtifact -> "EndArtifact" + | Para (_, _, _, _) -> "Para" | Namespace s -> "Namespace " ^ s + | EltInfo (_, _) -> "EltInfo" | EndEltInfo _ -> "EndEltInfo" + | AutoTag _ -> "AutoTag" and string_of_drawops l = - fold_left (fun x y -> x ^ " " ^ y) "" (map string_of_drawop l)*) + fold_left (fun x y -> x ^ " " ^ y) "" (map string_of_drawop l) (* Per page / xobject resources *) type res = @@ -437,7 +443,7 @@ let rec ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num [] | TextSection ops -> let m = mcid () in - if not dryrun then structdata := StDataMCID ("/P", m)::!structdata; + if not dryrun && !do_auto_tag then structdata := StDataMCID ("/P", m)::!structdata; (if struct_tree && !do_auto_tag then [Pdfops.Op_BDC ("/P", Pdf.Dictionary ["/MCID", Pdf.Integer m])] else []) @ [Pdfops.Op_BT] @ ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page ops @@ -487,6 +493,9 @@ let rec ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num | EndEltInfo s -> if not dryrun then structdata =| StEndEltInfo s; [] + | AutoTag b -> + do_auto_tag := b; + [] and ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page drawops = flatten (map (ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num page) drawops) @@ -653,12 +662,12 @@ let rec find_tree_contents a level = function if level = 1 then (rev a, t) else find_tree_contents a (level - 1) t | h::t -> find_tree_contents (h::a) level t -let mstdebug = ref false +let mstdebug = ref true let rec make_structure_tree pageobjnums (pn, ns, ei) pdf = function | [] -> [] | StDataMCID (n, mcid)::t -> - if !mstdebug then Printf.printf "StDataMCID, pagenum = %i, pageobjnum = %i\n" !pn (unopt (lookup !pn pageobjnums)); + if !mstdebug then Printf.printf "StDataMCID, type = %s pagenum = %i, pageobjnum = %i\n" n !pn (unopt (lookup !pn pageobjnums)); let item = StItem {kind = n; namespace = !ns; alt = list_of_hashtbl ei; pageobjnum = lookup !pn pageobjnums; children = [StMCID mcid]} in diff --git a/cpdfdraw.mli b/cpdfdraw.mli index 1f977d3..20bfa32 100644 --- a/cpdfdraw.mli +++ b/cpdfdraw.mli @@ -62,11 +62,10 @@ type drawops = | Namespace of string | EltInfo of string * Pdf.pdfobject | EndEltInfo of string + | AutoTag of bool val do_add_artifacts : bool ref -val do_auto_tag : bool ref - val rolemap : string ref (** Calling [draw fast underneath filename bates batespad range pdf drawops] draws on diff --git a/cpdfdrawcontrol.ml b/cpdfdrawcontrol.ml index ea165f8..c0c74bb 100644 --- a/cpdfdrawcontrol.ml +++ b/cpdfdrawcontrol.ml @@ -109,7 +109,7 @@ let endstag () = addop Cpdfdraw.EndSTag let autotags b = - Cpdfdraw.do_auto_tag := b + addop (Cpdfdraw.AutoTag b) let autoartifacts b = Cpdfdraw.do_add_artifacts := b