From a5e49e09864e76ed80252aa169ce685f0829b95f Mon Sep 17 00:00:00 2001 From: John Whitington Date: Thu, 26 Sep 2024 15:22:22 +0100 Subject: [PATCH] Rework, add elt_info --- cpdfcommand.ml | 9 ++++++++- cpdfdraw.ml | 49 +++++++++++++++++++++++++++------------------ cpdfdraw.mli | 2 +- cpdfdrawcontrol.ml | 9 ++------- cpdfdrawcontrol.mli | 2 +- 5 files changed, 42 insertions(+), 29 deletions(-) diff --git a/cpdfcommand.ml b/cpdfcommand.ml index 82ff35e..3ee048a 100644 --- a/cpdfcommand.ml +++ b/cpdfcommand.ml @@ -1907,6 +1907,13 @@ let setreadableops () = Pdfops.always_add_whitespace := true; Pdfops.write_comments := true +let addeltinfo s = + match String.split_on_char '=' s with + | h::t -> + let pdfobj = Pdfread.parse_single_object (String.concat "" t) in + Cpdfdrawcontrol.eltinfo h pdfobj + | [] -> error "addeltinfo: bad format" + let specs = [("-version", Arg.Unit (setop Version), @@ -2838,7 +2845,7 @@ let specs = ("-artifact", Arg.Unit (fun _ -> Cpdfdrawcontrol.artifact ()), " Begin an artifact"); ("-end-artifact", Arg.Unit (fun _ -> Cpdfdrawcontrol.endartifact ()), "End an artifact"); ("-no-auto-artifacts", Arg.Unit (fun _ -> Cpdfdrawcontrol.autoartifacts false), " Don't mark untagged content as artifacts"); - ("-eltinfo", Arg.String (fun s -> Cpdfdrawcontrol.eltinfo s), " Add element information"); + ("-eltinfo", Arg.String addeltinfo, " Add element information"); ("-end-eltinfo", Arg.String (fun s -> Cpdfdrawcontrol.endeltinfo s), " Erase element information"); ("-namespace", Arg.String (fun s -> Cpdfdrawcontrol.addnamespace (expand_namespace s)), " Set the structure tree namespace"); ("-rolemap", Arg.String (fun s -> Cpdfdrawcontrol.setrolemap s), " Set a role map"); diff --git a/cpdfdraw.ml b/cpdfdraw.ml index 76734b4..15fce2b 100644 --- a/cpdfdraw.ml +++ b/cpdfdraw.ml @@ -67,7 +67,7 @@ type drawops = | BeginArtifact | EndArtifact | Namespace of string - | EltInfo of string * string + | EltInfo of string * Pdf.pdfobject | EndEltInfo of string (*let rec string_of_drawop = function @@ -277,7 +277,7 @@ type structdata = | StDataMCID of string * int | StDataPage of int | StDataNamespace of string - | StEltInfo of string * string + | StEltInfo of string * Pdf.pdfobject | StEndEltInfo of string let structdata = ref [] @@ -642,7 +642,7 @@ let dryrun ~struct_tree ~filename ~bates ~batespad range pdf chunks = type st = StMCID of int - | StItem of {kind : string; namespace : string; pageobjnum : int option; alt : string option; children : st list} + | StItem of {kind : string; namespace : string; pageobjnum : int option; alt : (string * Pdf.pdfobject) list; children : st list} (* Build a tree from the MCIDs and structure tree instructions gathered *) let rec find_tree_contents a level = function @@ -653,24 +653,38 @@ let rec find_tree_contents a level = function if level = 1 then (rev a, t) else find_tree_contents a (level - 1) t | h::t -> find_tree_contents (h::a) level t -let rec make_structure_tree pageobjnums pdf pagenum namespace = function +let rec make_structure_tree pageobjnums (pn, ns, ei) pdf = function | [] -> [] | StDataMCID (n, mcid)::t -> - StItem {kind = n; namespace = !namespace; alt = None; pageobjnum = lookup !pagenum pageobjnums; children = [StMCID mcid]}::make_structure_tree pageobjnums pdf pagenum namespace t + Printf.printf "StDataMCID, pagenum = %i, pageobjnum = %i\n" !pn (unopt (lookup !pn pageobjnums)); + let item = + StItem {kind = n; namespace = !ns; alt = list_of_hashtbl ei; pageobjnum = lookup !pn pageobjnums; children = [StMCID mcid]} + in + item::make_structure_tree pageobjnums (pn, ns, ei) pdf t | StDataPage n::t -> - pagenum := n; - make_structure_tree pageobjnums pdf pagenum namespace t + Printf.printf "StDataPage %i\n" n; + pn := n; + make_structure_tree pageobjnums (pn, ns, ei) pdf t | StDataNamespace s::t -> - namespace := s; - make_structure_tree pageobjnums pdf pagenum namespace t + Printf.printf "StDataNamespace %s\n" s; + ns := s; + make_structure_tree pageobjnums (pn, ns, ei) pdf t | StEltInfo (k, v)::t -> - make_structure_tree pageobjnums pdf pagenum namespace t + Printf.printf "StEltInfo %s, %s\n" k (Pdfwrite.string_of_pdf v); + Hashtbl.replace ei k v; + make_structure_tree pageobjnums (pn, ns, ei) pdf t | StEndEltInfo s::t -> - make_structure_tree pageobjnums pdf pagenum namespace t + Printf.printf "StEndEltInfo %s\n" s; + Hashtbl.remove ei s; + make_structure_tree pageobjnums (pn, ns, ei) pdf t | StDataBeginTree s::t -> + Printf.printf "StBeginTree %s\n" s; let tree_contents, rest = find_tree_contents [] 1 t in - StItem {kind = s; namespace = !namespace; alt = None; pageobjnum = None; children = make_structure_tree pageobjnums pdf pagenum namespace tree_contents} - ::make_structure_tree pageobjnums pdf pagenum namespace rest + let item = + StItem {kind = s; namespace = !ns; alt = list_of_hashtbl ei; pageobjnum = None; + children = make_structure_tree pageobjnums (pn, ns, ei) pdf tree_contents} + in + item::make_structure_tree pageobjnums (pn, ns, ei) pdf rest | StDataEndTree::t -> error "Too many -end-tags" @@ -679,7 +693,7 @@ let make_structure_tree pdf items = let objnums = Pdf.page_reference_numbers pdf in combine (indx objnums) objnums in - make_structure_tree pageobjnums pdf (ref 0) (ref standard_namespace) items + make_structure_tree pageobjnums (ref 0, ref "", null_hash ()) pdf items (* Write such a structure tree to a PDF. *) let write_structure_tree pdf st = @@ -692,16 +706,13 @@ let write_structure_tree pdf st = let struct_tree_root = Pdf.addobj pdf Pdf.Null in let rec mktree struct_tree_parent = function | StItem {kind; namespace; pageobjnum; alt; children} -> + Printf.printf "Found StItem with pageobjnum %s\n" (match pageobjnum with Some x -> string_of_int x | None -> ""); let this_objnum = Pdf.addobj pdf Pdf.Null in begin match pageobjnum with | Some p -> add_parentmap p this_objnum | _ -> () end; - let alt = - match alt with - | Some s -> [("/Alt", Pdf.String s)] - | None -> [] - in + let alt = map (fun (k, v) -> ("/" ^ k, v)) alt in let page = match pageobjnum with | Some i -> [("/Pg", Pdf.Indirect i)] diff --git a/cpdfdraw.mli b/cpdfdraw.mli index 04564ed..1f977d3 100644 --- a/cpdfdraw.mli +++ b/cpdfdraw.mli @@ -60,7 +60,7 @@ type drawops = | BeginArtifact | EndArtifact | Namespace of string - | EltInfo of string * string + | EltInfo of string * Pdf.pdfobject | EndEltInfo of string val do_add_artifacts : bool ref diff --git a/cpdfdrawcontrol.ml b/cpdfdrawcontrol.ml index 43ba739..ea165f8 100644 --- a/cpdfdrawcontrol.ml +++ b/cpdfdrawcontrol.ml @@ -123,13 +123,8 @@ let endartifact () = let addnamespace s = addop (Cpdfdraw.Namespace s) -let eltinfo s = - let k, v = - match String.split_on_char '=' s with - | [h; t] -> (h, t) - | _ -> error "Bad -eltinfo format" - in - addop (Cpdfdraw.EltInfo (k, v)) +let eltinfo k v = + addop (Cpdfdraw.EltInfo (k, v)) let endeltinfo s = addop (Cpdfdraw.EndEltInfo s) diff --git a/cpdfdrawcontrol.mli b/cpdfdrawcontrol.mli index b8195f3..eb80215 100644 --- a/cpdfdrawcontrol.mli +++ b/cpdfdrawcontrol.mli @@ -20,7 +20,7 @@ val addstag : string -> unit val endtag : unit -> unit val endstag : unit -> unit val autotags : bool -> unit -val eltinfo : string -> unit +val eltinfo : string -> Pdf.pdfobject -> unit val endeltinfo : string -> unit val addnamespace : string -> unit val setrolemap : string -> unit