Rework, add elt_info

This commit is contained in:
John Whitington 2024-09-26 15:22:22 +01:00
parent 6df2ff7a9c
commit a5e49e0986
5 changed files with 42 additions and 29 deletions

View File

@ -1907,6 +1907,13 @@ let setreadableops () =
Pdfops.always_add_whitespace := true; Pdfops.always_add_whitespace := true;
Pdfops.write_comments := true Pdfops.write_comments := true
let addeltinfo s =
match String.split_on_char '=' s with
| h::t ->
let pdfobj = Pdfread.parse_single_object (String.concat "" t) in
Cpdfdrawcontrol.eltinfo h pdfobj
| [] -> error "addeltinfo: bad format"
let specs = let specs =
[("-version", [("-version",
Arg.Unit (setop Version), Arg.Unit (setop Version),
@ -2838,7 +2845,7 @@ let specs =
("-artifact", Arg.Unit (fun _ -> Cpdfdrawcontrol.artifact ()), " Begin an artifact"); ("-artifact", Arg.Unit (fun _ -> Cpdfdrawcontrol.artifact ()), " Begin an artifact");
("-end-artifact", Arg.Unit (fun _ -> Cpdfdrawcontrol.endartifact ()), "End an artifact"); ("-end-artifact", Arg.Unit (fun _ -> Cpdfdrawcontrol.endartifact ()), "End an artifact");
("-no-auto-artifacts", Arg.Unit (fun _ -> Cpdfdrawcontrol.autoartifacts false), " Don't mark untagged content as artifacts"); ("-no-auto-artifacts", Arg.Unit (fun _ -> Cpdfdrawcontrol.autoartifacts false), " Don't mark untagged content as artifacts");
("-eltinfo", Arg.String (fun s -> Cpdfdrawcontrol.eltinfo s), " Add element information"); ("-eltinfo", Arg.String addeltinfo, " Add element information");
("-end-eltinfo", Arg.String (fun s -> Cpdfdrawcontrol.endeltinfo s), " Erase element information"); ("-end-eltinfo", Arg.String (fun s -> Cpdfdrawcontrol.endeltinfo s), " Erase element information");
("-namespace", Arg.String (fun s -> Cpdfdrawcontrol.addnamespace (expand_namespace s)), " Set the structure tree namespace"); ("-namespace", Arg.String (fun s -> Cpdfdrawcontrol.addnamespace (expand_namespace s)), " Set the structure tree namespace");
("-rolemap", Arg.String (fun s -> Cpdfdrawcontrol.setrolemap s), " Set a role map"); ("-rolemap", Arg.String (fun s -> Cpdfdrawcontrol.setrolemap s), " Set a role map");

View File

@ -67,7 +67,7 @@ type drawops =
| BeginArtifact | BeginArtifact
| EndArtifact | EndArtifact
| Namespace of string | Namespace of string
| EltInfo of string * string | EltInfo of string * Pdf.pdfobject
| EndEltInfo of string | EndEltInfo of string
(*let rec string_of_drawop = function (*let rec string_of_drawop = function
@ -277,7 +277,7 @@ type structdata =
| StDataMCID of string * int | StDataMCID of string * int
| StDataPage of int | StDataPage of int
| StDataNamespace of string | StDataNamespace of string
| StEltInfo of string * string | StEltInfo of string * Pdf.pdfobject
| StEndEltInfo of string | StEndEltInfo of string
let structdata = ref [] let structdata = ref []
@ -642,7 +642,7 @@ let dryrun ~struct_tree ~filename ~bates ~batespad range pdf chunks =
type st = type st =
StMCID of int StMCID of int
| StItem of {kind : string; namespace : string; pageobjnum : int option; alt : string option; children : st list} | StItem of {kind : string; namespace : string; pageobjnum : int option; alt : (string * Pdf.pdfobject) list; children : st list}
(* Build a tree from the MCIDs and structure tree instructions gathered *) (* Build a tree from the MCIDs and structure tree instructions gathered *)
let rec find_tree_contents a level = function let rec find_tree_contents a level = function
@ -653,24 +653,38 @@ let rec find_tree_contents a level = function
if level = 1 then (rev a, t) else find_tree_contents a (level - 1) t if level = 1 then (rev a, t) else find_tree_contents a (level - 1) t
| h::t -> find_tree_contents (h::a) level t | h::t -> find_tree_contents (h::a) level t
let rec make_structure_tree pageobjnums pdf pagenum namespace = function let rec make_structure_tree pageobjnums (pn, ns, ei) pdf = function
| [] -> [] | [] -> []
| StDataMCID (n, mcid)::t -> | StDataMCID (n, mcid)::t ->
StItem {kind = n; namespace = !namespace; alt = None; pageobjnum = lookup !pagenum pageobjnums; children = [StMCID mcid]}::make_structure_tree pageobjnums pdf pagenum namespace t Printf.printf "StDataMCID, pagenum = %i, pageobjnum = %i\n" !pn (unopt (lookup !pn pageobjnums));
let item =
StItem {kind = n; namespace = !ns; alt = list_of_hashtbl ei; pageobjnum = lookup !pn pageobjnums; children = [StMCID mcid]}
in
item::make_structure_tree pageobjnums (pn, ns, ei) pdf t
| StDataPage n::t -> | StDataPage n::t ->
pagenum := n; Printf.printf "StDataPage %i\n" n;
make_structure_tree pageobjnums pdf pagenum namespace t pn := n;
make_structure_tree pageobjnums (pn, ns, ei) pdf t
| StDataNamespace s::t -> | StDataNamespace s::t ->
namespace := s; Printf.printf "StDataNamespace %s\n" s;
make_structure_tree pageobjnums pdf pagenum namespace t ns := s;
make_structure_tree pageobjnums (pn, ns, ei) pdf t
| StEltInfo (k, v)::t -> | StEltInfo (k, v)::t ->
make_structure_tree pageobjnums pdf pagenum namespace t Printf.printf "StEltInfo %s, %s\n" k (Pdfwrite.string_of_pdf v);
Hashtbl.replace ei k v;
make_structure_tree pageobjnums (pn, ns, ei) pdf t
| StEndEltInfo s::t -> | StEndEltInfo s::t ->
make_structure_tree pageobjnums pdf pagenum namespace t Printf.printf "StEndEltInfo %s\n" s;
Hashtbl.remove ei s;
make_structure_tree pageobjnums (pn, ns, ei) pdf t
| StDataBeginTree s::t -> | StDataBeginTree s::t ->
Printf.printf "StBeginTree %s\n" s;
let tree_contents, rest = find_tree_contents [] 1 t in let tree_contents, rest = find_tree_contents [] 1 t in
StItem {kind = s; namespace = !namespace; alt = None; pageobjnum = None; children = make_structure_tree pageobjnums pdf pagenum namespace tree_contents} let item =
::make_structure_tree pageobjnums pdf pagenum namespace rest StItem {kind = s; namespace = !ns; alt = list_of_hashtbl ei; pageobjnum = None;
children = make_structure_tree pageobjnums (pn, ns, ei) pdf tree_contents}
in
item::make_structure_tree pageobjnums (pn, ns, ei) pdf rest
| StDataEndTree::t -> | StDataEndTree::t ->
error "Too many -end-tags" error "Too many -end-tags"
@ -679,7 +693,7 @@ let make_structure_tree pdf items =
let objnums = Pdf.page_reference_numbers pdf in let objnums = Pdf.page_reference_numbers pdf in
combine (indx objnums) objnums combine (indx objnums) objnums
in in
make_structure_tree pageobjnums pdf (ref 0) (ref standard_namespace) items make_structure_tree pageobjnums (ref 0, ref "", null_hash ()) pdf items
(* Write such a structure tree to a PDF. *) (* Write such a structure tree to a PDF. *)
let write_structure_tree pdf st = let write_structure_tree pdf st =
@ -692,16 +706,13 @@ let write_structure_tree pdf st =
let struct_tree_root = Pdf.addobj pdf Pdf.Null in let struct_tree_root = Pdf.addobj pdf Pdf.Null in
let rec mktree struct_tree_parent = function let rec mktree struct_tree_parent = function
| StItem {kind; namespace; pageobjnum; alt; children} -> | StItem {kind; namespace; pageobjnum; alt; children} ->
Printf.printf "Found StItem with pageobjnum %s\n" (match pageobjnum with Some x -> string_of_int x | None -> "");
let this_objnum = Pdf.addobj pdf Pdf.Null in let this_objnum = Pdf.addobj pdf Pdf.Null in
begin match pageobjnum with begin match pageobjnum with
| Some p -> add_parentmap p this_objnum | Some p -> add_parentmap p this_objnum
| _ -> () | _ -> ()
end; end;
let alt = let alt = map (fun (k, v) -> ("/" ^ k, v)) alt in
match alt with
| Some s -> [("/Alt", Pdf.String s)]
| None -> []
in
let page = let page =
match pageobjnum with match pageobjnum with
| Some i -> [("/Pg", Pdf.Indirect i)] | Some i -> [("/Pg", Pdf.Indirect i)]

View File

@ -60,7 +60,7 @@ type drawops =
| BeginArtifact | BeginArtifact
| EndArtifact | EndArtifact
| Namespace of string | Namespace of string
| EltInfo of string * string | EltInfo of string * Pdf.pdfobject
| EndEltInfo of string | EndEltInfo of string
val do_add_artifacts : bool ref val do_add_artifacts : bool ref

View File

@ -123,12 +123,7 @@ let endartifact () =
let addnamespace s = let addnamespace s =
addop (Cpdfdraw.Namespace s) addop (Cpdfdraw.Namespace s)
let eltinfo s = let eltinfo k v =
let k, v =
match String.split_on_char '=' s with
| [h; t] -> (h, t)
| _ -> error "Bad -eltinfo format"
in
addop (Cpdfdraw.EltInfo (k, v)) addop (Cpdfdraw.EltInfo (k, v))
let endeltinfo s = let endeltinfo s =

View File

@ -20,7 +20,7 @@ val addstag : string -> unit
val endtag : unit -> unit val endtag : unit -> unit
val endstag : unit -> unit val endstag : unit -> unit
val autotags : bool -> unit val autotags : bool -> unit
val eltinfo : string -> unit val eltinfo : string -> Pdf.pdfobject -> unit
val endeltinfo : string -> unit val endeltinfo : string -> unit
val addnamespace : string -> unit val addnamespace : string -> unit
val setrolemap : string -> unit val setrolemap : string -> unit