From 7824bd40f95eab39d3f332cf7611a49689c03632 Mon Sep 17 00:00:00 2001 From: John Whitington Date: Thu, 27 Jun 2024 14:21:47 +0100 Subject: [PATCH] Adds -print-struct-tree --- cpdfcommand.ml | 9 ++++++++- cpdfua.ml | 29 ++++++++++++++++++++++++++++- cpdfua.mli | 2 ++ 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/cpdfcommand.ml b/cpdfcommand.ml index 5a12448..f79622e 100644 --- a/cpdfcommand.ml +++ b/cpdfcommand.ml @@ -221,6 +221,7 @@ type op = | Verify of string | MarkAs of string | RemoveMark of string + | PrintStructTree | ExtractStructTree | ReplaceStructTree of string | SetLanguage of string @@ -369,6 +370,7 @@ let string_of_op = function | Verify _ -> "Verify" | MarkAs _ -> "MarkAs" | RemoveMark _ -> "RemoveMark" + | PrintStructTree -> "PrintStructTree" | ExtractStructTree -> "ExtractStructTree" | ReplaceStructTree _ -> "ReplaceStructTree" | SetLanguage _ -> "SetLanguage" @@ -895,7 +897,8 @@ let banned banlist = function | AddPageLabels | RemovePageLabels | OutputJSON | OCGCoalesce | OCGRename | OCGList | OCGOrderAll | PrintFontEncoding _ | TableOfContents | Typeset _ | Composition _ | TextWidth _ | SetAnnotations _ | CopyAnnotations _ | ExtractStream _ | PrintObj _ - | Verify _ | MarkAs _ | RemoveMark _ | ExtractStructTree | ReplaceStructTree _ | SetLanguage _ + | Verify _ | MarkAs _ | RemoveMark _ | ExtractStructTree | ReplaceStructTree _ | SetLanguage _ + | PrintStructTree -> false (* Always allowed *) (* Combine pages is not allowed because we would not know where to get the -recrypt from -- the first or second file? *) @@ -2825,6 +2828,7 @@ and specs = ("-verify-single", Arg.String (fun s -> args.verify_single <- Some s), "Verify a single test"); ("-mark-as", Arg.String (fun s -> setop (MarkAs s) ()), "Mark as conforming to a standard"); ("-remove-mark", Arg.String (fun s -> setop (RemoveMark s) ()), "Remove conformance mark"); + ("-print-struct-tree", Arg.Unit (fun () -> setop PrintStructTree ()), "Print structure tree"); ("-extract-struct-tree", Arg.Unit (fun () -> setop ExtractStructTree ()), "Extract structure tree in JSON format"); ("-replace-struct-tree", Arg.String (fun s -> setop (ReplaceStructTree s) ()), "Replace structure tree from JSON"); (* These items are undocumented *) @@ -4516,6 +4520,9 @@ let go () = write_pdf false pdf | _ -> error "Unknown standard" end + | Some PrintStructTree -> + let pdf = get_single_pdf args.op true in + Cpdfua.print_struct_tree pdf | Some ExtractStructTree -> let pdf = get_single_pdf args.op true in let json = Cpdfua.extract_struct_tree pdf in diff --git a/cpdfua.ml b/cpdfua.ml index 3f29627..d6515d3 100644 --- a/cpdfua.ml +++ b/cpdfua.ml @@ -54,7 +54,17 @@ let read_a pdf stnode = let at = match Pdf.lookup_direct pdf "/ActualText" stnode with | Some _ -> ["/ActualText"] | None -> [] in - from_a @ id @ at @ alt + let pageref = + match Pdf.direct pdf stnode with + | Pdf.Dictionary d -> + begin match lookup "/Pg" d with + | Some (Pdf.Indirect i) -> + ["_" ^ string_of_int i] + | _ -> [] + end + | _ -> [] + in + from_a @ id @ at @ alt @ pageref let rec read_st_inner pdf stnode = let s = @@ -1508,3 +1518,20 @@ let replace_struct_tree pdf json = | _ -> error "Top level JSON wrong. Must be list with 0 first." with e -> error (Printf.sprintf "replace_struct_tree: %s" (Printexc.to_string e)) + +let print_struct_tree pdf = + let page_lookup = + hashtable_of_dictionary (combine (Pdf.page_reference_numbers pdf) (ilist 1 (Pdfpage.endpage pdf))) + in + let get_page attrs = + match option_map (fun x -> match explode x with '_'::more -> Some (implode more) | _ -> None) attrs with + | [i] -> string_of_int (try Hashtbl.find page_lookup (int_of_string i) with _ -> 0) + | _ -> "0" + in + let st = read_st2 pdf in + match st with E2 ("/StructTreeRoot", _, []) -> () | _ -> + flprint + (Cpdfprinttree.to_string + ~get_name:(fun (E2 (x, a, _)) -> if int_of_string (get_page a) > 0 then x ^ " (" ^ get_page a ^ ")" else x) + ~get_children:(fun (E2 (_, _, cs)) -> cs) + st) diff --git a/cpdfua.mli b/cpdfua.mli index a2c9876..e097a2d 100644 --- a/cpdfua.mli +++ b/cpdfua.mli @@ -9,6 +9,8 @@ val mark2 : int -> Pdf.t -> unit val remove_mark : Pdf.t -> unit +val print_struct_tree : Pdf.t -> unit + val extract_struct_tree : Pdf.t -> Cpdfyojson.Safe.t val replace_struct_tree : Pdf.t -> Cpdfyojson.Safe.t -> unit