Report natural language

This commit is contained in:
John Whitington 2024-06-12 16:15:33 +01:00
parent 9a76c291ae
commit 9d3c4384e4
4 changed files with 13 additions and 1 deletions

View File

@ -8,6 +8,7 @@ o Verify compliance to PDF/UA via the Matterhorn protocol
o Extract, edit and reapply document structure tree o Extract, edit and reapply document structure tree
o Split structure tree when splitting PDF to save size o Split structure tree when splitting PDF to save size
o Combine structure trees when stamping PDFs o Combine structure trees when stamping PDFs
o Report natural language on -info
2.7 (February 2024) 2.7 (February 2024)

View File

@ -442,6 +442,11 @@ let determine_subformats pdf =
end; end;
!formats !formats
(* [language pdf] follows /Root then /Lang from the trailer dictionary of
   [pdf]. It returns [Some s] when that entry is a PDF string [s], and [None]
   when the entry is missing or is any other kind of object. *)
let language pdf =
  let lang_entry =
    Pdf.lookup_chain pdf pdf.Pdf.trailerdict ["/Root"; "/Lang"]
  in
    match lang_entry with
    | Some (Pdf.String lang) -> Some lang
    | Some _ | None -> None
let output_xmp_info ?(json=ref [("none", `Null)]) encoding pdf = let output_xmp_info ?(json=ref [("none", `Null)]) encoding pdf =
let notjson = !json = [("none", `Null)] in let notjson = !json = [("none", `Null)] in
let print_out tree title namespace name = let print_out tree title namespace name =
@ -459,6 +464,9 @@ let output_xmp_info ?(json=ref [("none", `Null)]) encoding pdf =
if notjson if notjson
then Printf.printf "Subformats: %s\n" (combine_with_commas (determine_subformats pdf)) then Printf.printf "Subformats: %s\n" (combine_with_commas (determine_subformats pdf))
else json =| ("Subformats", `List (map (fun x -> `String x) (determine_subformats pdf))); else json =| ("Subformats", `List (map (fun x -> `String x) (determine_subformats pdf)));
if notjson
then Printf.printf "Language: %s\n" (match language pdf with None -> "" | Some x -> "\"" ^ x ^ "\"")
else json =| ("Language", match language pdf with None -> `Null | Some x -> `String x);
match get_metadata pdf with match get_metadata pdf with
None -> () None -> ()
| Some metadata -> | Some metadata ->

View File

@ -87,6 +87,8 @@ val get_viewer_pref_item : string -> Pdf.t -> string
val determine_subformats : Pdf.t -> string list val determine_subformats : Pdf.t -> string list
(** [language pdf] is the string stored under [/Lang], reached via [/Root]
    from the trailer dictionary of [pdf]. The result is [None] when that entry
    is absent or is not a string. *)
val language : Pdf.t -> string option
val adobe : string val adobe : string
val xmp : string val xmp : string
val dc : string val dc : string

View File

@ -164,7 +164,8 @@ let matterhorn_10_001 pdf =
unimpl () unimpl ()
(* Natural language for text in page content cannot be determined. *) (* Natural language for text in page content cannot be determined. *)
let matterhorn_11_001 pdf = todo () let matterhorn_11_001 pdf =
unimpl ()
(* Natural language for text in Alt, ActualText and E attributes cannot be (* Natural language for text in Alt, ActualText and E attributes cannot be
determined. *) determined. *)