Truetype cmap matterhorns

This commit is contained in:
John Whitington 2024-06-21 17:27:40 +01:00
parent 9609100383
commit 707578d724
3 changed files with 98 additions and 17 deletions

View File

@ -504,6 +504,9 @@ let find_main encoding subset =
in in
(first, splitinto 224 rest) (first, splitinto 224 rest)
let collecting_cmaps = ref false
let collected_cmaps = ref []
let parse ~subset data encoding = let parse ~subset data encoding =
let mk_b byte_offset = bitbytes_of_input (let i = input_of_bytes data in i.seek_in byte_offset; i) in let mk_b byte_offset = bitbytes_of_input (let i = input_of_bytes data in i.seek_in byte_offset; i) in
let b = mk_b 0 in let b = mk_b 0 in
@ -547,7 +550,9 @@ let parse ~subset data encoding =
in in
let ascent, descent, capheight, xheight, avgwidth = let ascent, descent, capheight, xheight, avgwidth =
match os2 with match os2 with
| None -> raise (Pdf.PDFError "No os/2 table found in truetype font") | None ->
if !collecting_cmaps then (0, 0, 0, 0, 0) else
raise (Pdf.PDFError "No os/2 table found in truetype font")
| Some (o, l) -> let b = mk_b (i32toi o) in read_os2_table unitsPerEm b (i32toi l) | Some (o, l) -> let b = mk_b (i32toi o) in read_os2_table unitsPerEm b (i32toi l)
in in
let italicangle = let italicangle =
@ -579,6 +584,7 @@ let parse ~subset data encoding =
let subtable_offset = read_ulong b in let subtable_offset = read_ulong b in
if !dbg then Printf.printf "subtable %i. platform_id = %i, encoding_id = %i, subtable_offset = %li\n" if !dbg then Printf.printf "subtable %i. platform_id = %i, encoding_id = %i, subtable_offset = %li\n"
x platform_id encoding_id subtable_offset; x platform_id encoding_id subtable_offset;
collected_cmaps := (platform_id, encoding_id)::!collected_cmaps;
let b = mk_b (i32toi cmapoffset + i32toi subtable_offset) in let b = mk_b (i32toi cmapoffset + i32toi subtable_offset) in
let fmt = read_ushort b in let fmt = read_ushort b in
let lngth = read_ushort b in let lngth = read_ushort b in
@ -684,3 +690,11 @@ let parse ~subset data encoding =
let parse ~subset data encoding = let parse ~subset data encoding =
try parse ~subset data encoding with try parse ~subset data encoding with
e -> raise (Cpdferror.error ("Failed to parse TrueType font: " ^ Printexc.to_string e)) e -> raise (Cpdferror.error ("Failed to parse TrueType font: " ^ Printexc.to_string e))
(** [cmaps data] returns the [(platform_id, encoding_id)] pair of each cmap
    subtable recorded while parsing the TrueType font file [data] (used for
    PDF/UA verification). Pairs are consed as they are met, so the most
    recently seen subtable comes first. Any exception raised by the parser is
    deliberately ignored: whatever was collected up to that point is returned. *)
let cmaps data =
  collected_cmaps := [];
  (* While this flag is set, a missing os/2 table is tolerated by [parse]. *)
  set collecting_cmaps;
  begin
    try ignore (parse ~subset:[] data Pdftext.WinAnsiEncoding) with _ -> ()
  end;
  clear collecting_cmaps;
  !collected_cmaps

View File

@ -27,3 +27,6 @@ type t =
additional characters in the font. You should supply a subset (a list of additional characters in the font. You should supply a subset (a list of
unicode codepoints whose corresponding glyphs are required). *) unicode codepoints whose corresponding glyphs are required). *)
val parse : subset:int list -> Pdfio.bytes -> Pdftext.encoding -> t list val parse : subset:int list -> Pdfio.bytes -> Pdftext.encoding -> t list
(** [cmaps data] returns the [(platform_id, encoding_id)] pairs of the cmap
    subtables found in the font file [data] (used for PDF/UA verification). *)
val cmaps : Pdfio.bytes -> (int * int) list

View File

@ -925,19 +925,6 @@ let matterhorn_31_015 _ _ pdf =
let matterhorn_31_016 _ _ pdf = let matterhorn_31_016 _ _ pdf =
unimpl () unimpl ()
(* A non-symbolic TrueType font is used for rendering, but none of the cmap
entries in the embedded font program is a non-symbolic cmap. *)
let matterhorn_31_017 _ _ pdf =
unimpl ()
(* A non-symbolic TrueType font is used for rendering, but for at least one
glyph to be rendered the glyph cannot be looked up by any of the
non-symbolic cmap entries in the embedded font program. *)
let matterhorn_31_018 _ _ pdf =
unimpl ()
(* The font dictionary for a non-symbolic TrueType font does not contain an
Encoding entry. *)
let is_non_symbolic pdf o = let is_non_symbolic pdf o =
match Pdf.lookup_direct pdf "/FontDescriptor" o with match Pdf.lookup_direct pdf "/FontDescriptor" o with
| Some fd -> | Some fd ->
@ -947,6 +934,43 @@ let is_non_symbolic pdf o =
end end
| None -> true | None -> true
(* Fetch the embedded TrueType font program of font dictionary [o], i.e. the
   stream at /FontDescriptor -> /FontFile2. The stream is passed to
   [Pdfcodec.decode_pdfstream_until_unknown] and its data is then read back
   from the same stream reference. Returns [None] when there is no such
   stream, or when its data is not in the [Pdf.Got] state afterwards. *)
let truetype_fontfile pdf o =
  match Pdf.lookup_chain pdf o ["/FontDescriptor"; "/FontFile2"] with
  | Some (Pdf.Stream stream_ref as stream) ->
      Pdfcodec.decode_pdfstream_until_unknown pdf stream;
      (match !stream_ref with
       | (_, Pdf.Got font_bytes) -> Some font_bytes
       | _ -> None)
  | _ -> None
(* A non-symbolic TrueType font is used for rendering, but none of the cmap
   entries in the embedded font program is a non-symbolic cmap. *)
let matterhorn_31_017 _ _ pdf =
  (* The (platform_id, encoding_id) pairs this check treats as symbolic. *)
  let is_symbolic_cmap = function (1, 8) | (3, 0) -> true | _ -> false in
  Pdf.objiter
    (fun _ o ->
       match Pdf.lookup_direct pdf "/Subtype" o with
       | Some (Pdf.Name "/TrueType") ->
           (* This failure condition concerns NON-symbolic fonts only. A
              symbolic font whose cmaps are all symbolic is the expected
              configuration and must not be reported. *)
           if is_non_symbolic pdf o then
             begin match truetype_fontfile pdf o with
             | None -> () (* no embedded font program to inspect *)
             | Some fontfile ->
                 (* Fail when every cmap entry is symbolic, i.e. there is no
                    non-symbolic cmap at all. *)
                 if List.for_all is_symbolic_cmap (Cpdftruetype.cmaps fontfile)
                 then merror ()
             end
       | _ -> ())
    pdf
(* A non-symbolic TrueType font is used for rendering, but for at least one
glyph to be rendered the glyph cannot be looked up by any of the
non-symbolic cmap entries in the embedded font program. *)
(* Not implemented yet: the body only calls [unimpl], so no font is inspected. *)
let matterhorn_31_018 _ _ pdf =
unimpl ()
(* The font dictionary for a non-symbolic TrueType font does not contain an
Encoding entry. *)
let matterhorn_31_019 _ _ pdf = let matterhorn_31_019 _ _ pdf =
Pdf.objiter Pdf.objiter
(fun _ o -> (fun _ o ->
@ -1025,7 +1049,19 @@ let matterhorn_31_022 _ _ pdf =
TrueType font dictionary but the embedded font program does not contain a TrueType font dictionary but the embedded font program does not contain a
(3,1) Microsoft Unicode cmap. *) (3,1) Microsoft Unicode cmap. *)
let matterhorn_31_023 _ _ pdf =
  (* True when the embedded font program of [font] has a (3, 1) cmap. A font
     with no retrievable font program is treated as passing. *)
  let has_unicode_cmap font =
    match truetype_fontfile pdf font with
    | None -> true
    | Some data -> mem (3, 1) (Cpdftruetype.cmaps data)
  in
  (* Only non-symbolic TrueType fonts whose /Encoding has a /Differences
     entry are examined. *)
  let check _ font =
    match
      Pdf.lookup_direct pdf "/Subtype" font,
      Pdf.lookup_chain pdf font ["/Encoding"; "/Differences"]
    with
    | Some (Pdf.Name "/TrueType"), Some _ ->
        if is_non_symbolic pdf font then
          (if has_unicode_cmap font then () else merror ())
        else
          ()
    | _ -> ()
  in
    Pdf.objiter check pdf
(* The Encoding entry is present in the font dictionary for a symbolic TrueType (* The Encoding entry is present in the font dictionary for a symbolic TrueType
font. *) font. *)
@ -1043,12 +1079,38 @@ let matterhorn_31_024 _ _ pdf =
(* The embedded font program for a symbolic TrueType font contains no cmap. *) (* The embedded font program for a symbolic TrueType font contains no cmap. *)
let matterhorn_31_025 _ _ pdf =
  (* Examine each symbolic TrueType font that has an embedded font program and
     report an error when no cmap at all was collected from it. *)
  let check _ font =
    match Pdf.lookup_direct pdf "/Subtype" font with
    | Some (Pdf.Name "/TrueType") ->
        if not (is_non_symbolic pdf font) then
          begin match truetype_fontfile pdf font with
          | None -> ()
          | Some data ->
              if Cpdftruetype.cmaps data = [] then merror ()
          end
    | _ -> ()
  in
    Pdf.objiter check pdf
(* The embedded font program for a symbolic TrueType font contains more than (* The embedded font program for a symbolic TrueType font contains more than
one cmap, but none of the cmap entries is a (3,0) Microsoft Symbol cmap. *) one cmap, but none of the cmap entries is a (3,0) Microsoft Symbol cmap. *)
let matterhorn_31_026 _ _ pdf =
  Pdf.objiter
    (fun _ o ->
       match Pdf.lookup_direct pdf "/Subtype" o with
       | Some (Pdf.Name "/TrueType") ->
           (* This failure condition applies to symbolic fonts only. The
              guard had been replaced by [true], which made the check fire
              for non-symbolic fonts as well. *)
           if not (is_non_symbolic pdf o) then
             begin match truetype_fontfile pdf o with
             | None -> () (* no embedded font program to inspect *)
             | Some fontfile ->
                 let cmaps = Cpdftruetype.cmaps fontfile in
                 (* More than one cmap, and none of them is (3, 0). *)
                 if length cmaps > 1 && not (mem (3, 0) cmaps) then merror ()
             end
       | _ -> ())
    pdf
(* A font dictionary does not contain the ToUnicode entry and none of the (* A font dictionary does not contain the ToUnicode entry and none of the
following is true: the font uses MacRomanEncoding, MacExpertEncoding or following is true: the font uses MacRomanEncoding, MacExpertEncoding or
@ -1228,6 +1290,8 @@ let matterhorn =
("31-030", "One or more characters used in text showing operators reference the .notdef glyph.", "UA1:7.21.8-1", matterhorn_31_030); ("31-030", "One or more characters used in text showing operators reference the .notdef glyph.", "UA1:7.21.8-1", matterhorn_31_030);
] ]
(* FIXME Allow the use of just a single test, and expose it in cpdf command line *)
let test_matterhorn pdf = let test_matterhorn pdf =
(* A circularity in the role map prevents all structure checks, so we do it first and stop if it fails. *) (* A circularity in the role map prevents all structure checks, so we do it first and stop if it fails. *)
let circularity_error = let circularity_error =