From 707578d7246b0a461a34d216d5d7e0eb71ac75f2 Mon Sep 17 00:00:00 2001 From: John Whitington Date: Fri, 21 Jun 2024 17:27:40 +0100 Subject: [PATCH] Truetype cmap matterhorns --- cpdftruetype.ml | 16 +++++++- cpdftruetype.mli | 3 ++ cpdfua.ml | 96 ++++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 98 insertions(+), 17 deletions(-) diff --git a/cpdftruetype.ml b/cpdftruetype.ml index 5de5a1b..10a5f16 100644 --- a/cpdftruetype.ml +++ b/cpdftruetype.ml @@ -504,6 +504,9 @@ let find_main encoding subset = in (first, splitinto 224 rest) +let collecting_cmaps = ref false +let collected_cmaps = ref [] + let parse ~subset data encoding = let mk_b byte_offset = bitbytes_of_input (let i = input_of_bytes data in i.seek_in byte_offset; i) in let b = mk_b 0 in @@ -547,7 +550,9 @@ let parse ~subset data encoding = in let ascent, descent, capheight, xheight, avgwidth = match os2 with - | None -> raise (Pdf.PDFError "No os/2 table found in truetype font") + | None -> + if !collecting_cmaps then (0, 0, 0, 0, 0) else + raise (Pdf.PDFError "No os/2 table found in truetype font") | Some (o, l) -> let b = mk_b (i32toi o) in read_os2_table unitsPerEm b (i32toi l) in let italicangle = @@ -579,6 +584,7 @@ let parse ~subset data encoding = let subtable_offset = read_ulong b in if !dbg then Printf.printf "subtable %i. platform_id = %i, encoding_id = %i, subtable_offset = %li\n" x platform_id encoding_id subtable_offset; + collected_cmaps := (platform_id, encoding_id)::!collected_cmaps; let b = mk_b (i32toi cmapoffset + i32toi subtable_offset) in let fmt = read_ushort b in let lngth = read_ushort b in @@ -684,3 +690,11 @@ let parse ~subset data encoding = let parse ~subset data encoding = try parse ~subset data encoding with e -> raise (Cpdferror.error ("Failed to parse TrueType font: " ^ Printexc.to_string e)) + +(** Return the list of cmaps from a font file (used for PDF/UA verification). *) +let cmaps data = + set collecting_cmaps; + collected_cmaps := []; + let _ = try ignore (parse ~subset:[] data Pdftext.WinAnsiEncoding) with e -> () in + clear collecting_cmaps; + !collected_cmaps diff --git a/cpdftruetype.mli b/cpdftruetype.mli index d52fa6a..bc17e59 100644 --- a/cpdftruetype.mli +++ b/cpdftruetype.mli @@ -27,3 +27,6 @@ type t = additional characters in the font. You should supply a subset (a list of unicode codepoints whose corresponding glyphs are required). *) val parse : subset:int list -> Pdfio.bytes -> Pdftext.encoding -> t list + +(** Return the list of cmaps from a font file (used for PDF/UA verification). *) +val cmaps : Pdfio.bytes -> (int * int) list diff --git a/cpdfua.ml b/cpdfua.ml index 35730c3..bd1520a 100644 --- a/cpdfua.ml +++ b/cpdfua.ml @@ -925,19 +925,6 @@ let matterhorn_31_015 _ _ pdf = let matterhorn_31_016 _ _ pdf = unimpl () -(* A non-symbolic TrueType font is used for rendering, but none of the cmap - entries in the embedded font program is a non-symbolic cmap. *) -let matterhorn_31_017 _ _ pdf = - unimpl () - -(* A non-symbolic TrueType font is used for rendering, but for at least one - glyph to be rendered the glyph cannot be looked up by any of the - non-symbolic cmap entries in the embedded font program. *) -let matterhorn_31_018 _ _ pdf = - unimpl () - -(* The font dictionary for a non-symbolic TrueType font does not contain an - Encoding entry. *) let is_non_symbolic pdf o = match Pdf.lookup_direct pdf "/FontDescriptor" o with | Some fd -> @@ -947,6 +934,43 @@ let is_non_symbolic pdf o = end | None -> true +let truetype_fontfile pdf o = + match Pdf.lookup_chain pdf o ["/FontDescriptor"; "/FontFile2"] with + | Some (Pdf.Stream s) -> + Pdfcodec.decode_pdfstream_until_unknown pdf (Pdf.Stream s); + begin match s with + | {contents = (_, Pdf.Got bs)} -> Some bs + | _ -> None + end + | _ -> None + +(* A non-symbolic TrueType font is used for rendering, but none of the cmap + entries in the embedded font program is a non-symbolic cmap. *) +let matterhorn_31_017 _ _ pdf = + Pdf.objiter + (fun _ o -> + match Pdf.lookup_direct pdf "/Subtype" o with + | Some (Pdf.Name "/TrueType") -> + if not (is_non_symbolic pdf o) then + let fontfile = truetype_fontfile pdf o in + if fontfile = None then () else + let cmaps = Cpdftruetype.cmaps (unopt fontfile) in + (*iter (fun (x, y) -> Printf.printf "%i, %i\n" x y) cmaps;*) + (* Must all be symbolic *) + if (List.for_all (function (1, 8) | (3, 0) -> true | _ -> false) cmaps) then merror () + else + () + | _ -> ()) + pdf + +(* A non-symbolic TrueType font is used for rendering, but for at least one + glyph to be rendered the glyph cannot be looked up by any of the + non-symbolic cmap entries in the embedded font program. *) +let matterhorn_31_018 _ _ pdf = + unimpl () + +(* The font dictionary for a non-symbolic TrueType font does not contain an + Encoding entry. *) let matterhorn_31_019 _ _ pdf = Pdf.objiter (fun _ o -> @@ -1025,7 +1049,19 @@ let matterhorn_31_022 _ _ pdf = TrueType font dictionary but the embedded font program does not contain a (3,1) Microsoft Unicode cmap. *) let matterhorn_31_023 _ _ pdf = - unimpl () + Pdf.objiter + (fun _ o -> + match Pdf.lookup_direct pdf "/Subtype" o, Pdf.lookup_chain pdf o ["/Encoding"; "/Differences"] with + | Some (Pdf.Name "/TrueType"), Some _ -> + if is_non_symbolic pdf o then + let fontfile = truetype_fontfile pdf o in + if fontfile = None then () else + let cmaps = Cpdftruetype.cmaps (unopt fontfile) in + if mem (3, 1) cmaps then () else merror () + else + () + | _ -> ()) + pdf (* The Encoding entry is present in the font dictionary for a symbolic TrueType font. *) @@ -1043,12 +1079,38 @@ let matterhorn_31_024 _ _ pdf = (* The embedded font program for a symbolic TrueType font contains no cmap. *) let matterhorn_31_025 _ _ pdf = - unimpl () + Pdf.objiter + (fun _ o -> + match Pdf.lookup_direct pdf "/Subtype" o with + | Some (Pdf.Name "/TrueType") -> + if not (is_non_symbolic pdf o) then + let fontfile = truetype_fontfile pdf o in + if fontfile = None then () else + let cmaps = Cpdftruetype.cmaps (unopt fontfile) in + (*iter (fun (x, y) -> Printf.printf "%i, %i\n" x y) cmaps;*) + if cmaps = [] then merror () + else + () + | _ -> ()) + pdf (* The embedded font program for a symbolic TrueType font contains more than one cmap, but none of the cmap entries is a (3,0) Microsoft Symbol cmap. *) let matterhorn_31_026 _ _ pdf = - unimpl () + Pdf.objiter + (fun _ o -> + match Pdf.lookup_direct pdf "/Subtype" o with + | Some (Pdf.Name "/TrueType") -> + if true (*not (is_non_symbolic pdf o)*) (*FIXME reinstate test*) then + let fontfile = truetype_fontfile pdf o in + if fontfile = None then () else + let cmaps = Cpdftruetype.cmaps (unopt fontfile) in + (*iter (fun (x, y) -> Printf.printf "%i, %i\n" x y) cmaps;*) + if length cmaps > 1 && not (mem (3, 0) cmaps) then merror () + else + () + | _ -> ()) + pdf (* A font dictionary does not contain the ToUnicode entry and none of the following is true: the font uses MacRomanEncoding, MacExpertEncoding or @@ -1228,6 +1290,8 @@ let matterhorn = ("31-030", "One or more characters used in text showing operators reference the .notdef glyph.", "UA1:7.21.8-1", matterhorn_31_030); ] +(* FIXME Allow the use of just a single test, and expose it in cpdf command line *) + let test_matterhorn pdf = (* A circularity in the role map prevents all structure checks, so we do it first at stop if it fails. *) let circularity_error =