From 930b162969214fcededd37d008d349cf9452551d Mon Sep 17 00:00:00 2001
From: John Whitington <john@coherentgraphics.co.uk>
Date: Tue, 28 Jan 2025 16:13:59 +0800
Subject: [PATCH] Skeleton for returning actual fonts in list_fonts

---
 Changes      | 11 +++++++----
 cpdffont.ml  |  6 +++---
 cpdffont.mli |  2 +-
 cpdfua.ml    | 17 ++++++++++++-----
 4 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/Changes b/Changes
index b641d4a..0b0d3ec 100644
--- a/Changes
+++ b/Changes
@@ -3,17 +3,20 @@
 Extended features:
 
 o -obj JSON output
-o -obj/-obj-json can follow a chain from an object number
-o -obj/-obj-json can explore through arrays and name/number trees
-o -print-dict-entry, -remove-dict-entry and -replace-dict-entry
+* -obj/-obj-json can follow a chain from an object number
+* -obj/-obj-json can explore through arrays and name/number trees
+* -print-dict-entry, -remove-dict-entry and -replace-dict-entry
 can follow a chain from each found dictionary entry
-o More of the PDF/UA Matterhorn verification suite implemented
+* More of the PDF/UA Matterhorn verification suite implemented
+* Font lister now returns the font itself
 
 Fixes:
 
 o Harden auto-compression against malformed streams
 o Add backup JPEG dimensions method in Cpdfimage
 
+* = Supported by a grant from NLnet
+
 2.8 (December 2024)
 
 New features:
diff --git a/cpdffont.ml b/cpdffont.ml
index 2f2558c..117f061 100644
--- a/cpdffont.ml
+++ b/cpdffont.ml
@@ -268,7 +268,7 @@ let list_font pdf page (name, dict) =
     | Some (Pdf.Name n) -> Pdfwrite.string_of_pdf (Pdf.Name n)
     | _ -> ""
   in 
-    (page, name, subtype, basefont, encoding)
+    (page, name, subtype, basefont, encoding, Pdf.Null)
 
 (* List the fonts used in an xobject, and in any of the xobjects it has. Do not
    process an xobject twice. *)
@@ -321,10 +321,10 @@ let list_fonts pdf range =
              [])
         (combine (ilist 1 (length pages)) pages))
 
-let string_of_font (p, n, s, b, e) =
+let string_of_font (p, n, s, b, e, _) =
   Printf.sprintf "%i %s %s %s %s\n" p n s b e
 
-let json_of_font (pagenum, name, subtype, basefont, encoding) =
+let json_of_font (pagenum, name, subtype, basefont, encoding, _) =
   `Assoc
     [("page", `Int pagenum);
      ("name", `String name);
diff --git a/cpdffont.mli b/cpdffont.mli
index 406401a..107221a 100644
--- a/cpdffont.mli
+++ b/cpdffont.mli
@@ -6,7 +6,7 @@
 val print_fonts : ?json:bool -> Pdf.t -> int list -> unit
 
 (** Return font list. Page number, name, subtype, basefont, encoding.  *)
-val list_fonts : Pdf.t -> int list -> (int * string * string * string * string) list
+val list_fonts : Pdf.t -> int list -> (int * string * string * string * string * Pdf.pdfobject) list
 
 (** Return font list in JSON format *)
 val json_fonts : Pdf.t -> int list -> Cpdfyojson.Safe.t
diff --git a/cpdfua.ml b/cpdfua.ml
index 7e917a4..d73e358 100644
--- a/cpdfua.ml
+++ b/cpdfua.ml
@@ -468,6 +468,7 @@ in
       if not (List.for_all (mem' allowed_names) names) then merror ()
   in
   let check_font font =
+    Printf.printf "Check font: %s\n" (Pdfwrite.string_of_pdf font);
     match Pdf.lookup_direct pdf "/ToUnicode" font with
     | Some _ -> (* a) *) ()
     | _ ->
@@ -485,12 +486,18 @@ in
               unimpl ()
             | _ -> merror ()
   in
+
+    (* FIXME: Do not iterate over all objects: fonts referenced only from
+    within AcroForms need not support text extraction, and fonts may be direct
+    objects with no object number at all. Instead, obtain the list of fonts
+    from the file just as -list-fonts does, and check those fonts. *)
     Pdf.objiter
-      (fun _ o ->
-         match Pdf.lookup_direct pdf "/Type" o, Pdf.lookup_direct pdf "/Subtype" o with
-         | Some (Pdf.Name "/Font"), Some (Pdf.Name ("/CIDFontType0" | "/CIDFontType2")) -> ()
-         | Some (Pdf.Name "/Font"), _ -> check_font o
-         | _ -> ())
+      (fun o _ ->
+         let o = Pdf.lookup_obj pdf o in
+           match Pdf.lookup_direct pdf "/Type" o, Pdf.lookup_direct pdf "/Subtype" o with
+           | Some (Pdf.Name "/Font"), Some (Pdf.Name ("/CIDFontType0" | "/CIDFontType2")) -> ()
+           | Some (Pdf.Name "/Font"), _ -> check_font o
+           | _ -> ())
       pdf
 
 (* If the top-level /Lang is present, that rules all and is sufficient. *)