From 930b162969214fcededd37d008d349cf9452551d Mon Sep 17 00:00:00 2001 From: John Whitington Date: Tue, 28 Jan 2025 16:13:59 +0800 Subject: [PATCH] Skeleton for returning actual fonts in list_fonts --- Changes | 11 +++++++---- cpdffont.ml | 6 +++--- cpdffont.mli | 2 +- cpdfua.ml | 17 ++++++++++++----- 4 files changed, 23 insertions(+), 13 deletions(-) diff --git a/Changes b/Changes index b641d4a..0b0d3ec 100644 --- a/Changes +++ b/Changes @@ -3,17 +3,20 @@ Extended features: o -obj JSON output -o -obj/-obj-json can follow a chain from an object number -o -obj/-obj-json can explore through arrays and name/number trees -o -print-dict-entry, -remove-dict-entry and -replace-dict-entry +* -obj/-obj-json can follow a chain from an object number +* -obj/-obj-json can explore through arrays and name/number trees +* -print-dict-entry, -remove-dict-entry and -replace-dict-entry can follow a chain from each found dictionary entry -o More of the PDF/UA Matterhorn verification suite implemented +* More of the PDF/UA Matterhorn verification suite implemented +* Font lister now returns the font itself Fixes: o Harden auto-compression against malformed streams o Add backup JPEG dimensions method in Cpdfimage +* = Supported by a grant from NLnet + 2.8 (December 2024) New features: diff --git a/cpdffont.ml b/cpdffont.ml index 2f2558c..117f061 100644 --- a/cpdffont.ml +++ b/cpdffont.ml @@ -268,7 +268,7 @@ let list_font pdf page (name, dict) = | Some (Pdf.Name n) -> Pdfwrite.string_of_pdf (Pdf.Name n) | _ -> "" in - (page, name, subtype, basefont, encoding) + (page, name, subtype, basefont, encoding, Pdf.Null) (* List the fonts used in an xobject, and in any of the xobjects it has. Do not process an xobject twice. 
*) @@ -321,10 +321,10 @@ let list_fonts pdf range = []) (combine (ilist 1 (length pages)) pages)) -let string_of_font (p, n, s, b, e) = +let string_of_font (p, n, s, b, e, _) = Printf.sprintf "%i %s %s %s %s\n" p n s b e -let json_of_font (pagenum, name, subtype, basefont, encoding) = +let json_of_font (pagenum, name, subtype, basefont, encoding, _) = `Assoc [("page", `Int pagenum); ("name", `String name); diff --git a/cpdffont.mli b/cpdffont.mli index 406401a..107221a 100644 --- a/cpdffont.mli +++ b/cpdffont.mli @@ -6,7 +6,7 @@ val print_fonts : ?json:bool -> Pdf.t -> int list -> unit (** Return font list. Page number, name, subtype, basefont, encoding. *) -val list_fonts : Pdf.t -> int list -> (int * string * string * string * string) list +val list_fonts : Pdf.t -> int list -> (int * string * string * string * string * Pdf.pdfobject) list (** Return font list in JSON format *) val json_fonts : Pdf.t -> int list -> Cpdfyojson.Safe.t diff --git a/cpdfua.ml b/cpdfua.ml index 7e917a4..d73e358 100644 --- a/cpdfua.ml +++ b/cpdfua.ml @@ -468,6 +468,7 @@ in if not (List.for_all (mem' allowed_names) names) then merror () in let check_font font = + Printf.printf "Check font: %s\n" (Pdfwrite.string_of_pdf font); match Pdf.lookup_direct pdf "/ToUnicode" font with | Some _ -> (* a) *) () | _ -> @@ -485,12 +486,18 @@ in unimpl () | _ -> merror () in + + (* FIXME We should not iterate over all object numbers here, because text + extraction need not be possible on fonts referenced only from within + AcroForms. Also, fonts may be direct objects with no object number at all. + So, instead, obtain the list of fonts from the file just as -list-fonts does, and check those fonts. 
*) Pdf.objiter - (fun _ o -> - match Pdf.lookup_direct pdf "/Type" o, Pdf.lookup_direct pdf "/Subtype" o with - | Some (Pdf.Name "/Font"), Some (Pdf.Name ("/CIDFontType0" | "/CIDFontType2")) -> () - | Some (Pdf.Name "/Font"), _ -> check_font o - | _ -> ()) + (fun o _ -> + let o = Pdf.lookup_obj pdf o in + match Pdf.lookup_direct pdf "/Type" o, Pdf.lookup_direct pdf "/Subtype" o with + | Some (Pdf.Name "/Font"), Some (Pdf.Name ("/CIDFontType0" | "/CIDFontType2")) -> () + | Some (Pdf.Name "/Font"), _ -> check_font o + | _ -> ()) pdf (* If the top-level /Lang is present, that rules all and is sufficient. *)