Skeleton for returning actual fonts in list_fonts

This commit is contained in:
John Whitington
2025-01-28 16:13:59 +08:00
parent d05e9aa411
commit 930b162969
4 changed files with 23 additions and 13 deletions

11
Changes
View File

@@ -3,17 +3,20 @@
Extended features: Extended features:
o -obj JSON output o -obj JSON output
o -obj/-obj-json can follow a chain from an object number * -obj/-obj-json can follow a chain from an object number
o -obj/-obj-json can explore through arrays and name/number trees * -obj/-obj-json can explore through arrays and name/number trees
o -print-dict-entry, -remove-dict-entry and -replace-dict-entry * -print-dict-entry, -remove-dict-entry and -replace-dict-entry
can follow a chain from each found dictionary entry can follow a chain from each found dictionary entry
o More of the PDF/UA Matterhorn verification suite implemented * More of the PDF/UA Matterhorn verification suite implemented
* Font lister now returns the font itself
Fixes: Fixes:
o Harden auto-compression against malformed streams o Harden auto-compression against malformed streams
o Add backup JPEG dimensions method in Cpdfimage o Add backup JPEG dimensions method in Cpdfimage
* = Supported by a grant from NLnet
2.8 (December 2024) 2.8 (December 2024)
New features: New features:

View File

@@ -268,7 +268,7 @@ let list_font pdf page (name, dict) =
| Some (Pdf.Name n) -> Pdfwrite.string_of_pdf (Pdf.Name n) | Some (Pdf.Name n) -> Pdfwrite.string_of_pdf (Pdf.Name n)
| _ -> "" | _ -> ""
in in
(page, name, subtype, basefont, encoding) (page, name, subtype, basefont, encoding, Pdf.Null)
(* List the fonts used in an xobject, and in any of the xobjects it has. Do not (* List the fonts used in an xobject, and in any of the xobjects it has. Do not
process an xobject twice. *) process an xobject twice. *)
@@ -321,10 +321,10 @@ let list_fonts pdf range =
[]) [])
(combine (ilist 1 (length pages)) pages)) (combine (ilist 1 (length pages)) pages))
let string_of_font (p, n, s, b, e) = let string_of_font (p, n, s, b, e, _) =
Printf.sprintf "%i %s %s %s %s\n" p n s b e Printf.sprintf "%i %s %s %s %s\n" p n s b e
let json_of_font (pagenum, name, subtype, basefont, encoding) = let json_of_font (pagenum, name, subtype, basefont, encoding, _) =
`Assoc `Assoc
[("page", `Int pagenum); [("page", `Int pagenum);
("name", `String name); ("name", `String name);

View File

@@ -6,7 +6,7 @@
val print_fonts : ?json:bool -> Pdf.t -> int list -> unit val print_fonts : ?json:bool -> Pdf.t -> int list -> unit
(** Return font list. Page number, name, subtype, basefont, encoding. *) (** Return font list. Page number, name, subtype, basefont, encoding. *)
val list_fonts : Pdf.t -> int list -> (int * string * string * string * string) list val list_fonts : Pdf.t -> int list -> (int * string * string * string * string * Pdf.pdfobject) list
(** Return font list in JSON format *) (** Return font list in JSON format *)
val json_fonts : Pdf.t -> int list -> Cpdfyojson.Safe.t val json_fonts : Pdf.t -> int list -> Cpdfyojson.Safe.t

View File

@@ -468,6 +468,7 @@ in
if not (List.for_all (mem' allowed_names) names) then merror () if not (List.for_all (mem' allowed_names) names) then merror ()
in in
let check_font font = let check_font font =
Printf.printf "Check font: %s\n" (Pdfwrite.string_of_pdf font);
match Pdf.lookup_direct pdf "/ToUnicode" font with match Pdf.lookup_direct pdf "/ToUnicode" font with
| Some _ -> (* a) *) () | Some _ -> (* a) *) ()
| _ -> | _ ->
@@ -485,12 +486,18 @@ in
unimpl () unimpl ()
| _ -> merror () | _ -> merror ()
in in
(* FIXME: Do not iterate over every object number. Text extraction need not
be possible for fonts referenced only from within AcroForms, and fonts may be
direct objects with no object number at all. Instead, obtain the list of
fonts in the file just as -list-fonts does, and check those fonts. *)
Pdf.objiter Pdf.objiter
(fun _ o -> (fun o _ ->
match Pdf.lookup_direct pdf "/Type" o, Pdf.lookup_direct pdf "/Subtype" o with let o = Pdf.lookup_obj pdf o in
| Some (Pdf.Name "/Font"), Some (Pdf.Name ("/CIDFontType0" | "/CIDFontType2")) -> () match Pdf.lookup_direct pdf "/Type" o, Pdf.lookup_direct pdf "/Subtype" o with
| Some (Pdf.Name "/Font"), _ -> check_font o | Some (Pdf.Name "/Font"), Some (Pdf.Name ("/CIDFontType0" | "/CIDFontType2")) -> ()
| _ -> ()) | Some (Pdf.Name "/Font"), _ -> check_font o
| _ -> ())
pdf pdf
(* If the top-level /Lang is present, that rules all and is sufficient. *) (* If the top-level /Lang is present, that rules all and is sufficient. *)