Expand font extraction to xobject fonts

This commit is contained in:
John Whitington 2024-11-21 14:47:44 +00:00
parent 1d2aab34f4
commit 3bb5019c46

View File

@ -208,33 +208,46 @@ let print_font_table pdf fontname pagenumber =
done done
| _ -> failwith "addtext: font not found for width" | _ -> failwith "addtext: font not found for width"
let font_from_name pdf fontname pagenumber =
try
let resources = ref (select pagenumber (Pdfpage.pages_of_pagetree pdf)).Pdfpage.resources in
let chain = ref (tl (String.split_on_char '/' fontname)) in
let font = ref Pdf.Null in
while !chain <> [] do
match !chain with
| [f] ->
font := unopt (Pdf.lookup_chain pdf !resources ["/Font"; ("/" ^ f)]);
chain := []
| x::xs ->
resources := unopt (Pdf.lookup_chain pdf !resources ["/XObject"; "/" ^ x; "/Resources"]);
chain := xs
| [] -> ()
done;
!font
with
_ -> Pdfe.log (Printf.sprintf "Not found: font %s on page %i\n" fontname pagenumber); Pdf.Null
let extract_fontfile pagenumber fontname filename pdf = let extract_fontfile pagenumber fontname filename pdf =
let resources = (select pagenumber (Pdfpage.pages_of_pagetree pdf)).Pdfpage.resources in match Pdftext.read_font pdf (font_from_name pdf fontname pagenumber) with
match Pdf.lookup_direct pdf "/Font" resources with | Pdftext.CIDKeyedFont (_, {Pdftext.cid_fontdescriptor = {Pdftext.fontfile = Some fontfile}}, _)
| None -> failwith "extract_fontfile: font not found" | Pdftext.SimpleFont {Pdftext.fontdescriptor = Some {Pdftext.fontfile = Some fontfile}} ->
| Some fonts -> begin let objnum =
let fontobj = Pdf.lookup_fail ("no font " ^ fontname) pdf fontname fonts in match fontfile with
let font = Pdftext.read_font pdf fontobj in | Pdftext.FontFile i | Pdftext.FontFile2 i | Pdftext.FontFile3 i -> i
match font with in
| Pdftext.CIDKeyedFont (_, {Pdftext.cid_fontdescriptor = {Pdftext.fontfile = Some fontfile}}, _) match Pdf.lookup_obj pdf objnum with
| Pdftext.SimpleFont {Pdftext.fontdescriptor = Some {Pdftext.fontfile = Some fontfile}} -> | Pdf.Stream s as obj ->
begin let objnum = Pdfcodec.decode_pdfstream pdf obj;
match fontfile with begin match s with
| Pdftext.FontFile i | Pdftext.FontFile2 i | Pdftext.FontFile3 i -> i | {contents = (_, Pdf.Got bytes)} ->
in let fh = open_out_bin filename in
match Pdf.lookup_obj pdf objnum with for x = 0 to bytes_size bytes - 1 do output_byte fh (bget bytes x) done;
| Pdf.Stream s as obj -> close_out fh
Pdfcodec.decode_pdfstream pdf obj; | _ -> failwith "extract_fontfile"
begin match s with end
| {contents = (_, Pdf.Got bytes)} -> | _ -> failwith "extract_fontfile"
let fh = open_out_bin filename in end
for x = 0 to bytes_size bytes - 1 do output_byte fh (bget bytes x) done; | _ -> failwith "unsupported or unfound font"
close_out fh
| _ -> failwith "extract_fontfile"
end
| _ -> failwith "extract_fontfile"
end
| _ -> failwith "unsupported or unfound font"
(* Remove Embedded fonts. This is done by removing the Font Descriptor. *) (* Remove Embedded fonts. This is done by removing the Font Descriptor. *)
let remove_fontdescriptor pdf = function let remove_fontdescriptor pdf = function