cpdf-source/cpdffont.ml

349 lines
14 KiB
OCaml
Raw Normal View History

2021-10-18 17:22:38 +02:00
open Pdfutil
2021-11-12 23:09:49 +01:00
open Cpdferror
open Pdfio
2021-10-18 17:22:38 +02:00
2021-10-18 17:12:10 +02:00
(* Embed missing fonts with Ghostscript. *)
let embed_missing_fonts path_to_ghostscript gs_quiet fi fo =
if path_to_ghostscript = "" then begin
2023-04-25 14:45:56 +02:00
Pdfe.log "Please supply path to gs with -gs\n";
2021-10-18 17:12:10 +02:00
exit 2
end;
let gscall =
2023-02-21 15:50:07 +01:00
Filename.quote_command path_to_ghostscript
((if gs_quiet then ["-dQUIET"] else []) @
["-dNOPAUSE";
"-sDEVICE=pdfwrite";
("-sOUTPUTFILE=" ^ fo);
"-dBATCH";
fi])
2021-10-18 17:12:10 +02:00
in
match Sys.command gscall with
| 0 -> exit 0
2023-04-25 14:45:56 +02:00
| _ -> Pdfe.log "Font embedding failed.\n"; exit 2
2021-10-18 17:22:38 +02:00
(* Copy a font from [frompdf] with name [fontname] on page [fontpage] to [pdf] on all pages in [range] *)
let copy_font frompdf fontname fontpage range pdf =
match Pdf.renumber_pdfs [frompdf; pdf] with
| [] | [_] | _::_::_::_ -> assert false
| [frompdf; pdf] ->
(* 1. Get fontpage *)
let frompdf_pages = Pdfpage.pages_of_pagetree frompdf in
let frompdf_page =
try select fontpage frompdf_pages with
Not_found -> failwith "copy_font: Page not found in input pdf"
in
(* 2. Extract font *)
let fonts =
match Pdf.lookup_direct frompdf "/Font" frompdf_page.Pdfpage.resources with
| Some f -> f
| None -> failwith "copy_font: font not found"
in
let fromfont =
match Pdf.lookup_direct frompdf fontname fonts with
| Some f -> f
| None -> failwith "copy_font: font not found"
in
let basefontname =
match Pdf.lookup_direct frompdf "/BaseFont" fromfont with
| Some (Pdf.Name n) -> n
| _ -> "/CopyFontAddedNoName"
in
(* 3. Get all objects forming font (except main /Font one) *)
let objnumbers = Pdf.objects_referenced [] [] frompdf fromfont in
(* 4. Copy them to from frompdf to pdf. *)
iter (function objnum -> Pdf.addobj_given_num pdf (objnum, Pdf.lookup_obj frompdf objnum)) objnumbers;
(* 5. Get pages from pdf *)
let pdf_pages = Pdfpage.pages_of_pagetree pdf in
(* 6. Add the font to pages in range *)
let pages' =
map
(function (page, pagenum) ->
if mem pagenum range then
let font =
match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
| Some f -> f
| None -> Pdf.Dictionary []
in
let font' =
match font with
| (Pdf.Dictionary _) as d ->
Pdf.add_dict_entry d basefontname fromfont
| _ -> failwith "copy_font: error"
in
let resources' =
Pdf.add_dict_entry page.Pdfpage.resources "/Font" font'
in
{page with
Pdfpage.resources = resources'}
else page)
(combine pdf_pages (indx pdf_pages));
in
(* 7. Put the pages back into the pdf, and return *)
let pdf, root = Pdfpage.add_pagetree pages' pdf in
Pdfpage.add_root root [] pdf
(* Missing Fonts *)
let is_missing pdf dict =
2024-07-02 19:17:00 +02:00
match Pdf.lookup_direct pdf "/Subtype" dict with
| Some (Pdf.Name "/Type3") -> false
| _ ->
match Pdf.lookup_direct pdf "/FontDescriptor" dict with
| None -> true
| Some d ->
match Pdf.lookup_direct pdf "/FontFile" d with
2021-10-18 17:22:38 +02:00
| Some _ -> false
| None ->
2024-07-02 19:17:00 +02:00
match Pdf.lookup_direct pdf "/FontFile2" d with
2021-10-18 17:22:38 +02:00
| Some _ -> false
2024-07-02 19:17:00 +02:00
| None ->
match Pdf.lookup_direct pdf "/FontFile3" d with
| Some _ -> false
| None -> true
2021-10-18 17:22:38 +02:00
2024-06-08 14:41:18 +02:00
let missing_font ?l pdf page (name, dict) =
2021-10-18 17:22:38 +02:00
if is_missing pdf dict then
let subtype =
match Pdf.lookup_direct pdf "/Subtype" dict with
| Some (Pdf.Name n) -> n
| _ -> ""
and basefont =
match Pdf.lookup_direct pdf "/BaseFont" dict with
| Some (Pdf.Name n) -> n
| _ -> ""
and encoding =
match Pdf.lookup_direct pdf "/Encoding" dict with
| Some (Pdf.Name n) -> n
2024-06-08 14:41:18 +02:00
| _ -> "Built-in"
2021-10-18 17:22:38 +02:00
in
2024-06-08 14:41:18 +02:00
match l with
| None -> Printf.printf "%i, %s, %s, %s, %s\n" page name subtype basefont encoding
2024-07-02 19:12:18 +02:00
| Some r -> r := (page, name, subtype, basefont, encoding)::!r
2021-10-18 17:22:38 +02:00
2024-06-08 14:41:18 +02:00
let missing_fonts ?l pdf range =
2021-12-21 14:44:46 +01:00
Cpdfpage.iter_pages
2021-10-18 17:22:38 +02:00
(fun num page ->
match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
| Some (Pdf.Dictionary fontdict) ->
(* Extract descendant fonts *)
let name_dict_pairs =
flatten
(map
(fun (name, dict) ->
match Pdf.lookup_direct pdf "/DescendantFonts" dict with
| Some (Pdf.Array desc_fonts) -> map (fun d -> name, d) desc_fonts
| _ -> [(name, dict)])
fontdict)
in
2024-06-08 14:41:18 +02:00
iter (missing_font ?l pdf num) name_dict_pairs
2021-10-18 17:22:38 +02:00
| _ -> ())
pdf
range
2021-11-12 23:09:49 +01:00
2024-06-08 14:41:18 +02:00
let missing_fonts_return pdf range =
let l = ref [] in
missing_fonts ~l pdf range;
!l
2021-11-12 23:09:49 +01:00
let print_font_table pdf fontname pagenumber =
let page = try List.nth (Pdfpage.pages_of_pagetree pdf) (pagenumber - 1) with e -> error "page not found" in
match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
| Some fontdict ->
let font =
begin match Pdf.lookup_direct pdf fontname fontdict with
| Some font -> font
| None ->
(* For each item in the fontdict, follow its value and find the basename. If it matches, return that font *)
let font = ref None in
iter
(fun (k, v) ->
match Pdf.lookup_direct pdf "/BaseFont" v with
| Some (Pdf.Name n) when n = fontname -> font := Some v
| _ -> ())
(match fontdict with Pdf.Dictionary d -> d | _ -> []);
match !font with Some f -> f | None -> failwith (Printf.sprintf "print_font_encoding: font %s not found" fontname)
end
in
let pdftextfont = Pdftext.read_font pdf font in
let charset =
match pdftextfont with
| Pdftext.SimpleFont {Pdftext.fontdescriptor = Some {Pdftext.charset = Some cs}} -> Some cs
| _ -> None
in
2021-12-02 21:04:14 +01:00
let extractor = Pdftext.text_extractor_of_font_real pdftextfont in
2021-11-12 23:09:49 +01:00
let unicodedata = Cpdfunicodedata.unicodedata () in
let unicodetable = Hashtbl.create 16000 in
iter
(fun x ->
Hashtbl.add
unicodetable
(int_of_string ("0x" ^ x.Cpdfunicodedata.code_value))
(x.Cpdfunicodedata.code_value,
x.Cpdfunicodedata.general_category,
x.Cpdfunicodedata.character_name,
x.Cpdfunicodedata.iso_10646_comment_field))
unicodedata;
for x = 0 to 255 do
let str =
(if Pdftext.is_identity_h pdftextfont then "\000" else "")
^ string_of_char (char_of_int x)
in
let codepoints = Pdftext.codepoints_of_text extractor str in
let unicodenumber, unicodename, is_control =
match codepoints with
| [c] ->
begin try
let codeval, category, character_name, comment = Hashtbl.find unicodetable c in
codeval, character_name, category = "Cc"
with
Not_found -> "", "", false
end
| _ -> "***multiple", "***multiple", false
in
let utf8 = if is_control then "<nonprintable>" else Pdftext.utf8_of_codepoints codepoints in
let glyphnames = fold_left ( ^ ) "" (Pdftext.glyphnames_of_text extractor str) in
let is_in_charset s = match charset with None -> true | Some cs -> mem s cs in
if glyphnames <> ".notdef" && is_in_charset glyphnames then
Printf.printf
"%i = U+%s (%s - %s) = %s\n" x unicodenumber utf8 unicodename glyphnames
done
| _ -> failwith "addtext: font not found for width"
2023-11-02 19:49:15 +01:00
let extract_fontfile pagenumber fontname filename pdf =
2021-11-12 23:09:49 +01:00
let resources = (select pagenumber (Pdfpage.pages_of_pagetree pdf)).Pdfpage.resources in
match Pdf.lookup_direct pdf "/Font" resources with
| None -> failwith "extract_fontfile: font not found"
| Some fonts ->
2022-10-04 19:55:06 +02:00
let fontobj = Pdf.lookup_fail ("no font " ^ fontname) pdf fontname fonts in
2021-11-12 23:09:49 +01:00
let font = Pdftext.read_font pdf fontobj in
match font with
| Pdftext.CIDKeyedFont (_, {Pdftext.cid_fontdescriptor = {Pdftext.fontfile = Some fontfile}}, _)
| Pdftext.SimpleFont {Pdftext.fontdescriptor = Some {Pdftext.fontfile = Some fontfile}} ->
begin let objnum =
match fontfile with
2023-11-02 19:49:15 +01:00
| Pdftext.FontFile i | Pdftext.FontFile2 i | Pdftext.FontFile3 i -> i
2021-11-12 23:09:49 +01:00
in
match Pdf.lookup_obj pdf objnum with
| Pdf.Stream s as obj ->
Pdfcodec.decode_pdfstream pdf obj;
begin match s with
| {contents = (_, Pdf.Got bytes)} ->
2023-11-02 19:49:15 +01:00
let fh = open_out_bin filename in
2021-11-12 23:09:49 +01:00
for x = 0 to bytes_size bytes - 1 do output_byte fh (bget bytes x) done;
2023-11-02 19:49:15 +01:00
close_out fh
2021-11-12 23:09:49 +01:00
| _ -> failwith "extract_fontfile"
end
| _ -> failwith "extract_fontfile"
end
| _ -> failwith "unsupported or unfound font"
(* Remove Embedded fonts. This is done by removing the Font Descriptor. *)
let remove_fontdescriptor pdf = function
| Pdf.Dictionary d as font ->
begin match lookup "/Type" d with
| Some (Pdf.Name "/Font") ->
(match Pdf.lookup_direct pdf "/FontDescriptor" font with
| Some fontdes ->
let fontdescriptor' =
Pdf.remove_dict_entry
(Pdf.remove_dict_entry
(Pdf.remove_dict_entry fontdes "/FontFile")
"/FontFile2")
"/FontFile3"
in
Pdf.add_dict_entry font "/FontDescriptor" (Pdf.Indirect (Pdf.addobj pdf fontdescriptor'))
| _ -> font)
| _ -> font
end
| x -> x
let remove_fonts pdf =
Pdf.objiter (fun k v -> ignore (Pdf.addobj_given_num pdf (k, remove_fontdescriptor pdf v))) pdf;
pdf
2021-12-21 14:44:46 +01:00
(* List fonts *)
let list_font pdf page (name, dict) =
let subtype =
match Pdf.lookup_direct pdf "/Subtype" dict with
| Some (Pdf.Name n) -> Pdfwrite.string_of_pdf (Pdf.Name n)
| _ -> ""
in let basefont =
match Pdf.lookup_direct pdf "/BaseFont" dict with
| Some (Pdf.Name n) -> Pdfwrite.string_of_pdf (Pdf.Name n)
| _ -> ""
in let encoding =
match Pdf.lookup_direct pdf "/Encoding" dict with
| Some (Pdf.Name n) -> Pdfwrite.string_of_pdf (Pdf.Name n)
| _ -> ""
in
(page, name, subtype, basefont, encoding)
(* List the fonts used in an xobject, and in any of the xobjects it has. Do not
process an xobject twice. *)
let xobjs_processed = null_hash ()
let rec list_fonts_xobject pdf pagenum xobjname xobjnum =
match Hashtbl.find_opt xobjs_processed xobjnum with
| None ->
let from_xobjs =
match Pdf.lookup_direct pdf "/Resources" (Pdf.lookup_obj pdf xobjnum) with
| Some r ->
begin match Pdf.lookup_direct pdf "/XObject" r with
| Some (Pdf.Dictionary xobjs) ->
flatten (option_map (function (n, Pdf.Indirect i) -> Some (list_fonts_xobject pdf pagenum (xobjname ^ n) i) | _ -> None) xobjs)
| _ -> []
end
| _ -> []
in
begin match Pdf.lookup_direct pdf "/Resources" (Pdf.lookup_obj pdf xobjnum) with
| Some r ->
begin match Pdf.lookup_direct pdf "/Font" r with
| Some (Pdf.Dictionary fonts) -> map (list_font pdf pagenum) (map (function (n, f) -> (xobjname ^ n, f)) fonts) @ from_xobjs
| _ -> from_xobjs
end
| None -> from_xobjs
end
| Some _ ->
Hashtbl.add xobjs_processed xobjnum ();
[]
2021-12-21 14:44:46 +01:00
let list_fonts pdf range =
Hashtbl.clear xobjs_processed;
2021-12-21 14:44:46 +01:00
let pages = Pdfpage.pages_of_pagetree pdf in
flatten
(map
(fun (num, page) ->
if mem num range then
let from_xobjs =
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
| Some (Pdf.Dictionary xobjs) ->
flatten (option_map (function (n, Pdf.Indirect i) -> Some (list_fonts_xobject pdf num n i) | _ -> None) xobjs)
| _ -> []
in
begin match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
| Some (Pdf.Dictionary fontdict) ->
map (list_font pdf num) fontdict @ from_xobjs
| _ -> from_xobjs
end
2021-12-21 14:44:46 +01:00
else
[])
(combine (ilist 1 (length pages)) pages))
let string_of_font (p, n, s, b, e) =
Printf.sprintf "%i %s %s %s %s\n" p n s b e
2023-10-31 17:23:20 +01:00
let json_of_font (pagenum, name, subtype, basefont, encoding) =
`Assoc
[("page", `Int pagenum);
("name", `String name);
2023-11-08 12:36:34 +01:00
("subtype", if subtype = "" then `Null else `String subtype);
("basefont", if basefont = "" then `Null else `String basefont);
("encoding", if encoding = "" then `Null else `String encoding)]
2021-12-21 14:44:46 +01:00
2024-02-07 21:41:53 +01:00
let json_fonts pdf range =
`List (map json_of_font (list_fonts pdf range))
2023-10-31 17:23:20 +01:00
let print_fonts ?(json=false) pdf range =
if json
then flprint (Cpdfyojson.Safe.pretty_to_string (`List (map json_of_font (list_fonts pdf range))))
else flprint (fold_left ( ^ ) "" (map string_of_font (list_fonts pdf range)))