This commit is contained in:
John Whitington 2021-11-12 14:09:49 -08:00
parent 98e619baca
commit f6631aa101
4 changed files with 139 additions and 130 deletions

View File

@ -21,6 +21,7 @@ o Environment variable CPDF_REPRODUCIBLE_DATES for testing
o Environment variable CPDF_DEBUG for -debug
o Effectively make stderr unbuffered
o Prepend NOT to a page range to invert it
o Split functionality into separate modules, but retain cpdf.ml for interface
2.4 (June 2021)

View File

@ -3081,30 +3081,6 @@ let copy_box f t mediabox_if_missing pdf range =
pdf
range
(* Remove Embedded fonts. This is done by removing the Font Descriptor. *)
let remove_fontdescriptor pdf = function
| Pdf.Dictionary d as font ->
begin match lookup "/Type" d with
| Some (Pdf.Name "/Font") ->
(match Pdf.lookup_direct pdf "/FontDescriptor" font with
| Some fontdes ->
let fontdescriptor' =
Pdf.remove_dict_entry
(Pdf.remove_dict_entry
(Pdf.remove_dict_entry fontdes "/FontFile")
"/FontFile2")
"/FontFile3"
in
Pdf.add_dict_entry font "/FontDescriptor" (Pdf.Indirect (Pdf.addobj pdf fontdescriptor'))
| _ -> font)
| _ -> font
end
| x -> x
let remove_fonts pdf =
Pdf.objiter (fun k v -> ignore (Pdf.addobj_given_num pdf (k, remove_fontdescriptor pdf v))) pdf;
pdf
let dump_attachment out pdf (_, embeddedfile) =
match Pdf.lookup_direct pdf "/F" embeddedfile with
| Some (Pdf.String s) ->
@ -3198,45 +3174,6 @@ let remove_unused_resources_page pdf n page =
let remove_unused_resources pdf =
Cpdf.process_pages (ppstub (remove_unused_resources_page pdf)) pdf (ilist 1 (Pdfpage.endpage pdf))
(* Extracts font to font.dat in CWD. *)
let extract_fontfile pagenumber fontname pdf =
let resources = (select pagenumber (Pdfpage.pages_of_pagetree pdf)).Pdfpage.resources in
match Pdf.lookup_direct pdf "/Font" resources with
| None -> failwith "extract_fontfile: font not found"
| Some fonts ->
let fontobj = Pdf.lookup_fail "no /Fonts" pdf fontname fonts in
let font = Pdftext.read_font pdf fontobj in
match font with
| Pdftext.CIDKeyedFont (_, {Pdftext.cid_fontdescriptor = {Pdftext.fontfile = Some fontfile}}, _)
| Pdftext.SimpleFont {Pdftext.fontdescriptor = Some {Pdftext.fontfile = Some fontfile}} ->
begin let objnum =
match fontfile with
| Pdftext.FontFile i -> i
| Pdftext.FontFile2 i -> i
| Pdftext.FontFile3 i -> i
in
match Pdf.lookup_obj pdf objnum with
| Pdf.Stream s as obj ->
Pdfcodec.decode_pdfstream pdf obj;
begin match s with
| {contents = (_, Pdf.Got bytes)} ->
let fh = open_out_bin "font.dat" in
for x = 0 to bytes_size bytes - 1 do output_byte fh (bget bytes x) done;
close_out fh;
(* Now try to read using Pdfcff module *)
(*let font = Pdftruetype.to_type3 pdf font in*)
(*let extractor = Pdftext.text_extractor_of_font pdf fontobj in*)
(*flprint "glyph names for incodes 0,1,2,3...";
iter print_string (Pdftext.glyphnames_of_text extractor "\000\001\002\003\004\005\006\007");
flprint "\n";*)
()
| _ -> failwith "extract_fontfile"
end
| _ -> failwith "extract_fontfile"
end
| _ -> failwith "unsupported or unfound font"
let print_spot_colour n s =
Printf.printf "%i %s\n" n s
@ -3330,70 +3267,6 @@ let collate (names, pdfs, ranges) =
done;
split3 (rev !nis)
let print_font_encoding pdf fontname pagenumber =
let page = try List.nth (Pdfpage.pages_of_pagetree pdf) (pagenumber - 1) with e -> error "page not found" in
match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
| Some fontdict ->
let font =
begin match Pdf.lookup_direct pdf fontname fontdict with
| Some font -> font
| None ->
(* For each item in the fontdict, follow its value and find the basename. If it matches, return that font *)
let font = ref None in
iter
(fun (k, v) ->
match Pdf.lookup_direct pdf "/BaseFont" v with
| Some (Pdf.Name n) when n = fontname -> font := Some v
| _ -> ())
(match fontdict with Pdf.Dictionary d -> d | _ -> []);
match !font with Some f -> f | None -> failwith (Printf.sprintf "print_font_encoding: font %s not found" fontname)
end
in
let pdftextfont = Pdftext.read_font pdf font in
let charset =
match pdftextfont with
| Pdftext.SimpleFont {Pdftext.fontdescriptor = Some {Pdftext.charset = Some cs}} -> Some cs
| _ -> None
in
let extractor = Pdftext.text_extractor_of_font pdf font in
let unicodedata = Cpdfunicodedata.unicodedata () in
let unicodetable = Hashtbl.create 16000 in
iter
(fun x ->
Hashtbl.add
unicodetable
(int_of_string ("0x" ^ x.Cpdfunicodedata.code_value))
(x.Cpdfunicodedata.code_value,
x.Cpdfunicodedata.general_category,
x.Cpdfunicodedata.character_name,
x.Cpdfunicodedata.iso_10646_comment_field))
unicodedata;
for x = 0 to 255 do
let str =
(if Pdftext.is_identity_h pdftextfont then "\000" else "")
^ string_of_char (char_of_int x)
in
let codepoints = Pdftext.codepoints_of_text extractor str in
let unicodenumber, unicodename, is_control =
match codepoints with
| [c] ->
begin try
let codeval, category, character_name, comment = Hashtbl.find unicodetable c in
codeval, character_name, category = "Cc"
with
Not_found -> "", "", false
end
| _ -> "***multiple", "***multiple", false
in
let utf8 = if is_control then "<nonprintable>" else Pdftext.utf8_of_codepoints codepoints in
let glyphnames = fold_left ( ^ ) "" (Pdftext.glyphnames_of_text extractor str) in
let is_in_charset s = match charset with None -> true | Some cs -> mem s cs in
if glyphnames <> ".notdef" && is_in_charset glyphnames then
Printf.printf
"%i = U+%s (%s - %s) = %s\n" x unicodenumber utf8 unicodename glyphnames
done
| _ -> failwith "addtext: font not found for width"
(* Main function *)
let go () =
match args.op with
@ -3498,7 +3371,7 @@ let go () =
begin match args.inputs, args.out with
| (_, pagespec, _, _, _, _)::_, _ ->
let pdf = get_single_pdf (Some RemoveFonts) false in
write_pdf true (remove_fonts pdf)
write_pdf true (Cpdffont.remove_fonts pdf)
| _ -> error "remove fonts: bad command line"
end
| Some ExtractFontFile ->
@ -3512,7 +3385,7 @@ let go () =
| Some x -> x
| None -> failwith "extract fontfile: no font name given"
in
extract_fontfile page name pdf
Cpdffont.extract_fontfile page name pdf
| _ -> error "extract fontfile: bad command line"
end
| Some CountPages ->
@ -4313,7 +4186,7 @@ let go () =
write_pdf false pdf
| Some (PrintFontEncoding fontname) ->
let pdf = get_single_pdf args.op true in
print_font_encoding pdf fontname args.copyfontpage
Cpdffont.print_font_table pdf fontname args.copyfontpage
(* Advise the user if a combination of command line flags makes little sense,
or error out if it make no sense at all. *)

View File

@ -1,4 +1,6 @@
open Pdfutil
open Cpdferror
open Pdfio
(* Embed missing fonts with Ghostscript. *)
let embed_missing_fonts path_to_ghostscript gs_quiet fi fo =
@ -128,3 +130,130 @@ let missing_fonts pdf range =
| _ -> ())
pdf
range
let print_font_table pdf fontname pagenumber =
let page = try List.nth (Pdfpage.pages_of_pagetree pdf) (pagenumber - 1) with e -> error "page not found" in
match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
| Some fontdict ->
let font =
begin match Pdf.lookup_direct pdf fontname fontdict with
| Some font -> font
| None ->
(* For each item in the fontdict, follow its value and find the basename. If it matches, return that font *)
let font = ref None in
iter
(fun (k, v) ->
match Pdf.lookup_direct pdf "/BaseFont" v with
| Some (Pdf.Name n) when n = fontname -> font := Some v
| _ -> ())
(match fontdict with Pdf.Dictionary d -> d | _ -> []);
match !font with Some f -> f | None -> failwith (Printf.sprintf "print_font_encoding: font %s not found" fontname)
end
in
let pdftextfont = Pdftext.read_font pdf font in
let charset =
match pdftextfont with
| Pdftext.SimpleFont {Pdftext.fontdescriptor = Some {Pdftext.charset = Some cs}} -> Some cs
| _ -> None
in
let extractor = Pdftext.text_extractor_of_font pdf font in
let unicodedata = Cpdfunicodedata.unicodedata () in
let unicodetable = Hashtbl.create 16000 in
iter
(fun x ->
Hashtbl.add
unicodetable
(int_of_string ("0x" ^ x.Cpdfunicodedata.code_value))
(x.Cpdfunicodedata.code_value,
x.Cpdfunicodedata.general_category,
x.Cpdfunicodedata.character_name,
x.Cpdfunicodedata.iso_10646_comment_field))
unicodedata;
for x = 0 to 255 do
let str =
(if Pdftext.is_identity_h pdftextfont then "\000" else "")
^ string_of_char (char_of_int x)
in
let codepoints = Pdftext.codepoints_of_text extractor str in
let unicodenumber, unicodename, is_control =
match codepoints with
| [c] ->
begin try
let codeval, category, character_name, comment = Hashtbl.find unicodetable c in
codeval, character_name, category = "Cc"
with
Not_found -> "", "", false
end
| _ -> "***multiple", "***multiple", false
in
let utf8 = if is_control then "<nonprintable>" else Pdftext.utf8_of_codepoints codepoints in
let glyphnames = fold_left ( ^ ) "" (Pdftext.glyphnames_of_text extractor str) in
let is_in_charset s = match charset with None -> true | Some cs -> mem s cs in
if glyphnames <> ".notdef" && is_in_charset glyphnames then
Printf.printf
"%i = U+%s (%s - %s) = %s\n" x unicodenumber utf8 unicodename glyphnames
done
| _ -> failwith "addtext: font not found for width"
(* Extracts font to font.dat in CWD. *)
let extract_fontfile pagenumber fontname pdf =
let resources = (select pagenumber (Pdfpage.pages_of_pagetree pdf)).Pdfpage.resources in
match Pdf.lookup_direct pdf "/Font" resources with
| None -> failwith "extract_fontfile: font not found"
| Some fonts ->
let fontobj = Pdf.lookup_fail "no /Fonts" pdf fontname fonts in
let font = Pdftext.read_font pdf fontobj in
match font with
| Pdftext.CIDKeyedFont (_, {Pdftext.cid_fontdescriptor = {Pdftext.fontfile = Some fontfile}}, _)
| Pdftext.SimpleFont {Pdftext.fontdescriptor = Some {Pdftext.fontfile = Some fontfile}} ->
begin let objnum =
match fontfile with
| Pdftext.FontFile i -> i
| Pdftext.FontFile2 i -> i
| Pdftext.FontFile3 i -> i
in
match Pdf.lookup_obj pdf objnum with
| Pdf.Stream s as obj ->
Pdfcodec.decode_pdfstream pdf obj;
begin match s with
| {contents = (_, Pdf.Got bytes)} ->
let fh = open_out_bin "font.dat" in
for x = 0 to bytes_size bytes - 1 do output_byte fh (bget bytes x) done;
close_out fh;
(* Now try to read using Pdfcff module *)
(*let font = Pdftruetype.to_type3 pdf font in*)
(*let extractor = Pdftext.text_extractor_of_font pdf fontobj in*)
(*flprint "glyph names for incodes 0,1,2,3...";
iter print_string (Pdftext.glyphnames_of_text extractor "\000\001\002\003\004\005\006\007");
flprint "\n";*)
()
| _ -> failwith "extract_fontfile"
end
| _ -> failwith "extract_fontfile"
end
| _ -> failwith "unsupported or unfound font"
(* Remove Embedded fonts. This is done by removing the Font Descriptor. *)
let remove_fontdescriptor pdf = function
| Pdf.Dictionary d as font ->
begin match lookup "/Type" d with
| Some (Pdf.Name "/Font") ->
(match Pdf.lookup_direct pdf "/FontDescriptor" font with
| Some fontdes ->
let fontdescriptor' =
Pdf.remove_dict_entry
(Pdf.remove_dict_entry
(Pdf.remove_dict_entry fontdes "/FontFile")
"/FontFile2")
"/FontFile3"
in
Pdf.add_dict_entry font "/FontDescriptor" (Pdf.Indirect (Pdf.addobj pdf fontdescriptor'))
| _ -> font)
| _ -> font
end
| x -> x
let remove_fonts pdf =
Pdf.objiter (fun k v -> ignore (Pdf.addobj_given_num pdf (k, remove_fontdescriptor pdf v))) pdf;
pdf

View File

@ -3,3 +3,9 @@ val embed_missing_fonts : string -> bool -> string -> string -> unit
val copy_font : Pdf.t -> string -> int -> int list -> Pdf.t -> Pdf.t
val missing_fonts : Pdf.t -> int list -> unit
val print_font_table : Pdf.t -> string -> int -> unit
val extract_fontfile : int -> string -> Pdf.t -> unit
val remove_fonts : Pdf.t -> Pdf.t