mirror of
https://github.com/johnwhitington/cpdf-source.git
synced 2025-02-17 04:10:48 +01:00
more
This commit is contained in:
parent
98e619baca
commit
f6631aa101
1
Changes
1
Changes
@ -21,6 +21,7 @@ o Environment variable CPDF_REPRODUCIBLE_DATES for testing
|
|||||||
o Environment variable CPDF_DEBUG for -debug
|
o Environment variable CPDF_DEBUG for -debug
|
||||||
o Effectively make stderr unbuffered
|
o Effectively make stderr unbuffered
|
||||||
o Prepend NOT to a page range to invert it
|
o Prepend NOT to a page range to invert it
|
||||||
|
o Split functionality into separate modules, but retain cpdf.ml for interface
|
||||||
|
|
||||||
2.4 (June 2021)
|
2.4 (June 2021)
|
||||||
|
|
||||||
|
133
cpdfcommand.ml
133
cpdfcommand.ml
@ -3081,30 +3081,6 @@ let copy_box f t mediabox_if_missing pdf range =
|
|||||||
pdf
|
pdf
|
||||||
range
|
range
|
||||||
|
|
||||||
(* Remove Embedded fonts. This is done by removing the Font Descriptor. *)
|
|
||||||
let remove_fontdescriptor pdf = function
|
|
||||||
| Pdf.Dictionary d as font ->
|
|
||||||
begin match lookup "/Type" d with
|
|
||||||
| Some (Pdf.Name "/Font") ->
|
|
||||||
(match Pdf.lookup_direct pdf "/FontDescriptor" font with
|
|
||||||
| Some fontdes ->
|
|
||||||
let fontdescriptor' =
|
|
||||||
Pdf.remove_dict_entry
|
|
||||||
(Pdf.remove_dict_entry
|
|
||||||
(Pdf.remove_dict_entry fontdes "/FontFile")
|
|
||||||
"/FontFile2")
|
|
||||||
"/FontFile3"
|
|
||||||
in
|
|
||||||
Pdf.add_dict_entry font "/FontDescriptor" (Pdf.Indirect (Pdf.addobj pdf fontdescriptor'))
|
|
||||||
| _ -> font)
|
|
||||||
| _ -> font
|
|
||||||
end
|
|
||||||
| x -> x
|
|
||||||
|
|
||||||
let remove_fonts pdf =
|
|
||||||
Pdf.objiter (fun k v -> ignore (Pdf.addobj_given_num pdf (k, remove_fontdescriptor pdf v))) pdf;
|
|
||||||
pdf
|
|
||||||
|
|
||||||
let dump_attachment out pdf (_, embeddedfile) =
|
let dump_attachment out pdf (_, embeddedfile) =
|
||||||
match Pdf.lookup_direct pdf "/F" embeddedfile with
|
match Pdf.lookup_direct pdf "/F" embeddedfile with
|
||||||
| Some (Pdf.String s) ->
|
| Some (Pdf.String s) ->
|
||||||
@ -3198,45 +3174,6 @@ let remove_unused_resources_page pdf n page =
|
|||||||
let remove_unused_resources pdf =
|
let remove_unused_resources pdf =
|
||||||
Cpdf.process_pages (ppstub (remove_unused_resources_page pdf)) pdf (ilist 1 (Pdfpage.endpage pdf))
|
Cpdf.process_pages (ppstub (remove_unused_resources_page pdf)) pdf (ilist 1 (Pdfpage.endpage pdf))
|
||||||
|
|
||||||
(* Extracts font to font.dat in CWD. *)
|
|
||||||
let extract_fontfile pagenumber fontname pdf =
|
|
||||||
let resources = (select pagenumber (Pdfpage.pages_of_pagetree pdf)).Pdfpage.resources in
|
|
||||||
match Pdf.lookup_direct pdf "/Font" resources with
|
|
||||||
| None -> failwith "extract_fontfile: font not found"
|
|
||||||
| Some fonts ->
|
|
||||||
let fontobj = Pdf.lookup_fail "no /Fonts" pdf fontname fonts in
|
|
||||||
let font = Pdftext.read_font pdf fontobj in
|
|
||||||
match font with
|
|
||||||
| Pdftext.CIDKeyedFont (_, {Pdftext.cid_fontdescriptor = {Pdftext.fontfile = Some fontfile}}, _)
|
|
||||||
| Pdftext.SimpleFont {Pdftext.fontdescriptor = Some {Pdftext.fontfile = Some fontfile}} ->
|
|
||||||
begin let objnum =
|
|
||||||
match fontfile with
|
|
||||||
| Pdftext.FontFile i -> i
|
|
||||||
| Pdftext.FontFile2 i -> i
|
|
||||||
| Pdftext.FontFile3 i -> i
|
|
||||||
in
|
|
||||||
match Pdf.lookup_obj pdf objnum with
|
|
||||||
| Pdf.Stream s as obj ->
|
|
||||||
Pdfcodec.decode_pdfstream pdf obj;
|
|
||||||
begin match s with
|
|
||||||
| {contents = (_, Pdf.Got bytes)} ->
|
|
||||||
let fh = open_out_bin "font.dat" in
|
|
||||||
for x = 0 to bytes_size bytes - 1 do output_byte fh (bget bytes x) done;
|
|
||||||
close_out fh;
|
|
||||||
(* Now try to read using Pdfcff module *)
|
|
||||||
(*let font = Pdftruetype.to_type3 pdf font in*)
|
|
||||||
(*let extractor = Pdftext.text_extractor_of_font pdf fontobj in*)
|
|
||||||
(*flprint "glyph names for incodes 0,1,2,3...";
|
|
||||||
iter print_string (Pdftext.glyphnames_of_text extractor "\000\001\002\003\004\005\006\007");
|
|
||||||
flprint "\n";*)
|
|
||||||
()
|
|
||||||
| _ -> failwith "extract_fontfile"
|
|
||||||
end
|
|
||||||
| _ -> failwith "extract_fontfile"
|
|
||||||
end
|
|
||||||
| _ -> failwith "unsupported or unfound font"
|
|
||||||
|
|
||||||
|
|
||||||
let print_spot_colour n s =
|
let print_spot_colour n s =
|
||||||
Printf.printf "%i %s\n" n s
|
Printf.printf "%i %s\n" n s
|
||||||
|
|
||||||
@ -3330,70 +3267,6 @@ let collate (names, pdfs, ranges) =
|
|||||||
done;
|
done;
|
||||||
split3 (rev !nis)
|
split3 (rev !nis)
|
||||||
|
|
||||||
let print_font_encoding pdf fontname pagenumber =
|
|
||||||
let page = try List.nth (Pdfpage.pages_of_pagetree pdf) (pagenumber - 1) with e -> error "page not found" in
|
|
||||||
match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
|
|
||||||
| Some fontdict ->
|
|
||||||
let font =
|
|
||||||
begin match Pdf.lookup_direct pdf fontname fontdict with
|
|
||||||
| Some font -> font
|
|
||||||
| None ->
|
|
||||||
(* For each item in the fontdict, follow its value and find the basename. If it matches, return that font *)
|
|
||||||
let font = ref None in
|
|
||||||
iter
|
|
||||||
(fun (k, v) ->
|
|
||||||
match Pdf.lookup_direct pdf "/BaseFont" v with
|
|
||||||
| Some (Pdf.Name n) when n = fontname -> font := Some v
|
|
||||||
| _ -> ())
|
|
||||||
(match fontdict with Pdf.Dictionary d -> d | _ -> []);
|
|
||||||
match !font with Some f -> f | None -> failwith (Printf.sprintf "print_font_encoding: font %s not found" fontname)
|
|
||||||
end
|
|
||||||
in
|
|
||||||
let pdftextfont = Pdftext.read_font pdf font in
|
|
||||||
let charset =
|
|
||||||
match pdftextfont with
|
|
||||||
| Pdftext.SimpleFont {Pdftext.fontdescriptor = Some {Pdftext.charset = Some cs}} -> Some cs
|
|
||||||
| _ -> None
|
|
||||||
in
|
|
||||||
let extractor = Pdftext.text_extractor_of_font pdf font in
|
|
||||||
let unicodedata = Cpdfunicodedata.unicodedata () in
|
|
||||||
let unicodetable = Hashtbl.create 16000 in
|
|
||||||
iter
|
|
||||||
(fun x ->
|
|
||||||
Hashtbl.add
|
|
||||||
unicodetable
|
|
||||||
(int_of_string ("0x" ^ x.Cpdfunicodedata.code_value))
|
|
||||||
(x.Cpdfunicodedata.code_value,
|
|
||||||
x.Cpdfunicodedata.general_category,
|
|
||||||
x.Cpdfunicodedata.character_name,
|
|
||||||
x.Cpdfunicodedata.iso_10646_comment_field))
|
|
||||||
unicodedata;
|
|
||||||
for x = 0 to 255 do
|
|
||||||
let str =
|
|
||||||
(if Pdftext.is_identity_h pdftextfont then "\000" else "")
|
|
||||||
^ string_of_char (char_of_int x)
|
|
||||||
in
|
|
||||||
let codepoints = Pdftext.codepoints_of_text extractor str in
|
|
||||||
let unicodenumber, unicodename, is_control =
|
|
||||||
match codepoints with
|
|
||||||
| [c] ->
|
|
||||||
begin try
|
|
||||||
let codeval, category, character_name, comment = Hashtbl.find unicodetable c in
|
|
||||||
codeval, character_name, category = "Cc"
|
|
||||||
with
|
|
||||||
Not_found -> "", "", false
|
|
||||||
end
|
|
||||||
| _ -> "***multiple", "***multiple", false
|
|
||||||
in
|
|
||||||
let utf8 = if is_control then "<nonprintable>" else Pdftext.utf8_of_codepoints codepoints in
|
|
||||||
let glyphnames = fold_left ( ^ ) "" (Pdftext.glyphnames_of_text extractor str) in
|
|
||||||
let is_in_charset s = match charset with None -> true | Some cs -> mem s cs in
|
|
||||||
if glyphnames <> ".notdef" && is_in_charset glyphnames then
|
|
||||||
Printf.printf
|
|
||||||
"%i = U+%s (%s - %s) = %s\n" x unicodenumber utf8 unicodename glyphnames
|
|
||||||
done
|
|
||||||
| _ -> failwith "addtext: font not found for width"
|
|
||||||
|
|
||||||
(* Main function *)
|
(* Main function *)
|
||||||
let go () =
|
let go () =
|
||||||
match args.op with
|
match args.op with
|
||||||
@ -3498,7 +3371,7 @@ let go () =
|
|||||||
begin match args.inputs, args.out with
|
begin match args.inputs, args.out with
|
||||||
| (_, pagespec, _, _, _, _)::_, _ ->
|
| (_, pagespec, _, _, _, _)::_, _ ->
|
||||||
let pdf = get_single_pdf (Some RemoveFonts) false in
|
let pdf = get_single_pdf (Some RemoveFonts) false in
|
||||||
write_pdf true (remove_fonts pdf)
|
write_pdf true (Cpdffont.remove_fonts pdf)
|
||||||
| _ -> error "remove fonts: bad command line"
|
| _ -> error "remove fonts: bad command line"
|
||||||
end
|
end
|
||||||
| Some ExtractFontFile ->
|
| Some ExtractFontFile ->
|
||||||
@ -3512,7 +3385,7 @@ let go () =
|
|||||||
| Some x -> x
|
| Some x -> x
|
||||||
| None -> failwith "extract fontfile: no font name given"
|
| None -> failwith "extract fontfile: no font name given"
|
||||||
in
|
in
|
||||||
extract_fontfile page name pdf
|
Cpdffont.extract_fontfile page name pdf
|
||||||
| _ -> error "extract fontfile: bad command line"
|
| _ -> error "extract fontfile: bad command line"
|
||||||
end
|
end
|
||||||
| Some CountPages ->
|
| Some CountPages ->
|
||||||
@ -4313,7 +4186,7 @@ let go () =
|
|||||||
write_pdf false pdf
|
write_pdf false pdf
|
||||||
| Some (PrintFontEncoding fontname) ->
|
| Some (PrintFontEncoding fontname) ->
|
||||||
let pdf = get_single_pdf args.op true in
|
let pdf = get_single_pdf args.op true in
|
||||||
print_font_encoding pdf fontname args.copyfontpage
|
Cpdffont.print_font_table pdf fontname args.copyfontpage
|
||||||
|
|
||||||
(* Advise the user if a combination of command line flags makes little sense,
|
(* Advise the user if a combination of command line flags makes little sense,
|
||||||
or error out if it make no sense at all. *)
|
or error out if it make no sense at all. *)
|
||||||
|
129
cpdffont.ml
129
cpdffont.ml
@ -1,4 +1,6 @@
|
|||||||
open Pdfutil
|
open Pdfutil
|
||||||
|
open Cpdferror
|
||||||
|
open Pdfio
|
||||||
|
|
||||||
(* Embed missing fonts with Ghostscript. *)
|
(* Embed missing fonts with Ghostscript. *)
|
||||||
let embed_missing_fonts path_to_ghostscript gs_quiet fi fo =
|
let embed_missing_fonts path_to_ghostscript gs_quiet fi fo =
|
||||||
@ -128,3 +130,130 @@ let missing_fonts pdf range =
|
|||||||
| _ -> ())
|
| _ -> ())
|
||||||
pdf
|
pdf
|
||||||
range
|
range
|
||||||
|
|
||||||
|
let print_font_table pdf fontname pagenumber =
|
||||||
|
let page = try List.nth (Pdfpage.pages_of_pagetree pdf) (pagenumber - 1) with e -> error "page not found" in
|
||||||
|
match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
|
||||||
|
| Some fontdict ->
|
||||||
|
let font =
|
||||||
|
begin match Pdf.lookup_direct pdf fontname fontdict with
|
||||||
|
| Some font -> font
|
||||||
|
| None ->
|
||||||
|
(* For each item in the fontdict, follow its value and find the basename. If it matches, return that font *)
|
||||||
|
let font = ref None in
|
||||||
|
iter
|
||||||
|
(fun (k, v) ->
|
||||||
|
match Pdf.lookup_direct pdf "/BaseFont" v with
|
||||||
|
| Some (Pdf.Name n) when n = fontname -> font := Some v
|
||||||
|
| _ -> ())
|
||||||
|
(match fontdict with Pdf.Dictionary d -> d | _ -> []);
|
||||||
|
match !font with Some f -> f | None -> failwith (Printf.sprintf "print_font_encoding: font %s not found" fontname)
|
||||||
|
end
|
||||||
|
in
|
||||||
|
let pdftextfont = Pdftext.read_font pdf font in
|
||||||
|
let charset =
|
||||||
|
match pdftextfont with
|
||||||
|
| Pdftext.SimpleFont {Pdftext.fontdescriptor = Some {Pdftext.charset = Some cs}} -> Some cs
|
||||||
|
| _ -> None
|
||||||
|
in
|
||||||
|
let extractor = Pdftext.text_extractor_of_font pdf font in
|
||||||
|
let unicodedata = Cpdfunicodedata.unicodedata () in
|
||||||
|
let unicodetable = Hashtbl.create 16000 in
|
||||||
|
iter
|
||||||
|
(fun x ->
|
||||||
|
Hashtbl.add
|
||||||
|
unicodetable
|
||||||
|
(int_of_string ("0x" ^ x.Cpdfunicodedata.code_value))
|
||||||
|
(x.Cpdfunicodedata.code_value,
|
||||||
|
x.Cpdfunicodedata.general_category,
|
||||||
|
x.Cpdfunicodedata.character_name,
|
||||||
|
x.Cpdfunicodedata.iso_10646_comment_field))
|
||||||
|
unicodedata;
|
||||||
|
for x = 0 to 255 do
|
||||||
|
let str =
|
||||||
|
(if Pdftext.is_identity_h pdftextfont then "\000" else "")
|
||||||
|
^ string_of_char (char_of_int x)
|
||||||
|
in
|
||||||
|
let codepoints = Pdftext.codepoints_of_text extractor str in
|
||||||
|
let unicodenumber, unicodename, is_control =
|
||||||
|
match codepoints with
|
||||||
|
| [c] ->
|
||||||
|
begin try
|
||||||
|
let codeval, category, character_name, comment = Hashtbl.find unicodetable c in
|
||||||
|
codeval, character_name, category = "Cc"
|
||||||
|
with
|
||||||
|
Not_found -> "", "", false
|
||||||
|
end
|
||||||
|
| _ -> "***multiple", "***multiple", false
|
||||||
|
in
|
||||||
|
let utf8 = if is_control then "<nonprintable>" else Pdftext.utf8_of_codepoints codepoints in
|
||||||
|
let glyphnames = fold_left ( ^ ) "" (Pdftext.glyphnames_of_text extractor str) in
|
||||||
|
let is_in_charset s = match charset with None -> true | Some cs -> mem s cs in
|
||||||
|
if glyphnames <> ".notdef" && is_in_charset glyphnames then
|
||||||
|
Printf.printf
|
||||||
|
"%i = U+%s (%s - %s) = %s\n" x unicodenumber utf8 unicodename glyphnames
|
||||||
|
done
|
||||||
|
| _ -> failwith "addtext: font not found for width"
|
||||||
|
|
||||||
|
(* Extracts font to font.dat in CWD. *)
|
||||||
|
let extract_fontfile pagenumber fontname pdf =
|
||||||
|
let resources = (select pagenumber (Pdfpage.pages_of_pagetree pdf)).Pdfpage.resources in
|
||||||
|
match Pdf.lookup_direct pdf "/Font" resources with
|
||||||
|
| None -> failwith "extract_fontfile: font not found"
|
||||||
|
| Some fonts ->
|
||||||
|
let fontobj = Pdf.lookup_fail "no /Fonts" pdf fontname fonts in
|
||||||
|
let font = Pdftext.read_font pdf fontobj in
|
||||||
|
match font with
|
||||||
|
| Pdftext.CIDKeyedFont (_, {Pdftext.cid_fontdescriptor = {Pdftext.fontfile = Some fontfile}}, _)
|
||||||
|
| Pdftext.SimpleFont {Pdftext.fontdescriptor = Some {Pdftext.fontfile = Some fontfile}} ->
|
||||||
|
begin let objnum =
|
||||||
|
match fontfile with
|
||||||
|
| Pdftext.FontFile i -> i
|
||||||
|
| Pdftext.FontFile2 i -> i
|
||||||
|
| Pdftext.FontFile3 i -> i
|
||||||
|
in
|
||||||
|
match Pdf.lookup_obj pdf objnum with
|
||||||
|
| Pdf.Stream s as obj ->
|
||||||
|
Pdfcodec.decode_pdfstream pdf obj;
|
||||||
|
begin match s with
|
||||||
|
| {contents = (_, Pdf.Got bytes)} ->
|
||||||
|
let fh = open_out_bin "font.dat" in
|
||||||
|
for x = 0 to bytes_size bytes - 1 do output_byte fh (bget bytes x) done;
|
||||||
|
close_out fh;
|
||||||
|
(* Now try to read using Pdfcff module *)
|
||||||
|
(*let font = Pdftruetype.to_type3 pdf font in*)
|
||||||
|
(*let extractor = Pdftext.text_extractor_of_font pdf fontobj in*)
|
||||||
|
(*flprint "glyph names for incodes 0,1,2,3...";
|
||||||
|
iter print_string (Pdftext.glyphnames_of_text extractor "\000\001\002\003\004\005\006\007");
|
||||||
|
flprint "\n";*)
|
||||||
|
()
|
||||||
|
| _ -> failwith "extract_fontfile"
|
||||||
|
end
|
||||||
|
| _ -> failwith "extract_fontfile"
|
||||||
|
end
|
||||||
|
| _ -> failwith "unsupported or unfound font"
|
||||||
|
|
||||||
|
|
||||||
|
(* Remove Embedded fonts. This is done by removing the Font Descriptor. *)
|
||||||
|
let remove_fontdescriptor pdf = function
|
||||||
|
| Pdf.Dictionary d as font ->
|
||||||
|
begin match lookup "/Type" d with
|
||||||
|
| Some (Pdf.Name "/Font") ->
|
||||||
|
(match Pdf.lookup_direct pdf "/FontDescriptor" font with
|
||||||
|
| Some fontdes ->
|
||||||
|
let fontdescriptor' =
|
||||||
|
Pdf.remove_dict_entry
|
||||||
|
(Pdf.remove_dict_entry
|
||||||
|
(Pdf.remove_dict_entry fontdes "/FontFile")
|
||||||
|
"/FontFile2")
|
||||||
|
"/FontFile3"
|
||||||
|
in
|
||||||
|
Pdf.add_dict_entry font "/FontDescriptor" (Pdf.Indirect (Pdf.addobj pdf fontdescriptor'))
|
||||||
|
| _ -> font)
|
||||||
|
| _ -> font
|
||||||
|
end
|
||||||
|
| x -> x
|
||||||
|
|
||||||
|
let remove_fonts pdf =
|
||||||
|
Pdf.objiter (fun k v -> ignore (Pdf.addobj_given_num pdf (k, remove_fontdescriptor pdf v))) pdf;
|
||||||
|
pdf
|
||||||
|
@ -3,3 +3,9 @@ val embed_missing_fonts : string -> bool -> string -> string -> unit
|
|||||||
val copy_font : Pdf.t -> string -> int -> int list -> Pdf.t -> Pdf.t
|
val copy_font : Pdf.t -> string -> int -> int list -> Pdf.t -> Pdf.t
|
||||||
|
|
||||||
val missing_fonts : Pdf.t -> int list -> unit
|
val missing_fonts : Pdf.t -> int list -> unit
|
||||||
|
|
||||||
|
val print_font_table : Pdf.t -> string -> int -> unit
|
||||||
|
|
||||||
|
val extract_fontfile : int -> string -> Pdf.t -> unit
|
||||||
|
|
||||||
|
val remove_fonts : Pdf.t -> Pdf.t
|
||||||
|
Loading…
x
Reference in New Issue
Block a user