Rehabilitate -extract-font

This commit is contained in:
John Whitington 2023-11-02 18:49:15 +00:00
parent 35bf9f14d7
commit 8a1fdc23a4
4 changed files with 26 additions and 30 deletions

View File

@ -8,6 +8,7 @@ o List document and page info in JSON format
o List page labels in JSON format o List page labels in JSON format
o List fonts in JSON format o List fonts in JSON format
o Identify PDF/A, PDF/X, PDF/E, PDF/VT, PDF/UA o Identify PDF/A, PDF/X, PDF/E, PDF/VT, PDF/UA
o Extract font files from a document
Extended features: Extended features:

View File

@ -189,7 +189,7 @@ type op =
| ExtractImages | ExtractImages
| ImageResolution of float | ImageResolution of float
| MissingFonts | MissingFonts
| ExtractFontFile | ExtractFontFile of string
| ExtractText | ExtractText
| OpenAtPage of string | OpenAtPage of string
| OpenAtPageFit of string | OpenAtPageFit of string
@ -320,7 +320,7 @@ let string_of_op = function
| ExtractImages -> "ExtractImages" | ExtractImages -> "ExtractImages"
| ImageResolution _ -> "ImageResolution" | ImageResolution _ -> "ImageResolution"
| MissingFonts -> "MissingFonts" | MissingFonts -> "MissingFonts"
| ExtractFontFile -> "ExtractFontFile" | ExtractFontFile _ -> "ExtractFontFile"
| ExtractText -> "ExtractText" | ExtractText -> "ExtractText"
| OpenAtPage _ -> "OpenAtPage" | OpenAtPage _ -> "OpenAtPage"
| OpenAtPageFit _ -> "OpenAtPageFit" | OpenAtPageFit _ -> "OpenAtPageFit"
@ -837,7 +837,7 @@ let banned banlist = function
| ShowBoxes | TrimMarks | CreateMetadata | SetMetadataDate _ | SetVersion _ | ShowBoxes | TrimMarks | CreateMetadata | SetMetadataDate _ | SetVersion _
| SetAuthor _|SetTitle _|SetSubject _|SetKeywords _|SetCreate _ | SetAuthor _|SetTitle _|SetSubject _|SetKeywords _|SetCreate _
| SetModify _|SetCreator _|SetProducer _|RemoveDictEntry _ | ReplaceDictEntry _ | PrintDictEntry _ | SetMetadata _ | SetModify _|SetCreator _|SetProducer _|RemoveDictEntry _ | ReplaceDictEntry _ | PrintDictEntry _ | SetMetadata _
| ExtractText | ExtractImages | ExtractFontFile | ExtractText | ExtractImages | ExtractFontFile _
| AddPageLabels | RemovePageLabels | OutputJSON | OCGCoalesce | AddPageLabels | RemovePageLabels | OutputJSON | OCGCoalesce
| OCGRename | OCGList | OCGOrderAll | PrintFontEncoding _ | TableOfContents | Typeset _ | Composition _ | OCGRename | OCGList | OCGOrderAll | PrintFontEncoding _ | TableOfContents | Typeset _ | Composition _
| TextWidth _ | SetAnnotations _ | CopyAnnotations _ | TextWidth _ | SetAnnotations _ | CopyAnnotations _
@ -1871,6 +1871,9 @@ let settextwidth s =
let setdraw () = let setdraw () =
args.op <- Some Draw args.op <- Some Draw
let setextractfontfile s =
args.op <- Some (ExtractFontFile s)
let () = Cpdfdrawcontrol.getfontname := fun () -> args.fontname let () = Cpdfdrawcontrol.getfontname := fun () -> args.fontname
let () = Cpdfdrawcontrol.getfontsize := fun () -> args.fontsize let () = Cpdfdrawcontrol.getfontsize := fun () -> args.fontsize
let () = Cpdfdrawcontrol.setfontname := setfont let () = Cpdfdrawcontrol.setfontname := setfont
@ -2668,6 +2671,9 @@ and specs =
("-print-font-table-page", ("-print-font-table-page",
Arg.Int setfontpage, Arg.Int setfontpage,
" Set page for -print-font-table"); " Set page for -print-font-table");
("-extract-font",
Arg.String setextractfontfile,
" Extract a font");
("-table-of-contents", ("-table-of-contents",
Arg.Unit (setop TableOfContents), Arg.Unit (setop TableOfContents),
" Typeset a table of contents from bookmarks"); " Typeset a table of contents from bookmarks");
@ -2749,7 +2755,7 @@ and specs =
("-debug-stderr-to-stdout", Arg.Unit setstderrtostdout, ""); ("-debug-stderr-to-stdout", Arg.Unit setstderrtostdout, "");
("-stay-on-error", Arg.Unit setstayonerror, ""); ("-stay-on-error", Arg.Unit setstayonerror, "");
(* These items are unfinished *) (* These items are unfinished *)
("-extract-fontfile", Arg.Unit (setop ExtractFontFile), "");
("-extract-text", Arg.Unit (setop ExtractText), ""); ("-extract-text", Arg.Unit (setop ExtractText), "");
("-extract-text-font-size", Arg.Float setextracttextfontsize, ""); ("-extract-text-font-size", Arg.Float setextracttextfontsize, "");
] ]
@ -3439,17 +3445,17 @@ let go () =
write_pdf true (Cpdffont.remove_fonts pdf) write_pdf true (Cpdffont.remove_fonts pdf)
| _ -> error "remove fonts: bad command line" | _ -> error "remove fonts: bad command line"
end end
| Some ExtractFontFile -> | Some (ExtractFontFile spec) ->
begin match args.inputs, args.out with begin match args.inputs, args.out with
| (_, pagespec, u, o, _, _)::_, _ -> | (_, pagespec, u, o, _, _)::_, File filename ->
let pdf = get_single_pdf (Some ExtractFontFile) false in let pdf = get_single_pdf (Some (ExtractFontFile spec)) false in
let page = args.copyfontpage begin match String.split_on_char ',' spec with
and name = | [pnum; name] ->
match args.copyfontname with begin try Cpdffont.extract_fontfile (int_of_string pnum) name filename pdf with
| Some x -> x Failure _ (*"int_of_string"*) -> error "extract font: bad page number"
| None -> failwith "extract fontfile: no font name given" end
in | _ -> error "extract font: bad specification"
Cpdffont.extract_fontfile page name pdf end
| _ -> error "extract fontfile: bad command line" | _ -> error "extract fontfile: bad command line"
end end
| Some CountPages -> | Some CountPages ->

View File

@ -199,8 +199,7 @@ let print_font_table pdf fontname pagenumber =
done done
| _ -> failwith "addtext: font not found for width" | _ -> failwith "addtext: font not found for width"
(* Extracts font to font.dat in CWD. *) let extract_fontfile pagenumber fontname filename pdf =
let extract_fontfile pagenumber fontname pdf =
let resources = (select pagenumber (Pdfpage.pages_of_pagetree pdf)).Pdfpage.resources in let resources = (select pagenumber (Pdfpage.pages_of_pagetree pdf)).Pdfpage.resources in
match Pdf.lookup_direct pdf "/Font" resources with match Pdf.lookup_direct pdf "/Font" resources with
| None -> failwith "extract_fontfile: font not found" | None -> failwith "extract_fontfile: font not found"
@ -212,32 +211,22 @@ let extract_fontfile pagenumber fontname pdf =
| Pdftext.SimpleFont {Pdftext.fontdescriptor = Some {Pdftext.fontfile = Some fontfile}} -> | Pdftext.SimpleFont {Pdftext.fontdescriptor = Some {Pdftext.fontfile = Some fontfile}} ->
begin let objnum = begin let objnum =
match fontfile with match fontfile with
| Pdftext.FontFile i -> i | Pdftext.FontFile i | Pdftext.FontFile2 i | Pdftext.FontFile3 i -> i
| Pdftext.FontFile2 i -> i
| Pdftext.FontFile3 i -> i
in in
match Pdf.lookup_obj pdf objnum with match Pdf.lookup_obj pdf objnum with
| Pdf.Stream s as obj -> | Pdf.Stream s as obj ->
Pdfcodec.decode_pdfstream pdf obj; Pdfcodec.decode_pdfstream pdf obj;
begin match s with begin match s with
| {contents = (_, Pdf.Got bytes)} -> | {contents = (_, Pdf.Got bytes)} ->
let fh = open_out_bin "font.dat" in let fh = open_out_bin filename in
for x = 0 to bytes_size bytes - 1 do output_byte fh (bget bytes x) done; for x = 0 to bytes_size bytes - 1 do output_byte fh (bget bytes x) done;
close_out fh; close_out fh
(* Now try to read using Pdfcff module *)
(*let font = Pdftruetype.to_type3 pdf font in*)
(*let extractor = Pdftext.text_extractor_of_font pdf fontobj in*)
(*flprint "glyph names for incodes 0,1,2,3...";
iter print_string (Pdftext.glyphnames_of_text extractor "\000\001\002\003\004\005\006\007");
flprint "\n";*)
()
| _ -> failwith "extract_fontfile" | _ -> failwith "extract_fontfile"
end end
| _ -> failwith "extract_fontfile" | _ -> failwith "extract_fontfile"
end end
| _ -> failwith "unsupported or unfound font" | _ -> failwith "unsupported or unfound font"
(* Remove Embedded fonts. This is done by removing the Font Descriptor. *) (* Remove Embedded fonts. This is done by removing the Font Descriptor. *)
let remove_fontdescriptor pdf = function let remove_fontdescriptor pdf = function
| Pdf.Dictionary d as font -> | Pdf.Dictionary d as font ->

View File

@ -23,7 +23,7 @@ val missing_fonts : Pdf.t -> int list -> unit
val print_font_table : Pdf.t -> string -> int -> unit val print_font_table : Pdf.t -> string -> int -> unit
(** Extract a font file to disk. *) (** Extract a font file to disk. *)
val extract_fontfile : int -> string -> Pdf.t -> unit val extract_fontfile : int -> string -> string -> Pdf.t -> unit
(** Remove fonts from a document. *) (** Remove fonts from a document. *)
val remove_fonts : Pdf.t -> Pdf.t val remove_fonts : Pdf.t -> Pdf.t