Rehabilitate -extract-font

This commit is contained in:
John Whitington 2023-11-02 18:49:15 +00:00
parent 35bf9f14d7
commit 8a1fdc23a4
4 changed files with 26 additions and 30 deletions

View File

@ -8,6 +8,7 @@ o List document and page info in JSON format
o List page labels in JSON format
o List fonts in JSON format
o Identify PDF/A, PDF/X, PDF/E, PDF/VT, PDF/UA
o Extract font files from a document
Extended features:

View File

@ -189,7 +189,7 @@ type op =
| ExtractImages
| ImageResolution of float
| MissingFonts
| ExtractFontFile
| ExtractFontFile of string
| ExtractText
| OpenAtPage of string
| OpenAtPageFit of string
@ -320,7 +320,7 @@ let string_of_op = function
| ExtractImages -> "ExtractImages"
| ImageResolution _ -> "ImageResolution"
| MissingFonts -> "MissingFonts"
| ExtractFontFile -> "ExtractFontFile"
| ExtractFontFile _ -> "ExtractFontFile"
| ExtractText -> "ExtractText"
| OpenAtPage _ -> "OpenAtPage"
| OpenAtPageFit _ -> "OpenAtPageFit"
@ -837,7 +837,7 @@ let banned banlist = function
| ShowBoxes | TrimMarks | CreateMetadata | SetMetadataDate _ | SetVersion _
| SetAuthor _|SetTitle _|SetSubject _|SetKeywords _|SetCreate _
| SetModify _|SetCreator _|SetProducer _|RemoveDictEntry _ | ReplaceDictEntry _ | PrintDictEntry _ | SetMetadata _
| ExtractText | ExtractImages | ExtractFontFile
| ExtractText | ExtractImages | ExtractFontFile _
| AddPageLabels | RemovePageLabels | OutputJSON | OCGCoalesce
| OCGRename | OCGList | OCGOrderAll | PrintFontEncoding _ | TableOfContents | Typeset _ | Composition _
| TextWidth _ | SetAnnotations _ | CopyAnnotations _
@ -1871,6 +1871,9 @@ let settextwidth s =
let setdraw () =
args.op <- Some Draw
let setextractfontfile s =
args.op <- Some (ExtractFontFile s)
let () = Cpdfdrawcontrol.getfontname := fun () -> args.fontname
let () = Cpdfdrawcontrol.getfontsize := fun () -> args.fontsize
let () = Cpdfdrawcontrol.setfontname := setfont
@ -2668,6 +2671,9 @@ and specs =
("-print-font-table-page",
Arg.Int setfontpage,
" Set page for -print-font-table");
("-extract-font",
Arg.String setextractfontfile,
" Extract a font");
("-table-of-contents",
Arg.Unit (setop TableOfContents),
" Typeset a table of contents from bookmarks");
@ -2749,7 +2755,7 @@ and specs =
("-debug-stderr-to-stdout", Arg.Unit setstderrtostdout, "");
("-stay-on-error", Arg.Unit setstayonerror, "");
(* These items are unfinished *)
("-extract-fontfile", Arg.Unit (setop ExtractFontFile), "");
("-extract-text", Arg.Unit (setop ExtractText), "");
("-extract-text-font-size", Arg.Float setextracttextfontsize, "");
]
@ -3439,17 +3445,17 @@ let go () =
write_pdf true (Cpdffont.remove_fonts pdf)
| _ -> error "remove fonts: bad command line"
end
| Some ExtractFontFile ->
| Some (ExtractFontFile spec) ->
begin match args.inputs, args.out with
| (_, pagespec, u, o, _, _)::_, _ ->
let pdf = get_single_pdf (Some ExtractFontFile) false in
let page = args.copyfontpage
and name =
match args.copyfontname with
| Some x -> x
| None -> failwith "extract fontfile: no font name given"
in
Cpdffont.extract_fontfile page name pdf
| (_, pagespec, u, o, _, _)::_, File filename ->
let pdf = get_single_pdf (Some (ExtractFontFile spec)) false in
begin match String.split_on_char ',' spec with
| [pnum; name] ->
begin try Cpdffont.extract_fontfile (int_of_string pnum) name filename pdf with
Failure _ (*"int_of_string"*) -> error "extract font: bad page number"
end
| _ -> error "extract font: bad specification"
end
| _ -> error "extract fontfile: bad command line"
end
| Some CountPages ->

View File

@ -199,8 +199,7 @@ let print_font_table pdf fontname pagenumber =
done
| _ -> failwith "addtext: font not found for width"
(* Extracts font to font.dat in CWD. *)
let extract_fontfile pagenumber fontname pdf =
let extract_fontfile pagenumber fontname filename pdf =
let resources = (select pagenumber (Pdfpage.pages_of_pagetree pdf)).Pdfpage.resources in
match Pdf.lookup_direct pdf "/Font" resources with
| None -> failwith "extract_fontfile: font not found"
@ -212,32 +211,22 @@ let extract_fontfile pagenumber fontname pdf =
| Pdftext.SimpleFont {Pdftext.fontdescriptor = Some {Pdftext.fontfile = Some fontfile}} ->
begin let objnum =
match fontfile with
| Pdftext.FontFile i -> i
| Pdftext.FontFile2 i -> i
| Pdftext.FontFile3 i -> i
| Pdftext.FontFile i | Pdftext.FontFile2 i | Pdftext.FontFile3 i -> i
in
match Pdf.lookup_obj pdf objnum with
| Pdf.Stream s as obj ->
Pdfcodec.decode_pdfstream pdf obj;
begin match s with
| {contents = (_, Pdf.Got bytes)} ->
let fh = open_out_bin "font.dat" in
let fh = open_out_bin filename in
for x = 0 to bytes_size bytes - 1 do output_byte fh (bget bytes x) done;
close_out fh;
(* Now try to read using Pdfcff module *)
(*let font = Pdftruetype.to_type3 pdf font in*)
(*let extractor = Pdftext.text_extractor_of_font pdf fontobj in*)
(*flprint "glyph names for incodes 0,1,2,3...";
iter print_string (Pdftext.glyphnames_of_text extractor "\000\001\002\003\004\005\006\007");
flprint "\n";*)
()
close_out fh
| _ -> failwith "extract_fontfile"
end
| _ -> failwith "extract_fontfile"
end
| _ -> failwith "unsupported or unfound font"
(* Remove Embedded fonts. This is done by removing the Font Descriptor. *)
let remove_fontdescriptor pdf = function
| Pdf.Dictionary d as font ->

View File

@ -23,7 +23,7 @@ val missing_fonts : Pdf.t -> int list -> unit
val print_font_table : Pdf.t -> string -> int -> unit
(** Extract a font file to disk. *)
val extract_fontfile : int -> string -> Pdf.t -> unit
val extract_fontfile : int -> string -> string -> Pdf.t -> unit
(** Remove fonts from a document. *)
val remove_fonts : Pdf.t -> Pdf.t