From 8a1fdc23a4e8b54f3e80619e3572f1ee0a496308 Mon Sep 17 00:00:00 2001 From: John Whitington Date: Thu, 2 Nov 2023 18:49:15 +0000 Subject: [PATCH] Rehabilitate -extract-font --- Changes | 1 + cpdfcommand.ml | 34 ++++++++++++++++++++-------------- cpdffont.ml | 19 ++++--------------- cpdffont.mli | 2 +- 4 files changed, 26 insertions(+), 30 deletions(-) diff --git a/Changes b/Changes index 5e02259..909dbf8 100644 --- a/Changes +++ b/Changes @@ -8,6 +8,7 @@ o List document and page info in JSON format o List page labels in JSON format o List fonts in JSON format o Identify PDF/A, PDF/X, PDF/E, PDF/VT, PDF/UA +o Extract font files from a document Extended features: diff --git a/cpdfcommand.ml b/cpdfcommand.ml index dff48d5..2488892 100644 --- a/cpdfcommand.ml +++ b/cpdfcommand.ml @@ -189,7 +189,7 @@ type op = | ExtractImages | ImageResolution of float | MissingFonts - | ExtractFontFile + | ExtractFontFile of string | ExtractText | OpenAtPage of string | OpenAtPageFit of string @@ -320,7 +320,7 @@ let string_of_op = function | ExtractImages -> "ExtractImages" | ImageResolution _ -> "ImageResolution" | MissingFonts -> "MissingFonts" - | ExtractFontFile -> "ExtractFontFile" + | ExtractFontFile _ -> "ExtractFontFile" | ExtractText -> "ExtractText" | OpenAtPage _ -> "OpenAtPage" | OpenAtPageFit _ -> "OpenAtPageFit" @@ -837,7 +837,7 @@ let banned banlist = function | ShowBoxes | TrimMarks | CreateMetadata | SetMetadataDate _ | SetVersion _ | SetAuthor _|SetTitle _|SetSubject _|SetKeywords _|SetCreate _ | SetModify _|SetCreator _|SetProducer _|RemoveDictEntry _ | ReplaceDictEntry _ | PrintDictEntry _ | SetMetadata _ - | ExtractText | ExtractImages | ExtractFontFile + | ExtractText | ExtractImages | ExtractFontFile _ | AddPageLabels | RemovePageLabels | OutputJSON | OCGCoalesce | OCGRename | OCGList | OCGOrderAll | PrintFontEncoding _ | TableOfContents | Typeset _ | Composition _ | TextWidth _ | SetAnnotations _ | CopyAnnotations _ @@ -1871,6 +1871,9 @@ let settextwidth s = let setdraw () = args.op <- Some Draw +let setextractfontfile s = + args.op <- Some (ExtractFontFile s) + let () = Cpdfdrawcontrol.getfontname := fun () -> args.fontname let () = Cpdfdrawcontrol.getfontsize := fun () -> args.fontsize let () = Cpdfdrawcontrol.setfontname := setfont @@ -2668,6 +2671,9 @@ and specs = ("-print-font-table-page", Arg.Int setfontpage, " Set page for -print-font-table"); + ("-extract-font", + Arg.String setextractfontfile, + " Extract a font"); ("-table-of-contents", Arg.Unit (setop TableOfContents), " Typeset a table of contents from bookmarks"); @@ -2749,7 +2755,7 @@ and specs = ("-debug-stderr-to-stdout", Arg.Unit setstderrtostdout, ""); ("-stay-on-error", Arg.Unit setstayonerror, ""); (* These items are unfinished *) - ("-extract-fontfile", Arg.Unit (setop ExtractFontFile), ""); + ("-extract-text", Arg.Unit (setop ExtractText), ""); ("-extract-text-font-size", Arg.Float setextracttextfontsize, ""); ] @@ -3439,17 +3445,17 @@ let go () = write_pdf true (Cpdffont.remove_fonts pdf) | _ -> error "remove fonts: bad command line" end - | Some ExtractFontFile -> + | Some (ExtractFontFile spec) -> begin match args.inputs, args.out with - | (_, pagespec, u, o, _, _)::_, _ -> - let pdf = get_single_pdf (Some ExtractFontFile) false in - let page = args.copyfontpage - and name = - match args.copyfontname with - | Some x -> x - | None -> failwith "extract fontfile: no font name given" - in - Cpdffont.extract_fontfile page name pdf + | (_, pagespec, u, o, _, _)::_, File filename -> + let pdf = get_single_pdf (Some (ExtractFontFile spec)) false in + begin match String.split_on_char ',' spec with + | [pnum; name] -> + begin try Cpdffont.extract_fontfile (int_of_string pnum) name filename pdf with + Failure _ (*"int_of_string"*) -> error "extract font: bad page number" + end + | _ -> error "extract font: bad specification" + end | _ -> error "extract fontfile: bad command line" end | Some CountPages -> diff --git a/cpdffont.ml b/cpdffont.ml index 2ec4953..aad99d8 100644 --- a/cpdffont.ml +++ b/cpdffont.ml @@ -199,8 +199,7 @@ let print_font_table pdf fontname pagenumber = done | _ -> failwith "addtext: font not found for width" -(* Extracts font to font.dat in CWD. *) -let extract_fontfile pagenumber fontname pdf = +let extract_fontfile pagenumber fontname filename pdf = let resources = (select pagenumber (Pdfpage.pages_of_pagetree pdf)).Pdfpage.resources in match Pdf.lookup_direct pdf "/Font" resources with | None -> failwith "extract_fontfile: font not found" @@ -212,32 +211,22 @@ let extract_fontfile pagenumber fontname pdf = | Pdftext.SimpleFont {Pdftext.fontdescriptor = Some {Pdftext.fontfile = Some fontfile}} -> begin let objnum = match fontfile with - | Pdftext.FontFile i -> i - | Pdftext.FontFile2 i -> i - | Pdftext.FontFile3 i -> i + | Pdftext.FontFile i | Pdftext.FontFile2 i | Pdftext.FontFile3 i -> i in match Pdf.lookup_obj pdf objnum with | Pdf.Stream s as obj -> Pdfcodec.decode_pdfstream pdf obj; begin match s with | {contents = (_, Pdf.Got bytes)} -> - let fh = open_out_bin "font.dat" in + let fh = open_out_bin filename in for x = 0 to bytes_size bytes - 1 do output_byte fh (bget bytes x) done; - close_out fh; - (* Now try to read using Pdfcff module *) - (*let font = Pdftruetype.to_type3 pdf font in*) - (*let extractor = Pdftext.text_extractor_of_font pdf fontobj in*) - (*flprint "glyph names for incodes 0,1,2,3..."; - iter print_string (Pdftext.glyphnames_of_text extractor "\000\001\002\003\004\005\006\007"); - flprint "\n";*) - () + close_out fh | _ -> failwith "extract_fontfile" end | _ -> failwith "extract_fontfile" end | _ -> failwith "unsupported or unfound font" - (* Remove Embedded fonts. This is done by removing the Font Descriptor. *) let remove_fontdescriptor pdf = function | Pdf.Dictionary d as font -> diff --git a/cpdffont.mli b/cpdffont.mli index 8e6c61d..45efc3e 100644 --- a/cpdffont.mli +++ b/cpdffont.mli @@ -23,7 +23,7 @@ val missing_fonts : Pdf.t -> int list -> unit val print_font_table : Pdf.t -> string -> int -> unit (** Extract a font file to disk. *) -val extract_fontfile : int -> string -> Pdf.t -> unit +val extract_fontfile : int -> string -> string -> Pdf.t -> unit (** Remove fonts from a document. *) val remove_fonts : Pdf.t -> Pdf.t