Rehabilitate -extract-font
This commit is contained in:
parent
35bf9f14d7
commit
8a1fdc23a4
1
Changes
1
Changes
|
@ -8,6 +8,7 @@ o List document and page info in JSON format
|
||||||
o List page labels in JSON format
|
o List page labels in JSON format
|
||||||
o List fonts in JSON format
|
o List fonts in JSON format
|
||||||
o Identify PDF/A, PDF/X, PDF/E, PDF/VT, PDF/UA
|
o Identify PDF/A, PDF/X, PDF/E, PDF/VT, PDF/UA
|
||||||
|
o Extract font files from a document
|
||||||
|
|
||||||
Extended features:
|
Extended features:
|
||||||
|
|
||||||
|
|
|
@ -189,7 +189,7 @@ type op =
|
||||||
| ExtractImages
|
| ExtractImages
|
||||||
| ImageResolution of float
|
| ImageResolution of float
|
||||||
| MissingFonts
|
| MissingFonts
|
||||||
| ExtractFontFile
|
| ExtractFontFile of string
|
||||||
| ExtractText
|
| ExtractText
|
||||||
| OpenAtPage of string
|
| OpenAtPage of string
|
||||||
| OpenAtPageFit of string
|
| OpenAtPageFit of string
|
||||||
|
@ -320,7 +320,7 @@ let string_of_op = function
|
||||||
| ExtractImages -> "ExtractImages"
|
| ExtractImages -> "ExtractImages"
|
||||||
| ImageResolution _ -> "ImageResolution"
|
| ImageResolution _ -> "ImageResolution"
|
||||||
| MissingFonts -> "MissingFonts"
|
| MissingFonts -> "MissingFonts"
|
||||||
| ExtractFontFile -> "ExtractFontFile"
|
| ExtractFontFile _ -> "ExtractFontFile"
|
||||||
| ExtractText -> "ExtractText"
|
| ExtractText -> "ExtractText"
|
||||||
| OpenAtPage _ -> "OpenAtPage"
|
| OpenAtPage _ -> "OpenAtPage"
|
||||||
| OpenAtPageFit _ -> "OpenAtPageFit"
|
| OpenAtPageFit _ -> "OpenAtPageFit"
|
||||||
|
@ -837,7 +837,7 @@ let banned banlist = function
|
||||||
| ShowBoxes | TrimMarks | CreateMetadata | SetMetadataDate _ | SetVersion _
|
| ShowBoxes | TrimMarks | CreateMetadata | SetMetadataDate _ | SetVersion _
|
||||||
| SetAuthor _|SetTitle _|SetSubject _|SetKeywords _|SetCreate _
|
| SetAuthor _|SetTitle _|SetSubject _|SetKeywords _|SetCreate _
|
||||||
| SetModify _|SetCreator _|SetProducer _|RemoveDictEntry _ | ReplaceDictEntry _ | PrintDictEntry _ | SetMetadata _
|
| SetModify _|SetCreator _|SetProducer _|RemoveDictEntry _ | ReplaceDictEntry _ | PrintDictEntry _ | SetMetadata _
|
||||||
| ExtractText | ExtractImages | ExtractFontFile
|
| ExtractText | ExtractImages | ExtractFontFile _
|
||||||
| AddPageLabels | RemovePageLabels | OutputJSON | OCGCoalesce
|
| AddPageLabels | RemovePageLabels | OutputJSON | OCGCoalesce
|
||||||
| OCGRename | OCGList | OCGOrderAll | PrintFontEncoding _ | TableOfContents | Typeset _ | Composition _
|
| OCGRename | OCGList | OCGOrderAll | PrintFontEncoding _ | TableOfContents | Typeset _ | Composition _
|
||||||
| TextWidth _ | SetAnnotations _ | CopyAnnotations _
|
| TextWidth _ | SetAnnotations _ | CopyAnnotations _
|
||||||
|
@ -1871,6 +1871,9 @@ let settextwidth s =
|
||||||
let setdraw () =
|
let setdraw () =
|
||||||
args.op <- Some Draw
|
args.op <- Some Draw
|
||||||
|
|
||||||
|
let setextractfontfile s =
|
||||||
|
args.op <- Some (ExtractFontFile s)
|
||||||
|
|
||||||
let () = Cpdfdrawcontrol.getfontname := fun () -> args.fontname
|
let () = Cpdfdrawcontrol.getfontname := fun () -> args.fontname
|
||||||
let () = Cpdfdrawcontrol.getfontsize := fun () -> args.fontsize
|
let () = Cpdfdrawcontrol.getfontsize := fun () -> args.fontsize
|
||||||
let () = Cpdfdrawcontrol.setfontname := setfont
|
let () = Cpdfdrawcontrol.setfontname := setfont
|
||||||
|
@ -2668,6 +2671,9 @@ and specs =
|
||||||
("-print-font-table-page",
|
("-print-font-table-page",
|
||||||
Arg.Int setfontpage,
|
Arg.Int setfontpage,
|
||||||
" Set page for -print-font-table");
|
" Set page for -print-font-table");
|
||||||
|
("-extract-font",
|
||||||
|
Arg.String setextractfontfile,
|
||||||
|
" Extract a font");
|
||||||
("-table-of-contents",
|
("-table-of-contents",
|
||||||
Arg.Unit (setop TableOfContents),
|
Arg.Unit (setop TableOfContents),
|
||||||
" Typeset a table of contents from bookmarks");
|
" Typeset a table of contents from bookmarks");
|
||||||
|
@ -2749,7 +2755,7 @@ and specs =
|
||||||
("-debug-stderr-to-stdout", Arg.Unit setstderrtostdout, "");
|
("-debug-stderr-to-stdout", Arg.Unit setstderrtostdout, "");
|
||||||
("-stay-on-error", Arg.Unit setstayonerror, "");
|
("-stay-on-error", Arg.Unit setstayonerror, "");
|
||||||
(* These items are unfinished *)
|
(* These items are unfinished *)
|
||||||
("-extract-fontfile", Arg.Unit (setop ExtractFontFile), "");
|
|
||||||
("-extract-text", Arg.Unit (setop ExtractText), "");
|
("-extract-text", Arg.Unit (setop ExtractText), "");
|
||||||
("-extract-text-font-size", Arg.Float setextracttextfontsize, "");
|
("-extract-text-font-size", Arg.Float setextracttextfontsize, "");
|
||||||
]
|
]
|
||||||
|
@ -3439,17 +3445,17 @@ let go () =
|
||||||
write_pdf true (Cpdffont.remove_fonts pdf)
|
write_pdf true (Cpdffont.remove_fonts pdf)
|
||||||
| _ -> error "remove fonts: bad command line"
|
| _ -> error "remove fonts: bad command line"
|
||||||
end
|
end
|
||||||
| Some ExtractFontFile ->
|
| Some (ExtractFontFile spec) ->
|
||||||
begin match args.inputs, args.out with
|
begin match args.inputs, args.out with
|
||||||
| (_, pagespec, u, o, _, _)::_, _ ->
|
| (_, pagespec, u, o, _, _)::_, File filename ->
|
||||||
let pdf = get_single_pdf (Some ExtractFontFile) false in
|
let pdf = get_single_pdf (Some (ExtractFontFile spec)) false in
|
||||||
let page = args.copyfontpage
|
begin match String.split_on_char ',' spec with
|
||||||
and name =
|
| [pnum; name] ->
|
||||||
match args.copyfontname with
|
begin try Cpdffont.extract_fontfile (int_of_string pnum) name filename pdf with
|
||||||
| Some x -> x
|
Failure _ (*"int_of_string"*) -> error "extract font: bad page number"
|
||||||
| None -> failwith "extract fontfile: no font name given"
|
end
|
||||||
in
|
| _ -> error "extract font: bad specification"
|
||||||
Cpdffont.extract_fontfile page name pdf
|
end
|
||||||
| _ -> error "extract fontfile: bad command line"
|
| _ -> error "extract fontfile: bad command line"
|
||||||
end
|
end
|
||||||
| Some CountPages ->
|
| Some CountPages ->
|
||||||
|
|
19
cpdffont.ml
19
cpdffont.ml
|
@ -199,8 +199,7 @@ let print_font_table pdf fontname pagenumber =
|
||||||
done
|
done
|
||||||
| _ -> failwith "addtext: font not found for width"
|
| _ -> failwith "addtext: font not found for width"
|
||||||
|
|
||||||
(* Extracts font to font.dat in CWD. *)
|
let extract_fontfile pagenumber fontname filename pdf =
|
||||||
let extract_fontfile pagenumber fontname pdf =
|
|
||||||
let resources = (select pagenumber (Pdfpage.pages_of_pagetree pdf)).Pdfpage.resources in
|
let resources = (select pagenumber (Pdfpage.pages_of_pagetree pdf)).Pdfpage.resources in
|
||||||
match Pdf.lookup_direct pdf "/Font" resources with
|
match Pdf.lookup_direct pdf "/Font" resources with
|
||||||
| None -> failwith "extract_fontfile: font not found"
|
| None -> failwith "extract_fontfile: font not found"
|
||||||
|
@ -212,32 +211,22 @@ let extract_fontfile pagenumber fontname pdf =
|
||||||
| Pdftext.SimpleFont {Pdftext.fontdescriptor = Some {Pdftext.fontfile = Some fontfile}} ->
|
| Pdftext.SimpleFont {Pdftext.fontdescriptor = Some {Pdftext.fontfile = Some fontfile}} ->
|
||||||
begin let objnum =
|
begin let objnum =
|
||||||
match fontfile with
|
match fontfile with
|
||||||
| Pdftext.FontFile i -> i
|
| Pdftext.FontFile i | Pdftext.FontFile2 i | Pdftext.FontFile3 i -> i
|
||||||
| Pdftext.FontFile2 i -> i
|
|
||||||
| Pdftext.FontFile3 i -> i
|
|
||||||
in
|
in
|
||||||
match Pdf.lookup_obj pdf objnum with
|
match Pdf.lookup_obj pdf objnum with
|
||||||
| Pdf.Stream s as obj ->
|
| Pdf.Stream s as obj ->
|
||||||
Pdfcodec.decode_pdfstream pdf obj;
|
Pdfcodec.decode_pdfstream pdf obj;
|
||||||
begin match s with
|
begin match s with
|
||||||
| {contents = (_, Pdf.Got bytes)} ->
|
| {contents = (_, Pdf.Got bytes)} ->
|
||||||
let fh = open_out_bin "font.dat" in
|
let fh = open_out_bin filename in
|
||||||
for x = 0 to bytes_size bytes - 1 do output_byte fh (bget bytes x) done;
|
for x = 0 to bytes_size bytes - 1 do output_byte fh (bget bytes x) done;
|
||||||
close_out fh;
|
close_out fh
|
||||||
(* Now try to read using Pdfcff module *)
|
|
||||||
(*let font = Pdftruetype.to_type3 pdf font in*)
|
|
||||||
(*let extractor = Pdftext.text_extractor_of_font pdf fontobj in*)
|
|
||||||
(*flprint "glyph names for incodes 0,1,2,3...";
|
|
||||||
iter print_string (Pdftext.glyphnames_of_text extractor "\000\001\002\003\004\005\006\007");
|
|
||||||
flprint "\n";*)
|
|
||||||
()
|
|
||||||
| _ -> failwith "extract_fontfile"
|
| _ -> failwith "extract_fontfile"
|
||||||
end
|
end
|
||||||
| _ -> failwith "extract_fontfile"
|
| _ -> failwith "extract_fontfile"
|
||||||
end
|
end
|
||||||
| _ -> failwith "unsupported or unfound font"
|
| _ -> failwith "unsupported or unfound font"
|
||||||
|
|
||||||
|
|
||||||
(* Remove Embedded fonts. This is done by removing the Font Descriptor. *)
|
(* Remove Embedded fonts. This is done by removing the Font Descriptor. *)
|
||||||
let remove_fontdescriptor pdf = function
|
let remove_fontdescriptor pdf = function
|
||||||
| Pdf.Dictionary d as font ->
|
| Pdf.Dictionary d as font ->
|
||||||
|
|
|
@ -23,7 +23,7 @@ val missing_fonts : Pdf.t -> int list -> unit
|
||||||
val print_font_table : Pdf.t -> string -> int -> unit
|
val print_font_table : Pdf.t -> string -> int -> unit
|
||||||
|
|
||||||
(** Extract a font file to disk. *)
|
(** Extract a font file to disk. *)
|
||||||
val extract_fontfile : int -> string -> Pdf.t -> unit
|
val extract_fontfile : int -> string -> string -> Pdf.t -> unit
|
||||||
|
|
||||||
(** Remove fonts from a document. *)
|
(** Remove fonts from a document. *)
|
||||||
val remove_fonts : Pdf.t -> Pdf.t
|
val remove_fonts : Pdf.t -> Pdf.t
|
||||||
|
|
Loading…
Reference in New Issue