This commit is contained in:
John Whitington 2021-12-02 12:04:14 -08:00
parent 680e66505e
commit bb5fae55d2
4 changed files with 38 additions and 30 deletions

14
cpdf.ml
View File

@ -1125,8 +1125,8 @@ let print_fonts pdf range =
(* Process UTF8 text to /WinAnsiEncoding string (for standard 14) or whatever (* Process UTF8 text to /WinAnsiEncoding string (for standard 14) or whatever
is in the font (for existing fonts). *) is in the font (for existing fonts). *)
let charcodes_of_utf8 pdf font s = let charcodes_of_utf8 font s =
let extractor = Pdftext.charcode_extractor_of_font ~debug:false pdf font in let extractor = Pdftext.charcode_extractor_of_font_real ~debug:false font in
let codepoints = Pdftext.codepoints_of_utf8 s in let codepoints = Pdftext.codepoints_of_utf8 s in
let charcodes = let charcodes =
option_map option_map
@ -1141,12 +1141,8 @@ let charcodes_of_utf8 pdf font s =
(* Process codepoints back to UTF8, assuming it came from UTF8 to start with *) (* Process codepoints back to UTF8, assuming it came from UTF8 to start with *)
let utf8_of_winansi s = let utf8_of_winansi s =
let text_extractor = let text_extractor =
Pdftext.text_extractor_of_font Pdftext.text_extractor_of_font_real
(Pdf.empty ()) (Pdftext.StandardFont (Pdftext.TimesRoman, Pdftext.WinAnsiEncoding))
(Pdf.Dictionary
[("/BaseFont", Pdf.Name "/TimesRoman");
("/Subtype", Pdf.Name "/Type1");
("/Encoding", Pdf.Name "/WinAnsiEncoding")])
in in
let codepoints = Pdftext.codepoints_of_text text_extractor s in let codepoints = Pdftext.codepoints_of_text text_extractor s in
Pdftext.utf8_of_codepoints codepoints Pdftext.utf8_of_codepoints codepoints
@ -1601,7 +1597,7 @@ let
end end
| _ -> failwith "addtext: font dictionary not present" | _ -> failwith "addtext: font dictionary not present"
in in
let text = if raw then text else charcodes_of_utf8 pdf fontpdfobj text in let text = if raw then text else charcodes_of_utf8 (Pdftext.read_font pdf fontpdfobj) text in
let lines = map unescape_string (split_at_newline text) in let lines = map unescape_string (split_at_newline text) in
let pdf = ref pdf in let pdf = ref pdf in
let voffset = let voffset =

View File

@ -2919,11 +2919,8 @@ let collate (names, pdfs, ranges) =
split3 (rev !nis) split3 (rev !nis)
let of_utf8 (f, fontsize) t = let of_utf8 (f, fontsize) t =
let pdf = Pdf.empty () in
let fontdict = Pdftext.write_font pdf f in
let extractor = Pdftext.charcode_extractor_of_font pdf (Pdf.Indirect fontdict) in
Pdftext.codepoints_of_utf8 t Pdftext.codepoints_of_utf8 t
|> option_map extractor |> option_map (Pdftext.charcode_extractor_of_font_real f)
|> map char_of_int |> map char_of_int
|> implode |> implode
@ -2948,9 +2945,13 @@ let rec of_utf8_with_newlines t =
if c <> "" then items := Text (explode c)::!items; if c <> "" then items := Text (explode c)::!items;
rev !items rev !items
(* FIXME margins, hyphenation of too-long words, efficiency *)
let typeset text = let typeset text =
let pdf = Pdf.empty () in let pdf = Pdf.empty () in
let f = (Pdftext.StandardFont (Pdftext.Courier, Pdftext.WinAnsiEncoding), 12.) in let f =
(begin match args.font with StandardFont sf -> Pdftext.StandardFont (sf, Pdftext.WinAnsiEncoding) | _ -> failwith "typeset bad font" end,
args.fontsize)
in
let pages = let pages =
Cpdftype.typeset Cpdftype.typeset
20. 20. 20. 20. Pdfpaper.a4 pdf ([Cpdftype.Font f] @ of_utf8_with_newlines (string_of_bytes text)) 20. 20. 20. 20. Pdfpaper.a4 pdf ([Cpdftype.Font f] @ of_utf8_with_newlines (string_of_bytes text))
@ -2973,11 +2974,20 @@ let typeset_table_of_contents ~font pdf =
Pdfpaper.make Pdfunits.PdfPoint width height Pdfpaper.make Pdfunits.PdfPoint width height
in in
let lines = let lines =
let refnums = Pdf.page_reference_numbers pdf in
let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
map map
(fun mark -> (fun mark ->
let label =
let labels = Pdfpagelabels.read pdf in
let pnum = Pdfpage.pagenumber_of_target ~fastrefnums pdf mark.Pdfmarks.target in
try Pdfpagelabels.pagelabeltext_of_pagenumber pnum labels with Not_found -> string_of_int pnum
in
[Cpdftype.BeginDest mark.Pdfmarks.target; [Cpdftype.BeginDest mark.Pdfmarks.target;
Cpdftype.HGlue {Cpdftype.glen = float mark.Pdfmarks.level *. args.fontsize *. 2.; Cpdftype.gstretch = 0.}; Cpdftype.HGlue {Cpdftype.glen = float mark.Pdfmarks.level *. args.fontsize *. 2.; Cpdftype.gstretch = 0.};
Cpdftype.Text (explode (of_pdfdocencoding f mark.Pdfmarks.text)); Cpdftype.Text (explode (of_pdfdocencoding f mark.Pdfmarks.text ^ " " ^ of_pdfdocencoding f label));
(*Cpdftype.Text [' '];
Cpdftype.Text (explode (of_pdfdocencoding f label));*)
Cpdftype.EndDest; Cpdftype.EndDest;
Cpdftype.NewLine]) Cpdftype.NewLine])
(Pdfmarks.read_bookmarks pdf) (Pdfmarks.read_bookmarks pdf)

View File

@ -156,7 +156,7 @@ let print_font_table pdf fontname pagenumber =
| Pdftext.SimpleFont {Pdftext.fontdescriptor = Some {Pdftext.charset = Some cs}} -> Some cs | Pdftext.SimpleFont {Pdftext.fontdescriptor = Some {Pdftext.charset = Some cs}} -> Some cs
| _ -> None | _ -> None
in in
let extractor = Pdftext.text_extractor_of_font pdf font in let extractor = Pdftext.text_extractor_of_font_real pdftextfont in
let unicodedata = Cpdfunicodedata.unicodedata () in let unicodedata = Cpdfunicodedata.unicodedata () in
let unicodetable = Hashtbl.create 16000 in let unicodetable = Hashtbl.create 16000 in
iter iter

View File

@ -1,11 +1,6 @@
(* A typesetter for cpdf. A list of elements is manipulated zero or more times (* A typesetter for cpdf. A list of elements is manipulated zero or more times
to lay it out, paginate it, and so on. It is then typeset to produce a list to lay it out, paginate it, and so on. It is then typeset to produce a list
of pages *) of pages *)
(* FIXME We need to make Pdfstandard14 width calculations much more efficient
by caching so that we are not making a table up for each character! *)
(* FIXME We need to reintroduce kerning in Pdfstandard14. *)
(* FIXME Fix up charcode / text extractors to take fonts not fontdicts *)
open Pdfutil open Pdfutil
(* Glue *) (* Glue *)
@ -55,8 +50,15 @@ let initial_state () =
dest = None} dest = None}
let font_widths f fontsize = let font_widths f fontsize =
let w = fontsize *. (600. /. 1000.) in let stdfont =
Array.make 256 w match f with Pdftext.StandardFont (sf, _) -> sf | _ -> failwith "not a standard font"
in
Array.init
256
(fun x ->
fontsize
*. float_of_int (Pdfstandard14.textwidth false Pdftext.WinAnsiEncoding stdfont (string_of_char (char_of_int x)))
/. 1000.)
let width_of_string ws s = let width_of_string ws s =
let w = ref 0. in let w = ref 0. in