From bb5fae55d2aaa75cad35e8d7f2fccbe04e9eff45 Mon Sep 17 00:00:00 2001 From: John Whitington Date: Thu, 2 Dec 2021 12:04:14 -0800 Subject: [PATCH] more --- cpdf.ml | 14 +++++--------- cpdfcommand.ml | 36 +++++++++++++++++++++++------------- cpdffont.ml | 2 +- cpdftype.ml | 16 +++++++++------- 4 files changed, 38 insertions(+), 30 deletions(-) diff --git a/cpdf.ml b/cpdf.ml index 08cdf28..c8449a1 100644 --- a/cpdf.ml +++ b/cpdf.ml @@ -1125,8 +1125,8 @@ let print_fonts pdf range = (* Process UTF8 text to /WinAnsiEncoding string (for standard 14) or whatever is in the font (for existing fonts). *) -let charcodes_of_utf8 pdf font s = - let extractor = Pdftext.charcode_extractor_of_font ~debug:false pdf font in +let charcodes_of_utf8 font s = + let extractor = Pdftext.charcode_extractor_of_font_real ~debug:false font in let codepoints = Pdftext.codepoints_of_utf8 s in let charcodes = option_map @@ -1141,12 +1141,8 @@ let charcodes_of_utf8 pdf font s = (* Process codepoints back to UTF8, assuming it came from UTF8 to start with *) let utf8_of_winansi s = let text_extractor = - Pdftext.text_extractor_of_font - (Pdf.empty ()) - (Pdf.Dictionary - [("/BaseFont", Pdf.Name "/TimesRoman"); - ("/Subtype", Pdf.Name "/Type1"); - ("/Encoding", Pdf.Name "/WinAnsiEncoding")]) + Pdftext.text_extractor_of_font_real + (Pdftext.StandardFont (Pdftext.TimesRoman, Pdftext.WinAnsiEncoding)) in let codepoints = Pdftext.codepoints_of_text text_extractor s in Pdftext.utf8_of_codepoints codepoints @@ -1601,7 +1597,7 @@ let end | _ -> failwith "addtext: font dictionary not present" in - let text = if raw then text else charcodes_of_utf8 pdf fontpdfobj text in + let text = if raw then text else charcodes_of_utf8 (Pdftext.read_font pdf fontpdfobj) text in let lines = map unescape_string (split_at_newline text) in let pdf = ref pdf in let voffset = diff --git a/cpdfcommand.ml b/cpdfcommand.ml index 228abf8..c58bfb5 100644 --- a/cpdfcommand.ml +++ b/cpdfcommand.ml @@ -2919,13 +2919,10 @@ let collate (names, pdfs, ranges) = split3 (rev !nis) let of_utf8 (f, fontsize) t = - let pdf = Pdf.empty () in - let fontdict = Pdftext.write_font pdf f in - let extractor = Pdftext.charcode_extractor_of_font pdf (Pdf.Indirect fontdict) in - Pdftext.codepoints_of_utf8 t - |> option_map extractor - |> map char_of_int - |> implode + Pdftext.codepoints_of_utf8 t + |> option_map (Pdftext.charcode_extractor_of_font_real f) + |> map char_of_int + |> implode let of_pdfdocencoding (f, fontsize) t = of_utf8 (f, fontsize) (Pdftext.utf8_of_pdfdocstring t) @@ -2948,9 +2945,13 @@ let rec of_utf8_with_newlines t = if c <> "" then items := Text (explode c)::!items; rev !items +(* FIXME margins, hyphenation of too-long words, efficiency *) let typeset text = let pdf = Pdf.empty () in - let f = (Pdftext.StandardFont (Pdftext.Courier, Pdftext.WinAnsiEncoding), 12.) in + let f = + (begin match args.font with StandardFont sf -> Pdftext.StandardFont (sf, Pdftext.WinAnsiEncoding) | _ -> failwith "typeset bad font" end, + args.fontsize) + in let pages = Cpdftype.typeset 20. 20. 20. 20. Pdfpaper.a4 pdf ([Cpdftype.Font f] @ of_utf8_with_newlines (string_of_bytes text)) @@ -2973,13 +2974,22 @@ let typeset_table_of_contents ~font pdf = Pdfpaper.make Pdfunits.PdfPoint width height in let lines = + let refnums = Pdf.page_reference_numbers pdf in + let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in map (fun mark -> - [Cpdftype.BeginDest mark.Pdfmarks.target; - Cpdftype.HGlue {Cpdftype.glen = float mark.Pdfmarks.level *. args.fontsize *. 2.; Cpdftype.gstretch = 0.}; - Cpdftype.Text (explode (of_pdfdocencoding f mark.Pdfmarks.text)); - Cpdftype.EndDest; - Cpdftype.NewLine]) + let label = + let labels = Pdfpagelabels.read pdf in + let pnum = Pdfpage.pagenumber_of_target ~fastrefnums pdf mark.Pdfmarks.target in + try Pdfpagelabels.pagelabeltext_of_pagenumber pnum labels with Not_found -> string_of_int pnum + in + [Cpdftype.BeginDest mark.Pdfmarks.target; + Cpdftype.HGlue {Cpdftype.glen = float mark.Pdfmarks.level *. args.fontsize *. 2.; Cpdftype.gstretch = 0.}; + Cpdftype.Text (explode (of_pdfdocencoding f mark.Pdfmarks.text ^ " " ^ of_pdfdocencoding f label)); + (*Cpdftype.Text [' ']; + Cpdftype.Text (explode (of_pdfdocencoding f label));*) + Cpdftype.EndDest; + Cpdftype.NewLine]) (Pdfmarks.read_bookmarks pdf) in let toc_pages = diff --git a/cpdffont.ml b/cpdffont.ml index cd064fa..936decc 100644 --- a/cpdffont.ml +++ b/cpdffont.ml @@ -156,7 +156,7 @@ let print_font_table pdf fontname pagenumber = | Pdftext.SimpleFont {Pdftext.fontdescriptor = Some {Pdftext.charset = Some cs}} -> Some cs | _ -> None in - let extractor = Pdftext.text_extractor_of_font pdf font in + let extractor = Pdftext.text_extractor_of_font_real pdftextfont in let unicodedata = Cpdfunicodedata.unicodedata () in let unicodetable = Hashtbl.create 16000 in iter diff --git a/cpdftype.ml b/cpdftype.ml index 8ad7f81..48e7963 100644 --- a/cpdftype.ml +++ b/cpdftype.ml @@ -1,11 +1,6 @@ (* A typesetter for cpdf. A list of elements is manipulated zero or more times to lay it out, paginate it, and so on. It is then typeset to produce a list of pages *) - -(* FIXME We need to make Pdfstandard14 width calculations much more efficient - by caching so that we are not making a table up for each character! *) -(* FIXME We need to reintroduce kerning in Pdfstandard14. *) -(* FIXME Fix up charcode / text extractors to take fonts not fontdicts *) open Pdfutil (* Glue *) @@ -55,8 +50,15 @@ let initial_state () = dest = None} let font_widths f fontsize = - let w = fontsize *. (600. /. 1000.) in - Array.make 256 w + let stdfont = + match f with Pdftext.StandardFont (sf, _) -> sf | _ -> failwith "not a standard font" + in + Array.init + 256 + (fun x -> + fontsize + *. float_of_int (Pdfstandard14.textwidth false Pdftext.WinAnsiEncoding stdfont (string_of_char (char_of_int x))) + /. 1000.) let width_of_string ws s = let w = ref 0. in