cpdf-source/cpdftoc.ml

160 lines
6.4 KiB
OCaml
Raw Normal View History

2021-12-07 00:46:52 +01:00
open Pdfutil
2021-12-11 06:51:05 +01:00
(* Titles may contain the two-character sequence backslash, n to request a line
   break. Split a title (as a list of characters) into one character list per
   line for the typesetter. [line_rev] holds the characters of the line
   currently being read, most recent first. *)
let rec split_toc_title_inner line_rev chars =
  match chars with
  | [] -> [rev line_rev]
  | '\\' :: 'n' :: rest ->
      let finished = rev line_rev in
      finished :: split_toc_title_inner [] rest
  | c :: rest -> split_toc_title_inner (c :: line_rev) rest
2021-12-07 04:45:10 +01:00
(* Split a title into its lines, starting with an empty current line. *)
let split_toc_title chars = split_toc_title_inner [] chars
2021-12-11 06:51:05 +01:00
(* For the bookmark which points at the table of contents: replace each
   two-character sequence backslash, n with a single newline character. All
   other characters are passed through unchanged. *)
let real_newline chars =
  let rec go seen = function
    | [] -> List.rev seen
    | '\\' :: 'n' :: rest -> go ('\n' :: seen) rest
    | c :: rest -> go (c :: seen) rest
  in
  go [] chars
2021-12-07 04:45:10 +01:00
(* Convert the UTF8 string [t] to a list of character codes in font [f], for
   use with Cpdftype. Every unicode codepoint found in [t] is also recorded as
   a key in the hash table [used], whether or not the font yields a character
   code for it. NOTE(review): option_map presumably drops codepoints for which
   the extractor returns None - confirm against Pdfutil. *)
let of_utf8 used f t =
  let unicode = Pdftext.codepoints_of_utf8 t in
  iter (fun codepoint -> Hashtbl.replace used codepoint ()) unicode;
  let charcode_of = Pdftext.charcode_extractor_of_font_real f in
  map char_of_int (option_map charcode_of unicode)
2021-12-07 04:45:10 +01:00
(* As of_utf8, but the input [t] is a PDFDocEncoding string. It is converted to
   UTF8 first, so the codepoints it contains are recorded in [used] too. *)
let of_pdfdocencoding used f t =
  t |> Pdftext.utf8_of_pdfdocstring |> of_utf8 used f
2021-12-07 00:46:52 +01:00
2022-01-09 14:46:41 +01:00
(* Drop characters from the end of [t], one at a time, until its width as
   measured by Cpdftype.width_of_string with [widths] no longer exceeds [l],
   or until nothing is left. *)
let rec shorten_text_inner widths l t =
  match t with
  | [] -> t
  | _ :: _ when Cpdftype.width_of_string widths t > l ->
      let without_last = rev (tl (rev t)) in
      shorten_text_inner widths l without_last
  | _ :: _ -> t
(* Shorten [t] to fit in width [l]. If any characters had to be removed, three
   dots are appended as an ellipsis. The width of the dots is not accounted for
   here, so the caller must leave room for them when choosing [l]. *)
let shorten_text widths l t =
  match shorten_text_inner widths l t with
  | kept when kept = t -> t
  | kept -> kept @ ['.'; '.'; '.']
2021-12-07 04:45:10 +01:00
(* Typeset a table of contents from the bookmarks of [pdf] and put it in front
   of the existing pages. Returns the new document.

   If there are no bookmarks, a message is written to standard error and [pdf]
   is returned unchanged.

   [font], [fontsize]: used for the entries. The title is set at twice
                       [fontsize].
   [title]:            UTF8 title. The two-character sequence backslash, n
                       breaks it into several lines. No title is typeset when
                       it is the empty string.
   [bookmark]:         when true, a closed top-level bookmark carrying the title
                       and pointing at the first page of the result is added in
                       front of the existing bookmarks.
   [embedinfo]:        optional (pdf, fontfile, fontname, encoding). When given,
                       it is handed to Cpdfembed.embed_truetype together with
                       the codepoints used by the title and entries, and the
                       font which comes back is the one used for typesetting.

   The paper size is the MediaBox of the first page. The margin is 10% of the
   smaller of the MediaBox width and height. If the first page has a CropBox,
   the line width is that of the CropBox, the margins are measured from it, and
   it is copied to each new page.

   Each entry is indented by level, shortened with an ellipsis if it will not
   fit, and followed by the page label of its destination (a single space for a
   null destination; the decimal page number if no label is found).

   A label range with no numbering is added for the new pages, and existing
   label ranges are moved up by the number of new pages. *)
let typeset_table_of_contents ?embedinfo ~font ~fontsize ~title ~bookmark pdf =
  let marks = Pdfmarks.read_bookmarks pdf in
  if marks = [] then (Printf.eprintf "No bookmarks, not making table of contents\n%!"; pdf) else
  let big_fontsize = fontsize *. 2. in
  let firstpage = hd (Pdfpage.pages_of_pagetree pdf) in
  let width, firstpage_papersize, pmaxx, pmaxy, margin =
    let xmin, ymin, xmax, ymax = Pdf.parse_rectangle pdf firstpage.Pdfpage.mediabox in
    let width = xmax -. xmin in
    let height = ymax -. ymin in
      width, Pdfpaper.make Pdfunits.PdfPoint width height, xmax, ymax, fmin width height *. 0.1
  in
  let firstpage_cropbox =
    match Pdf.lookup_direct pdf "/CropBox" firstpage.Pdfpage.rest with
    | Some r -> Some (Pdf.parse_rectangle pdf r)
    | None -> None
  in
  (* Lines are laid out across the CropBox when there is one. *)
  let width =
    match firstpage_cropbox with
    | Some (xmin, _, xmax, _) -> xmax -. xmin
    | None -> width
  in
  let labels = Pdfpagelabels.read pdf in
  (* Collects every unicode codepoint in the title and entries, for embedding. *)
  let used = null_hash () in
  let lines =
    let refnums = Pdf.page_reference_numbers pdf in
    let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
    (* The same for every entry, so calculated once. *)
    let widths = Cpdftype.font_widths font fontsize in
    map
      (fun mark ->
         let indent = float mark.Pdfmarks.level *. fontsize *. 2. in
         let text = of_pdfdocencoding used font mark.Pdfmarks.text in
         let label =
           if mark.Pdfmarks.target = NullDestination then of_pdfdocencoding used font " " else
             let pde =
               let pnum = Pdfpage.pagenumber_of_target ~fastrefnums pdf mark.Pdfmarks.target in
                 try Pdfpagelabels.pagelabeltext_of_pagenumber pnum labels with Not_found -> string_of_int pnum
             in
               of_pdfdocencoding used font pde
         in
         (* Room for the entry text: what is left of the line after margins,
            indent and label. *)
         let textgap = width -. margin *. 2. -. indent -. Cpdftype.width_of_string widths label in
         (* Keep three font sizes clear, leaving room for any ellipsis. *)
         let text = shorten_text widths (textgap -. fontsize *. 3.) text in
         (* Glue between text and label, pushing the label to the right. *)
         let space = textgap -. Cpdftype.width_of_string widths text in
           [Cpdftype.BeginDest mark.Pdfmarks.target;
            Cpdftype.HGlue {Cpdftype.glen = indent; Cpdftype.gstretch = 0.};
            Cpdftype.Text text;
            Cpdftype.HGlue {Cpdftype.glen = space; Cpdftype.gstretch = 0.};
            Cpdftype.Text label;
            Cpdftype.EndDest;
            Cpdftype.NewLine])
      marks
  in
  let toc_pages =
    let title_elts =
      let glue = Cpdftype.VGlue {glen = fontsize *. 2.; gstretch = 0.} in
        if title = "" then [] else
          flatten
            (map
               (fun l -> [Cpdftype.Text l; Cpdftype.NewLine])
               (split_toc_title (of_utf8 used font title)))
          @ [glue]
    in
    let lm, rm, tm, bm =
      match firstpage_cropbox with
      | None -> (margin, margin, margin, margin)
      | Some (cminx, cminy, cmaxx, cmaxy) ->
          (cminx +. margin, (pmaxx -. cmaxx) +. margin, cminy +. margin, (pmaxy -. cmaxy) +. margin)
    in
    (* Must come after the entries and title have been converted, since that
       is what fills [used]. *)
    let codepoints = map fst (list_of_hashtbl used) in
    (* Diagnostic only, so it goes to standard error like the one above. *)
    Printf.eprintf "%i codes used\n%!" (length codepoints);
    let typeset_font =
      match embedinfo with
      | None -> font
      | Some (embed_pdf, fontfile, fontname, encoding) ->
          Cpdfembed.embed_truetype embed_pdf ~fontfile ~fontname ~codepoints ~encoding
    in
      Cpdftype.typeset lm rm tm bm firstpage_papersize pdf
        ([Cpdftype.Font (typeset_font, big_fontsize); Cpdftype.BeginDocument] @ title_elts @
         [Cpdftype.Font (typeset_font, fontsize)] @ flatten lines)
  in
  (* Give the new pages the CropBox of the first page, if it had one. *)
  let toc_pages =
    match firstpage_cropbox with
    | Some (a, b, c, d) ->
        let rect =
          Pdf.Array [Pdf.Real a; Pdf.Real b; Pdf.Real c; Pdf.Real d]
        in
          map
            (fun p -> {p with Pdfpage.rest = Pdf.add_dict_entry p.Pdfpage.rest "/CropBox" rect})
            toc_pages
    | None -> toc_pages
  in
  let original_pages = Pdfpage.pages_of_pagetree pdf in
  let toc_pages_len = length toc_pages in
  (* Each original page n is now page n + toc_pages_len. *)
  let changes = map (fun n -> (n, n + toc_pages_len)) (indx original_pages) in
  let pdf = Pdfpage.change_pages ~changes true pdf (toc_pages @ original_pages) in
  let label =
    {Pdfpagelabels.labelstyle = NoLabelPrefixOnly;
     Pdfpagelabels.labelprefix = None;
     Pdfpagelabels.startpage = 1;
     Pdfpagelabels.startvalue = 1}
  in
  let labels' = label::map (fun l -> {l with Pdfpagelabels.startpage = l.Pdfpagelabels.startpage + toc_pages_len}) labels in
  Pdfpagelabels.write pdf labels';
  if bookmark then
    begin
      (* Read again: these are the bookmarks of the document with the new
         pages in place. *)
      let marks = Pdfmarks.read_bookmarks pdf in
      let refnums = Pdf.page_reference_numbers pdf in
      let newmark =
        {Pdfmarks.level = 0;
         Pdfmarks.text = Pdftext.pdfdocstring_of_utf8 (implode (real_newline (explode title)));
         Pdfmarks.target = Pdfdest.XYZ (Pdfdest.PageObject (hd refnums), None, None, None);
         Pdfmarks.isopen = false}
      in
        Pdfmarks.add_bookmarks (newmark::marks) pdf
    end
  else
    pdf