cpdf-source/cpdftype.ml

355 lines
12 KiB
OCaml
Raw Permalink Normal View History

2021-11-18 20:09:09 +01:00
(* A typesetter for cpdf. A list of elements is manipulated zero or more times
to lay it out, paginate it, and so on. It is then typeset to produce a list
2021-12-14 13:57:27 +01:00
of pages.
For now, this is just an experiment for -table-of-contents and -typeset. To
be continued... *)
2023-07-12 16:26:27 +02:00
2021-11-20 00:21:37 +01:00
open Pdfutil
2021-11-18 20:09:09 +01:00
(* Main type *)
2021-11-18 23:48:25 +01:00
type element =
2021-12-02 01:11:33 +01:00
Text of char list (* charcodes 0..255 *)
2023-07-10 15:36:34 +02:00
| HGlue of float
| VGlue of float
2021-11-18 20:09:09 +01:00
| NewLine
| NewPage
2023-07-20 13:56:20 +02:00
| Font of string * Pdftext.font * float
2021-11-20 01:28:13 +01:00
| BeginDest of Pdfdest.t
| EndDest
2021-12-14 15:07:52 +01:00
| BeginDocument
| Tag of string * int
| EndTag
2021-11-18 23:48:25 +01:00
2021-11-21 23:15:06 +01:00
let to_string_elt = function
2021-12-02 01:11:33 +01:00
| Text t -> implode t
2023-07-10 15:36:34 +02:00
| HGlue len -> "HGLUE" ^ string_of_float len
2021-11-18 23:48:25 +01:00
| VGlue _ -> "VGLUE"
| NewLine -> "NewLine"
| NewPage -> "NewPage"
| Font _ -> "Font"
2021-11-20 01:28:13 +01:00
| BeginDest _ -> "BeginDest"
| EndDest -> "EndDest"
2021-12-14 15:07:52 +01:00
| BeginDocument -> "BeginDocument"
| Tag (s, i) -> "Tag " ^ s ^ " " ^ string_of_int i
| EndTag -> "EndTag"
2021-11-18 20:09:09 +01:00
2021-11-21 23:15:06 +01:00
let to_string es = fold_left (fun a b -> a ^ "\n" ^ b) "" (map to_string_elt es)
2021-11-18 23:48:25 +01:00
type t = element list
2021-11-18 20:09:09 +01:00
type state =
2021-11-18 23:48:25 +01:00
{mutable font : Pdftext.font option;
2023-07-20 13:56:20 +02:00
mutable fontid : string option;
2021-11-20 00:21:37 +01:00
mutable fontsize : float;
mutable width_table : float array; (* Widths for charcodes 0..255 *)
2021-11-18 20:09:09 +01:00
mutable xpos : float;
2021-11-20 01:28:13 +01:00
mutable ypos : float;
mutable dest : Pdfdest.t option}
2021-11-18 20:09:09 +01:00
2023-07-20 13:56:20 +02:00
let width_table_cache = null_hash ()
2021-11-18 20:09:09 +01:00
let initial_state () =
2021-11-18 23:48:25 +01:00
{font = None;
2023-07-20 13:56:20 +02:00
fontid = None;
2021-11-20 00:21:37 +01:00
fontsize = 0.;
width_table = [||];
2021-11-18 20:09:09 +01:00
xpos = 0.;
2021-11-20 01:28:13 +01:00
ypos = 0.;
dest = None}
2021-11-18 20:09:09 +01:00
(* Mark as an artifact anything not already marked. *)
let add_artifacts ops =
let content = ref false in
let artifact = ref false in
let rec loop a = function
| [] ->
(* The end. Must end artifact if in artifact. *)
if !artifact then rev (Pdfops.Op_EMC::a) else rev a
| Pdfops.Op_BMC "/BeginArtifact"::t ->
(* Convert back-channel artifact beginning. *)
set artifact;
loop (Pdfops.Op_BMC "/Artifact"::a) t
| Pdfops.Op_BMC "/EndArtifact"::t ->
(* Convert back-channel artifact ending. *)
clear artifact;
loop (Pdfops.Op_EMC::a) t
| Pdfops.Op_BDC _ as h::t ->
(* Entering content. If in artifact, must end artifact. *)
let a' = if !artifact then h::Pdfops.Op_EMC::a else h::a in
set content; clear artifact; loop a' t
| Pdfops.Op_EMC as h::t ->
(* Exiting content. *)
clear content;
loop (h::a) t
| h::t ->
(* A normal operation. If not in content or artifact must start artifact. *)
let a' =
if not (!content || !artifact) then (set artifact; h::Pdfops.Op_BMC "/Artifact"::a) else h::a
in
loop a' t
in
loop [] ops
2023-07-20 13:56:20 +02:00
let font_widths id f fontsize =
match Hashtbl.find width_table_cache (id, fontsize) with
| x -> x
| exception Not_found ->
let newtable =
match f with
| Pdftext.StandardFont (sf, encoding) ->
Array.init
256
(fun x ->
fontsize
*. float_of_int
(Pdfstandard14.textwidth false encoding sf (string_of_char (char_of_int x)))
/. 1000.)
| Pdftext.SimpleFont {fontmetrics = Some m} ->
Array.map (fun x -> fontsize *. x /. 1000. ) m
| _ -> raise (Pdf.PDFError "Cpdftype: Unsupported font")
in
Hashtbl.add width_table_cache (id, fontsize) newtable;
newtable
2021-11-18 23:48:25 +01:00
2021-12-02 01:20:39 +01:00
let width_of_string ws s =
let w = ref 0. in
iter (fun s -> w := !w +. ws.(int_of_char s)) s;
!w
2021-11-20 01:28:13 +01:00
(* For now, split each text element into words, and lay them out ragged right
on one long page. Words longer than a whole line just fall off the margin.
Turn text newlines into real newlines. *)
2021-11-22 23:44:14 +01:00
(* Split into words on spaces. Find how many words (at least one, to make
progress) fit into the available space. We set needs_newline if the next
word would overflow. Return (text, needs_newline, remaining_text) *)
let split_text space_left widths t =
2021-12-02 01:11:33 +01:00
let chars = ref t in
2021-11-22 23:44:14 +01:00
let words = ref [] in
let space_left = ref space_left in
let return needs_newline =
2021-12-02 01:11:33 +01:00
(flatten (rev !words), needs_newline, !chars)
2021-11-22 23:44:14 +01:00
in
try
while !chars <> [] do
let word, rest = cleavewhile (neq ' ') !chars in
let w = width_of_string widths word in
if !words = [] || w < !space_left
2021-11-23 23:31:16 +01:00
then
let is_last_word = rest = [] in
let new_word = if is_last_word then word else word @ [' '] in
begin
words := new_word::!words;
space_left := !space_left -. w -. (if is_last_word then 0. else width_of_string widths [' '])
end
2021-11-22 23:44:14 +01:00
else raise Exit;
chars := if rest = [] then [] else tl rest;
done;
return false
with
Exit -> return true
2021-11-20 01:28:13 +01:00
2021-11-18 20:09:09 +01:00
let layout lmargin rmargin papersize i =
2023-04-11 14:50:17 +02:00
let width = Pdfunits.points (Pdfpaper.width papersize) (Pdfpaper.unit papersize) in
2021-11-19 00:23:38 +01:00
let o = ref [] in
2021-11-18 23:48:25 +01:00
let s = initial_state () in
2021-11-20 00:21:37 +01:00
let xpos_max = width -. lmargin in
2021-11-19 00:23:38 +01:00
s.xpos <- lmargin;
2021-11-20 00:21:37 +01:00
let rec layout_element = function
2023-07-20 13:56:20 +02:00
| Font (id, f, fontsize) ->
s.width_table <- font_widths id f fontsize;
o := Font (id, f, fontsize) :: !o
2021-11-20 00:21:37 +01:00
| Text text ->
2021-12-02 01:11:33 +01:00
if text = [] then () else
2021-11-22 23:44:14 +01:00
begin
let this_line, needs_newline, remaining_text =
split_text (xpos_max -. s.xpos) s.width_table text
in
o := Text this_line :: !o;
2021-12-02 01:11:33 +01:00
s.xpos <- s.xpos +. width_of_string s.width_table this_line;
2021-11-22 23:44:14 +01:00
if needs_newline then layout_element NewLine;
2021-12-02 01:11:33 +01:00
if remaining_text <> [] then layout_element (Text remaining_text)
2021-11-22 23:44:14 +01:00
end
2023-07-10 15:36:34 +02:00
| HGlue len as glue ->
s.xpos <- s.xpos +. len;
2021-11-20 00:21:37 +01:00
o := glue :: !o;
if s.xpos >= xpos_max then layout_element NewLine
2021-11-22 02:27:51 +01:00
| NewLine ->
s.xpos <- lmargin;
o := NewLine :: !o
| x ->
o := x :: !o
2021-11-20 00:21:37 +01:00
in
iter layout_element i;
2021-11-19 00:23:38 +01:00
rev !o
2021-11-18 20:09:09 +01:00
(* Paginate, simply line-based. When ypos + lineheight exceeds max_ypos, we
insert a page break. In addition, we re-write any paragraph tag/endtag to
make sure they appear on both pages. *)
2021-11-23 00:43:45 +01:00
let paginate tmargin bmargin papersize i =
2023-04-11 14:50:17 +02:00
let height = Pdfunits.points (Pdfpaper.height papersize) (Pdfpaper.unit papersize) in
2021-11-23 00:43:45 +01:00
let o = ref [] in
let s = initial_state () in
s.ypos <- tmargin;
2021-12-06 01:25:50 +01:00
let max_ypos = height -. bmargin in
let tag = ref None in
2021-11-23 00:43:45 +01:00
let rec process = function
2023-07-10 15:36:34 +02:00
| VGlue len as glue ->
s.ypos <- s.ypos +. len;
2021-11-23 00:43:45 +01:00
o := glue :: !o;
if s.ypos > max_ypos then process NewPage
| NewLine ->
s.ypos <- s.ypos +. s.fontsize *. 1.3;
o := NewLine::!o;
if s.ypos > max_ypos then process NewPage
2023-07-20 13:56:20 +02:00
| Font (id, f, fs) ->
2021-11-23 00:43:45 +01:00
s.font <- Some f;
2023-07-20 13:56:20 +02:00
s.fontid <- Some id;
2021-11-23 00:43:45 +01:00
s.fontsize <- fs;
2023-07-20 13:56:20 +02:00
o := Font (id, f, fs)::!o
2021-11-23 00:43:45 +01:00
| NewPage ->
2021-12-14 15:19:58 +01:00
s.ypos <- tmargin +. s.fontsize;
begin match !tag with Some (s, i) -> o := EndTag::!o | None -> () end;
o := NewPage::!o;
begin match !tag with Some (s, i) -> o := Tag (s, i)::!o | None -> () end
2021-12-14 15:19:58 +01:00
| BeginDocument ->
s.ypos <- tmargin +. s.fontsize;
o := BeginDocument::!o
| Tag (s, i) ->
tag := Some (s, i);
o := Tag (s, i)::!o
| EndTag ->
tag := None;
o := EndTag::!o
2021-11-23 00:43:45 +01:00
| x -> o := x::!o
in
iter process i;
rev !o
2021-11-18 20:09:09 +01:00
2021-11-18 23:48:25 +01:00
let make_resources fontobjnums =
Pdf.Dictionary
2022-09-27 21:10:19 +02:00
[("/Font", Pdf.Dictionary (map (fun fo -> ("/F" ^ string_of_int fo, Pdf.Indirect fo)) (setify fontobjnums)))]
2021-11-18 23:48:25 +01:00
2023-07-19 15:21:25 +02:00
let make_annotations pdf annots =
2021-11-23 23:20:38 +01:00
if annots = [] then Pdf.Dictionary [] else
2023-07-19 15:21:25 +02:00
Pdf.Dictionary ["/Annots", Pdf.Array (map (function a -> Pdf.Indirect (Pdf.addobj pdf a)) annots)]
2021-11-23 23:20:38 +01:00
2024-10-02 17:41:51 +02:00
let rec number_tags n = function
| Tag (s, _)::t -> Tag (s, n)::number_tags (n + 1) t
| h::t -> h::number_tags n t
| [] -> []
2021-11-18 23:48:25 +01:00
(* At this stage, just Font and Text and HGlue 0. and VGlue 0. and Newline and
NewPage elements. Split on NewPages, typeset each page, add font
2023-07-19 15:21:25 +02:00
dictionaries. New page only creates a page when that page has content. *)
let typeset ~process_struct_tree lmargin rmargin tmargin bmargin papersize pdf i =
2023-07-20 13:56:20 +02:00
Hashtbl.clear width_table_cache;
2021-11-23 23:31:16 +01:00
let debug = false in
if debug then (print_endline "***input:\n\n"; print_endline (to_string i));
2024-10-03 17:18:09 +02:00
let i = number_tags 0 i in
2021-11-18 23:48:25 +01:00
let i = layout lmargin rmargin papersize i in
2021-11-23 23:31:16 +01:00
if debug then (print_endline "***after layout:\n\n"; print_endline (to_string i));
2021-11-18 23:48:25 +01:00
let i = paginate tmargin bmargin papersize i in
2021-11-23 23:31:16 +01:00
if debug then (print_endline "***after pagination:\n\n"; print_endline (to_string i));
2023-04-11 14:50:17 +02:00
let height = Pdfunits.points (Pdfpaper.height papersize) (Pdfpaper.unit papersize) in
2021-11-18 23:48:25 +01:00
let s = initial_state () in
s.xpos <- lmargin;
2021-11-19 00:23:38 +01:00
s.ypos <- tmargin;
2024-10-02 16:26:30 +02:00
let mcidr = ref ~-1 in
let mcid () = incr mcidr; !mcidr in
2021-11-18 23:48:25 +01:00
let ops = ref [] in
let fonts = ref [] in
let thispagefontnums = ref [] in
2021-11-23 23:20:38 +01:00
let thispageannotations = ref [] in
let thisdestrectangles = ref [] in
2021-11-18 23:48:25 +01:00
let pages = ref [] in
2024-10-03 13:32:26 +02:00
let tags = ref [] in
let tagsout = ref [] in
2021-11-18 23:48:25 +01:00
let write_page () =
2024-10-02 14:54:23 +02:00
let ops = if process_struct_tree then add_artifacts (rev !ops) else rev !ops in
let page =
2024-10-02 14:54:23 +02:00
{Pdfpage.content = if ops = [] then [] else [Pdfops.stream_of_ops ops];
Pdfpage.mediabox = Pdfpage.rectangle_of_paper papersize;
Pdfpage.resources = make_resources !thispagefontnums;
Pdfpage.rotate = Pdfpage.Rotate0;
Pdfpage.rest = make_annotations pdf !thispageannotations}
in
2024-10-03 13:32:26 +02:00
pages := page::!pages;
2024-10-03 17:18:09 +02:00
tagsout := rev !tags::!tagsout;
tags := []
2021-11-18 23:48:25 +01:00
in
2021-11-20 00:21:37 +01:00
let rec typeset_element = function
2021-11-18 23:48:25 +01:00
| Text cps ->
2021-11-20 00:21:37 +01:00
ops :=
Pdfops.Op_Q
::Pdfops.Op_ET
2021-12-02 01:11:33 +01:00
::Pdfops.Op_Tj (implode cps)
2021-11-20 00:21:37 +01:00
::Pdfops.Op_BT
::Pdfops.Op_cm (Pdftransform.mktranslate s.xpos (height -. s.ypos))
::Pdfops.Op_q
2021-11-23 23:20:38 +01:00
::!ops;
2021-11-20 01:28:13 +01:00
(* If a destination, add the rectangle to the pile of rectangles for this annotation. *)
2021-11-23 23:20:38 +01:00
if s.dest <> None then
2021-12-07 00:33:18 +01:00
begin
2021-11-23 23:20:38 +01:00
let minx, miny = s.xpos, height -. s.ypos in
2021-12-07 00:33:18 +01:00
thisdestrectangles := (minx, miny, minx +. width_of_string s.width_table cps, miny +. s.fontsize)::!thisdestrectangles
end;
s.xpos <- s.xpos +. width_of_string s.width_table cps
2023-07-20 13:56:20 +02:00
| Font (id, f, fontsize) ->
2021-11-18 23:48:25 +01:00
let name, objnum =
2023-07-20 13:56:20 +02:00
match List.assoc_opt id !fonts with
2021-11-18 23:48:25 +01:00
| Some objnum -> ("/F" ^ string_of_int objnum, objnum)
| None ->
let num = Pdftext.write_font pdf f in
let n = "/F" ^ string_of_int num in
2023-07-20 13:56:20 +02:00
fonts := (id, num) :: !fonts;
2021-11-18 23:48:25 +01:00
(n, num)
in
2023-07-20 13:56:20 +02:00
s.width_table <- font_widths id f fontsize;
2021-11-18 23:48:25 +01:00
s.font <- Some f;
2023-07-20 13:56:20 +02:00
s.fontid <- Some id;
2021-11-20 00:21:37 +01:00
s.fontsize <- fontsize;
2021-11-18 23:48:25 +01:00
thispagefontnums := objnum :: !thispagefontnums;
ops := Pdfops.Op_Tf (name, fontsize)::!ops
2023-07-10 15:36:34 +02:00
| HGlue len ->
s.xpos <- s.xpos +. len
| VGlue len ->
s.ypos <- s.ypos +. len
2021-11-18 23:48:25 +01:00
| NewLine ->
2021-11-20 00:21:37 +01:00
s.xpos <- lmargin;
2023-07-10 15:36:34 +02:00
typeset_element (VGlue (s.fontsize *. 1.3))
2021-11-18 23:48:25 +01:00
| NewPage ->
write_page ();
2021-11-19 01:41:42 +01:00
thispagefontnums := [];
2021-11-23 23:20:38 +01:00
thispageannotations := [];
2021-11-18 23:48:25 +01:00
ops := [];
2024-10-02 16:26:30 +02:00
mcidr := -1;
2023-07-20 13:56:20 +02:00
if s.font <> None && s.fontid <> None then typeset_element (Font (unopt s.fontid, unopt s.font, s.fontsize));
2021-11-19 01:41:42 +01:00
s.xpos <- lmargin;
2021-12-14 15:07:52 +01:00
s.ypos <- tmargin +. s.fontsize
| BeginDocument ->
s.ypos <- tmargin +. s.fontsize
2021-11-20 01:28:13 +01:00
| BeginDest dest ->
s.dest <- Some dest
| EndDest ->
2021-11-23 23:20:38 +01:00
if !thisdestrectangles <> [] && s.dest <> None then
let annot (minx, miny, maxx, maxy) =
Pdf.Dictionary
[("/Type", Pdf.Name "/Annot");
("/Subtype", Pdf.Name "/Link");
2021-11-23 23:31:16 +01:00
("/Border", Pdf.Array [Pdf.Real 0.; Pdf.Real 0.; Pdf.Real 0.]);
2021-11-23 23:20:38 +01:00
("/Rect", Pdf.Array [Pdf.Real minx; Pdf.Real miny; Pdf.Real maxx; Pdf.Real maxy]);
("/Dest", Pdfdest.pdfobject_of_destination (unopt s.dest))]
in
thispageannotations := map annot !thisdestrectangles @ !thispageannotations;
s.dest <- None;
thisdestrectangles := []
2024-10-03 13:32:26 +02:00
| Tag (s, i) ->
tags := (s, i)::!tags;
ops := Pdfops.Op_BDC ("/" ^ s, Pdf.Dictionary [("/MCID", Pdf.Integer (mcid ()))])::!ops
2024-10-02 16:26:30 +02:00
| EndTag -> ops := Pdfops.Op_EMC::!ops
2021-11-18 23:48:25 +01:00
in
iter typeset_element i;
write_page ();
2024-10-03 13:32:26 +02:00
(rev !pages, rev !tagsout)