From b289d0e5e09cbf81e8e7d4bb5f8f238e6438a5d7 Mon Sep 17 00:00:00 2001
From: John Whitington <john@coherentgraphics.co.uk>
Date: Fri, 13 Sep 2024 15:49:09 +0100
Subject: [PATCH] Fixes for format_paragraph width calculation

---
 cpdfdraw.ml | 91 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 51 insertions(+), 40 deletions(-)

diff --git a/cpdfdraw.ml b/cpdfdraw.ml
index d6f0508..0d54911 100644
--- a/cpdfdraw.ml
+++ b/cpdfdraw.ml
@@ -148,13 +148,38 @@ let process_specials pdf endpage filename bates batespad num page s =
   in
     Cpdfaddtext.process_text (res ()).time s pairs
 
+(* FIXME: cache these width tables (needed just for paragraphs) *)
+let font_widths f fontsize =
+  match f with
+  | Pdftext.StandardFont (sf, encoding) ->
+      Array.init
+        256
+        (fun x ->
+             fontsize
+          *. float_of_int
+               (Pdfstandard14.textwidth false encoding sf (string_of_char (char_of_int x)))
+          /. 1000.)
+  | Pdftext.SimpleFont {fontmetrics = Some m} ->
+      Array.map (fun x -> fontsize *. x /. 1000. ) m
+  | _ -> raise (Pdf.PDFError "Cpdfdraw: Unsupported font")
+
 let runs_of_utf8 s =
   let identifier, fontpack = (res ()).current_fontpack in
   let codepoints = Pdftext.codepoints_of_utf8 s in
   let triples = option_map (Cpdfembed.get_char fontpack) codepoints in
   let collated = Cpdfembed.collate_runs triples in
+  (* FIXME Efficiency: font_widths is rebuilt for every character; use runs, caching *)
+  let w =
+    fold_left ( +. ) 0.
+      (map
+        (fun (charcode, _, font) ->
+           let widths = font_widths font (res ()).font_size in
+             widths.(charcode))
+        triples)
+  in
+  let output =
     flatten
-     (map
+      (map
        (fun l ->
          if l = [] then [] else
            let f, n = match l with (_, n, f)::_ -> f, n | _ -> assert false in
@@ -162,7 +187,9 @@ let runs_of_utf8 s =
            let charcodes = map (fun (c, _, _) -> char_of_int c) l in
              [Pdfops.Op_Tf (fontname, (res ()).font_size);
               Pdfops.Op_Tj (implode charcodes)])
-      collated)
+       collated)
+  in
+    (output, w)
 
 let extgstate kind v =
   try Hashtbl.find (res ()).extgstates (kind, v) with
@@ -176,38 +203,6 @@ let read_resource pdf n res =
   | Some (Pdf.Dictionary d) -> d
   | _ -> []
 
-(* TODO Stolen from Cpdftype. Can these be reunited some day? *)
-let width_of_string ws s =
-  let w = ref 0. in
-    iter (fun s -> w := !w +. ws.(int_of_char s)) s;
-    !w
-
-let split_text space_left widths t =
-  let chars = ref t in
-  let words = ref [] in
-  let space_left = ref space_left in
-  let return needs_newline =
-    (flatten (rev !words), needs_newline, !chars)
-  in
-    try
-      while !chars <> [] do
-        let word, rest = cleavewhile (neq ' ') !chars in
-          let w = width_of_string widths word in 
-          if !words = [] || w < !space_left
-            then
-              let is_last_word = rest = [] in
-              let new_word = if is_last_word then word else word @ [' '] in
-                begin
-                  words := new_word::!words;
-                  space_left := !space_left -. w -. (if is_last_word then 0. else width_of_string widths [' '])
-                end
-            else raise Exit;
-          chars := if rest = [] then [] else tl rest;
-      done;
-      return false
-    with
-      Exit -> return true
-
 let update_resources pdf old_resources =
   let gss_resources = map (fun ((kind, v), n) -> (n, Pdf.Dictionary [(kind, Pdf.Real v)])) (list_of_hashtbl (res ()).extgstates) in
   let select_resources t =
@@ -243,6 +238,24 @@ type structdata =
 
 let structdata = ref []
 
+(* TODO: Use Uuseg for proper Unicode segmentation. *)
+let format_paragraph j w s =
+  (* 1. Split on word boundaries *)
+  let ss = String.split_on_char ' ' s in
+  (* 2. Calculate the runs for each word *)
+  let rs_and_widths = ref (map runs_of_utf8 ss) in
+  (* 3. Calculate runs for a space *)
+  let space_runs, space_width = runs_of_utf8 " " in
+  (* 4. Now we may find the sections imperatively. *)
+  let remaining = ref w in
+  let lines = ref [] in
+    while !rs_and_widths <> [] do
+      (* 5. Calculate lines *)
+      ()
+    done;
+  (* 6. Now apply justification, and convert lines to final output. *)
+    []
+
 let rec ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num page = function
   | Qq ops ->
       [Pdfops.Op_q] @ ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page ops @ [Pdfops.Op_Q]
@@ -358,13 +371,14 @@ let rec ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num
         @ (if struct_tree then [Pdfops.Op_EMC] else [])
   | Text s ->
       if dryrun then iter (fun c -> Hashtbl.replace (res ()).current_fontpack_codepoints c ()) (Pdftext.codepoints_of_utf8 s);
-      runs_of_utf8 s
+      fst (runs_of_utf8 s)
   | SpecialText s ->
       let s = process_specials pdf endpage filename bates batespad num page s in
       if dryrun then iter (fun c -> Hashtbl.replace (res ()).current_fontpack_codepoints c ()) (Pdftext.codepoints_of_utf8 s);
-        runs_of_utf8 s
+        fst (runs_of_utf8 s)
   | Para (j, w, s) ->
-      ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page (format_paragraph j w s)
+      if dryrun then iter (fun c -> Hashtbl.replace (res ()).current_fontpack_codepoints c ()) (Pdftext.codepoints_of_utf8 s);
+      format_paragraph j w s
   | Leading f -> [Pdfops.Op_TL f]
   | CharSpace f -> [Pdfops.Op_Tc f]
   | WordSpace f -> [Pdfops.Op_Tw f]
@@ -373,9 +387,6 @@ let rec ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num
   | Rise f -> [Pdfops.Op_Ts f]
   | Newline -> [Pdfops.Op_T']
 
-(* TODO: Use Uuseg for proper unicode segmentation. *)
-and format_paragraph j w s =
-  [Text s]
 
 and ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page drawops =
   flatten (map (ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num page) drawops)