From b289d0e5e09cbf81e8e7d4bb5f8f238e6438a5d7 Mon Sep 17 00:00:00 2001 From: John Whitington Date: Fri, 13 Sep 2024 15:49:09 +0100 Subject: [PATCH] Fixes for format_paragraph width calculation --- cpdfdraw.ml | 91 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 51 insertions(+), 40 deletions(-) diff --git a/cpdfdraw.ml b/cpdfdraw.ml index d6f0508..0d54911 100644 --- a/cpdfdraw.ml +++ b/cpdfdraw.ml @@ -148,13 +148,38 @@ let process_specials pdf endpage filename bates batespad num page s = in Cpdfaddtext.process_text (res ()).time s pairs +(* FIXME cache (just for paragraph) *) +let font_widths f fontsize = + match f with + | Pdftext.StandardFont (sf, encoding) -> + Array.init + 256 + (fun x -> + fontsize + *. float_of_int + (Pdfstandard14.textwidth false encoding sf (string_of_char (char_of_int x))) + /. 1000.) + | Pdftext.SimpleFont {fontmetrics = Some m} -> + Array.map (fun x -> fontsize *. x /. 1000. ) m + | _ -> raise (Pdf.PDFError "Cpdfdraw: Unsupported font") + let runs_of_utf8 s = let identifier, fontpack = (res ()).current_fontpack in let codepoints = Pdftext.codepoints_of_utf8 s in let triples = option_map (Cpdfembed.get_char fontpack) codepoints in let collated = Cpdfembed.collate_runs triples in + (* FIXME Efficiency: runs, cacheing *) + let w = + fold_left ( +. ) 0. + (map + (fun (charcode, _, font) -> + let widths = font_widths font (res ()).font_size in + widths.(charcode)) + triples) + in + let output = flatten - (map + (map (fun l -> if l = [] then [] else let f, n = match l with (_, n, f)::_ -> f, n | _ -> assert false in @@ -162,7 +187,9 @@ let runs_of_utf8 s = let charcodes = map (fun (c, _, _) -> char_of_int c) l in [Pdfops.Op_Tf (fontname, (res ()).font_size); Pdfops.Op_Tj (implode charcodes)]) - collated) + collated) + in + (output, w) let extgstate kind v = try Hashtbl.find (res ()).extgstates (kind, v) with @@ -176,38 +203,6 @@ let read_resource pdf n res = | Some (Pdf.Dictionary d) -> d | _ -> [] -(* TODO Stolen from Cpdftype. Can these be reunited some day? *) -let width_of_string ws s = - let w = ref 0. in - iter (fun s -> w := !w +. ws.(int_of_char s)) s; - !w - -let split_text space_left widths t = - let chars = ref t in - let words = ref [] in - let space_left = ref space_left in - let return needs_newline = - (flatten (rev !words), needs_newline, !chars) - in - try - while !chars <> [] do - let word, rest = cleavewhile (neq ' ') !chars in - let w = width_of_string widths word in - if !words = [] || w < !space_left - then - let is_last_word = rest = [] in - let new_word = if is_last_word then word else word @ [' '] in - begin - words := new_word::!words; - space_left := !space_left -. w -. (if is_last_word then 0. else width_of_string widths [' ']) - end - else raise Exit; - chars := if rest = [] then [] else tl rest; - done; - return false - with - Exit -> return true - let update_resources pdf old_resources = let gss_resources = map (fun ((kind, v), n) -> (n, Pdf.Dictionary [(kind, Pdf.Real v)])) (list_of_hashtbl (res ()).extgstates) in let select_resources t = @@ -243,6 +238,24 @@ type structdata = let structdata = ref [] +(* TODO: Use Uuseg for proper unicode segmentation. *) +let format_paragraph j w s = + (* 1. Split on word boundaries *) + let ss = String.split_on_char ' ' s in + (* 2. Calculate the runs for each word *) + let rs_and_widths = ref (map runs_of_utf8 ss) in + (* 3. Calculate runs for a space *) + let space_runs, space_width = runs_of_utf8 " " in + (* 4. Now we may find the sections imperatively. *) + let remaining = ref w in + let lines = ref [] in + while !rs_and_widths <> [] do + (* 5. Calculate lines *) + () + done; + (* 6. Now apply justification, and convert lines to final output. *) + [] + let rec ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num page = function | Qq ops -> [Pdfops.Op_q] @ ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page ops @ [Pdfops.Op_Q] @@ -358,13 +371,14 @@ let rec ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num @ (if struct_tree then [Pdfops.Op_EMC] else []) | Text s -> if dryrun then iter (fun c -> Hashtbl.replace (res ()).current_fontpack_codepoints c ()) (Pdftext.codepoints_of_utf8 s); - runs_of_utf8 s + fst (runs_of_utf8 s) | SpecialText s -> let s = process_specials pdf endpage filename bates batespad num page s in if dryrun then iter (fun c -> Hashtbl.replace (res ()).current_fontpack_codepoints c ()) (Pdftext.codepoints_of_utf8 s); - runs_of_utf8 s + fst (runs_of_utf8 s) | Para (j, w, s) -> - ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page (format_paragraph j w s) + if dryrun then iter (fun c -> Hashtbl.replace (res ()).current_fontpack_codepoints c ()) (Pdftext.codepoints_of_utf8 s); + format_paragraph j w s | Leading f -> [Pdfops.Op_TL f] | CharSpace f -> [Pdfops.Op_Tc f] | WordSpace f -> [Pdfops.Op_Tw f] @@ -373,9 +387,6 @@ let rec ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num | Rise f -> [Pdfops.Op_Ts f] | Newline -> [Pdfops.Op_T'] -(* TODO: Use Uuseg for proper unicode segmentation. *) -and format_paragraph j w s = - [Text s] and ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page drawops = flatten (map (ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num page) drawops)