Fixes for format_paragraph width calculation

This commit is contained in:
John Whitington 2024-09-13 15:49:09 +01:00
parent c026f03626
commit b289d0e5e0
1 changed files with 51 additions and 40 deletions

View File

@ -148,11 +148,36 @@ let process_specials pdf endpage filename bates batespad num page s =
in in
Cpdfaddtext.process_text (res ()).time s pairs Cpdfaddtext.process_text (res ()).time s pairs
(* FIXME cache (just for paragraph) *)
let font_widths f fontsize =
match f with
| Pdftext.StandardFont (sf, encoding) ->
Array.init
256
(fun x ->
fontsize
*. float_of_int
(Pdfstandard14.textwidth false encoding sf (string_of_char (char_of_int x)))
/. 1000.)
| Pdftext.SimpleFont {fontmetrics = Some m} ->
Array.map (fun x -> fontsize *. x /. 1000. ) m
| _ -> raise (Pdf.PDFError "Cpdfdraw: Unsupported font")
let runs_of_utf8 s = let runs_of_utf8 s =
let identifier, fontpack = (res ()).current_fontpack in let identifier, fontpack = (res ()).current_fontpack in
let codepoints = Pdftext.codepoints_of_utf8 s in let codepoints = Pdftext.codepoints_of_utf8 s in
let triples = option_map (Cpdfembed.get_char fontpack) codepoints in let triples = option_map (Cpdfembed.get_char fontpack) codepoints in
let collated = Cpdfembed.collate_runs triples in let collated = Cpdfembed.collate_runs triples in
(* FIXME Efficiency: runs, cacheing *)
let w =
fold_left ( +. ) 0.
(map
(fun (charcode, _, font) ->
let widths = font_widths font (res ()).font_size in
widths.(charcode))
triples)
in
let output =
flatten flatten
(map (map
(fun l -> (fun l ->
@ -163,6 +188,8 @@ let runs_of_utf8 s =
[Pdfops.Op_Tf (fontname, (res ()).font_size); [Pdfops.Op_Tf (fontname, (res ()).font_size);
Pdfops.Op_Tj (implode charcodes)]) Pdfops.Op_Tj (implode charcodes)])
collated) collated)
in
(output, w)
let extgstate kind v = let extgstate kind v =
try Hashtbl.find (res ()).extgstates (kind, v) with try Hashtbl.find (res ()).extgstates (kind, v) with
@ -176,38 +203,6 @@ let read_resource pdf n res =
| Some (Pdf.Dictionary d) -> d | Some (Pdf.Dictionary d) -> d
| _ -> [] | _ -> []
(* TODO Stolen from Cpdftype. Can these be reunited some day? *)
let width_of_string ws s =
let w = ref 0. in
iter (fun s -> w := !w +. ws.(int_of_char s)) s;
!w
let split_text space_left widths t =
let chars = ref t in
let words = ref [] in
let space_left = ref space_left in
let return needs_newline =
(flatten (rev !words), needs_newline, !chars)
in
try
while !chars <> [] do
let word, rest = cleavewhile (neq ' ') !chars in
let w = width_of_string widths word in
if !words = [] || w < !space_left
then
let is_last_word = rest = [] in
let new_word = if is_last_word then word else word @ [' '] in
begin
words := new_word::!words;
space_left := !space_left -. w -. (if is_last_word then 0. else width_of_string widths [' '])
end
else raise Exit;
chars := if rest = [] then [] else tl rest;
done;
return false
with
Exit -> return true
let update_resources pdf old_resources = let update_resources pdf old_resources =
let gss_resources = map (fun ((kind, v), n) -> (n, Pdf.Dictionary [(kind, Pdf.Real v)])) (list_of_hashtbl (res ()).extgstates) in let gss_resources = map (fun ((kind, v), n) -> (n, Pdf.Dictionary [(kind, Pdf.Real v)])) (list_of_hashtbl (res ()).extgstates) in
let select_resources t = let select_resources t =
@ -243,6 +238,24 @@ type structdata =
let structdata = ref [] let structdata = ref []
(* TODO: Use Uuseg for proper unicode segmentation. *)
let format_paragraph j w s =
(* 1. Split on word boundaries *)
let ss = String.split_on_char ' ' s in
(* 2. Calculate the runs for each word *)
let rs_and_widths = ref (map runs_of_utf8 ss) in
(* 3. Calculate runs for a space *)
let space_runs, space_width = runs_of_utf8 " " in
(* 4. Now we may find the sections imperatively. *)
let remaining = ref w in
let lines = ref [] in
while !rs_and_widths <> [] do
(* 5. Calculate lines *)
()
done;
(* 6. Now apply justification, and convert lines to final output. *)
[]
let rec ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num page = function let rec ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num page = function
| Qq ops -> | Qq ops ->
[Pdfops.Op_q] @ ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page ops @ [Pdfops.Op_Q] [Pdfops.Op_q] @ ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page ops @ [Pdfops.Op_Q]
@ -358,13 +371,14 @@ let rec ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num
@ (if struct_tree then [Pdfops.Op_EMC] else []) @ (if struct_tree then [Pdfops.Op_EMC] else [])
| Text s -> | Text s ->
if dryrun then iter (fun c -> Hashtbl.replace (res ()).current_fontpack_codepoints c ()) (Pdftext.codepoints_of_utf8 s); if dryrun then iter (fun c -> Hashtbl.replace (res ()).current_fontpack_codepoints c ()) (Pdftext.codepoints_of_utf8 s);
runs_of_utf8 s fst (runs_of_utf8 s)
| SpecialText s -> | SpecialText s ->
let s = process_specials pdf endpage filename bates batespad num page s in let s = process_specials pdf endpage filename bates batespad num page s in
if dryrun then iter (fun c -> Hashtbl.replace (res ()).current_fontpack_codepoints c ()) (Pdftext.codepoints_of_utf8 s); if dryrun then iter (fun c -> Hashtbl.replace (res ()).current_fontpack_codepoints c ()) (Pdftext.codepoints_of_utf8 s);
runs_of_utf8 s fst (runs_of_utf8 s)
| Para (j, w, s) -> | Para (j, w, s) ->
ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page (format_paragraph j w s) if dryrun then iter (fun c -> Hashtbl.replace (res ()).current_fontpack_codepoints c ()) (Pdftext.codepoints_of_utf8 s);
format_paragraph j w s
| Leading f -> [Pdfops.Op_TL f] | Leading f -> [Pdfops.Op_TL f]
| CharSpace f -> [Pdfops.Op_Tc f] | CharSpace f -> [Pdfops.Op_Tc f]
| WordSpace f -> [Pdfops.Op_Tw f] | WordSpace f -> [Pdfops.Op_Tw f]
@ -373,9 +387,6 @@ let rec ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num
| Rise f -> [Pdfops.Op_Ts f] | Rise f -> [Pdfops.Op_Ts f]
| Newline -> [Pdfops.Op_T'] | Newline -> [Pdfops.Op_T']
(* TODO: Use Uuseg for proper unicode segmentation. *)
and format_paragraph j w s =
[Text s]
and ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page drawops = and ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page drawops =
flatten (map (ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num page) drawops) flatten (map (ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num page) drawops)