Fixes for format_paragraph width calculation
This commit is contained in:
parent
c026f03626
commit
b289d0e5e0
91
cpdfdraw.ml
91
cpdfdraw.ml
|
@ -148,13 +148,38 @@ let process_specials pdf endpage filename bates batespad num page s =
|
||||||
in
|
in
|
||||||
Cpdfaddtext.process_text (res ()).time s pairs
|
Cpdfaddtext.process_text (res ()).time s pairs
|
||||||
|
|
||||||
|
(* FIXME cache (just for paragraph) *)
|
||||||
|
let font_widths f fontsize =
|
||||||
|
match f with
|
||||||
|
| Pdftext.StandardFont (sf, encoding) ->
|
||||||
|
Array.init
|
||||||
|
256
|
||||||
|
(fun x ->
|
||||||
|
fontsize
|
||||||
|
*. float_of_int
|
||||||
|
(Pdfstandard14.textwidth false encoding sf (string_of_char (char_of_int x)))
|
||||||
|
/. 1000.)
|
||||||
|
| Pdftext.SimpleFont {fontmetrics = Some m} ->
|
||||||
|
Array.map (fun x -> fontsize *. x /. 1000. ) m
|
||||||
|
| _ -> raise (Pdf.PDFError "Cpdfdraw: Unsupported font")
|
||||||
|
|
||||||
let runs_of_utf8 s =
|
let runs_of_utf8 s =
|
||||||
let identifier, fontpack = (res ()).current_fontpack in
|
let identifier, fontpack = (res ()).current_fontpack in
|
||||||
let codepoints = Pdftext.codepoints_of_utf8 s in
|
let codepoints = Pdftext.codepoints_of_utf8 s in
|
||||||
let triples = option_map (Cpdfembed.get_char fontpack) codepoints in
|
let triples = option_map (Cpdfembed.get_char fontpack) codepoints in
|
||||||
let collated = Cpdfembed.collate_runs triples in
|
let collated = Cpdfembed.collate_runs triples in
|
||||||
|
(* FIXME Efficiency: runs, cacheing *)
|
||||||
|
let w =
|
||||||
|
fold_left ( +. ) 0.
|
||||||
|
(map
|
||||||
|
(fun (charcode, _, font) ->
|
||||||
|
let widths = font_widths font (res ()).font_size in
|
||||||
|
widths.(charcode))
|
||||||
|
triples)
|
||||||
|
in
|
||||||
|
let output =
|
||||||
flatten
|
flatten
|
||||||
(map
|
(map
|
||||||
(fun l ->
|
(fun l ->
|
||||||
if l = [] then [] else
|
if l = [] then [] else
|
||||||
let f, n = match l with (_, n, f)::_ -> f, n | _ -> assert false in
|
let f, n = match l with (_, n, f)::_ -> f, n | _ -> assert false in
|
||||||
|
@ -162,7 +187,9 @@ let runs_of_utf8 s =
|
||||||
let charcodes = map (fun (c, _, _) -> char_of_int c) l in
|
let charcodes = map (fun (c, _, _) -> char_of_int c) l in
|
||||||
[Pdfops.Op_Tf (fontname, (res ()).font_size);
|
[Pdfops.Op_Tf (fontname, (res ()).font_size);
|
||||||
Pdfops.Op_Tj (implode charcodes)])
|
Pdfops.Op_Tj (implode charcodes)])
|
||||||
collated)
|
collated)
|
||||||
|
in
|
||||||
|
(output, w)
|
||||||
|
|
||||||
let extgstate kind v =
|
let extgstate kind v =
|
||||||
try Hashtbl.find (res ()).extgstates (kind, v) with
|
try Hashtbl.find (res ()).extgstates (kind, v) with
|
||||||
|
@ -176,38 +203,6 @@ let read_resource pdf n res =
|
||||||
| Some (Pdf.Dictionary d) -> d
|
| Some (Pdf.Dictionary d) -> d
|
||||||
| _ -> []
|
| _ -> []
|
||||||
|
|
||||||
(* TODO Stolen from Cpdftype. Can these be reunited some day? *)
|
|
||||||
let width_of_string ws s =
|
|
||||||
let w = ref 0. in
|
|
||||||
iter (fun s -> w := !w +. ws.(int_of_char s)) s;
|
|
||||||
!w
|
|
||||||
|
|
||||||
let split_text space_left widths t =
|
|
||||||
let chars = ref t in
|
|
||||||
let words = ref [] in
|
|
||||||
let space_left = ref space_left in
|
|
||||||
let return needs_newline =
|
|
||||||
(flatten (rev !words), needs_newline, !chars)
|
|
||||||
in
|
|
||||||
try
|
|
||||||
while !chars <> [] do
|
|
||||||
let word, rest = cleavewhile (neq ' ') !chars in
|
|
||||||
let w = width_of_string widths word in
|
|
||||||
if !words = [] || w < !space_left
|
|
||||||
then
|
|
||||||
let is_last_word = rest = [] in
|
|
||||||
let new_word = if is_last_word then word else word @ [' '] in
|
|
||||||
begin
|
|
||||||
words := new_word::!words;
|
|
||||||
space_left := !space_left -. w -. (if is_last_word then 0. else width_of_string widths [' '])
|
|
||||||
end
|
|
||||||
else raise Exit;
|
|
||||||
chars := if rest = [] then [] else tl rest;
|
|
||||||
done;
|
|
||||||
return false
|
|
||||||
with
|
|
||||||
Exit -> return true
|
|
||||||
|
|
||||||
let update_resources pdf old_resources =
|
let update_resources pdf old_resources =
|
||||||
let gss_resources = map (fun ((kind, v), n) -> (n, Pdf.Dictionary [(kind, Pdf.Real v)])) (list_of_hashtbl (res ()).extgstates) in
|
let gss_resources = map (fun ((kind, v), n) -> (n, Pdf.Dictionary [(kind, Pdf.Real v)])) (list_of_hashtbl (res ()).extgstates) in
|
||||||
let select_resources t =
|
let select_resources t =
|
||||||
|
@ -243,6 +238,24 @@ type structdata =
|
||||||
|
|
||||||
let structdata = ref []
|
let structdata = ref []
|
||||||
|
|
||||||
|
(* TODO: Use Uuseg for proper unicode segmentation. *)
|
||||||
|
let format_paragraph j w s =
|
||||||
|
(* 1. Split on word boundaries *)
|
||||||
|
let ss = String.split_on_char ' ' s in
|
||||||
|
(* 2. Calculate the runs for each word *)
|
||||||
|
let rs_and_widths = ref (map runs_of_utf8 ss) in
|
||||||
|
(* 3. Calculate runs for a space *)
|
||||||
|
let space_runs, space_width = runs_of_utf8 " " in
|
||||||
|
(* 4. Now we may find the sections imperatively. *)
|
||||||
|
let remaining = ref w in
|
||||||
|
let lines = ref [] in
|
||||||
|
while !rs_and_widths <> [] do
|
||||||
|
(* 5. Calculate lines *)
|
||||||
|
()
|
||||||
|
done;
|
||||||
|
(* 6. Now apply justification, and convert lines to final output. *)
|
||||||
|
[]
|
||||||
|
|
||||||
let rec ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num page = function
|
let rec ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num page = function
|
||||||
| Qq ops ->
|
| Qq ops ->
|
||||||
[Pdfops.Op_q] @ ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page ops @ [Pdfops.Op_Q]
|
[Pdfops.Op_q] @ ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page ops @ [Pdfops.Op_Q]
|
||||||
|
@ -358,13 +371,14 @@ let rec ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num
|
||||||
@ (if struct_tree then [Pdfops.Op_EMC] else [])
|
@ (if struct_tree then [Pdfops.Op_EMC] else [])
|
||||||
| Text s ->
|
| Text s ->
|
||||||
if dryrun then iter (fun c -> Hashtbl.replace (res ()).current_fontpack_codepoints c ()) (Pdftext.codepoints_of_utf8 s);
|
if dryrun then iter (fun c -> Hashtbl.replace (res ()).current_fontpack_codepoints c ()) (Pdftext.codepoints_of_utf8 s);
|
||||||
runs_of_utf8 s
|
fst (runs_of_utf8 s)
|
||||||
| SpecialText s ->
|
| SpecialText s ->
|
||||||
let s = process_specials pdf endpage filename bates batespad num page s in
|
let s = process_specials pdf endpage filename bates batespad num page s in
|
||||||
if dryrun then iter (fun c -> Hashtbl.replace (res ()).current_fontpack_codepoints c ()) (Pdftext.codepoints_of_utf8 s);
|
if dryrun then iter (fun c -> Hashtbl.replace (res ()).current_fontpack_codepoints c ()) (Pdftext.codepoints_of_utf8 s);
|
||||||
runs_of_utf8 s
|
fst (runs_of_utf8 s)
|
||||||
| Para (j, w, s) ->
|
| Para (j, w, s) ->
|
||||||
ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page (format_paragraph j w s)
|
if dryrun then iter (fun c -> Hashtbl.replace (res ()).current_fontpack_codepoints c ()) (Pdftext.codepoints_of_utf8 s);
|
||||||
|
format_paragraph j w s
|
||||||
| Leading f -> [Pdfops.Op_TL f]
|
| Leading f -> [Pdfops.Op_TL f]
|
||||||
| CharSpace f -> [Pdfops.Op_Tc f]
|
| CharSpace f -> [Pdfops.Op_Tc f]
|
||||||
| WordSpace f -> [Pdfops.Op_Tw f]
|
| WordSpace f -> [Pdfops.Op_Tw f]
|
||||||
|
@ -373,9 +387,6 @@ let rec ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num
|
||||||
| Rise f -> [Pdfops.Op_Ts f]
|
| Rise f -> [Pdfops.Op_Ts f]
|
||||||
| Newline -> [Pdfops.Op_T']
|
| Newline -> [Pdfops.Op_T']
|
||||||
|
|
||||||
(* TODO: Use Uuseg for proper unicode segmentation. *)
|
|
||||||
and format_paragraph j w s =
|
|
||||||
[Text s]
|
|
||||||
|
|
||||||
and ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page drawops =
|
and ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page drawops =
|
||||||
flatten (map (ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num page) drawops)
|
flatten (map (ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num page) drawops)
|
||||||
|
|
Loading…
Reference in New Issue