2022-09-21 18:40:28 +02:00
|
|
|
(* Superimpose text, page numbers etc. *)
|
2021-12-21 15:00:58 +01:00
|
|
|
open Pdfutil
|
|
|
|
open Cpdferror
|
|
|
|
|
|
|
|
type color =
|
|
|
|
Grey of float
|
|
|
|
| RGB of float * float * float
|
|
|
|
| CYMK of float * float * float * float
|
|
|
|
|
2022-09-21 18:40:28 +02:00
|
|
|
(* Process UTF8 text to charcodes, given a font. *)
|
2021-12-21 15:00:58 +01:00
|
|
|
let charcodes_of_utf8 font s =
|
|
|
|
let extractor = Pdftext.charcode_extractor_of_font_real ~debug:false font in
|
|
|
|
let codepoints = Pdftext.codepoints_of_utf8 s in
|
|
|
|
let charcodes =
|
|
|
|
option_map
|
|
|
|
(fun codepoint ->
|
|
|
|
match extractor codepoint with
|
|
|
|
| Some cc -> Some cc
|
2022-09-27 17:28:34 +02:00
|
|
|
| None ->
|
2023-04-25 14:45:56 +02:00
|
|
|
Pdfe.log (Printf.sprintf "Warning: character not found in font for unicode codepoint 0x%X\n" codepoint);
|
2022-09-27 17:28:34 +02:00
|
|
|
None)
|
2021-12-21 15:00:58 +01:00
|
|
|
codepoints
|
|
|
|
in
|
|
|
|
implode (map char_of_int charcodes)
|
|
|
|
|
2022-09-24 12:58:09 +02:00
|
|
|
let unicode_codepoint_of_pdfcode encoding_table glyphlist_table p =
|
|
|
|
try
|
|
|
|
hd (Hashtbl.find glyphlist_table (Hashtbl.find encoding_table p))
|
|
|
|
with
|
|
|
|
Not_found -> 0
|
|
|
|
|
2021-12-21 15:00:58 +01:00
|
|
|
(* Get the width of some text in the given font *)
|
|
|
|
let width_of_text font text =
|
|
|
|
match font with
|
|
|
|
| Pdftext.SimpleFont {Pdftext.fontmetrics = Some fontmetrics} ->
|
|
|
|
begin try
|
2023-03-12 17:29:32 +01:00
|
|
|
fsum (map (fun c -> fontmetrics.(int_of_char c)) (explode text))
|
2021-12-21 15:00:58 +01:00
|
|
|
with
|
|
|
|
_ -> 0.
|
|
|
|
end
|
|
|
|
| _ -> 0.
|
|
|
|
|
|
|
|
let colour_op = function
|
|
|
|
| RGB (r, g, b) -> Pdfops.Op_rg (r, g, b)
|
|
|
|
| Grey g -> Pdfops.Op_g g
|
|
|
|
| CYMK (c, y, m, k) -> Pdfops.Op_k (c, y, m, k)
|
|
|
|
|
|
|
|
let colour_op_stroke = function
|
|
|
|
| RGB (r, g, b) -> Pdfops.Op_RG (r, g, b)
|
|
|
|
| Grey g -> Pdfops.Op_G g
|
|
|
|
| CYMK (c, y, m, k) -> Pdfops.Op_K (c, y, m, k)
|
|
|
|
|
2023-07-18 17:27:29 +02:00
|
|
|
let ops fontpack fontpackpdfobjs fontname longest_w x y rotate hoffset voffset outline linewidth unique_fontname unique_fontnames unique_extgstatename colour fontsize text =
|
2023-07-18 16:51:43 +02:00
|
|
|
let textops =
|
|
|
|
match fontpack with
|
|
|
|
| Some fontpack ->
|
|
|
|
let codepoints = Pdftext.codepoints_of_utf8 text in
|
|
|
|
let triples = option_map (Cpdfembed.get_char fontpack) codepoints in
|
|
|
|
let collated = Cpdfembed.collate_runs triples in
|
|
|
|
flatten
|
|
|
|
(map
|
|
|
|
(fun l ->
|
|
|
|
let (_, fontnum, _) = hd l in
|
2023-07-18 17:27:29 +02:00
|
|
|
[Pdfops.Op_Tf (List.nth unique_fontnames fontnum, fontsize);
|
|
|
|
Pdfops.Op_Tj (implode (map (fun (charcode, _, _) -> char_of_int charcode) l))])
|
2023-07-18 16:51:43 +02:00
|
|
|
collated)
|
|
|
|
| None ->
|
|
|
|
[Pdfops.Op_Tf (unique_fontname, fontsize); Pdfops.Op_Tj text]
|
|
|
|
in
|
|
|
|
[Pdfops.Op_q;
|
|
|
|
Pdfops.Op_BMC "/CPDFSTAMP";
|
|
|
|
Pdfops.Op_cm
|
|
|
|
(Pdftransform.matrix_of_transform
|
|
|
|
[Pdftransform.Translate (x -. hoffset, y -. voffset);
|
|
|
|
Pdftransform.Rotate ((0., 0.), rotate)]);
|
|
|
|
Pdfops.Op_BT]
|
|
|
|
@ (if outline then [Pdfops.Op_w linewidth; Pdfops.Op_Tr 1] else [Pdfops.Op_Tr 0])
|
|
|
|
@ [colour_op colour; colour_op_stroke colour]
|
|
|
|
@ (match unique_extgstatename with None -> [] | Some n -> [Pdfops.Op_gs n])
|
|
|
|
@ textops
|
|
|
|
@ [Pdfops.Op_ET; Pdfops.Op_EMC; Pdfops.Op_Q]
|
2021-12-21 15:00:58 +01:00
|
|
|
|
|
|
|
type justification = LeftJustify | CentreJustify | RightJustify
|
|
|
|
|
|
|
|
(* Find the h-offset for justification based on the longest width, the current
|
|
|
|
width, the justification and the position. *)
|
|
|
|
let find_justification_offsets longest_w w position j =
|
|
|
|
let open Cpdfposition in
|
|
|
|
match j with
|
|
|
|
| LeftJustify ->
|
|
|
|
begin match position with
|
|
|
|
| TopLeft _ | Left _ | PosLeft _ | BottomLeft _ -> 0.
|
|
|
|
| Top _ | PosCentre _ | Bottom _ | Centre -> (longest_w -. w) /. 2.
|
|
|
|
| TopRight _ | BottomRight _ | PosRight _ | Right _ -> longest_w -. w
|
|
|
|
| Diagonal -> 0.
|
|
|
|
| ReverseDiagonal -> 0.
|
|
|
|
end
|
|
|
|
| RightJustify ->
|
|
|
|
begin match position with
|
|
|
|
| TopLeft _ | Left _ | PosLeft _ | BottomLeft _ -> ~-.(longest_w -. w)
|
|
|
|
| Top _ | PosCentre _ | Bottom _ | Centre -> ~-.((longest_w -. w) /. 2.)
|
|
|
|
| TopRight _ | BottomRight _ | PosRight _ | Right _ -> 0.
|
|
|
|
| Diagonal -> 0.
|
|
|
|
| ReverseDiagonal -> 0.
|
|
|
|
end
|
|
|
|
| CentreJustify ->
|
|
|
|
begin match position with
|
|
|
|
| TopLeft _ | Left _ | PosLeft _ | BottomLeft _ -> ~-.((longest_w -. w) /. 2.)
|
|
|
|
| Top _ | PosCentre _ | Bottom _ | Centre -> 0.
|
|
|
|
| TopRight _ | BottomRight _ | PosRight _ | Right _ -> (longest_w -. w) /. 2.
|
|
|
|
| Diagonal -> 0.
|
|
|
|
| ReverseDiagonal -> 0.
|
|
|
|
end
|
|
|
|
|
|
|
|
(* Lex an integer from the table *)
|
|
|
|
let extract_num header s =
|
|
|
|
match Pdfgenlex.lex_string (Hashtbl.find header s) with
|
|
|
|
[Pdfgenlex.LexInt i] -> Pdf.Integer i
|
|
|
|
| [Pdfgenlex.LexReal f] -> Pdf.Real f
|
|
|
|
| _ -> raise (Failure ("extract_num: " ^ s))
|
|
|
|
|
|
|
|
let extract_fontbbox header s =
|
|
|
|
let num = function
|
|
|
|
Pdfgenlex.LexInt i -> Pdf.Integer i
|
|
|
|
| Pdfgenlex.LexReal f -> Pdf.Real f
|
|
|
|
| _ -> raise (Failure "extract_fontbbox")
|
|
|
|
in
|
|
|
|
match Pdfgenlex.lex_string (Hashtbl.find header s) with
|
|
|
|
[a; b; c; d] -> [num a; num b; num c; num d]
|
|
|
|
| _ -> raise (Failure "extract_fontbbox")
|
|
|
|
|
|
|
|
let remove_slash s =
|
|
|
|
match explode s with
|
|
|
|
'/'::x -> implode x
|
|
|
|
| _ -> raise (Failure "remove_slash")
|
|
|
|
|
|
|
|
let extract_widths chars_and_widths =
|
|
|
|
let win_to_name = map (fun (x, y) -> (y, x)) Pdfglyphlist.name_to_win in
|
|
|
|
map
|
|
|
|
(fun x ->
|
|
|
|
try
|
|
|
|
let name = List.assoc x win_to_name in
|
|
|
|
let width = List.assoc (remove_slash name) chars_and_widths in
|
|
|
|
width
|
|
|
|
with
|
|
|
|
_ -> 0)
|
|
|
|
(ilist 0 255)
|
|
|
|
|
2022-08-07 15:01:05 +02:00
|
|
|
(* For finding the height for URL links, we try to find the Cap Height for the
|
2022-09-27 17:28:34 +02:00
|
|
|
font. We fall back to using the font size alone if we cannot get the cap
|
|
|
|
height. *)
|
2022-09-24 13:34:50 +02:00
|
|
|
let cap_height font fontname =
|
|
|
|
match font with
|
|
|
|
| Some (Pdftext.SimpleFont {fontdescriptor = Some {capheight}}) ->
|
|
|
|
Some capheight
|
|
|
|
| _ ->
|
|
|
|
try
|
|
|
|
let font = unopt (Pdftext.standard_font_of_name ("/" ^ fontname)) in
|
|
|
|
let header, _, _, _ = Pdfstandard14.afm_data font in
|
|
|
|
let capheight = try extract_num header "CapHeight" with _ -> Pdf.Integer 0 in
|
|
|
|
Some (match capheight with Pdf.Integer i -> float_of_int i | Pdf.Real r -> r | _ -> 0.)
|
|
|
|
with
|
|
|
|
_ -> None
|
2022-09-27 20:58:27 +02:00
|
|
|
|
2021-12-21 15:00:58 +01:00
|
|
|
let rec process_text time text m =
|
|
|
|
match m with
|
|
|
|
| [] -> Cpdfstrftime.strftime ~time text
|
|
|
|
| (s, r)::t -> process_text time (string_replace_all_lazy s r text) t
|
|
|
|
|
2022-08-03 17:38:05 +02:00
|
|
|
(* Find any %URL, sub in the text and return the new text together with a list
|
|
|
|
of ordered (line num, URL, startpos, endpos) data.
|
|
|
|
This will be used after any other %Specials have been processed, so that the
|
|
|
|
positions do not change. *)
|
|
|
|
|
|
|
|
(* text|url]abc -> text, url, abc *)
|
|
|
|
let extract_url line =
|
|
|
|
let text, rest = cleavewhile (neq '|') line in
|
|
|
|
if rest = [] then error "bad URL syntax in text" else
|
|
|
|
let url, rest = cleavewhile (neq ']') (tl rest) in
|
|
|
|
if rest = [] then error "bad URL syntax in text" else
|
|
|
|
(text, url, tl rest)
|
|
|
|
|
2022-08-04 18:16:04 +02:00
|
|
|
(* multiple %URL[a|b] *)
|
2022-08-03 17:38:05 +02:00
|
|
|
let get_urls_line line =
|
|
|
|
let line = explode line in
|
|
|
|
let urls = ref [] in
|
|
|
|
let pos = ref 0 in
|
|
|
|
let outline = ref [] in
|
|
|
|
let rec loop = function
|
|
|
|
| '%'::'U'::'R'::'L'::'['::t ->
|
|
|
|
let text, url, rest = extract_url t in
|
|
|
|
outline := rev text @ !outline;
|
|
|
|
urls := (implode url, !pos, !pos + length text)::!urls;
|
|
|
|
pos += length text;
|
|
|
|
loop rest
|
|
|
|
| h::t ->
|
|
|
|
outline := h::!outline;
|
|
|
|
pos += 1;
|
|
|
|
loop t
|
|
|
|
| [] -> ()
|
|
|
|
in
|
|
|
|
loop line;
|
|
|
|
(implode (rev !outline), rev !urls)
|
|
|
|
|
2021-12-21 15:00:58 +01:00
|
|
|
(* Return page label at pdf page num, or page number in arabic if no label *)
|
|
|
|
let pagelabel pdf num =
|
|
|
|
Pdfpagelabels.pagelabeltext_of_pagenumber
|
|
|
|
num
|
|
|
|
(Pdfpagelabels.complete (Pdfpagelabels.read pdf))
|
|
|
|
|
2023-05-01 17:19:09 +02:00
|
|
|
let replace_pairs pdf endpage extract_text_font_size filename bates batespad num page =
|
|
|
|
[
|
|
|
|
"%PageDiv2", (fun () -> string_of_int ((num + 1) / 2));
|
|
|
|
"%Page", (fun () -> string_of_int num);
|
|
|
|
"%Roman", (fun () -> roman_upper num);
|
|
|
|
"%roman", (fun () -> roman_lower num);
|
|
|
|
"%filename", (fun () -> filename);
|
|
|
|
"%Label", (fun () -> pagelabel pdf num);
|
|
|
|
"%EndPage", (fun () -> string_of_int endpage);
|
|
|
|
"%EndLabel", (fun () -> pagelabel pdf endpage);
|
|
|
|
"%ExtractedText", (fun () -> Cpdfextracttext.extract_page_text extract_text_font_size pdf num page);
|
|
|
|
"%Bates",
|
|
|
|
(fun () ->
|
|
|
|
(let numstring = string_of_int (bates + num - 1) in
|
|
|
|
match batespad with
|
|
|
|
None -> numstring
|
|
|
|
| Some w ->
|
|
|
|
if String.length numstring >= w
|
|
|
|
then numstring
|
|
|
|
else implode (many '0' (w - String.length numstring)) ^ numstring))]
|
|
|
|
|
2023-07-10 15:13:45 +02:00
|
|
|
let expand_lines text time pdf endpage extract_text_font_size filename bates batespad num page lines =
|
|
|
|
let expanded_lines =
|
|
|
|
map
|
|
|
|
(function text ->
|
|
|
|
process_text time text (replace_pairs pdf endpage extract_text_font_size filename bates batespad num page))
|
|
|
|
lines
|
|
|
|
in
|
|
|
|
(* process URLs for justification too *)
|
|
|
|
map (fun line -> fst (get_urls_line line)) expanded_lines
|
|
|
|
|
2021-12-21 15:00:58 +01:00
|
|
|
let addtext
|
2022-09-27 17:28:34 +02:00
|
|
|
time lines linewidth outline fast colour fontname encoding bates batespad
|
2023-07-18 15:05:17 +02:00
|
|
|
fontsize fontpack font fontpdfobj fontpackpdfobjs underneath position hoffset voffset text pages
|
2023-07-12 16:57:33 +02:00
|
|
|
cropbox opacity justification filename extract_text_font_size shift raw pdf
|
2021-12-21 15:00:58 +01:00
|
|
|
=
|
|
|
|
let endpage = Pdfpage.endpage pdf in
|
|
|
|
let shifts = Cpdfcoord.parse_coordinates pdf shift in
|
|
|
|
let addtext_page num page =
|
|
|
|
let shift_x, shift_y = List.nth shifts (num - 1) in
|
|
|
|
let resources', unique_extgstatename =
|
|
|
|
if opacity < 1.0 then
|
|
|
|
let dict =
|
|
|
|
match Pdf.lookup_direct pdf "/ExtGState" page.Pdfpage.resources with
|
|
|
|
| Some d -> d
|
|
|
|
| None -> Pdf.Dictionary []
|
|
|
|
in
|
|
|
|
let unique_extgstatename = Pdf.unique_key "gs" dict in
|
|
|
|
let dict' =
|
|
|
|
Pdf.add_dict_entry dict unique_extgstatename
|
|
|
|
(Pdf.Dictionary [("/ca", Pdf.Real opacity); ("/CA", Pdf.Real opacity)])
|
|
|
|
in
|
|
|
|
Pdf.add_dict_entry page.Pdfpage.resources "/ExtGState" dict', Some unique_extgstatename
|
|
|
|
else
|
|
|
|
page.Pdfpage.resources, None
|
|
|
|
in
|
|
|
|
let fontdict =
|
|
|
|
match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
|
|
|
|
| None -> Pdf.Dictionary []
|
|
|
|
| Some d -> d
|
|
|
|
in
|
2023-07-10 15:13:45 +02:00
|
|
|
let calc_textwidth text =
|
2023-07-18 15:51:13 +02:00
|
|
|
match fontpack with
|
|
|
|
| Some fontpack ->
|
2023-07-20 13:56:20 +02:00
|
|
|
let widthss =
|
|
|
|
map2 (fun n font -> Cpdftype.font_widths (fontname ^ string_of_int n) font fontsize) (indx (fst fontpack)) (fst fontpack)
|
|
|
|
in
|
2023-07-18 15:51:13 +02:00
|
|
|
let triples = option_map (Cpdfembed.get_char fontpack) (Pdftext.codepoints_of_utf8 text) in
|
|
|
|
let widths = map (fun (charcode, fontnum, _) -> (List.nth widthss fontnum).(charcode)) triples in
|
|
|
|
fsum widths
|
|
|
|
| None ->
|
2023-07-10 15:13:45 +02:00
|
|
|
match font with
|
|
|
|
| Some (Pdftext.StandardFont (f, _)) ->
|
|
|
|
let rawwidth =
|
|
|
|
Pdfstandard14.textwidth false encoding f text
|
2021-12-21 15:00:58 +01:00
|
|
|
in
|
2023-07-10 15:13:45 +02:00
|
|
|
(float rawwidth *. fontsize) /. 1000.
|
|
|
|
| Some font ->
|
|
|
|
let rawwidth = width_of_text font text in
|
|
|
|
(rawwidth *. fontsize) /. 1000.
|
|
|
|
| None ->
|
|
|
|
let font =
|
|
|
|
match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
|
|
|
|
| Some fontdict ->
|
|
|
|
begin match Pdf.lookup_direct pdf fontname fontdict with
|
|
|
|
| Some font -> font
|
|
|
|
| None ->
|
|
|
|
(* For each item in the fontdict, follow its value and find the basename. If it matches, return that font *)
|
|
|
|
let font = ref None in
|
|
|
|
iter
|
|
|
|
(fun (k, v) ->
|
|
|
|
match Pdf.lookup_direct pdf "/BaseFont" v with
|
|
|
|
| Some (Pdf.Name n) when n = fontname -> font := Some v
|
|
|
|
| _ -> ())
|
|
|
|
(match fontdict with Pdf.Dictionary d -> d | _ -> []);
|
|
|
|
match !font with Some f -> f | None -> failwith (Printf.sprintf "addtext: font %s not found" fontname)
|
|
|
|
end
|
|
|
|
| _ -> failwith "addtext: font not found for width"
|
|
|
|
in
|
|
|
|
let rawwidth = width_of_text (Pdftext.read_font pdf font) text in
|
|
|
|
(rawwidth *. fontsize) /. 1000.
|
|
|
|
in
|
2022-08-06 16:42:28 +02:00
|
|
|
let unique_fontname = Pdf.unique_key "F" fontdict in
|
2023-07-18 17:27:29 +02:00
|
|
|
let fd = ref fontdict in
|
|
|
|
let unique_fontnames =
|
|
|
|
match fontpack with None -> [] | Some fontpack ->
|
|
|
|
map
|
|
|
|
(fun _ ->
|
|
|
|
let key = Pdf.unique_key "F" !fd in
|
|
|
|
fd := Pdf.add_dict_entry !fd key Pdf.Null;
|
|
|
|
key)
|
|
|
|
(indx0 (fst fontpack))
|
|
|
|
in
|
2022-08-07 18:20:51 +02:00
|
|
|
let ops, urls, x, y, hoffset, voffset, text, joffset =
|
2023-05-01 17:19:09 +02:00
|
|
|
let text = process_text time text (replace_pairs pdf endpage extract_text_font_size filename bates batespad num page) in
|
2022-08-06 16:42:28 +02:00
|
|
|
let text, urls = get_urls_line text in
|
2023-07-18 16:03:15 +02:00
|
|
|
let lines = map (fun text -> if raw || fontpack <> None then text else charcodes_of_utf8 (Pdftext.read_font pdf fontpdfobj) text) lines in
|
2023-07-10 15:13:45 +02:00
|
|
|
let expanded_lines = expand_lines text time pdf endpage extract_text_font_size filename bates batespad num page lines in
|
|
|
|
let textwidth = calc_textwidth text
|
|
|
|
and allwidths = map calc_textwidth expanded_lines in
|
|
|
|
let longest_w = last (sort compare allwidths) in
|
|
|
|
let joffset = find_justification_offsets longest_w textwidth position justification in
|
|
|
|
let mediabox =
|
|
|
|
if cropbox then
|
|
|
|
match Pdf.lookup_direct pdf "/CropBox" page.Pdfpage.rest with
|
|
|
|
| Some pdfobject -> Pdf.parse_rectangle pdf (Pdf.direct pdf pdfobject)
|
|
|
|
| None -> Pdf.parse_rectangle pdf page.Pdfpage.mediabox
|
|
|
|
else
|
|
|
|
Pdf.parse_rectangle pdf page.Pdfpage.mediabox
|
|
|
|
in
|
|
|
|
let x, y, rotate = Cpdfposition.calculate_position false textwidth mediabox position in
|
|
|
|
let hoffset, voffset =
|
|
|
|
if position = Diagonal || position = ReverseDiagonal
|
|
|
|
then -. (cos ((pi /. 2.) -. rotate) *. voffset), sin ((pi /. 2.) -. rotate) *. voffset
|
|
|
|
else hoffset, voffset
|
2021-12-21 15:00:58 +01:00
|
|
|
in
|
2023-07-10 15:13:45 +02:00
|
|
|
match font with
|
|
|
|
| Some f ->
|
2023-07-18 16:51:43 +02:00
|
|
|
ops fontpack fontpackpdfobjs fontname longest_w (x +. shift_x) (y +. shift_y) rotate (hoffset +. joffset) voffset outline linewidth
|
2023-07-18 17:27:29 +02:00
|
|
|
unique_fontname unique_fontnames unique_extgstatename colour fontsize text,
|
2023-07-10 15:13:45 +02:00
|
|
|
urls, x, y, hoffset, voffset, text, joffset
|
|
|
|
| None ->
|
2023-07-18 16:51:43 +02:00
|
|
|
ops fontpack fontpackpdfobjs fontname longest_w (x +. shift_x) (y +. shift_y) rotate (hoffset +. joffset) voffset outline linewidth
|
2023-07-18 17:27:29 +02:00
|
|
|
fontname unique_fontnames None colour fontsize text,
|
2023-07-10 15:13:45 +02:00
|
|
|
urls, x, y, hoffset, voffset, text, joffset
|
2021-12-21 15:00:58 +01:00
|
|
|
in
|
|
|
|
let newresources =
|
2023-07-18 16:03:15 +02:00
|
|
|
match fontpack with
|
|
|
|
| Some fontpack ->
|
2023-07-20 15:22:13 +02:00
|
|
|
Printf.printf "New resources: a fontpack!\n";
|
2021-12-21 15:00:58 +01:00
|
|
|
let newfontdict =
|
2023-07-18 16:03:15 +02:00
|
|
|
let fd = ref fontdict in
|
2023-07-18 17:27:29 +02:00
|
|
|
iter2
|
|
|
|
(fun i n ->
|
|
|
|
fd := Pdf.add_dict_entry !fd (List.nth unique_fontnames n) (Pdf.Indirect i))
|
|
|
|
fontpackpdfobjs
|
|
|
|
(indx0 fontpackpdfobjs);
|
2023-07-18 16:03:15 +02:00
|
|
|
!fd
|
2021-12-21 15:00:58 +01:00
|
|
|
in
|
|
|
|
Pdf.add_dict_entry resources' "/Font" newfontdict
|
2023-07-18 16:03:15 +02:00
|
|
|
| None ->
|
2023-07-20 15:22:13 +02:00
|
|
|
Printf.printf "New resources: no fontpack!\n";
|
2023-07-18 16:03:15 +02:00
|
|
|
match font with
|
|
|
|
| Some (Pdftext.StandardFont _ as font) ->
|
|
|
|
let newfontdict =
|
|
|
|
Pdf.add_dict_entry fontdict unique_fontname (Pdf.Indirect (Pdftext.write_font pdf font))
|
|
|
|
in
|
|
|
|
Pdf.add_dict_entry resources' "/Font" newfontdict
|
|
|
|
| Some f ->
|
|
|
|
let newfontdict =
|
|
|
|
Pdf.add_dict_entry fontdict unique_fontname fontpdfobj
|
|
|
|
in
|
|
|
|
Pdf.add_dict_entry resources' "/Font" newfontdict
|
|
|
|
| None -> page.Pdfpage.resources
|
2021-12-21 15:00:58 +01:00
|
|
|
in
|
2022-08-04 18:16:04 +02:00
|
|
|
(* Build annotations from URL data (get_urls and some sense of metrics) *)
|
|
|
|
let annot (minx, miny, maxx, maxy) url =
|
|
|
|
Pdf.Dictionary
|
|
|
|
[("/Subtype", Pdf.Name "/Link");
|
|
|
|
("/Rect", Pdf.Array [Pdf.Real minx; Pdf.Real miny; Pdf.Real maxx; Pdf.Real maxy]);
|
2022-08-07 18:25:45 +02:00
|
|
|
("/BS", Pdf.Dictionary [("/W", Pdf.Integer 0)]);
|
2022-08-04 18:16:04 +02:00
|
|
|
("/A", Pdf.Dictionary [("/URI", Pdf.String url);
|
|
|
|
("/Type", Pdf.Name "/Action");
|
|
|
|
("/S", Pdf.Name "/URI")])]
|
|
|
|
in
|
|
|
|
let annots =
|
2022-08-07 15:01:05 +02:00
|
|
|
let annot_coord text pos =
|
|
|
|
let before = take (explode text) pos in
|
|
|
|
calc_textwidth (implode before)
|
2022-08-06 16:42:28 +02:00
|
|
|
in
|
|
|
|
map (fun (url, s, e) ->
|
2022-08-07 15:01:05 +02:00
|
|
|
let sx = annot_coord text s in
|
|
|
|
let ex = annot_coord text e in
|
2022-08-07 18:20:51 +02:00
|
|
|
let x, y = x -. hoffset -. joffset, y -. voffset in
|
2022-08-07 15:01:05 +02:00
|
|
|
let height =
|
2022-09-24 13:34:50 +02:00
|
|
|
match cap_height font fontname with
|
2022-08-07 15:01:05 +02:00
|
|
|
| Some c -> (c *. fontsize) /. 1000.
|
|
|
|
| None -> fontsize
|
|
|
|
in
|
2022-08-06 16:42:28 +02:00
|
|
|
Pdf.Indirect (Pdf.addobj pdf (annot (x +. sx, y, x +. ex, y +. height) url))) urls
|
2022-08-04 18:16:04 +02:00
|
|
|
in
|
|
|
|
let newrest =
|
|
|
|
if annots = [] then page.Pdfpage.rest else
|
|
|
|
let existing =
|
|
|
|
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
|
|
|
|
| Some (Pdf.Array a) -> a
|
|
|
|
| _ -> []
|
|
|
|
in
|
|
|
|
Pdf.add_dict_entry page.Pdfpage.rest "/Annots" (Pdf.Array (annots @ existing))
|
|
|
|
in
|
|
|
|
let page =
|
|
|
|
{page with
|
|
|
|
Pdfpage.resources = newresources;
|
|
|
|
Pdfpage.rest = newrest}
|
|
|
|
in
|
2021-12-21 15:00:58 +01:00
|
|
|
if underneath
|
|
|
|
then Pdfpage.prepend_operators pdf ops ~fast:fast page
|
|
|
|
else Pdfpage.postpend_operators pdf ops ~fast:fast page
|
|
|
|
in
|
2023-03-23 20:24:50 +01:00
|
|
|
Cpdfpage.process_pages (Pdfpage.ppstub addtext_page) pdf pages
|
2021-12-21 15:00:58 +01:00
|
|
|
|
|
|
|
(* Prev is a list of lists of characters *)
|
|
|
|
let split_at_newline t =
|
|
|
|
let rec split_at_newline_inner prev = function
|
|
|
|
| [] -> rev (map implode (map rev prev))
|
|
|
|
| '\\'::'\\'::'n'::t -> split_at_newline_inner (('n'::'\\'::'\\'::hd prev)::tl prev) t
|
|
|
|
| '\\'::'n'::t -> split_at_newline_inner ([]::prev) t
|
|
|
|
| h::t -> split_at_newline_inner ((h::hd prev)::tl prev) t
|
|
|
|
in
|
|
|
|
split_at_newline_inner [[]] (explode t)
|
|
|
|
|
|
|
|
let rec unescape_chars prev = function
|
|
|
|
| [] -> rev prev
|
|
|
|
| '\\'::('0'..'7' as a)::('0'..'7' as b)::('0'..'7' as c)::t ->
|
|
|
|
let chr = char_of_int (int_of_string ("0o" ^ implode [a;b;c])) in
|
|
|
|
unescape_chars (chr::prev) t
|
|
|
|
| '\\'::'\\'::t -> unescape_chars ('\\'::prev) t
|
|
|
|
| '\\'::c::t when c <> 'n' -> unescape_chars (c::prev) t
|
|
|
|
| h::t -> unescape_chars (h::prev) t
|
|
|
|
|
|
|
|
let unescape_string s =
|
|
|
|
implode (unescape_chars [] (explode s))
|
|
|
|
|
|
|
|
let
|
2022-10-19 17:32:56 +02:00
|
|
|
addtexts linewidth outline fast fontname (cpdffont : Cpdfembed.cpdffont) bates batespad
|
2022-09-27 17:28:34 +02:00
|
|
|
colour position linespacing fontsize underneath text pages cropbox opacity
|
|
|
|
justification midline topline filename extract_text_font_size shift
|
|
|
|
?(raw=false) pdf
|
2021-12-21 15:00:58 +01:00
|
|
|
=
|
|
|
|
if pages = [] then error "addtexts: empty page range" else
|
2023-07-10 15:13:45 +02:00
|
|
|
let time = Cpdfstrftime.current_time () in
|
|
|
|
let endpage = Pdfpage.endpage pdf in
|
|
|
|
let ps = Pdfpage.pages_of_pagetree pdf in
|
|
|
|
let used = null_hash () in
|
|
|
|
let lines = map unescape_string (split_at_newline text) in
|
|
|
|
iter2
|
|
|
|
(fun num page ->
|
|
|
|
let expanded_lines = expand_lines text time pdf endpage extract_text_font_size filename bates batespad num page lines in
|
|
|
|
let codepoints = map Pdftext.codepoints_of_utf8 expanded_lines in
|
|
|
|
iter (iter (fun x -> Hashtbl.replace used x ())) codepoints)
|
|
|
|
pages
|
|
|
|
(map (fun x -> List.nth ps (x - 1)) pages);
|
2021-12-21 15:00:58 +01:00
|
|
|
let realfontname = ref fontname in
|
2023-07-12 17:36:43 +02:00
|
|
|
let font, fontpack =
|
2022-10-19 17:32:56 +02:00
|
|
|
match cpdffont with
|
2023-06-13 15:07:34 +02:00
|
|
|
| Cpdfembed.PreMadeFontPack f ->
|
2023-07-20 15:22:13 +02:00
|
|
|
Some (hd (fst f)), None
|
2023-06-13 15:07:34 +02:00
|
|
|
| Cpdfembed.EmbedInfo {fontfile; fontname; encoding} ->
|
2023-07-10 15:13:45 +02:00
|
|
|
let embedded = Cpdfembed.embed_truetype pdf ~fontfile ~fontname ~codepoints:(map fst (list_of_hashtbl used)) ~encoding in
|
2023-07-12 17:36:43 +02:00
|
|
|
Some (hd (fst embedded)), Some embedded
|
|
|
|
| Cpdfembed.ExistingNamedFont -> None, None
|
2022-10-19 17:32:56 +02:00
|
|
|
in
|
2021-12-21 15:00:58 +01:00
|
|
|
let fontpdfobj =
|
|
|
|
match font with
|
2022-09-27 17:28:34 +02:00
|
|
|
| Some (Pdftext.StandardFont _ as font) ->
|
|
|
|
Pdf.Indirect (Pdftext.write_font pdf font)
|
2022-09-16 20:09:41 +02:00
|
|
|
| Some f ->
|
2023-07-10 15:13:45 +02:00
|
|
|
Pdf.Indirect (Pdftext.write_font pdf f)
|
2021-12-21 15:00:58 +01:00
|
|
|
| None ->
|
|
|
|
let firstpage =
|
|
|
|
List.nth (Pdfpage.pages_of_pagetree pdf) (hd pages - 1)
|
|
|
|
in
|
|
|
|
match Pdf.lookup_direct pdf "/Font" firstpage.Pdfpage.resources with
|
|
|
|
| Some fontdict ->
|
|
|
|
begin match Pdf.lookup_direct pdf fontname fontdict with
|
|
|
|
| Some font -> font
|
|
|
|
| _ ->
|
|
|
|
(* For each item in the fontdict, follow its value and find the basename. If it matches, return that font *)
|
|
|
|
let font = ref None in
|
|
|
|
iter
|
|
|
|
(fun (k, v) ->
|
|
|
|
match Pdf.lookup_direct pdf "/BaseFont" v with
|
|
|
|
| Some (Pdf.Name n) when n = fontname ->
|
|
|
|
font := Some v; realfontname := k
|
|
|
|
| _ -> ())
|
|
|
|
(match fontdict with Pdf.Dictionary d -> d | _ -> []);
|
|
|
|
match !font with Some f -> f | None -> failwith (Printf.sprintf "addtext: font %s not found" fontname)
|
|
|
|
end
|
|
|
|
| _ -> failwith "addtext: font dictionary not present"
|
|
|
|
in
|
|
|
|
let lines = map unescape_string (split_at_newline text) in
|
|
|
|
let pdf = ref pdf in
|
|
|
|
let voffset =
|
|
|
|
let open Cpdfposition in
|
|
|
|
match position with
|
|
|
|
| Bottom _ | BottomLeft _ | BottomRight _ ->
|
|
|
|
ref (0. -. (linespacing *. fontsize *. (float (length lines) -. 1.)))
|
|
|
|
| Left _ | Right _ ->
|
|
|
|
(* Vertically align *)
|
|
|
|
ref (0. -. (linespacing *. ((fontsize *. (float (length lines) -. 1.)) /. 2.)))
|
|
|
|
| Diagonal | ReverseDiagonal ->
|
|
|
|
(* Change so that the whole paragraph sits on the centre... *)
|
|
|
|
ref (0. -. ((linespacing *. fontsize *. (float (length lines) -. 1.)) /. 2.))
|
|
|
|
| _ -> ref 0.
|
|
|
|
in
|
|
|
|
if midline then
|
|
|
|
begin match font with
|
2022-09-16 19:07:46 +02:00
|
|
|
| Some (Pdftext.StandardFont (font, _)) ->
|
2021-12-21 15:00:58 +01:00
|
|
|
let baseline_adjustment =
|
|
|
|
(fontsize *. float (Pdfstandard14.baseline_adjustment font)) /. 1000.
|
|
|
|
in
|
|
|
|
voffset := !voffset +. baseline_adjustment
|
2022-09-16 21:10:49 +02:00
|
|
|
| Some (Pdftext.SimpleFont {fontdescriptor = Some {capheight}}) ->
|
|
|
|
voffset := !voffset +. capheight /. 2.
|
|
|
|
| _ ->
|
2023-04-25 14:45:56 +02:00
|
|
|
Pdfe.log "Unable to find midline adjustment in this font\n"
|
2021-12-21 15:00:58 +01:00
|
|
|
end
|
|
|
|
else
|
|
|
|
if topline then
|
|
|
|
begin match font with
|
2022-09-16 19:07:46 +02:00
|
|
|
| Some (Pdftext.StandardFont (font, _)) ->
|
2021-12-21 15:00:58 +01:00
|
|
|
let baseline_adjustment =
|
|
|
|
(fontsize *. float (Pdfstandard14.baseline_adjustment font) *. 2.0) /. 1000.
|
|
|
|
in
|
|
|
|
voffset := !voffset +. baseline_adjustment
|
2022-09-16 21:10:49 +02:00
|
|
|
| Some (Pdftext.SimpleFont {fontdescriptor = Some {capheight}}) ->
|
|
|
|
voffset := !voffset +. capheight
|
|
|
|
| _ ->
|
2023-04-25 14:45:56 +02:00
|
|
|
Pdfe.log "Unable to find midline adjustment in this font\n"
|
2022-09-21 18:40:28 +02:00
|
|
|
end;
|
|
|
|
let encoding =
|
|
|
|
match font with
|
|
|
|
| Some (Pdftext.StandardFont (_, e)) -> e
|
|
|
|
| Some (Pdftext.SimpleFont {encoding}) -> encoding
|
|
|
|
| _ -> Pdftext.WinAnsiEncoding
|
|
|
|
in
|
2023-07-18 15:05:17 +02:00
|
|
|
let fontpackpdfobjs =
|
|
|
|
match cpdffont with
|
2022-10-19 17:32:56 +02:00
|
|
|
| Cpdfembed.EmbedInfo {fontfile; fontname; encoding} ->
|
2023-07-10 15:13:45 +02:00
|
|
|
let codepoints = map fst (list_of_hashtbl used) in
|
2023-07-18 15:05:17 +02:00
|
|
|
let fonts = fst (Cpdfembed.embed_truetype !pdf ~fontfile ~fontname ~codepoints ~encoding) in
|
|
|
|
map (Pdftext.write_font !pdf) fonts
|
|
|
|
| _ -> []
|
|
|
|
in
|
|
|
|
iter
|
|
|
|
(fun line ->
|
|
|
|
let voff, hoff = !voffset, 0. in
|
|
|
|
pdf :=
|
|
|
|
addtext time lines linewidth outline fast colour !realfontname encoding
|
|
|
|
bates batespad fontsize fontpack font fontpdfobj fontpackpdfobjs underneath
|
|
|
|
position hoff voff line pages cropbox opacity justification filename
|
|
|
|
extract_text_font_size shift raw !pdf;
|
|
|
|
voffset := !voffset +. (linespacing *. fontsize))
|
|
|
|
lines;
|
|
|
|
!pdf
|
2021-12-21 15:00:58 +01:00
|
|
|
|
|
|
|
let addrectangle
|
|
|
|
fast (w, h) colour outline linewidth opacity position relative_to_cropbox
|
|
|
|
underneath range pdf
|
|
|
|
=
|
|
|
|
let addrectangle_page _ page =
|
|
|
|
let resources', unique_extgstatename =
|
|
|
|
if opacity < 1.0 then
|
|
|
|
let dict =
|
|
|
|
match Pdf.lookup_direct pdf "/ExtGState" page.Pdfpage.resources with
|
|
|
|
| Some d -> d
|
|
|
|
| None -> Pdf.Dictionary []
|
|
|
|
in
|
|
|
|
let unique_extgstatename = Pdf.unique_key "gs" dict in
|
|
|
|
let dict' =
|
|
|
|
Pdf.add_dict_entry dict unique_extgstatename
|
|
|
|
(Pdf.Dictionary [("/ca", Pdf.Real opacity); ("/CA", Pdf.Real opacity)])
|
|
|
|
in
|
|
|
|
Pdf.add_dict_entry page.Pdfpage.resources "/ExtGState" dict', Some unique_extgstatename
|
|
|
|
else
|
|
|
|
page.Pdfpage.resources, None
|
|
|
|
in
|
|
|
|
let mediabox =
|
|
|
|
if relative_to_cropbox then
|
|
|
|
match Pdf.lookup_direct pdf "/CropBox" page.Pdfpage.rest with
|
2022-07-14 15:06:25 +02:00
|
|
|
| Some pdfobject -> Pdf.parse_rectangle pdf (Pdf.direct pdf pdfobject)
|
|
|
|
| None -> Pdf.parse_rectangle pdf page.Pdfpage.mediabox
|
2021-12-21 15:00:58 +01:00
|
|
|
else
|
2022-07-14 15:06:25 +02:00
|
|
|
Pdf.parse_rectangle pdf page.Pdfpage.mediabox
|
2021-12-21 15:00:58 +01:00
|
|
|
in
|
|
|
|
let x, y, _ =
|
2022-09-21 18:40:28 +02:00
|
|
|
Cpdfposition.calculate_position false w mediabox position
|
2021-12-21 15:00:58 +01:00
|
|
|
in
|
|
|
|
let x, y =
|
|
|
|
match position with
|
|
|
|
Cpdfposition.Top _ | Cpdfposition.TopLeft _ | Cpdfposition.TopRight _ -> (x, y -. h)
|
|
|
|
| Cpdfposition.Centre | Cpdfposition.PosCentre _ -> (x, y -. (h /. 2.))
|
|
|
|
| _ -> (x, y)
|
|
|
|
in
|
|
|
|
let ops =
|
|
|
|
[
|
|
|
|
Pdfops.Op_q;
|
|
|
|
Pdfops.Op_BMC "/CPDFSTAMP";
|
|
|
|
colour_op colour;
|
|
|
|
colour_op_stroke colour;
|
|
|
|
]
|
|
|
|
@
|
|
|
|
(if outline then [Pdfops.Op_w linewidth] else [])
|
|
|
|
@
|
|
|
|
(match unique_extgstatename with None -> [] | Some n -> [Pdfops.Op_gs n])
|
|
|
|
@
|
|
|
|
[
|
|
|
|
Pdfops.Op_re (x, y, w, h);
|
|
|
|
(if outline then Pdfops.Op_s else Pdfops.Op_f);
|
|
|
|
Pdfops.Op_EMC;
|
|
|
|
Pdfops.Op_Q
|
|
|
|
]
|
|
|
|
in
|
|
|
|
let page = {page with Pdfpage.resources = resources'} in
|
|
|
|
if underneath
|
|
|
|
then Pdfpage.prepend_operators pdf ops ~fast:fast page
|
|
|
|
else Pdfpage.postpend_operators pdf ops ~fast:fast page
|
|
|
|
in
|
2023-03-23 20:24:50 +01:00
|
|
|
Cpdfpage.process_pages (Pdfpage.ppstub addrectangle_page) pdf range
|