cpdf-source/cpdfaddtext.ml

731 lines
30 KiB
OCaml
Raw Normal View History

2022-09-21 18:40:28 +02:00
(* Superimpose text, page numbers etc. *)
2021-12-21 15:00:58 +01:00
open Pdfutil
open Cpdferror
2022-09-24 18:19:05 +02:00
(* Keep a record of charcodes used for font subsetting. *)
2022-09-24 15:17:29 +02:00
let used = null_hash ()
2022-09-24 18:19:05 +02:00
(* If we have used a TTF font, put its name and object number here. Then we
know we put it there, and can avoid writing it anew across ANDS. *)
let glob_pdfobjnum = null_hash ()
2021-12-21 15:00:58 +01:00
type color =
Grey of float
| RGB of float * float * float
| CYMK of float * float * float * float
2022-09-21 18:40:28 +02:00
(* Process UTF8 text to charcodes, given a font. *)
2021-12-21 15:00:58 +01:00
let charcodes_of_utf8 font s =
let extractor = Pdftext.charcode_extractor_of_font_real ~debug:false font in
let codepoints = Pdftext.codepoints_of_utf8 s in
let charcodes =
option_map
(fun codepoint ->
match extractor codepoint with
| Some cc -> Some cc
| None -> Printf.eprintf "Warning: character not found in font for unicode codepoint 0x%X\n" codepoint; None)
codepoints
in
implode (map char_of_int charcodes)
2022-09-24 12:58:09 +02:00
let unicode_codepoint_of_pdfcode encoding_table glyphlist_table p =
try
hd (Hashtbl.find glyphlist_table (Hashtbl.find encoding_table p))
with
Not_found -> 0
2021-12-21 15:00:58 +01:00
(* Get the width of some text in the given font *)
let width_of_text font text =
match font with
| Pdftext.SimpleFont {Pdftext.fontmetrics = Some fontmetrics} ->
begin try
fold_left ( +. ) 0. (map (fun c -> fontmetrics.(int_of_char c)) (explode text))
with
_ -> 0.
end
| _ -> 0.
let colour_op = function
| RGB (r, g, b) -> Pdfops.Op_rg (r, g, b)
| Grey g -> Pdfops.Op_g g
| CYMK (c, y, m, k) -> Pdfops.Op_k (c, y, m, k)
let colour_op_stroke = function
| RGB (r, g, b) -> Pdfops.Op_RG (r, g, b)
| Grey g -> Pdfops.Op_G g
| CYMK (c, y, m, k) -> Pdfops.Op_K (c, y, m, k)
2022-09-24 18:29:04 +02:00
let ops fontname longest_w x y rotate hoffset voffset outline linewidth unique_fontname unique_extgstatename colour fontsize text =
begin match Hashtbl.find used fontname with
| exception Not_found ->
let thisused = null_hash () in
String.iter (fun x -> Hashtbl.replace thisused x ()) text;
Hashtbl.add used fontname thisused
| thisused ->
String.iter (fun x -> Hashtbl.replace thisused x ()) text;
end;
[Pdfops.Op_q;
Pdfops.Op_BMC "/CPDFSTAMP";
Pdfops.Op_cm
(Pdftransform.matrix_of_transform
[Pdftransform.Translate (x -. hoffset, y -. voffset);
Pdftransform.Rotate ((0., 0.), rotate)]);
Pdfops.Op_BT;
] @
(if outline then [Pdfops.Op_w linewidth; Pdfops.Op_Tr 1] else [Pdfops.Op_Tr 0]) @
[colour_op colour; colour_op_stroke colour]
@
(match unique_extgstatename with None -> [] | Some n -> [Pdfops.Op_gs n])
@
[Pdfops.Op_Tf (unique_fontname, fontsize);
Pdfops.Op_Tj text;
Pdfops.Op_ET;
Pdfops.Op_EMC;
Pdfops.Op_Q]
2021-12-21 15:00:58 +01:00
type justification = LeftJustify | CentreJustify | RightJustify
(* Find the h-offset for justification based on the longest width, the current
width, the justification and the position. *)
let find_justification_offsets longest_w w position j =
let open Cpdfposition in
match j with
| LeftJustify ->
begin match position with
| TopLeft _ | Left _ | PosLeft _ | BottomLeft _ -> 0.
| Top _ | PosCentre _ | Bottom _ | Centre -> (longest_w -. w) /. 2.
| TopRight _ | BottomRight _ | PosRight _ | Right _ -> longest_w -. w
| Diagonal -> 0.
| ReverseDiagonal -> 0.
end
| RightJustify ->
begin match position with
| TopLeft _ | Left _ | PosLeft _ | BottomLeft _ -> ~-.(longest_w -. w)
| Top _ | PosCentre _ | Bottom _ | Centre -> ~-.((longest_w -. w) /. 2.)
| TopRight _ | BottomRight _ | PosRight _ | Right _ -> 0.
| Diagonal -> 0.
| ReverseDiagonal -> 0.
end
| CentreJustify ->
begin match position with
| TopLeft _ | Left _ | PosLeft _ | BottomLeft _ -> ~-.((longest_w -. w) /. 2.)
| Top _ | PosCentre _ | Bottom _ | Centre -> 0.
| TopRight _ | BottomRight _ | PosRight _ | Right _ -> (longest_w -. w) /. 2.
| Diagonal -> 0.
| ReverseDiagonal -> 0.
end
(* Lex an integer from the table *)
let extract_num header s =
match Pdfgenlex.lex_string (Hashtbl.find header s) with
[Pdfgenlex.LexInt i] -> Pdf.Integer i
| [Pdfgenlex.LexReal f] -> Pdf.Real f
| _ -> raise (Failure ("extract_num: " ^ s))
let extract_fontbbox header s =
let num = function
Pdfgenlex.LexInt i -> Pdf.Integer i
| Pdfgenlex.LexReal f -> Pdf.Real f
| _ -> raise (Failure "extract_fontbbox")
in
match Pdfgenlex.lex_string (Hashtbl.find header s) with
[a; b; c; d] -> [num a; num b; num c; num d]
| _ -> raise (Failure "extract_fontbbox")
let remove_slash s =
match explode s with
'/'::x -> implode x
| _ -> raise (Failure "remove_slash")
let extract_widths chars_and_widths =
let win_to_name = map (fun (x, y) -> (y, x)) Pdfglyphlist.name_to_win in
map
(fun x ->
try
let name = List.assoc x win_to_name in
let width = List.assoc (remove_slash name) chars_and_widths in
width
with
_ -> 0)
(ilist 0 255)
2022-08-07 15:01:05 +02:00
(* For finding the height for URL links, we try to find the Cap Height for the
font. For now, this will only work for built-in fonts. We fall back to using
the font size alone if we cannot get the cap height. *)
2022-09-24 13:34:50 +02:00
let cap_height font fontname =
match font with
| Some (Pdftext.SimpleFont {fontdescriptor = Some {capheight}}) ->
Some capheight
| _ ->
try
let font = unopt (Pdftext.standard_font_of_name ("/" ^ fontname)) in
let header, _, _, _ = Pdfstandard14.afm_data font in
let capheight = try extract_num header "CapHeight" with _ -> Pdf.Integer 0 in
Some (match capheight with Pdf.Integer i -> float_of_int i | Pdf.Real r -> r | _ -> 0.)
with
_ -> None
2022-08-07 15:01:05 +02:00
2022-09-21 18:40:28 +02:00
let rec string_of_encoding = function
| Pdftext.StandardEncoding -> "StandardEncoding"
| Pdftext.MacRomanEncoding -> "MacRomanEncoding"
| Pdftext.WinAnsiEncoding -> "WinAnsiEncoding"
| _ -> error "unknown encoding"
let make_font embed encoding fontname =
2022-09-27 16:24:02 +02:00
Pdf.Dictionary
[("/Type", Pdf.Name "/Font");
("/Subtype", Pdf.Name "/TrueType");
("/Encoding", Pdf.Name ("/" ^ string_of_encoding encoding));
("/BaseFont", Pdf.Name ("/" ^ fontname))]
2021-12-21 15:00:58 +01:00
let extract_page_text only_fontsize pdf _ page =
let text_extractor = ref None in
let right_font_size = ref false in
fold_left ( ^ ) ""
(map
(function
| Pdfops.Op_Tf (fontname, fontsize) ->
right_font_size :=
begin match only_fontsize with
Some x -> x = fontsize
| _ -> false
end;
let fontdict =
match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
| None -> raise (Pdf.PDFError "Missing /Font in text extraction")
| Some d ->
match Pdf.lookup_direct pdf fontname d with
| None -> raise (Pdf.PDFError "Missing font in text extraction")
| Some d -> d
in
text_extractor := Some (Pdftext.text_extractor_of_font pdf fontdict);
""
| Pdfops.Op_Tj text when !text_extractor <> None ->
if not !right_font_size then
""
else
Pdftext.utf8_of_codepoints
(Pdftext.codepoints_of_text (unopt !text_extractor) text)
| Pdfops.Op_TJ (Pdf.Array objs) when !text_extractor <> None ->
if not !right_font_size then
""
else
fold_left ( ^ ) ""
(option_map
(function
| Pdf.String text ->
Some
(Pdftext.utf8_of_codepoints
(Pdftext.codepoints_of_text (unopt !text_extractor) text))
| _ -> None)
objs)
| _ -> "")
(Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content))
(* For each page, extract all the ops with text in them, and concatenate it all together *)
let extract_text extract_text_font_size pdf range =
fold_left (fun x y -> x ^ (if x <> "" && y <> "" then "\n" else "") ^ y) ""
(Cpdfpage.map_pages (extract_page_text extract_text_font_size pdf) pdf range)
let rec process_text time text m =
match m with
| [] -> Cpdfstrftime.strftime ~time text
| (s, r)::t -> process_text time (string_replace_all_lazy s r text) t
2022-08-03 17:38:05 +02:00
(* Find any %URL, sub in the text and return the new text together with a list
of ordered (line num, URL, startpos, endpos) data.
This will be used after any other %Specials have been processed, so that the
positions do not change. *)
(* text|url]abc -> text, url, abc *)
let extract_url line =
let text, rest = cleavewhile (neq '|') line in
if rest = [] then error "bad URL syntax in text" else
let url, rest = cleavewhile (neq ']') (tl rest) in
if rest = [] then error "bad URL syntax in text" else
(text, url, tl rest)
2022-08-04 18:16:04 +02:00
(* multiple %URL[a|b] *)
2022-08-03 17:38:05 +02:00
let get_urls_line line =
let line = explode line in
let urls = ref [] in
let pos = ref 0 in
let outline = ref [] in
let rec loop = function
| '%'::'U'::'R'::'L'::'['::t ->
let text, url, rest = extract_url t in
outline := rev text @ !outline;
urls := (implode url, !pos, !pos + length text)::!urls;
pos += length text;
loop rest
| h::t ->
outline := h::!outline;
pos += 1;
loop t
| [] -> ()
in
loop line;
(implode (rev !outline), rev !urls)
2021-12-21 15:00:58 +01:00
(* Return page label at pdf page num, or page number in arabic if no label *)
let pagelabel pdf num =
Pdfpagelabels.pagelabeltext_of_pagenumber
num
(Pdfpagelabels.complete (Pdfpagelabels.read pdf))
let addtext
2022-09-24 15:17:29 +02:00
time lines linewidth outline fast colour fontname encoding embed bates batespad fontsize
2022-09-21 18:40:28 +02:00
(font : Pdftext.font option)
2022-09-16 20:09:41 +02:00
fontpdfobj underneath position hoffset voffset text pages orientation cropbox opacity
2021-12-21 15:00:58 +01:00
justification filename extract_text_font_size shift pdf
=
let endpage = Pdfpage.endpage pdf in
let replace_pairs pdf filename bates batespad num page =
[
"%PageDiv2", (fun () -> string_of_int ((num + 1) / 2));
"%Page", (fun () -> string_of_int num);
"%Roman", (fun () -> roman_upper num);
"%roman", (fun () -> roman_lower num);
"%filename", (fun () -> filename);
"%Label", (fun () -> pagelabel pdf num);
"%EndPage", (fun () -> string_of_int endpage);
"%EndLabel", (fun () -> pagelabel pdf endpage);
"%ExtractedText", (fun () -> extract_page_text extract_text_font_size pdf num page);
"%Bates",
(fun () ->
(let numstring = string_of_int (bates + num - 1) in
match batespad with
None -> numstring
| Some w ->
if String.length numstring >= w
then numstring
else implode (many '0' (w - String.length numstring)) ^ numstring))]
in
let shifts = Cpdfcoord.parse_coordinates pdf shift in
let addtext_page num page =
let shift_x, shift_y = List.nth shifts (num - 1) in
let resources', unique_extgstatename =
if opacity < 1.0 then
let dict =
match Pdf.lookup_direct pdf "/ExtGState" page.Pdfpage.resources with
| Some d -> d
| None -> Pdf.Dictionary []
in
let unique_extgstatename = Pdf.unique_key "gs" dict in
let dict' =
Pdf.add_dict_entry dict unique_extgstatename
(Pdf.Dictionary [("/ca", Pdf.Real opacity); ("/CA", Pdf.Real opacity)])
in
Pdf.add_dict_entry page.Pdfpage.resources "/ExtGState" dict', Some unique_extgstatename
else
page.Pdfpage.resources, None
in
let fontdict =
match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
| None -> Pdf.Dictionary []
| Some d -> d
in
let calc_textwidth text =
match font with
2022-09-16 19:07:46 +02:00
| Some (Pdftext.StandardFont (f, _)) ->
2021-12-21 15:00:58 +01:00
let rawwidth =
2022-09-21 18:40:28 +02:00
Pdfstandard14.textwidth false encoding f text
2021-12-21 15:00:58 +01:00
in
(float rawwidth *. fontsize) /. 1000.
2022-09-16 20:09:41 +02:00
| Some font ->
let rawwidth = width_of_text font text in
(rawwidth *. fontsize) /. 1000.
2021-12-21 15:00:58 +01:00
| None ->
let font =
match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
| Some fontdict ->
begin match Pdf.lookup_direct pdf fontname fontdict with
| Some font -> font
| None ->
(* For each item in the fontdict, follow its value and find the basename. If it matches, return that font *)
let font = ref None in
iter
(fun (k, v) ->
match Pdf.lookup_direct pdf "/BaseFont" v with
| Some (Pdf.Name n) when n = fontname -> font := Some v
| _ -> ())
(match fontdict with Pdf.Dictionary d -> d | _ -> []);
match !font with Some f -> f | None -> failwith (Printf.sprintf "addtext: font %s not found" fontname)
end
| _ -> failwith "addtext: font not found for width"
in
let rawwidth = width_of_text (Pdftext.read_font pdf font) text in
(rawwidth *. fontsize) /. 1000.
in
2022-08-06 16:42:28 +02:00
let unique_fontname = Pdf.unique_key "F" fontdict in
let ops, urls, x, y, hoffset, voffset, text, joffset =
2022-08-06 16:42:28 +02:00
let text = process_text time text (replace_pairs pdf filename bates batespad num page) in
let text, urls = get_urls_line text in
2021-12-21 15:00:58 +01:00
let expanded_lines =
map
(function text ->
process_text time text (replace_pairs pdf filename bates batespad num page))
lines
in
2022-08-06 16:42:28 +02:00
let expanded_lines = (* process URLs for justification too *)
map (fun line -> fst (get_urls_line line)) expanded_lines
2022-08-03 17:38:05 +02:00
in
2021-12-21 15:00:58 +01:00
let textwidth = calc_textwidth text
and allwidths = map calc_textwidth expanded_lines in
let longest_w = last (sort compare allwidths) in
let joffset = find_justification_offsets longest_w textwidth position justification in
let mediabox =
if cropbox then
match Pdf.lookup_direct pdf "/CropBox" page.Pdfpage.rest with
2022-07-14 15:06:25 +02:00
| Some pdfobject -> Pdf.parse_rectangle pdf (Pdf.direct pdf pdfobject)
| None -> Pdf.parse_rectangle pdf page.Pdfpage.mediabox
2021-12-21 15:00:58 +01:00
else
2022-07-14 15:06:25 +02:00
Pdf.parse_rectangle pdf page.Pdfpage.mediabox
2021-12-21 15:00:58 +01:00
in
2022-09-21 18:40:28 +02:00
let x, y, rotate = Cpdfposition.calculate_position false textwidth mediabox position in
2021-12-21 15:00:58 +01:00
let hoffset, voffset =
if position = Diagonal || position = ReverseDiagonal
then -. (cos ((pi /. 2.) -. rotate) *. voffset), sin ((pi /. 2.) -. rotate) *. voffset
else hoffset, voffset
in
match font with
| Some f ->
2022-09-24 18:29:04 +02:00
ops fontname longest_w (x +. shift_x) (y +. shift_y) rotate (hoffset +. joffset) voffset outline linewidth
2022-08-04 18:16:04 +02:00
unique_fontname unique_extgstatename colour fontsize text,
urls, x, y, hoffset, voffset, text, joffset
2021-12-21 15:00:58 +01:00
| None ->
2022-09-24 18:29:04 +02:00
ops fontname longest_w (x +. shift_x) (y +. shift_y) rotate (hoffset +. joffset) voffset outline linewidth
2022-08-04 18:16:04 +02:00
fontname None colour fontsize text,
urls, x, y, hoffset, voffset, text, joffset
2021-12-21 15:00:58 +01:00
in
let newresources =
match font with
2022-09-16 20:09:41 +02:00
| Some (Pdftext.StandardFont _) ->
2021-12-21 15:00:58 +01:00
let newfontdict =
2022-09-21 18:40:28 +02:00
Pdf.add_dict_entry fontdict unique_fontname (make_font embed encoding fontname)
2021-12-21 15:00:58 +01:00
in
Pdf.add_dict_entry resources' "/Font" newfontdict
2022-09-16 20:09:41 +02:00
| Some f ->
let newfontdict =
Pdf.add_dict_entry fontdict unique_fontname fontpdfobj
in
Pdf.add_dict_entry resources' "/Font" newfontdict
2021-12-21 15:00:58 +01:00
| None -> page.Pdfpage.resources
in
2022-08-04 18:16:04 +02:00
(* Build annotations from URL data (get_urls and some sense of metrics) *)
let annot (minx, miny, maxx, maxy) url =
Pdf.Dictionary
[("/Subtype", Pdf.Name "/Link");
("/Rect", Pdf.Array [Pdf.Real minx; Pdf.Real miny; Pdf.Real maxx; Pdf.Real maxy]);
2022-08-07 18:25:45 +02:00
("/BS", Pdf.Dictionary [("/W", Pdf.Integer 0)]);
2022-08-04 18:16:04 +02:00
("/A", Pdf.Dictionary [("/URI", Pdf.String url);
("/Type", Pdf.Name "/Action");
("/S", Pdf.Name "/URI")])]
in
let annots =
2022-08-07 15:01:05 +02:00
let annot_coord text pos =
let before = take (explode text) pos in
calc_textwidth (implode before)
2022-08-06 16:42:28 +02:00
in
map (fun (url, s, e) ->
2022-08-07 15:01:05 +02:00
let sx = annot_coord text s in
let ex = annot_coord text e in
let x, y = x -. hoffset -. joffset, y -. voffset in
2022-08-07 15:01:05 +02:00
let height =
2022-09-24 13:34:50 +02:00
match cap_height font fontname with
2022-08-07 15:01:05 +02:00
| Some c -> (c *. fontsize) /. 1000.
| None -> fontsize
in
2022-08-06 16:42:28 +02:00
Pdf.Indirect (Pdf.addobj pdf (annot (x +. sx, y, x +. ex, y +. height) url))) urls
2022-08-04 18:16:04 +02:00
in
let newrest =
if annots = [] then page.Pdfpage.rest else
let existing =
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
| Some (Pdf.Array a) -> a
| _ -> []
in
Pdf.add_dict_entry page.Pdfpage.rest "/Annots" (Pdf.Array (annots @ existing))
in
let page =
{page with
Pdfpage.resources = newresources;
Pdfpage.rest = newrest}
in
2021-12-21 15:00:58 +01:00
if underneath
then Pdfpage.prepend_operators pdf ops ~fast:fast page
else Pdfpage.postpend_operators pdf ops ~fast:fast page
in
2022-09-12 18:05:20 +02:00
Cpdfpage.process_pages (Cpdfutil.ppstub addtext_page) pdf pages
2021-12-21 15:00:58 +01:00
(* Prev is a list of lists of characters *)
let split_at_newline t =
let rec split_at_newline_inner prev = function
| [] -> rev (map implode (map rev prev))
| '\\'::'\\'::'n'::t -> split_at_newline_inner (('n'::'\\'::'\\'::hd prev)::tl prev) t
| '\\'::'n'::t -> split_at_newline_inner ([]::prev) t
| h::t -> split_at_newline_inner ((h::hd prev)::tl prev) t
in
split_at_newline_inner [[]] (explode t)
let rec unescape_chars prev = function
| [] -> rev prev
| '\\'::('0'..'7' as a)::('0'..'7' as b)::('0'..'7' as c)::t ->
let chr = char_of_int (int_of_string ("0o" ^ implode [a;b;c])) in
unescape_chars (chr::prev) t
| '\\'::'\\'::t -> unescape_chars ('\\'::prev) t
| '\\'::c::t when c <> 'n' -> unescape_chars (c::prev) t
| h::t -> unescape_chars (h::prev) t
let unescape_string s =
implode (unescape_chars [] (explode s))
let
2022-09-24 11:55:12 +02:00
addtexts ?embedinfo linewidth outline fast fontname (font : Pdftext.font option) embed bates batespad colour position linespacing
2021-12-21 15:00:58 +01:00
fontsize underneath text pages orientation cropbox opacity justification
midline topline filename extract_text_font_size shift ?(raw=false) pdf
=
2022-09-24 11:55:12 +02:00
let time = Cpdfstrftime.current_time () in
2021-12-21 15:00:58 +01:00
if pages = [] then error "addtexts: empty page range" else
let realfontname = ref fontname in
let fontpdfobj =
match font with
2022-09-21 18:40:28 +02:00
| Some (StandardFont (f, encoding)) ->
make_font embed encoding (Pdftext.string_of_standard_font f)
2022-09-16 20:09:41 +02:00
| Some f ->
2022-09-24 18:19:05 +02:00
begin match Hashtbl.find glob_pdfobjnum fontname with
| exception Not_found ->
let i = Pdftext.write_font pdf f in
Hashtbl.add glob_pdfobjnum fontname i; Pdf.Indirect i
| i ->
Pdf.Indirect i
end
2021-12-21 15:00:58 +01:00
| None ->
let firstpage =
List.nth (Pdfpage.pages_of_pagetree pdf) (hd pages - 1)
in
match Pdf.lookup_direct pdf "/Font" firstpage.Pdfpage.resources with
| Some fontdict ->
begin match Pdf.lookup_direct pdf fontname fontdict with
| Some font -> font
| _ ->
(* For each item in the fontdict, follow its value and find the basename. If it matches, return that font *)
let font = ref None in
iter
(fun (k, v) ->
match Pdf.lookup_direct pdf "/BaseFont" v with
| Some (Pdf.Name n) when n = fontname ->
font := Some v; realfontname := k
| _ -> ())
(match fontdict with Pdf.Dictionary d -> d | _ -> []);
match !font with Some f -> f | None -> failwith (Printf.sprintf "addtext: font %s not found" fontname)
end
| _ -> failwith "addtext: font dictionary not present"
in
(* 19th May 2022. Reversed the phase order (split first, then get charcodes. This allows \n in custom fonts. *)
2021-12-21 15:00:58 +01:00
let lines = map unescape_string (split_at_newline text) in
let lines = map (fun text -> if raw then text else charcodes_of_utf8 (Pdftext.read_font pdf fontpdfobj) text) lines in
2021-12-21 15:00:58 +01:00
let pdf = ref pdf in
let voffset =
let open Cpdfposition in
match position with
| Bottom _ | BottomLeft _ | BottomRight _ ->
ref (0. -. (linespacing *. fontsize *. (float (length lines) -. 1.)))
| Left _ | Right _ ->
(* Vertically align *)
ref (0. -. (linespacing *. ((fontsize *. (float (length lines) -. 1.)) /. 2.)))
| Diagonal | ReverseDiagonal ->
(* Change so that the whole paragraph sits on the centre... *)
ref (0. -. ((linespacing *. fontsize *. (float (length lines) -. 1.)) /. 2.))
| _ -> ref 0.
in
if midline then
begin match font with
2022-09-16 19:07:46 +02:00
| Some (Pdftext.StandardFont (font, _)) ->
2021-12-21 15:00:58 +01:00
let baseline_adjustment =
(fontsize *. float (Pdfstandard14.baseline_adjustment font)) /. 1000.
in
voffset := !voffset +. baseline_adjustment
2022-09-16 21:10:49 +02:00
| Some (Pdftext.SimpleFont {fontdescriptor = Some {capheight}}) ->
voffset := !voffset +. capheight /. 2.
| _ ->
Printf.eprintf "Unable to find midline adjustment in this font\n"
2021-12-21 15:00:58 +01:00
end
else
if topline then
begin match font with
2022-09-16 19:07:46 +02:00
| Some (Pdftext.StandardFont (font, _)) ->
2021-12-21 15:00:58 +01:00
let baseline_adjustment =
(fontsize *. float (Pdfstandard14.baseline_adjustment font) *. 2.0) /. 1000.
in
voffset := !voffset +. baseline_adjustment
2022-09-16 21:10:49 +02:00
| Some (Pdftext.SimpleFont {fontdescriptor = Some {capheight}}) ->
voffset := !voffset +. capheight
| _ ->
Printf.eprintf "Unable to find midline adjustment in this font\n"
2022-09-21 18:40:28 +02:00
end;
let encoding =
match font with
| Some (Pdftext.StandardFont (_, e)) -> e
| Some (Pdftext.SimpleFont {encoding}) -> encoding
| _ -> Pdftext.WinAnsiEncoding
in
iter
(fun line ->
let voff, hoff = !voffset, 0. in
pdf :=
2022-09-24 15:17:29 +02:00
addtext time lines linewidth outline fast colour !realfontname encoding
2022-09-21 18:40:28 +02:00
embed bates batespad fontsize font fontpdfobj underneath position hoff voff line
pages orientation cropbox opacity justification filename
extract_text_font_size shift
!pdf;
voffset := !voffset +. (linespacing *. fontsize))
lines;
2022-09-24 12:58:09 +02:00
begin match embedinfo with
| None -> ()
| Some (_, fontfile, fontname, encoding) ->
2022-09-24 18:29:04 +02:00
let charcodes =
match Hashtbl.find used fontname with
| exception Not_found -> []
| thisused -> map fst (list_of_hashtbl thisused)
in
2022-09-24 12:58:09 +02:00
let encoding_table = Pdftext.table_of_encoding encoding in
let glyphlist_table = Pdfglyphlist.glyph_hashes () in
let codepoints =
map (fun c -> unicode_codepoint_of_pdfcode encoding_table glyphlist_table (int_of_char c)) charcodes
in
let objnum = match fontpdfobj with Pdf.Indirect i -> i | _ -> failwith "bad fontpdfobj" in
let font = Cpdfembed.embed_truetype !pdf ~fontfile ~fontname ~codepoints ~encoding in
ignore (Pdftext.write_font ~objnum !pdf font)
end;
2022-09-21 18:40:28 +02:00
!pdf
2021-12-21 15:00:58 +01:00
let removetext range pdf =
(* Could fail on nesting, or other marked content inside our marked content.*)
let rec remove_until_last_EMC level = function
| [] -> []
| Pdfops.Op_BMC "/CPDFSTAMP"::more ->
remove_until_last_EMC (level + 1) more
| Pdfops.Op_EMC::more ->
if level = 1
then more
else remove_until_last_EMC (level - 1) more
| _::more ->
remove_until_last_EMC level more
in
let rec remove_stamps prev = function
| [] -> rev prev
| Pdfops.Op_BMC "/CPDFSTAMP"::more ->
let rest = remove_until_last_EMC 1 more in
remove_stamps prev rest
| h::t -> remove_stamps (h::prev) t
in
let removetext_page _ page =
{page with
Pdfpage.content =
let ops = Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content in
[Pdfops.stream_of_ops (remove_stamps [] ops)]}
in
2021-12-27 16:20:07 +01:00
Cpdfpage.process_pages (Cpdfutil.ppstub removetext_page) pdf range
2021-12-21 15:00:58 +01:00
let addrectangle
fast (w, h) colour outline linewidth opacity position relative_to_cropbox
underneath range pdf
=
let addrectangle_page _ page =
let resources', unique_extgstatename =
if opacity < 1.0 then
let dict =
match Pdf.lookup_direct pdf "/ExtGState" page.Pdfpage.resources with
| Some d -> d
| None -> Pdf.Dictionary []
in
let unique_extgstatename = Pdf.unique_key "gs" dict in
let dict' =
Pdf.add_dict_entry dict unique_extgstatename
(Pdf.Dictionary [("/ca", Pdf.Real opacity); ("/CA", Pdf.Real opacity)])
in
Pdf.add_dict_entry page.Pdfpage.resources "/ExtGState" dict', Some unique_extgstatename
else
page.Pdfpage.resources, None
in
let mediabox =
if relative_to_cropbox then
match Pdf.lookup_direct pdf "/CropBox" page.Pdfpage.rest with
2022-07-14 15:06:25 +02:00
| Some pdfobject -> Pdf.parse_rectangle pdf (Pdf.direct pdf pdfobject)
| None -> Pdf.parse_rectangle pdf page.Pdfpage.mediabox
2021-12-21 15:00:58 +01:00
else
2022-07-14 15:06:25 +02:00
Pdf.parse_rectangle pdf page.Pdfpage.mediabox
2021-12-21 15:00:58 +01:00
in
let x, y, _ =
2022-09-21 18:40:28 +02:00
Cpdfposition.calculate_position false w mediabox position
2021-12-21 15:00:58 +01:00
in
let x, y =
match position with
Cpdfposition.Top _ | Cpdfposition.TopLeft _ | Cpdfposition.TopRight _ -> (x, y -. h)
| Cpdfposition.Centre | Cpdfposition.PosCentre _ -> (x, y -. (h /. 2.))
| _ -> (x, y)
in
let ops =
[
Pdfops.Op_q;
Pdfops.Op_BMC "/CPDFSTAMP";
colour_op colour;
colour_op_stroke colour;
]
@
(if outline then [Pdfops.Op_w linewidth] else [])
@
(match unique_extgstatename with None -> [] | Some n -> [Pdfops.Op_gs n])
@
[
Pdfops.Op_re (x, y, w, h);
(if outline then Pdfops.Op_s else Pdfops.Op_f);
Pdfops.Op_EMC;
Pdfops.Op_Q
]
in
let page = {page with Pdfpage.resources = resources'} in
if underneath
then Pdfpage.prepend_operators pdf ops ~fast:fast page
else Pdfpage.postpend_operators pdf ops ~fast:fast page
in
2021-12-27 16:20:07 +01:00
Cpdfpage.process_pages (Cpdfutil.ppstub addrectangle_page) pdf range
2021-12-21 15:00:58 +01:00
2021-12-21 16:40:52 +01:00
let rec remove_all_text_ops pdf resources content =
let is_textop = function
Pdfops.Op_Tj _ | Pdfops.Op_' _ | Pdfops.Op_'' _ | Pdfops.Op_TJ _ -> true
| _ -> false
in
let content' =
let ops = Pdfops.parse_operators pdf resources content in
Pdfops.stream_of_ops
(option_map (function x -> if is_textop x then None else Some x) ops)
in
[content']
let remove_all_text_page pdf p =
let resources = p.Pdfpage.resources in
let content = p.Pdfpage.content in
Cpdfutil.process_xobjects pdf p remove_all_text_ops;
{p with Pdfpage.content = remove_all_text_ops pdf resources content}, pdf
let remove_all_text range pdf =
let pages = Pdfpage.pages_of_pagetree pdf in
let pagenums = indx pages in
let pdf = ref pdf in
let pages' = ref [] in
iter2
(fun p pagenum ->
let p', pdf' =
if mem pagenum range
then remove_all_text_page !pdf p
else p, !pdf
in
pdf := pdf';
pages' =| p')
pages
pagenums;
Pdfpage.change_pages true !pdf (rev !pages')