This commit is contained in:
John Whitington 2021-12-21 14:00:58 +00:00
parent c6d606136b
commit 6297181775
6 changed files with 759 additions and 751 deletions

View File

@ -1,7 +1,7 @@
# Build the cpdf command line tools and top level
MODS = cpdfyojson cpdfxmlm \
cpdfunicodedata cpdferror cpdfdebug cpdfjson cpdfstrftime cpdfcoord cpdfattach \
cpdfpagespec cpdfposition cpdfpresent cpdfmetadata cpdfbookmarks cpdfpage cpdf cpdffont cpdftype \
cpdfpagespec cpdfposition cpdfpresent cpdfmetadata cpdfbookmarks cpdfpage cpdfaddtext cpdf cpdffont cpdftype \
cpdftexttopdf cpdftoc cpdfpad cpdfocg cpdfsqueeze cpdfspot cpdfpagelabels cpdfcreate cpdfannot cpdfcommand
SOURCES = $(foreach x,$(MODS),$(x).ml $(x).mli) cpdfcommandrun.ml

649
cpdf.ml
View File

@ -3,27 +3,10 @@ open Pdfutil
open Pdfio
open Cpdferror
type color =
Grey of float
| RGB of float * float * float
| CYMK of float * float * float * float
(* Return page label at pdf page num, or page number in arabic if no label *)
let pagelabel pdf num =
Pdfpagelabels.pagelabeltext_of_pagenumber
num
(Pdfpagelabels.complete (Pdfpagelabels.read pdf))
let rec process_text time text m =
match m with
| [] -> Cpdfstrftime.strftime ~time text
| (s, r)::t -> process_text time (string_replace_all_lazy s r text) t
(* For uses of process_pages which don't need to deal with matrices, this
function transforms into one which returns the identity matrix *)
let ppstub f n p = (f n p, n, Pdftransform.i_matrix)
(* Add stack operators to a content stream to ensure it is composeable. On
-fast, we don't check for Q deficit, assuming PDF is ISO. *)
let protect fast pdf resources content =
@ -99,564 +82,6 @@ let hasbox pdf page boxname =
| Some _ -> true
| _ -> false
(* \section{Superimpose text, page numbers etc.} *)
(* Process UTF8 text to /WinAnsiEncoding string (for standard 14) or whatever
is in the font (for existing fonts). *)
let charcodes_of_utf8 font s =
let extractor = Pdftext.charcode_extractor_of_font_real ~debug:false font in
let codepoints = Pdftext.codepoints_of_utf8 s in
let charcodes =
option_map
(fun codepoint ->
match extractor codepoint with
| Some cc -> Some cc
| None -> Printf.eprintf "Warning: character not found in font for unicode codepoint 0x%X\n" codepoint; None)
codepoints
in
implode (map char_of_int charcodes)
(* Process codepoints back to UTF8, assuming it came from UTF8 to start with *)
let utf8_of_winansi s =
let text_extractor =
Pdftext.text_extractor_of_font_real
(Pdftext.StandardFont (Pdftext.TimesRoman, Pdftext.WinAnsiEncoding))
in
let codepoints = Pdftext.codepoints_of_text text_extractor s in
Pdftext.utf8_of_codepoints codepoints
(* Get the width of some text in the given font *)
let width_of_text font text =
match font with
| Pdftext.SimpleFont {Pdftext.fontmetrics = Some fontmetrics} ->
begin try
fold_left ( +. ) 0. (map (fun c -> fontmetrics.(int_of_char c)) (explode text))
with
_ -> 0.
end
| _ -> 0.
type ops_metrics =
{metrics_text : string;
metrics_x : float;
metrics_y : float;
metrics_rot : float}
let ops_metrics : ops_metrics list ref = ref []
let ops_baseline_adjustment = ref 0.
let metrics_howmany () = length !ops_metrics
let metrics_text n =
utf8_of_winansi (select n !ops_metrics).metrics_text
let metrics_x n =
(select n !ops_metrics).metrics_x
let metrics_y n =
(select n !ops_metrics).metrics_y
let metrics_rot n =
(select n !ops_metrics).metrics_rot
let metrics_baseline_adjustment () = !ops_baseline_adjustment
let colour_op = function
| RGB (r, g, b) -> Pdfops.Op_rg (r, g, b)
| Grey g -> Pdfops.Op_g g
| CYMK (c, y, m, k) -> Pdfops.Op_k (c, y, m, k)
let colour_op_stroke = function
| RGB (r, g, b) -> Pdfops.Op_RG (r, g, b)
| Grey g -> Pdfops.Op_G g
| CYMK (c, y, m, k) -> Pdfops.Op_K (c, y, m, k)
let ops longest_w metrics x y rotate hoffset voffset outline linewidth unique_fontname unique_extgstatename colour fontsize text =
if metrics then
ops_metrics :=
{metrics_text = text; metrics_x = x -. hoffset; metrics_y = y -. voffset; metrics_rot = rotate}
::!ops_metrics;
[Pdfops.Op_q;
Pdfops.Op_BMC "/CPDFSTAMP";
Pdfops.Op_cm
(Pdftransform.matrix_of_transform
[Pdftransform.Translate (x -. hoffset, y -. voffset);
Pdftransform.Rotate ((0., 0.), rotate)]);
Pdfops.Op_BT;
] @
(if outline then [Pdfops.Op_w linewidth; Pdfops.Op_Tr 1] else [Pdfops.Op_Tr 0]) @
[colour_op colour; colour_op_stroke colour]
@
(match unique_extgstatename with None -> [] | Some n -> [Pdfops.Op_gs n])
@
[Pdfops.Op_Tf (unique_fontname, fontsize);
Pdfops.Op_Tj text;
Pdfops.Op_ET;
Pdfops.Op_EMC;
Pdfops.Op_Q]
type justification = LeftJustify | CentreJustify | RightJustify
(* Find the h-offset for justification based on the longest width, the current
width, the justification and the position. *)
let find_justification_offsets longest_w w position j =
let open Cpdfposition in
match j with
| LeftJustify ->
begin match position with
| TopLeft _ | Left _ | PosLeft _ | BottomLeft _ -> 0.
| Top _ | PosCentre _ | Bottom _ | Centre -> (longest_w -. w) /. 2.
| TopRight _ | BottomRight _ | PosRight _ | Right _ -> longest_w -. w
| Diagonal -> 0.
| ReverseDiagonal -> 0.
end
| RightJustify ->
begin match position with
| TopLeft _ | Left _ | PosLeft _ | BottomLeft _ -> ~-.(longest_w -. w)
| Top _ | PosCentre _ | Bottom _ | Centre -> ~-.((longest_w -. w) /. 2.)
| TopRight _ | BottomRight _ | PosRight _ | Right _ -> 0.
| Diagonal -> 0.
| ReverseDiagonal -> 0.
end
| CentreJustify ->
begin match position with
| TopLeft _ | Left _ | PosLeft _ | BottomLeft _ -> ~-.((longest_w -. w) /. 2.)
| Top _ | PosCentre _ | Bottom _ | Centre -> 0.
| TopRight _ | BottomRight _ | PosRight _ | Right _ -> (longest_w -. w) /. 2.
| Diagonal -> 0.
| ReverseDiagonal -> 0.
end
(* Lex an integer from the table *)
let extract_num header s =
match Pdfgenlex.lex_string (Hashtbl.find header s) with
[Pdfgenlex.LexInt i] -> Pdf.Integer i
| [Pdfgenlex.LexReal f] -> Pdf.Real f
| _ -> raise (Failure ("extract_num: " ^ s))
let extract_fontbbox header s =
let num = function
Pdfgenlex.LexInt i -> Pdf.Integer i
| Pdfgenlex.LexReal f -> Pdf.Real f
| _ -> raise (Failure "extract_fontbbox")
in
match Pdfgenlex.lex_string (Hashtbl.find header s) with
[a; b; c; d] -> [num a; num b; num c; num d]
| _ -> raise (Failure "extract_fontbbox")
let remove_slash s =
match explode s with
'/'::x -> implode x
| _ -> raise (Failure "remove_slash")
let extract_widths chars_and_widths =
let win_to_name = map (fun (x, y) -> (y, x)) Pdfglyphlist.name_to_win in
map
(fun x ->
try
let name = List.assoc x win_to_name in
let width = List.assoc (remove_slash name) chars_and_widths in
width
with
_ -> 0)
(ilist 0 255)
let make_font embed fontname =
let font = unopt (Pdftext.standard_font_of_name ("/" ^ fontname)) in
let header, width_data, _, chars_and_widths = Pdfstandard14.afm_data font in
let widths = extract_widths (list_of_hashtbl chars_and_widths) in
let flags = Pdfstandard14.flags_of_standard_font font in
let fontbbox = extract_fontbbox header "FontBBox" in
let italicangle = extract_num header "ItalicAngle" in
let ascent = try extract_num header "Ascender" with _ -> Pdf.Integer 0 in
let descent = try extract_num header "Descender" with _ -> Pdf.Integer 0 in
let capheight = try extract_num header "CapHeight" with _ -> Pdf.Integer 0 in
let stemv = Pdfstandard14.stemv_of_standard_font font in
let fontdescriptor =
Pdf.Dictionary
[("/Type", Pdf.Name "/FontDescriptor");
("/FontName", Pdf.Name ("/" ^ fontname));
("/Flags", Pdf.Integer flags);
("/FontBBox", Pdf.Array fontbbox);
("/ItalicAngle", italicangle);
("/Ascent", ascent);
("/Descent", descent);
("/CapHeight", capheight);
("/StemV", Pdf.Integer stemv)]
in
(* With -no-embed-font, we use the standard encoding, and just the
* minimal stuff. Without -no-embed-font, we switch to WinAnsiEncoding,
* and fill out everything except the font file instead *)
if embed then
Pdf.Dictionary
[("/Type", Pdf.Name "/Font");
("/Subtype", Pdf.Name "/Type1");
("/BaseFont", Pdf.Name ("/" ^ fontname));
("/Encoding", Pdf.Name "/WinAnsiEncoding");
("/FirstChar", Pdf.Integer 0);
("/LastChar", Pdf.Integer 255);
("/Widths", Pdf.Array (map (fun x -> Pdf.Integer x) widths));
("/FontDescriptor", fontdescriptor)]
else
Pdf.Dictionary
[("/Type", Pdf.Name "/Font");
("/Subtype", Pdf.Name "/Type1");
("/Encoding", Pdf.Name "/WinAnsiEncoding");
("/BaseFont", Pdf.Name ("/" ^ fontname))]
let extract_page_text only_fontsize pdf _ page =
let text_extractor = ref None in
let right_font_size = ref false in
fold_left ( ^ ) ""
(map
(function
| Pdfops.Op_Tf (fontname, fontsize) ->
right_font_size :=
begin match only_fontsize with
Some x -> x = fontsize
| _ -> false
end;
let fontdict =
match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
| None -> raise (Pdf.PDFError "Missing /Font in text extraction")
| Some d ->
match Pdf.lookup_direct pdf fontname d with
| None -> raise (Pdf.PDFError "Missing font in text extraction")
| Some d -> d
in
text_extractor := Some (Pdftext.text_extractor_of_font pdf fontdict);
""
| Pdfops.Op_Tj text when !text_extractor <> None ->
if not !right_font_size then
""
else
Pdftext.utf8_of_codepoints
(Pdftext.codepoints_of_text (unopt !text_extractor) text)
| Pdfops.Op_TJ (Pdf.Array objs) when !text_extractor <> None ->
if not !right_font_size then
""
else
fold_left ( ^ ) ""
(option_map
(function
| Pdf.String text ->
Some
(Pdftext.utf8_of_codepoints
(Pdftext.codepoints_of_text (unopt !text_extractor) text))
| _ -> None)
objs)
| _ -> "")
(Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content))
(* For each page, extract all the ops with text in them, and concatenate it all together *)
let extract_text extract_text_font_size pdf range =
fold_left (fun x y -> x ^ (if x <> "" && y <> "" then "\n" else "") ^ y) ""
(Cpdfpage.map_pages (extract_page_text extract_text_font_size pdf) pdf range)
let addtext
metrics lines linewidth outline fast colour fontname embed bates batespad fontsize font
underneath position hoffset voffset text pages orientation cropbox opacity
justification filename extract_text_font_size shift pdf
=
let time = Cpdfstrftime.current_time () in
let endpage = Pdfpage.endpage pdf in
let replace_pairs pdf filename bates batespad num page =
[
"%PageDiv2", (fun () -> string_of_int ((num + 1) / 2));
"%Page", (fun () -> string_of_int num);
"%Roman", (fun () -> roman_upper num);
"%roman", (fun () -> roman_lower num);
"%filename", (fun () -> filename);
"%Label", (fun () -> pagelabel pdf num);
"%EndPage", (fun () -> string_of_int endpage);
"%EndLabel", (fun () -> pagelabel pdf endpage);
"%ExtractedText", (fun () -> extract_page_text extract_text_font_size pdf num page);
"%Bates",
(fun () ->
(let numstring = string_of_int (bates + num - 1) in
match batespad with
None -> numstring
| Some w ->
if String.length numstring >= w
then numstring
else implode (many '0' (w - String.length numstring)) ^ numstring))]
in
let shifts = Cpdfcoord.parse_coordinates pdf shift in
let addtext_page num page =
let shift_x, shift_y = List.nth shifts (num - 1) in
let resources', unique_extgstatename =
if opacity < 1.0 then
let dict =
match Pdf.lookup_direct pdf "/ExtGState" page.Pdfpage.resources with
| Some d -> d
| None -> Pdf.Dictionary []
in
let unique_extgstatename = Pdf.unique_key "gs" dict in
let dict' =
Pdf.add_dict_entry dict unique_extgstatename
(Pdf.Dictionary [("/ca", Pdf.Real opacity); ("/CA", Pdf.Real opacity)])
in
Pdf.add_dict_entry page.Pdfpage.resources "/ExtGState" dict', Some unique_extgstatename
else
page.Pdfpage.resources, None
in
let fontdict =
match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
| None -> Pdf.Dictionary []
| Some d -> d
in
let unique_fontname = Pdf.unique_key "F" fontdict in
let ops =
let text = process_text time text (replace_pairs pdf filename bates batespad num page) in
let calc_textwidth text =
match font with
| Some f ->
let rawwidth =
Pdfstandard14.textwidth
false
(if embed then Pdftext.WinAnsiEncoding else Pdftext.StandardEncoding)
f
text
in
(float rawwidth *. fontsize) /. 1000.
| None ->
let font =
match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
| Some fontdict ->
begin match Pdf.lookup_direct pdf fontname fontdict with
| Some font -> font
| None ->
(* For each item in the fontdict, follow its value and find the basename. If it matches, return that font *)
let font = ref None in
iter
(fun (k, v) ->
match Pdf.lookup_direct pdf "/BaseFont" v with
| Some (Pdf.Name n) when n = fontname -> font := Some v
| _ -> ())
(match fontdict with Pdf.Dictionary d -> d | _ -> []);
match !font with Some f -> f | None -> failwith (Printf.sprintf "addtext: font %s not found" fontname)
end
| _ -> failwith "addtext: font not found for width"
in
let rawwidth = width_of_text (Pdftext.read_font pdf font) text in
(rawwidth *. fontsize) /. 1000.
in
let expanded_lines =
map
(function text ->
process_text time text (replace_pairs pdf filename bates batespad num page))
lines
in
let textwidth = calc_textwidth text
and allwidths = map calc_textwidth expanded_lines in
let longest_w = last (sort compare allwidths) in
let joffset = find_justification_offsets longest_w textwidth position justification in
let mediabox =
if cropbox then
match Pdf.lookup_direct pdf "/CropBox" page.Pdfpage.rest with
| Some pdfobject -> Pdf.parse_rectangle (Pdf.direct pdf pdfobject)
| None -> Pdf.parse_rectangle page.Pdfpage.mediabox
else
Pdf.parse_rectangle page.Pdfpage.mediabox
in
let x, y, rotate = Cpdfposition.calculate_position false textwidth mediabox orientation position in
let hoffset, voffset =
if position = Diagonal || position = ReverseDiagonal
then -. (cos ((pi /. 2.) -. rotate) *. voffset), sin ((pi /. 2.) -. rotate) *. voffset
else hoffset, voffset
in
match font with
| Some f ->
ops longest_w metrics (x +. shift_x) (y +. shift_y) rotate (hoffset +. joffset) voffset outline linewidth
unique_fontname unique_extgstatename colour fontsize text
| None ->
ops longest_w metrics (x +. shift_x) (y +. shift_y) rotate (hoffset +. joffset) voffset outline linewidth
fontname None colour fontsize text
in
let newresources =
match font with
| Some _ ->
let newfontdict =
Pdf.add_dict_entry fontdict unique_fontname (make_font embed fontname)
in
Pdf.add_dict_entry resources' "/Font" newfontdict
| None -> page.Pdfpage.resources
in
let page = {page with Pdfpage.resources = newresources} in
if underneath
then Pdfpage.prepend_operators pdf ops ~fast:fast page
else Pdfpage.postpend_operators pdf ops ~fast:fast page
in
if metrics then
(ignore (Cpdfpage.iter_pages (fun a b -> ignore (addtext_page a b)) pdf pages); pdf)
else
Cpdfpage.process_pages (ppstub addtext_page) pdf pages
(* Prev is a list of lists of characters *)
let split_at_newline t =
let rec split_at_newline_inner prev = function
| [] -> rev (map implode (map rev prev))
| '\\'::'\\'::'n'::t -> split_at_newline_inner (('n'::'\\'::'\\'::hd prev)::tl prev) t
| '\\'::'n'::t -> split_at_newline_inner ([]::prev) t
| h::t -> split_at_newline_inner ((h::hd prev)::tl prev) t
in
split_at_newline_inner [[]] (explode t)
let rec unescape_chars prev = function
| [] -> rev prev
| '\\'::('0'..'7' as a)::('0'..'7' as b)::('0'..'7' as c)::t ->
let chr = char_of_int (int_of_string ("0o" ^ implode [a;b;c])) in
unescape_chars (chr::prev) t
| '\\'::'\\'::t -> unescape_chars ('\\'::prev) t
| '\\'::c::t when c <> 'n' -> unescape_chars (c::prev) t
| h::t -> unescape_chars (h::prev) t
let unescape_string s =
implode (unescape_chars [] (explode s))
let
addtexts metrics linewidth outline fast fontname (font : Pdftext.standard_font option) embed bates batespad colour position linespacing
fontsize underneath text pages orientation cropbox opacity justification
midline topline filename extract_text_font_size shift ?(raw=false) pdf
=
if pages = [] then error "addtexts: empty page range" else
(*flprint "addtexts:\n";
iter (Printf.printf "%C ") (explode text);
flprint "\n";
Printf.printf "\nCpdf.addtexts: metrics = %b" metrics;
flprint "\n";*)
(*Printf.printf "linewidth = %f\n" linewidth;
Printf.printf "outline = %b\n" outline;
Printf.printf "fast = %b\n" fast;
Printf.printf "fontname = %s\n" fontname;
Printf.printf "winansi text = %s\n" text;
Printf.printf "position = %s\n" (string_of_position position);
Printf.printf "bates = %i\n" bates;
Printf.printf "linespacing = %f\n" linespacing;
Printf.printf "fontsize = %f\n" fontsize;
Printf.printf "underneath = %b\n" underneath;
Printf.printf "font = %s\n" begin match font with None -> "None" | Some x -> Pdftext.string_of_standard_font x end;
Printf.printf "justification = %s\n"
begin match justification with LeftJustify -> "left" | RightJustify -> "right" | CentreJustify -> "centre" end;
Printf.printf "midline = %b\n" midline;
begin match colour with r, g, b -> Printf.printf "%f, %f, %f\n" r g b end;
Printf.printf "opacity = %f\n" opacity;
flprint "\n";
Printf.printf "relative-to-cropbox = %b" cropbox;
flprint "\n";*)
ops_metrics := [];
let realfontname = ref fontname in
let fontpdfobj =
match font with
| Some f ->
make_font embed (Pdftext.string_of_standard_font f)
| None ->
let firstpage =
List.nth (Pdfpage.pages_of_pagetree pdf) (hd pages - 1)
in
match Pdf.lookup_direct pdf "/Font" firstpage.Pdfpage.resources with
| Some fontdict ->
begin match Pdf.lookup_direct pdf fontname fontdict with
| Some font -> font
| _ ->
(* For each item in the fontdict, follow its value and find the basename. If it matches, return that font *)
let font = ref None in
iter
(fun (k, v) ->
match Pdf.lookup_direct pdf "/BaseFont" v with
| Some (Pdf.Name n) when n = fontname ->
font := Some v; realfontname := k
| _ -> ())
(match fontdict with Pdf.Dictionary d -> d | _ -> []);
match !font with Some f -> f | None -> failwith (Printf.sprintf "addtext: font %s not found" fontname)
end
| _ -> failwith "addtext: font dictionary not present"
in
let text = if raw then text else charcodes_of_utf8 (Pdftext.read_font pdf fontpdfobj) text in
let lines = map unescape_string (split_at_newline text) in
let pdf = ref pdf in
let voffset =
let open Cpdfposition in
match position with
| Bottom _ | BottomLeft _ | BottomRight _ ->
ref (0. -. (linespacing *. fontsize *. (float (length lines) -. 1.)))
| Left _ | Right _ ->
(* Vertically align *)
ref (0. -. (linespacing *. ((fontsize *. (float (length lines) -. 1.)) /. 2.)))
| Diagonal | ReverseDiagonal ->
(* Change so that the whole paragraph sits on the centre... *)
ref (0. -. ((linespacing *. fontsize *. (float (length lines) -. 1.)) /. 2.))
| _ -> ref 0.
in
if midline then
begin match font with
| Some font ->
let baseline_adjustment =
(fontsize *. float (Pdfstandard14.baseline_adjustment font)) /. 1000.
in
ops_baseline_adjustment := baseline_adjustment;
voffset := !voffset +. baseline_adjustment
| _ ->
ops_baseline_adjustment := 0.
end
else
if topline then
begin match font with
| Some font ->
let baseline_adjustment =
(fontsize *. float (Pdfstandard14.baseline_adjustment font) *. 2.0) /. 1000.
in
ops_baseline_adjustment := baseline_adjustment;
voffset := !voffset +. baseline_adjustment
| _ ->
ops_baseline_adjustment := 0.
end
else
ops_baseline_adjustment := 0.;
iter
(fun line ->
let voff, hoff =
if orientation = Cpdfposition.Vertical then 0., -.(!voffset) else !voffset, 0.
in
pdf :=
addtext metrics lines linewidth outline fast colour !realfontname
embed bates batespad fontsize font underneath position hoff voff line
pages orientation cropbox opacity justification filename
extract_text_font_size shift
!pdf;
voffset := !voffset +. (linespacing *. fontsize))
lines;
ops_metrics := rev !ops_metrics;
!pdf
let removetext range pdf =
(* Could fail on nesting, or other marked content inside our marked content.*)
let rec remove_until_last_EMC level = function
| [] -> []
| Pdfops.Op_BMC "/CPDFSTAMP"::more ->
remove_until_last_EMC (level + 1) more
| Pdfops.Op_EMC::more ->
if level = 1
then more
else remove_until_last_EMC (level - 1) more
| _::more ->
remove_until_last_EMC level more
in
let rec remove_stamps prev = function
| [] -> rev prev
| Pdfops.Op_BMC "/CPDFSTAMP"::more ->
let rest = remove_until_last_EMC 1 more in
remove_stamps prev rest
| h::t -> remove_stamps (h::prev) t
in
let removetext_page _ page =
{page with
Pdfpage.content =
let ops = Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content in
[Pdfops.stream_of_ops (remove_stamps [] ops)]}
in
Cpdfpage.process_pages (ppstub removetext_page) pdf range
(* \section{Shift page data} *)
let make_mediabox (xmin, ymin, xmax, ymax) =
@ -1498,72 +923,6 @@ let scale_page_contents ?(fast=false) scale position pdf pnum page =
let scale_contents ?(fast=false) position scale pdf range =
Cpdfpage.process_pages (scale_page_contents ~fast scale position pdf) pdf range
let addrectangle
fast (w, h) colour outline linewidth opacity position relative_to_cropbox
underneath range pdf
=
let addrectangle_page _ page =
let resources', unique_extgstatename =
if opacity < 1.0 then
let dict =
match Pdf.lookup_direct pdf "/ExtGState" page.Pdfpage.resources with
| Some d -> d
| None -> Pdf.Dictionary []
in
let unique_extgstatename = Pdf.unique_key "gs" dict in
let dict' =
Pdf.add_dict_entry dict unique_extgstatename
(Pdf.Dictionary [("/ca", Pdf.Real opacity); ("/CA", Pdf.Real opacity)])
in
Pdf.add_dict_entry page.Pdfpage.resources "/ExtGState" dict', Some unique_extgstatename
else
page.Pdfpage.resources, None
in
let mediabox =
if relative_to_cropbox then
match Pdf.lookup_direct pdf "/CropBox" page.Pdfpage.rest with
| Some pdfobject -> Pdf.parse_rectangle (Pdf.direct pdf pdfobject)
| None -> Pdf.parse_rectangle page.Pdfpage.mediabox
else
Pdf.parse_rectangle page.Pdfpage.mediabox
in
let x, y, _ =
Cpdfposition.calculate_position false w mediabox Cpdfposition.Horizontal position
in
let x, y =
match position with
Cpdfposition.Top _ | Cpdfposition.TopLeft _ | Cpdfposition.TopRight _ -> (x, y -. h)
| Cpdfposition.Centre | Cpdfposition.PosCentre _ -> (x, y -. (h /. 2.))
| _ -> (x, y)
in
let ops =
[
Pdfops.Op_q;
Pdfops.Op_BMC "/CPDFSTAMP";
colour_op colour;
colour_op_stroke colour;
]
@
(if outline then [Pdfops.Op_w linewidth] else [])
@
(match unique_extgstatename with None -> [] | Some n -> [Pdfops.Op_gs n])
@
[
Pdfops.Op_re (x, y, w, h);
(if outline then Pdfops.Op_s else Pdfops.Op_f);
Pdfops.Op_EMC;
Pdfops.Op_Q
]
in
let page = {page with Pdfpage.resources = resources'} in
if underneath
then Pdfpage.prepend_operators pdf ops ~fast:fast page
else Pdfpage.postpend_operators pdf ops ~fast:fast page
in
Cpdfpage.process_pages (ppstub addrectangle_page) pdf range
(* Imposition *)
(* Union two rest dictionaries from the same PDF. *)
@ -1751,7 +1110,7 @@ let add_border linewidth ~fast pdf =
if linewidth = 0. then pdf else
let firstpage = hd (Pdfpage.pages_of_pagetree pdf) in
let _, _, w, h = Pdf.parse_rectangle firstpage.Pdfpage.mediabox in
addrectangle
Cpdfaddtext.addrectangle
fast (w -. linewidth, h -. linewidth) (RGB (0., 0., 0.)) true linewidth 1. (Cpdfposition.BottomLeft (linewidth /. 2.))
false false (ilist 1 (Pdfpage.endpage pdf)) pdf
@ -1885,7 +1244,7 @@ let blacktext_ops colour pdf resources content =
| Pdfops.Op_BT::more ->
incr textlevel;
remove_colourops
(colour_op colour::Pdfops.Op_BT::prev)
(Cpdfaddtext.colour_op colour::Pdfops.Op_BT::prev)
more
| Pdfops.Op_ET::more ->
decr textlevel;
@ -1943,7 +1302,7 @@ let blacklines_ops c pdf resources content =
blacken_strokeops (Pdfops.Op_CS "/DeviceRGB"::prev) t
| (Pdfops.Op_SC _ | Pdfops.Op_SCN _ | Pdfops.Op_SCNName _ | Pdfops.Op_G _
| Pdfops.Op_RG _ | Pdfops.Op_K _)::t ->
blacken_strokeops (colour_op_stroke c::prev) t
blacken_strokeops (Cpdfaddtext.colour_op_stroke c::prev) t
| h::t -> blacken_strokeops (h::prev) t
and operators =
Pdfops.parse_operators pdf resources content
@ -1969,7 +1328,7 @@ let blackfills_ops c pdf resources content =
blacken_fillops (Pdfops.Op_cs "/DeviceRGB"::prev) t
| (Pdfops.Op_sc _ | Pdfops.Op_scn _ | Pdfops.Op_scnName _ | Pdfops.Op_g _
| Pdfops.Op_rg _ | Pdfops.Op_k _)::t ->
blacken_fillops (colour_op c::prev) t
blacken_fillops (Cpdfaddtext.colour_op c::prev) t
| h::t -> blacken_fillops (h::prev) t
and operators =
Pdfops.parse_operators pdf resources content

View File

@ -1,14 +1,8 @@
(** Coherent PDF Tools Core Routines *)
open Pdfutil
type color =
Grey of float
| RGB of float * float * float
| CYMK of float * float * float * float
(** {2 Working with pages} *)
val copy_cropbox_to_mediabox : Pdf.t -> int list -> Pdf.t
(** {2 Stamping} *)
@ -26,68 +20,6 @@ val combine_pages : bool -> Pdf.t -> Pdf.t -> bool -> bool -> bool -> Pdf.t
[combine_pages]. *)
val stamp : bool -> Cpdfposition.position -> bool -> bool -> bool -> bool -> bool -> int list -> Pdf.t -> Pdf.t -> Pdf.t
(** {2 Adding text} *)
(** Justification of multiline text *)
type justification =
| LeftJustify
| CentreJustify
| RightJustify
(** Call [add_texts metrics linewidth outline fast fontname font bates batespad colour
position linespacing fontsize underneath text pages orientation
relative_to_cropbox midline_adjust topline filename pdf]. For details see cpdfmanual.pdf *)
val addtexts :
bool -> (*metrics*)
float -> (*linewidth*)
bool -> (*outline*)
bool -> (*fast*)
string -> (*fontname*)
Pdftext.standard_font option -> (*font*)
bool -> (* embed font *)
int -> (* bates number *)
int option -> (* bates padding width *)
color -> (*colour*)
Cpdfposition.position -> (*position*)
float -> (*linespacing*)
float -> (*fontsize*)
bool -> (*underneath*)
string ->(*text*)
int list ->(*page range*)
Cpdfposition.orientation ->(*orientation*)
bool ->(*relative to cropbox?*)
float ->(*opacity*)
justification ->(*justification*)
bool ->(*midline adjust?*)
bool ->(*topline adjust?*)
string ->(*filename*)
float option -> (*extract_text_font_size*)
string -> (* shift *)
?raw:bool -> (* raw *)
Pdf.t ->(*pdf*)
Pdf.t
val addrectangle :
bool ->
float * float ->
color ->
bool ->
float ->
float ->
Cpdfposition.position ->
bool -> bool -> int list -> Pdf.t -> Pdf.t
val metrics_howmany : unit -> int
val metrics_text : int -> string
val metrics_x : int -> float
val metrics_y : int -> float
val metrics_rot : int -> float
val metrics_baseline_adjustment : unit -> float
(** These functions returns some details about the text if [addtexts] is called with [metrics] true. The integer arguments are 1 for the first one, 2 for the second etc. Call [metrics_howmany] first to find out how many. *)
(** Remove text from the given pages. *)
val removetext : int list -> Pdf.t -> Pdf.t
(** {2 Page geometry} *)
(** True if a given page in a PDF has a given box *)
@ -173,13 +105,13 @@ val twoup : bool -> Pdf.t -> Pdf.t
val thinlines : int list -> float -> Pdf.t -> Pdf.t
(** Make all text on certain pages black. *)
val blacktext : color -> int list -> Pdf.t -> Pdf.t
val blacktext : Cpdfaddtext.color -> int list -> Pdf.t -> Pdf.t
(** Make all lines on certain pages black. *)
val blacklines : color -> int list -> Pdf.t -> Pdf.t
val blacklines : Cpdfaddtext.color -> int list -> Pdf.t -> Pdf.t
(** Make all fills on certain pages black. *)
val blackfills : color -> int list -> Pdf.t -> Pdf.t
val blackfills : Cpdfaddtext.color -> int list -> Pdf.t -> Pdf.t
(** Remove images from a PDF, optionally adding crossed boxes. *)
val draft : string option -> bool -> int list -> Pdf.t -> Pdf.t
@ -190,8 +122,6 @@ val remove_all_text : int list -> Pdf.t -> Pdf.t
val process_xobjects : Pdf.t -> Pdfpage.t -> (Pdf.t -> Pdf.pdfobject -> Pdf.pdfobject list -> Pdf.pdfobject list) -> unit
val extract_text : float option -> Pdf.t -> int list -> string
val append_page_content : string -> bool -> bool -> int list -> Pdf.t -> Pdf.t
val stamp_as_xobject : Pdf.t -> int list -> Pdf.t -> Pdf.t * string

647
cpdfaddtext.ml Normal file
View File

@ -0,0 +1,647 @@
open Pdfutil
open Cpdferror
type color =
Grey of float
| RGB of float * float * float
| CYMK of float * float * float * float
(* For uses of process_pages which don't need to deal with matrices, this
function transforms into one which returns the identity matrix *)
let ppstub f n p = (f n p, n, Pdftransform.i_matrix)
(* \section{Superimpose text, page numbers etc.} *)
(* Process UTF8 text to /WinAnsiEncoding string (for standard 14) or whatever
is in the font (for existing fonts). *)
let charcodes_of_utf8 font s =
let extractor = Pdftext.charcode_extractor_of_font_real ~debug:false font in
let codepoints = Pdftext.codepoints_of_utf8 s in
let charcodes =
option_map
(fun codepoint ->
match extractor codepoint with
| Some cc -> Some cc
| None -> Printf.eprintf "Warning: character not found in font for unicode codepoint 0x%X\n" codepoint; None)
codepoints
in
implode (map char_of_int charcodes)
(* Process codepoints back to UTF8, assuming it came from UTF8 to start with *)
let utf8_of_winansi s =
let text_extractor =
Pdftext.text_extractor_of_font_real
(Pdftext.StandardFont (Pdftext.TimesRoman, Pdftext.WinAnsiEncoding))
in
let codepoints = Pdftext.codepoints_of_text text_extractor s in
Pdftext.utf8_of_codepoints codepoints
(* Get the width of some text in the given font *)
let width_of_text font text =
match font with
| Pdftext.SimpleFont {Pdftext.fontmetrics = Some fontmetrics} ->
begin try
fold_left ( +. ) 0. (map (fun c -> fontmetrics.(int_of_char c)) (explode text))
with
_ -> 0.
end
| _ -> 0.
type ops_metrics =
{metrics_text : string;
metrics_x : float;
metrics_y : float;
metrics_rot : float}
let ops_metrics : ops_metrics list ref = ref []
let ops_baseline_adjustment = ref 0.
let metrics_howmany () = length !ops_metrics
let metrics_text n =
utf8_of_winansi (select n !ops_metrics).metrics_text
let metrics_x n =
(select n !ops_metrics).metrics_x
let metrics_y n =
(select n !ops_metrics).metrics_y
let metrics_rot n =
(select n !ops_metrics).metrics_rot
let metrics_baseline_adjustment () = !ops_baseline_adjustment
let colour_op = function
| RGB (r, g, b) -> Pdfops.Op_rg (r, g, b)
| Grey g -> Pdfops.Op_g g
| CYMK (c, y, m, k) -> Pdfops.Op_k (c, y, m, k)
let colour_op_stroke = function
| RGB (r, g, b) -> Pdfops.Op_RG (r, g, b)
| Grey g -> Pdfops.Op_G g
| CYMK (c, y, m, k) -> Pdfops.Op_K (c, y, m, k)
let ops longest_w metrics x y rotate hoffset voffset outline linewidth unique_fontname unique_extgstatename colour fontsize text =
if metrics then
ops_metrics :=
{metrics_text = text; metrics_x = x -. hoffset; metrics_y = y -. voffset; metrics_rot = rotate}
::!ops_metrics;
[Pdfops.Op_q;
Pdfops.Op_BMC "/CPDFSTAMP";
Pdfops.Op_cm
(Pdftransform.matrix_of_transform
[Pdftransform.Translate (x -. hoffset, y -. voffset);
Pdftransform.Rotate ((0., 0.), rotate)]);
Pdfops.Op_BT;
] @
(if outline then [Pdfops.Op_w linewidth; Pdfops.Op_Tr 1] else [Pdfops.Op_Tr 0]) @
[colour_op colour; colour_op_stroke colour]
@
(match unique_extgstatename with None -> [] | Some n -> [Pdfops.Op_gs n])
@
[Pdfops.Op_Tf (unique_fontname, fontsize);
Pdfops.Op_Tj text;
Pdfops.Op_ET;
Pdfops.Op_EMC;
Pdfops.Op_Q]
type justification = LeftJustify | CentreJustify | RightJustify
(* Find the h-offset for justification based on the longest width, the current
width, the justification and the position. *)
let find_justification_offsets longest_w w position j =
let open Cpdfposition in
match j with
| LeftJustify ->
begin match position with
| TopLeft _ | Left _ | PosLeft _ | BottomLeft _ -> 0.
| Top _ | PosCentre _ | Bottom _ | Centre -> (longest_w -. w) /. 2.
| TopRight _ | BottomRight _ | PosRight _ | Right _ -> longest_w -. w
| Diagonal -> 0.
| ReverseDiagonal -> 0.
end
| RightJustify ->
begin match position with
| TopLeft _ | Left _ | PosLeft _ | BottomLeft _ -> ~-.(longest_w -. w)
| Top _ | PosCentre _ | Bottom _ | Centre -> ~-.((longest_w -. w) /. 2.)
| TopRight _ | BottomRight _ | PosRight _ | Right _ -> 0.
| Diagonal -> 0.
| ReverseDiagonal -> 0.
end
| CentreJustify ->
begin match position with
| TopLeft _ | Left _ | PosLeft _ | BottomLeft _ -> ~-.((longest_w -. w) /. 2.)
| Top _ | PosCentre _ | Bottom _ | Centre -> 0.
| TopRight _ | BottomRight _ | PosRight _ | Right _ -> (longest_w -. w) /. 2.
| Diagonal -> 0.
| ReverseDiagonal -> 0.
end
(* Lex an integer from the table *)
let extract_num header s =
match Pdfgenlex.lex_string (Hashtbl.find header s) with
[Pdfgenlex.LexInt i] -> Pdf.Integer i
| [Pdfgenlex.LexReal f] -> Pdf.Real f
| _ -> raise (Failure ("extract_num: " ^ s))
let extract_fontbbox header s =
let num = function
Pdfgenlex.LexInt i -> Pdf.Integer i
| Pdfgenlex.LexReal f -> Pdf.Real f
| _ -> raise (Failure "extract_fontbbox")
in
match Pdfgenlex.lex_string (Hashtbl.find header s) with
[a; b; c; d] -> [num a; num b; num c; num d]
| _ -> raise (Failure "extract_fontbbox")
let remove_slash s =
match explode s with
'/'::x -> implode x
| _ -> raise (Failure "remove_slash")
let extract_widths chars_and_widths =
let win_to_name = map (fun (x, y) -> (y, x)) Pdfglyphlist.name_to_win in
map
(fun x ->
try
let name = List.assoc x win_to_name in
let width = List.assoc (remove_slash name) chars_and_widths in
width
with
_ -> 0)
(ilist 0 255)
let make_font embed fontname =
let font = unopt (Pdftext.standard_font_of_name ("/" ^ fontname)) in
let header, width_data, _, chars_and_widths = Pdfstandard14.afm_data font in
let widths = extract_widths (list_of_hashtbl chars_and_widths) in
let flags = Pdfstandard14.flags_of_standard_font font in
let fontbbox = extract_fontbbox header "FontBBox" in
let italicangle = extract_num header "ItalicAngle" in
let ascent = try extract_num header "Ascender" with _ -> Pdf.Integer 0 in
let descent = try extract_num header "Descender" with _ -> Pdf.Integer 0 in
let capheight = try extract_num header "CapHeight" with _ -> Pdf.Integer 0 in
let stemv = Pdfstandard14.stemv_of_standard_font font in
let fontdescriptor =
Pdf.Dictionary
[("/Type", Pdf.Name "/FontDescriptor");
("/FontName", Pdf.Name ("/" ^ fontname));
("/Flags", Pdf.Integer flags);
("/FontBBox", Pdf.Array fontbbox);
("/ItalicAngle", italicangle);
("/Ascent", ascent);
("/Descent", descent);
("/CapHeight", capheight);
("/StemV", Pdf.Integer stemv)]
in
(* With -no-embed-font, we use the standard encoding, and just the
* minimal stuff. Without -no-embed-font, we switch to WinAnsiEncoding,
* and fill out everything except the font file instead *)
if embed then
Pdf.Dictionary
[("/Type", Pdf.Name "/Font");
("/Subtype", Pdf.Name "/Type1");
("/BaseFont", Pdf.Name ("/" ^ fontname));
("/Encoding", Pdf.Name "/WinAnsiEncoding");
("/FirstChar", Pdf.Integer 0);
("/LastChar", Pdf.Integer 255);
("/Widths", Pdf.Array (map (fun x -> Pdf.Integer x) widths));
("/FontDescriptor", fontdescriptor)]
else
Pdf.Dictionary
[("/Type", Pdf.Name "/Font");
("/Subtype", Pdf.Name "/Type1");
("/Encoding", Pdf.Name "/WinAnsiEncoding");
("/BaseFont", Pdf.Name ("/" ^ fontname))]
let extract_page_text only_fontsize pdf _ page =
let text_extractor = ref None in
let right_font_size = ref false in
fold_left ( ^ ) ""
(map
(function
| Pdfops.Op_Tf (fontname, fontsize) ->
right_font_size :=
begin match only_fontsize with
Some x -> x = fontsize
| _ -> false
end;
let fontdict =
match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
| None -> raise (Pdf.PDFError "Missing /Font in text extraction")
| Some d ->
match Pdf.lookup_direct pdf fontname d with
| None -> raise (Pdf.PDFError "Missing font in text extraction")
| Some d -> d
in
text_extractor := Some (Pdftext.text_extractor_of_font pdf fontdict);
""
| Pdfops.Op_Tj text when !text_extractor <> None ->
if not !right_font_size then
""
else
Pdftext.utf8_of_codepoints
(Pdftext.codepoints_of_text (unopt !text_extractor) text)
| Pdfops.Op_TJ (Pdf.Array objs) when !text_extractor <> None ->
if not !right_font_size then
""
else
fold_left ( ^ ) ""
(option_map
(function
| Pdf.String text ->
Some
(Pdftext.utf8_of_codepoints
(Pdftext.codepoints_of_text (unopt !text_extractor) text))
| _ -> None)
objs)
| _ -> "")
(Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content))
(* For each page, extract all the ops with text in them, and concatenate it all together *)
let extract_text extract_text_font_size pdf range =
fold_left (fun x y -> x ^ (if x <> "" && y <> "" then "\n" else "") ^ y) ""
(Cpdfpage.map_pages (extract_page_text extract_text_font_size pdf) pdf range)
let rec process_text time text m =
match m with
| [] -> Cpdfstrftime.strftime ~time text
| (s, r)::t -> process_text time (string_replace_all_lazy s r text) t
(* Return page label at pdf page num, or page number in arabic if no label *)
let pagelabel pdf num =
Pdfpagelabels.pagelabeltext_of_pagenumber
num
(Pdfpagelabels.complete (Pdfpagelabels.read pdf))
let addtext
metrics lines linewidth outline fast colour fontname embed bates batespad fontsize font
underneath position hoffset voffset text pages orientation cropbox opacity
justification filename extract_text_font_size shift pdf
=
let time = Cpdfstrftime.current_time () in
let endpage = Pdfpage.endpage pdf in
let replace_pairs pdf filename bates batespad num page =
[
"%PageDiv2", (fun () -> string_of_int ((num + 1) / 2));
"%Page", (fun () -> string_of_int num);
"%Roman", (fun () -> roman_upper num);
"%roman", (fun () -> roman_lower num);
"%filename", (fun () -> filename);
"%Label", (fun () -> pagelabel pdf num);
"%EndPage", (fun () -> string_of_int endpage);
"%EndLabel", (fun () -> pagelabel pdf endpage);
"%ExtractedText", (fun () -> extract_page_text extract_text_font_size pdf num page);
"%Bates",
(fun () ->
(let numstring = string_of_int (bates + num - 1) in
match batespad with
None -> numstring
| Some w ->
if String.length numstring >= w
then numstring
else implode (many '0' (w - String.length numstring)) ^ numstring))]
in
let shifts = Cpdfcoord.parse_coordinates pdf shift in
let addtext_page num page =
let shift_x, shift_y = List.nth shifts (num - 1) in
let resources', unique_extgstatename =
if opacity < 1.0 then
let dict =
match Pdf.lookup_direct pdf "/ExtGState" page.Pdfpage.resources with
| Some d -> d
| None -> Pdf.Dictionary []
in
let unique_extgstatename = Pdf.unique_key "gs" dict in
let dict' =
Pdf.add_dict_entry dict unique_extgstatename
(Pdf.Dictionary [("/ca", Pdf.Real opacity); ("/CA", Pdf.Real opacity)])
in
Pdf.add_dict_entry page.Pdfpage.resources "/ExtGState" dict', Some unique_extgstatename
else
page.Pdfpage.resources, None
in
let fontdict =
match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
| None -> Pdf.Dictionary []
| Some d -> d
in
let unique_fontname = Pdf.unique_key "F" fontdict in
let ops =
let text = process_text time text (replace_pairs pdf filename bates batespad num page) in
let calc_textwidth text =
match font with
| Some f ->
let rawwidth =
Pdfstandard14.textwidth
false
(if embed then Pdftext.WinAnsiEncoding else Pdftext.StandardEncoding)
f
text
in
(float rawwidth *. fontsize) /. 1000.
| None ->
let font =
match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
| Some fontdict ->
begin match Pdf.lookup_direct pdf fontname fontdict with
| Some font -> font
| None ->
(* For each item in the fontdict, follow its value and find the basename. If it matches, return that font *)
let font = ref None in
iter
(fun (k, v) ->
match Pdf.lookup_direct pdf "/BaseFont" v with
| Some (Pdf.Name n) when n = fontname -> font := Some v
| _ -> ())
(match fontdict with Pdf.Dictionary d -> d | _ -> []);
match !font with Some f -> f | None -> failwith (Printf.sprintf "addtext: font %s not found" fontname)
end
| _ -> failwith "addtext: font not found for width"
in
let rawwidth = width_of_text (Pdftext.read_font pdf font) text in
(rawwidth *. fontsize) /. 1000.
in
let expanded_lines =
map
(function text ->
process_text time text (replace_pairs pdf filename bates batespad num page))
lines
in
let textwidth = calc_textwidth text
and allwidths = map calc_textwidth expanded_lines in
let longest_w = last (sort compare allwidths) in
let joffset = find_justification_offsets longest_w textwidth position justification in
let mediabox =
if cropbox then
match Pdf.lookup_direct pdf "/CropBox" page.Pdfpage.rest with
| Some pdfobject -> Pdf.parse_rectangle (Pdf.direct pdf pdfobject)
| None -> Pdf.parse_rectangle page.Pdfpage.mediabox
else
Pdf.parse_rectangle page.Pdfpage.mediabox
in
let x, y, rotate = Cpdfposition.calculate_position false textwidth mediabox orientation position in
let hoffset, voffset =
if position = Diagonal || position = ReverseDiagonal
then -. (cos ((pi /. 2.) -. rotate) *. voffset), sin ((pi /. 2.) -. rotate) *. voffset
else hoffset, voffset
in
match font with
| Some f ->
ops longest_w metrics (x +. shift_x) (y +. shift_y) rotate (hoffset +. joffset) voffset outline linewidth
unique_fontname unique_extgstatename colour fontsize text
| None ->
ops longest_w metrics (x +. shift_x) (y +. shift_y) rotate (hoffset +. joffset) voffset outline linewidth
fontname None colour fontsize text
in
let newresources =
match font with
| Some _ ->
let newfontdict =
Pdf.add_dict_entry fontdict unique_fontname (make_font embed fontname)
in
Pdf.add_dict_entry resources' "/Font" newfontdict
| None -> page.Pdfpage.resources
in
let page = {page with Pdfpage.resources = newresources} in
if underneath
then Pdfpage.prepend_operators pdf ops ~fast:fast page
else Pdfpage.postpend_operators pdf ops ~fast:fast page
in
if metrics then
(ignore (Cpdfpage.iter_pages (fun a b -> ignore (addtext_page a b)) pdf pages); pdf)
else
Cpdfpage.process_pages (ppstub addtext_page) pdf pages
(* Prev is a list of lists of characters *)
let split_at_newline t =
let rec split_at_newline_inner prev = function
| [] -> rev (map implode (map rev prev))
| '\\'::'\\'::'n'::t -> split_at_newline_inner (('n'::'\\'::'\\'::hd prev)::tl prev) t
| '\\'::'n'::t -> split_at_newline_inner ([]::prev) t
| h::t -> split_at_newline_inner ((h::hd prev)::tl prev) t
in
split_at_newline_inner [[]] (explode t)
let rec unescape_chars prev = function
| [] -> rev prev
| '\\'::('0'..'7' as a)::('0'..'7' as b)::('0'..'7' as c)::t ->
let chr = char_of_int (int_of_string ("0o" ^ implode [a;b;c])) in
unescape_chars (chr::prev) t
| '\\'::'\\'::t -> unescape_chars ('\\'::prev) t
| '\\'::c::t when c <> 'n' -> unescape_chars (c::prev) t
| h::t -> unescape_chars (h::prev) t
let unescape_string s =
implode (unescape_chars [] (explode s))
let
addtexts metrics linewidth outline fast fontname (font : Pdftext.standard_font option) embed bates batespad colour position linespacing
fontsize underneath text pages orientation cropbox opacity justification
midline topline filename extract_text_font_size shift ?(raw=false) pdf
=
if pages = [] then error "addtexts: empty page range" else
(*flprint "addtexts:\n";
iter (Printf.printf "%C ") (explode text);
flprint "\n";
Printf.printf "\nCpdf.addtexts: metrics = %b" metrics;
flprint "\n";*)
(*Printf.printf "linewidth = %f\n" linewidth;
Printf.printf "outline = %b\n" outline;
Printf.printf "fast = %b\n" fast;
Printf.printf "fontname = %s\n" fontname;
Printf.printf "winansi text = %s\n" text;
Printf.printf "position = %s\n" (string_of_position position);
Printf.printf "bates = %i\n" bates;
Printf.printf "linespacing = %f\n" linespacing;
Printf.printf "fontsize = %f\n" fontsize;
Printf.printf "underneath = %b\n" underneath;
Printf.printf "font = %s\n" begin match font with None -> "None" | Some x -> Pdftext.string_of_standard_font x end;
Printf.printf "justification = %s\n"
begin match justification with LeftJustify -> "left" | RightJustify -> "right" | CentreJustify -> "centre" end;
Printf.printf "midline = %b\n" midline;
begin match colour with r, g, b -> Printf.printf "%f, %f, %f\n" r g b end;
Printf.printf "opacity = %f\n" opacity;
flprint "\n";
Printf.printf "relative-to-cropbox = %b" cropbox;
flprint "\n";*)
ops_metrics := [];
let realfontname = ref fontname in
let fontpdfobj =
match font with
| Some f ->
make_font embed (Pdftext.string_of_standard_font f)
| None ->
let firstpage =
List.nth (Pdfpage.pages_of_pagetree pdf) (hd pages - 1)
in
match Pdf.lookup_direct pdf "/Font" firstpage.Pdfpage.resources with
| Some fontdict ->
begin match Pdf.lookup_direct pdf fontname fontdict with
| Some font -> font
| _ ->
(* For each item in the fontdict, follow its value and find the basename. If it matches, return that font *)
let font = ref None in
iter
(fun (k, v) ->
match Pdf.lookup_direct pdf "/BaseFont" v with
| Some (Pdf.Name n) when n = fontname ->
font := Some v; realfontname := k
| _ -> ())
(match fontdict with Pdf.Dictionary d -> d | _ -> []);
match !font with Some f -> f | None -> failwith (Printf.sprintf "addtext: font %s not found" fontname)
end
| _ -> failwith "addtext: font dictionary not present"
in
let text = if raw then text else charcodes_of_utf8 (Pdftext.read_font pdf fontpdfobj) text in
let lines = map unescape_string (split_at_newline text) in
let pdf = ref pdf in
let voffset =
let open Cpdfposition in
match position with
| Bottom _ | BottomLeft _ | BottomRight _ ->
ref (0. -. (linespacing *. fontsize *. (float (length lines) -. 1.)))
| Left _ | Right _ ->
(* Vertically align *)
ref (0. -. (linespacing *. ((fontsize *. (float (length lines) -. 1.)) /. 2.)))
| Diagonal | ReverseDiagonal ->
(* Change so that the whole paragraph sits on the centre... *)
ref (0. -. ((linespacing *. fontsize *. (float (length lines) -. 1.)) /. 2.))
| _ -> ref 0.
in
if midline then
begin match font with
| Some font ->
let baseline_adjustment =
(fontsize *. float (Pdfstandard14.baseline_adjustment font)) /. 1000.
in
ops_baseline_adjustment := baseline_adjustment;
voffset := !voffset +. baseline_adjustment
| _ ->
ops_baseline_adjustment := 0.
end
else
if topline then
begin match font with
| Some font ->
let baseline_adjustment =
(fontsize *. float (Pdfstandard14.baseline_adjustment font) *. 2.0) /. 1000.
in
ops_baseline_adjustment := baseline_adjustment;
voffset := !voffset +. baseline_adjustment
| _ ->
ops_baseline_adjustment := 0.
end
else
ops_baseline_adjustment := 0.;
iter
(fun line ->
let voff, hoff =
if orientation = Cpdfposition.Vertical then 0., -.(!voffset) else !voffset, 0.
in
pdf :=
addtext metrics lines linewidth outline fast colour !realfontname
embed bates batespad fontsize font underneath position hoff voff line
pages orientation cropbox opacity justification filename
extract_text_font_size shift
!pdf;
voffset := !voffset +. (linespacing *. fontsize))
lines;
ops_metrics := rev !ops_metrics;
!pdf
let removetext range pdf =
(* Could fail on nesting, or other marked content inside our marked content.*)
let rec remove_until_last_EMC level = function
| [] -> []
| Pdfops.Op_BMC "/CPDFSTAMP"::more ->
remove_until_last_EMC (level + 1) more
| Pdfops.Op_EMC::more ->
if level = 1
then more
else remove_until_last_EMC (level - 1) more
| _::more ->
remove_until_last_EMC level more
in
let rec remove_stamps prev = function
| [] -> rev prev
| Pdfops.Op_BMC "/CPDFSTAMP"::more ->
let rest = remove_until_last_EMC 1 more in
remove_stamps prev rest
| h::t -> remove_stamps (h::prev) t
in
let removetext_page _ page =
{page with
Pdfpage.content =
let ops = Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content in
[Pdfops.stream_of_ops (remove_stamps [] ops)]}
in
Cpdfpage.process_pages (ppstub removetext_page) pdf range
let addrectangle
fast (w, h) colour outline linewidth opacity position relative_to_cropbox
underneath range pdf
=
let addrectangle_page _ page =
let resources', unique_extgstatename =
if opacity < 1.0 then
let dict =
match Pdf.lookup_direct pdf "/ExtGState" page.Pdfpage.resources with
| Some d -> d
| None -> Pdf.Dictionary []
in
let unique_extgstatename = Pdf.unique_key "gs" dict in
let dict' =
Pdf.add_dict_entry dict unique_extgstatename
(Pdf.Dictionary [("/ca", Pdf.Real opacity); ("/CA", Pdf.Real opacity)])
in
Pdf.add_dict_entry page.Pdfpage.resources "/ExtGState" dict', Some unique_extgstatename
else
page.Pdfpage.resources, None
in
let mediabox =
if relative_to_cropbox then
match Pdf.lookup_direct pdf "/CropBox" page.Pdfpage.rest with
| Some pdfobject -> Pdf.parse_rectangle (Pdf.direct pdf pdfobject)
| None -> Pdf.parse_rectangle page.Pdfpage.mediabox
else
Pdf.parse_rectangle page.Pdfpage.mediabox
in
let x, y, _ =
Cpdfposition.calculate_position false w mediabox Cpdfposition.Horizontal position
in
let x, y =
match position with
Cpdfposition.Top _ | Cpdfposition.TopLeft _ | Cpdfposition.TopRight _ -> (x, y -. h)
| Cpdfposition.Centre | Cpdfposition.PosCentre _ -> (x, y -. (h /. 2.))
| _ -> (x, y)
in
let ops =
[
Pdfops.Op_q;
Pdfops.Op_BMC "/CPDFSTAMP";
colour_op colour;
colour_op_stroke colour;
]
@
(if outline then [Pdfops.Op_w linewidth] else [])
@
(match unique_extgstatename with None -> [] | Some n -> [Pdfops.Op_gs n])
@
[
Pdfops.Op_re (x, y, w, h);
(if outline then Pdfops.Op_s else Pdfops.Op_f);
Pdfops.Op_EMC;
Pdfops.Op_Q
]
in
let page = {page with Pdfpage.resources = resources'} in
if underneath
then Pdfpage.prepend_operators pdf ops ~fast:fast page
else Pdfpage.postpend_operators pdf ops ~fast:fast page
in
Cpdfpage.process_pages (ppstub addrectangle_page) pdf range

72
cpdfaddtext.mli Normal file
View File

@ -0,0 +1,72 @@
(** {2 Adding text} *)
type color =
Grey of float
| RGB of float * float * float
| CYMK of float * float * float * float
val colour_op : color -> Pdfops.t
val colour_op_stroke : color -> Pdfops.t
(** Justification of multiline text *)
type justification =
| LeftJustify
| CentreJustify
| RightJustify
(** Call [add_texts metrics linewidth outline fast fontname font bates batespad colour
position linespacing fontsize underneath text pages orientation
relative_to_cropbox midline_adjust topline filename pdf]. For details see cpdfmanual.pdf *)
val addtexts :
bool -> (*metrics*)
float -> (*linewidth*)
bool -> (*outline*)
bool -> (*fast*)
string -> (*fontname*)
Pdftext.standard_font option -> (*font*)
bool -> (* embed font *)
int -> (* bates number *)
int option -> (* bates padding width *)
color -> (*colour*)
Cpdfposition.position -> (*position*)
float -> (*linespacing*)
float -> (*fontsize*)
bool -> (*underneath*)
string ->(*text*)
int list ->(*page range*)
Cpdfposition.orientation ->(*orientation*)
bool ->(*relative to cropbox?*)
float ->(*opacity*)
justification ->(*justification*)
bool ->(*midline adjust?*)
bool ->(*topline adjust?*)
string ->(*filename*)
float option -> (*extract_text_font_size*)
string -> (* shift *)
?raw:bool -> (* raw *)
Pdf.t ->(*pdf*)
Pdf.t
val addrectangle :
bool ->
float * float ->
color ->
bool ->
float ->
float ->
Cpdfposition.position ->
bool -> bool -> int list -> Pdf.t -> Pdf.t
val metrics_howmany : unit -> int
val metrics_text : int -> string
val metrics_x : int -> float
val metrics_y : int -> float
val metrics_rot : int -> float
val metrics_baseline_adjustment : unit -> float
(** These functions returns some details about the text if [addtexts] is called with [metrics] true. The integer arguments are 1 for the first one, 2 for the second etc. Call [metrics_howmany] first to find out how many. *)
(** Remove text from the given pages. *)
val removetext : int list -> Pdf.t -> Pdf.t
val extract_text : float option -> Pdf.t -> int list -> string

View File

@ -382,14 +382,14 @@ type args =
mutable font : font;
mutable fontname : string;
mutable fontsize : float;
mutable color : Cpdf.color;
mutable color : Cpdfaddtext.color;
mutable opacity : float;
mutable position : Cpdfposition.position;
mutable underneath : bool;
mutable linespacing : float;
mutable midline : bool;
mutable topline : bool;
mutable justification : Cpdf.justification;
mutable justification : Cpdfaddtext.justification;
mutable bates : int;
mutable batespad : int option;
mutable prerotate : bool;
@ -501,14 +501,14 @@ let args =
font = StandardFont Pdftext.TimesRoman;
fontname = "Times-Roman";
fontsize = 12.;
color = Cpdf.RGB (0., 0., 0.);
color = Cpdfaddtext.RGB (0., 0., 0.);
opacity = 1.;
position = Cpdfposition.TopLeft 100.;
underneath = false;
linespacing = 1.;
midline = false;
topline = false;
justification = Cpdf.LeftJustify;
justification = Cpdfaddtext.LeftJustify;
bates = 0;
batespad = None;
prerotate = false;
@ -620,14 +620,14 @@ let reset_arguments () =
args.font <- StandardFont Pdftext.TimesRoman;
args.fontname <- "Times-Roman";
args.fontsize <- 12.;
args.color <- Cpdf.RGB (0., 0., 0.);
args.color <- Cpdfaddtext.RGB (0., 0., 0.);
args.opacity <- 1.;
args.position <- Cpdfposition.TopLeft 100.;
args.underneath <- false;
args.linespacing <- 1.;
args.midline <- false;
args.topline <- false;
args.justification <- Cpdf.LeftJustify;
args.justification <- Cpdfaddtext.LeftJustify;
args.bates <- 0;
args.batespad <- None;
args.prerotate <- false;
@ -1106,11 +1106,11 @@ let setaddtext s =
let parse_color s =
match String.lowercase s with
| "white" -> Cpdf.RGB (1., 1., 1.)
| "black" -> Cpdf.RGB (0., 0., 0.)
| "red" -> Cpdf.RGB (1., 0., 0.)
| "green" -> Cpdf.RGB (0., 1., 0.)
| "blue" -> Cpdf.RGB (0., 0., 1.)
| "white" -> Cpdfaddtext.RGB (1., 1., 1.)
| "black" -> Cpdfaddtext.RGB (0., 0., 0.)
| "red" -> Cpdfaddtext.RGB (1., 0., 0.)
| "green" -> Cpdfaddtext.RGB (0., 1., 0.)
| "blue" -> Cpdfaddtext.RGB (0., 0., 1.)
| _ ->
let getnum = function
| Pdfgenlex.LexInt i -> float i
@ -1118,9 +1118,9 @@ let parse_color s =
| _ -> error "Bad color"
in
match Pdfgenlex.lex_string s with
| [g] -> Cpdf.Grey (getnum g)
| [r;g;b] -> Cpdf.RGB (getnum r, getnum g, getnum b)
| [c; y; m; k] -> Cpdf.CYMK (getnum c, getnum y, getnum m, getnum k)
| [g] -> Cpdfaddtext.Grey (getnum g)
| [r;g;b] -> Cpdfaddtext.RGB (getnum r, getnum g, getnum b)
| [c; y; m; k] -> Cpdfaddtext.CYMK (getnum c, getnum y, getnum m, getnum k)
| _ -> error "Bad color"
let setcolor s =
@ -1173,47 +1173,47 @@ let setposright s =
let settop n =
args.position <- Cpdfposition.Top (Cpdfcoord.parse_single_number empty n);
args.justification <- Cpdf.CentreJustify
args.justification <- Cpdfaddtext.CentreJustify
let settopleft n =
args.position <- Cpdfposition.TopLeft (Cpdfcoord.parse_single_number empty n);
args.justification <- Cpdf.LeftJustify
args.justification <- Cpdfaddtext.LeftJustify
let settopright n =
args.position <- Cpdfposition.TopRight (Cpdfcoord.parse_single_number empty n);
args.justification <- Cpdf.RightJustify
args.justification <- Cpdfaddtext.RightJustify
let setleft n =
args.position <- Cpdfposition.Left (Cpdfcoord.parse_single_number empty n);
args.justification <- Cpdf.LeftJustify
args.justification <- Cpdfaddtext.LeftJustify
let setbottomleft n =
args.position <- Cpdfposition.BottomLeft (Cpdfcoord.parse_single_number empty n);
args.justification <- Cpdf.LeftJustify
args.justification <- Cpdfaddtext.LeftJustify
let setbottom n =
args.position <- Cpdfposition.Bottom (Cpdfcoord.parse_single_number empty n);
args.justification <- Cpdf.CentreJustify
args.justification <- Cpdfaddtext.CentreJustify
let setbottomright n =
args.position <- Cpdfposition.BottomRight (Cpdfcoord.parse_single_number empty n);
args.justification <- Cpdf.RightJustify
args.justification <- Cpdfaddtext.RightJustify
let setright n =
args.position <- Cpdfposition.Right (Cpdfcoord.parse_single_number empty n);
args.justification <- Cpdf.RightJustify
args.justification <- Cpdfaddtext.RightJustify
let setdiagonal n =
args.position <- Cpdfposition.Diagonal;
args.justification <- Cpdf.CentreJustify
args.justification <- Cpdfaddtext.CentreJustify
let setreversediagonal n =
args.position <- Cpdfposition.ReverseDiagonal;
args.justification <- Cpdf.CentreJustify
args.justification <- Cpdfaddtext.CentreJustify
let setcenter n =
args.position <- Cpdfposition.Centre;
args.justification <- Cpdf.CentreJustify
args.justification <- Cpdfaddtext.CentreJustify
let setbatespad n =
args.batespad <- Some n
@ -1489,13 +1489,13 @@ let setscalestamptofit () =
args.scale_stamp_to_fit <- true
let setjustifyleft () =
args.justification <- Cpdf.LeftJustify
args.justification <- Cpdfaddtext.LeftJustify
let setjustifyright () =
args.justification <- Cpdf.RightJustify
args.justification <- Cpdfaddtext.RightJustify
let setjustifycenter () =
args.justification <- Cpdf.CentreJustify
args.justification <- Cpdfaddtext.CentreJustify
let setremoveduplicatestreams () =
args.remove_duplicate_streams <- true
@ -3666,7 +3666,7 @@ let go () =
| _ -> ""
in
write_pdf false
(Cpdf.addtexts
(Cpdfaddtext.addtexts
false args.linewidth args.outline args.fast args.fontname
font args.embedfonts args.bates args.batespad args.color args.position
args.linespacing args.fontsize args.underneath text range
@ -3676,12 +3676,12 @@ let go () =
| Some RemoveText ->
let pdf = get_single_pdf args.op false in
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
write_pdf false (Cpdf.removetext range pdf)
write_pdf false (Cpdfaddtext.removetext range pdf)
| Some AddRectangle ->
let pdf = get_single_pdf args.op false in
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
write_pdf false
(Cpdf.addrectangle
(Cpdfaddtext.addrectangle
args.fast (Cpdfcoord.parse_coordinate pdf args.coord)
args.color args.outline args.linewidth args.opacity args.position
args.relative_to_cropbox args.underneath range pdf)
@ -3792,7 +3792,7 @@ let go () =
| Some ExtractText ->
let pdf = get_single_pdf args.op true in
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
let text = Cpdf.extract_text args.extract_text_font_size pdf range in
let text = Cpdfaddtext.extract_text args.extract_text_font_size pdf range in
begin match args.out with
| File filename ->
let fh = open_out_bin filename in