From 629718177521c4c6e2634afc32dc0a2f12997816 Mon Sep 17 00:00:00 2001 From: John Whitington Date: Tue, 21 Dec 2021 14:00:58 +0000 Subject: [PATCH] more --- Makefile | 2 +- cpdf.ml | 649 +----------------------------------------------- cpdf.mli | 76 +----- cpdfaddtext.ml | 647 +++++++++++++++++++++++++++++++++++++++++++++++ cpdfaddtext.mli | 72 ++++++ cpdfcommand.ml | 64 ++--- 6 files changed, 759 insertions(+), 751 deletions(-) create mode 100644 cpdfaddtext.ml create mode 100644 cpdfaddtext.mli diff --git a/Makefile b/Makefile index 5d078f2..ab252fd 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Build the cpdf command line tools and top level MODS = cpdfyojson cpdfxmlm \ cpdfunicodedata cpdferror cpdfdebug cpdfjson cpdfstrftime cpdfcoord cpdfattach \ - cpdfpagespec cpdfposition cpdfpresent cpdfmetadata cpdfbookmarks cpdfpage cpdf cpdffont cpdftype \ + cpdfpagespec cpdfposition cpdfpresent cpdfmetadata cpdfbookmarks cpdfpage cpdfaddtext cpdf cpdffont cpdftype \ cpdftexttopdf cpdftoc cpdfpad cpdfocg cpdfsqueeze cpdfspot cpdfpagelabels cpdfcreate cpdfannot cpdfcommand SOURCES = $(foreach x,$(MODS),$(x).ml $(x).mli) cpdfcommandrun.ml diff --git a/cpdf.ml b/cpdf.ml index bb58056..f87d720 100644 --- a/cpdf.ml +++ b/cpdf.ml @@ -3,27 +3,10 @@ open Pdfutil open Pdfio open Cpdferror -type color = - Grey of float -| RGB of float * float * float -| CYMK of float * float * float * float - -(* Return page label at pdf page num, or page number in arabic if no label *) -let pagelabel pdf num = - Pdfpagelabels.pagelabeltext_of_pagenumber - num - (Pdfpagelabels.complete (Pdfpagelabels.read pdf)) - -let rec process_text time text m = - match m with - | [] -> Cpdfstrftime.strftime ~time text - | (s, r)::t -> process_text time (string_replace_all_lazy s r text) t - (* For uses of process_pages which don't need to deal with matrices, this function transforms into one which returns the identity matrix *) let ppstub f n p = (f n p, n, Pdftransform.i_matrix) - (* Add stack operators to a content stream to ensure it is composeable. On -fast, we don't check for Q deficit, assuming PDF is ISO. *) let protect fast pdf resources content = @@ -99,564 +82,6 @@ let hasbox pdf page boxname = | Some _ -> true | _ -> false -(* \section{Superimpose text, page numbers etc.} *) - -(* Process UTF8 text to /WinAnsiEncoding string (for standard 14) or whatever - is in the font (for existing fonts). *) -let charcodes_of_utf8 font s = - let extractor = Pdftext.charcode_extractor_of_font_real ~debug:false font in - let codepoints = Pdftext.codepoints_of_utf8 s in - let charcodes = - option_map - (fun codepoint -> - match extractor codepoint with - | Some cc -> Some cc - | None -> Printf.eprintf "Warning: character not found in font for unicode codepoint 0x%X\n" codepoint; None) - codepoints - in - implode (map char_of_int charcodes) - -(* Process codepoints back to UTF8, assuming it came from UTF8 to start with *) -let utf8_of_winansi s = - let text_extractor = - Pdftext.text_extractor_of_font_real - (Pdftext.StandardFont (Pdftext.TimesRoman, Pdftext.WinAnsiEncoding)) - in - let codepoints = Pdftext.codepoints_of_text text_extractor s in - Pdftext.utf8_of_codepoints codepoints - -(* Get the width of some text in the given font *) -let width_of_text font text = - match font with - | Pdftext.SimpleFont {Pdftext.fontmetrics = Some fontmetrics} -> - begin try - fold_left ( +. ) 0. (map (fun c -> fontmetrics.(int_of_char c)) (explode text)) - with - _ -> 0. - end - | _ -> 0. - -type ops_metrics = - {metrics_text : string; - metrics_x : float; - metrics_y : float; - metrics_rot : float} - -let ops_metrics : ops_metrics list ref = ref [] - -let ops_baseline_adjustment = ref 0. - -let metrics_howmany () = length !ops_metrics - -let metrics_text n = - utf8_of_winansi (select n !ops_metrics).metrics_text - -let metrics_x n = - (select n !ops_metrics).metrics_x - -let metrics_y n = - (select n !ops_metrics).metrics_y - -let metrics_rot n = - (select n !ops_metrics).metrics_rot - -let metrics_baseline_adjustment () = !ops_baseline_adjustment - -let colour_op = function - | RGB (r, g, b) -> Pdfops.Op_rg (r, g, b) - | Grey g -> Pdfops.Op_g g - | CYMK (c, y, m, k) -> Pdfops.Op_k (c, y, m, k) - -let colour_op_stroke = function - | RGB (r, g, b) -> Pdfops.Op_RG (r, g, b) - | Grey g -> Pdfops.Op_G g - | CYMK (c, y, m, k) -> Pdfops.Op_K (c, y, m, k) - -let ops longest_w metrics x y rotate hoffset voffset outline linewidth unique_fontname unique_extgstatename colour fontsize text = - if metrics then - ops_metrics := - {metrics_text = text; metrics_x = x -. hoffset; metrics_y = y -. voffset; metrics_rot = rotate} - ::!ops_metrics; - [Pdfops.Op_q; - Pdfops.Op_BMC "/CPDFSTAMP"; - Pdfops.Op_cm - (Pdftransform.matrix_of_transform - [Pdftransform.Translate (x -. hoffset, y -. voffset); - Pdftransform.Rotate ((0., 0.), rotate)]); - Pdfops.Op_BT; - ] @ - (if outline then [Pdfops.Op_w linewidth; Pdfops.Op_Tr 1] else [Pdfops.Op_Tr 0]) @ - [colour_op colour; colour_op_stroke colour] - @ - (match unique_extgstatename with None -> [] | Some n -> [Pdfops.Op_gs n]) - @ - [Pdfops.Op_Tf (unique_fontname, fontsize); - Pdfops.Op_Tj text; - Pdfops.Op_ET; - Pdfops.Op_EMC; - Pdfops.Op_Q] - -type justification = LeftJustify | CentreJustify | RightJustify - -(* Find the h-offset for justification based on the longest width, the current -width, the justification and the position. *) -let find_justification_offsets longest_w w position j = - let open Cpdfposition in - match j with - | LeftJustify -> - begin match position with - | TopLeft _ | Left _ | PosLeft _ | BottomLeft _ -> 0. - | Top _ | PosCentre _ | Bottom _ | Centre -> (longest_w -. w) /. 2. - | TopRight _ | BottomRight _ | PosRight _ | Right _ -> longest_w -. w - | Diagonal -> 0. - | ReverseDiagonal -> 0. - end - | RightJustify -> - begin match position with - | TopLeft _ | Left _ | PosLeft _ | BottomLeft _ -> ~-.(longest_w -. w) - | Top _ | PosCentre _ | Bottom _ | Centre -> ~-.((longest_w -. w) /. 2.) - | TopRight _ | BottomRight _ | PosRight _ | Right _ -> 0. - | Diagonal -> 0. - | ReverseDiagonal -> 0. - end - | CentreJustify -> - begin match position with - | TopLeft _ | Left _ | PosLeft _ | BottomLeft _ -> ~-.((longest_w -. w) /. 2.) - | Top _ | PosCentre _ | Bottom _ | Centre -> 0. - | TopRight _ | BottomRight _ | PosRight _ | Right _ -> (longest_w -. w) /. 2. - | Diagonal -> 0. - | ReverseDiagonal -> 0. - end - -(* Lex an integer from the table *) -let extract_num header s = - match Pdfgenlex.lex_string (Hashtbl.find header s) with - [Pdfgenlex.LexInt i] -> Pdf.Integer i - | [Pdfgenlex.LexReal f] -> Pdf.Real f - | _ -> raise (Failure ("extract_num: " ^ s)) - -let extract_fontbbox header s = - let num = function - Pdfgenlex.LexInt i -> Pdf.Integer i - | Pdfgenlex.LexReal f -> Pdf.Real f - | _ -> raise (Failure "extract_fontbbox") - in - match Pdfgenlex.lex_string (Hashtbl.find header s) with - [a; b; c; d] -> [num a; num b; num c; num d] - | _ -> raise (Failure "extract_fontbbox") - -let remove_slash s = - match explode s with - '/'::x -> implode x - | _ -> raise (Failure "remove_slash") - -let extract_widths chars_and_widths = - let win_to_name = map (fun (x, y) -> (y, x)) Pdfglyphlist.name_to_win in - map - (fun x -> - try - let name = List.assoc x win_to_name in - let width = List.assoc (remove_slash name) chars_and_widths in - width - with - _ -> 0) - (ilist 0 255) - -let make_font embed fontname = - let font = unopt (Pdftext.standard_font_of_name ("/" ^ fontname)) in - let header, width_data, _, chars_and_widths = Pdfstandard14.afm_data font in - let widths = extract_widths (list_of_hashtbl chars_and_widths) in - let flags = Pdfstandard14.flags_of_standard_font font in - let fontbbox = extract_fontbbox header "FontBBox" in - let italicangle = extract_num header "ItalicAngle" in - let ascent = try extract_num header "Ascender" with _ -> Pdf.Integer 0 in - let descent = try extract_num header "Descender" with _ -> Pdf.Integer 0 in - let capheight = try extract_num header "CapHeight" with _ -> Pdf.Integer 0 in - let stemv = Pdfstandard14.stemv_of_standard_font font in - let fontdescriptor = - Pdf.Dictionary - [("/Type", Pdf.Name "/FontDescriptor"); - ("/FontName", Pdf.Name ("/" ^ fontname)); - ("/Flags", Pdf.Integer flags); - ("/FontBBox", Pdf.Array fontbbox); - ("/ItalicAngle", italicangle); - ("/Ascent", ascent); - ("/Descent", descent); - ("/CapHeight", capheight); - ("/StemV", Pdf.Integer stemv)] - in - (* With -no-embed-font, we use the standard encoding, and just the - * minimal stuff. Without -no-embed-font, we switch to WinAnsiEncoding, - * and fill out everything except the font file instead *) - if embed then - Pdf.Dictionary - [("/Type", Pdf.Name "/Font"); - ("/Subtype", Pdf.Name "/Type1"); - ("/BaseFont", Pdf.Name ("/" ^ fontname)); - ("/Encoding", Pdf.Name "/WinAnsiEncoding"); - ("/FirstChar", Pdf.Integer 0); - ("/LastChar", Pdf.Integer 255); - ("/Widths", Pdf.Array (map (fun x -> Pdf.Integer x) widths)); - ("/FontDescriptor", fontdescriptor)] - else - Pdf.Dictionary - [("/Type", Pdf.Name "/Font"); - ("/Subtype", Pdf.Name "/Type1"); - ("/Encoding", Pdf.Name "/WinAnsiEncoding"); - ("/BaseFont", Pdf.Name ("/" ^ fontname))] - -let extract_page_text only_fontsize pdf _ page = - let text_extractor = ref None in - let right_font_size = ref false in - fold_left ( ^ ) "" - (map - (function - | Pdfops.Op_Tf (fontname, fontsize) -> - right_font_size := - begin match only_fontsize with - Some x -> x = fontsize - | _ -> false - end; - let fontdict = - match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with - | None -> raise (Pdf.PDFError "Missing /Font in text extraction") - | Some d -> - match Pdf.lookup_direct pdf fontname d with - | None -> raise (Pdf.PDFError "Missing font in text extraction") - | Some d -> d - in - text_extractor := Some (Pdftext.text_extractor_of_font pdf fontdict); - "" - | Pdfops.Op_Tj text when !text_extractor <> None -> - if not !right_font_size then - "" - else - Pdftext.utf8_of_codepoints - (Pdftext.codepoints_of_text (unopt !text_extractor) text) - | Pdfops.Op_TJ (Pdf.Array objs) when !text_extractor <> None -> - if not !right_font_size then - "" - else - fold_left ( ^ ) "" - (option_map - (function - | Pdf.String text -> - Some - (Pdftext.utf8_of_codepoints - (Pdftext.codepoints_of_text (unopt !text_extractor) text)) - | _ -> None) - objs) - | _ -> "") - (Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content)) - -(* For each page, extract all the ops with text in them, and concatenate it all together *) -let extract_text extract_text_font_size pdf range = - fold_left (fun x y -> x ^ (if x <> "" && y <> "" then "\n" else "") ^ y) "" - (Cpdfpage.map_pages (extract_page_text extract_text_font_size pdf) pdf range) - -let addtext - metrics lines linewidth outline fast colour fontname embed bates batespad fontsize font - underneath position hoffset voffset text pages orientation cropbox opacity - justification filename extract_text_font_size shift pdf -= - let time = Cpdfstrftime.current_time () in - let endpage = Pdfpage.endpage pdf in - let replace_pairs pdf filename bates batespad num page = - [ - "%PageDiv2", (fun () -> string_of_int ((num + 1) / 2)); - "%Page", (fun () -> string_of_int num); - "%Roman", (fun () -> roman_upper num); - "%roman", (fun () -> roman_lower num); - "%filename", (fun () -> filename); - "%Label", (fun () -> pagelabel pdf num); - "%EndPage", (fun () -> string_of_int endpage); - "%EndLabel", (fun () -> pagelabel pdf endpage); - "%ExtractedText", (fun () -> extract_page_text extract_text_font_size pdf num page); - "%Bates", - (fun () -> - (let numstring = string_of_int (bates + num - 1) in - match batespad with - None -> numstring - | Some w -> - if String.length numstring >= w - then numstring - else implode (many '0' (w - String.length numstring)) ^ numstring))] - in - let shifts = Cpdfcoord.parse_coordinates pdf shift in - let addtext_page num page = - let shift_x, shift_y = List.nth shifts (num - 1) in - let resources', unique_extgstatename = - if opacity < 1.0 then - let dict = - match Pdf.lookup_direct pdf "/ExtGState" page.Pdfpage.resources with - | Some d -> d - | None -> Pdf.Dictionary [] - in - let unique_extgstatename = Pdf.unique_key "gs" dict in - let dict' = - Pdf.add_dict_entry dict unique_extgstatename - (Pdf.Dictionary [("/ca", Pdf.Real opacity); ("/CA", Pdf.Real opacity)]) - in - Pdf.add_dict_entry page.Pdfpage.resources "/ExtGState" dict', Some unique_extgstatename - else - page.Pdfpage.resources, None - in - let fontdict = - match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with - | None -> Pdf.Dictionary [] - | Some d -> d - in - let unique_fontname = Pdf.unique_key "F" fontdict in - let ops = - let text = process_text time text (replace_pairs pdf filename bates batespad num page) in - let calc_textwidth text = - match font with - | Some f -> - let rawwidth = - Pdfstandard14.textwidth - false - (if embed then Pdftext.WinAnsiEncoding else Pdftext.StandardEncoding) - f - text - in - (float rawwidth *. fontsize) /. 1000. - | None -> - let font = - match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with - | Some fontdict -> - begin match Pdf.lookup_direct pdf fontname fontdict with - | Some font -> font - | None -> - (* For each item in the fontdict, follow its value and find the basename. If it matches, return that font *) - let font = ref None in - iter - (fun (k, v) -> - match Pdf.lookup_direct pdf "/BaseFont" v with - | Some (Pdf.Name n) when n = fontname -> font := Some v - | _ -> ()) - (match fontdict with Pdf.Dictionary d -> d | _ -> []); - match !font with Some f -> f | None -> failwith (Printf.sprintf "addtext: font %s not found" fontname) - end - | _ -> failwith "addtext: font not found for width" - in - let rawwidth = width_of_text (Pdftext.read_font pdf font) text in - (rawwidth *. fontsize) /. 1000. - in - let expanded_lines = - map - (function text -> - process_text time text (replace_pairs pdf filename bates batespad num page)) - lines - in - let textwidth = calc_textwidth text - and allwidths = map calc_textwidth expanded_lines in - let longest_w = last (sort compare allwidths) in - let joffset = find_justification_offsets longest_w textwidth position justification in - let mediabox = - if cropbox then - match Pdf.lookup_direct pdf "/CropBox" page.Pdfpage.rest with - | Some pdfobject -> Pdf.parse_rectangle (Pdf.direct pdf pdfobject) - | None -> Pdf.parse_rectangle page.Pdfpage.mediabox - else - Pdf.parse_rectangle page.Pdfpage.mediabox - in - let x, y, rotate = Cpdfposition.calculate_position false textwidth mediabox orientation position in - let hoffset, voffset = - if position = Diagonal || position = ReverseDiagonal - then -. (cos ((pi /. 2.) -. rotate) *. voffset), sin ((pi /. 2.) -. rotate) *. voffset - else hoffset, voffset - in - match font with - | Some f -> - ops longest_w metrics (x +. shift_x) (y +. shift_y) rotate (hoffset +. joffset) voffset outline linewidth - unique_fontname unique_extgstatename colour fontsize text - | None -> - ops longest_w metrics (x +. shift_x) (y +. shift_y) rotate (hoffset +. joffset) voffset outline linewidth - fontname None colour fontsize text - in - let newresources = - match font with - | Some _ -> - let newfontdict = - Pdf.add_dict_entry fontdict unique_fontname (make_font embed fontname) - in - Pdf.add_dict_entry resources' "/Font" newfontdict - | None -> page.Pdfpage.resources - in - let page = {page with Pdfpage.resources = newresources} in - if underneath - then Pdfpage.prepend_operators pdf ops ~fast:fast page - else Pdfpage.postpend_operators pdf ops ~fast:fast page - in - if metrics then - (ignore (Cpdfpage.iter_pages (fun a b -> ignore (addtext_page a b)) pdf pages); pdf) - else - Cpdfpage.process_pages (ppstub addtext_page) pdf pages - -(* Prev is a list of lists of characters *) -let split_at_newline t = - let rec split_at_newline_inner prev = function - | [] -> rev (map implode (map rev prev)) - | '\\'::'\\'::'n'::t -> split_at_newline_inner (('n'::'\\'::'\\'::hd prev)::tl prev) t - | '\\'::'n'::t -> split_at_newline_inner ([]::prev) t - | h::t -> split_at_newline_inner ((h::hd prev)::tl prev) t - in - split_at_newline_inner [[]] (explode t) - -let rec unescape_chars prev = function - | [] -> rev prev - | '\\'::('0'..'7' as a)::('0'..'7' as b)::('0'..'7' as c)::t -> - let chr = char_of_int (int_of_string ("0o" ^ implode [a;b;c])) in - unescape_chars (chr::prev) t - | '\\'::'\\'::t -> unescape_chars ('\\'::prev) t - | '\\'::c::t when c <> 'n' -> unescape_chars (c::prev) t - | h::t -> unescape_chars (h::prev) t - -let unescape_string s = - implode (unescape_chars [] (explode s)) - -let - addtexts metrics linewidth outline fast fontname (font : Pdftext.standard_font option) embed bates batespad colour position linespacing - fontsize underneath text pages orientation cropbox opacity justification - midline topline filename extract_text_font_size shift ?(raw=false) pdf -= - if pages = [] then error "addtexts: empty page range" else - (*flprint "addtexts:\n"; - iter (Printf.printf "%C ") (explode text); - flprint "\n"; - Printf.printf "\nCpdf.addtexts: metrics = %b" metrics; - flprint "\n";*) - (*Printf.printf "linewidth = %f\n" linewidth; - Printf.printf "outline = %b\n" outline; - Printf.printf "fast = %b\n" fast; - Printf.printf "fontname = %s\n" fontname; - Printf.printf "winansi text = %s\n" text; - Printf.printf "position = %s\n" (string_of_position position); - Printf.printf "bates = %i\n" bates; - Printf.printf "linespacing = %f\n" linespacing; - Printf.printf "fontsize = %f\n" fontsize; - Printf.printf "underneath = %b\n" underneath; - Printf.printf "font = %s\n" begin match font with None -> "None" | Some x -> Pdftext.string_of_standard_font x end; - Printf.printf "justification = %s\n" - begin match justification with LeftJustify -> "left" | RightJustify -> "right" | CentreJustify -> "centre" end; - Printf.printf "midline = %b\n" midline; - begin match colour with r, g, b -> Printf.printf "%f, %f, %f\n" r g b end; - Printf.printf "opacity = %f\n" opacity; - flprint "\n"; - Printf.printf "relative-to-cropbox = %b" cropbox; - flprint "\n";*) - ops_metrics := []; - let realfontname = ref fontname in - let fontpdfobj = - match font with - | Some f -> - make_font embed (Pdftext.string_of_standard_font f) - | None -> - let firstpage = - List.nth (Pdfpage.pages_of_pagetree pdf) (hd pages - 1) - in - match Pdf.lookup_direct pdf "/Font" firstpage.Pdfpage.resources with - | Some fontdict -> - begin match Pdf.lookup_direct pdf fontname fontdict with - | Some font -> font - | _ -> - (* For each item in the fontdict, follow its value and find the basename. If it matches, return that font *) - let font = ref None in - iter - (fun (k, v) -> - match Pdf.lookup_direct pdf "/BaseFont" v with - | Some (Pdf.Name n) when n = fontname -> - font := Some v; realfontname := k - | _ -> ()) - (match fontdict with Pdf.Dictionary d -> d | _ -> []); - match !font with Some f -> f | None -> failwith (Printf.sprintf "addtext: font %s not found" fontname) - end - | _ -> failwith "addtext: font dictionary not present" - in - let text = if raw then text else charcodes_of_utf8 (Pdftext.read_font pdf fontpdfobj) text in - let lines = map unescape_string (split_at_newline text) in - let pdf = ref pdf in - let voffset = - let open Cpdfposition in - match position with - | Bottom _ | BottomLeft _ | BottomRight _ -> - ref (0. -. (linespacing *. fontsize *. (float (length lines) -. 1.))) - | Left _ | Right _ -> - (* Vertically align *) - ref (0. -. (linespacing *. ((fontsize *. (float (length lines) -. 1.)) /. 2.))) - | Diagonal | ReverseDiagonal -> - (* Change so that the whole paragraph sits on the centre... *) - ref (0. -. ((linespacing *. fontsize *. (float (length lines) -. 1.)) /. 2.)) - | _ -> ref 0. - in - if midline then - begin match font with - | Some font -> - let baseline_adjustment = - (fontsize *. float (Pdfstandard14.baseline_adjustment font)) /. 1000. - in - ops_baseline_adjustment := baseline_adjustment; - voffset := !voffset +. baseline_adjustment - | _ -> - ops_baseline_adjustment := 0. - end - else - if topline then - begin match font with - | Some font -> - let baseline_adjustment = - (fontsize *. float (Pdfstandard14.baseline_adjustment font) *. 2.0) /. 1000. - in - ops_baseline_adjustment := baseline_adjustment; - voffset := !voffset +. baseline_adjustment - | _ -> - ops_baseline_adjustment := 0. - end - else - ops_baseline_adjustment := 0.; - iter - (fun line -> - let voff, hoff = - if orientation = Cpdfposition.Vertical then 0., -.(!voffset) else !voffset, 0. - in - pdf := - addtext metrics lines linewidth outline fast colour !realfontname - embed bates batespad fontsize font underneath position hoff voff line - pages orientation cropbox opacity justification filename - extract_text_font_size shift - !pdf; - voffset := !voffset +. (linespacing *. fontsize)) - lines; - ops_metrics := rev !ops_metrics; - !pdf - -let removetext range pdf = - (* Could fail on nesting, or other marked content inside our marked content.*) - let rec remove_until_last_EMC level = function - | [] -> [] - | Pdfops.Op_BMC "/CPDFSTAMP"::more -> - remove_until_last_EMC (level + 1) more - | Pdfops.Op_EMC::more -> - if level = 1 - then more - else remove_until_last_EMC (level - 1) more - | _::more -> - remove_until_last_EMC level more - in - let rec remove_stamps prev = function - | [] -> rev prev - | Pdfops.Op_BMC "/CPDFSTAMP"::more -> - let rest = remove_until_last_EMC 1 more in - remove_stamps prev rest - | h::t -> remove_stamps (h::prev) t - in - let removetext_page _ page = - {page with - Pdfpage.content = - let ops = Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content in - [Pdfops.stream_of_ops (remove_stamps [] ops)]} - in - Cpdfpage.process_pages (ppstub removetext_page) pdf range (* \section{Shift page data} *) let make_mediabox (xmin, ymin, xmax, ymax) = @@ -1498,72 +923,6 @@ let scale_page_contents ?(fast=false) scale position pdf pnum page = let scale_contents ?(fast=false) position scale pdf range = Cpdfpage.process_pages (scale_page_contents ~fast scale position pdf) pdf range - -let addrectangle - fast (w, h) colour outline linewidth opacity position relative_to_cropbox - underneath range pdf -= - let addrectangle_page _ page = - let resources', unique_extgstatename = - if opacity < 1.0 then - let dict = - match Pdf.lookup_direct pdf "/ExtGState" page.Pdfpage.resources with - | Some d -> d - | None -> Pdf.Dictionary [] - in - let unique_extgstatename = Pdf.unique_key "gs" dict in - let dict' = - Pdf.add_dict_entry dict unique_extgstatename - (Pdf.Dictionary [("/ca", Pdf.Real opacity); ("/CA", Pdf.Real opacity)]) - in - Pdf.add_dict_entry page.Pdfpage.resources "/ExtGState" dict', Some unique_extgstatename - else - page.Pdfpage.resources, None - in - let mediabox = - if relative_to_cropbox then - match Pdf.lookup_direct pdf "/CropBox" page.Pdfpage.rest with - | Some pdfobject -> Pdf.parse_rectangle (Pdf.direct pdf pdfobject) - | None -> Pdf.parse_rectangle page.Pdfpage.mediabox - else - Pdf.parse_rectangle page.Pdfpage.mediabox - in - let x, y, _ = - Cpdfposition.calculate_position false w mediabox Cpdfposition.Horizontal position - in - let x, y = - match position with - Cpdfposition.Top _ | Cpdfposition.TopLeft _ | Cpdfposition.TopRight _ -> (x, y -. h) - | Cpdfposition.Centre | Cpdfposition.PosCentre _ -> (x, y -. (h /. 2.)) - | _ -> (x, y) - in - let ops = - [ - Pdfops.Op_q; - Pdfops.Op_BMC "/CPDFSTAMP"; - colour_op colour; - colour_op_stroke colour; - ] - @ - (if outline then [Pdfops.Op_w linewidth] else []) - @ - (match unique_extgstatename with None -> [] | Some n -> [Pdfops.Op_gs n]) - @ - [ - Pdfops.Op_re (x, y, w, h); - (if outline then Pdfops.Op_s else Pdfops.Op_f); - Pdfops.Op_EMC; - Pdfops.Op_Q - ] - in - let page = {page with Pdfpage.resources = resources'} in - if underneath - then Pdfpage.prepend_operators pdf ops ~fast:fast page - else Pdfpage.postpend_operators pdf ops ~fast:fast page - in - Cpdfpage.process_pages (ppstub addrectangle_page) pdf range - - (* Imposition *) (* Union two rest dictionaries from the same PDF. *) @@ -1751,7 +1110,7 @@ let add_border linewidth ~fast pdf = if linewidth = 0. then pdf else let firstpage = hd (Pdfpage.pages_of_pagetree pdf) in let _, _, w, h = Pdf.parse_rectangle firstpage.Pdfpage.mediabox in - addrectangle + Cpdfaddtext.addrectangle fast (w -. linewidth, h -. linewidth) (RGB (0., 0., 0.)) true linewidth 1. (Cpdfposition.BottomLeft (linewidth /. 2.)) false false (ilist 1 (Pdfpage.endpage pdf)) pdf @@ -1885,7 +1244,7 @@ let blacktext_ops colour pdf resources content = | Pdfops.Op_BT::more -> incr textlevel; remove_colourops - (colour_op colour::Pdfops.Op_BT::prev) + (Cpdfaddtext.colour_op colour::Pdfops.Op_BT::prev) more | Pdfops.Op_ET::more -> decr textlevel; @@ -1943,7 +1302,7 @@ let blacklines_ops c pdf resources content = blacken_strokeops (Pdfops.Op_CS "/DeviceRGB"::prev) t | (Pdfops.Op_SC _ | Pdfops.Op_SCN _ | Pdfops.Op_SCNName _ | Pdfops.Op_G _ | Pdfops.Op_RG _ | Pdfops.Op_K _)::t -> - blacken_strokeops (colour_op_stroke c::prev) t + blacken_strokeops (Cpdfaddtext.colour_op_stroke c::prev) t | h::t -> blacken_strokeops (h::prev) t and operators = Pdfops.parse_operators pdf resources content @@ -1969,7 +1328,7 @@ let blackfills_ops c pdf resources content = blacken_fillops (Pdfops.Op_cs "/DeviceRGB"::prev) t | (Pdfops.Op_sc _ | Pdfops.Op_scn _ | Pdfops.Op_scnName _ | Pdfops.Op_g _ | Pdfops.Op_rg _ | Pdfops.Op_k _)::t -> - blacken_fillops (colour_op c::prev) t + blacken_fillops (Cpdfaddtext.colour_op c::prev) t | h::t -> blacken_fillops (h::prev) t and operators = Pdfops.parse_operators pdf resources content diff --git a/cpdf.mli b/cpdf.mli index 382faaf..9a810fb 100644 --- a/cpdf.mli +++ b/cpdf.mli @@ -1,14 +1,8 @@ (** Coherent PDF Tools Core Routines *) open Pdfutil -type color = - Grey of float -| RGB of float * float * float -| CYMK of float * float * float * float - (** {2 Working with pages} *) - val copy_cropbox_to_mediabox : Pdf.t -> int list -> Pdf.t (** {2 Stamping} *) @@ -26,68 +20,6 @@ val combine_pages : bool -> Pdf.t -> Pdf.t -> bool -> bool -> bool -> Pdf.t [combine_pages]. *) val stamp : bool -> Cpdfposition.position -> bool -> bool -> bool -> bool -> bool -> int list -> Pdf.t -> Pdf.t -> Pdf.t -(** {2 Adding text} *) - -(** Justification of multiline text *) -type justification = - | LeftJustify - | CentreJustify - | RightJustify - -(** Call [add_texts metrics linewidth outline fast fontname font bates batespad colour -position linespacing fontsize underneath text pages orientation -relative_to_cropbox midline_adjust topline filename pdf]. For details see cpdfmanual.pdf *) -val addtexts : - bool -> (*metrics*) - float -> (*linewidth*) - bool -> (*outline*) - bool -> (*fast*) - string -> (*fontname*) - Pdftext.standard_font option -> (*font*) - bool -> (* embed font *) - int -> (* bates number *) - int option -> (* bates padding width *) - color -> (*colour*) - Cpdfposition.position -> (*position*) - float -> (*linespacing*) - float -> (*fontsize*) - bool -> (*underneath*) - string ->(*text*) - int list ->(*page range*) - Cpdfposition.orientation ->(*orientation*) - bool ->(*relative to cropbox?*) - float ->(*opacity*) - justification ->(*justification*) - bool ->(*midline adjust?*) - bool ->(*topline adjust?*) - string ->(*filename*) - float option -> (*extract_text_font_size*) - string -> (* shift *) - ?raw:bool -> (* raw *) - Pdf.t ->(*pdf*) - Pdf.t - -val addrectangle : - bool -> - float * float -> - color -> - bool -> - float -> - float -> - Cpdfposition.position -> - bool -> bool -> int list -> Pdf.t -> Pdf.t - -val metrics_howmany : unit -> int -val metrics_text : int -> string -val metrics_x : int -> float -val metrics_y : int -> float -val metrics_rot : int -> float -val metrics_baseline_adjustment : unit -> float -(** These functions returns some details about the text if [addtexts] is called with [metrics] true. The integer arguments are 1 for the first one, 2 for the second etc. Call [metrics_howmany] first to find out how many. *) - -(** Remove text from the given pages. *) -val removetext : int list -> Pdf.t -> Pdf.t - (** {2 Page geometry} *) (** True if a given page in a PDF has a given box *) @@ -173,13 +105,13 @@ val twoup : bool -> Pdf.t -> Pdf.t val thinlines : int list -> float -> Pdf.t -> Pdf.t (** Make all text on certain pages black. *) -val blacktext : color -> int list -> Pdf.t -> Pdf.t +val blacktext : Cpdfaddtext.color -> int list -> Pdf.t -> Pdf.t (** Make all lines on certain pages black. *) -val blacklines : color -> int list -> Pdf.t -> Pdf.t +val blacklines : Cpdfaddtext.color -> int list -> Pdf.t -> Pdf.t (** Make all fills on certain pages black. *) -val blackfills : color -> int list -> Pdf.t -> Pdf.t +val blackfills : Cpdfaddtext.color -> int list -> Pdf.t -> Pdf.t (** Remove images from a PDF, optionally adding crossed boxes. *) val draft : string option -> bool -> int list -> Pdf.t -> Pdf.t @@ -190,8 +122,6 @@ val remove_all_text : int list -> Pdf.t -> Pdf.t val process_xobjects : Pdf.t -> Pdfpage.t -> (Pdf.t -> Pdf.pdfobject -> Pdf.pdfobject list -> Pdf.pdfobject list) -> unit -val extract_text : float option -> Pdf.t -> int list -> string - val append_page_content : string -> bool -> bool -> int list -> Pdf.t -> Pdf.t val stamp_as_xobject : Pdf.t -> int list -> Pdf.t -> Pdf.t * string diff --git a/cpdfaddtext.ml b/cpdfaddtext.ml new file mode 100644 index 0000000..f81133c --- /dev/null +++ b/cpdfaddtext.ml @@ -0,0 +1,647 @@ +open Pdfutil +open Cpdferror + +type color = + Grey of float +| RGB of float * float * float +| CYMK of float * float * float * float + +(* For uses of process_pages which don't need to deal with matrices, this + function transforms into one which returns the identity matrix *) +let ppstub f n p = (f n p, n, Pdftransform.i_matrix) + +(* \section{Superimpose text, page numbers etc.} *) + +(* Process UTF8 text to /WinAnsiEncoding string (for standard 14) or whatever + is in the font (for existing fonts). *) +let charcodes_of_utf8 font s = + let extractor = Pdftext.charcode_extractor_of_font_real ~debug:false font in + let codepoints = Pdftext.codepoints_of_utf8 s in + let charcodes = + option_map + (fun codepoint -> + match extractor codepoint with + | Some cc -> Some cc + | None -> Printf.eprintf "Warning: character not found in font for unicode codepoint 0x%X\n" codepoint; None) + codepoints + in + implode (map char_of_int charcodes) + +(* Process codepoints back to UTF8, assuming it came from UTF8 to start with *) +let utf8_of_winansi s = + let text_extractor = + Pdftext.text_extractor_of_font_real + (Pdftext.StandardFont (Pdftext.TimesRoman, Pdftext.WinAnsiEncoding)) + in + let codepoints = Pdftext.codepoints_of_text text_extractor s in + Pdftext.utf8_of_codepoints codepoints + +(* Get the width of some text in the given font *) +let width_of_text font text = + match font with + | Pdftext.SimpleFont {Pdftext.fontmetrics = Some fontmetrics} -> + begin try + fold_left ( +. ) 0. (map (fun c -> fontmetrics.(int_of_char c)) (explode text)) + with + _ -> 0. + end + | _ -> 0. + +type ops_metrics = + {metrics_text : string; + metrics_x : float; + metrics_y : float; + metrics_rot : float} + +let ops_metrics : ops_metrics list ref = ref [] + +let ops_baseline_adjustment = ref 0. + +let metrics_howmany () = length !ops_metrics + +let metrics_text n = + utf8_of_winansi (select n !ops_metrics).metrics_text + +let metrics_x n = + (select n !ops_metrics).metrics_x + +let metrics_y n = + (select n !ops_metrics).metrics_y + +let metrics_rot n = + (select n !ops_metrics).metrics_rot + +let metrics_baseline_adjustment () = !ops_baseline_adjustment + +let colour_op = function + | RGB (r, g, b) -> Pdfops.Op_rg (r, g, b) + | Grey g -> Pdfops.Op_g g + | CYMK (c, y, m, k) -> Pdfops.Op_k (c, y, m, k) + +let colour_op_stroke = function + | RGB (r, g, b) -> Pdfops.Op_RG (r, g, b) + | Grey g -> Pdfops.Op_G g + | CYMK (c, y, m, k) -> Pdfops.Op_K (c, y, m, k) + +let ops longest_w metrics x y rotate hoffset voffset outline linewidth unique_fontname unique_extgstatename colour fontsize text = + if metrics then + ops_metrics := + {metrics_text = text; metrics_x = x -. hoffset; metrics_y = y -. voffset; metrics_rot = rotate} + ::!ops_metrics; + [Pdfops.Op_q; + Pdfops.Op_BMC "/CPDFSTAMP"; + Pdfops.Op_cm + (Pdftransform.matrix_of_transform + [Pdftransform.Translate (x -. hoffset, y -. voffset); + Pdftransform.Rotate ((0., 0.), rotate)]); + Pdfops.Op_BT; + ] @ + (if outline then [Pdfops.Op_w linewidth; Pdfops.Op_Tr 1] else [Pdfops.Op_Tr 0]) @ + [colour_op colour; colour_op_stroke colour] + @ + (match unique_extgstatename with None -> [] | Some n -> [Pdfops.Op_gs n]) + @ + [Pdfops.Op_Tf (unique_fontname, fontsize); + Pdfops.Op_Tj text; + Pdfops.Op_ET; + Pdfops.Op_EMC; + Pdfops.Op_Q] + +type justification = LeftJustify | CentreJustify | RightJustify + +(* Find the h-offset for justification based on the longest width, the current +width, the justification and the position. *) +let find_justification_offsets longest_w w position j = + let open Cpdfposition in + match j with + | LeftJustify -> + begin match position with + | TopLeft _ | Left _ | PosLeft _ | BottomLeft _ -> 0. + | Top _ | PosCentre _ | Bottom _ | Centre -> (longest_w -. w) /. 2. + | TopRight _ | BottomRight _ | PosRight _ | Right _ -> longest_w -. w + | Diagonal -> 0. + | ReverseDiagonal -> 0. + end + | RightJustify -> + begin match position with + | TopLeft _ | Left _ | PosLeft _ | BottomLeft _ -> ~-.(longest_w -. w) + | Top _ | PosCentre _ | Bottom _ | Centre -> ~-.((longest_w -. w) /. 2.) + | TopRight _ | BottomRight _ | PosRight _ | Right _ -> 0. + | Diagonal -> 0. + | ReverseDiagonal -> 0. + end + | CentreJustify -> + begin match position with + | TopLeft _ | Left _ | PosLeft _ | BottomLeft _ -> ~-.((longest_w -. w) /. 2.) + | Top _ | PosCentre _ | Bottom _ | Centre -> 0. + | TopRight _ | BottomRight _ | PosRight _ | Right _ -> (longest_w -. w) /. 2. + | Diagonal -> 0. + | ReverseDiagonal -> 0. + end + +(* Lex an integer from the table *) +let extract_num header s = + match Pdfgenlex.lex_string (Hashtbl.find header s) with + [Pdfgenlex.LexInt i] -> Pdf.Integer i + | [Pdfgenlex.LexReal f] -> Pdf.Real f + | _ -> raise (Failure ("extract_num: " ^ s)) + +let extract_fontbbox header s = + let num = function + Pdfgenlex.LexInt i -> Pdf.Integer i + | Pdfgenlex.LexReal f -> Pdf.Real f + | _ -> raise (Failure "extract_fontbbox") + in + match Pdfgenlex.lex_string (Hashtbl.find header s) with + [a; b; c; d] -> [num a; num b; num c; num d] + | _ -> raise (Failure "extract_fontbbox") + +let remove_slash s = + match explode s with + '/'::x -> implode x + | _ -> raise (Failure "remove_slash") + +let extract_widths chars_and_widths = + let win_to_name = map (fun (x, y) -> (y, x)) Pdfglyphlist.name_to_win in + map + (fun x -> + try + let name = List.assoc x win_to_name in + let width = List.assoc (remove_slash name) chars_and_widths in + width + with + _ -> 0) + (ilist 0 255) + +let make_font embed fontname = + let font = unopt (Pdftext.standard_font_of_name ("/" ^ fontname)) in + let header, width_data, _, chars_and_widths = Pdfstandard14.afm_data font in + let widths = extract_widths (list_of_hashtbl chars_and_widths) in + let flags = Pdfstandard14.flags_of_standard_font font in + let fontbbox = extract_fontbbox header "FontBBox" in + let italicangle = extract_num header "ItalicAngle" in + let ascent = try extract_num header "Ascender" with _ -> Pdf.Integer 0 in + let descent = try extract_num header "Descender" with _ -> Pdf.Integer 0 in + let capheight = try extract_num header "CapHeight" with _ -> Pdf.Integer 0 in + let stemv = Pdfstandard14.stemv_of_standard_font font in + let fontdescriptor = + Pdf.Dictionary + [("/Type", Pdf.Name "/FontDescriptor"); + ("/FontName", Pdf.Name ("/" ^ fontname)); + ("/Flags", Pdf.Integer flags); + ("/FontBBox", Pdf.Array fontbbox); + ("/ItalicAngle", italicangle); + ("/Ascent", ascent); + ("/Descent", descent); + ("/CapHeight", capheight); + ("/StemV", Pdf.Integer stemv)] + in + (* With -no-embed-font, we use the standard encoding, and just the + * minimal stuff. Without -no-embed-font, we switch to WinAnsiEncoding, + * and fill out everything except the font file instead *) + if embed then + Pdf.Dictionary + [("/Type", Pdf.Name "/Font"); + ("/Subtype", Pdf.Name "/Type1"); + ("/BaseFont", Pdf.Name ("/" ^ fontname)); + ("/Encoding", Pdf.Name "/WinAnsiEncoding"); + ("/FirstChar", Pdf.Integer 0); + ("/LastChar", Pdf.Integer 255); + ("/Widths", Pdf.Array (map (fun x -> Pdf.Integer x) widths)); + ("/FontDescriptor", fontdescriptor)] + else + Pdf.Dictionary + [("/Type", Pdf.Name "/Font"); + ("/Subtype", Pdf.Name "/Type1"); + ("/Encoding", Pdf.Name "/WinAnsiEncoding"); + ("/BaseFont", Pdf.Name ("/" ^ fontname))] + +let extract_page_text only_fontsize pdf _ page = + let text_extractor = ref None in + let right_font_size = ref false in + fold_left ( ^ ) "" + (map + (function + | Pdfops.Op_Tf (fontname, fontsize) -> + right_font_size := + begin match only_fontsize with + Some x -> x = fontsize + | _ -> false + end; + let fontdict = + match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with + | None -> raise (Pdf.PDFError "Missing /Font in text extraction") + | Some d -> + match Pdf.lookup_direct pdf fontname d with + | None -> raise (Pdf.PDFError "Missing font in text extraction") + | Some d -> d + in + text_extractor := Some (Pdftext.text_extractor_of_font pdf fontdict); + "" + | Pdfops.Op_Tj text when !text_extractor <> None -> + if not !right_font_size then + "" + else + Pdftext.utf8_of_codepoints + (Pdftext.codepoints_of_text (unopt !text_extractor) text) + | Pdfops.Op_TJ (Pdf.Array objs) when !text_extractor <> None -> + if not !right_font_size then + "" + else + fold_left ( ^ ) "" + (option_map + (function + | Pdf.String text -> + Some + (Pdftext.utf8_of_codepoints + (Pdftext.codepoints_of_text (unopt !text_extractor) text)) + | _ -> None) + objs) + | _ -> "") + (Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content)) + +(* For each page, extract all the ops with text in them, and concatenate it all together *) +let extract_text extract_text_font_size pdf range = + fold_left (fun x y -> x ^ (if x <> "" && y <> "" then "\n" else "") ^ y) "" + (Cpdfpage.map_pages (extract_page_text extract_text_font_size pdf) pdf range) + +let rec process_text time text m = + match m with + | [] -> Cpdfstrftime.strftime ~time text + | (s, r)::t -> process_text time (string_replace_all_lazy s r text) t + +(* Return page label at pdf page num, or page number in arabic if no label *) +let pagelabel pdf num = + Pdfpagelabels.pagelabeltext_of_pagenumber + num + (Pdfpagelabels.complete (Pdfpagelabels.read pdf)) + +let addtext + metrics lines linewidth outline fast colour fontname embed bates batespad fontsize font + underneath position hoffset voffset text pages orientation cropbox opacity + justification filename extract_text_font_size shift pdf += + let time = Cpdfstrftime.current_time () in + let endpage = Pdfpage.endpage pdf in + let replace_pairs pdf filename bates batespad num page = + [ + "%PageDiv2", (fun () -> string_of_int ((num + 1) / 2)); + "%Page", (fun () -> string_of_int num); + "%Roman", (fun () -> roman_upper num); + "%roman", (fun () -> roman_lower num); + "%filename", (fun () -> filename); + "%Label", (fun () -> pagelabel pdf num); + "%EndPage", (fun () -> string_of_int endpage); + "%EndLabel", (fun () -> pagelabel pdf endpage); + "%ExtractedText", (fun () -> extract_page_text extract_text_font_size pdf num page); + "%Bates", + (fun () -> + (let numstring = string_of_int (bates + num - 1) in + match batespad with + None -> numstring + | Some w -> + if String.length numstring >= w + then numstring + else implode (many '0' (w - String.length numstring)) ^ numstring))] + in + let shifts = Cpdfcoord.parse_coordinates pdf shift in + let addtext_page num page = + let shift_x, shift_y = List.nth shifts (num - 1) in + let resources', unique_extgstatename = + if opacity < 1.0 then + let dict = + match Pdf.lookup_direct pdf "/ExtGState" page.Pdfpage.resources with + | Some d -> d + | None -> Pdf.Dictionary [] + in + let unique_extgstatename = Pdf.unique_key "gs" dict in + let dict' = + Pdf.add_dict_entry dict unique_extgstatename + (Pdf.Dictionary [("/ca", Pdf.Real opacity); ("/CA", Pdf.Real opacity)]) + in + Pdf.add_dict_entry page.Pdfpage.resources "/ExtGState" dict', Some unique_extgstatename + else + page.Pdfpage.resources, None + in + let fontdict = + match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with + | None -> Pdf.Dictionary [] + | Some d -> d + in + let unique_fontname = Pdf.unique_key "F" fontdict in + let ops = + let text = process_text time text (replace_pairs pdf filename bates batespad num page) in + let calc_textwidth text = + match font with + | Some f -> + let rawwidth = + Pdfstandard14.textwidth + false + (if embed then Pdftext.WinAnsiEncoding else Pdftext.StandardEncoding) + f + text + in + (float rawwidth *. fontsize) /. 1000. + | None -> + let font = + match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with + | Some fontdict -> + begin match Pdf.lookup_direct pdf fontname fontdict with + | Some font -> font + | None -> + (* For each item in the fontdict, follow its value and find the basename. If it matches, return that font *) + let font = ref None in + iter + (fun (k, v) -> + match Pdf.lookup_direct pdf "/BaseFont" v with + | Some (Pdf.Name n) when n = fontname -> font := Some v + | _ -> ()) + (match fontdict with Pdf.Dictionary d -> d | _ -> []); + match !font with Some f -> f | None -> failwith (Printf.sprintf "addtext: font %s not found" fontname) + end + | _ -> failwith "addtext: font not found for width" + in + let rawwidth = width_of_text (Pdftext.read_font pdf font) text in + (rawwidth *. fontsize) /. 1000. + in + let expanded_lines = + map + (function text -> + process_text time text (replace_pairs pdf filename bates batespad num page)) + lines + in + let textwidth = calc_textwidth text + and allwidths = map calc_textwidth expanded_lines in + let longest_w = last (sort compare allwidths) in + let joffset = find_justification_offsets longest_w textwidth position justification in + let mediabox = + if cropbox then + match Pdf.lookup_direct pdf "/CropBox" page.Pdfpage.rest with + | Some pdfobject -> Pdf.parse_rectangle (Pdf.direct pdf pdfobject) + | None -> Pdf.parse_rectangle page.Pdfpage.mediabox + else + Pdf.parse_rectangle page.Pdfpage.mediabox + in + let x, y, rotate = Cpdfposition.calculate_position false textwidth mediabox orientation position in + let hoffset, voffset = + if position = Diagonal || position = ReverseDiagonal + then -. (cos ((pi /. 2.) -. rotate) *. voffset), sin ((pi /. 2.) -. rotate) *. voffset + else hoffset, voffset + in + match font with + | Some f -> + ops longest_w metrics (x +. shift_x) (y +. shift_y) rotate (hoffset +. joffset) voffset outline linewidth + unique_fontname unique_extgstatename colour fontsize text + | None -> + ops longest_w metrics (x +. shift_x) (y +. shift_y) rotate (hoffset +. joffset) voffset outline linewidth + fontname None colour fontsize text + in + let newresources = + match font with + | Some _ -> + let newfontdict = + Pdf.add_dict_entry fontdict unique_fontname (make_font embed fontname) + in + Pdf.add_dict_entry resources' "/Font" newfontdict + | None -> page.Pdfpage.resources + in + let page = {page with Pdfpage.resources = newresources} in + if underneath + then Pdfpage.prepend_operators pdf ops ~fast:fast page + else Pdfpage.postpend_operators pdf ops ~fast:fast page + in + if metrics then + (ignore (Cpdfpage.iter_pages (fun a b -> ignore (addtext_page a b)) pdf pages); pdf) + else + Cpdfpage.process_pages (ppstub addtext_page) pdf pages + +(* Prev is a list of lists of characters *) +let split_at_newline t = + let rec split_at_newline_inner prev = function + | [] -> rev (map implode (map rev prev)) + | '\\'::'\\'::'n'::t -> split_at_newline_inner (('n'::'\\'::'\\'::hd prev)::tl prev) t + | '\\'::'n'::t -> split_at_newline_inner ([]::prev) t + | h::t -> split_at_newline_inner ((h::hd prev)::tl prev) t + in + split_at_newline_inner [[]] (explode t) + +let rec unescape_chars prev = function + | [] -> rev prev + | '\\'::('0'..'7' as a)::('0'..'7' as b)::('0'..'7' as c)::t -> + let chr = char_of_int (int_of_string ("0o" ^ implode [a;b;c])) in + unescape_chars (chr::prev) t + | '\\'::'\\'::t -> unescape_chars ('\\'::prev) t + | '\\'::c::t when c <> 'n' -> unescape_chars (c::prev) t + | h::t -> unescape_chars (h::prev) t + +let unescape_string s = + implode (unescape_chars [] (explode s)) + +let + addtexts metrics linewidth outline fast fontname (font : Pdftext.standard_font option) embed bates batespad colour position linespacing + fontsize underneath text pages orientation cropbox opacity justification + midline topline filename extract_text_font_size shift ?(raw=false) pdf += + if pages = [] then error "addtexts: empty page range" else + (*flprint "addtexts:\n"; + iter (Printf.printf "%C ") (explode text); + flprint "\n"; + Printf.printf "\nCpdf.addtexts: metrics = %b" metrics; + flprint "\n";*) + (*Printf.printf "linewidth = %f\n" linewidth; + Printf.printf "outline = %b\n" outline; + Printf.printf "fast = %b\n" fast; + Printf.printf "fontname = %s\n" fontname; + Printf.printf "winansi text = %s\n" text; + Printf.printf "position = %s\n" (string_of_position position); + Printf.printf "bates = %i\n" bates; + Printf.printf "linespacing = %f\n" linespacing; + Printf.printf "fontsize = %f\n" fontsize; + Printf.printf "underneath = %b\n" underneath; + Printf.printf "font = %s\n" begin match font with None -> "None" | Some x -> Pdftext.string_of_standard_font x end; + Printf.printf "justification = %s\n" + begin match justification with LeftJustify -> "left" | RightJustify -> "right" | CentreJustify -> "centre" end; + Printf.printf "midline = %b\n" midline; + begin match colour with r, g, b -> Printf.printf "%f, %f, %f\n" r g b end; + Printf.printf "opacity = %f\n" opacity; + flprint "\n"; + Printf.printf "relative-to-cropbox = %b" cropbox; + flprint "\n";*) + ops_metrics := []; + let realfontname = ref fontname in + let fontpdfobj = + match font with + | Some f -> + make_font embed (Pdftext.string_of_standard_font f) + | None -> + let firstpage = + List.nth (Pdfpage.pages_of_pagetree pdf) (hd pages - 1) + in + match Pdf.lookup_direct pdf "/Font" firstpage.Pdfpage.resources with + | Some fontdict -> + begin match Pdf.lookup_direct pdf fontname fontdict with + | Some font -> font + | _ -> + (* For each item in the fontdict, follow its value and find the basename. If it matches, return that font *) + let font = ref None in + iter + (fun (k, v) -> + match Pdf.lookup_direct pdf "/BaseFont" v with + | Some (Pdf.Name n) when n = fontname -> + font := Some v; realfontname := k + | _ -> ()) + (match fontdict with Pdf.Dictionary d -> d | _ -> []); + match !font with Some f -> f | None -> failwith (Printf.sprintf "addtext: font %s not found" fontname) + end + | _ -> failwith "addtext: font dictionary not present" + in + let text = if raw then text else charcodes_of_utf8 (Pdftext.read_font pdf fontpdfobj) text in + let lines = map unescape_string (split_at_newline text) in + let pdf = ref pdf in + let voffset = + let open Cpdfposition in + match position with + | Bottom _ | BottomLeft _ | BottomRight _ -> + ref (0. -. (linespacing *. fontsize *. (float (length lines) -. 1.))) + | Left _ | Right _ -> + (* Vertically align *) + ref (0. -. (linespacing *. ((fontsize *. (float (length lines) -. 1.)) /. 2.))) + | Diagonal | ReverseDiagonal -> + (* Change so that the whole paragraph sits on the centre... *) + ref (0. -. ((linespacing *. fontsize *. (float (length lines) -. 1.)) /. 2.)) + | _ -> ref 0. + in + if midline then + begin match font with + | Some font -> + let baseline_adjustment = + (fontsize *. float (Pdfstandard14.baseline_adjustment font)) /. 1000. + in + ops_baseline_adjustment := baseline_adjustment; + voffset := !voffset +. baseline_adjustment + | _ -> + ops_baseline_adjustment := 0. + end + else + if topline then + begin match font with + | Some font -> + let baseline_adjustment = + (fontsize *. float (Pdfstandard14.baseline_adjustment font) *. 2.0) /. 1000. + in + ops_baseline_adjustment := baseline_adjustment; + voffset := !voffset +. baseline_adjustment + | _ -> + ops_baseline_adjustment := 0. + end + else + ops_baseline_adjustment := 0.; + iter + (fun line -> + let voff, hoff = + if orientation = Cpdfposition.Vertical then 0., -.(!voffset) else !voffset, 0. + in + pdf := + addtext metrics lines linewidth outline fast colour !realfontname + embed bates batespad fontsize font underneath position hoff voff line + pages orientation cropbox opacity justification filename + extract_text_font_size shift + !pdf; + voffset := !voffset +. (linespacing *. fontsize)) + lines; + ops_metrics := rev !ops_metrics; + !pdf + +let removetext range pdf = + (* Could fail on nesting, or other marked content inside our marked content.*) + let rec remove_until_last_EMC level = function + | [] -> [] + | Pdfops.Op_BMC "/CPDFSTAMP"::more -> + remove_until_last_EMC (level + 1) more + | Pdfops.Op_EMC::more -> + if level = 1 + then more + else remove_until_last_EMC (level - 1) more + | _::more -> + remove_until_last_EMC level more + in + let rec remove_stamps prev = function + | [] -> rev prev + | Pdfops.Op_BMC "/CPDFSTAMP"::more -> + let rest = remove_until_last_EMC 1 more in + remove_stamps prev rest + | h::t -> remove_stamps (h::prev) t + in + let removetext_page _ page = + {page with + Pdfpage.content = + let ops = Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content in + [Pdfops.stream_of_ops (remove_stamps [] ops)]} + in + Cpdfpage.process_pages (ppstub removetext_page) pdf range + +let addrectangle + fast (w, h) colour outline linewidth opacity position relative_to_cropbox + underneath range pdf += + let addrectangle_page _ page = + let resources', unique_extgstatename = + if opacity < 1.0 then + let dict = + match Pdf.lookup_direct pdf "/ExtGState" page.Pdfpage.resources with + | Some d -> d + | None -> Pdf.Dictionary [] + in + let unique_extgstatename = Pdf.unique_key "gs" dict in + let dict' = + Pdf.add_dict_entry dict unique_extgstatename + (Pdf.Dictionary [("/ca", Pdf.Real opacity); ("/CA", Pdf.Real opacity)]) + in + Pdf.add_dict_entry page.Pdfpage.resources "/ExtGState" dict', Some unique_extgstatename + else + page.Pdfpage.resources, None + in + let mediabox = + if relative_to_cropbox then + match Pdf.lookup_direct pdf "/CropBox" page.Pdfpage.rest with + | Some pdfobject -> Pdf.parse_rectangle (Pdf.direct pdf pdfobject) + | None -> Pdf.parse_rectangle page.Pdfpage.mediabox + else + Pdf.parse_rectangle page.Pdfpage.mediabox + in + let x, y, _ = + Cpdfposition.calculate_position false w mediabox Cpdfposition.Horizontal position + in + let x, y = + match position with + Cpdfposition.Top _ | Cpdfposition.TopLeft _ | Cpdfposition.TopRight _ -> (x, y -. h) + | Cpdfposition.Centre | Cpdfposition.PosCentre _ -> (x, y -. (h /. 2.)) + | _ -> (x, y) + in + let ops = + [ + Pdfops.Op_q; + Pdfops.Op_BMC "/CPDFSTAMP"; + colour_op colour; + colour_op_stroke colour; + ] + @ + (if outline then [Pdfops.Op_w linewidth] else []) + @ + (match unique_extgstatename with None -> [] | Some n -> [Pdfops.Op_gs n]) + @ + [ + Pdfops.Op_re (x, y, w, h); + (if outline then Pdfops.Op_s else Pdfops.Op_f); + Pdfops.Op_EMC; + Pdfops.Op_Q + ] + in + let page = {page with Pdfpage.resources = resources'} in + if underneath + then Pdfpage.prepend_operators pdf ops ~fast:fast page + else Pdfpage.postpend_operators pdf ops ~fast:fast page + in + Cpdfpage.process_pages (ppstub addrectangle_page) pdf range + + diff --git a/cpdfaddtext.mli b/cpdfaddtext.mli new file mode 100644 index 0000000..52e5d45 --- /dev/null +++ b/cpdfaddtext.mli @@ -0,0 +1,72 @@ +(** {2 Adding text} *) + +type color = + Grey of float +| RGB of float * float * float +| CYMK of float * float * float * float + +val colour_op : color -> Pdfops.t + +val colour_op_stroke : color -> Pdfops.t + +(** Justification of multiline text *) +type justification = + | LeftJustify + | CentreJustify + | RightJustify + +(** Call [add_texts metrics linewidth outline fast fontname font bates batespad colour +position linespacing fontsize underneath text pages orientation +relative_to_cropbox midline_adjust topline filename pdf]. For details see cpdfmanual.pdf *) +val addtexts : + bool -> (*metrics*) + float -> (*linewidth*) + bool -> (*outline*) + bool -> (*fast*) + string -> (*fontname*) + Pdftext.standard_font option -> (*font*) + bool -> (* embed font *) + int -> (* bates number *) + int option -> (* bates padding width *) + color -> (*colour*) + Cpdfposition.position -> (*position*) + float -> (*linespacing*) + float -> (*fontsize*) + bool -> (*underneath*) + string ->(*text*) + int list ->(*page range*) + Cpdfposition.orientation ->(*orientation*) + bool ->(*relative to cropbox?*) + float ->(*opacity*) + justification ->(*justification*) + bool ->(*midline adjust?*) + bool ->(*topline adjust?*) + string ->(*filename*) + float option -> (*extract_text_font_size*) + string -> (* shift *) + ?raw:bool -> (* raw *) + Pdf.t ->(*pdf*) + Pdf.t + +val addrectangle : + bool -> + float * float -> + color -> + bool -> + float -> + float -> + Cpdfposition.position -> + bool -> bool -> int list -> Pdf.t -> Pdf.t + +val metrics_howmany : unit -> int +val metrics_text : int -> string +val metrics_x : int -> float +val metrics_y : int -> float +val metrics_rot : int -> float +val metrics_baseline_adjustment : unit -> float +(** These functions returns some details about the text if [addtexts] is called with [metrics] true. The integer arguments are 1 for the first one, 2 for the second etc. Call [metrics_howmany] first to find out how many. *) + +(** Remove text from the given pages. *) +val removetext : int list -> Pdf.t -> Pdf.t + +val extract_text : float option -> Pdf.t -> int list -> string diff --git a/cpdfcommand.ml b/cpdfcommand.ml index 0d9b94e..6d7b019 100644 --- a/cpdfcommand.ml +++ b/cpdfcommand.ml @@ -382,14 +382,14 @@ type args = mutable font : font; mutable fontname : string; mutable fontsize : float; - mutable color : Cpdf.color; + mutable color : Cpdfaddtext.color; mutable opacity : float; mutable position : Cpdfposition.position; mutable underneath : bool; mutable linespacing : float; mutable midline : bool; mutable topline : bool; - mutable justification : Cpdf.justification; + mutable justification : Cpdfaddtext.justification; mutable bates : int; mutable batespad : int option; mutable prerotate : bool; @@ -501,14 +501,14 @@ let args = font = StandardFont Pdftext.TimesRoman; fontname = "Times-Roman"; fontsize = 12.; - color = Cpdf.RGB (0., 0., 0.); + color = Cpdfaddtext.RGB (0., 0., 0.); opacity = 1.; position = Cpdfposition.TopLeft 100.; underneath = false; linespacing = 1.; midline = false; topline = false; - justification = Cpdf.LeftJustify; + justification = Cpdfaddtext.LeftJustify; bates = 0; batespad = None; prerotate = false; @@ -620,14 +620,14 @@ let reset_arguments () = args.font <- StandardFont Pdftext.TimesRoman; args.fontname <- "Times-Roman"; args.fontsize <- 12.; - args.color <- Cpdf.RGB (0., 0., 0.); + args.color <- Cpdfaddtext.RGB (0., 0., 0.); args.opacity <- 1.; args.position <- Cpdfposition.TopLeft 100.; args.underneath <- false; args.linespacing <- 1.; args.midline <- false; args.topline <- false; - args.justification <- Cpdf.LeftJustify; + args.justification <- Cpdfaddtext.LeftJustify; args.bates <- 0; args.batespad <- None; args.prerotate <- false; @@ -1106,11 +1106,11 @@ let setaddtext s = let parse_color s = match String.lowercase s with - | "white" -> Cpdf.RGB (1., 1., 1.) - | "black" -> Cpdf.RGB (0., 0., 0.) - | "red" -> Cpdf.RGB (1., 0., 0.) - | "green" -> Cpdf.RGB (0., 1., 0.) - | "blue" -> Cpdf.RGB (0., 0., 1.) + | "white" -> Cpdfaddtext.RGB (1., 1., 1.) + | "black" -> Cpdfaddtext.RGB (0., 0., 0.) + | "red" -> Cpdfaddtext.RGB (1., 0., 0.) + | "green" -> Cpdfaddtext.RGB (0., 1., 0.) + | "blue" -> Cpdfaddtext.RGB (0., 0., 1.) | _ -> let getnum = function | Pdfgenlex.LexInt i -> float i @@ -1118,9 +1118,9 @@ let parse_color s = | _ -> error "Bad color" in match Pdfgenlex.lex_string s with - | [g] -> Cpdf.Grey (getnum g) - | [r;g;b] -> Cpdf.RGB (getnum r, getnum g, getnum b) - | [c; y; m; k] -> Cpdf.CYMK (getnum c, getnum y, getnum m, getnum k) + | [g] -> Cpdfaddtext.Grey (getnum g) + | [r;g;b] -> Cpdfaddtext.RGB (getnum r, getnum g, getnum b) + | [c; y; m; k] -> Cpdfaddtext.CYMK (getnum c, getnum y, getnum m, getnum k) | _ -> error "Bad color" let setcolor s = @@ -1173,47 +1173,47 @@ let setposright s = let settop n = args.position <- Cpdfposition.Top (Cpdfcoord.parse_single_number empty n); - args.justification <- Cpdf.CentreJustify + args.justification <- Cpdfaddtext.CentreJustify let settopleft n = args.position <- Cpdfposition.TopLeft (Cpdfcoord.parse_single_number empty n); - args.justification <- Cpdf.LeftJustify + args.justification <- Cpdfaddtext.LeftJustify let settopright n = args.position <- Cpdfposition.TopRight (Cpdfcoord.parse_single_number empty n); - args.justification <- Cpdf.RightJustify + args.justification <- Cpdfaddtext.RightJustify let setleft n = args.position <- Cpdfposition.Left (Cpdfcoord.parse_single_number empty n); - args.justification <- Cpdf.LeftJustify + args.justification <- Cpdfaddtext.LeftJustify let setbottomleft n = args.position <- Cpdfposition.BottomLeft (Cpdfcoord.parse_single_number empty n); - args.justification <- Cpdf.LeftJustify + args.justification <- Cpdfaddtext.LeftJustify let setbottom n = args.position <- Cpdfposition.Bottom (Cpdfcoord.parse_single_number empty n); - args.justification <- Cpdf.CentreJustify + args.justification <- Cpdfaddtext.CentreJustify let setbottomright n = args.position <- Cpdfposition.BottomRight (Cpdfcoord.parse_single_number empty n); - args.justification <- Cpdf.RightJustify + args.justification <- Cpdfaddtext.RightJustify let setright n = args.position <- Cpdfposition.Right (Cpdfcoord.parse_single_number empty n); - args.justification <- Cpdf.RightJustify + args.justification <- Cpdfaddtext.RightJustify let setdiagonal n = args.position <- Cpdfposition.Diagonal; - args.justification <- Cpdf.CentreJustify + args.justification <- Cpdfaddtext.CentreJustify let setreversediagonal n = args.position <- Cpdfposition.ReverseDiagonal; - args.justification <- Cpdf.CentreJustify + args.justification <- Cpdfaddtext.CentreJustify let setcenter n = args.position <- Cpdfposition.Centre; - args.justification <- Cpdf.CentreJustify + args.justification <- Cpdfaddtext.CentreJustify let setbatespad n = args.batespad <- Some n @@ -1489,13 +1489,13 @@ let setscalestamptofit () = args.scale_stamp_to_fit <- true let setjustifyleft () = - args.justification <- Cpdf.LeftJustify + args.justification <- Cpdfaddtext.LeftJustify let setjustifyright () = - args.justification <- Cpdf.RightJustify + args.justification <- Cpdfaddtext.RightJustify let setjustifycenter () = - args.justification <- Cpdf.CentreJustify + args.justification <- Cpdfaddtext.CentreJustify let setremoveduplicatestreams () = args.remove_duplicate_streams <- true @@ -3666,7 +3666,7 @@ let go () = | _ -> "" in write_pdf false - (Cpdf.addtexts + (Cpdfaddtext.addtexts false args.linewidth args.outline args.fast args.fontname font args.embedfonts args.bates args.batespad args.color args.position args.linespacing args.fontsize args.underneath text range @@ -3676,12 +3676,12 @@ let go () = | Some RemoveText -> let pdf = get_single_pdf args.op false in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in - write_pdf false (Cpdf.removetext range pdf) + write_pdf false (Cpdfaddtext.removetext range pdf) | Some AddRectangle -> let pdf = get_single_pdf args.op false in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in write_pdf false - (Cpdf.addrectangle + (Cpdfaddtext.addrectangle args.fast (Cpdfcoord.parse_coordinate pdf args.coord) args.color args.outline args.linewidth args.opacity args.position args.relative_to_cropbox args.underneath range pdf) @@ -3792,7 +3792,7 @@ let go () = | Some ExtractText -> let pdf = get_single_pdf args.op true in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in - let text = Cpdf.extract_text args.extract_text_font_size pdf range in + let text = Cpdfaddtext.extract_text args.extract_text_font_size pdf range in begin match args.out with | File filename -> let fh = open_out_bin filename in