Added %ExtractedText

This commit is contained in:
John Whitington 2016-11-13 14:02:09 +00:00
parent c7accd12c8
commit 9d34594f13
3 changed files with 83 additions and 80 deletions

92
cpdf.ml
View File

@ -531,19 +531,16 @@ let print_pdf_objs pdf =
Printf.printf "%s\n" (Pdfwrite.string_of_pdf obj)) Printf.printf "%s\n" (Pdfwrite.string_of_pdf obj))
pdf pdf
(* Return page label at pdf page num, or page number in arabic if no label *) (* Return page label at pdf page num, or page number in arabic if no label *)
let pagelabel pdf num = let pagelabel pdf num =
Pdfpagelabels.pagelabeltext_of_pagenumber Pdfpagelabels.pagelabeltext_of_pagenumber
num num
(Pdfpagelabels.complete (Pdfpagelabels.read pdf)) (Pdfpagelabels.complete (Pdfpagelabels.read pdf))
let rec process_text text m = let rec process_text text m =
match m with match m with
| ([] : (string * string) list) -> Cpdfstrftime.strftime text | [] -> Cpdfstrftime.strftime text
| (s, r)::t -> process_text (string_replace_all s r text) t | (s, r)::t -> process_text (string_replace_all_lazy s r text) t
let expand_date = function let expand_date = function
| "now" -> Cpdfstrftime.strftime "D:%Y%m%d%H%M%S" | "now" -> Cpdfstrftime.strftime "D:%Y%m%d%H%M%S"
@ -1661,28 +1658,79 @@ let make_font embed fontname =
("/Encoding", Pdf.Name "/WinAnsiEncoding"); ("/Encoding", Pdf.Name "/WinAnsiEncoding");
("/BaseFont", Pdf.Name ("/" ^ fontname))] ("/BaseFont", Pdf.Name ("/" ^ fontname))]
(* Extract the text shown on one page, as a single UTF-8 string.

   [only_fontsize] restricts extraction to text shown while the size given by
   the most recent Tf operator equals the supplied value (compared with exact
   float equality). [None] places no restriction, so all text is extracted.
   The third argument is ignored.

   Text shown before any Tf operator has selected a font is skipped, because
   there is no extractor to decode it with. Elements of a TJ array which are
   not strings are skipped.

   Raises [Pdf.PDFError] if the page resources have no /Font entry, or if the
   font named by a Tf operator cannot be found in it. *)
let extract_page_text only_fontsize pdf _ page =
  let text_extractor = ref None in
  let right_font_size = ref false in
  (* Accumulate into a buffer rather than repeatedly concatenating strings. *)
  let extracted = Buffer.create 256 in
  let add_decoded extractor text =
    Buffer.add_string extracted
      (Pdftext.utf8_of_codepoints (Pdftext.codepoints_of_text extractor text))
  in
  let process_op = function
    | Pdfops.Op_Tf (fontname, fontsize) ->
        right_font_size :=
          begin match only_fontsize with
          | Some x -> x = fontsize
          (* No size requested: every font size is acceptable. *)
          | None -> true
          end;
        let fontdict =
          match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
          | None -> raise (Pdf.PDFError "Missing /Font in text extraction")
          | Some fonts ->
              begin match Pdf.lookup_direct pdf fontname fonts with
              | None -> raise (Pdf.PDFError "Missing font in text extraction")
              | Some d -> d
              end
        in
          text_extractor := Some (Pdftext.text_extractor_of_font pdf fontdict)
    | Pdfops.Op_Tj text ->
        begin match !text_extractor with
        | Some extractor when !right_font_size -> add_decoded extractor text
        | _ -> ()
        end
    | Pdfops.Op_TJ (Pdf.Array objs) ->
        begin match !text_extractor with
        | Some extractor when !right_font_size ->
            List.iter
              (function
               | Pdf.String text -> add_decoded extractor text
               | _ -> ())
              objs
        | _ -> ()
        end
    | _ -> ()
  in
    (* Operators must be visited in content order: Tf updates the state which
       later Tj / TJ operators depend upon. *)
    List.iter
      process_op
      (Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content);
    Buffer.contents extracted
(* Extract the text of each page in the range, in order. Pages which yield no
   text are dropped, and the remaining page texts are joined with a single
   newline between each pair. *)
let extract_text extract_text_font_size pdf range =
  let page_texts =
    map_pages (extract_page_text extract_text_font_size pdf) pdf range
  in
    String.concat "\n"
      (List.filter (fun page_text -> page_text <> "") page_texts)
let addtext let addtext
metrics lines linewidth outline fast colour fontname embed bates batespad fontsize font metrics lines linewidth outline fast colour fontname embed bates batespad fontsize font
underneath position hoffset voffset text pages orientation cropbox opacity underneath position hoffset voffset text pages orientation cropbox opacity
justification filename pdf justification filename extract_text_font_size pdf
= =
let endpage = Pdfpage.endpage pdf in let endpage = Pdfpage.endpage pdf in
let replace_pairs pdf filename bates batespad num = let replace_pairs pdf filename bates batespad num page =
["%Page", string_of_int num; ["%Page", (fun () -> string_of_int num);
"%Roman", roman_upper num; "%Roman", (fun () -> roman_upper num);
"%roman", roman_lower num; "%roman", (fun () -> roman_lower num);
"%filename", filename; "%filename", (fun () -> filename);
"%Label", pagelabel pdf num; "%Label", (fun () -> pagelabel pdf num);
"%EndPage", string_of_int endpage; "%EndPage", (fun () -> string_of_int endpage);
"%EndLabel", pagelabel pdf endpage; "%EndLabel", (fun () -> pagelabel pdf endpage);
"%ExtractedText", (fun () -> extract_page_text extract_text_font_size pdf num page);
"%Bates", "%Bates",
(let numstring = string_of_int (bates + num - 1) in (fun () ->
(let numstring = string_of_int (bates + num - 1) in
match batespad with match batespad with
None -> numstring None -> numstring
| Some w -> | Some w ->
if String.length numstring >= w if String.length numstring >= w
then numstring then numstring
else implode (many '0' (w - String.length numstring)) ^ numstring)] else implode (many '0' (w - String.length numstring)) ^ numstring))]
in in
let addtext_page num page = let addtext_page num page =
let resources', unique_extgstatename = let resources', unique_extgstatename =
@ -1708,7 +1756,7 @@ let addtext
in in
let unique_fontname = Pdf.unique_key "F" fontdict in let unique_fontname = Pdf.unique_key "F" fontdict in
let ops = let ops =
let text = process_text text (replace_pairs pdf filename bates batespad num) in let text = process_text text (replace_pairs pdf filename bates batespad num page) in
let calc_textwidth text = let calc_textwidth text =
match font with match font with
| Some f -> | Some f ->
@ -1734,8 +1782,10 @@ let addtext
(rawwidth *. fontsize) /. 1000. (rawwidth *. fontsize) /. 1000.
in in
let expanded_lines = let expanded_lines =
map (function text -> process_text text (replace_pairs pdf map
filename bates batespad num)) lines (function text ->
process_text text (replace_pairs pdf filename bates batespad num page))
lines
in in
let textwidth = calc_textwidth text let textwidth = calc_textwidth text
and allwidths = map calc_textwidth expanded_lines in and allwidths = map calc_textwidth expanded_lines in
@ -1807,7 +1857,7 @@ let unescape_string s =
let let
addtexts metrics linewidth outline fast fontname font embed bates batespad colour position linespacing addtexts metrics linewidth outline fast fontname font embed bates batespad colour position linespacing
fontsize underneath text pages orientation cropbox opacity justification fontsize underneath text pages orientation cropbox opacity justification
midline topline filename pdf midline topline filename extract_text_font_size pdf
= =
(*flprint "addtexts:\n"; (*flprint "addtexts:\n";
iter (Printf.printf "%C ") (explode text); iter (Printf.printf "%C ") (explode text);
@ -1883,6 +1933,7 @@ let
addtext metrics lines linewidth outline fast colour fontname addtext metrics lines linewidth outline fast colour fontname
embed bates batespad fontsize font underneath position hoff voff line embed bates batespad fontsize font underneath position hoff voff line
pages orientation cropbox opacity justification filename pages orientation cropbox opacity justification filename
extract_text_font_size
!pdf; !pdf;
voffset := !voffset +. (linespacing *. fontsize)) voffset := !voffset +. (linespacing *. fontsize))
lines; lines;
@ -3664,3 +3715,4 @@ let add_page_labels pdf style prefix startval range =
ranges; ranges;
Pdfpagelabels.write pdf !labels Pdfpagelabels.write pdf !labels

View File

@ -287,7 +287,7 @@ type justification =
| CentreJustify | CentreJustify
| RightJustify | RightJustify
(** [calculate ignore_d w (xmin, ymin, xmax, ymax) orientation pos] calculates (** [calculate_position ignore_d w (xmin, ymin, xmax, ymax) orientation pos] calculates
the absolute position of text given its width, bounding box, orientation and the absolute position of text given its width, bounding box, orientation and
position. If [ignore_d] is true, the distance from the position (e.g 10 in position. If [ignore_d] is true, the distance from the position (e.g 10 in
TopLeft 10) is ignored (considered zero). *) TopLeft 10) is ignored (considered zero). *)
@ -324,6 +324,7 @@ val addtexts :
bool ->(*midline adjust?*) bool ->(*midline adjust?*)
bool ->(*topline adjust?*) bool ->(*topline adjust?*)
string ->(*filename*) string ->(*filename*)
float option -> (*extract_text_font_size*)
Pdf.t ->(*pdf*) Pdf.t ->(*pdf*)
Pdf.t Pdf.t
@ -492,3 +493,4 @@ val call_cpdflin : string -> string -> string -> string -> int
val debug : bool ref val debug : bool ref
val extract_text : float option -> Pdf.t -> int list -> string

View File

@ -1109,15 +1109,11 @@ let setfont f =
end; end;
args.fontname <- f args.fontname <- f
(* Record the font size to which text extraction should be restricted. *)
let setextracttextfontsize size =
  args.extract_text_font_size <- Some size
let setfontsize f = let setfontsize f =
if f > 0. if f > 0. then args.fontsize <- f else error "Negative font size specified"
then
begin
args.fontsize <- f;
args.extract_text_font_size <- Some f
end
else
error "Negative font size specified"
let setaddtext s = let setaddtext s =
setop (AddText s) () setop (AddText s) ()
@ -2089,6 +2085,7 @@ and specs =
("-debug-crypt", Arg.Unit setdebugcrypt, ""); ("-debug-crypt", Arg.Unit setdebugcrypt, "");
("-fix-prince", Arg.Unit (setop RemoveUnusedResources), ""); ("-fix-prince", Arg.Unit (setop RemoveUnusedResources), "");
("-extract-text", Arg.Unit (setop ExtractText), ""); ("-extract-text", Arg.Unit (setop ExtractText), "");
("-extract-text-font-size", Arg.Float setextracttextfontsize, "");
(*("-change-font-size-to", Arg.Float setchangefontsizeto, ""); (*("-change-font-size-to", Arg.Float setchangefontsizeto, "");
("-change-font-size-shift", Arg.String setchangefontsizeshift, ""); ("-change-font-size-shift", Arg.String setchangefontsizeshift, "");
("-change-font-size-color", Arg.String setchangefontsizecolor, "")*) ("-change-font-size-color", Arg.String setchangefontsizecolor, "")*)
@ -3039,55 +3036,6 @@ let remove_unused_resources_page pdf n page =
let remove_unused_resources pdf = let remove_unused_resources pdf =
Cpdf.process_pages (remove_unused_resources_page pdf) pdf (ilist 1 (Pdfpage.endpage pdf)) Cpdf.process_pages (remove_unused_resources_page pdf) pdf (ilist 1 (Pdfpage.endpage pdf))
(* Extract the text shown on one page, as a single UTF-8 string.

   [only_fontsize] restricts extraction to text shown while the size given by
   the most recent Tf operator equals the supplied value (compared with exact
   float equality). [None] places no restriction, so all text is extracted.
   The third argument is ignored.

   Text shown before any Tf operator has selected a font is skipped, because
   there is no extractor to decode it with. Elements of a TJ array which are
   not strings are skipped.

   Raises [Pdf.PDFError] if the page resources have no /Font entry, or if the
   font named by a Tf operator cannot be found in it. *)
let extract_page_text only_fontsize pdf _ page =
  let text_extractor = ref None in
  let right_font_size = ref false in
  (* Accumulate into a buffer rather than repeatedly concatenating strings. *)
  let extracted = Buffer.create 256 in
  let add_decoded extractor text =
    Buffer.add_string extracted
      (Pdftext.utf8_of_codepoints (Pdftext.codepoints_of_text extractor text))
  in
  let process_op = function
    | Pdfops.Op_Tf (fontname, fontsize) ->
        right_font_size :=
          begin match only_fontsize with
          | Some x -> x = fontsize
          (* No size requested: every font size is acceptable. *)
          | None -> true
          end;
        let fontdict =
          match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
          | None -> raise (Pdf.PDFError "Missing /Font in text extraction")
          | Some fonts ->
              begin match Pdf.lookup_direct pdf fontname fonts with
              | None -> raise (Pdf.PDFError "Missing font in text extraction")
              | Some d -> d
              end
        in
          text_extractor := Some (Pdftext.text_extractor_of_font pdf fontdict)
    | Pdfops.Op_Tj text ->
        begin match !text_extractor with
        | Some extractor when !right_font_size -> add_decoded extractor text
        | _ -> ()
        end
    | Pdfops.Op_TJ (Pdf.Array objs) ->
        begin match !text_extractor with
        | Some extractor when !right_font_size ->
            List.iter
              (function
               | Pdf.String text -> add_decoded extractor text
               | _ -> ())
              objs
        | _ -> ()
        end
    | _ -> ()
  in
    (* Operators must be visited in content order: Tf updates the state which
       later Tj / TJ operators depend upon. *)
    List.iter
      process_op
      (Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content);
    Buffer.contents extracted
(* Extract the text of each page in the range, in order, using the font size
   restriction held in [args.extract_text_font_size]. Pages which yield no
   text are dropped, and the remaining page texts are joined with a single
   newline between each pair. *)
let extract_text pdf range =
  let page_texts =
    Cpdf.map_pages (extract_page_text args.extract_text_font_size pdf) pdf range
  in
    String.concat "\n"
      (List.filter (fun page_text -> page_text <> "") page_texts)
(* Extracts font to font.dat in CWD. *) (* Extracts font to font.dat in CWD. *)
let extract_fontfile pagenumber fontname pdf = let extract_fontfile pagenumber fontname pdf =
let resources = (select pagenumber (Pdfpage.pages_of_pagetree pdf)).Pdfpage.resources in let resources = (select pagenumber (Pdfpage.pages_of_pagetree pdf)).Pdfpage.resources in
@ -3831,7 +3779,8 @@ let go () =
font args.embedfonts args.bates args.batespad args.color args.position font args.embedfonts args.bates args.batespad args.color args.position
args.linespacing args.fontsize args.underneath text range args.linespacing args.fontsize args.underneath text range
args.orientation args.relative_to_cropbox args.opacity args.orientation args.relative_to_cropbox args.opacity
args.justification args.midline args.topline filename pdf) args.justification args.midline args.topline filename
args.extract_text_font_size pdf)
| Some RemoveText -> | Some RemoveText ->
let pdf = get_single_pdf args.op false in let pdf = get_single_pdf args.op false in
let range = parse_pagespec pdf (get_pagespec ()) in let range = parse_pagespec pdf (get_pagespec ()) in
@ -3951,7 +3900,7 @@ let go () =
| Some ExtractText -> | Some ExtractText ->
let pdf = get_single_pdf args.op true in let pdf = get_single_pdf args.op true in
let range = parse_pagespec pdf (get_pagespec ()) in let range = parse_pagespec pdf (get_pagespec ()) in
let text = extract_text pdf range in let text = Cpdf.extract_text args.extract_text_font_size pdf range in
begin match args.out with begin match args.out with
| File filename -> | File filename ->
let fh = open_out_bin filename in let fh = open_out_bin filename in