Added %ExtractedText

This commit is contained in:
John Whitington 2016-11-13 14:02:09 +00:00
parent c7accd12c8
commit 9d34594f13
3 changed files with 83 additions and 80 deletions

92
cpdf.ml
View File

@ -531,19 +531,16 @@ let print_pdf_objs pdf =
Printf.printf "%s\n" (Pdfwrite.string_of_pdf obj))
pdf
(* [pagelabel pdf num] gives the page label text for page [num] of [pdf].
   The labels are read from the document and completed before the lookup;
   per the original note, a page with no label yields its arabic page number. *)
let pagelabel pdf num =
  let labels = Pdfpagelabels.complete (Pdfpagelabels.read pdf) in
  Pdfpagelabels.pagelabeltext_of_pagenumber num labels
(* [process_text text m] works through the association list [m] from head to
   tail. For each pair [(s, r)] it rewrites [text] with the replace-all helper
   (presumably substituting occurrences of the placeholder [s] using [r] --
   confirm against the helper's definition). Once the list is exhausted the
   result is passed through [Cpdfstrftime.strftime]. *)
let rec process_text text m =
match m with
| ([] : (string * string) list) -> Cpdfstrftime.strftime text
| (s, r)::t -> process_text (string_replace_all s r text) t
| [] -> Cpdfstrftime.strftime text
| (s, r)::t -> process_text (string_replace_all_lazy s r text) t
let expand_date = function
| "now" -> Cpdfstrftime.strftime "D:%Y%m%d%H%M%S"
@ -1661,28 +1658,79 @@ let make_font embed fontname =
("/Encoding", Pdf.Name "/WinAnsiEncoding");
("/BaseFont", Pdf.Name ("/" ^ fontname))]
(* [extract_page_text only_fontsize pdf _ page] returns, as one UTF-8 string,
   the text shown by the Tj and TJ operators in the content of [page].

   [only_fontsize] filters by the size given to the most recent Tf operator:
   with [Some size] only text shown while a font of exactly that size is
   selected is kept; with [None] no filtering is applied and all text is kept.
   The third argument is ignored; it lets the function be used where a page
   number is supplied alongside the page.

   Raises [Pdf.PDFError] when a Tf operator is met and the page resources have
   no /Font dictionary, or the named font is not in it. *)
let extract_page_text only_fontsize pdf _ page =
  let text_extractor = ref None in
  let right_font_size = ref false in
  (* Output is accumulated in a buffer rather than by repeated [^]. *)
  let out = Buffer.create 256 in
  let add_text extractor text =
    Buffer.add_string out
      (Pdftext.utf8_of_codepoints (Pdftext.codepoints_of_text extractor text))
  in
  let process_op = function
    | Pdfops.Op_Tf (fontname, fontsize) ->
        (* No requested size means every size is acceptable. *)
        right_font_size :=
          begin match only_fontsize with
          | Some x -> x = fontsize
          | None -> true
          end;
        let fontdict =
          match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
          | None -> raise (Pdf.PDFError "Missing /Font in text extraction")
          | Some fonts ->
              begin match Pdf.lookup_direct pdf fontname fonts with
              | None -> raise (Pdf.PDFError "Missing font in text extraction")
              | Some d -> d
              end
        in
        text_extractor := Some (Pdftext.text_extractor_of_font pdf fontdict)
    | Pdfops.Op_Tj text ->
        begin match !text_extractor with
        | Some extractor when !right_font_size -> add_text extractor text
        | _ -> ()
        end
    | Pdfops.Op_TJ (Pdf.Array objs) ->
        begin match !text_extractor with
        | Some extractor when !right_font_size ->
            (* Only the string elements of a TJ array carry text. *)
            List.iter
              (function
               | Pdf.String text -> add_text extractor text
               | _ -> ())
              objs
        | _ -> ()
        end
    | _ -> ()
  in
  (* The font state above depends on seeing the operators in content order,
     so iterate explicitly from first to last. *)
  List.iter process_op
    (Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content);
  Buffer.contents out
(* Extract the text of every page in [range] with [extract_page_text], drop
   the pages which produced no text, and join the rest with a newline between
   consecutive page texts. *)
let extract_text extract_text_font_size pdf range =
  let page_texts =
    map_pages (extract_page_text extract_text_font_size pdf) pdf range
  in
  String.concat "\n" (List.filter (fun page_text -> page_text <> "") page_texts)
let addtext
metrics lines linewidth outline fast colour fontname embed bates batespad fontsize font
underneath position hoffset voffset text pages orientation cropbox opacity
justification filename pdf
justification filename extract_text_font_size pdf
=
let endpage = Pdfpage.endpage pdf in
let replace_pairs pdf filename bates batespad num =
["%Page", string_of_int num;
"%Roman", roman_upper num;
"%roman", roman_lower num;
"%filename", filename;
"%Label", pagelabel pdf num;
"%EndPage", string_of_int endpage;
"%EndLabel", pagelabel pdf endpage;
let replace_pairs pdf filename bates batespad num page =
["%Page", (fun () -> string_of_int num);
"%Roman", (fun () -> roman_upper num);
"%roman", (fun () -> roman_lower num);
"%filename", (fun () -> filename);
"%Label", (fun () -> pagelabel pdf num);
"%EndPage", (fun () -> string_of_int endpage);
"%EndLabel", (fun () -> pagelabel pdf endpage);
"%ExtractedText", (fun () -> extract_page_text extract_text_font_size pdf num page);
"%Bates",
(let numstring = string_of_int (bates + num - 1) in
(fun () ->
(let numstring = string_of_int (bates + num - 1) in
match batespad with
None -> numstring
| Some w ->
if String.length numstring >= w
then numstring
else implode (many '0' (w - String.length numstring)) ^ numstring)]
else implode (many '0' (w - String.length numstring)) ^ numstring))]
in
let addtext_page num page =
let resources', unique_extgstatename =
@ -1708,7 +1756,7 @@ let addtext
in
let unique_fontname = Pdf.unique_key "F" fontdict in
let ops =
let text = process_text text (replace_pairs pdf filename bates batespad num) in
let text = process_text text (replace_pairs pdf filename bates batespad num page) in
let calc_textwidth text =
match font with
| Some f ->
@ -1734,8 +1782,10 @@ let addtext
(rawwidth *. fontsize) /. 1000.
in
let expanded_lines =
map (function text -> process_text text (replace_pairs pdf
filename bates batespad num)) lines
map
(function text ->
process_text text (replace_pairs pdf filename bates batespad num page))
lines
in
let textwidth = calc_textwidth text
and allwidths = map calc_textwidth expanded_lines in
@ -1807,7 +1857,7 @@ let unescape_string s =
let
addtexts metrics linewidth outline fast fontname font embed bates batespad colour position linespacing
fontsize underneath text pages orientation cropbox opacity justification
midline topline filename pdf
midline topline filename extract_text_font_size pdf
=
(*flprint "addtexts:\n";
iter (Printf.printf "%C ") (explode text);
@ -1883,6 +1933,7 @@ let
addtext metrics lines linewidth outline fast colour fontname
embed bates batespad fontsize font underneath position hoff voff line
pages orientation cropbox opacity justification filename
extract_text_font_size
!pdf;
voffset := !voffset +. (linespacing *. fontsize))
lines;
@ -3664,3 +3715,4 @@ let add_page_labels pdf style prefix startval range =
ranges;
Pdfpagelabels.write pdf !labels

View File

@ -287,7 +287,7 @@ type justification =
| CentreJustify
| RightJustify
(** [calculate ignore_d w (xmin, ymin, xmax, ymax) orientation pos] calculates
(** [calculate_position ignore_d w (xmin, ymin, xmax, ymax) orientation pos] calculates
the absolute position of text given its width, bounding box, orientation and
position. If [ignore_d] is true, the distance from the position (e.g 10 in
TopLeft 10) is ignored (considered zero). *)
@ -324,6 +324,7 @@ val addtexts :
bool ->(*midline adjust?*)
bool ->(*topline adjust?*)
string ->(*filename*)
float option -> (*extract_text_font_size*)
Pdf.t ->(*pdf*)
Pdf.t
@ -492,3 +493,4 @@ val call_cpdflin : string -> string -> string -> string -> int
val debug : bool ref
(** [extract_text font_size pdf range] extracts text from the pages of [pdf]
    selected by [range] and returns it as a single string, with a newline
    between the text of consecutive pages which yielded any text. With
    [Some size], only text shown while a font of exactly that size is selected
    is extracted. *)
val extract_text : float option -> Pdf.t -> int list -> string

View File

@ -1109,15 +1109,11 @@ let setfont f =
end;
args.fontname <- f
(* Handler for -extract-text-font-size: record the requested font size in
   [args.extract_text_font_size]. *)
let setextracttextfontsize size =
  args.extract_text_font_size <- Some size
(* Store [f] as the font size in [args.fontsize]. Only strictly positive
   values are accepted; any other value is reported through [error] with the
   message below. *)
let setfontsize f =
if f > 0.
then
begin
args.fontsize <- f;
args.extract_text_font_size <- Some f
end
else
error "Negative font size specified"
if f > 0. then args.fontsize <- f else error "Negative font size specified"
let setaddtext s =
setop (AddText s) ()
@ -2089,6 +2085,7 @@ and specs =
("-debug-crypt", Arg.Unit setdebugcrypt, "");
("-fix-prince", Arg.Unit (setop RemoveUnusedResources), "");
("-extract-text", Arg.Unit (setop ExtractText), "");
("-extract-text-font-size", Arg.Float setextracttextfontsize, "");
(*("-change-font-size-to", Arg.Float setchangefontsizeto, "");
("-change-font-size-shift", Arg.String setchangefontsizeshift, "");
("-change-font-size-color", Arg.String setchangefontsizecolor, "")*)
@ -3039,55 +3036,6 @@ let remove_unused_resources_page pdf n page =
(* Run [remove_unused_resources_page] over every page of [pdf], from page 1
   up to the last page. *)
let remove_unused_resources pdf =
  let all_pages = ilist 1 (Pdfpage.endpage pdf) in
  Cpdf.process_pages (remove_unused_resources_page pdf) pdf all_pages
(* [extract_page_text only_fontsize pdf _ page] returns, as one UTF-8 string,
   the text shown by the Tj and TJ operators in the content of [page].

   [only_fontsize] filters by the size given to the most recent Tf operator:
   with [Some size] only text shown while a font of exactly that size is
   selected is kept; with [None] no filtering is applied and all text is kept.
   The third argument is ignored; it lets the function be used where a page
   number is supplied alongside the page.

   Raises [Pdf.PDFError] when a Tf operator is met and the page resources have
   no /Font dictionary, or the named font is not in it. *)
let extract_page_text only_fontsize pdf _ page =
  let text_extractor = ref None in
  let right_font_size = ref false in
  (* Output is accumulated in a buffer rather than by repeated [^]. *)
  let out = Buffer.create 256 in
  let add_text extractor text =
    Buffer.add_string out
      (Pdftext.utf8_of_codepoints (Pdftext.codepoints_of_text extractor text))
  in
  let process_op = function
    | Pdfops.Op_Tf (fontname, fontsize) ->
        (* No requested size means every size is acceptable. *)
        right_font_size :=
          begin match only_fontsize with
          | Some x -> x = fontsize
          | None -> true
          end;
        let fontdict =
          match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
          | None -> raise (Pdf.PDFError "Missing /Font in text extraction")
          | Some fonts ->
              begin match Pdf.lookup_direct pdf fontname fonts with
              | None -> raise (Pdf.PDFError "Missing font in text extraction")
              | Some d -> d
              end
        in
        text_extractor := Some (Pdftext.text_extractor_of_font pdf fontdict)
    | Pdfops.Op_Tj text ->
        begin match !text_extractor with
        | Some extractor when !right_font_size -> add_text extractor text
        | _ -> ()
        end
    | Pdfops.Op_TJ (Pdf.Array objs) ->
        begin match !text_extractor with
        | Some extractor when !right_font_size ->
            (* Only the string elements of a TJ array carry text. *)
            List.iter
              (function
               | Pdf.String text -> add_text extractor text
               | _ -> ())
              objs
        | _ -> ()
        end
    | _ -> ()
  in
  (* The font state above depends on seeing the operators in content order,
     so iterate explicitly from first to last. *)
  List.iter process_op
    (Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content);
  Buffer.contents out
(* Extract the text of every page in [range] with [extract_page_text], using
   the font size held in [args.extract_text_font_size]. Pages which produced
   no text are dropped and the rest are joined with a newline between
   consecutive page texts. *)
let extract_text pdf range =
  let page_texts =
    Cpdf.map_pages (extract_page_text args.extract_text_font_size pdf) pdf range
  in
  String.concat "\n" (List.filter (fun page_text -> page_text <> "") page_texts)
(* Extracts font to font.dat in CWD. *)
let extract_fontfile pagenumber fontname pdf =
let resources = (select pagenumber (Pdfpage.pages_of_pagetree pdf)).Pdfpage.resources in
@ -3831,7 +3779,8 @@ let go () =
font args.embedfonts args.bates args.batespad args.color args.position
args.linespacing args.fontsize args.underneath text range
args.orientation args.relative_to_cropbox args.opacity
args.justification args.midline args.topline filename pdf)
args.justification args.midline args.topline filename
args.extract_text_font_size pdf)
| Some RemoveText ->
let pdf = get_single_pdf args.op false in
let range = parse_pagespec pdf (get_pagespec ()) in
@ -3951,7 +3900,7 @@ let go () =
| Some ExtractText ->
let pdf = get_single_pdf args.op true in
let range = parse_pagespec pdf (get_pagespec ()) in
let text = extract_text pdf range in
let text = Cpdf.extract_text args.extract_text_font_size pdf range in
begin match args.out with
| File filename ->
let fh = open_out_bin filename in