Added %ExtractedText
This commit is contained in:
parent
c7accd12c8
commit
9d34594f13
92
cpdf.ml
92
cpdf.ml
|
@ -531,19 +531,16 @@ let print_pdf_objs pdf =
|
||||||
Printf.printf "%s\n" (Pdfwrite.string_of_pdf obj))
|
Printf.printf "%s\n" (Pdfwrite.string_of_pdf obj))
|
||||||
pdf
|
pdf
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
(* [pagelabel pdf num] gives the text of the page label for page [num] of
   [pdf]. The labels read from the document are passed through
   [Pdfpagelabels.complete] before the lookup. *)
let pagelabel pdf num =
  let labels = Pdfpagelabels.complete (Pdfpagelabels.read pdf) in
  Pdfpagelabels.pagelabeltext_of_pagenumber num labels
|
||||||
|
|
||||||
|
|
||||||
(* [process_text text m] applies each (pattern, replacement) pair of [m] to
   [text] in list order via [string_replace_all_lazy], then passes the result
   through [Cpdfstrftime.strftime]. *)
let process_text text m =
  let substituted =
    List.fold_left
      (fun acc (pattern, replacement) ->
         string_replace_all_lazy pattern replacement acc)
      text m
  in
  Cpdfstrftime.strftime substituted
|
||||||
|
|
||||||
let expand_date = function
|
let expand_date = function
|
||||||
| "now" -> Cpdfstrftime.strftime "D:%Y%m%d%H%M%S"
|
| "now" -> Cpdfstrftime.strftime "D:%Y%m%d%H%M%S"
|
||||||
|
@ -1661,28 +1658,79 @@ let make_font embed fontname =
|
||||||
("/Encoding", Pdf.Name "/WinAnsiEncoding");
|
("/Encoding", Pdf.Name "/WinAnsiEncoding");
|
||||||
("/BaseFont", Pdf.Name ("/" ^ fontname))]
|
("/BaseFont", Pdf.Name ("/" ^ fontname))]
|
||||||
|
|
||||||
|
(* [extract_page_text only_fontsize pdf _ page] returns, as a single UTF-8
   string, the text shown by the Tj and TJ operators of [page], in the order
   the operators occur in the page content.

   [only_fontsize] filters by the size given to the most recent Tf operator:
   [Some s] keeps only text set at exactly size [s]; [None] applies no filter
   and keeps text at every size. (Previously [None] rejected every size, so
   nothing was ever extracted unless a size was supplied.)

   The third argument (ignored) lets the function be passed to [map_pages].

   Text operators seen before any Tf are skipped, since there is no font to
   decode them with.

   Raises [Pdf.PDFError] if the page resources have no /Font entry, or if a Tf
   operator names a font missing from it. *)
let extract_page_text only_fontsize pdf _ page =
  let buf = Buffer.create 256 in
  (* Extractor for the font chosen by the most recent Tf, if any. *)
  let text_extractor = ref None in
  (* Whether the most recent Tf selected a size accepted by [only_fontsize]. *)
  let right_font_size = ref false in
  let add_text extractor text =
    Buffer.add_string buf
      (Pdftext.utf8_of_codepoints (Pdftext.codepoints_of_text extractor text))
  in
  let process_op = function
    | Pdfops.Op_Tf (fontname, fontsize) ->
        right_font_size :=
          (match only_fontsize with
           | Some wanted -> wanted = fontsize
           | None -> true);
        let fontdict =
          match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
          | None -> raise (Pdf.PDFError "Missing /Font in text extraction")
          | Some fonts ->
              (match Pdf.lookup_direct pdf fontname fonts with
               | None -> raise (Pdf.PDFError "Missing font in text extraction")
               | Some d -> d)
        in
        text_extractor := Some (Pdftext.text_extractor_of_font pdf fontdict)
    | Pdfops.Op_Tj text ->
        (match !text_extractor with
         | Some extractor when !right_font_size -> add_text extractor text
         | _ -> ())
    | Pdfops.Op_TJ (Pdf.Array objs) ->
        (match !text_extractor with
         | Some extractor when !right_font_size ->
             (* Only the string elements of a TJ array carry text; the other
                elements are skipped. *)
             List.iter
               (function
                 | Pdf.String text -> add_text extractor text
                 | _ -> ())
               objs
         | _ -> ())
    | _ -> ()
  in
  (* The operators must be visited strictly in content order, because each Tf
     changes the state used to decode the text operators that follow it. *)
  List.iter process_op
    (Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content);
  Buffer.contents buf
|
||||||
|
|
||||||
|
(* [extract_text extract_text_font_size pdf range] extracts the text of each
   page in [range] with [extract_page_text] and joins the results with a
   newline between consecutive non-empty page texts. Pages yielding no text
   contribute nothing, so no blank lines are produced for them.

   The pieces are joined with [String.concat] rather than by repeated [^], so
   the cost is linear in the total length of the text. *)
let extract_text extract_text_font_size pdf range =
  let page_texts =
    map_pages (extract_page_text extract_text_font_size pdf) pdf range
  in
  let non_empty = List.filter (fun page_text -> page_text <> "") page_texts in
  String.concat "\n" non_empty
|
||||||
|
|
||||||
let addtext
|
let addtext
|
||||||
metrics lines linewidth outline fast colour fontname embed bates batespad fontsize font
|
metrics lines linewidth outline fast colour fontname embed bates batespad fontsize font
|
||||||
underneath position hoffset voffset text pages orientation cropbox opacity
|
underneath position hoffset voffset text pages orientation cropbox opacity
|
||||||
justification filename pdf
|
justification filename extract_text_font_size pdf
|
||||||
=
|
=
|
||||||
let endpage = Pdfpage.endpage pdf in
|
let endpage = Pdfpage.endpage pdf in
|
||||||
let replace_pairs pdf filename bates batespad num =
|
let replace_pairs pdf filename bates batespad num page =
|
||||||
["%Page", string_of_int num;
|
["%Page", (fun () -> string_of_int num);
|
||||||
"%Roman", roman_upper num;
|
"%Roman", (fun () -> roman_upper num);
|
||||||
"%roman", roman_lower num;
|
"%roman", (fun () -> roman_lower num);
|
||||||
"%filename", filename;
|
"%filename", (fun () -> filename);
|
||||||
"%Label", pagelabel pdf num;
|
"%Label", (fun () -> pagelabel pdf num);
|
||||||
"%EndPage", string_of_int endpage;
|
"%EndPage", (fun () -> string_of_int endpage);
|
||||||
"%EndLabel", pagelabel pdf endpage;
|
"%EndLabel", (fun () -> pagelabel pdf endpage);
|
||||||
|
"%ExtractedText", (fun () -> extract_page_text extract_text_font_size pdf num page);
|
||||||
"%Bates",
|
"%Bates",
|
||||||
(let numstring = string_of_int (bates + num - 1) in
|
(fun () ->
|
||||||
|
(let numstring = string_of_int (bates + num - 1) in
|
||||||
match batespad with
|
match batespad with
|
||||||
None -> numstring
|
None -> numstring
|
||||||
| Some w ->
|
| Some w ->
|
||||||
if String.length numstring >= w
|
if String.length numstring >= w
|
||||||
then numstring
|
then numstring
|
||||||
else implode (many '0' (w - String.length numstring)) ^ numstring)]
|
else implode (many '0' (w - String.length numstring)) ^ numstring))]
|
||||||
in
|
in
|
||||||
let addtext_page num page =
|
let addtext_page num page =
|
||||||
let resources', unique_extgstatename =
|
let resources', unique_extgstatename =
|
||||||
|
@ -1708,7 +1756,7 @@ let addtext
|
||||||
in
|
in
|
||||||
let unique_fontname = Pdf.unique_key "F" fontdict in
|
let unique_fontname = Pdf.unique_key "F" fontdict in
|
||||||
let ops =
|
let ops =
|
||||||
let text = process_text text (replace_pairs pdf filename bates batespad num) in
|
let text = process_text text (replace_pairs pdf filename bates batespad num page) in
|
||||||
let calc_textwidth text =
|
let calc_textwidth text =
|
||||||
match font with
|
match font with
|
||||||
| Some f ->
|
| Some f ->
|
||||||
|
@ -1734,8 +1782,10 @@ let addtext
|
||||||
(rawwidth *. fontsize) /. 1000.
|
(rawwidth *. fontsize) /. 1000.
|
||||||
in
|
in
|
||||||
let expanded_lines =
|
let expanded_lines =
|
||||||
map (function text -> process_text text (replace_pairs pdf
|
map
|
||||||
filename bates batespad num)) lines
|
(function text ->
|
||||||
|
process_text text (replace_pairs pdf filename bates batespad num page))
|
||||||
|
lines
|
||||||
in
|
in
|
||||||
let textwidth = calc_textwidth text
|
let textwidth = calc_textwidth text
|
||||||
and allwidths = map calc_textwidth expanded_lines in
|
and allwidths = map calc_textwidth expanded_lines in
|
||||||
|
@ -1807,7 +1857,7 @@ let unescape_string s =
|
||||||
let
|
let
|
||||||
addtexts metrics linewidth outline fast fontname font embed bates batespad colour position linespacing
|
addtexts metrics linewidth outline fast fontname font embed bates batespad colour position linespacing
|
||||||
fontsize underneath text pages orientation cropbox opacity justification
|
fontsize underneath text pages orientation cropbox opacity justification
|
||||||
midline topline filename pdf
|
midline topline filename extract_text_font_size pdf
|
||||||
=
|
=
|
||||||
(*flprint "addtexts:\n";
|
(*flprint "addtexts:\n";
|
||||||
iter (Printf.printf "%C ") (explode text);
|
iter (Printf.printf "%C ") (explode text);
|
||||||
|
@ -1883,6 +1933,7 @@ let
|
||||||
addtext metrics lines linewidth outline fast colour fontname
|
addtext metrics lines linewidth outline fast colour fontname
|
||||||
embed bates batespad fontsize font underneath position hoff voff line
|
embed bates batespad fontsize font underneath position hoff voff line
|
||||||
pages orientation cropbox opacity justification filename
|
pages orientation cropbox opacity justification filename
|
||||||
|
extract_text_font_size
|
||||||
!pdf;
|
!pdf;
|
||||||
voffset := !voffset +. (linespacing *. fontsize))
|
voffset := !voffset +. (linespacing *. fontsize))
|
||||||
lines;
|
lines;
|
||||||
|
@ -3664,3 +3715,4 @@ let add_page_labels pdf style prefix startval range =
|
||||||
ranges;
|
ranges;
|
||||||
Pdfpagelabels.write pdf !labels
|
Pdfpagelabels.write pdf !labels
|
||||||
|
|
||||||
|
|
||||||
|
|
4
cpdf.mli
4
cpdf.mli
|
@ -287,7 +287,7 @@ type justification =
|
||||||
| CentreJustify
|
| CentreJustify
|
||||||
| RightJustify
|
| RightJustify
|
||||||
|
|
||||||
(** [calculate ignore_d w (xmin, ymin, xmax, ymax) orientation pos] calculates
|
(** [calculate_position ignore_d w (xmin, ymin, xmax, ymax) orientation pos] calculates
|
||||||
the absolute position of text given its width, bounding box, orientation and
|
the absolute position of text given its width, bounding box, orientation and
|
||||||
position. If [ignore_d] is true, the distance from the position (e.g 10 in
|
position. If [ignore_d] is true, the distance from the position (e.g 10 in
|
||||||
TopLeft 10) is ignored (considered zero). *)
|
TopLeft 10) is ignored (considered zero). *)
|
||||||
|
@ -324,6 +324,7 @@ val addtexts :
|
||||||
bool ->(*midline adjust?*)
|
bool ->(*midline adjust?*)
|
||||||
bool ->(*topline adjust?*)
|
bool ->(*topline adjust?*)
|
||||||
string ->(*filename*)
|
string ->(*filename*)
|
||||||
|
float option -> (*extract_text_font_size*)
|
||||||
Pdf.t ->(*pdf*)
|
Pdf.t ->(*pdf*)
|
||||||
Pdf.t
|
Pdf.t
|
||||||
|
|
||||||
|
@ -492,3 +493,4 @@ val call_cpdflin : string -> string -> string -> string -> int
|
||||||
|
|
||||||
val debug : bool ref
|
val debug : bool ref
|
||||||
|
|
||||||
|
(** [extract_text font_size pdf range] extracts the text of each page of [pdf]
    listed in [range] and joins the non-empty page texts with newlines.
    [font_size] is handed to the per-page extractor as its font-size filter. *)
val extract_text : float option -> Pdf.t -> int list -> string
|
||||||
|
|
|
@ -1109,15 +1109,11 @@ let setfont f =
|
||||||
end;
|
end;
|
||||||
args.fontname <- f
|
args.fontname <- f
|
||||||
|
|
||||||
|
(* Handler for -extract-text-font-size: store the given size in
   [args.extract_text_font_size]. *)
let setextracttextfontsize size =
  args.extract_text_font_size <- Some size
|
||||||
|
|
||||||
(* Store the font size [f] in [args.fontsize]. Any value that is not strictly
   greater than zero (zero included) is rejected through [error] with the
   "Negative font size specified" message. *)
let setfontsize f =
  (* Written as [not (f > 0.)] rather than [f <= 0.] so that a NaN is
     rejected as well. *)
  if not (f > 0.)
  then error "Negative font size specified"
  else args.fontsize <- f
|
|
||||||
|
|
||||||
let setaddtext s =
|
let setaddtext s =
|
||||||
setop (AddText s) ()
|
setop (AddText s) ()
|
||||||
|
@ -2089,6 +2085,7 @@ and specs =
|
||||||
("-debug-crypt", Arg.Unit setdebugcrypt, "");
|
("-debug-crypt", Arg.Unit setdebugcrypt, "");
|
||||||
("-fix-prince", Arg.Unit (setop RemoveUnusedResources), "");
|
("-fix-prince", Arg.Unit (setop RemoveUnusedResources), "");
|
||||||
("-extract-text", Arg.Unit (setop ExtractText), "");
|
("-extract-text", Arg.Unit (setop ExtractText), "");
|
||||||
|
("-extract-text-font-size", Arg.Float setextracttextfontsize, "");
|
||||||
(*("-change-font-size-to", Arg.Float setchangefontsizeto, "");
|
(*("-change-font-size-to", Arg.Float setchangefontsizeto, "");
|
||||||
("-change-font-size-shift", Arg.String setchangefontsizeshift, "");
|
("-change-font-size-shift", Arg.String setchangefontsizeshift, "");
|
||||||
("-change-font-size-color", Arg.String setchangefontsizecolor, "")*)
|
("-change-font-size-color", Arg.String setchangefontsizecolor, "")*)
|
||||||
|
@ -3039,55 +3036,6 @@ let remove_unused_resources_page pdf n page =
|
||||||
(* Run [remove_unused_resources_page] over every page of [pdf], from page 1 to
   the document's last page. *)
let remove_unused_resources pdf =
  let all_pages = ilist 1 (Pdfpage.endpage pdf) in
  Cpdf.process_pages (remove_unused_resources_page pdf) pdf all_pages
|
||||||
|
|
||||||
let extract_page_text only_fontsize pdf _ page =
|
|
||||||
let text_extractor = ref None in
|
|
||||||
let right_font_size = ref false in
|
|
||||||
fold_left ( ^ ) ""
|
|
||||||
(map
|
|
||||||
(function
|
|
||||||
| Pdfops.Op_Tf (fontname, fontsize) ->
|
|
||||||
right_font_size :=
|
|
||||||
begin match only_fontsize with
|
|
||||||
Some x -> x = fontsize
|
|
||||||
| _ -> false
|
|
||||||
end;
|
|
||||||
let fontdict =
|
|
||||||
match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
|
|
||||||
| None -> raise (Pdf.PDFError "Missing /Font in text extraction")
|
|
||||||
| Some d ->
|
|
||||||
match Pdf.lookup_direct pdf fontname d with
|
|
||||||
| None -> raise (Pdf.PDFError "Missing font in text extraction")
|
|
||||||
| Some d -> d
|
|
||||||
in
|
|
||||||
text_extractor := Some (Pdftext.text_extractor_of_font pdf fontdict);
|
|
||||||
""
|
|
||||||
| Pdfops.Op_Tj text when !text_extractor <> None ->
|
|
||||||
if not !right_font_size then
|
|
||||||
""
|
|
||||||
else
|
|
||||||
Pdftext.utf8_of_codepoints
|
|
||||||
(Pdftext.codepoints_of_text (unopt !text_extractor) text)
|
|
||||||
| Pdfops.Op_TJ (Pdf.Array objs) when !text_extractor <> None ->
|
|
||||||
if not !right_font_size then
|
|
||||||
""
|
|
||||||
else
|
|
||||||
fold_left ( ^ ) ""
|
|
||||||
(option_map
|
|
||||||
(function
|
|
||||||
| Pdf.String text ->
|
|
||||||
Some
|
|
||||||
(Pdftext.utf8_of_codepoints
|
|
||||||
(Pdftext.codepoints_of_text (unopt !text_extractor) text))
|
|
||||||
| _ -> None)
|
|
||||||
objs)
|
|
||||||
| _ -> "")
|
|
||||||
(Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content))
|
|
||||||
|
|
||||||
(* For each page, extract all the ops with text in them, and concatenate it all together *)
|
|
||||||
let extract_text pdf range =
|
|
||||||
fold_left (fun x y -> x ^ (if x <> "" && y <> "" then "\n" else "") ^ y) ""
|
|
||||||
(Cpdf.map_pages (extract_page_text args.extract_text_font_size pdf) pdf range)
|
|
||||||
|
|
||||||
(* Extracts font to font.dat in CWD. *)
|
(* Extracts font to font.dat in CWD. *)
|
||||||
let extract_fontfile pagenumber fontname pdf =
|
let extract_fontfile pagenumber fontname pdf =
|
||||||
let resources = (select pagenumber (Pdfpage.pages_of_pagetree pdf)).Pdfpage.resources in
|
let resources = (select pagenumber (Pdfpage.pages_of_pagetree pdf)).Pdfpage.resources in
|
||||||
|
@ -3831,7 +3779,8 @@ let go () =
|
||||||
font args.embedfonts args.bates args.batespad args.color args.position
|
font args.embedfonts args.bates args.batespad args.color args.position
|
||||||
args.linespacing args.fontsize args.underneath text range
|
args.linespacing args.fontsize args.underneath text range
|
||||||
args.orientation args.relative_to_cropbox args.opacity
|
args.orientation args.relative_to_cropbox args.opacity
|
||||||
args.justification args.midline args.topline filename pdf)
|
args.justification args.midline args.topline filename
|
||||||
|
args.extract_text_font_size pdf)
|
||||||
| Some RemoveText ->
|
| Some RemoveText ->
|
||||||
let pdf = get_single_pdf args.op false in
|
let pdf = get_single_pdf args.op false in
|
||||||
let range = parse_pagespec pdf (get_pagespec ()) in
|
let range = parse_pagespec pdf (get_pagespec ()) in
|
||||||
|
@ -3951,7 +3900,7 @@ let go () =
|
||||||
| Some ExtractText ->
|
| Some ExtractText ->
|
||||||
let pdf = get_single_pdf args.op true in
|
let pdf = get_single_pdf args.op true in
|
||||||
let range = parse_pagespec pdf (get_pagespec ()) in
|
let range = parse_pagespec pdf (get_pagespec ()) in
|
||||||
let text = extract_text pdf range in
|
let text = Cpdf.extract_text args.extract_text_font_size pdf range in
|
||||||
begin match args.out with
|
begin match args.out with
|
||||||
| File filename ->
|
| File filename ->
|
||||||
let fh = open_out_bin filename in
|
let fh = open_out_bin filename in
|
||||||
|
|
Loading…
Reference in New Issue