Added %ExtractedText

This commit is contained in:
John Whitington
2016-11-13 14:02:09 +00:00
parent c7accd12c8
commit 9d34594f13
3 changed files with 83 additions and 80 deletions

View File

@@ -1109,15 +1109,11 @@ let setfont f =
end;
args.fontname <- f
(* Handler for the -extract-text-font-size option: store [f] in
   [args.extract_text_font_size], the value later handed to text
   extraction as its font size argument. *)
let setextracttextfontsize f =
  let size = Some f in
  args.extract_text_font_size <- size
(* Record the font size [f] in [args.fontsize]. Any [f] that is not strictly
   greater than zero (so zero as well as negative values) is rejected by
   calling [error] with "Negative font size specified".
   NOTE(review): two bodies appear back to back below -- a multi-line
   [begin ... end] form that also stores [Some f] in
   [args.extract_text_font_size], and a one-line form that only sets
   [args.fontsize]. This looks like the before/after text of a change shown
   together; confirm which one is live before relying on either. *)
let setfontsize f =
if f > 0.
then
begin
args.fontsize <- f;
args.extract_text_font_size <- Some f
end
else
error "Negative font size specified"
if f > 0. then args.fontsize <- f else error "Negative font size specified"
(* Build an [AddText] operation carrying the string [s] and hand it to
   [setop]. *)
let setaddtext s =
  let op = AddText s in
  setop op ()
@@ -2089,6 +2085,7 @@ and specs =
("-debug-crypt", Arg.Unit setdebugcrypt, "");
("-fix-prince", Arg.Unit (setop RemoveUnusedResources), "");
("-extract-text", Arg.Unit (setop ExtractText), "");
("-extract-text-font-size", Arg.Float setextracttextfontsize, "");
(*("-change-font-size-to", Arg.Float setchangefontsizeto, "");
("-change-font-size-shift", Arg.String setchangefontsizeshift, "");
("-change-font-size-color", Arg.String setchangefontsizecolor, "")*)
@@ -3039,55 +3036,6 @@ let remove_unused_resources_page pdf n page =
(* Run [remove_unused_resources_page] over the document through
   [Cpdf.process_pages], using the page range built by
   [ilist 1 (Pdfpage.endpage pdf)]. *)
let remove_unused_resources pdf =
  let all_pages = ilist 1 (Pdfpage.endpage pdf) in
  Cpdf.process_pages (remove_unused_resources_page pdf) pdf all_pages
(* Extract the text of one page as a single UTF-8 string.

   The page's content stream is parsed into operators, which are visited in
   order:
   - [Op_Tf] selects a font: its dictionary is looked up under /Font in the
     page resources and a text extractor is built from it. It also records
     whether the operator's font size equals the one in [only_fontsize].
   - [Op_Tj] and [Op_TJ] contribute their decoded strings, but only when an
     extractor has been set and the size recorded by the last [Op_Tf]
     matched. Non-string members of a [TJ] array are ignored.
   - Every other operator contributes nothing.

   The third argument is ignored.

   Raises [Pdf.PDFError] when the page resources have no /Font entry, or the
   font named by an [Op_Tf] is not present in it.

   NOTE(review): when [only_fontsize] is [None] the size test is always
   false, so the result is always the empty string. Confirm that this is
   intended rather than "no filter". *)
let extract_page_text only_fontsize pdf _ page =
  (* Extractor for the font chosen by the most recent Tf operator. *)
  let current_extractor = ref None in
  (* Did the most recent Tf operator use the requested font size? *)
  let size_matches = ref false in
  let lookup_font fontname =
    match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
    | None -> raise (Pdf.PDFError "Missing /Font in text extraction")
    | Some fonts ->
        begin match Pdf.lookup_direct pdf fontname fonts with
        | None -> raise (Pdf.PDFError "Missing font in text extraction")
        | Some fontdict -> fontdict
        end
  in
  let decode extractor text =
    Pdftext.utf8_of_codepoints (Pdftext.codepoints_of_text extractor text)
  in
  let text_of_op op =
    match op, !current_extractor with
    | Pdfops.Op_Tf (fontname, fontsize), _ ->
        (* Update the size flag first, then the extractor, as a pair. *)
        size_matches :=
          (match only_fontsize with
           | None -> false
           | Some wanted -> wanted = fontsize);
        let fontdict = lookup_font fontname in
        current_extractor := Some (Pdftext.text_extractor_of_font pdf fontdict);
        ""
    | Pdfops.Op_Tj text, Some extractor ->
        if !size_matches then decode extractor text else ""
    | Pdfops.Op_TJ (Pdf.Array objs), Some extractor ->
        if !size_matches then
          let pieces =
            option_map
              (function
                | Pdf.String text -> Some (decode extractor text)
                | _ -> None)
              objs
          in
          fold_left ( ^ ) "" pieces
        else
          ""
    | _ -> ""
  in
  let ops =
    Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content
  in
  fold_left ( ^ ) "" (map text_of_op ops)
(* Extract the text of every page in [range] with [extract_page_text],
   filtered by [args.extract_text_font_size], and join the per-page strings.
   A single newline is inserted between two pieces only when both the text
   accumulated so far and the next page's text are non-empty. *)
let extract_text pdf range =
  let join_pages acc page_text =
    let separator = if acc = "" || page_text = "" then "" else "\n" in
    acc ^ separator ^ page_text
  in
  let per_page =
    Cpdf.map_pages (extract_page_text args.extract_text_font_size pdf) pdf range
  in
  fold_left join_pages "" per_page
(* Extracts font to font.dat in CWD. *)
let extract_fontfile pagenumber fontname pdf =
let resources = (select pagenumber (Pdfpage.pages_of_pagetree pdf)).Pdfpage.resources in
@@ -3831,7 +3779,8 @@ let go () =
font args.embedfonts args.bates args.batespad args.color args.position
args.linespacing args.fontsize args.underneath text range
args.orientation args.relative_to_cropbox args.opacity
args.justification args.midline args.topline filename pdf)
args.justification args.midline args.topline filename
args.extract_text_font_size pdf)
| Some RemoveText ->
let pdf = get_single_pdf args.op false in
let range = parse_pagespec pdf (get_pagespec ()) in
@@ -3951,7 +3900,7 @@ let go () =
| Some ExtractText ->
let pdf = get_single_pdf args.op true in
let range = parse_pagespec pdf (get_pagespec ()) in
let text = extract_text pdf range in
let text = Cpdf.extract_text args.extract_text_font_size pdf range in
begin match args.out with
| File filename ->
let fh = open_out_bin filename in