This commit is contained in:
John Whitington 2022-09-27 19:58:27 +01:00
parent e165a3bf5f
commit 7d181bf13d
8 changed files with 134 additions and 130 deletions

View File

@ -3,10 +3,10 @@ NONDOC = cpdfyojson cpdfxmlm cpdfutil
DOC = cpdfunicodedata cpdferror cpdfdebug cpdfjson cpdfstrftime cpdfcoord \ DOC = cpdfunicodedata cpdferror cpdfdebug cpdfjson cpdfstrftime cpdfcoord \
cpdfattach cpdfpagespec cpdfposition cpdfpresent cpdfmetadata \ cpdfattach cpdfpagespec cpdfposition cpdfpresent cpdfmetadata \
cpdfbookmarks cpdfpage cpdftruetype cpdfembed cpdfaddtext cpdfimage \ cpdfbookmarks cpdfpage cpdftruetype cpdfremovetext cpdfextracttext \
cpdffont cpdftype cpdfpad cpdfocg cpdfsqueeze cpdfdraft cpdfspot \ cpdfembed cpdfaddtext cpdfimage cpdffont cpdftype cpdfpad cpdfocg \
cpdfpagelabels cpdfcreate cpdfannot cpdfxobject cpdfimpose cpdftweak \ cpdfsqueeze cpdfdraft cpdfspot cpdfpagelabels cpdfcreate cpdfannot \
cpdftexttopdf cpdftoc cpdfcommand cpdfxobject cpdfimpose cpdftweak cpdftexttopdf cpdftoc cpdfcommand
MODS = $(NONDOC) $(DOC) MODS = $(NONDOC) $(DOC)

View File

@ -167,55 +167,6 @@ let cap_height font fontname =
with with
_ -> None _ -> None
(* [extract_page_text only_fontsize pdf _ page] returns, as one UTF-8 string,
   the text shown by the Tj and TJ operators in [page]'s content, in
   content-stream order.

   [only_fontsize] filters by the size given to the most recent Tf operator:
   [Some size] keeps only text shown at exactly [size]; [None] applies no
   filter. (The previous code treated [None] as "reject every size", so calling
   without a filter always produced the empty string.)

   The third argument is ignored. Text shown before any Tf operator is skipped,
   since there is no font to decode it with.

   Raises [Pdf.PDFError] when a Tf operator is met and the page resources have
   no /Font dictionary, or the named font is missing from it. *)
let extract_page_text only_fontsize pdf _ page =
  let text_extractor = ref None in
  let right_font_size = ref false in
  let decode extractor text =
    Pdftext.utf8_of_codepoints (Pdftext.codepoints_of_text extractor text)
  in
  let text_of_op = function
    | Pdfops.Op_Tf (fontname, fontsize) ->
        (* A new font selection: re-evaluate the size filter for what follows. *)
        right_font_size :=
          begin match only_fontsize with
          | Some x -> x = fontsize
          | None -> true
          end;
        let fontdict =
          match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
          | None -> raise (Pdf.PDFError "Missing /Font in text extraction")
          | Some d ->
              begin match Pdf.lookup_direct pdf fontname d with
              | None -> raise (Pdf.PDFError "Missing font in text extraction")
              | Some d -> d
              end
        in
        text_extractor := Some (Pdftext.text_extractor_of_font pdf fontdict);
        ""
    | Pdfops.Op_Tj text ->
        begin match !text_extractor with
        | Some extractor when !right_font_size -> decode extractor text
        | _ -> ""
        end
    | Pdfops.Op_TJ (Pdf.Array objs) ->
        begin match !text_extractor with
        | Some extractor when !right_font_size ->
            (* Only the string elements of a TJ array carry text; the rest are
               skipped. *)
            String.concat ""
              (option_map
                 (function
                   | Pdf.String text -> Some (decode extractor text)
                   | _ -> None)
                 objs)
        | _ -> ""
        end
    | _ -> ""
  in
  (* [map] must visit operators in order: [text_of_op] is stateful. *)
  String.concat ""
    (map text_of_op
       (Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content))
(* [extract_text extract_text_font_size pdf range] extracts the text of each
   page of [pdf] in [range] with [extract_page_text], and joins the results.
   Pages that yield no text contribute nothing; the remaining page texts are
   separated by a single newline. *)
let extract_text extract_text_font_size pdf range =
  let per_page =
    Cpdfpage.map_pages (extract_page_text extract_text_font_size pdf) pdf range
  in
  let non_empty = List.filter (fun page_text -> page_text <> "") per_page in
  String.concat "\n" non_empty
let rec process_text time text m = let rec process_text time text m =
match m with match m with
| [] -> Cpdfstrftime.strftime ~time text | [] -> Cpdfstrftime.strftime ~time text
@ -278,7 +229,7 @@ let addtext
"%Label", (fun () -> pagelabel pdf num); "%Label", (fun () -> pagelabel pdf num);
"%EndPage", (fun () -> string_of_int endpage); "%EndPage", (fun () -> string_of_int endpage);
"%EndLabel", (fun () -> pagelabel pdf endpage); "%EndLabel", (fun () -> pagelabel pdf endpage);
"%ExtractedText", (fun () -> extract_page_text extract_text_font_size pdf num page); "%ExtractedText", (fun () -> Cpdfextracttext.extract_page_text extract_text_font_size pdf num page);
"%Bates", "%Bates",
(fun () -> (fun () ->
(let numstring = string_of_int (bates + num - 1) in (let numstring = string_of_int (bates + num - 1) in
@ -589,33 +540,6 @@ let
end; end;
!pdf !pdf
(* [removetext range pdf] rewrites the content of each page of [pdf] in [range]
   with every marked-content section opened by BMC /CPDFSTAMP removed, up to
   and including its closing EMC. Each processed page ends up with a single
   content stream. *)
let removetext range pdf =
  (* Drop operators until the EMC closing the stamp we are inside; [depth]
     counts the /CPDFSTAMP sections currently open. Any EMC lowers [depth], so
     this could fail on nesting, or other marked content inside our marked
     content. An unterminated stamp swallows the rest of the stream. *)
  let rec skip_stamp depth ops =
    match ops with
    | [] -> []
    | Pdfops.Op_BMC "/CPDFSTAMP" :: rest -> skip_stamp (depth + 1) rest
    | Pdfops.Op_EMC :: rest when depth = 1 -> rest
    | Pdfops.Op_EMC :: rest -> skip_stamp (depth - 1) rest
    | _ :: rest -> skip_stamp depth rest
  in
  (* Copy operators across in order, skipping each stamp section. *)
  let rec strip kept = function
    | [] -> rev kept
    | Pdfops.Op_BMC "/CPDFSTAMP" :: rest -> strip kept (skip_stamp 1 rest)
    | op :: rest -> strip (op :: kept) rest
  in
  let strip_page _ page =
    let ops =
      Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content
    in
    let cleaned = Pdfops.stream_of_ops (strip [] ops) in
    {page with Pdfpage.content = [cleaned]}
  in
  Cpdfpage.process_pages (Cpdfutil.ppstub strip_page) pdf range
let addrectangle let addrectangle
fast (w, h) colour outline linewidth opacity position relative_to_cropbox fast (w, h) colour outline linewidth opacity position relative_to_cropbox
@ -680,39 +604,3 @@ let addrectangle
else Pdfpage.postpend_operators pdf ops ~fast:fast page else Pdfpage.postpend_operators pdf ops ~fast:fast page
in in
Cpdfpage.process_pages (Cpdfutil.ppstub addrectangle_page) pdf range Cpdfpage.process_pages (Cpdfutil.ppstub addrectangle_page) pdf range
(* [remove_all_text_ops pdf resources content] parses [content] against
   [resources], discards every text-showing operator (Tj, ', '' and TJ) and
   returns the remaining operators, in order, as a one-element list holding a
   single new stream. *)
let remove_all_text_ops pdf resources content =
  let shows_text = function
    | Pdfops.Op_Tj _ | Pdfops.Op_TJ _ | Pdfops.Op_' _ | Pdfops.Op_'' _ -> true
    | _ -> false
  in
  let kept =
    List.filter
      (fun op -> not (shows_text op))
      (Pdfops.parse_operators pdf resources content)
  in
  [Pdfops.stream_of_ops kept]
(* [remove_all_text_page pdf p] applies [remove_all_text_ops] to the xobjects
   of page [p] (through [Cpdfutil.process_xobjects]) and then to the page's own
   content. Returns the page with its new content, paired with [pdf] as given. *)
let remove_all_text_page pdf p =
  let {Pdfpage.resources; content; _} = p in
  Cpdfutil.process_xobjects pdf p remove_all_text_ops;
  let stripped = remove_all_text_ops pdf resources content in
  ({p with Pdfpage.content = stripped}, pdf)
(* [remove_all_text range pdf] runs [remove_all_text_page] on every page of
   [pdf] whose 1-based number is in [range], leaves the other pages alone, and
   installs the resulting page list with [Pdfpage.change_pages]. *)
let remove_all_text range pdf =
  let pages = Pdfpage.pages_of_pagetree pdf in
  (* Thread the document through the pages in order, collecting the resulting
     pages in reverse. *)
  let step (current_pdf, acc) page pagenum =
    if mem pagenum range then
      let page', pdf' = remove_all_text_page current_pdf page in
      (pdf', page' :: acc)
    else
      (current_pdf, page :: acc)
  in
  let final_pdf, rev_pages =
    List.fold_left2 step (pdf, []) pages (indx pages)
  in
  Pdfpage.change_pages true final_pdf (rev rev_pages)

View File

@ -56,12 +56,3 @@ val addrectangle :
float -> float ->
Cpdfposition.position -> Cpdfposition.position ->
bool -> bool -> int list -> Pdf.t -> Pdf.t bool -> bool -> int list -> Pdf.t -> Pdf.t
(** [removetext range pdf] removes text added by Cpdfaddtext from the pages of
    [pdf] listed in [range], returning the updated document. *)
val removetext : int list -> Pdf.t -> Pdf.t
(** [extract_text font_size pdf range] extracts the text of the pages of [pdf]
    listed in [range] as a single string, with a newline between the text of
    successive non-empty pages. [font_size] is the font-size filter handed to
    the per-page extraction. *)
val extract_text : float option -> Pdf.t -> int list -> string
(** [remove_all_text range pdf] removes all text from the pages of [pdf] listed
    in [range], returning the updated document. *)
val remove_all_text : int list -> Pdf.t -> Pdf.t

View File

@ -3757,7 +3757,7 @@ let go () =
| Some RemoveText -> | Some RemoveText ->
let pdf = get_single_pdf args.op false in let pdf = get_single_pdf args.op false in
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
write_pdf false (Cpdfaddtext.removetext range pdf) write_pdf false (Cpdfremovetext.removetext range pdf)
| Some AddRectangle -> | Some AddRectangle ->
let pdf = get_single_pdf args.op false in let pdf = get_single_pdf args.op false in
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
@ -3881,7 +3881,7 @@ let go () =
| Some ExtractText -> | Some ExtractText ->
let pdf = get_single_pdf args.op true in let pdf = get_single_pdf args.op true in
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
let text = Cpdfaddtext.extract_text args.extract_text_font_size pdf range in let text = Cpdfextracttext.extract_text args.extract_text_font_size pdf range in
begin match args.out with begin match args.out with
| File filename -> | File filename ->
let fh = open_out_bin filename in let fh = open_out_bin filename in
@ -3948,7 +3948,7 @@ let go () =
| Some RemoveAllText -> | Some RemoveAllText ->
let pdf = get_single_pdf args.op false in let pdf = get_single_pdf args.op false in
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
write_pdf false (Cpdfaddtext.remove_all_text range pdf) write_pdf false (Cpdfremovetext.remove_all_text range pdf)
| Some ShowBoxes -> | Some ShowBoxes ->
let pdf = get_single_pdf args.op false in let pdf = get_single_pdf args.op false in
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in

51
cpdfextracttext.ml Normal file
View File

@ -0,0 +1,51 @@
open Pdfutil
(* [extract_page_text only_fontsize pdf _ page] returns, as one UTF-8 string,
   the text shown by the Tj and TJ operators in [page]'s content, in
   content-stream order.

   [only_fontsize] filters by the size given to the most recent Tf operator:
   [Some size] keeps only text shown at exactly [size]; [None] applies no
   filter. (The previous code treated [None] as "reject every size", so calling
   without a filter always produced the empty string.)

   The third argument is ignored. Text shown before any Tf operator is skipped,
   since there is no font to decode it with.

   Raises [Pdf.PDFError] when a Tf operator is met and the page resources have
   no /Font dictionary, or the named font is missing from it. *)
let extract_page_text only_fontsize pdf _ page =
  let text_extractor = ref None in
  let right_font_size = ref false in
  let decode extractor text =
    Pdftext.utf8_of_codepoints (Pdftext.codepoints_of_text extractor text)
  in
  let text_of_op = function
    | Pdfops.Op_Tf (fontname, fontsize) ->
        (* A new font selection: re-evaluate the size filter for what follows. *)
        right_font_size :=
          begin match only_fontsize with
          | Some x -> x = fontsize
          | None -> true
          end;
        let fontdict =
          match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
          | None -> raise (Pdf.PDFError "Missing /Font in text extraction")
          | Some d ->
              begin match Pdf.lookup_direct pdf fontname d with
              | None -> raise (Pdf.PDFError "Missing font in text extraction")
              | Some d -> d
              end
        in
        text_extractor := Some (Pdftext.text_extractor_of_font pdf fontdict);
        ""
    | Pdfops.Op_Tj text ->
        begin match !text_extractor with
        | Some extractor when !right_font_size -> decode extractor text
        | _ -> ""
        end
    | Pdfops.Op_TJ (Pdf.Array objs) ->
        begin match !text_extractor with
        | Some extractor when !right_font_size ->
            (* Only the string elements of a TJ array carry text; the rest are
               skipped. *)
            String.concat ""
              (option_map
                 (function
                   | Pdf.String text -> Some (decode extractor text)
                   | _ -> None)
                 objs)
        | _ -> ""
        end
    | _ -> ""
  in
  (* [map] must visit operators in order: [text_of_op] is stateful. *)
  String.concat ""
    (map text_of_op
       (Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content))
(* [extract_text extract_text_font_size pdf range] extracts the text of each
   page of [pdf] in [range] with [extract_page_text], and joins the results.
   Pages that yield no text contribute nothing; the remaining page texts are
   separated by a single newline. *)
let extract_text extract_text_font_size pdf range =
  let per_page =
    Cpdfpage.map_pages (extract_page_text extract_text_font_size pdf) pdf range
  in
  let non_empty = List.filter (fun page_text -> page_text <> "") per_page in
  String.concat "\n" non_empty

4
cpdfextracttext.mli Normal file
View File

@ -0,0 +1,4 @@
(** [extract_page_text font_size pdf n page] extracts the text of a single
    [page] of [pdf] as a string. [font_size] is the font-size filter; the third
    argument [n] is ignored by the implementation. *)
val extract_page_text : float option -> Pdf.t -> 'a -> Pdfpage.t -> string
(** [extract_text font_size pdf range] extracts the text of the pages of [pdf]
    listed in [range] as a single string, with a newline between the text of
    successive non-empty pages. [font_size] is the font-size filter handed to
    the per-page extraction. *)
val extract_text : float option -> Pdf.t -> int list -> string

65
cpdfremovetext.ml Normal file
View File

@ -0,0 +1,65 @@
open Pdfutil
(* [removetext range pdf] rewrites the content of each page of [pdf] in [range]
   with every marked-content section opened by BMC /CPDFSTAMP removed, up to
   and including its closing EMC. Each processed page ends up with a single
   content stream. *)
let removetext range pdf =
  (* Drop operators until the EMC closing the stamp we are inside; [depth]
     counts the /CPDFSTAMP sections currently open. Any EMC lowers [depth], so
     this could fail on nesting, or other marked content inside our marked
     content. An unterminated stamp swallows the rest of the stream. *)
  let rec skip_stamp depth ops =
    match ops with
    | [] -> []
    | Pdfops.Op_BMC "/CPDFSTAMP" :: rest -> skip_stamp (depth + 1) rest
    | Pdfops.Op_EMC :: rest when depth = 1 -> rest
    | Pdfops.Op_EMC :: rest -> skip_stamp (depth - 1) rest
    | _ :: rest -> skip_stamp depth rest
  in
  (* Copy operators across in order, skipping each stamp section. *)
  let rec strip kept = function
    | [] -> rev kept
    | Pdfops.Op_BMC "/CPDFSTAMP" :: rest -> strip kept (skip_stamp 1 rest)
    | op :: rest -> strip (op :: kept) rest
  in
  let strip_page _ page =
    let ops =
      Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content
    in
    let cleaned = Pdfops.stream_of_ops (strip [] ops) in
    {page with Pdfpage.content = [cleaned]}
  in
  Cpdfpage.process_pages (Cpdfutil.ppstub strip_page) pdf range
(* [remove_all_text_ops pdf resources content] parses [content] against
   [resources], discards every text-showing operator (Tj, ', '' and TJ) and
   returns the remaining operators, in order, as a one-element list holding a
   single new stream. *)
let remove_all_text_ops pdf resources content =
  let shows_text = function
    | Pdfops.Op_Tj _ | Pdfops.Op_TJ _ | Pdfops.Op_' _ | Pdfops.Op_'' _ -> true
    | _ -> false
  in
  let kept =
    List.filter
      (fun op -> not (shows_text op))
      (Pdfops.parse_operators pdf resources content)
  in
  [Pdfops.stream_of_ops kept]
(* [remove_all_text_page pdf p] applies [remove_all_text_ops] to the xobjects
   of page [p] (through [Cpdfutil.process_xobjects]) and then to the page's own
   content. Returns the page with its new content, paired with [pdf] as given. *)
let remove_all_text_page pdf p =
  let {Pdfpage.resources; content; _} = p in
  Cpdfutil.process_xobjects pdf p remove_all_text_ops;
  let stripped = remove_all_text_ops pdf resources content in
  ({p with Pdfpage.content = stripped}, pdf)
(* [remove_all_text range pdf] runs [remove_all_text_page] on every page of
   [pdf] whose 1-based number is in [range], leaves the other pages alone, and
   installs the resulting page list with [Pdfpage.change_pages]. *)
let remove_all_text range pdf =
  let pages = Pdfpage.pages_of_pagetree pdf in
  (* Thread the document through the pages in order, collecting the resulting
     pages in reverse. *)
  let step (current_pdf, acc) page pagenum =
    if mem pagenum range then
      let page', pdf' = remove_all_text_page current_pdf page in
      (pdf', page' :: acc)
    else
      (current_pdf, page :: acc)
  in
  let final_pdf, rev_pages =
    List.fold_left2 step (pdf, []) pages (indx pages)
  in
  Pdfpage.change_pages true final_pdf (rev rev_pages)

5
cpdfremovetext.mli Normal file
View File

@ -0,0 +1,5 @@
(** [removetext range pdf] removes text added by Cpdfaddtext from the pages of
    [pdf] listed in [range], returning the updated document. *)
val removetext : int list -> Pdf.t -> Pdf.t
(** [remove_all_text range pdf] removes all text from the pages of [pdf] listed
    in [range], returning the updated document. *)
val remove_all_text : int list -> Pdf.t -> Pdf.t