more
This commit is contained in:
parent
e165a3bf5f
commit
7d181bf13d
8
Makefile
8
Makefile
|
@ -3,10 +3,10 @@ NONDOC = cpdfyojson cpdfxmlm cpdfutil
|
||||||
|
|
||||||
DOC = cpdfunicodedata cpdferror cpdfdebug cpdfjson cpdfstrftime cpdfcoord \
|
DOC = cpdfunicodedata cpdferror cpdfdebug cpdfjson cpdfstrftime cpdfcoord \
|
||||||
cpdfattach cpdfpagespec cpdfposition cpdfpresent cpdfmetadata \
|
cpdfattach cpdfpagespec cpdfposition cpdfpresent cpdfmetadata \
|
||||||
cpdfbookmarks cpdfpage cpdftruetype cpdfembed cpdfaddtext cpdfimage \
|
cpdfbookmarks cpdfpage cpdftruetype cpdfremovetext cpdfextracttext \
|
||||||
cpdffont cpdftype cpdfpad cpdfocg cpdfsqueeze cpdfdraft cpdfspot \
|
cpdfembed cpdfaddtext cpdfimage cpdffont cpdftype cpdfpad cpdfocg \
|
||||||
cpdfpagelabels cpdfcreate cpdfannot cpdfxobject cpdfimpose cpdftweak \
|
cpdfsqueeze cpdfdraft cpdfspot cpdfpagelabels cpdfcreate cpdfannot \
|
||||||
cpdftexttopdf cpdftoc cpdfcommand
|
cpdfxobject cpdfimpose cpdftweak cpdftexttopdf cpdftoc cpdfcommand
|
||||||
|
|
||||||
MODS = $(NONDOC) $(DOC)
|
MODS = $(NONDOC) $(DOC)
|
||||||
|
|
||||||
|
|
116
cpdfaddtext.ml
116
cpdfaddtext.ml
|
@ -166,56 +166,7 @@ let cap_height font fontname =
|
||||||
Some (match capheight with Pdf.Integer i -> float_of_int i | Pdf.Real r -> r | _ -> 0.)
|
Some (match capheight with Pdf.Integer i -> float_of_int i | Pdf.Real r -> r | _ -> 0.)
|
||||||
with
|
with
|
||||||
_ -> None
|
_ -> None
|
||||||
|
|
||||||
(* [extract_page_text only_fontsize pdf _ page] returns, as a UTF-8 string,
   the text shown by the Tj and TJ operators of [page], in content order.

   Each Tf operator selects the font used to decode the strings that follow.
   When [only_fontsize] is [Some s], only text shown while the most recent Tf
   selected exactly size [s] is kept. When it is [None], no size filtering is
   applied and all text is kept. Text shown before any Tf is ignored.

   The third argument is unused.

   Raises [Pdf.PDFError] if a Tf operator names a font which cannot be found
   in the /Font entry of the page resources. *)
let extract_page_text only_fontsize pdf _ page =
  let text_extractor = ref None in
  let right_font_size = ref false in
  (* Accumulate in a buffer rather than with repeated string concatenation. *)
  let out = Buffer.create 256 in
  let add_text extractor text =
    Buffer.add_string out
      (Pdftext.utf8_of_codepoints (Pdftext.codepoints_of_text extractor text))
  in
  let process_op = function
    | Pdfops.Op_Tf (fontname, fontsize) ->
        right_font_size :=
          (match only_fontsize with
           | Some x -> x = fontsize
           (* No size requested: every font size is acceptable. *)
           | None -> true);
        let fontdict =
          match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
          | None -> raise (Pdf.PDFError "Missing /Font in text extraction")
          | Some fonts ->
              (match Pdf.lookup_direct pdf fontname fonts with
               | None -> raise (Pdf.PDFError "Missing font in text extraction")
               | Some d -> d)
        in
        text_extractor := Some (Pdftext.text_extractor_of_font pdf fontdict)
    | Pdfops.Op_Tj text ->
        (match !text_extractor with
         | Some extractor when !right_font_size -> add_text extractor text
         | _ -> ())
    | Pdfops.Op_TJ (Pdf.Array objs) ->
        (match !text_extractor with
         | Some extractor when !right_font_size ->
             (* Only the string elements of a TJ array carry text. *)
             List.iter
               (function
                 | Pdf.String text -> add_text extractor text
                 | _ -> ())
               objs
         | _ -> ())
    | _ -> ()
  in
  List.iter process_op
    (Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content);
  Buffer.contents out
|
|
||||||
|
|
||||||
(* Extract the text of every page in [range] and join the results with
   newlines. Pages which yield no text contribute nothing, so no blank
   separator lines are produced for them. *)
let extract_text extract_text_font_size pdf range =
  let per_page =
    Cpdfpage.map_pages (extract_page_text extract_text_font_size pdf) pdf range
  in
  String.concat "\n" (List.filter (fun s -> s <> "") per_page)
|
|
||||||
|
|
||||||
let rec process_text time text m =
|
let rec process_text time text m =
|
||||||
match m with
|
match m with
|
||||||
| [] -> Cpdfstrftime.strftime ~time text
|
| [] -> Cpdfstrftime.strftime ~time text
|
||||||
|
@ -278,7 +229,7 @@ let addtext
|
||||||
"%Label", (fun () -> pagelabel pdf num);
|
"%Label", (fun () -> pagelabel pdf num);
|
||||||
"%EndPage", (fun () -> string_of_int endpage);
|
"%EndPage", (fun () -> string_of_int endpage);
|
||||||
"%EndLabel", (fun () -> pagelabel pdf endpage);
|
"%EndLabel", (fun () -> pagelabel pdf endpage);
|
||||||
"%ExtractedText", (fun () -> extract_page_text extract_text_font_size pdf num page);
|
"%ExtractedText", (fun () -> Cpdfextracttext.extract_page_text extract_text_font_size pdf num page);
|
||||||
"%Bates",
|
"%Bates",
|
||||||
(fun () ->
|
(fun () ->
|
||||||
(let numstring = string_of_int (bates + num - 1) in
|
(let numstring = string_of_int (bates + num - 1) in
|
||||||
|
@ -589,33 +540,6 @@ let
|
||||||
end;
|
end;
|
||||||
!pdf
|
!pdf
|
||||||
|
|
||||||
(* Remove, from each page in [range], every run of operators enclosed in a
   /CPDFSTAMP marked-content section, and replace the page content with a
   single stream holding what remains. *)
let removetext range pdf =
  (* [depth] is the number of /CPDFSTAMP sections currently open. While it is
     positive every operator is dropped and each EMC closes one level.
     Could fail on nesting, or other marked content inside our marked
     content, since any EMC is counted as closing a stamp. *)
  let rec strip depth kept = function
    | [] -> rev kept
    | Pdfops.Op_BMC "/CPDFSTAMP" :: rest -> strip (depth + 1) kept rest
    | Pdfops.Op_EMC :: rest when depth > 0 -> strip (depth - 1) kept rest
    | _ :: rest when depth > 0 -> strip depth kept rest
    | op :: rest -> strip depth (op :: kept) rest
  in
  let strip_page _ page =
    let ops =
      Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content
    in
    {page with Pdfpage.content = [Pdfops.stream_of_ops (strip 0 [] ops)]}
  in
  Cpdfpage.process_pages (Cpdfutil.ppstub strip_page) pdf range
|
|
||||||
|
|
||||||
let addrectangle
|
let addrectangle
|
||||||
fast (w, h) colour outline linewidth opacity position relative_to_cropbox
|
fast (w, h) colour outline linewidth opacity position relative_to_cropbox
|
||||||
|
@ -680,39 +604,3 @@ let addrectangle
|
||||||
else Pdfpage.postpend_operators pdf ops ~fast:fast page
|
else Pdfpage.postpend_operators pdf ops ~fast:fast page
|
||||||
in
|
in
|
||||||
Cpdfpage.process_pages (Cpdfutil.ppstub addrectangle_page) pdf range
|
Cpdfpage.process_pages (Cpdfutil.ppstub addrectangle_page) pdf range
|
||||||
|
|
||||||
(* Parse [content] using [resources], drop every text-showing operator (Tj,
   TJ, and the single-quote and double-quote operators) and return the
   remaining operators as a one-element list holding a single new stream.
   The function is not recursive, so it is not declared with rec. *)
let remove_all_text_ops pdf resources content =
  let shows_text = function
    | Pdfops.Op_Tj _ | Pdfops.Op_' _ | Pdfops.Op_'' _ | Pdfops.Op_TJ _ -> true
    | _ -> false
  in
  let kept =
    List.filter
      (fun op -> not (shows_text op))
      (Pdfops.parse_operators pdf resources content)
  in
  [Pdfops.stream_of_ops kept]
|
|
||||||
|
|
||||||
(* Strip the text-showing operators from page [p]: first from its xobjects
   (via [Cpdfutil.process_xobjects]), then from the page content itself.
   Returns the updated page paired with [pdf]. *)
let remove_all_text_page pdf p =
  let {Pdfpage.resources = page_resources; content = page_content; _} = p in
  Cpdfutil.process_xobjects pdf p remove_all_text_ops;
  let stripped = remove_all_text_ops pdf page_resources page_content in
  ({p with Pdfpage.content = stripped}, pdf)
|
|
||||||
|
|
||||||
(* Remove all text from the pages whose numbers appear in [range]; other
   pages are passed through untouched. The document returned by each page's
   processing is threaded on to the next page, and the page tree is rebuilt
   from the resulting pages at the end. *)
let remove_all_text range pdf =
  let pages = Pdfpage.pages_of_pagetree pdf in
  let step (current_pdf, done_pages) p pagenum =
    if mem pagenum range then
      let p', pdf' = remove_all_text_page current_pdf p in
      (pdf', p' :: done_pages)
    else
      (current_pdf, p :: done_pages)
  in
  (* Pages are accumulated in reverse and put back in order below. *)
  let final_pdf, reversed_pages =
    List.fold_left2 step (pdf, []) pages (indx pages)
  in
  Pdfpage.change_pages true final_pdf (rev reversed_pages)
|
|
||||||
|
|
|
@ -56,12 +56,3 @@ val addrectangle :
|
||||||
float ->
|
float ->
|
||||||
Cpdfposition.position ->
|
Cpdfposition.position ->
|
||||||
bool -> bool -> int list -> Pdf.t -> Pdf.t
|
bool -> bool -> int list -> Pdf.t -> Pdf.t
|
||||||
|
|
||||||
(** Remove text added by Cpdfaddtext from the given pages. *)
|
|
||||||
val removetext : int list -> Pdf.t -> Pdf.t
|
|
||||||
|
|
||||||
(** Extract text *)
|
|
||||||
val extract_text : float option -> Pdf.t -> int list -> string
|
|
||||||
|
|
||||||
(** Remove all text from the given pages *)
|
|
||||||
val remove_all_text : int list -> Pdf.t -> Pdf.t
|
|
||||||
|
|
|
@ -3757,7 +3757,7 @@ let go () =
|
||||||
| Some RemoveText ->
|
| Some RemoveText ->
|
||||||
let pdf = get_single_pdf args.op false in
|
let pdf = get_single_pdf args.op false in
|
||||||
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
|
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
|
||||||
write_pdf false (Cpdfaddtext.removetext range pdf)
|
write_pdf false (Cpdfremovetext.removetext range pdf)
|
||||||
| Some AddRectangle ->
|
| Some AddRectangle ->
|
||||||
let pdf = get_single_pdf args.op false in
|
let pdf = get_single_pdf args.op false in
|
||||||
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
|
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
|
||||||
|
@ -3881,7 +3881,7 @@ let go () =
|
||||||
| Some ExtractText ->
|
| Some ExtractText ->
|
||||||
let pdf = get_single_pdf args.op true in
|
let pdf = get_single_pdf args.op true in
|
||||||
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
|
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
|
||||||
let text = Cpdfaddtext.extract_text args.extract_text_font_size pdf range in
|
let text = Cpdfextracttext.extract_text args.extract_text_font_size pdf range in
|
||||||
begin match args.out with
|
begin match args.out with
|
||||||
| File filename ->
|
| File filename ->
|
||||||
let fh = open_out_bin filename in
|
let fh = open_out_bin filename in
|
||||||
|
@ -3948,7 +3948,7 @@ let go () =
|
||||||
| Some RemoveAllText ->
|
| Some RemoveAllText ->
|
||||||
let pdf = get_single_pdf args.op false in
|
let pdf = get_single_pdf args.op false in
|
||||||
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
|
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
|
||||||
write_pdf false (Cpdfaddtext.remove_all_text range pdf)
|
write_pdf false (Cpdfremovetext.remove_all_text range pdf)
|
||||||
| Some ShowBoxes ->
|
| Some ShowBoxes ->
|
||||||
let pdf = get_single_pdf args.op false in
|
let pdf = get_single_pdf args.op false in
|
||||||
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
|
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
|
||||||
|
|
|
@ -0,0 +1,51 @@
|
||||||
|
open Pdfutil
|
||||||
|
|
||||||
|
(* [extract_page_text only_fontsize pdf _ page] returns, as a UTF-8 string,
   the text shown by the Tj and TJ operators of [page], in content order.

   Each Tf operator selects the font used to decode the strings that follow.
   When [only_fontsize] is [Some s], only text shown while the most recent Tf
   selected exactly size [s] is kept. When it is [None], no size filtering is
   applied and all text is kept. Text shown before any Tf is ignored.

   The third argument is unused.

   Raises [Pdf.PDFError] if a Tf operator names a font which cannot be found
   in the /Font entry of the page resources. *)
let extract_page_text only_fontsize pdf _ page =
  let text_extractor = ref None in
  let right_font_size = ref false in
  (* Accumulate in a buffer rather than with repeated string concatenation. *)
  let out = Buffer.create 256 in
  let add_text extractor text =
    Buffer.add_string out
      (Pdftext.utf8_of_codepoints (Pdftext.codepoints_of_text extractor text))
  in
  let process_op = function
    | Pdfops.Op_Tf (fontname, fontsize) ->
        right_font_size :=
          (match only_fontsize with
           | Some x -> x = fontsize
           (* No size requested: every font size is acceptable. *)
           | None -> true);
        let fontdict =
          match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
          | None -> raise (Pdf.PDFError "Missing /Font in text extraction")
          | Some fonts ->
              (match Pdf.lookup_direct pdf fontname fonts with
               | None -> raise (Pdf.PDFError "Missing font in text extraction")
               | Some d -> d)
        in
        text_extractor := Some (Pdftext.text_extractor_of_font pdf fontdict)
    | Pdfops.Op_Tj text ->
        (match !text_extractor with
         | Some extractor when !right_font_size -> add_text extractor text
         | _ -> ())
    | Pdfops.Op_TJ (Pdf.Array objs) ->
        (match !text_extractor with
         | Some extractor when !right_font_size ->
             (* Only the string elements of a TJ array carry text. *)
             List.iter
               (function
                 | Pdf.String text -> add_text extractor text
                 | _ -> ())
               objs
         | _ -> ())
    | _ -> ()
  in
  List.iter process_op
    (Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content);
  Buffer.contents out
|
||||||
|
|
||||||
|
(* Extract the text of every page in [range] and join the results with
   newlines. Pages which yield no text contribute nothing, so no blank
   separator lines are produced for them. *)
let extract_text extract_text_font_size pdf range =
  let per_page =
    Cpdfpage.map_pages (extract_page_text extract_text_font_size pdf) pdf range
  in
  String.concat "\n" (List.filter (fun s -> s <> "") per_page)
|
||||||
|
|
|
@ -0,0 +1,4 @@
|
||||||
|
(** Extract text *)
|
||||||
|
val extract_page_text : float option -> Pdf.t -> 'a -> Pdfpage.t -> string
|
||||||
|
|
||||||
|
val extract_text : float option -> Pdf.t -> int list -> string
|
|
@ -0,0 +1,65 @@
|
||||||
|
open Pdfutil
|
||||||
|
|
||||||
|
(* Remove, from each page in [range], every run of operators enclosed in a
   /CPDFSTAMP marked-content section, and replace the page content with a
   single stream holding what remains. *)
let removetext range pdf =
  (* [depth] is the number of /CPDFSTAMP sections currently open. While it is
     positive every operator is dropped and each EMC closes one level.
     Could fail on nesting, or other marked content inside our marked
     content, since any EMC is counted as closing a stamp. *)
  let rec strip depth kept = function
    | [] -> rev kept
    | Pdfops.Op_BMC "/CPDFSTAMP" :: rest -> strip (depth + 1) kept rest
    | Pdfops.Op_EMC :: rest when depth > 0 -> strip (depth - 1) kept rest
    | _ :: rest when depth > 0 -> strip depth kept rest
    | op :: rest -> strip depth (op :: kept) rest
  in
  let strip_page _ page =
    let ops =
      Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content
    in
    {page with Pdfpage.content = [Pdfops.stream_of_ops (strip 0 [] ops)]}
  in
  Cpdfpage.process_pages (Cpdfutil.ppstub strip_page) pdf range
|
||||||
|
|
||||||
|
(* Parse [content] using [resources], drop every text-showing operator (Tj,
   TJ, and the single-quote and double-quote operators) and return the
   remaining operators as a one-element list holding a single new stream.
   The function is not recursive, so it is not declared with rec. *)
let remove_all_text_ops pdf resources content =
  let shows_text = function
    | Pdfops.Op_Tj _ | Pdfops.Op_' _ | Pdfops.Op_'' _ | Pdfops.Op_TJ _ -> true
    | _ -> false
  in
  let kept =
    List.filter
      (fun op -> not (shows_text op))
      (Pdfops.parse_operators pdf resources content)
  in
  [Pdfops.stream_of_ops kept]
|
||||||
|
|
||||||
|
(* Strip the text-showing operators from page [p]: first from its xobjects
   (via [Cpdfutil.process_xobjects]), then from the page content itself.
   Returns the updated page paired with [pdf]. *)
let remove_all_text_page pdf p =
  let {Pdfpage.resources = page_resources; content = page_content; _} = p in
  Cpdfutil.process_xobjects pdf p remove_all_text_ops;
  let stripped = remove_all_text_ops pdf page_resources page_content in
  ({p with Pdfpage.content = stripped}, pdf)
|
||||||
|
|
||||||
|
(* Remove all text from the pages whose numbers appear in [range]; other
   pages are passed through untouched. The document returned by each page's
   processing is threaded on to the next page, and the page tree is rebuilt
   from the resulting pages at the end. *)
let remove_all_text range pdf =
  let pages = Pdfpage.pages_of_pagetree pdf in
  let step (current_pdf, done_pages) p pagenum =
    if mem pagenum range then
      let p', pdf' = remove_all_text_page current_pdf p in
      (pdf', p' :: done_pages)
    else
      (current_pdf, p :: done_pages)
  in
  (* Pages are accumulated in reverse and put back in order below. *)
  let final_pdf, reversed_pages =
    List.fold_left2 step (pdf, []) pages (indx pages)
  in
  Pdfpage.change_pages true final_pdf (rev reversed_pages)
|
|
@ -0,0 +1,5 @@
|
||||||
|
(** Remove text added by Cpdfaddtext from the given pages. *)
|
||||||
|
val removetext : int list -> Pdf.t -> Pdf.t
|
||||||
|
|
||||||
|
(** Remove all text from the given pages *)
|
||||||
|
val remove_all_text : int list -> Pdf.t -> Pdf.t
|
Loading…
Reference in New Issue