more
This commit is contained in:
parent
e165a3bf5f
commit
7d181bf13d
8
Makefile
8
Makefile
|
@ -3,10 +3,10 @@ NONDOC = cpdfyojson cpdfxmlm cpdfutil
|
|||
|
||||
DOC = cpdfunicodedata cpdferror cpdfdebug cpdfjson cpdfstrftime cpdfcoord \
|
||||
cpdfattach cpdfpagespec cpdfposition cpdfpresent cpdfmetadata \
|
||||
cpdfbookmarks cpdfpage cpdftruetype cpdfembed cpdfaddtext cpdfimage \
|
||||
cpdffont cpdftype cpdfpad cpdfocg cpdfsqueeze cpdfdraft cpdfspot \
|
||||
cpdfpagelabels cpdfcreate cpdfannot cpdfxobject cpdfimpose cpdftweak \
|
||||
cpdftexttopdf cpdftoc cpdfcommand
|
||||
cpdfbookmarks cpdfpage cpdftruetype cpdfremovetext cpdfextracttext \
|
||||
cpdfembed cpdfaddtext cpdfimage cpdffont cpdftype cpdfpad cpdfocg \
|
||||
cpdfsqueeze cpdfdraft cpdfspot cpdfpagelabels cpdfcreate cpdfannot \
|
||||
cpdfxobject cpdfimpose cpdftweak cpdftexttopdf cpdftoc cpdfcommand
|
||||
|
||||
MODS = $(NONDOC) $(DOC)
|
||||
|
||||
|
|
116
cpdfaddtext.ml
116
cpdfaddtext.ml
|
@ -166,56 +166,7 @@ let cap_height font fontname =
|
|||
Some (match capheight with Pdf.Integer i -> float_of_int i | Pdf.Real r -> r | _ -> 0.)
|
||||
with
|
||||
_ -> None
|
||||
|
||||
(* [extract_page_text only_fontsize pdf _ page] returns, as a UTF-8 string, the
   text shown by the Tj and TJ operators of [page], in content-stream order.

   When [only_fontsize] is [Some size], only text shown while the most recent
   Tf operator selected exactly that size is included. When it is [None], no
   size filter is applied. (Previously [None] suppressed all output, making
   unfiltered extraction impossible.) The third argument is ignored.

   Raises [Pdf.PDFError] if a Tf operator names a font which cannot be found
   in the /Font entry of the page's resources. *)
let extract_page_text only_fontsize pdf _ page =
  (* Extractor for the font chosen by the most recent Tf, if any. *)
  let text_extractor = ref None in
  (* Whether the most recent Tf passes the font size filter. *)
  let right_font_size = ref false in
  let utf8_of_text extractor text =
    Pdftext.utf8_of_codepoints (Pdftext.codepoints_of_text extractor text)
  in
  let text_of_op = function
    | Pdfops.Op_Tf (fontname, fontsize) ->
        right_font_size :=
          (match only_fontsize with
           | Some size -> size = fontsize
           | None -> true);
        let fontdict =
          match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
          | None -> raise (Pdf.PDFError "Missing /Font in text extraction")
          | Some fonts ->
              (match Pdf.lookup_direct pdf fontname fonts with
               | None -> raise (Pdf.PDFError "Missing font in text extraction")
               | Some d -> d)
        in
        text_extractor := Some (Pdftext.text_extractor_of_font pdf fontdict);
        ""
    | Pdfops.Op_Tj text ->
        (match !text_extractor with
         | Some extractor when !right_font_size -> utf8_of_text extractor text
         | _ -> "")
    | Pdfops.Op_TJ (Pdf.Array objs) ->
        (match !text_extractor with
         | Some extractor when !right_font_size ->
             (* Only the string elements of a TJ array carry text; the
                other elements are skipped. *)
             String.concat ""
               (option_map
                  (function
                    | Pdf.String text -> Some (utf8_of_text extractor text)
                    | _ -> None)
                  objs)
         | _ -> "")
    | _ -> ""
  in
  String.concat ""
    (map text_of_op
       (Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content))
|
||||
|
||||
(* Run [extract_page_text] over the pages selected by [range] (via
   [Cpdfpage.map_pages]) and join the per-page results, placing a newline
   between consecutive non-empty page texts. *)
let extract_text extract_text_font_size pdf range =
  let page_texts =
    Cpdfpage.map_pages (extract_page_text extract_text_font_size pdf) pdf range
  in
  String.concat "\n" (List.filter (fun text -> text <> "") page_texts)
|
||||
|
||||
|
||||
let rec process_text time text m =
|
||||
match m with
|
||||
| [] -> Cpdfstrftime.strftime ~time text
|
||||
|
@ -278,7 +229,7 @@ let addtext
|
|||
"%Label", (fun () -> pagelabel pdf num);
|
||||
"%EndPage", (fun () -> string_of_int endpage);
|
||||
"%EndLabel", (fun () -> pagelabel pdf endpage);
|
||||
"%ExtractedText", (fun () -> extract_page_text extract_text_font_size pdf num page);
|
||||
"%ExtractedText", (fun () -> Cpdfextracttext.extract_page_text extract_text_font_size pdf num page);
|
||||
"%Bates",
|
||||
(fun () ->
|
||||
(let numstring = string_of_int (bates + num - 1) in
|
||||
|
@ -589,33 +540,6 @@ let
|
|||
end;
|
||||
!pdf
|
||||
|
||||
(* [removetext range pdf] removes, from the content of each page processed by
   [Cpdfpage.process_pages] for [range], every marked-content section opened
   by [BMC /CPDFSTAMP], together with everything up to its matching [EMC].
   Each processed page's content is replaced by a single re-generated stream. *)
let removetext range pdf =
  (* Drop operators up to and including the EMC which closes the stamp.
     [depth] counts the marked-content sequences currently open, the stamp
     itself included. Every BMC and BDC opens a sequence, so foreign marked
     content nested inside a stamp no longer ends the removal early. An
     unterminated stamp swallows the rest of the stream. *)
  let rec skip_stamp depth = function
    | [] -> []
    | (Pdfops.Op_BMC _ | Pdfops.Op_BDC _) :: rest -> skip_stamp (depth + 1) rest
    | Pdfops.Op_EMC :: rest ->
        if depth = 1 then rest else skip_stamp (depth - 1) rest
    | _ :: rest -> skip_stamp depth rest
  in
  (* Copy operators through, cutting out each stamp section. [kept] is
     accumulated in reverse. *)
  let rec remove_stamps kept = function
    | [] -> rev kept
    | Pdfops.Op_BMC "/CPDFSTAMP" :: rest -> remove_stamps kept (skip_stamp 1 rest)
    | op :: rest -> remove_stamps (op :: kept) rest
  in
  let removetext_page _ page =
    let ops =
      Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content
    in
    {page with Pdfpage.content = [Pdfops.stream_of_ops (remove_stamps [] ops)]}
  in
  Cpdfpage.process_pages (Cpdfutil.ppstub removetext_page) pdf range
|
||||
|
||||
let addrectangle
|
||||
fast (w, h) colour outline linewidth opacity position relative_to_cropbox
|
||||
|
@ -680,39 +604,3 @@ let addrectangle
|
|||
else Pdfpage.postpend_operators pdf ops ~fast:fast page
|
||||
in
|
||||
Cpdfpage.process_pages (Cpdfutil.ppstub addrectangle_page) pdf range
|
||||
|
||||
(* [remove_all_text_ops pdf resources content] parses [content] and returns a
   one-element content list holding a new stream with the text-showing
   operators (Tj, ', '' and TJ) removed. All other operators are kept in
   order. The function is not recursive, so the spurious [rec] is dropped. *)
let remove_all_text_ops pdf resources content =
  let is_textop = function
    | Pdfops.Op_Tj _ | Pdfops.Op_' _ | Pdfops.Op_'' _ | Pdfops.Op_TJ _ -> true
    | _ -> false
  in
  let ops = Pdfops.parse_operators pdf resources content in
  [Pdfops.stream_of_ops (List.filter (fun op -> not (is_textop op)) ops)]
|
||||
|
||||
(* Strip text-showing operators from one page: first hand
   [remove_all_text_ops] to [Cpdfutil.process_xobjects] for the page, then
   filter the page's own content. Returns the updated page paired with the
   same [pdf] value it was given. *)
let remove_all_text_page pdf p =
  Cpdfutil.process_xobjects pdf p remove_all_text_ops;
  let stripped =
    remove_all_text_ops pdf p.Pdfpage.resources p.Pdfpage.content
  in
  ({p with Pdfpage.content = stripped}, pdf)
|
||||
|
||||
(* [remove_all_text range pdf] walks every page of the page tree in order,
   applies [remove_all_text_page] to those whose number is in [range], leaves
   the others untouched, and installs the resulting page list with
   [Pdfpage.change_pages]. The document value is threaded from page to page. *)
let remove_all_text range pdf =
  let pages = Pdfpage.pages_of_pagetree pdf in
  let step (doc, done_rev) page pagenum =
    if mem pagenum range then
      let page', doc' = remove_all_text_page doc page in
      (doc', page' :: done_rev)
    else
      (doc, page :: done_rev)
  in
  let final_doc, done_rev = List.fold_left2 step (pdf, []) pages (indx pages) in
  Pdfpage.change_pages true final_doc (rev done_rev)
|
||||
|
|
|
@ -56,12 +56,3 @@ val addrectangle :
|
|||
float ->
|
||||
Cpdfposition.position ->
|
||||
bool -> bool -> int list -> Pdf.t -> Pdf.t
|
||||
|
||||
(** [removetext range pdf] removes text added by Cpdfaddtext from the given
    pages, i.e. the marked-content sections tagged [/CPDFSTAMP] in each
    page's content. *)
|
||||
val removetext : int list -> Pdf.t -> Pdf.t
|
||||
|
||||
(** [extract_text only_fontsize pdf range] extracts the text shown on the
    given pages as a single string, with a newline between the non-empty
    texts of consecutive pages. When [only_fontsize] is [Some size], only
    text set at exactly that font size is included. *)
|
||||
val extract_text : float option -> Pdf.t -> int list -> string
|
||||
|
||||
(** [remove_all_text range pdf] removes the text-showing operators
    ([Tj], ['], [''] and [TJ]) from the content of the given pages; the same
    filter is handed to [Cpdfutil.process_xobjects] for each such page. *)
|
||||
val remove_all_text : int list -> Pdf.t -> Pdf.t
|
||||
|
|
|
@ -3757,7 +3757,7 @@ let go () =
|
|||
| Some RemoveText ->
|
||||
let pdf = get_single_pdf args.op false in
|
||||
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
|
||||
write_pdf false (Cpdfaddtext.removetext range pdf)
|
||||
write_pdf false (Cpdfremovetext.removetext range pdf)
|
||||
| Some AddRectangle ->
|
||||
let pdf = get_single_pdf args.op false in
|
||||
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
|
||||
|
@ -3881,7 +3881,7 @@ let go () =
|
|||
| Some ExtractText ->
|
||||
let pdf = get_single_pdf args.op true in
|
||||
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
|
||||
let text = Cpdfaddtext.extract_text args.extract_text_font_size pdf range in
|
||||
let text = Cpdfextracttext.extract_text args.extract_text_font_size pdf range in
|
||||
begin match args.out with
|
||||
| File filename ->
|
||||
let fh = open_out_bin filename in
|
||||
|
@ -3948,7 +3948,7 @@ let go () =
|
|||
| Some RemoveAllText ->
|
||||
let pdf = get_single_pdf args.op false in
|
||||
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
|
||||
write_pdf false (Cpdfaddtext.remove_all_text range pdf)
|
||||
write_pdf false (Cpdfremovetext.remove_all_text range pdf)
|
||||
| Some ShowBoxes ->
|
||||
let pdf = get_single_pdf args.op false in
|
||||
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
|
||||
|
|
|
@ -0,0 +1,51 @@
|
|||
open Pdfutil
|
||||
|
||||
(* [extract_page_text only_fontsize pdf _ page] returns, as a UTF-8 string, the
   text shown by the Tj and TJ operators of [page], in content-stream order.

   When [only_fontsize] is [Some size], only text shown while the most recent
   Tf operator selected exactly that size is included. When it is [None], no
   size filter is applied. (Previously [None] suppressed all output, making
   unfiltered extraction impossible.) The third argument is ignored.

   Raises [Pdf.PDFError] if a Tf operator names a font which cannot be found
   in the /Font entry of the page's resources. *)
let extract_page_text only_fontsize pdf _ page =
  (* Extractor for the font chosen by the most recent Tf, if any. *)
  let text_extractor = ref None in
  (* Whether the most recent Tf passes the font size filter. *)
  let right_font_size = ref false in
  let utf8_of_text extractor text =
    Pdftext.utf8_of_codepoints (Pdftext.codepoints_of_text extractor text)
  in
  let text_of_op = function
    | Pdfops.Op_Tf (fontname, fontsize) ->
        right_font_size :=
          (match only_fontsize with
           | Some size -> size = fontsize
           | None -> true);
        let fontdict =
          match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
          | None -> raise (Pdf.PDFError "Missing /Font in text extraction")
          | Some fonts ->
              (match Pdf.lookup_direct pdf fontname fonts with
               | None -> raise (Pdf.PDFError "Missing font in text extraction")
               | Some d -> d)
        in
        text_extractor := Some (Pdftext.text_extractor_of_font pdf fontdict);
        ""
    | Pdfops.Op_Tj text ->
        (match !text_extractor with
         | Some extractor when !right_font_size -> utf8_of_text extractor text
         | _ -> "")
    | Pdfops.Op_TJ (Pdf.Array objs) ->
        (match !text_extractor with
         | Some extractor when !right_font_size ->
             (* Only the string elements of a TJ array carry text; the
                other elements are skipped. *)
             String.concat ""
               (option_map
                  (function
                    | Pdf.String text -> Some (utf8_of_text extractor text)
                    | _ -> None)
                  objs)
         | _ -> "")
    | _ -> ""
  in
  String.concat ""
    (map text_of_op
       (Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content))
|
||||
|
||||
(* Run [extract_page_text] over the pages selected by [range] (via
   [Cpdfpage.map_pages]) and join the per-page results, placing a newline
   between consecutive non-empty page texts. *)
let extract_text extract_text_font_size pdf range =
  let page_texts =
    Cpdfpage.map_pages (extract_page_text extract_text_font_size pdf) pdf range
  in
  String.concat "\n" (List.filter (fun text -> text <> "") page_texts)
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
(** Extract text from the content streams of PDF pages. *)
|
||||
(** [extract_page_text only_fontsize pdf n page] returns, as UTF-8, the text
    shown by the [Tj] and [TJ] operators of [page]. When [only_fontsize] is
    [Some size], only text set at exactly that font size is included. The
    third argument is ignored.
    @raise Pdf.PDFError if a font named by a [Tf] operator is missing from
    the [/Font] entry of the page's resources. *)
val extract_page_text : float option -> Pdf.t -> 'a -> Pdfpage.t -> string
|
||||
|
||||
(** [extract_text only_fontsize pdf range] applies {!extract_page_text} to
    the pages of [range] (via [Cpdfpage.map_pages]) and joins the non-empty
    per-page results with newlines. *)
val extract_text : float option -> Pdf.t -> int list -> string
|
|
@ -0,0 +1,65 @@
|
|||
open Pdfutil
|
||||
|
||||
(* [removetext range pdf] removes, from the content of each page processed by
   [Cpdfpage.process_pages] for [range], every marked-content section opened
   by [BMC /CPDFSTAMP], together with everything up to its matching [EMC].
   Each processed page's content is replaced by a single re-generated stream. *)
let removetext range pdf =
  (* Drop operators up to and including the EMC which closes the stamp.
     [depth] counts the marked-content sequences currently open, the stamp
     itself included. Every BMC and BDC opens a sequence, so foreign marked
     content nested inside a stamp no longer ends the removal early. An
     unterminated stamp swallows the rest of the stream. *)
  let rec skip_stamp depth = function
    | [] -> []
    | (Pdfops.Op_BMC _ | Pdfops.Op_BDC _) :: rest -> skip_stamp (depth + 1) rest
    | Pdfops.Op_EMC :: rest ->
        if depth = 1 then rest else skip_stamp (depth - 1) rest
    | _ :: rest -> skip_stamp depth rest
  in
  (* Copy operators through, cutting out each stamp section. [kept] is
     accumulated in reverse. *)
  let rec remove_stamps kept = function
    | [] -> rev kept
    | Pdfops.Op_BMC "/CPDFSTAMP" :: rest -> remove_stamps kept (skip_stamp 1 rest)
    | op :: rest -> remove_stamps (op :: kept) rest
  in
  let removetext_page _ page =
    let ops =
      Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content
    in
    {page with Pdfpage.content = [Pdfops.stream_of_ops (remove_stamps [] ops)]}
  in
  Cpdfpage.process_pages (Cpdfutil.ppstub removetext_page) pdf range
|
||||
|
||||
(* [remove_all_text_ops pdf resources content] parses [content] and returns a
   one-element content list holding a new stream with the text-showing
   operators (Tj, ', '' and TJ) removed. All other operators are kept in
   order. The function is not recursive, so the spurious [rec] is dropped. *)
let remove_all_text_ops pdf resources content =
  let is_textop = function
    | Pdfops.Op_Tj _ | Pdfops.Op_' _ | Pdfops.Op_'' _ | Pdfops.Op_TJ _ -> true
    | _ -> false
  in
  let ops = Pdfops.parse_operators pdf resources content in
  [Pdfops.stream_of_ops (List.filter (fun op -> not (is_textop op)) ops)]
|
||||
|
||||
(* Strip text-showing operators from one page: first hand
   [remove_all_text_ops] to [Cpdfutil.process_xobjects] for the page, then
   filter the page's own content. Returns the updated page paired with the
   same [pdf] value it was given. *)
let remove_all_text_page pdf p =
  Cpdfutil.process_xobjects pdf p remove_all_text_ops;
  let stripped =
    remove_all_text_ops pdf p.Pdfpage.resources p.Pdfpage.content
  in
  ({p with Pdfpage.content = stripped}, pdf)
|
||||
|
||||
(* [remove_all_text range pdf] walks every page of the page tree in order,
   applies [remove_all_text_page] to those whose number is in [range], leaves
   the others untouched, and installs the resulting page list with
   [Pdfpage.change_pages]. The document value is threaded from page to page. *)
let remove_all_text range pdf =
  let pages = Pdfpage.pages_of_pagetree pdf in
  let step (doc, done_rev) page pagenum =
    if mem pagenum range then
      let page', doc' = remove_all_text_page doc page in
      (doc', page' :: done_rev)
    else
      (doc, page :: done_rev)
  in
  let final_doc, done_rev = List.fold_left2 step (pdf, []) pages (indx pages) in
  Pdfpage.change_pages true final_doc (rev done_rev)
|
|
@ -0,0 +1,5 @@
|
|||
(** [removetext range pdf] removes text added by Cpdfaddtext from the given
    pages, i.e. the marked-content sections tagged [/CPDFSTAMP] in each
    page's content. *)
|
||||
val removetext : int list -> Pdf.t -> Pdf.t
|
||||
|
||||
(** [remove_all_text range pdf] removes the text-showing operators
    ([Tj], ['], [''] and [TJ]) from the content of the given pages; the same
    filter is handed to [Cpdfutil.process_xobjects] for each such page. *)
|
||||
val remove_all_text : int list -> Pdf.t -> Pdf.t
|
Loading…
Reference in New Issue