From 7d181bf13d150a7134ca4af599f4140760bf617c Mon Sep 17 00:00:00 2001 From: John Whitington Date: Tue, 27 Sep 2022 19:58:27 +0100 Subject: [PATCH] more --- Makefile | 8 +-- cpdfaddtext.ml | 116 +------------------------------------------- cpdfaddtext.mli | 9 ---- cpdfcommand.ml | 6 +-- cpdfextracttext.ml | 51 +++++++++++++++++++ cpdfextracttext.mli | 4 ++ cpdfremovetext.ml | 65 +++++++++++++++++++++++++ cpdfremovetext.mli | 5 ++ 8 files changed, 134 insertions(+), 130 deletions(-) create mode 100644 cpdfextracttext.ml create mode 100644 cpdfextracttext.mli create mode 100644 cpdfremovetext.ml create mode 100644 cpdfremovetext.mli diff --git a/Makefile b/Makefile index 7025bd9..5264ffc 100644 --- a/Makefile +++ b/Makefile @@ -3,10 +3,10 @@ NONDOC = cpdfyojson cpdfxmlm cpdfutil DOC = cpdfunicodedata cpdferror cpdfdebug cpdfjson cpdfstrftime cpdfcoord \ cpdfattach cpdfpagespec cpdfposition cpdfpresent cpdfmetadata \ - cpdfbookmarks cpdfpage cpdftruetype cpdfembed cpdfaddtext cpdfimage \ - cpdffont cpdftype cpdfpad cpdfocg cpdfsqueeze cpdfdraft cpdfspot \ - cpdfpagelabels cpdfcreate cpdfannot cpdfxobject cpdfimpose cpdftweak \ - cpdftexttopdf cpdftoc cpdfcommand + cpdfbookmarks cpdfpage cpdftruetype cpdfremovetext cpdfextracttext \ + cpdfembed cpdfaddtext cpdfimage cpdffont cpdftype cpdfpad cpdfocg \ + cpdfsqueeze cpdfdraft cpdfspot cpdfpagelabels cpdfcreate cpdfannot \ + cpdfxobject cpdfimpose cpdftweak cpdftexttopdf cpdftoc cpdfcommand MODS = $(NONDOC) $(DOC) diff --git a/cpdfaddtext.ml b/cpdfaddtext.ml index 005b6df..9199121 100644 --- a/cpdfaddtext.ml +++ b/cpdfaddtext.ml @@ -166,56 +166,7 @@ let cap_height font fontname = Some (match capheight with Pdf.Integer i -> float_of_int i | Pdf.Real r -> r | _ -> 0.) with _ -> None - -let extract_page_text only_fontsize pdf _ page = - let text_extractor = ref None in - let right_font_size = ref false in - fold_left ( ^ ) "" - (map - (function - | Pdfops.Op_Tf (fontname, fontsize) -> - right_font_size := - begin match only_fontsize with - Some x -> x = fontsize - | _ -> false - end; - let fontdict = - match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with - | None -> raise (Pdf.PDFError "Missing /Font in text extraction") - | Some d -> - match Pdf.lookup_direct pdf fontname d with - | None -> raise (Pdf.PDFError "Missing font in text extraction") - | Some d -> d - in - text_extractor := Some (Pdftext.text_extractor_of_font pdf fontdict); - "" - | Pdfops.Op_Tj text when !text_extractor <> None -> - if not !right_font_size then - "" - else - Pdftext.utf8_of_codepoints - (Pdftext.codepoints_of_text (unopt !text_extractor) text) - | Pdfops.Op_TJ (Pdf.Array objs) when !text_extractor <> None -> - if not !right_font_size then - "" - else - fold_left ( ^ ) "" - (option_map - (function - | Pdf.String text -> - Some - (Pdftext.utf8_of_codepoints - (Pdftext.codepoints_of_text (unopt !text_extractor) text)) - | _ -> None) - objs) - | _ -> "") - (Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content)) - -(* For each page, extract all the ops with text in them, and concatenate it all together *) -let extract_text extract_text_font_size pdf range = - fold_left (fun x y -> x ^ (if x <> "" && y <> "" then "\n" else "") ^ y) "" - (Cpdfpage.map_pages (extract_page_text extract_text_font_size pdf) pdf range) - + let rec process_text time text m = match m with | [] -> Cpdfstrftime.strftime ~time text @@ -278,7 +229,7 @@ let addtext "%Label", (fun () -> pagelabel pdf num); "%EndPage", (fun () -> string_of_int endpage); "%EndLabel", (fun () -> pagelabel pdf endpage); - "%ExtractedText", (fun () -> extract_page_text extract_text_font_size pdf num page); + "%ExtractedText", (fun () -> Cpdfextracttext.extract_page_text extract_text_font_size pdf num page); "%Bates", (fun () -> (let numstring = string_of_int (bates + num - 1) in @@ -589,33 +540,6 @@ let end; !pdf -let removetext range pdf = - (* Could fail on nesting, or other marked content inside our marked content.*) - let rec remove_until_last_EMC level = function - | [] -> [] - | Pdfops.Op_BMC "/CPDFSTAMP"::more -> - remove_until_last_EMC (level + 1) more - | Pdfops.Op_EMC::more -> - if level = 1 - then more - else remove_until_last_EMC (level - 1) more - | _::more -> - remove_until_last_EMC level more - in - let rec remove_stamps prev = function - | [] -> rev prev - | Pdfops.Op_BMC "/CPDFSTAMP"::more -> - let rest = remove_until_last_EMC 1 more in - remove_stamps prev rest - | h::t -> remove_stamps (h::prev) t - in - let removetext_page _ page = - {page with - Pdfpage.content = - let ops = Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content in - [Pdfops.stream_of_ops (remove_stamps [] ops)]} - in - Cpdfpage.process_pages (Cpdfutil.ppstub removetext_page) pdf range let addrectangle fast (w, h) colour outline linewidth opacity position relative_to_cropbox @@ -680,39 +604,3 @@ let addrectangle else Pdfpage.postpend_operators pdf ops ~fast:fast page in Cpdfpage.process_pages (Cpdfutil.ppstub addrectangle_page) pdf range - -let rec remove_all_text_ops pdf resources content = - let is_textop = function - Pdfops.Op_Tj _ | Pdfops.Op_' _ | Pdfops.Op_'' _ | Pdfops.Op_TJ _ -> true - | _ -> false - in - let content' = - let ops = Pdfops.parse_operators pdf resources content in - Pdfops.stream_of_ops - (option_map (function x -> if is_textop x then None else Some x) ops) - in - [content'] - -let remove_all_text_page pdf p = - let resources = p.Pdfpage.resources in - let content = p.Pdfpage.content in - Cpdfutil.process_xobjects pdf p remove_all_text_ops; - {p with Pdfpage.content = remove_all_text_ops pdf resources content}, pdf - -let remove_all_text range pdf = - let pages = Pdfpage.pages_of_pagetree pdf in - let pagenums = indx pages in - let pdf = ref pdf in - let pages' = ref [] in - iter2 - (fun p pagenum -> - let p', pdf' = - if mem pagenum range - then remove_all_text_page !pdf p - else p, !pdf - in - pdf := pdf'; - pages' =| p') - pages - pagenums; - Pdfpage.change_pages true !pdf (rev !pages') diff --git a/cpdfaddtext.mli b/cpdfaddtext.mli index 98d8447..43df274 100644 --- a/cpdfaddtext.mli +++ b/cpdfaddtext.mli @@ -56,12 +56,3 @@ val addrectangle : float -> Cpdfposition.position -> bool -> bool -> int list -> Pdf.t -> Pdf.t - -(** Remove text added by Cpdfaddtext from the given pages. *) -val removetext : int list -> Pdf.t -> Pdf.t - -(** Extract text *) -val extract_text : float option -> Pdf.t -> int list -> string - -(** Remove all text from the given pages *) -val remove_all_text : int list -> Pdf.t -> Pdf.t diff --git a/cpdfcommand.ml b/cpdfcommand.ml index 09202f9..90dee3c 100644 --- a/cpdfcommand.ml +++ b/cpdfcommand.ml @@ -3757,7 +3757,7 @@ let go () = | Some RemoveText -> let pdf = get_single_pdf args.op false in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in - write_pdf false (Cpdfaddtext.removetext range pdf) + write_pdf false (Cpdfremovetext.removetext range pdf) | Some AddRectangle -> let pdf = get_single_pdf args.op false in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in @@ -3881,7 +3881,7 @@ let go () = | Some ExtractText -> let pdf = get_single_pdf args.op true in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in - let text = Cpdfaddtext.extract_text args.extract_text_font_size pdf range in + let text = Cpdfextracttext.extract_text args.extract_text_font_size pdf range in begin match args.out with | File filename -> let fh = open_out_bin filename in @@ -3948,7 +3948,7 @@ let go () = | Some RemoveAllText -> let pdf = get_single_pdf args.op false in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in - write_pdf false (Cpdfaddtext.remove_all_text range pdf) + write_pdf false (Cpdfremovetext.remove_all_text range pdf) | Some ShowBoxes -> let pdf = get_single_pdf args.op false in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in diff --git a/cpdfextracttext.ml b/cpdfextracttext.ml new file mode 100644 index 0000000..082a92d --- /dev/null +++ b/cpdfextracttext.ml @@ -0,0 +1,51 @@ +open Pdfutil + +let extract_page_text only_fontsize pdf _ page = + let text_extractor = ref None in + let right_font_size = ref false in + fold_left ( ^ ) "" + (map + (function + | Pdfops.Op_Tf (fontname, fontsize) -> + right_font_size := + begin match only_fontsize with + Some x -> x = fontsize + | _ -> false + end; + let fontdict = + match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with + | None -> raise (Pdf.PDFError "Missing /Font in text extraction") + | Some d -> + match Pdf.lookup_direct pdf fontname d with + | None -> raise (Pdf.PDFError "Missing font in text extraction") + | Some d -> d + in + text_extractor := Some (Pdftext.text_extractor_of_font pdf fontdict); + "" + | Pdfops.Op_Tj text when !text_extractor <> None -> + if not !right_font_size then + "" + else + Pdftext.utf8_of_codepoints + (Pdftext.codepoints_of_text (unopt !text_extractor) text) + | Pdfops.Op_TJ (Pdf.Array objs) when !text_extractor <> None -> + if not !right_font_size then + "" + else + fold_left ( ^ ) "" + (option_map + (function + | Pdf.String text -> + Some + (Pdftext.utf8_of_codepoints + (Pdftext.codepoints_of_text (unopt !text_extractor) text)) + | _ -> None) + objs) + | _ -> "") + (Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content)) + +(* For each page, extract all the ops with text in them, and concatenate it all together *) +let extract_text extract_text_font_size pdf range = + fold_left (fun x y -> x ^ (if x <> "" && y <> "" then "\n" else "") ^ y) "" + (Cpdfpage.map_pages (extract_page_text extract_text_font_size pdf) pdf range) + diff --git a/cpdfextracttext.mli b/cpdfextracttext.mli new file mode 100644 index 0000000..a143d7c --- /dev/null +++ b/cpdfextracttext.mli @@ -0,0 +1,4 @@ +(** Extract text *) +val extract_page_text : float option -> Pdf.t -> 'a -> Pdfpage.t -> string + +val extract_text : float option -> Pdf.t -> int list -> string diff --git a/cpdfremovetext.ml b/cpdfremovetext.ml new file mode 100644 index 0000000..da029e6 --- /dev/null +++ b/cpdfremovetext.ml @@ -0,0 +1,65 @@ +open Pdfutil + +let removetext range pdf = + (* Could fail on nesting, or other marked content inside our marked content.*) + let rec remove_until_last_EMC level = function + | [] -> [] + | Pdfops.Op_BMC "/CPDFSTAMP"::more -> + remove_until_last_EMC (level + 1) more + | Pdfops.Op_EMC::more -> + if level = 1 + then more + else remove_until_last_EMC (level - 1) more + | _::more -> + remove_until_last_EMC level more + in + let rec remove_stamps prev = function + | [] -> rev prev + | Pdfops.Op_BMC "/CPDFSTAMP"::more -> + let rest = remove_until_last_EMC 1 more in + remove_stamps prev rest + | h::t -> remove_stamps (h::prev) t + in + let removetext_page _ page = + {page with + Pdfpage.content = + let ops = Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content in + [Pdfops.stream_of_ops (remove_stamps [] ops)]} + in + Cpdfpage.process_pages (Cpdfutil.ppstub removetext_page) pdf range + +let rec remove_all_text_ops pdf resources content = + let is_textop = function + Pdfops.Op_Tj _ | Pdfops.Op_' _ | Pdfops.Op_'' _ | Pdfops.Op_TJ _ -> true + | _ -> false + in + let content' = + let ops = Pdfops.parse_operators pdf resources content in + Pdfops.stream_of_ops + (option_map (function x -> if is_textop x then None else Some x) ops) + in + [content'] + +let remove_all_text_page pdf p = + let resources = p.Pdfpage.resources in + let content = p.Pdfpage.content in + Cpdfutil.process_xobjects pdf p remove_all_text_ops; + {p with Pdfpage.content = remove_all_text_ops pdf resources content}, pdf + +let remove_all_text range pdf = + let pages = Pdfpage.pages_of_pagetree pdf in + let pagenums = indx pages in + let pdf = ref pdf in + let pages' = ref [] in + iter2 + (fun p pagenum -> + let p', pdf' = + if mem pagenum range + then remove_all_text_page !pdf p + else p, !pdf + in + pdf := pdf'; + pages' =| p') + pages + pagenums; + Pdfpage.change_pages true !pdf (rev !pages') diff --git a/cpdfremovetext.mli b/cpdfremovetext.mli new file mode 100644 index 0000000..575e4ae --- /dev/null +++ b/cpdfremovetext.mli @@ -0,0 +1,5 @@ +(** Remove text added by Cpdfaddtext from the given pages. *) +val removetext : int list -> Pdf.t -> Pdf.t + +(** Remove all text from the given pages *) +val remove_all_text : int list -> Pdf.t -> Pdf.t