diff --git a/Makefile b/Makefile index 37a842b..f25b714 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Build the cpdf command line tools and top level -MODS = cpdfyojson cpdfxmlm \ +MODS = cpdfyojson cpdfxmlm cpdfutil \ cpdfunicodedata cpdferror cpdfdebug cpdfjson cpdfstrftime cpdfcoord \ cpdfattach cpdfpagespec cpdfposition cpdfpresent cpdfmetadata \ cpdfbookmarks cpdfpage cpdfaddtext cpdf cpdfimage cpdffont cpdftype \ diff --git a/cpdf.ml b/cpdf.ml index 151a54f..eb7168b 100644 --- a/cpdf.ml +++ b/cpdf.ml @@ -117,39 +117,6 @@ let change_boxes f pdf page = make_mediabox (f (Pdf.parse_rectangle page.Pdfpage.mediabox)); Pdfpage.rest = rest'} -let process_xobject f pdf resources i = - let xobj = Pdf.lookup_obj pdf i in - match Pdf.lookup_direct pdf "/Subtype" xobj with - | None -> raise (Pdf.PDFError "No /Subtype in Xobject") - | Some (Pdf.Name "/Form") -> - Pdf.getstream xobj; - begin match xobj with - | Pdf.Stream ({contents = Pdf.Dictionary dict, Pdf.Got bytes} as rf) -> - begin match f pdf resources [Pdf.Stream rf] with - | [Pdf.Stream {contents = (Pdf.Dictionary dict', data)}] -> - let dict' = - Pdf.remove_dict_entry - (Pdf.Dictionary (mergedict dict dict')) - "/Filter" - in - rf := (dict', data) - | _ -> assert false - end - | _ -> assert false (* getstream would have complained already *) - end - | Some _ -> () - -let process_xobjects pdf page f = - match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with - | Some (Pdf.Dictionary elts) -> - iter - (fun (k, v) -> - match v with - | Pdf.Indirect i -> process_xobject f pdf page.Pdfpage.resources i - | _ -> raise (Pdf.PDFError "process_xobject")) - elts - | _ -> () - (* The content transformed by altering any use of [Op_cm]. But we must also alter any /Matrix entries in pattern dictionaries *) let change_pattern_matrices_resources pdf tr resources = @@ -1180,7 +1147,7 @@ let blacktext c range pdf = let content' = blacktext_ops c pdf page.Pdfpage.resources page.Pdfpage.content in - process_xobjects pdf page (blacktext_ops c); + Cpdfutil.process_xobjects pdf page (blacktext_ops c); {page with Pdfpage.content = content'} in Cpdfpage.process_pages (ppstub blacktext_page) pdf range @@ -1206,7 +1173,7 @@ let blacklines c range pdf = let content' = blacklines_ops c pdf page.Pdfpage.resources page.Pdfpage.content in - process_xobjects pdf page (blacklines_ops c); + Cpdfutil.process_xobjects pdf page (blacklines_ops c); {page with Pdfpage.content = content'} in Cpdfpage.process_pages (ppstub blacklines_page) pdf range @@ -1232,7 +1199,7 @@ let blackfills c range pdf = let content' = blackfills_ops c pdf page.Pdfpage.resources page.Pdfpage.content in - process_xobjects pdf page (blackfills_ops c); + Cpdfutil.process_xobjects pdf page (blackfills_ops c); {page with Pdfpage.content = content'} in Cpdfpage.process_pages (ppstub blackfills_page) pdf range @@ -1404,42 +1371,6 @@ let trim_marks_page fast pdf n page = let trim_marks ?(fast=false) pdf range = Cpdfpage.process_pages (ppstub (trim_marks_page fast pdf)) pdf range -let rec remove_all_text_ops pdf resources content = - let is_textop = function - Pdfops.Op_Tj _ | Pdfops.Op_' _ | Pdfops.Op_'' _ | Pdfops.Op_TJ _ -> true - | _ -> false - in - let content' = - let ops = Pdfops.parse_operators pdf resources content in - Pdfops.stream_of_ops - (option_map (function x -> if is_textop x then None else Some x) ops) - in - [content'] - -let remove_all_text_page pdf p = - let resources = p.Pdfpage.resources in - let content = p.Pdfpage.content in - process_xobjects pdf p remove_all_text_ops; - {p with Pdfpage.content = remove_all_text_ops pdf resources content}, pdf - -let remove_all_text range pdf = - let pages = Pdfpage.pages_of_pagetree pdf in - let pagenums = indx pages in - let pdf = ref pdf in - let pages' = ref [] in - iter2 - (fun p pagenum -> - let p', pdf' = - if mem pagenum range - then remove_all_text_page !pdf p - else p, !pdf - in - pdf := pdf'; - pages' =| p') - pages - pagenums; - Pdfpage.change_pages true !pdf (rev !pages') - (* 1. Extend remove_dict_entry with search term 2. Implement replace_dict_entry by analogy to remove_dict_entry *) let rec dict_entry_single_object f pdf = function @@ -1501,7 +1432,7 @@ let remove_clipping pdf range = let content' = remove_clipping_ops pdf page.Pdfpage.resources page.Pdfpage.content in - process_xobjects pdf page remove_clipping_ops; + Cpdfutil.process_xobjects pdf page remove_clipping_ops; {page with Pdfpage.content = content'} in Cpdfpage.process_pages (ppstub remove_clipping_page) pdf range diff --git a/cpdf.mli b/cpdf.mli index 4b852e4..5e96fce 100644 --- a/cpdf.mli +++ b/cpdf.mli @@ -113,12 +113,8 @@ val blacklines : Cpdfaddtext.color -> int list -> Pdf.t -> Pdf.t (** Make all fills on certain pages black. *) val blackfills : Cpdfaddtext.color -> int list -> Pdf.t -> Pdf.t -val remove_all_text : int list -> Pdf.t -> Pdf.t - (**/**) -val process_xobjects : Pdf.t -> Pdfpage.t -> (Pdf.t -> Pdf.pdfobject -> Pdf.pdfobject list -> Pdf.pdfobject list) -> unit - val append_page_content : string -> bool -> bool -> int list -> Pdf.t -> Pdf.t val remove_dict_entry : Pdf.t -> string -> Pdf.pdfobject option -> unit diff --git a/cpdfaddtext.ml b/cpdfaddtext.ml index f81133c..0e53c81 100644 --- a/cpdfaddtext.ml +++ b/cpdfaddtext.ml @@ -643,5 +643,40 @@ let addrectangle else Pdfpage.postpend_operators pdf ops ~fast:fast page in Cpdfpage.process_pages (ppstub addrectangle_page) pdf range +open Pdfutil +let rec remove_all_text_ops pdf resources content = + let is_textop = function + Pdfops.Op_Tj _ | Pdfops.Op_' _ | Pdfops.Op_'' _ | Pdfops.Op_TJ _ -> true + | _ -> false + in + let content' = + let ops = Pdfops.parse_operators pdf resources content in + Pdfops.stream_of_ops + (option_map (function x -> if is_textop x then None else Some x) ops) + in + [content'] +let remove_all_text_page pdf p = + let resources = p.Pdfpage.resources in + let content = p.Pdfpage.content in + Cpdfutil.process_xobjects pdf p remove_all_text_ops; + {p with Pdfpage.content = remove_all_text_ops pdf resources content}, pdf + +let remove_all_text range pdf = + let pages = Pdfpage.pages_of_pagetree pdf in + let pagenums = indx pages in + let pdf = ref pdf in + let pages' = ref [] in + iter2 + (fun p pagenum -> + let p', pdf' = + if mem pagenum range + then remove_all_text_page !pdf p + else p, !pdf + in + pdf := pdf'; + pages' =| p') + pages + pagenums; + Pdfpage.change_pages true !pdf (rev !pages') diff --git a/cpdfaddtext.mli b/cpdfaddtext.mli index 52e5d45..35416f2 100644 --- a/cpdfaddtext.mli +++ b/cpdfaddtext.mli @@ -70,3 +70,4 @@ val metrics_baseline_adjustment : unit -> float val removetext : int list -> Pdf.t -> Pdf.t val extract_text : float option -> Pdf.t -> int list -> string +val remove_all_text : int list -> Pdf.t -> Pdf.t diff --git a/cpdfcommand.ml b/cpdfcommand.ml index d27b10e..109d133 100644 --- a/cpdfcommand.ml +++ b/cpdfcommand.ml @@ -3859,7 +3859,7 @@ let go () = | Some RemoveAllText -> let pdf = get_single_pdf args.op false in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in - write_pdf false (Cpdf.remove_all_text range pdf) + write_pdf false (Cpdfaddtext.remove_all_text range pdf) | Some ShowBoxes -> let pdf = get_single_pdf args.op false in let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in diff --git a/cpdfutil.ml b/cpdfutil.ml new file mode 100644 index 0000000..ad77c1f --- /dev/null +++ b/cpdfutil.ml @@ -0,0 +1,36 @@ +open Pdfutil + +(* These may move into CamlPDF at some point *) +let process_xobject f pdf resources i = + let xobj = Pdf.lookup_obj pdf i in + match Pdf.lookup_direct pdf "/Subtype" xobj with + | None -> raise (Pdf.PDFError "No /Subtype in Xobject") + | Some (Pdf.Name "/Form") -> + Pdf.getstream xobj; + begin match xobj with + | Pdf.Stream ({contents = Pdf.Dictionary dict, Pdf.Got bytes} as rf) -> + begin match f pdf resources [Pdf.Stream rf] with + | [Pdf.Stream {contents = (Pdf.Dictionary dict', data)}] -> + let dict' = + Pdf.remove_dict_entry + (Pdf.Dictionary (mergedict dict dict')) + "/Filter" + in + rf := (dict', data) + | _ -> assert false + end + | _ -> assert false (* getstream would have complained already *) + end + | Some _ -> () + +let process_xobjects pdf page f = + match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with + | Some (Pdf.Dictionary elts) -> + iter + (fun (k, v) -> + match v with + | Pdf.Indirect i -> process_xobject f pdf page.Pdfpage.resources i + | _ -> raise (Pdf.PDFError "process_xobject")) + elts + | _ -> () + diff --git a/cpdfutil.mli b/cpdfutil.mli new file mode 100644 index 0000000..dd12e0d --- /dev/null +++ b/cpdfutil.mli @@ -0,0 +1,5 @@ +val process_xobjects : Pdf.t -> + Pdfpage.t -> + (Pdf.t -> + Pdf.pdfobject -> Pdf.pdfobject list -> Pdf.pdfobject list) -> + unit