This commit is contained in:
John Whitington 2021-12-21 15:40:52 +00:00
parent 27d13d9e3b
commit 7066a8873a
8 changed files with 83 additions and 79 deletions

View File

@ -1,5 +1,5 @@
# Build the cpdf command line tools and top level
MODS = cpdfyojson cpdfxmlm \
MODS = cpdfyojson cpdfxmlm cpdfutil \
cpdfunicodedata cpdferror cpdfdebug cpdfjson cpdfstrftime cpdfcoord \
cpdfattach cpdfpagespec cpdfposition cpdfpresent cpdfmetadata \
cpdfbookmarks cpdfpage cpdfaddtext cpdf cpdfimage cpdffont cpdftype \

77
cpdf.ml
View File

@ -117,39 +117,6 @@ let change_boxes f pdf page =
make_mediabox (f (Pdf.parse_rectangle page.Pdfpage.mediabox));
Pdfpage.rest = rest'}
(* [process_xobject f pdf resources i] looks up object [i] in [pdf] and, when
   its /Subtype is /Form, passes it (as a one-element list) to [f] together
   with [pdf] and [resources]. The stream returned by [f] replaces the
   original in place: the two dictionaries are combined with [mergedict], any
   /Filter entry is dropped from the result, and the new data is stored in the
   same stream reference. XObjects of any other subtype are left untouched.

   Raises [Pdf.PDFError] if the object has no /Subtype entry.
   [f] is expected to return exactly one stream with a dictionary; anything
   else trips [assert false].

   NOTE(review): /Filter is presumably removed because the data produced by
   [f] is no longer encoded with the original filter - confirm. *)
let process_xobject f pdf resources i =
  let xobj = Pdf.lookup_obj pdf i in
  match Pdf.lookup_direct pdf "/Subtype" xobj with
  | None -> raise (Pdf.PDFError "No /Subtype in Xobject")
  | Some (Pdf.Name "/Form") ->
      Pdf.getstream xobj;
      begin match xobj with
      | Pdf.Stream ({contents = (Pdf.Dictionary dict, Pdf.Got _)} as rf) ->
          begin match f pdf resources [Pdf.Stream rf] with
          | [Pdf.Stream {contents = (Pdf.Dictionary dict', data)}] ->
              let dict' =
                Pdf.remove_dict_entry
                  (Pdf.Dictionary (mergedict dict dict'))
                  "/Filter"
              in
              (* Mutate the shared reference so the change is visible through
                 every holder of this stream object. *)
              rf := (dict', data)
          | _ -> assert false
          end
      | _ -> assert false (* getstream would have complained already *)
      end
  | Some _ -> ()
(* [process_xobjects pdf page f] applies [process_xobject f] to every entry of
   the /XObject dictionary found in [page]'s resources. Entry names are not
   needed, only the indirect references they map to. Pages without an
   /XObject dictionary are left alone.

   Raises [Pdf.PDFError] if an entry is not an indirect reference. *)
let process_xobjects pdf page f =
  match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
  | Some (Pdf.Dictionary elts) ->
      iter
        (function
          | (_, Pdf.Indirect i) ->
              process_xobject f pdf page.Pdfpage.resources i
          | _ -> raise (Pdf.PDFError "process_xobject"))
        elts
  | _ -> ()
(* The content is transformed by altering any use of [Op_cm]. But we must also
alter any /Matrix entries in pattern dictionaries. *)
let change_pattern_matrices_resources pdf tr resources =
@ -1180,7 +1147,7 @@ let blacktext c range pdf =
let content' =
blacktext_ops c pdf page.Pdfpage.resources page.Pdfpage.content
in
process_xobjects pdf page (blacktext_ops c);
Cpdfutil.process_xobjects pdf page (blacktext_ops c);
{page with Pdfpage.content = content'}
in
Cpdfpage.process_pages (ppstub blacktext_page) pdf range
@ -1206,7 +1173,7 @@ let blacklines c range pdf =
let content' =
blacklines_ops c pdf page.Pdfpage.resources page.Pdfpage.content
in
process_xobjects pdf page (blacklines_ops c);
Cpdfutil.process_xobjects pdf page (blacklines_ops c);
{page with Pdfpage.content = content'}
in
Cpdfpage.process_pages (ppstub blacklines_page) pdf range
@ -1232,7 +1199,7 @@ let blackfills c range pdf =
let content' =
blackfills_ops c pdf page.Pdfpage.resources page.Pdfpage.content
in
process_xobjects pdf page (blackfills_ops c);
Cpdfutil.process_xobjects pdf page (blackfills_ops c);
{page with Pdfpage.content = content'}
in
Cpdfpage.process_pages (ppstub blackfills_page) pdf range
@ -1404,42 +1371,6 @@ let trim_marks_page fast pdf n page =
let trim_marks ?(fast=false) pdf range =
Cpdfpage.process_pages (ppstub (trim_marks_page fast pdf)) pdf range
(* [remove_all_text_ops pdf resources content] parses [content] into
   operators, drops every text-showing operator ([Tj], ['], [''] and [TJ]) and
   returns the remaining operators re-serialised as a single stream, wrapped
   in a one-element list. All other operators are kept, in their original
   order. *)
let remove_all_text_ops pdf resources content =
  let is_textop = function
    | Pdfops.Op_Tj _ | Pdfops.Op_' _ | Pdfops.Op_'' _ | Pdfops.Op_TJ _ -> true
    | _ -> false
  in
  let ops = Pdfops.parse_operators pdf resources content in
  [Pdfops.stream_of_ops (List.filter (fun op -> not (is_textop op)) ops)]
(* Strip text-showing operators from one page: first from the XObjects listed
   in the page's resources (rewritten in place), then from the page's own
   content. Returns the updated page paired with the document it was given. *)
let remove_all_text_page pdf p =
  process_xobjects pdf p remove_all_text_ops;
  let stripped =
    remove_all_text_ops pdf p.Pdfpage.resources p.Pdfpage.content
  in
  ({p with Pdfpage.content = stripped}, pdf)
(* [remove_all_text range pdf] runs [remove_all_text_page] on every page whose
   number appears in [range], leaves the other pages as they are, and installs
   the resulting page list in the document with [Pdfpage.change_pages]. *)
let remove_all_text range pdf =
  let pages = Pdfpage.pages_of_pagetree pdf in
  let current = ref pdf in
  let processed = ref [] in
  let step page number =
    let page', pdf' =
      if mem number range
      then remove_all_text_page !current page
      else (page, !current)
    in
    current := pdf';
    (* Accumulated in reverse; put back in page order below. *)
    processed := page' :: !processed
  in
  iter2 step pages (indx pages);
  Pdfpage.change_pages true !current (rev !processed)
(* 1. Extend remove_dict_entry with search term
2. Implement replace_dict_entry by analogy to remove_dict_entry *)
let rec dict_entry_single_object f pdf = function
@ -1501,7 +1432,7 @@ let remove_clipping pdf range =
let content' =
remove_clipping_ops pdf page.Pdfpage.resources page.Pdfpage.content
in
process_xobjects pdf page remove_clipping_ops;
Cpdfutil.process_xobjects pdf page remove_clipping_ops;
{page with Pdfpage.content = content'}
in
Cpdfpage.process_pages (ppstub remove_clipping_page) pdf range

View File

@ -113,12 +113,8 @@ val blacklines : Cpdfaddtext.color -> int list -> Pdf.t -> Pdf.t
(** Make all fills on certain pages black. *)
val blackfills : Cpdfaddtext.color -> int list -> Pdf.t -> Pdf.t
val remove_all_text : int list -> Pdf.t -> Pdf.t
(**/**)
val process_xobjects : Pdf.t -> Pdfpage.t -> (Pdf.t -> Pdf.pdfobject -> Pdf.pdfobject list -> Pdf.pdfobject list) -> unit
val append_page_content : string -> bool -> bool -> int list -> Pdf.t -> Pdf.t
val remove_dict_entry : Pdf.t -> string -> Pdf.pdfobject option -> unit

View File

@ -643,5 +643,40 @@ let addrectangle
else Pdfpage.postpend_operators pdf ops ~fast:fast page
in
Cpdfpage.process_pages (ppstub addrectangle_page) pdf range
open Pdfutil
(* [remove_all_text_ops pdf resources content] parses [content] into
   operators, drops every text-showing operator ([Tj], ['], [''] and [TJ]) and
   returns the remaining operators re-serialised as a single stream, wrapped
   in a one-element list. All other operators are kept, in their original
   order. *)
let remove_all_text_ops pdf resources content =
  let is_textop = function
    | Pdfops.Op_Tj _ | Pdfops.Op_' _ | Pdfops.Op_'' _ | Pdfops.Op_TJ _ -> true
    | _ -> false
  in
  let ops = Pdfops.parse_operators pdf resources content in
  [Pdfops.stream_of_ops (List.filter (fun op -> not (is_textop op)) ops)]
(* Strip text-showing operators from one page: first from the XObjects listed
   in the page's resources (rewritten in place), then from the page's own
   content. Returns the updated page paired with the document it was given. *)
let remove_all_text_page pdf p =
  Cpdfutil.process_xobjects pdf p remove_all_text_ops;
  let stripped =
    remove_all_text_ops pdf p.Pdfpage.resources p.Pdfpage.content
  in
  ({p with Pdfpage.content = stripped}, pdf)
(* [remove_all_text range pdf] runs [remove_all_text_page] on every page whose
   number appears in [range], leaves the other pages as they are, and installs
   the resulting page list in the document with [Pdfpage.change_pages]. *)
let remove_all_text range pdf =
  let pages = Pdfpage.pages_of_pagetree pdf in
  let current = ref pdf in
  let processed = ref [] in
  let step page number =
    let page', pdf' =
      if mem number range
      then remove_all_text_page !current page
      else (page, !current)
    in
    current := pdf';
    (* Accumulated in reverse; put back in page order below. *)
    processed := page' :: !processed
  in
  iter2 step pages (indx pages);
  Pdfpage.change_pages true !current (rev !processed)

View File

@ -70,3 +70,4 @@ val metrics_baseline_adjustment : unit -> float
val removetext : int list -> Pdf.t -> Pdf.t
val extract_text : float option -> Pdf.t -> int list -> string
(** [remove_all_text range pdf] removes the text-showing operators ([Tj], ['],
    [''] and [TJ]) from the content of every page whose number is in [range],
    and from the Form XObjects referenced by those pages' resources. *)
val remove_all_text : int list -> Pdf.t -> Pdf.t

View File

@ -3859,7 +3859,7 @@ let go () =
| Some RemoveAllText ->
let pdf = get_single_pdf args.op false in
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in
write_pdf false (Cpdf.remove_all_text range pdf)
write_pdf false (Cpdfaddtext.remove_all_text range pdf)
| Some ShowBoxes ->
let pdf = get_single_pdf args.op false in
let range = parse_pagespec_allow_empty pdf (get_pagespec ()) in

36
cpdfutil.ml Normal file
View File

@ -0,0 +1,36 @@
open Pdfutil
(* These may move into CamlPDF at some point *)
(* [process_xobject f pdf resources i] looks up object [i] in [pdf] and, when
   its /Subtype is /Form, passes it (as a one-element list) to [f] together
   with [pdf] and [resources]. The stream returned by [f] replaces the
   original in place: the two dictionaries are combined with [mergedict], any
   /Filter entry is dropped from the result, and the new data is stored in the
   same stream reference. XObjects of any other subtype are left untouched.

   Raises [Pdf.PDFError] if the object has no /Subtype entry.
   [f] is expected to return exactly one stream with a dictionary; anything
   else trips [assert false].

   NOTE(review): /Filter is presumably removed because the data produced by
   [f] is no longer encoded with the original filter - confirm. *)
let process_xobject f pdf resources i =
  let xobj = Pdf.lookup_obj pdf i in
  match Pdf.lookup_direct pdf "/Subtype" xobj with
  | None -> raise (Pdf.PDFError "No /Subtype in Xobject")
  | Some (Pdf.Name "/Form") ->
      Pdf.getstream xobj;
      begin match xobj with
      | Pdf.Stream ({contents = (Pdf.Dictionary dict, Pdf.Got _)} as rf) ->
          begin match f pdf resources [Pdf.Stream rf] with
          | [Pdf.Stream {contents = (Pdf.Dictionary dict', data)}] ->
              let dict' =
                Pdf.remove_dict_entry
                  (Pdf.Dictionary (mergedict dict dict'))
                  "/Filter"
              in
              (* Mutate the shared reference so the change is visible through
                 every holder of this stream object. *)
              rf := (dict', data)
          | _ -> assert false
          end
      | _ -> assert false (* getstream would have complained already *)
      end
  | Some _ -> ()
(* [process_xobjects pdf page f] applies [process_xobject f] to every entry of
   the /XObject dictionary found in [page]'s resources. Entry names are not
   needed, only the indirect references they map to. Pages without an
   /XObject dictionary are left alone.

   Raises [Pdf.PDFError] if an entry is not an indirect reference. *)
let process_xobjects pdf page f =
  match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
  | Some (Pdf.Dictionary elts) ->
      iter
        (function
          | (_, Pdf.Indirect i) ->
              process_xobject f pdf page.Pdfpage.resources i
          | _ -> raise (Pdf.PDFError "process_xobject"))
        elts
  | _ -> ()

5
cpdfutil.mli Normal file
View File

@ -0,0 +1,5 @@
(** [process_xobjects pdf page f] visits each entry of the /XObject dictionary
    in [page]'s resources and, for every entry that is a Form XObject, replaces
    its stream in place with the result of calling [f] on [pdf], the page's
    resources and a one-element list holding that stream. [f] must return
    exactly one stream. XObjects of other subtypes are skipped.

    @raise Pdf.PDFError if an entry is not an indirect reference, or if a
    referenced object has no /Subtype. *)
val process_xobjects : Pdf.t ->
Pdfpage.t ->
(Pdf.t ->
Pdf.pdfobject -> Pdf.pdfobject list -> Pdf.pdfobject list) ->
unit