From 81d88bc57a0200795ec4b919c9b4e78fbbab0c3f Mon Sep 17 00:00:00 2001 From: John Whitington Date: Tue, 11 Mar 2025 14:28:55 +0000 Subject: [PATCH] Stamp remove struct tree / artifacts --- Makefile | 15 +++++++-------- cpdfcommand.ml | 6 +++--- cpdfpage.ml | 40 ++++++++++++++++++++++++++++++++++++++++ cpdfpage.mli | 7 +++++++ cpdftweak.ml | 25 ------------------------- cpdftweak.mli | 6 ------ cpdfua.ml | 26 -------------------------- cpdfua.mli | 3 --- cpdfutil.ml | 32 ++++++++++++++++++++++++++++++++ cpdfutil.mli | 6 ++++++ 10 files changed, 95 insertions(+), 71 deletions(-) create mode 100644 cpdfutil.ml create mode 100644 cpdfutil.mli diff --git a/Makefile b/Makefile index 56381f0..d1b32b9 100644 --- a/Makefile +++ b/Makefile @@ -1,15 +1,14 @@ # Build the cpdf command line tools NONDOC = cpdfyojson cpdfxmlm -DOC = cpdfunicodedata cpdferror cpdfdebug cpdfjson cpdfstrftime cpdfcoord \ - cpdfattach cpdfpagespec cpdfposition cpdfpresent cpdfmetadata \ +DOC = cpdfutil cpdfunicodedata cpdferror cpdfdebug cpdfjson cpdfstrftime \ + cpdfcoord cpdfattach cpdfpagespec cpdfposition cpdfpresent cpdfmetadata \ cpdfbookmarks cpdfpage cpdftruetype cpdfremovetext cpdfextracttext \ - cpdfembed cpdffont cpdftype cpdfaddtext cpdfpad cpdfocg \ - cpdfsqueeze cpdfdraft cpdfspot cpdfpagelabels cpdfcreate cpdfannot \ - cpdfxobject cpdfimpose cpdfchop cpdftweak cpdfprinttree cpdfua cpdftexttopdf cpdftoc \ - cpdfjpeg cpdfjpeg2000 cpdfpng cpdfimage cpdfdraw \ - cpdfcomposition cpdfshape cpdfcolours cpdfdrawcontrol \ - cpdfcommand + cpdfembed cpdffont cpdftype cpdfaddtext cpdfpad cpdfocg cpdfsqueeze \ + cpdfdraft cpdfspot cpdfpagelabels cpdfcreate cpdfannot cpdfxobject \ + cpdfimpose cpdfchop cpdftweak cpdfprinttree cpdfua cpdftexttopdf \ + cpdftoc cpdfjpeg cpdfjpeg2000 cpdfpng cpdfimage cpdfdraw \ + cpdfcomposition cpdfshape cpdfcolours cpdfdrawcontrol cpdfcommand MODS = $(NONDOC) $(DOC) diff --git a/cpdfcommand.ml b/cpdfcommand.ml index 67b009e..afcd181 100644 --- a/cpdfcommand.ml +++ 
b/cpdfcommand.ml @@ -4679,11 +4679,11 @@ let go () = (map Pdfpagelabels.string_of_pagelabel (Pdfpagelabels.read pdf)) | Some (RemoveDictEntry key) -> let pdf = get_single_pdf args.op true in - Cpdftweak.remove_dict_entry pdf key args.dict_entry_search; + Cpdfutil.remove_dict_entry pdf key args.dict_entry_search; write_pdf false pdf | Some (ReplaceDictEntry key) -> let pdf = get_single_pdf args.op true in - Cpdftweak.replace_dict_entry pdf key args.replace_dict_entry_value args.dict_entry_search; + Cpdfutil.replace_dict_entry pdf key args.replace_dict_entry_value args.dict_entry_search; write_pdf false pdf | Some (PrintDictEntry key) -> let pdf = get_single_pdf args.op true in @@ -4897,7 +4897,7 @@ let go () = write_pdf false pdf | Some RemoveStructTree -> let pdf = get_single_pdf args.op false in - let pdf = Cpdfua.remove_struct_tree pdf in + let pdf = Cpdfpage.remove_struct_tree pdf in write_pdf false pdf | Some (SetLanguage s) -> let pdf = get_single_pdf args.op false in diff --git a/cpdfpage.ml b/cpdfpage.ml index c29a763..3989b4c 100644 --- a/cpdfpage.ml +++ b/cpdfpage.ml @@ -803,7 +803,47 @@ let do_stamp relative_to_cropbox fast position topline midline scale_to_fit isov Pdfpage.resources = Pdfpage.combine_pdf_resources pdf u.Pdfpage.resources o.Pdfpage.resources} +let remove_struct_tree pdf = + Cpdfutil.remove_dict_entry pdf "/StructTreeRoot" None; + Cpdfutil.remove_dict_entry pdf "/StructParent" None; + Cpdfutil.remove_dict_entry pdf "/StructParents" None; + let remove_struct_tree_ops pdf resources content = + let operators = Pdfops.parse_operators pdf resources content in + (* In fact, we remove all marked content regions. Acceptable in the circumstances. 
*) + let remove_mcids = + lose + (function + | Pdfops.Op_MP _ + | Pdfops.Op_DP _ + | Pdfops.Op_BMC _ + | Pdfops.Op_BDC _ + | Pdfops.Op_EMC -> true | _ -> false) + in + let operators' = remove_mcids operators in + [Pdfops.stream_of_ops operators'] + in + let remove_struct_tree_page _ page = + let content' = remove_struct_tree_ops pdf page.Pdfpage.resources page.Pdfpage.content in + Pdfpage.process_xobjects pdf page remove_struct_tree_ops; + {page with Pdfpage.content = content'} + in + process_pages (Pdfpage.ppstub remove_struct_tree_page) pdf (ilist 1 (Pdfpage.endpage pdf)) + +let mark_all_as_artifact pdf = + let mark_all_as_artifact_ops pdf resources content = + let operators = Pdfops.parse_operators pdf resources content in + let operators' = [Pdfops.Op_BMC "/Artifact"] @ operators @ [Pdfops.Op_EMC] in + [Pdfops.stream_of_ops operators'] + in + let remove_struct_tree_page _ page = + let content' = mark_all_as_artifact_ops pdf page.Pdfpage.resources page.Pdfpage.content in + Pdfpage.process_xobjects pdf page mark_all_as_artifact_ops; + {page with Pdfpage.content = content'} + in + process_pages (Pdfpage.ppstub remove_struct_tree_page) pdf (ilist 1 (Pdfpage.endpage pdf)) + let stamp ~process_struct_tree relative_to_cropbox position topline midline fast scale_to_fit isover range over pdf = + let over = if process_struct_tree then mark_all_as_artifact (remove_struct_tree over) else over in let prefix = Pdfpage.shortest_unused_prefix pdf in Pdfpage.add_prefix over prefix; let marks = Pdfmarks.read_bookmarks pdf in diff --git a/cpdfpage.mli b/cpdfpage.mli index d097e17..ac4a335 100644 --- a/cpdfpage.mli +++ b/cpdfpage.mli @@ -134,3 +134,10 @@ val alluprightonly : int list -> Pdf.t -> bool val change_pattern_matrices_page : Pdf.t -> Pdftransform.transform_matrix -> Pdfpage.t -> Pdfpage.t val redact : process_struct_tree:bool -> Pdf.t -> int list -> Pdf.t + +(** Remove a structure tree entirely from a file, including unmarking marked content. 
*) +val remove_struct_tree : Pdf.t -> Pdf.t + +(** Mark a PDF as being entirely artifacts (may be used after running [remove_struct_tree]). *) +val mark_all_as_artifact : Pdf.t -> Pdf.t + diff --git a/cpdftweak.ml b/cpdftweak.ml index 96da311..41633d4 100644 --- a/cpdftweak.ml +++ b/cpdftweak.ml @@ -238,31 +238,6 @@ let rec dict_entry_single_object f pdf = function | Pdf.Array a -> Pdf.recurse_array (dict_entry_single_object f pdf) a | x -> x -(* FIXME are we sure that functional values can never appear in the equality here? *) -let remove_dict_entry pdf key search = - let f d = - match search with - | None -> Pdf.remove_dict_entry d key - | Some s -> - match Pdf.lookup_direct pdf key d with - | Some v when v = s -> Pdf.remove_dict_entry d key - | _ -> d - in - Pdf.objselfmap (dict_entry_single_object f pdf) pdf; - pdf.Pdf.trailerdict <- dict_entry_single_object f pdf pdf.Pdf.trailerdict - -let replace_dict_entry pdf key value search = - let f d = - match search with - | None -> begin try Pdf.replace_dict_entry d key value with _ -> d end - | Some s -> - match Pdf.lookup_direct pdf key d with - | Some v when v = s -> Pdf.replace_dict_entry d key value - | _ -> d - in - Pdf.objselfmap (dict_entry_single_object f pdf) pdf; - pdf.Pdf.trailerdict <- dict_entry_single_object f pdf pdf.Pdf.trailerdict - let print_dict_entry ~utf8 pdf key = let f d = match Pdf.lookup_direct pdf key d with diff --git a/cpdftweak.mli b/cpdftweak.mli index 984584c..5cf4548 100644 --- a/cpdftweak.mli +++ b/cpdftweak.mli @@ -15,12 +15,6 @@ val blackfills : Cpdfaddtext.colour -> int list -> Pdf.t -> Pdf.t (** Append page content. *) val append_page_content : string -> bool -> bool -> int list -> Pdf.t -> Pdf.t -(** Remove a dictionary entry. *) -val remove_dict_entry : Pdf.t -> string -> Pdf.pdfobject option -> unit - -(** Replace a dictionary entry. *) -val replace_dict_entry : Pdf.t -> string -> Pdf.pdfobject -> Pdf.pdfobject option -> unit - (** Print a dictionary entry. 
*) val print_dict_entry : utf8:bool -> Pdf.t -> string -> unit diff --git a/cpdfua.ml b/cpdfua.ml index 56efe78..f113ed8 100644 --- a/cpdfua.ml +++ b/cpdfua.ml @@ -1850,29 +1850,3 @@ let create_pdfua2 title pagesize pages = let pdf = {pdf with Pdf.major = 2; Pdf.minor = 0} in mark2 2024 pdf; pdf - -let remove_struct_tree pdf = - Cpdftweak.remove_dict_entry pdf "/StructTreeRoot" None; - Cpdftweak.remove_dict_entry pdf "/StructParent" None; - Cpdftweak.remove_dict_entry pdf "/StructParents" None; - let remove_struct_tree_ops pdf resources content = - let operators = Pdfops.parse_operators pdf resources content in - (* In fact, we remove all marked content regions. Acceptable in the circumstances. *) - let remove_mcids = - lose - (function - | Pdfops.Op_MP _ - | Pdfops.Op_DP _ - | Pdfops.Op_BMC _ - | Pdfops.Op_BDC _ - | Pdfops.Op_EMC -> true | _ -> false) - in - let operators' = remove_mcids operators in - [Pdfops.stream_of_ops operators'] - in - let remove_struct_tree_page _ page = - let content' = remove_struct_tree_ops pdf page.Pdfpage.resources page.Pdfpage.content in - Pdfpage.process_xobjects pdf page remove_struct_tree_ops; - {page with Pdfpage.content = content'} - in - Cpdfpage.process_pages (Pdfpage.ppstub remove_struct_tree_page) pdf (ilist 1 (Pdfpage.endpage pdf)) diff --git a/cpdfua.mli b/cpdfua.mli index d73f096..b2912ae 100644 --- a/cpdfua.mli +++ b/cpdfua.mli @@ -31,9 +31,6 @@ val extract_struct_tree : Pdf.t -> Cpdfyojson.Safe.t (** Reapply an edited JSON structure tree to its PDF. *) val replace_struct_tree : Pdf.t -> Cpdfyojson.Safe.t -> unit -(** Remove a structure tree entirely from a file, including unmarking marked content. *) -val remove_struct_tree : Pdf.t -> Pdf.t - (** Make a blank PDF/UA-1 PDF given a title, paper size, and number of pages. 
*) val create_pdfua1 : string -> Pdfpaper.t -> int -> Pdf.t diff --git a/cpdfutil.ml b/cpdfutil.ml new file mode 100644 index 0000000..8deea64 --- /dev/null +++ b/cpdfutil.ml @@ -0,0 +1,32 @@ +let rec dict_entry_single_object f pdf = function + | (Pdf.Dictionary d) -> f (Pdf.recurse_dict (dict_entry_single_object f pdf) d) + | (Pdf.Stream {contents = (Pdf.Dictionary dict, data)}) -> + f (Pdf.Stream {contents = (Pdf.recurse_dict (dict_entry_single_object f pdf) dict, data)}) + | Pdf.Array a -> Pdf.recurse_array (dict_entry_single_object f pdf) a + | x -> x + +(* FIXME are we sure that functional values can never appear in the equality here? *) +let remove_dict_entry pdf key search = + let f d = + match search with + | None -> Pdf.remove_dict_entry d key + | Some s -> + match Pdf.lookup_direct pdf key d with + | Some v when v = s -> Pdf.remove_dict_entry d key + | _ -> d + in + Pdf.objselfmap (dict_entry_single_object f pdf) pdf; + pdf.Pdf.trailerdict <- dict_entry_single_object f pdf pdf.Pdf.trailerdict + +let replace_dict_entry pdf key value search = + let f d = + match search with + | None -> begin try Pdf.replace_dict_entry d key value with _ -> d end + | Some s -> + match Pdf.lookup_direct pdf key d with + | Some v when v = s -> Pdf.replace_dict_entry d key value + | _ -> d + in + Pdf.objselfmap (dict_entry_single_object f pdf) pdf; + pdf.Pdf.trailerdict <- dict_entry_single_object f pdf pdf.Pdf.trailerdict + diff --git a/cpdfutil.mli b/cpdfutil.mli new file mode 100644 index 0000000..629018c --- /dev/null +++ b/cpdfutil.mli @@ -0,0 +1,6 @@ +(** Remove a dictionary entry. *) +val remove_dict_entry : Pdf.t -> string -> Pdf.pdfobject option -> unit + +(** Replace a dictionary entry. *) +val replace_dict_entry : Pdf.t -> string -> Pdf.pdfobject -> Pdf.pdfobject option -> unit +