Stamp remove struct tree / artifacts

This commit is contained in:
John Whitington 2025-03-11 14:28:55 +00:00
parent b9d97b938d
commit 81d88bc57a
10 changed files with 95 additions and 71 deletions

View File

@ -1,15 +1,14 @@
# Build the cpdf command line tools
NONDOC = cpdfyojson cpdfxmlm
DOC = cpdfunicodedata cpdferror cpdfdebug cpdfjson cpdfstrftime cpdfcoord \
cpdfattach cpdfpagespec cpdfposition cpdfpresent cpdfmetadata \
DOC = cpdfutil cpdfunicodedata cpdferror cpdfdebug cpdfjson cpdfstrftime \
cpdfcoord cpdfattach cpdfpagespec cpdfposition cpdfpresent cpdfmetadata \
cpdfbookmarks cpdfpage cpdftruetype cpdfremovetext cpdfextracttext \
cpdfembed cpdffont cpdftype cpdfaddtext cpdfpad cpdfocg \
cpdfsqueeze cpdfdraft cpdfspot cpdfpagelabels cpdfcreate cpdfannot \
cpdfxobject cpdfimpose cpdfchop cpdftweak cpdfprinttree cpdfua cpdftexttopdf cpdftoc \
cpdfjpeg cpdfjpeg2000 cpdfpng cpdfimage cpdfdraw \
cpdfcomposition cpdfshape cpdfcolours cpdfdrawcontrol \
cpdfcommand
cpdfembed cpdffont cpdftype cpdfaddtext cpdfpad cpdfocg cpdfsqueeze \
cpdfdraft cpdfspot cpdfpagelabels cpdfcreate cpdfannot cpdfxobject \
cpdfimpose cpdfchop cpdftweak cpdfprinttree cpdfua cpdftexttopdf \
cpdftoc cpdfjpeg cpdfjpeg2000 cpdfpng cpdfimage cpdfdraw \
cpdfcomposition cpdfshape cpdfcolours cpdfdrawcontrol cpdfcommand
MODS = $(NONDOC) $(DOC)

View File

@ -4679,11 +4679,11 @@ let go () =
(map Pdfpagelabels.string_of_pagelabel (Pdfpagelabels.read pdf))
| Some (RemoveDictEntry key) ->
let pdf = get_single_pdf args.op true in
Cpdftweak.remove_dict_entry pdf key args.dict_entry_search;
Cpdfutil.remove_dict_entry pdf key args.dict_entry_search;
write_pdf false pdf
| Some (ReplaceDictEntry key) ->
let pdf = get_single_pdf args.op true in
Cpdftweak.replace_dict_entry pdf key args.replace_dict_entry_value args.dict_entry_search;
Cpdfutil.replace_dict_entry pdf key args.replace_dict_entry_value args.dict_entry_search;
write_pdf false pdf
| Some (PrintDictEntry key) ->
let pdf = get_single_pdf args.op true in
@ -4897,7 +4897,7 @@ let go () =
write_pdf false pdf
| Some RemoveStructTree ->
let pdf = get_single_pdf args.op false in
let pdf = Cpdfua.remove_struct_tree pdf in
let pdf = Cpdfpage.remove_struct_tree pdf in
write_pdf false pdf
| Some (SetLanguage s) ->
let pdf = get_single_pdf args.op false in

View File

@ -803,7 +803,47 @@ let do_stamp relative_to_cropbox fast position topline midline scale_to_fit isov
Pdfpage.resources =
Pdfpage.combine_pdf_resources pdf u.Pdfpage.resources o.Pdfpage.resources}
(* Remove the structure tree from [pdf]. The /StructTreeRoot, /StructParent
   and /StructParents dictionary entries are deleted first; then the
   marked-content operators are dropped from each page's content and from
   whatever [Pdfpage.process_xobjects] visits for that page. *)
let remove_struct_tree pdf =
  List.iter
    (fun key -> Cpdfutil.remove_dict_entry pdf key None)
    ["/StructTreeRoot"; "/StructParent"; "/StructParents"];
  (* Every marked-content operator is discarded, not only those carrying
     MCIDs. Acceptable in the circumstances. *)
  let is_marked_content_op = function
    | Pdfops.Op_MP _ | Pdfops.Op_DP _ | Pdfops.Op_BMC _ | Pdfops.Op_BDC _ | Pdfops.Op_EMC -> true
    | _ -> false
  in
  (* Parse a content stream list, filter it, and re-emit it as one stream. *)
  let strip_marked_content pdf resources content =
    let ops = Pdfops.parse_operators pdf resources content in
    [Pdfops.stream_of_ops (lose is_marked_content_op ops)]
  in
  let strip_page _ page =
    let stripped = strip_marked_content pdf page.Pdfpage.resources page.Pdfpage.content in
    Pdfpage.process_xobjects pdf page strip_marked_content;
    {page with Pdfpage.content = stripped}
  in
  process_pages (Pdfpage.ppstub strip_page) pdf (ilist 1 (Pdfpage.endpage pdf))
(* Wrap the content of every page of [pdf], and of whatever
   [Pdfpage.process_xobjects] visits for that page, in a single
   /Artifact marked-content sequence (BMC ... EMC). *)
let mark_all_as_artifact pdf =
  let wrap_in_artifact pdf resources content =
    let body = Pdfops.parse_operators pdf resources content in
    let wrapped = Pdfops.Op_BMC "/Artifact" :: (body @ [Pdfops.Op_EMC]) in
    [Pdfops.stream_of_ops wrapped]
  in
  let mark_page _ page =
    let marked = wrap_in_artifact pdf page.Pdfpage.resources page.Pdfpage.content in
    Pdfpage.process_xobjects pdf page wrap_in_artifact;
    {page with Pdfpage.content = marked}
  in
  process_pages (Pdfpage.ppstub mark_page) pdf (ilist 1 (Pdfpage.endpage pdf))
let stamp ~process_struct_tree relative_to_cropbox position topline midline fast scale_to_fit isover range over pdf =
let over = if process_struct_tree then mark_all_as_artifact (remove_struct_tree over) else over in
let prefix = Pdfpage.shortest_unused_prefix pdf in
Pdfpage.add_prefix over prefix;
let marks = Pdfmarks.read_bookmarks pdf in

View File

@ -134,3 +134,10 @@ val alluprightonly : int list -> Pdf.t -> bool
val change_pattern_matrices_page : Pdf.t -> Pdftransform.transform_matrix -> Pdfpage.t -> Pdfpage.t
val redact : process_struct_tree:bool -> Pdf.t -> int list -> Pdf.t
(** Remove a structure tree entirely from a file, including unmarking marked content. *)
val remove_struct_tree : Pdf.t -> Pdf.t
(** Mark a PDF as being entirely artifacts (may be used after running [remove_struct_tree]). *)
val mark_all_as_artifact : Pdf.t -> Pdf.t

View File

@ -238,31 +238,6 @@ let rec dict_entry_single_object f pdf = function
| Pdf.Array a -> Pdf.recurse_array (dict_entry_single_object f pdf) a
| x -> x
(* Remove [key] from the dictionaries of [pdf], trailer dictionary included.
   When [search] is [Some wanted], only entries whose directly looked-up value
   equals [wanted] are removed; when it is [None], every entry is removed.
   FIXME: the [=] below is polymorphic equality on PDF objects. Are we sure
   that functional values can never appear in it? *)
let remove_dict_entry pdf key search =
  let should_remove dict =
    match search with
    | None -> true
    | Some wanted ->
        begin match Pdf.lookup_direct pdf key dict with
        | Some found -> found = wanted
        | None -> false
        end
  in
  let strip dict = if should_remove dict then Pdf.remove_dict_entry dict key else dict in
  let transform = dict_entry_single_object strip pdf in
  Pdf.objselfmap transform pdf;
  pdf.Pdf.trailerdict <- transform pdf.Pdf.trailerdict
(* Replace the value of [key] with [value] in the dictionaries of [pdf],
   trailer dictionary included. When [search] is [Some wanted], only entries
   whose directly looked-up value equals [wanted] are replaced. *)
let replace_dict_entry pdf key value search =
  let substitute dict =
    match search with
    | None ->
        (* Unconditional case: if the replacement raises, the dictionary is
           left exactly as it was. *)
        (try Pdf.replace_dict_entry dict key value with _ -> dict)
    | Some wanted ->
        let hit =
          match Pdf.lookup_direct pdf key dict with
          | Some found -> found = wanted
          | None -> false
        in
        if hit then Pdf.replace_dict_entry dict key value else dict
  in
  let transform = dict_entry_single_object substitute pdf in
  Pdf.objselfmap transform pdf;
  pdf.Pdf.trailerdict <- transform pdf.Pdf.trailerdict
let print_dict_entry ~utf8 pdf key =
let f d =
match Pdf.lookup_direct pdf key d with

View File

@ -15,12 +15,6 @@ val blackfills : Cpdfaddtext.colour -> int list -> Pdf.t -> Pdf.t
(** Append page content. *)
val append_page_content : string -> bool -> bool -> int list -> Pdf.t -> Pdf.t
(** Remove a dictionary entry. *)
val remove_dict_entry : Pdf.t -> string -> Pdf.pdfobject option -> unit
(** Replace a dictionary entry. *)
val replace_dict_entry : Pdf.t -> string -> Pdf.pdfobject -> Pdf.pdfobject option -> unit
(** Print a dictionary entry. *)
val print_dict_entry : utf8:bool -> Pdf.t -> string -> unit

View File

@ -1850,29 +1850,3 @@ let create_pdfua2 title pagesize pages =
let pdf = {pdf with Pdf.major = 2; Pdf.minor = 0} in
mark2 2024 pdf;
pdf
(* Remove the structure tree from [pdf]. The /StructTreeRoot, /StructParent
   and /StructParents dictionary entries are deleted first; then the
   marked-content operators are dropped from each page's content and from
   whatever [Pdfpage.process_xobjects] visits for that page. *)
let remove_struct_tree pdf =
  List.iter
    (fun key -> Cpdftweak.remove_dict_entry pdf key None)
    ["/StructTreeRoot"; "/StructParent"; "/StructParents"];
  (* Every marked-content operator is discarded, not only those carrying
     MCIDs. Acceptable in the circumstances. *)
  let is_marked_content_op = function
    | Pdfops.Op_MP _ | Pdfops.Op_DP _ | Pdfops.Op_BMC _ | Pdfops.Op_BDC _ | Pdfops.Op_EMC -> true
    | _ -> false
  in
  (* Parse a content stream list, filter it, and re-emit it as one stream. *)
  let strip_marked_content pdf resources content =
    let ops = Pdfops.parse_operators pdf resources content in
    [Pdfops.stream_of_ops (lose is_marked_content_op ops)]
  in
  let strip_page _ page =
    let stripped = strip_marked_content pdf page.Pdfpage.resources page.Pdfpage.content in
    Pdfpage.process_xobjects pdf page strip_marked_content;
    {page with Pdfpage.content = stripped}
  in
  Cpdfpage.process_pages (Pdfpage.ppstub strip_page) pdf (ilist 1 (Pdfpage.endpage pdf))

View File

@ -31,9 +31,6 @@ val extract_struct_tree : Pdf.t -> Cpdfyojson.Safe.t
(** Reapply an edited JSON structure tree to its PDF. *)
val replace_struct_tree : Pdf.t -> Cpdfyojson.Safe.t -> unit
(** Remove a structure tree entirely from a file, including unmarking marked content. *)
val remove_struct_tree : Pdf.t -> Pdf.t
(** Make a blank PDF/UA-1 PDF given a title, paper size, and number of pages. *)
val create_pdfua1 : string -> Pdfpaper.t -> int -> Pdf.t

32
cpdfutil.ml Normal file
View File

@ -0,0 +1,32 @@
(* Apply [f] to every dictionary (and stream dictionary) inside a PDF object,
   recursing into nested dictionaries and arrays first so that [f] sees each
   dictionary after its children have been processed. Other objects are
   returned unchanged. *)
let rec dict_entry_single_object f pdf obj =
  let recurse = dict_entry_single_object f pdf in
  match obj with
  | Pdf.Dictionary entries -> f (Pdf.recurse_dict recurse entries)
  | Pdf.Stream {contents = (Pdf.Dictionary entries, data)} ->
      f (Pdf.Stream {contents = (Pdf.recurse_dict recurse entries, data)})
  | Pdf.Array elts -> Pdf.recurse_array recurse elts
  | other -> other
(* Remove [key] from the dictionaries of [pdf], trailer dictionary included.
   When [search] is [Some wanted], only entries whose directly looked-up value
   equals [wanted] are removed; when it is [None], every entry is removed.
   FIXME: the [=] below is polymorphic equality on PDF objects. Are we sure
   that functional values can never appear in it? *)
let remove_dict_entry pdf key search =
  let should_remove dict =
    match search with
    | None -> true
    | Some wanted ->
        begin match Pdf.lookup_direct pdf key dict with
        | Some found -> found = wanted
        | None -> false
        end
  in
  let strip dict = if should_remove dict then Pdf.remove_dict_entry dict key else dict in
  let transform = dict_entry_single_object strip pdf in
  Pdf.objselfmap transform pdf;
  pdf.Pdf.trailerdict <- transform pdf.Pdf.trailerdict
(* Replace the value of [key] with [value] in the dictionaries of [pdf],
   trailer dictionary included. When [search] is [Some wanted], only entries
   whose directly looked-up value equals [wanted] are replaced. *)
let replace_dict_entry pdf key value search =
  let substitute dict =
    match search with
    | None ->
        (* Unconditional case: if the replacement raises, the dictionary is
           left exactly as it was. *)
        (try Pdf.replace_dict_entry dict key value with _ -> dict)
    | Some wanted ->
        let hit =
          match Pdf.lookup_direct pdf key dict with
          | Some found -> found = wanted
          | None -> false
        in
        if hit then Pdf.replace_dict_entry dict key value else dict
  in
  let transform = dict_entry_single_object substitute pdf in
  Pdf.objselfmap transform pdf;
  pdf.Pdf.trailerdict <- transform pdf.Pdf.trailerdict

6
cpdfutil.mli Normal file
View File

@ -0,0 +1,6 @@
(** [remove_dict_entry pdf key search] removes the entry [key] from the
    dictionaries and stream dictionaries of [pdf], and from its trailer
    dictionary. [pdf] is updated in place. With [search = None] every
    occurrence of [key] is removed; with [search = Some v] only entries whose
    directly looked-up value equals [v] are removed. *)
val remove_dict_entry : Pdf.t -> string -> Pdf.pdfobject option -> unit
(** [replace_dict_entry pdf key value search] replaces the value of [key] with
    [value] in the dictionaries and stream dictionaries of [pdf], and in its
    trailer dictionary. [pdf] is updated in place. With [search = None] the
    replacement is attempted on every dictionary, and a dictionary for which
    it raises is left unchanged; with [search = Some v] only entries whose
    directly looked-up value equals [v] are replaced. *)
val replace_dict_entry : Pdf.t -> string -> Pdf.pdfobject -> Pdf.pdfobject option -> unit