From 52b4d0d189b0efa0ae7823b5b5a072c7f7e91e8b Mon Sep 17 00:00:00 2001 From: John Whitington Date: Tue, 18 Jun 2024 15:28:19 +0100 Subject: [PATCH] Add -remove-mark --- cpdfcommand.ml | 13 +++++++++++- cpdfua.ml | 55 ++++++++++++++++++++++++++++++++------------------ cpdfua.mli | 2 ++ 3 files changed, 49 insertions(+), 21 deletions(-) diff --git a/cpdfcommand.ml b/cpdfcommand.ml index b09badd..ca51dc3 100644 --- a/cpdfcommand.ml +++ b/cpdfcommand.ml @@ -221,6 +221,7 @@ type op = | PrintObj of int | Verify of string | MarkAs of string + | RemoveMark of string | ExtractStructTree | ReplaceStructTree of string | SetLanguage of string @@ -368,6 +369,7 @@ let string_of_op = function | PrintObj _ -> "PrintObj" | Verify _ -> "Verify" | MarkAs _ -> "MarkAs" + | RemoveMark _ -> "RemoveMark" | ExtractStructTree -> "ExtractStructTree" | ReplaceStructTree _ -> "ReplaceStructTree" | SetLanguage _ -> "SetLanguage" @@ -891,7 +893,7 @@ let banned banlist = function | AddPageLabels | RemovePageLabels | OutputJSON | OCGCoalesce | OCGRename | OCGList | OCGOrderAll | PrintFontEncoding _ | TableOfContents | Typeset _ | Composition _ | TextWidth _ | SetAnnotations _ | CopyAnnotations _ | ExtractStream _ | PrintObj _ - | Verify _ | MarkAs _ | ExtractStructTree | ReplaceStructTree _ | SetLanguage _ + | Verify _ | MarkAs _ | RemoveMark _ | ExtractStructTree | ReplaceStructTree _ | SetLanguage _ -> false (* Always allowed *) (* Combine pages is not allowed because we would not know where to get the -recrypt from -- the first or second file? *) @@ -2819,6 +2821,7 @@ and specs = ("-json", Arg.Unit (fun () -> args.format_json <- true), "Format output as JSON"); ("-verify", Arg.String (fun s -> setop (Verify s) ()), "Verify conformance to a standard"); ("-mark-as", Arg.String (fun s -> setop (MarkAs s) ()), "Mark as conforming to a standard"); + ("-remove-mark", Arg.String (fun s -> setop (RemoveMark s) ()), "Remove conformance mark"); ("-extract-struct-tree", Arg.Unit (fun () -> setop ExtractStructTree ()), "Extract structure tree in JSON format"); ("-replace-struct-tree", Arg.String (fun s -> setop (ReplaceStructTree s) ()), "Replace structure tree from JSON"); (* These items are undocumented *) @@ -4474,6 +4477,14 @@ let go () = write_pdf false pdf | _ -> error "Unknown standard" end + | Some (RemoveMark standard) -> + begin match standard with + | "PDF/UA-1" -> + let pdf = get_single_pdf args.op false in + Cpdfua.remove_mark pdf; + write_pdf false pdf + | _ -> error "Unknown standard" + end | Some ExtractStructTree -> let pdf = get_single_pdf args.op true in let json = Cpdfua.extract_struct_tree pdf in diff --git a/cpdfua.ml b/cpdfua.ml index bc3f4e1..360f19c 100644 --- a/cpdfua.ml +++ b/cpdfua.ml @@ -1135,32 +1135,47 @@ let rec delete_pdfua_marker tree = | _ -> false in match tree with - | Cpdfmetadata.E (((rdf, "Description"), _), c) when rdf = Cpdfmetadata.rdf && List.exists is_pdfuaid c -> - Cpdfmetadata.D "" + | Cpdfmetadata.E (((rdf, "Description"), x), c) when rdf = Cpdfmetadata.rdf && List.exists is_pdfuaid c -> + Cpdfmetadata.E (((rdf, "Description"), x), keep (notpred is_pdfuaid) c) | Cpdfmetadata.E (x, children) -> Cpdfmetadata.E (x, map delete_pdfua_marker children) | x -> x let mark pdf = let pdf2 = if Cpdfmetadata.get_metadata pdf = None then Cpdfmetadata.create_metadata pdf else pdf in - pdf.Pdf.objects <- pdf2.Pdf.objects; - pdf.Pdf.trailerdict <- pdf2.Pdf.trailerdict; - pdf.Pdf.root <- pdf2.Pdf.root; - match Cpdfmetadata.get_metadata pdf with - | Some metadata -> - let dtd, tree = Cpdfmetadata.xmltree_of_bytes metadata in - let newtree = - begin match Cpdfmetadata.get_data_for Cpdfmetadata.pdfuaid "part" tree with - | Some _ -> insert_as_rdf_description pdfua_marker (delete_pdfua_marker tree) - | None -> insert_as_rdf_description pdfua_marker tree - end - in - let newbytes = Cpdfmetadata.bytes_of_xmltree (dtd, newtree) in - let pdf3 = Cpdfmetadata.set_metadata_from_bytes true newbytes pdf in - pdf.Pdf.objects <- pdf3.Pdf.objects; - pdf.Pdf.trailerdict <- pdf3.Pdf.trailerdict; - pdf.Pdf.root <- pdf3.Pdf.root - | None -> assert false + pdf.Pdf.objects <- pdf2.Pdf.objects; + pdf.Pdf.trailerdict <- pdf2.Pdf.trailerdict; + pdf.Pdf.root <- pdf2.Pdf.root; + match Cpdfmetadata.get_metadata pdf with + | Some metadata -> + let dtd, tree = Cpdfmetadata.xmltree_of_bytes metadata in + let newtree = + match Cpdfmetadata.get_data_for Cpdfmetadata.pdfuaid "part" tree with + | Some _ -> insert_as_rdf_description pdfua_marker (delete_pdfua_marker tree) + | None -> insert_as_rdf_description pdfua_marker tree + in + let newbytes = Cpdfmetadata.bytes_of_xmltree (dtd, newtree) in + let pdf3 = Cpdfmetadata.set_metadata_from_bytes true newbytes pdf in + pdf.Pdf.objects <- pdf3.Pdf.objects; + pdf.Pdf.trailerdict <- pdf3.Pdf.trailerdict; + pdf.Pdf.root <- pdf3.Pdf.root + | None -> assert false + +let remove_mark pdf = + match Cpdfmetadata.get_metadata pdf with + | Some metadata -> + let dtd, tree = Cpdfmetadata.xmltree_of_bytes metadata in + let newtree = + match Cpdfmetadata.get_data_for Cpdfmetadata.pdfuaid "part" tree with + | Some _ -> delete_pdfua_marker tree + | None -> tree + in + let newbytes = Cpdfmetadata.bytes_of_xmltree (dtd, newtree) in + let pdf3 = Cpdfmetadata.set_metadata_from_bytes true newbytes pdf in + pdf.Pdf.objects <- pdf3.Pdf.objects; + pdf.Pdf.trailerdict <- pdf3.Pdf.trailerdict; + pdf.Pdf.root <- pdf3.Pdf.root + | None -> () let extract_struct_tree pdf = match Pdf.lookup_obj pdf pdf.Pdf.root with diff --git a/cpdfua.mli b/cpdfua.mli index a5c433c..a9dd682 100644 --- a/cpdfua.mli +++ b/cpdfua.mli @@ -5,6 +5,8 @@ val test_matterhorn_json : Pdf.t -> Cpdfyojson.Safe.t val mark : Pdf.t -> unit +val remove_mark : Pdf.t -> unit + val extract_struct_tree : Pdf.t -> Cpdfyojson.Safe.t val replace_struct_tree : Pdf.t -> Cpdfyojson.Safe.t -> unit