From 37f900042e7fe9466b9ac79e87abbfa25a9173e4 Mon Sep 17 00:00:00 2001 From: John Whitington Date: Sun, 19 Dec 2021 12:55:06 +0000 Subject: [PATCH] more --- Makefile | 2 +- cpdf.ml | 675 +---------------------------------------------- cpdf.mli | 80 +----- cpdfcommand.ml | 52 ++-- cpdfmetadata.ml | 661 ++++++++++++++++++++++++++++++++++++++++++++++ cpdfmetadata.mli | 74 ++++++ 6 files changed, 776 insertions(+), 768 deletions(-) create mode 100644 cpdfmetadata.ml create mode 100644 cpdfmetadata.mli diff --git a/Makefile b/Makefile index 20e857f..3d9ebbb 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Build the cpdf command line tools and top level MODS = cpdfyojson cpdfxmlm \ cpdfunicodedata cpdferror cpdfjson cpdfstrftime cpdfcoord cpdfattach \ - cpdfpagespec cpdfposition cpdf cpdfpresent cpdffont cpdftype \ + cpdfpagespec cpdfposition cpdfpresent cpdfmetadata cpdf cpdffont cpdftype \ cpdftexttopdf cpdftoc cpdfpad cpdfocg cpdfsqueeze cpdfcommand SOURCES = $(foreach x,$(MODS),$(x).ml $(x).mli) cpdfcommandrun.ml diff --git a/cpdf.ml b/cpdf.ml index b01d95f..53ea33c 100644 --- a/cpdf.ml +++ b/cpdf.ml @@ -10,67 +10,11 @@ type color = let debug = ref false -let xmp_template = -{| - - - - - CREATEDATE - MODDATE - PRODUCER - CREATOR - TITLE - SUBJECT - AUTHOR - KEYWORDS - TRAPPED - - - - CREATEDATE - CREATOR - MODDATE - METADATADATE - - - - TITLE - - - - -|} - -type encoding = - | Raw - | UTF8 - | Stripped - -(* Just strip everything which isn't 7 bit ASCII *) -let crude_de_unicode s = - implode (map char_of_int (lose (fun x -> x > 127) (Pdftext.codepoints_of_pdfdocstring s))) - -let encode_output enc s = - match enc with - | Raw -> s - | UTF8 -> Pdftext.utf8_of_pdfdocstring s - | Stripped -> crude_de_unicode s - (* Get the number of pages in file. Doesn't need decryption. 
*) let endpage_io ?revision i user_pw owner_pw = let pdf = Pdfread.pdf_of_input_lazy ?revision user_pw owner_pw i in Pdfpage.endpage pdf - - let print_pdf_objs pdf = Printf.printf "Trailerdict: %s\n" (Pdfwrite.string_of_pdf pdf.Pdf.trailerdict); Printf.printf "Root: %i\n" pdf.Pdf.root; @@ -103,14 +47,6 @@ let rec process_text time text m = | [] -> Cpdfstrftime.strftime ~time text | (s, r)::t -> process_text time (string_replace_all_lazy s r text) t -let expand_date = function - | "now" -> - begin match Sys.getenv_opt "CPDF_REPRODUCIBLE_DATES" with - | Some "true" -> Cpdfstrftime.strftime ~time:Cpdfstrftime.dummy "D:%Y%m%d%H%M%S" - | _ -> Cpdfstrftime.strftime "D:%Y%m%d%H%M%S" - end - | x -> x - (* For uses of process_pages which don't need to deal with matrices, this function transforms into one which returns the identity matrix *) let ppstub f n p = (f n p, n, Pdftransform.i_matrix) @@ -205,17 +141,6 @@ let combine_pdf_resources pdf a b = (Pdf.Dictionary []) (unknown_keys_a @ unknown_keys_b @ combined_known_entries) -(* \section{Copy an /ID from one file to another} *) -let copy_id keepversion copyfrom copyto = - match Pdf.lookup_direct copyfrom "/ID" copyfrom.Pdf.trailerdict with - | None -> copyto (* error "Source PDF file has no /ID entry to copy from" *) - | Some id -> - copyto.Pdf.trailerdict <- - Pdf.add_dict_entry copyto.Pdf.trailerdict "/ID" id; - copyto.Pdf.minor <- - if keepversion then copyto.Pdf.minor else max copyto.Pdf.minor 1; - copyto - (* \section{Remove bookmarks} *) (* \section{Add bookmarks} *) @@ -377,155 +302,6 @@ let add_bookmarks ~json verify input pdf = (*iter (fun b -> flprint (Pdfmarks.string_of_bookmark b); flprint "\n") parsed;*) Pdfmarks.add_bookmarks parsed pdf -(* \section{Set page mode} *) -let set_page_mode pdf s = - match s with - | "UseNone" | "UseOutlines" | "UseThumbs" - | "FullScreen" | "UseOC" | "UseAttachments" -> - begin match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with - | Some catalog -> - let catalog' = - 
Pdf.add_dict_entry catalog "/PageMode" (Pdf.Name ("/" ^ s)) - in - let catalognum = Pdf.addobj pdf catalog' in - let trailerdict' = - Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect catalognum) - in - {pdf with - Pdf.root = catalognum; - Pdf.trailerdict = trailerdict'} - | None -> error "bad root" - end - | _ -> error "Unknown page mode" - -(* Set open action *) -let set_open_action pdf fit pagenumber = - if pagenumber > Pdfpage.endpage pdf || pagenumber < 0 then - raise (error "set_open_action: invalid page number") - else - let pageobjectnumber = select pagenumber (Pdf.page_reference_numbers pdf) in - let destination = - if fit then - Pdf.Array [Pdf.Indirect pageobjectnumber; Pdf.Name "/Fit"] - else - Pdf.Array [Pdf.Indirect pageobjectnumber; Pdf.Name "/XYZ"; Pdf.Null; Pdf.Null; Pdf.Null] - in - let open_action = - Pdf.Dictionary [("/D", destination); ("/S", Pdf.Name "/GoTo")] - in - match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with - | Some catalog -> - let catalog' = - Pdf.add_dict_entry catalog "/OpenAction" open_action - in - let catalognum = Pdf.addobj pdf catalog' in - let trailerdict' = - Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect catalognum) - in - {pdf with Pdf.root = catalognum; Pdf.trailerdict = trailerdict'} - | None -> error "bad root" - -(* \section{Set viewer preferences} *) -let set_viewer_preference (key, value, version) pdf = - match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with - | Some catalog -> - let viewer_preferences = - match Pdf.lookup_direct pdf "/ViewerPreferences" catalog with - | Some d -> d - | None -> Pdf.Dictionary [] - in - let viewer_preferences' = - Pdf.add_dict_entry viewer_preferences key value - in - let catalog' = - Pdf.add_dict_entry catalog "/ViewerPreferences" viewer_preferences' - in - let catalognum = Pdf.addobj pdf catalog' in - let trailerdict' = - Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect catalognum) - in - {pdf with - Pdf.minor = max 
pdf.Pdf.minor version; - Pdf.root = catalognum; - Pdf.trailerdict = trailerdict'} - | None -> error "bad root" - - - -(* \section{Set page layout} *) -let set_page_layout pdf s = - match s with - | "SinglePage" | "OneColumn" | "TwoColumnLeft" - | "TwoColumnRight" | "TwoPageLeft" | "TwoPageRight" -> - begin match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with - | Some catalog -> - let catalog' = - Pdf.add_dict_entry catalog "/PageLayout" (Pdf.Name ("/" ^ s)) - in - let catalognum = Pdf.addobj pdf catalog' in - let trailerdict' = - Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect catalognum) - in - {pdf with - Pdf.root = catalognum; - Pdf.trailerdict = trailerdict'} - | None -> error "bad root" - end - | _ -> error "Unknown page layout" - -(* \section{Set or replace metadata} *) -let set_metadata_from_bytes keepversion data pdf = - let metadata_stream = - Pdf.Stream - {contents = - (Pdf.Dictionary - ["/Length", Pdf.Integer (bytes_size data); - "/Type", Pdf.Name "/Metadata"; - "/Subtype", Pdf.Name "/XML"], - Pdf.Got data)} - in - let objnum = Pdf.addobj pdf metadata_stream in - let document_catalog = - match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with - | Some s -> s - | None -> error "Malformed PDF: No root." 
- in - let document_catalog' = - Pdf.add_dict_entry document_catalog "/Metadata" (Pdf.Indirect objnum) - in - let rootnum = Pdf.addobj pdf document_catalog' in - let trailerdict = - Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect rootnum) - in - {pdf with - Pdf.trailerdict = trailerdict; - Pdf.root = rootnum; - Pdf.minor = - if keepversion then pdf.Pdf.minor else max 4 pdf.Pdf.minor} - -let set_metadata keepversion filename pdf = - let ch = open_in_bin filename in - let data = mkbytes (in_channel_length ch) in - for x = 0 to bytes_size data - 1 do - bset data x (input_byte ch) - done; - set_metadata_from_bytes keepversion data pdf - - - -(* \section{Remove metadata} *) -let remove_metadata pdf = - match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with - | None -> error "malformed file" - | Some root -> - let root' = Pdf.remove_dict_entry root "/Metadata" in - let rootnum = Pdf.addobj pdf root' in - {pdf with - Pdf.trailerdict = - Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect rootnum); - Pdf.root = - rootnum} - (* List bookmarks *) let output_string_of_target pdf fastrefnums x = match Pdfdest.pdfobject_of_destination x with @@ -584,9 +360,9 @@ let list_bookmarks ~json encoding range pdf output = replace q bs q (replace nl bs n (replace bs bs bs codepoints)) in match encoding with - | UTF8 -> Pdftext.utf8_of_codepoints escaped - | Stripped -> process_stripped escaped - | Raw -> s + | Cpdfmetadata.UTF8 -> Pdftext.utf8_of_codepoints escaped + | Cpdfmetadata.Stripped -> process_stripped escaped + | Cpdfmetadata.Raw -> s in let bookmarks = Pdfmarks.read_bookmarks pdf in let refnums = Pdf.page_reference_numbers pdf in @@ -719,27 +495,6 @@ let hasbox pdf page boxname = | _ -> false -(* Print metadata *) -let get_metadata pdf = - match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with - | None -> error "malformed file" - | Some root -> - match Pdf.lookup_direct pdf "/Metadata" root with - | Some ((Pdf.Stream _) as s) -> - 
Pdfcodec.decode_pdfstream pdf s; - begin match s with - | Pdf.Stream {contents = (_, Pdf.Got data)} -> Some data - | _ -> assert false - end - | _ -> None - -let print_metadata pdf = - match get_metadata pdf with - None -> () - | Some data -> - for x = 0 to bytes_size data - 1 do - Printf.printf "%c" (char_of_int (bget data x)) - done (* List fonts *) let list_font pdf page (name, dict) = @@ -1728,7 +1483,7 @@ let stamp relative_to_cropbox position topline midline fast scale_to_fit isover let merged = {merged with Pdf.saved_encryption = pdf.Pdf.saved_encryption} in - let merged = copy_id true pdf merged in + let merged = Cpdfmetadata.copy_id true pdf merged in let merged_pages = Pdfpage.pages_of_pagetree merged in let under_pages, over_page = all_but_last merged_pages, last merged_pages @@ -1819,7 +1574,7 @@ let stamp_as_xobject pdf range over = let merged = {merged with Pdf.saved_encryption = pdf.Pdf.saved_encryption} in - let merged = copy_id true pdf merged in + let merged = Cpdfmetadata.copy_id true pdf merged in let merged_pages = Pdfpage.pages_of_pagetree merged in let under_pages, over_page = all_but_last merged_pages, last merged_pages @@ -2185,7 +1940,7 @@ let scale_contents ?(fast=false) position scale pdf range = (* \section{List annotations} *) let get_annotation_string encoding pdf annot = match Pdf.lookup_direct pdf "/Contents" annot with - | Some (Pdf.String s) -> encode_output encoding s + | Some (Pdf.String s) -> Cpdfmetadata.encode_output encoding s | _ -> "" let print_annotation encoding pdf num s = @@ -2696,415 +2451,6 @@ let twoup fast pdf = let pdf = upright all (rotate_pdf ~-90 pdf all) in scale_to_fit_pdf ~fast Cpdfposition.Diagonal 1. 
(many (width, height) endpage) () pdf all -(* \section{Output info} *) -let get_info raw pdf = - let infodict = - match Pdf.lookup_direct pdf "/Info" pdf.Pdf.trailerdict with - | Some infodict -> infodict - | _ -> Pdf.Dictionary [] - in - let getstring name = - match Pdf.lookup_direct pdf name infodict with - | Some (Pdf.String s) -> - if raw then s else crude_de_unicode s - | Some (Pdf.Boolean false) -> "False" - | Some (Pdf.Boolean true) -> "True" - | _ -> if name = "/Trapped" then "False" else "" - in - getstring - -let get_info_utf8 pdf = - let infodict = - match Pdf.lookup_direct pdf "/Info" pdf.Pdf.trailerdict with - | Some infodict -> infodict - | _ -> Pdf.Dictionary [] - in - (function name -> - match Pdf.lookup_direct pdf name infodict with - | Some (Pdf.String s) -> Pdftext.utf8_of_pdfdocstring s - | Some (Pdf.Boolean false) -> "False" - | Some (Pdf.Boolean true) -> "True" - | _ -> if name = "/Trapped" then "False" else "") - -let getstring encoding pdf = - match encoding with - | Raw -> get_info true pdf - | Stripped -> get_info false pdf - | UTF8 -> get_info_utf8 pdf - -let output_info encoding pdf = - let getstring = getstring encoding pdf in - Printf.printf "Version: %i.%i\n" pdf.Pdf.major pdf.Pdf.minor; - Printf.printf "Pages: %i\n" (Pdfpage.endpage pdf); - Printf.printf "Title: %s\n" (getstring "/Title"); - Printf.printf "Author: %s\n" (getstring "/Author"); - Printf.printf "Subject: %s\n" (getstring "/Subject"); - Printf.printf "Keywords: %s\n" (getstring "/Keywords"); - Printf.printf "Creator: %s\n" (getstring "/Creator"); - Printf.printf "Producer: %s\n" (getstring "/Producer"); - Printf.printf "Created: %s\n" (getstring "/CreationDate"); - Printf.printf "Modified: %s\n" (getstring "/ModDate"); - Printf.printf "Trapped: %s\n" (getstring "/Trapped") - -type xmltree = - E of Cpdfxmlm.tag * xmltree list - | D of string - -let xmltree_of_bytes b = - let i = Cpdfxmlm.make_input (`String (0, string_of_bytes b)) in - let el tag childs = E (tag, childs) 
- and data d = D d in - Cpdfxmlm.input_doc_tree ~el ~data i - -let bytes_of_xmltree t = - let buf = Buffer.create 1024 in - let o = Cpdfxmlm.make_output (`Buffer buf) in - let frag = function - E (tag, childs) -> `El (tag, childs) - | D d -> `Data d - in - Cpdfxmlm.output_doc_tree frag o t; - bytes_of_string (Buffer.contents buf) - -let rec string_of_xmltree = function - D d -> - Printf.sprintf "DATA {%s}" d - | E (tag, trees) -> - Printf.sprintf "ELT (%s, %s)" - (string_of_tag tag) - (string_of_xmltrees trees) - -and string_of_tag ((n, n'), attributes) = - Printf.sprintf - "NAME |%s| |%s|, ATTRIBUTES {%s}" n n' - (string_of_attributes attributes) - -and string_of_attribute ((n, n'), str) = - Printf.sprintf "ATTRNAME |%s| |%s|, STR {%s}" n n' str - -and string_of_attributes attrs = - fold_left - (fun a b -> a ^ " " ^ b) "" (map string_of_attribute attrs) - -and string_of_xmltrees trees = - fold_left - (fun a b -> a ^ " " ^ b) "" (map string_of_xmltree trees) - -let adobe = "http://ns.adobe.com/pdf/1.3/" - -let xmp = "http://ns.adobe.com/xap/1.0/" - -let dc = "http://purl.org/dc/elements/1.1/" - -let rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" - -let combine_with_spaces strs = - String.trim - (fold_left (fun x y -> x ^ (if x <> "" then ", " else "") ^ y) "" strs) - -(* Collect all
  • elements inside a seq, bag, or alt. Combine with commas. If -none found, return empty string instead. *) -let collect_list_items = function - E (((n, n'), _), elts) when - n = rdf && (n' = "Alt" || n' = "Seq" || n' = "Bag") - -> - combine_with_spaces - (option_map - (function - E (((n, n'), _), [D d]) when n = rdf && n' = "li" -> - Some d - | _ -> None) - elts) - | _ -> "" - -let collect_list_items_all all = - match keep (function E _ -> true | _ -> false) all with - h::_ -> Some (collect_list_items h) - | [] -> None - -let rec get_data_for namespace name = function - D _ -> None - | E (((n, n'), _), [D d]) when n = namespace && n' = name -> - Some d - | E (((n, n'), _), e) when n = namespace && n' = name -> - collect_list_items_all e - | E (_, l) -> - match option_map (get_data_for namespace name) l with - x :: _ -> Some x - | _ -> None - -let output_xmp_info encoding pdf = - let print_out tree title namespace name = - match get_data_for namespace name tree with - None -> () - | Some data -> - Printf.printf "%s: " title; - print_endline data - in - match get_metadata pdf with - None -> () - | Some metadata -> - try - let dtd, tree = xmltree_of_bytes metadata in - print_out tree "XMP pdf:Keywords" adobe "Keywords"; - print_out tree "XMP pdf:Producer" adobe "Producer"; - print_out tree "XMP pdf:Trapped" adobe "Trapped"; - print_out tree "XMP pdf:Title" adobe "Title"; - print_out tree "XMP pdf:Creator" adobe "Creator"; - print_out tree "XMP pdf:Subject" adobe "Subject"; - print_out tree "XMP pdf:Author" adobe "Author"; - print_out tree "XMP pdf:CreationDate" adobe "CreationDate"; - print_out tree "XMP pdf:ModDate" adobe "ModDate"; - print_out tree "XMP xmp:CreateDate" xmp "CreateDate"; - print_out tree "XMP xmp:CreatorTool" xmp "CreatorTool"; - print_out tree "XMP xmp:MetadataDate" xmp "MetadataDate"; - print_out tree "XMP xmp:ModifyDate" xmp "ModifyDate"; - print_out tree "XMP dc:title" dc "title"; - print_out tree "XMP dc:creator" dc "creator"; - print_out tree 
"XMP dc:subject" dc "subject"; - print_out tree "XMP dc:description" dc "description" - with - _ -> () - -(* Get XMP info equivalent of an old metadata field *) -let check = function - "/Title" -> [(adobe, "Title"); (dc, "title")] -| "/Author" -> [(adobe, "Author"); (dc, "creator")] -| "/Subject" -> [(adobe, "Subject"); (dc, "subject")] -| "/Keywords" -> [(adobe, "Keywords")] -| "/Creator" -> [(adobe, "Creator"); (xmp, "CreatorTool")] -| "/Producer" -> [(adobe, "Producer")] -| "/CreationDate" -> [(adobe, "CreationDate"); (xmp, "CreateDate")] -| "/ModDate" -> [(adobe, "ModificationDate"); (xmp, "ModifyDate")] -| _ -> failwith "Cpdf.check_name not known" - -let get_xmp_info pdf name = - let tocheck = check name in - match get_metadata pdf with - None -> "" - | Some metadata -> - try - let _, tree = xmltree_of_bytes metadata in - let results = map (fun (kind, key) -> match get_data_for kind key tree with Some x -> x | None -> "") tocheck in - match lose (eq "") results with - x::_ -> x - | [] -> "" - with - _ -> "" - -(* Set XMP info *) -let rec set_xml_field kind fieldname value = function - D data -> D data -| E (((n, n'), m), _ (*[D _]*)) when n = kind && n' = fieldname -> (* Replace anything inside, including nothing i.e *) - E (((n, n'), m), [D value]) -| E (x, ts) -> E (x, map (set_xml_field kind fieldname value) ts) - -let set_pdf_info_xml kind fieldname value xmldata pdf = - let dtd, tree = xmltree_of_bytes xmldata in - let str = - match value with - Pdf.String s -> s - | Pdf.Boolean true -> "True" - | Pdf.Boolean false -> "False" - | _ -> failwith "set_pdf_info_xml: not a string" - in - let newtree = set_xml_field kind fieldname str tree in - bytes_of_xmltree (dtd, newtree) - -let set_pdf_info_xml_many changes value xmldata pdf = - let xmldata = ref xmldata in - iter - (fun (kind, fieldname) -> - xmldata := set_pdf_info_xml kind fieldname value !xmldata pdf) - changes; - !xmldata - - -(* \section{Set an entry in the /Info dictionary} *) - -(* We must parse 
the date to get its components, then use strftime to build the - * new string in XMP format *) - -type date = - {mutable year : int; - mutable month : int; (* 1 - 12 *) - mutable day : int; (* 1 - 31 *) - mutable hour : int; (* 0 - 23 *) - mutable minute : int; (* 0 - 59 *) - mutable second : int; (* 0 - 59 *) - mutable ut_relationship : int; (* -1, 0, +1 *) - mutable offset_hours : int; (* 0 - 59 *) - mutable offset_minutes : int (* 0 - 59 *)} - -let default_date () = - {year = 0; - month = 1; - day = 1; - hour = 0; - minute = 0; - second = 0; - ut_relationship = 0; - offset_hours = 0; - offset_minutes = 0} - -(* XMP date format is YYYY-MM-DDThh:mm:ssTZD *) -let make_xmp_date_from_components d = - let tzd = - if d.ut_relationship = 0 && d.offset_hours = 0 && d.offset_minutes = 0 then "Z" else - (if d.ut_relationship >=0 then "+" else "-") ^ - Printf.sprintf "%02i" d.offset_hours ^ - ":" ^ - Printf.sprintf "%02i" d.offset_minutes - in - Cpdfstrftime.strftime - ~time:{Cpdfstrftime._tm_sec = d.second; - Cpdfstrftime._tm_min = d.minute; - Cpdfstrftime._tm_hour = d.hour; - Cpdfstrftime._tm_mday = d.day; - Cpdfstrftime._tm_mon = d.month - 1; - Cpdfstrftime._tm_year = d.year - 1900; - Cpdfstrftime._tm_wday = 0; - Cpdfstrftime._tm_yday = 0; - Cpdfstrftime._tm_isdst = false} - "%Y-%m-%dT%H:%M:%S" - ^ - tzd - -let xmp_date date = - let d = default_date () in - try - match explode date with - 'D'::':'::r -> - begin match r with - y1::y2::y3::y4::r -> - d.year <- int_of_string (implode [y1; y2; y3; y4]); - begin match r with - m1::m2::r -> - d.month <- int_of_string (implode [m1; m2]); - begin match r with - d1::d2::r -> - d.day <- int_of_string (implode [d1; d2]); - begin match r with - h1::h2::r -> - d.hour <- int_of_string (implode [h1; h2]); - begin match r with - m1::m2::r -> - d.minute <- int_of_string (implode [m1; m2]); - begin match r with - s1::s2::r -> - d.second <- int_of_string (implode [s1; s2]); - begin match r with - o::r -> - d.ut_relationship <- - if o = '+' 
then 1 else - if o = '-' then -1 else - 0; - begin match r with - h1::h2::'\''::r -> - d.offset_hours <- int_of_string (implode [h1; h2]); - begin match r with - m1::m2::_ -> - d.offset_minutes <- int_of_string (implode [m1; m2]); - raise Exit - | _ -> raise Exit - end - | _ -> raise Exit - end - | _ -> raise Exit - end - | _ -> raise Exit - end - | _ -> raise Exit - end - | _ -> raise Exit - end - | _ -> raise Exit - end - | _ -> raise Exit - end - | _ -> - Printf.eprintf "xmp_date: Malformed date string (no year): %s\n%!" date; - make_xmp_date_from_components d - end - | _ -> - Printf.eprintf "xmp_date: Malformed date string (no prefix): %s\n%!" date; - make_xmp_date_from_components d - with - Exit -> make_xmp_date_from_components d - -let set_pdf_info ?(xmp_also=false) ?(xmp_just_set=false) (key, value, version) pdf = - let infodict = - match Pdf.lookup_direct pdf "/Info" pdf.Pdf.trailerdict with - | Some d -> d - | None -> Pdf.Dictionary [] - in - let infodict' = Pdf.add_dict_entry infodict key value in - let objnum = Pdf.addobj pdf infodict' in - if not xmp_just_set then - begin - pdf.Pdf.trailerdict <- - Pdf.add_dict_entry pdf.Pdf.trailerdict "/Info" (Pdf.Indirect objnum); - pdf.Pdf.minor <- - max pdf.Pdf.minor version - end; - if xmp_also then - begin match get_metadata pdf with - None -> pdf - | Some xmldata -> - let xmp_date = function Pdf.String s -> Pdf.String (xmp_date s) | _ -> failwith "xmp_date not a string" in - let changes, value = - match key with - | "/Producer" -> [(adobe, "Producer")], value - | "/Creator" -> [(adobe, "Creator"); (xmp, "CreatorTool"); (dc, "creator")], value - | "/Author" -> [(adobe, "Author")], value - | "/Title" -> [(adobe, "Title"); (dc, "title")], value - | "/Subject" -> [(adobe, "Subject"); (dc, "subject")], value - | "/Keywords" -> [(adobe, "Keywords")], value - | "/CreationDate" -> [(adobe, "CreationDate"); (xmp, "CreateDate")], xmp_date value - | "/ModDate" -> [(adobe, "ModDate"); (xmp, "ModifyDate")], xmp_date value - 
| "/Trapped" -> [(adobe, "Trapped")], value - | _ -> failwith "Unknown call to set_pdf_info" - in - set_metadata_from_bytes - true - (set_pdf_info_xml_many changes value xmldata pdf) - pdf - end - else - pdf - -(* Set metadata date *) -let set_metadata_date pdf date = - match get_metadata pdf with - None -> pdf - | Some xmldata -> - let changes= [(xmp, "MetadataDate")] in - let value = match date with "now" -> xmp_date (expand_date "now") | x -> x in - set_metadata_from_bytes - true - (set_pdf_info_xml_many changes (Pdf.String value) xmldata pdf) - pdf - -let replacements pdf = - let info = get_info_utf8 pdf in - [("CREATEDATE", xmp_date (let i = info "/CreationDate" in if i = "" then expand_date "now" else i)); - ("MODDATE", xmp_date (let i = info "/ModDate" in if i = "" then expand_date "now" else i)); - ("PRODUCER", info "/Producer"); - ("CREATOR", info "/Creator"); - ("TITLE", info "/Title"); - ("SUBJECT", info "/Subject"); - ("AUTHOR", info "/Author"); - ("KEYWORDS", info "/Keywords"); - ("TRAPPED", info "/Trapped"); - ("METADATADATE", xmp_date (expand_date "now"))] - -let create_metadata pdf = - let xmp = ref xmp_template in - iter - (fun (s, r) -> xmp := string_replace_all s r !xmp) - (replacements pdf); - set_metadata_from_bytes false (bytes_of_string !xmp) pdf - (* \section{Blacken text} *) (* @@ -3497,9 +2843,6 @@ let draft onlyremove boxes range pdf = pagenums; Pdfpage.change_pages true !pdf (rev !pages') -let set_version v pdf = - pdf.Pdf.minor <- v - let blank_document width height pages = let pdf_pages = map (fun () -> Pdfpage.blankpage (Pdfpaper.make Pdfunits.PdfPoint width height)) (many () pages) @@ -4022,7 +3365,7 @@ let list_spot_colours pdf = let add_bookmark_title filename use_title pdf = let title = if use_title then - match get_info_utf8 pdf "/Title", get_xmp_info pdf "/Title" with + match Cpdfmetadata.get_info_utf8 pdf "/Title", Cpdfmetadata.get_xmp_info pdf "/Title" with "", x | x, "" | _, x -> x else Filename.basename filename @@ -4062,13 
+3405,13 @@ let create_pdf pages pagesize = (* Remove characters which might not make good filenames. *) let remove_unsafe_characters encoding s = - if encoding = Raw then s else + if encoding = Cpdfmetadata.Raw then s else let chars = lose (function x -> match x with '/' | '?' | '<' | '>' | '\\' | ':' | '*' | '|' | '\"' | '^' | '+' | '=' -> true - | x when int_of_char x < 32 || (int_of_char x > 126 && encoding <> Stripped) -> true + | x when int_of_char x < 32 || (int_of_char x > 126 && encoding <> Cpdfmetadata.Stripped) -> true | _ -> false) (explode s) in diff --git a/cpdf.mli b/cpdf.mli index c02a444..5326cac 100644 --- a/cpdf.mli +++ b/cpdf.mli @@ -1,13 +1,6 @@ (** Coherent PDF Tools Core Routines *) open Pdfutil -(** {2 Types and Exceptions} *) - -(** Possible output encodings for some function. [Raw] does no processing at -all - the PDF string is output as-is. [UTF8] converts loslessly to UTF8. -[Stripped] extracts the unicode codepoints and returns only those which -correspond to 7 bit ASCII. *) -type encoding = Raw | UTF8 | Stripped type color = Grey of float @@ -38,45 +31,6 @@ val map_pages : (int -> Pdfpage.t -> 'a) -> Pdf.t -> int list -> 'a list val copy_cropbox_to_mediabox : Pdf.t -> int list -> Pdf.t -(** {2 Metadata and settings} *) - -(** [copy_id keepversion copyfrom copyto] copies the ID, if any, from -[copyfrom] to [copyto]. If [keepversion] is true, the PDF version of [copyto] -won't be affected. 
*) -val copy_id : bool -> Pdf.t -> Pdf.t -> Pdf.t - -(** [set_pdf_info (key, value, version)] sets the entry [key] in the /Info directory, updating -the PDF minor version to [version].*) -val set_pdf_info : ?xmp_also:bool -> ?xmp_just_set:bool -> (string * Pdf.pdfobject * int) -> Pdf.t -> Pdf.t - -val get_xmp_info : Pdf.t -> string -> string - -(** [set_pdf_info (key, value, version)] sets the entry [key] in the -/ViewerPreferences directory, updating the PDF minor version to [version].*) -val set_viewer_preference : (string * Pdf.pdfobject * int) -> Pdf.t -> Pdf.t - -(** Set the page layout to the given name (sans slash) e.g SinglePage *) -val set_page_layout : Pdf.t -> string -> Pdf.t - -(** Set the page layout to the given name (sans slash) e.g SinglePage *) -val set_page_mode : Pdf.t -> string -> Pdf.t - -(** Set the open action. If the boolean is true, /Fit will be used, otherwise /XYZ *) -val set_open_action : Pdf.t -> bool -> int -> Pdf.t - -(** Set the PDF version number *) -val set_version : int -> Pdf.t -> unit - -(** Given a PDF, returns a function which can lookup a given dictionary entry -from the /Info dictionary, returning it as a UTF8 string *) -val get_info_utf8 : Pdf.t -> string -> string - -(** Output to standard output general information about a PDF. *) -val output_info : encoding -> Pdf.t -> unit - -(** Output to standard output information from any XMP metadata stream in a PDF. *) -val output_xmp_info : encoding -> Pdf.t -> unit - (** {2 Bookmarks} *) (** [parse_bookmark_file verify pdf input] parses the bookmark file in [input]. 
@@ -90,30 +44,9 @@ val add_bookmarks : json:bool -> bool -> Pdfio.input -> Pdf.t -> Pdf.t (** [list_bookmarks encoding range pdf output] lists the bookmarks to the given output in the format specified in cpdfmanual.pdf *) -val list_bookmarks : json:bool -> encoding -> int list -> Pdf.t -> Pdfio.output -> unit +val list_bookmarks : json:bool -> Cpdfmetadata.encoding -> int list -> Pdf.t -> Pdfio.output -> unit -(** {2 XML Metadata} *) -(** [set_metadata keepversion filename pdf] sets the XML metadata of a PDF to the contents of [filename]. If [keepversion] is true, the PDF version will not be altered. *) -val set_metadata : bool -> string -> Pdf.t -> Pdf.t - -(** The same, but the content comes from [bytes]. *) -val set_metadata_from_bytes : bool -> Pdfio.bytes -> Pdf.t -> Pdf.t - -(** Remove the metadata from a file *) -val remove_metadata : Pdf.t -> Pdf.t - -(** Extract metadata to a [Pdfio.bytes] *) -val get_metadata : Pdf.t -> Pdfio.bytes option - -(** Print metadate to stdout *) -val print_metadata : Pdf.t -> unit - -(** Set the metadata date *) -val set_metadata_date : Pdf.t -> string -> Pdf.t - -(** Create XMP metadata from scratch *) -val create_metadata : Pdf.t -> Pdf.t (** {2 Stamping} *) @@ -145,9 +78,6 @@ val list_fonts : Pdf.t -> int list -> (int * string * string * string * string) (** {2 Adding text} *) -(** Expand the string "now" to a PDF date string, ignoring any other string *) -val expand_date : string -> string - (** Justification of multiline text *) type justification = | LeftJustify @@ -281,10 +211,10 @@ val show_boxes : ?fast:bool -> Pdf.t -> int list -> Pdf.t (** {2 Annotations} *) (** List the annotations to standard output in a given encoding. See cpdfmanual.pdf for the format details. 
*) -val list_annotations : json:bool -> encoding -> Pdf.t -> unit +val list_annotations : json:bool -> Cpdfmetadata.encoding -> Pdf.t -> unit (** Return the annotations as a (pagenumber, content) list *) -val get_annotations : encoding -> Pdf.t -> (int * string) list +val get_annotations : Cpdfmetadata.encoding -> Pdf.t -> (int * string) list (** Copy the annotations on a given set of pages from a to b. b is returned. *) val copy_annotations : int list -> Pdf.t -> Pdf.t -> Pdf.t @@ -375,12 +305,12 @@ val bookmarks_open_to_level : int -> Pdf.t -> Pdf.t val create_pdf : int -> Pdfpaper.t -> Pdf.t -val name_of_spec : encoding -> +val name_of_spec : Cpdfmetadata.encoding -> Pdfmarks.t list -> Pdf.t -> int -> string -> int -> string -> int -> int -> string val extract_images : string -> string -> - encoding -> bool -> bool -> Pdf.t -> int list -> string -> unit + Cpdfmetadata.encoding -> bool -> bool -> Pdf.t -> int list -> string -> unit diff --git a/cpdfcommand.ml b/cpdfcommand.ml index 3a3fce4..d1c5a90 100644 --- a/cpdfcommand.ml +++ b/cpdfcommand.ml @@ -417,7 +417,7 @@ type args = mutable retain_numbering : bool; mutable remove_duplicate_fonts : bool; mutable remove_duplicate_streams : bool; - mutable encoding : Cpdf.encoding; + mutable encoding : Cpdfmetadata.encoding; mutable scale : float; mutable copyfontpage : int; mutable copyfontname : string option; @@ -536,7 +536,7 @@ let args = retain_numbering = false; remove_duplicate_fonts = false; remove_duplicate_streams = false; - encoding = Cpdf.Stripped; + encoding = Cpdfmetadata.Stripped; scale = 1.; copyfontpage = 1; copyfontname = None; @@ -655,7 +655,7 @@ let reset_arguments () = args.retain_numbering <- false; args.remove_duplicate_fonts <- false; args.remove_duplicate_streams <- false; - args.encoding <- Cpdf.Stripped; + args.encoding <- Cpdfmetadata.Stripped; args.scale <- 1.; args.copyfontpage <- 1; args.copyfontname <- None; @@ -1779,13 +1779,13 @@ and specs = Arg.Unit setrecrypt, " Keep this file's 
encryption when writing"); ("-raw", - Arg.Unit (setencoding Cpdf.Raw), + Arg.Unit (setencoding Cpdfmetadata.Raw), " Do not process text"); ("-stripped", - Arg.Unit (setencoding Cpdf.Stripped), + Arg.Unit (setencoding Cpdfmetadata.Stripped), " Process text by simple stripping to ASCII"); ("-utf8", - Arg.Unit (setencoding Cpdf.UTF8), + Arg.Unit (setencoding Cpdfmetadata.UTF8), " Process text by conversion to UTF8 Unicode"); ("-fast", Arg.Unit setfast, @@ -2724,15 +2724,15 @@ let unescape_octals s = implode (unescape_octals [] (explode s)) let process s = - if args.encoding <> Cpdf.Raw + if args.encoding <> Cpdfmetadata.Raw then Pdftext.pdfdocstring_of_utf8 s else unescape_octals s let set_producer s pdf = - ignore (Cpdf.set_pdf_info ("/Producer", Pdf.String (process s), 0) pdf) + ignore (Cpdfmetadata.set_pdf_info ("/Producer", Pdf.String (process s), 0) pdf) let set_creator s pdf = - ignore (Cpdf.set_pdf_info ("/Creator", Pdf.String (process s), 0) pdf) + ignore (Cpdfmetadata.set_pdf_info ("/Creator", Pdf.String (process s), 0) pdf) let really_write_pdf ?(encryption = None) ?(is_decompress=false) mk_id pdf outname = if args.producer <> None then set_producer (unopt args.producer) pdf; @@ -3146,8 +3146,8 @@ let go () = if inname <> "" then Printf.printf "Linearized: %b\n" (Pdfread.is_linearized (Pdfio.input_of_channel (open_in_bin inname))); let pdf = decrypt_if_necessary input (Some Info) pdf in - Cpdf.output_info args.encoding pdf; - Cpdf.output_xmp_info args.encoding pdf + Cpdfmetadata.output_info args.encoding pdf; + Cpdfmetadata.output_xmp_info args.encoding pdf | Some PageInfo -> begin match args.inputs, args.out with | (_, pagespec, _, _, _, _)::_, _ -> @@ -3157,7 +3157,7 @@ let go () = | _ -> error "list-bookmarks: bad command line" end | Some Metadata -> - Cpdf.print_metadata (get_single_pdf (Some Metadata) true) + Cpdfmetadata.print_metadata (get_single_pdf (Some Metadata) true) | Some Fonts -> begin match args.inputs, args.out with | (_, pagespec, _, _, _, 
_)::_, _ -> @@ -3357,14 +3357,14 @@ let go () = | SetCreate _ | SetModify _ | SetCreator _ | SetProducer _ | SetTrapped | SetUntrapped) as op) -> let key, value, version = - let f s = if args.encoding <> Cpdf.Raw then Pdftext.pdfdocstring_of_utf8 s else unescape_octals s in + let f s = if args.encoding <> Cpdfmetadata.Raw then Pdftext.pdfdocstring_of_utf8 s else unescape_octals s in match op with | SetAuthor s -> "/Author", Pdf.String (f s), 0 | SetTitle s -> "/Title", Pdf.String (f s), 1 | SetSubject s -> "/Subject", Pdf.String (f s), 1 | SetKeywords s -> "/Keywords", Pdf.String (f s), 1 - | SetCreate s -> "/CreationDate", Pdf.String (Cpdf.expand_date s), 0 - | SetModify s -> "/ModDate", Pdf.String (Cpdf.expand_date s), 0 + | SetCreate s -> "/CreationDate", Pdf.String (Cpdfmetadata.expand_date s), 0 + | SetModify s -> "/ModDate", Pdf.String (Cpdfmetadata.expand_date s), 0 | SetCreator s -> "/Creator", Pdf.String (f s), 0 | SetProducer s -> "/Producer", Pdf.String (f s), 0 | SetTrapped -> "/Trapped", Pdf.Boolean true, 3 @@ -3374,12 +3374,12 @@ let go () = let pdf = get_single_pdf args.op false in let version = if args.keepversion then pdf.Pdf.minor else version in write_pdf false - (Cpdf.set_pdf_info + (Cpdfmetadata.set_pdf_info ~xmp_also:args.alsosetxml ~xmp_just_set:args.justsetxml (key, value, version) pdf) | Some (SetMetadataDate date) -> - write_pdf false (Cpdf.set_metadata_date (get_single_pdf args.op false) date) + write_pdf false (Cpdfmetadata.set_metadata_date (get_single_pdf args.op false) date) | Some ((HideToolbar _ | HideMenubar _ | HideWindowUI _ | FitWindow _ | CenterWindow _ | DisplayDocTitle _) as op) -> begin match args.out with @@ -3396,20 +3396,20 @@ let go () = in let pdf = get_single_pdf args.op false in let version = if args.keepversion then pdf.Pdf.minor else version in - write_pdf false (Cpdf.set_viewer_preference (key, value, version) pdf) + write_pdf false (Cpdfmetadata.set_viewer_preference (key, value, version) pdf) end | Some 
(OpenAtPage str) -> let pdf = get_single_pdf args.op false in let range = parse_pagespec_allow_empty pdf str in let n = match range with [x] -> x | _ -> error "open_at_page: range does not specify single page" in - write_pdf false (Cpdf.set_open_action pdf false n) + write_pdf false (Cpdfmetadata.set_open_action pdf false n) | Some (OpenAtPageFit str) -> let pdf = get_single_pdf args.op false in let range = parse_pagespec_allow_empty pdf str in let n = match range with [x] -> x | _ -> error "open_at_page: range does not specify single page" in - write_pdf false (Cpdf.set_open_action pdf true n) + write_pdf false (Cpdfmetadata.set_open_action pdf true n) | Some (SetMetadata metadata_file) -> - write_pdf false (Cpdf.set_metadata args.keepversion metadata_file (get_single_pdf args.op false)) + write_pdf false (Cpdfmetadata.set_metadata args.keepversion metadata_file (get_single_pdf args.op false)) | Some (SetVersion v) -> let pdf = get_single_pdf args.op false in let pdf = @@ -3419,9 +3419,9 @@ let go () = in write_pdf false pdf | Some (SetPageLayout s) -> - write_pdf false (Cpdf.set_page_layout (get_single_pdf args.op false) s) + write_pdf false (Cpdfmetadata.set_page_layout (get_single_pdf args.op false) s) | Some (SetPageMode s) -> - write_pdf false (Cpdf.set_page_mode (get_single_pdf args.op false) s) + write_pdf false (Cpdfmetadata.set_page_mode (get_single_pdf args.op false) s) | Some Split -> begin match args.inputs, args.out with | [(f, ranges, _, _, _, _)], File output_spec -> @@ -3514,7 +3514,7 @@ let go () = begin match args.inputs with | [(k, _, u, o, _, _) as input] -> let pdf = - Cpdf.copy_id + Cpdfmetadata.copy_id args.keepversion (pdfread_pdf_of_file (optstring u) (optstring o) getfrom) (get_pdf_from_input_kind input args.op k) @@ -3765,7 +3765,7 @@ let go () = args.recrypt <- false; write_pdf false (get_single_pdf args.op false) | Some RemoveMetadata -> - write_pdf false (Cpdf.remove_metadata (get_single_pdf args.op false)) + write_pdf false 
(Cpdfmetadata.remove_metadata (get_single_pdf args.op false)) | Some ExtractImages -> let output_spec = begin match args.out with @@ -3838,7 +3838,7 @@ let go () = write_pdf false (Cpdf.remove_clipping pdf range) | Some CreateMetadata -> let pdf = get_single_pdf args.op false in - write_pdf false (Cpdf.create_metadata pdf) + write_pdf false (Cpdfmetadata.create_metadata pdf) | Some EmbedMissingFonts -> let fi = match args.inputs with diff --git a/cpdfmetadata.ml b/cpdfmetadata.ml new file mode 100644 index 0000000..220c361 --- /dev/null +++ b/cpdfmetadata.ml @@ -0,0 +1,661 @@ +open Pdfutil +open Pdfio +open Cpdferror + +type encoding = + | Raw + | UTF8 + | Stripped + +(* Just strip everything which isn't 7 bit ASCII *) +let crude_de_unicode s = + implode (map char_of_int (lose (fun x -> x > 127) (Pdftext.codepoints_of_pdfdocstring s))) + +let encode_output enc s = + match enc with + | Raw -> s + | UTF8 -> Pdftext.utf8_of_pdfdocstring s + | Stripped -> crude_de_unicode s + +let xmp_template = +{| + + + + + CREATEDATE + MODDATE + PRODUCER + CREATOR + TITLE + SUBJECT + AUTHOR + KEYWORDS + TRAPPED + + + + CREATEDATE + CREATOR + MODDATE + METADATADATE + + + + TITLE + + + + +|} + +(* \section{Set or replace metadata} *) +let set_metadata_from_bytes keepversion data pdf = + let metadata_stream = + Pdf.Stream + {contents = + (Pdf.Dictionary + ["/Length", Pdf.Integer (bytes_size data); + "/Type", Pdf.Name "/Metadata"; + "/Subtype", Pdf.Name "/XML"], + Pdf.Got data)} + in + let objnum = Pdf.addobj pdf metadata_stream in + let document_catalog = + match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with + | Some s -> s + | None -> error "Malformed PDF: No root." 
+ in + let document_catalog' = + Pdf.add_dict_entry document_catalog "/Metadata" (Pdf.Indirect objnum) + in + let rootnum = Pdf.addobj pdf document_catalog' in + let trailerdict = + Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect rootnum) + in + {pdf with + Pdf.trailerdict = trailerdict; + Pdf.root = rootnum; + Pdf.minor = + if keepversion then pdf.Pdf.minor else max 4 pdf.Pdf.minor} + +let set_metadata keepversion filename pdf = + let ch = open_in_bin filename in + let data = mkbytes (in_channel_length ch) in + for x = 0 to bytes_size data - 1 do + bset data x (input_byte ch) + done; + set_metadata_from_bytes keepversion data pdf + + + +(* \section{Remove metadata} *) +let remove_metadata pdf = + match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with + | None -> error "malformed file" + | Some root -> + let root' = Pdf.remove_dict_entry root "/Metadata" in + let rootnum = Pdf.addobj pdf root' in + {pdf with + Pdf.trailerdict = + Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect rootnum); + Pdf.root = + rootnum} +(* Print metadata *) +let get_metadata pdf = + match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with + | None -> error "malformed file" + | Some root -> + match Pdf.lookup_direct pdf "/Metadata" root with + | Some ((Pdf.Stream _) as s) -> + Pdfcodec.decode_pdfstream pdf s; + begin match s with + | Pdf.Stream {contents = (_, Pdf.Got data)} -> Some data + | _ -> assert false + end + | _ -> None + +let print_metadata pdf = + match get_metadata pdf with + None -> () + | Some data -> + for x = 0 to bytes_size data - 1 do + Printf.printf "%c" (char_of_int (bget data x)) + done + + +let get_info raw pdf = + let infodict = + match Pdf.lookup_direct pdf "/Info" pdf.Pdf.trailerdict with + | Some infodict -> infodict + | _ -> Pdf.Dictionary [] + in + let getstring name = + match Pdf.lookup_direct pdf name infodict with + | Some (Pdf.String s) -> + if raw then s else crude_de_unicode s + | Some (Pdf.Boolean false) -> "False" + | 
Some (Pdf.Boolean true) -> "True" + | _ -> if name = "/Trapped" then "False" else "" + in + getstring + +let get_info_utf8 pdf = + let infodict = + match Pdf.lookup_direct pdf "/Info" pdf.Pdf.trailerdict with + | Some infodict -> infodict + | _ -> Pdf.Dictionary [] + in + (function name -> + match Pdf.lookup_direct pdf name infodict with + | Some (Pdf.String s) -> Pdftext.utf8_of_pdfdocstring s + | Some (Pdf.Boolean false) -> "False" + | Some (Pdf.Boolean true) -> "True" + | _ -> if name = "/Trapped" then "False" else "") + +let getstring encoding pdf = + match encoding with + | Raw -> get_info true pdf + | Stripped -> get_info false pdf + | UTF8 -> get_info_utf8 pdf + +let output_info encoding pdf = + let getstring = getstring encoding pdf in + Printf.printf "Version: %i.%i\n" pdf.Pdf.major pdf.Pdf.minor; + Printf.printf "Pages: %i\n" (Pdfpage.endpage pdf); + Printf.printf "Title: %s\n" (getstring "/Title"); + Printf.printf "Author: %s\n" (getstring "/Author"); + Printf.printf "Subject: %s\n" (getstring "/Subject"); + Printf.printf "Keywords: %s\n" (getstring "/Keywords"); + Printf.printf "Creator: %s\n" (getstring "/Creator"); + Printf.printf "Producer: %s\n" (getstring "/Producer"); + Printf.printf "Created: %s\n" (getstring "/CreationDate"); + Printf.printf "Modified: %s\n" (getstring "/ModDate"); + Printf.printf "Trapped: %s\n" (getstring "/Trapped") + +type xmltree = + E of Cpdfxmlm.tag * xmltree list + | D of string + +let xmltree_of_bytes b = + let i = Cpdfxmlm.make_input (`String (0, string_of_bytes b)) in + let el tag childs = E (tag, childs) + and data d = D d in + Cpdfxmlm.input_doc_tree ~el ~data i + +let bytes_of_xmltree t = + let buf = Buffer.create 1024 in + let o = Cpdfxmlm.make_output (`Buffer buf) in + let frag = function + E (tag, childs) -> `El (tag, childs) + | D d -> `Data d + in + Cpdfxmlm.output_doc_tree frag o t; + bytes_of_string (Buffer.contents buf) + +let rec string_of_xmltree = function + D d -> + Printf.sprintf "DATA {%s}" d + | E 
(tag, trees) -> + Printf.sprintf "ELT (%s, %s)" + (string_of_tag tag) + (string_of_xmltrees trees) + +and string_of_tag ((n, n'), attributes) = + Printf.sprintf + "NAME |%s| |%s|, ATTRIBUTES {%s}" n n' + (string_of_attributes attributes) + +and string_of_attribute ((n, n'), str) = + Printf.sprintf "ATTRNAME |%s| |%s|, STR {%s}" n n' str + +and string_of_attributes attrs = + fold_left + (fun a b -> a ^ " " ^ b) "" (map string_of_attribute attrs) + +and string_of_xmltrees trees = + fold_left + (fun a b -> a ^ " " ^ b) "" (map string_of_xmltree trees) + +let adobe = "http://ns.adobe.com/pdf/1.3/" + +let xmp = "http://ns.adobe.com/xap/1.0/" + +let dc = "http://purl.org/dc/elements/1.1/" + +let rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#" + +let combine_with_spaces strs = + String.trim + (fold_left (fun x y -> x ^ (if x <> "" then ", " else "") ^ y) "" strs) + +(* Collect all <li> elements inside a seq, bag, or alt. Combine with commas. If +none found, return empty string instead. *) +let collect_list_items = function + E (((n, n'), _), elts) when + n = rdf && (n' = "Alt" || n' = "Seq" || n' = "Bag") + -> + combine_with_spaces + (option_map + (function + E (((n, n'), _), [D d]) when n = rdf && n' = "li" -> + Some d + | _ -> None) + elts) + | _ -> "" + +let collect_list_items_all all = + match keep (function E _ -> true | _ -> false) all with + h::_ -> Some (collect_list_items h) + | [] -> None + +let rec get_data_for namespace name = function + D _ -> None + | E (((n, n'), _), [D d]) when n = namespace && n' = name -> + Some d + | E (((n, n'), _), e) when n = namespace && n' = name -> + collect_list_items_all e + | E (_, l) -> + match option_map (get_data_for namespace name) l with + x :: _ -> Some x + | _ -> None + +let output_xmp_info encoding pdf = + let print_out tree title namespace name = + match get_data_for namespace name tree with + None -> () + | Some data -> + Printf.printf "%s: " title; + print_endline data + in + match get_metadata pdf with + None -> () + | Some metadata -> + try + let dtd, tree = xmltree_of_bytes metadata in + print_out tree "XMP pdf:Keywords" adobe "Keywords"; + print_out tree "XMP pdf:Producer" adobe "Producer"; + print_out tree "XMP pdf:Trapped" adobe "Trapped"; + print_out tree "XMP pdf:Title" adobe "Title"; + print_out tree "XMP pdf:Creator" adobe "Creator"; + print_out tree "XMP pdf:Subject" adobe "Subject"; + print_out tree "XMP pdf:Author" adobe "Author"; + print_out tree "XMP pdf:CreationDate" adobe "CreationDate"; + print_out tree "XMP pdf:ModDate" adobe "ModDate"; + print_out tree "XMP xmp:CreateDate" xmp "CreateDate"; + print_out tree "XMP xmp:CreatorTool" xmp "CreatorTool"; + print_out tree "XMP xmp:MetadataDate" xmp "MetadataDate"; + print_out tree "XMP xmp:ModifyDate" xmp "ModifyDate"; + print_out tree "XMP dc:title" dc "title"; + print_out tree "XMP dc:creator" dc "creator"; + print_out tree 
"XMP dc:subject" dc "subject"; + print_out tree "XMP dc:description" dc "description" + with + _ -> () + +(* Get XMP info equivalent of an old metadata field *) +let check = function + "/Title" -> [(adobe, "Title"); (dc, "title")] +| "/Author" -> [(adobe, "Author"); (dc, "creator")] +| "/Subject" -> [(adobe, "Subject"); (dc, "subject")] +| "/Keywords" -> [(adobe, "Keywords")] +| "/Creator" -> [(adobe, "Creator"); (xmp, "CreatorTool")] +| "/Producer" -> [(adobe, "Producer")] +| "/CreationDate" -> [(adobe, "CreationDate"); (xmp, "CreateDate")] +| "/ModDate" -> [(adobe, "ModificationDate"); (xmp, "ModifyDate")] +| _ -> failwith "Cpdf.check_name not known" + +let get_xmp_info pdf name = + let tocheck = check name in + match get_metadata pdf with + None -> "" + | Some metadata -> + try + let _, tree = xmltree_of_bytes metadata in + let results = map (fun (kind, key) -> match get_data_for kind key tree with Some x -> x | None -> "") tocheck in + match lose (eq "") results with + x::_ -> x + | [] -> "" + with + _ -> "" + +(* Set XMP info *) +let rec set_xml_field kind fieldname value = function + D data -> D data +| E (((n, n'), m), _ (*[D _]*)) when n = kind && n' = fieldname -> (* Replace anything inside, including nothing i.e *) + E (((n, n'), m), [D value]) +| E (x, ts) -> E (x, map (set_xml_field kind fieldname value) ts) + +let set_pdf_info_xml kind fieldname value xmldata pdf = + let dtd, tree = xmltree_of_bytes xmldata in + let str = + match value with + Pdf.String s -> s + | Pdf.Boolean true -> "True" + | Pdf.Boolean false -> "False" + | _ -> failwith "set_pdf_info_xml: not a string" + in + let newtree = set_xml_field kind fieldname str tree in + bytes_of_xmltree (dtd, newtree) + +let set_pdf_info_xml_many changes value xmldata pdf = + let xmldata = ref xmldata in + iter + (fun (kind, fieldname) -> + xmldata := set_pdf_info_xml kind fieldname value !xmldata pdf) + changes; + !xmldata + + +(* \section{Set an entry in the /Info dictionary} *) + +(* We must parse 
the date to get its components, then use strftime to build the + * new string in XMP format *) + +type date = + {mutable year : int; + mutable month : int; (* 1 - 12 *) + mutable day : int; (* 1 - 31 *) + mutable hour : int; (* 0 - 23 *) + mutable minute : int; (* 0 - 59 *) + mutable second : int; (* 0 - 59 *) + mutable ut_relationship : int; (* -1, 0, +1 *) + mutable offset_hours : int; (* 0 - 59 *) + mutable offset_minutes : int (* 0 - 59 *)} + +let default_date () = + {year = 0; + month = 1; + day = 1; + hour = 0; + minute = 0; + second = 0; + ut_relationship = 0; + offset_hours = 0; + offset_minutes = 0} + +(* XMP date format is YYYY-MM-DDThh:mm:ssTZD *) +let make_xmp_date_from_components d = + let tzd = + if d.ut_relationship = 0 && d.offset_hours = 0 && d.offset_minutes = 0 then "Z" else + (if d.ut_relationship >=0 then "+" else "-") ^ + Printf.sprintf "%02i" d.offset_hours ^ + ":" ^ + Printf.sprintf "%02i" d.offset_minutes + in + Cpdfstrftime.strftime + ~time:{Cpdfstrftime._tm_sec = d.second; + Cpdfstrftime._tm_min = d.minute; + Cpdfstrftime._tm_hour = d.hour; + Cpdfstrftime._tm_mday = d.day; + Cpdfstrftime._tm_mon = d.month - 1; + Cpdfstrftime._tm_year = d.year - 1900; + Cpdfstrftime._tm_wday = 0; + Cpdfstrftime._tm_yday = 0; + Cpdfstrftime._tm_isdst = false} + "%Y-%m-%dT%H:%M:%S" + ^ + tzd + +let xmp_date date = + let d = default_date () in + try + match explode date with + 'D'::':'::r -> + begin match r with + y1::y2::y3::y4::r -> + d.year <- int_of_string (implode [y1; y2; y3; y4]); + begin match r with + m1::m2::r -> + d.month <- int_of_string (implode [m1; m2]); + begin match r with + d1::d2::r -> + d.day <- int_of_string (implode [d1; d2]); + begin match r with + h1::h2::r -> + d.hour <- int_of_string (implode [h1; h2]); + begin match r with + m1::m2::r -> + d.minute <- int_of_string (implode [m1; m2]); + begin match r with + s1::s2::r -> + d.second <- int_of_string (implode [s1; s2]); + begin match r with + o::r -> + d.ut_relationship <- + if o = '+' 
then 1 else + if o = '-' then -1 else + 0; + begin match r with + h1::h2::'\''::r -> + d.offset_hours <- int_of_string (implode [h1; h2]); + begin match r with + m1::m2::_ -> + d.offset_minutes <- int_of_string (implode [m1; m2]); + raise Exit + | _ -> raise Exit + end + | _ -> raise Exit + end + | _ -> raise Exit + end + | _ -> raise Exit + end + | _ -> raise Exit + end + | _ -> raise Exit + end + | _ -> raise Exit + end + | _ -> raise Exit + end + | _ -> + Printf.eprintf "xmp_date: Malformed date string (no year): %s\n%!" date; + make_xmp_date_from_components d + end + | _ -> + Printf.eprintf "xmp_date: Malformed date string (no prefix): %s\n%!" date; + make_xmp_date_from_components d + with + Exit -> make_xmp_date_from_components d + +let set_pdf_info ?(xmp_also=false) ?(xmp_just_set=false) (key, value, version) pdf = + let infodict = + match Pdf.lookup_direct pdf "/Info" pdf.Pdf.trailerdict with + | Some d -> d + | None -> Pdf.Dictionary [] + in + let infodict' = Pdf.add_dict_entry infodict key value in + let objnum = Pdf.addobj pdf infodict' in + if not xmp_just_set then + begin + pdf.Pdf.trailerdict <- + Pdf.add_dict_entry pdf.Pdf.trailerdict "/Info" (Pdf.Indirect objnum); + pdf.Pdf.minor <- + max pdf.Pdf.minor version + end; + if xmp_also then + begin match get_metadata pdf with + None -> pdf + | Some xmldata -> + let xmp_date = function Pdf.String s -> Pdf.String (xmp_date s) | _ -> failwith "xmp_date not a string" in + let changes, value = + match key with + | "/Producer" -> [(adobe, "Producer")], value + | "/Creator" -> [(adobe, "Creator"); (xmp, "CreatorTool"); (dc, "creator")], value + | "/Author" -> [(adobe, "Author")], value + | "/Title" -> [(adobe, "Title"); (dc, "title")], value + | "/Subject" -> [(adobe, "Subject"); (dc, "subject")], value + | "/Keywords" -> [(adobe, "Keywords")], value + | "/CreationDate" -> [(adobe, "CreationDate"); (xmp, "CreateDate")], xmp_date value + | "/ModDate" -> [(adobe, "ModDate"); (xmp, "ModifyDate")], xmp_date value + 
| "/Trapped" -> [(adobe, "Trapped")], value + | _ -> failwith "Unknown call to set_pdf_info" + in + set_metadata_from_bytes + true + (set_pdf_info_xml_many changes value xmldata pdf) + pdf + end + else + pdf + +let expand_date = function + | "now" -> + begin match Sys.getenv_opt "CPDF_REPRODUCIBLE_DATES" with + | Some "true" -> Cpdfstrftime.strftime ~time:Cpdfstrftime.dummy "D:%Y%m%d%H%M%S" + | _ -> Cpdfstrftime.strftime "D:%Y%m%d%H%M%S" + end + | x -> x + +(* Set metadata date *) +let set_metadata_date pdf date = + match get_metadata pdf with + None -> pdf + | Some xmldata -> + let changes= [(xmp, "MetadataDate")] in + let value = match date with "now" -> xmp_date (expand_date "now") | x -> x in + set_metadata_from_bytes + true + (set_pdf_info_xml_many changes (Pdf.String value) xmldata pdf) + pdf + + +(* \section{Copy an /ID from one file to another} *) +let copy_id keepversion copyfrom copyto = + match Pdf.lookup_direct copyfrom "/ID" copyfrom.Pdf.trailerdict with + | None -> copyto (* error "Source PDF file has no /ID entry to copy from" *) + | Some id -> + copyto.Pdf.trailerdict <- + Pdf.add_dict_entry copyto.Pdf.trailerdict "/ID" id; + copyto.Pdf.minor <- + if keepversion then copyto.Pdf.minor else max copyto.Pdf.minor 1; + copyto + +let replacements pdf = + let info = get_info_utf8 pdf in + [("CREATEDATE", xmp_date (let i = info "/CreationDate" in if i = "" then expand_date "now" else i)); + ("MODDATE", xmp_date (let i = info "/ModDate" in if i = "" then expand_date "now" else i)); + ("PRODUCER", info "/Producer"); + ("CREATOR", info "/Creator"); + ("TITLE", info "/Title"); + ("SUBJECT", info "/Subject"); + ("AUTHOR", info "/Author"); + ("KEYWORDS", info "/Keywords"); + ("TRAPPED", info "/Trapped"); + ("METADATADATE", xmp_date (expand_date "now"))] + +let create_metadata pdf = + let xmp = ref xmp_template in + iter + (fun (s, r) -> xmp := string_replace_all s r !xmp) + (replacements pdf); + set_metadata_from_bytes false (bytes_of_string !xmp) pdf + +(* 
\section{Set viewer preferences} *) +let set_viewer_preference (key, value, version) pdf = + match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with + | Some catalog -> + let viewer_preferences = + match Pdf.lookup_direct pdf "/ViewerPreferences" catalog with + | Some d -> d + | None -> Pdf.Dictionary [] + in + let viewer_preferences' = + Pdf.add_dict_entry viewer_preferences key value + in + let catalog' = + Pdf.add_dict_entry catalog "/ViewerPreferences" viewer_preferences' + in + let catalognum = Pdf.addobj pdf catalog' in + let trailerdict' = + Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect catalognum) + in + {pdf with + Pdf.minor = max pdf.Pdf.minor version; + Pdf.root = catalognum; + Pdf.trailerdict = trailerdict'} + | None -> error "bad root" + + + +(* \section{Set page layout} *) +let set_page_layout pdf s = + match s with + | "SinglePage" | "OneColumn" | "TwoColumnLeft" + | "TwoColumnRight" | "TwoPageLeft" | "TwoPageRight" -> + begin match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with + | Some catalog -> + let catalog' = + Pdf.add_dict_entry catalog "/PageLayout" (Pdf.Name ("/" ^ s)) + in + let catalognum = Pdf.addobj pdf catalog' in + let trailerdict' = + Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect catalognum) + in + {pdf with + Pdf.root = catalognum; + Pdf.trailerdict = trailerdict'} + | None -> error "bad root" + end + | _ -> error "Unknown page layout" + + +(* \section{Set page mode} *) +let set_page_mode pdf s = + match s with + | "UseNone" | "UseOutlines" | "UseThumbs" + | "FullScreen" | "UseOC" | "UseAttachments" -> + begin match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with + | Some catalog -> + let catalog' = + Pdf.add_dict_entry catalog "/PageMode" (Pdf.Name ("/" ^ s)) + in + let catalognum = Pdf.addobj pdf catalog' in + let trailerdict' = + Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect catalognum) + in + {pdf with + Pdf.root = catalognum; + Pdf.trailerdict = trailerdict'} + | 
None -> error "bad root" + end + | _ -> error "Unknown page mode" + +(* Set open action *) +let set_open_action pdf fit pagenumber = + if pagenumber > Pdfpage.endpage pdf || pagenumber < 0 then + raise (error "set_open_action: invalid page number") + else + let pageobjectnumber = select pagenumber (Pdf.page_reference_numbers pdf) in + let destination = + if fit then + Pdf.Array [Pdf.Indirect pageobjectnumber; Pdf.Name "/Fit"] + else + Pdf.Array [Pdf.Indirect pageobjectnumber; Pdf.Name "/XYZ"; Pdf.Null; Pdf.Null; Pdf.Null] + in + let open_action = + Pdf.Dictionary [("/D", destination); ("/S", Pdf.Name "/GoTo")] + in + match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with + | Some catalog -> + let catalog' = + Pdf.add_dict_entry catalog "/OpenAction" open_action + in + let catalognum = Pdf.addobj pdf catalog' in + let trailerdict' = + Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect catalognum) + in + {pdf with Pdf.root = catalognum; Pdf.trailerdict = trailerdict'} + | None -> error "bad root" + + +let set_version v pdf = + pdf.Pdf.minor <- v diff --git a/cpdfmetadata.mli b/cpdfmetadata.mli new file mode 100644 index 0000000..c1ed9ec --- /dev/null +++ b/cpdfmetadata.mli @@ -0,0 +1,74 @@ + +(** {2 Types and Exceptions} *) + +(** Possible output encodings for some functions. [Raw] does no processing at +all - the PDF string is output as-is. [UTF8] converts losslessly to UTF8. +[Stripped] extracts the unicode codepoints and returns only those which +correspond to 7 bit ASCII. *) +type encoding = Raw | UTF8 | Stripped + +val encode_output : encoding -> string -> string + +(** {2 Metadata and settings} *) + +(** [copy_id keepversion copyfrom copyto] copies the ID, if any, from +[copyfrom] to [copyto]. If [keepversion] is true, the PDF version of [copyto] +won't be affected. 
*) +val copy_id : bool -> Pdf.t -> Pdf.t -> Pdf.t + +(** [set_pdf_info (key, value, version)] sets the entry [key] in the /Info dictionary, raising +the PDF minor version to at least [version].*) +val set_pdf_info : ?xmp_also:bool -> ?xmp_just_set:bool -> (string * Pdf.pdfobject * int) -> Pdf.t -> Pdf.t + +val get_xmp_info : Pdf.t -> string -> string + +(** [set_viewer_preference (key, value, version)] sets the entry [key] in the +/ViewerPreferences dictionary, raising the PDF minor version to at least [version].*) +val set_viewer_preference : (string * Pdf.pdfobject * int) -> Pdf.t -> Pdf.t + +(** Set the page layout to the given name (sans slash) e.g. SinglePage *) +val set_page_layout : Pdf.t -> string -> Pdf.t + +(** Set the page mode to the given name (sans slash) e.g. UseOutlines *) +val set_page_mode : Pdf.t -> string -> Pdf.t + +(** Set the open action. If the boolean is true, /Fit will be used, otherwise /XYZ *) +val set_open_action : Pdf.t -> bool -> int -> Pdf.t + +(** Set the PDF version number *) +val set_version : int -> Pdf.t -> unit + +(** Given a PDF, returns a function which can look up a given dictionary entry +from the /Info dictionary, returning it as a UTF8 string *) +val get_info_utf8 : Pdf.t -> string -> string + +(** Output to standard output general information about a PDF. *) +val output_info : encoding -> Pdf.t -> unit + +(** Output to standard output information from any XMP metadata stream in a PDF. *) +val output_xmp_info : encoding -> Pdf.t -> unit + +(** Create XMP metadata from scratch *) +val create_metadata : Pdf.t -> Pdf.t + +(** {2 XML Metadata} *) + +(** [set_metadata keepversion filename pdf] sets the XML metadata of a PDF to the contents of [filename]. If [keepversion] is true, the PDF version will not be altered. *) +val set_metadata : bool -> string -> Pdf.t -> Pdf.t + +(** The same, but the content comes from [bytes]. 
*) +val set_metadata_from_bytes : bool -> Pdfio.bytes -> Pdf.t -> Pdf.t + +(** Remove the metadata from a file *) +val remove_metadata : Pdf.t -> Pdf.t + +(** Extract metadata to a [Pdfio.bytes] *) +val get_metadata : Pdf.t -> Pdfio.bytes option + +(** Print metadata to stdout *) +val print_metadata : Pdf.t -> unit + +(** Set the metadata date *) +val set_metadata_date : Pdf.t -> string -> Pdf.t + +val expand_date : string -> string