open Pdfutil
open Pdfio
open Cpdferror
type encoding =
| Raw
| UTF8
| Stripped
(* Just strip everything which isn't 7 bit ASCII *)
let crude_de_unicode s =
implode (map char_of_int (lose (fun x -> x > 127) (Pdftext.codepoints_of_pdfdocstring s)))
let encode_output enc s =
match enc with
| Raw -> s
| UTF8 -> Pdftext.utf8_of_pdfdocstring s
| Stripped -> crude_de_unicode s
let xmp_template =
{|
CREATEDATE
MODDATE
PRODUCER
CREATOR
TITLE
SUBJECT
AUTHOR
KEYWORDS
TRAPPED
CREATEDATE
CREATOR
MODDATE
METADATADATE
TITLE
|}
(* Set or replace metadata *)
let set_metadata_from_bytes keepversion data pdf =
let metadata_stream =
Pdf.Stream
{contents =
(Pdf.Dictionary
["/Length", Pdf.Integer (bytes_size data);
"/Type", Pdf.Name "/Metadata";
"/Subtype", Pdf.Name "/XML"],
Pdf.Got data)}
in
let objnum = Pdf.addobj pdf metadata_stream in
let document_catalog =
match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with
| Some s -> s
| None -> error "Malformed PDF: No root."
in
let document_catalog' =
Pdf.add_dict_entry document_catalog "/Metadata" (Pdf.Indirect objnum)
in
let rootnum = Pdf.addobj pdf document_catalog' in
let trailerdict =
Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect rootnum)
in
{pdf with
Pdf.trailerdict = trailerdict;
Pdf.root = rootnum;
Pdf.minor =
if keepversion then pdf.Pdf.minor else max 4 pdf.Pdf.minor}
let set_metadata keepversion filename pdf =
let ch = open_in_bin filename in
let data = mkbytes (in_channel_length ch) in
for x = 0 to bytes_size data - 1 do
bset data x (input_byte ch)
done;
set_metadata_from_bytes keepversion data pdf
(* Remove metadata *)
let remove_metadata pdf =
match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with
| None -> error "malformed file"
| Some root ->
let root' = Pdf.remove_dict_entry root "/Metadata" in
let rootnum = Pdf.addobj pdf root' in
{pdf with
Pdf.trailerdict =
Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect rootnum);
Pdf.root =
rootnum}
(* Print metadata *)
let get_metadata pdf =
match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with
| None -> error "malformed file"
| Some root ->
match Pdf.lookup_direct pdf "/Metadata" root with
| Some ((Pdf.Stream _) as s) ->
Pdfcodec.decode_pdfstream pdf s;
begin match s with
| Pdf.Stream {contents = (_, Pdf.Got data)} -> Some data
| _ -> assert false
end
| _ -> None
let print_metadata pdf =
match get_metadata pdf with
None -> ()
| Some data ->
for x = 0 to bytes_size data - 1 do
Printf.printf "%c" (char_of_int (bget data x))
done
let get_info raw pdf =
let infodict =
match Pdf.lookup_direct pdf "/Info" pdf.Pdf.trailerdict with
| Some infodict -> infodict
| _ -> Pdf.Dictionary []
in
let getstring name =
match Pdf.lookup_direct pdf name infodict with
| Some (Pdf.String s) ->
if raw then s else crude_de_unicode s
| Some (Pdf.Boolean false) -> "False"
| Some (Pdf.Boolean true) -> "True"
| _ -> if name = "/Trapped" then "False" else ""
in
getstring
let get_info_utf8 pdf =
let infodict =
match Pdf.lookup_direct pdf "/Info" pdf.Pdf.trailerdict with
| Some infodict -> infodict
| _ -> Pdf.Dictionary []
in
(function name ->
match Pdf.lookup_direct pdf name infodict with
| Some (Pdf.String s) -> Pdftext.utf8_of_pdfdocstring s
| Some (Pdf.Boolean false) -> "False"
| Some (Pdf.Boolean true) -> "True"
| _ -> if name = "/Trapped" then "False" else "")
let getstring encoding pdf =
match encoding with
| Raw -> get_info true pdf
| Stripped -> get_info false pdf
| UTF8 -> get_info_utf8 pdf
let get_catalog_item name pdf =
match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with
| Some catalog ->
begin try
implode (tl (explode (Pdfwrite.string_of_pdf (unopt (Pdf.lookup_direct pdf name catalog)))))
with
_ -> ""
end
| _ -> ""
let output_info encoding pdf =
let getstring = getstring encoding pdf in
Printf.printf "Version: %i.%i\n" pdf.Pdf.major pdf.Pdf.minor;
Printf.printf "Pages: %i\n" (Pdfpage.endpage pdf);
Printf.printf "Title: %s\n" (getstring "/Title");
Printf.printf "Author: %s\n" (getstring "/Author");
Printf.printf "Subject: %s\n" (getstring "/Subject");
Printf.printf "Keywords: %s\n" (getstring "/Keywords");
Printf.printf "Creator: %s\n" (getstring "/Creator");
Printf.printf "Producer: %s\n" (getstring "/Producer");
Printf.printf "Created: %s\n" (getstring "/CreationDate");
Printf.printf "Modified: %s\n" (getstring "/ModDate");
Printf.printf "Trapped: %s\n" (getstring "/Trapped");
Printf.printf "PageMode: %s\n" (get_catalog_item "/PageMode" pdf);
Printf.printf "PageLayout: %s\n" (get_catalog_item "/PageLayout" pdf)
type xmltree =
E of Cpdfxmlm.tag * xmltree list
| D of string
let xmltree_of_bytes b =
let i = Cpdfxmlm.make_input (`String (0, string_of_bytes b)) in
let el tag childs = E (tag, childs)
and data d = D d in
Cpdfxmlm.input_doc_tree ~el ~data i
let bytes_of_xmltree t =
let buf = Buffer.create 1024 in
let o = Cpdfxmlm.make_output (`Buffer buf) in
let frag = function
E (tag, childs) -> `El (tag, childs)
| D d -> `Data d
in
Cpdfxmlm.output_doc_tree frag o t;
bytes_of_string (Buffer.contents buf)
let rec string_of_xmltree = function
D d ->
Printf.sprintf "DATA {%s}" d
| E (tag, trees) ->
Printf.sprintf "ELT (%s, %s)"
(string_of_tag tag)
(string_of_xmltrees trees)
and string_of_tag ((n, n'), attributes) =
Printf.sprintf
"NAME |%s| |%s|, ATTRIBUTES {%s}" n n'
(string_of_attributes attributes)
and string_of_attribute ((n, n'), str) =
Printf.sprintf "ATTRNAME |%s| |%s|, STR {%s}" n n' str
and string_of_attributes attrs =
fold_left
(fun a b -> a ^ " " ^ b) "" (map string_of_attribute attrs)
and string_of_xmltrees trees =
fold_left
(fun a b -> a ^ " " ^ b) "" (map string_of_xmltree trees)
let adobe = "http://ns.adobe.com/pdf/1.3/"
let xmp = "http://ns.adobe.com/xap/1.0/"
let dc = "http://purl.org/dc/elements/1.1/"
let rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
let combine_with_spaces strs =
String.trim
(fold_left (fun x y -> x ^ (if x <> "" then ", " else "") ^ y) "" strs)
(* Collect all
elements inside a seq, bag, or alt. Combine with commas. If
none found, return empty string instead. *)
let collect_list_items = function
E (((n, n'), _), elts) when
n = rdf && (n' = "Alt" || n' = "Seq" || n' = "Bag")
->
combine_with_spaces
(option_map
(function
E (((n, n'), _), [D d]) when n = rdf && n' = "li" ->
Some d
| _ -> None)
elts)
| _ -> ""
let collect_list_items_all all =
match keep (function E _ -> true | _ -> false) all with
h::_ -> Some (collect_list_items h)
| [] -> None
let rec get_data_for namespace name = function
D _ -> None
| E (((n, n'), _), [D d]) when n = namespace && n' = name ->
Some d
| E (((n, n'), _), e) when n = namespace && n' = name ->
collect_list_items_all e
| E (_, l) ->
match option_map (get_data_for namespace name) l with
x :: _ -> Some x
| _ -> None
let output_xmp_info encoding pdf =
let print_out tree title namespace name =
match get_data_for namespace name tree with
None -> ()
| Some data ->
Printf.printf "%s: " title;
print_endline data
in
match get_metadata pdf with
None -> ()
| Some metadata ->
try
let dtd, tree = xmltree_of_bytes metadata in
print_out tree "XMP pdf:Keywords" adobe "Keywords";
print_out tree "XMP pdf:Producer" adobe "Producer";
print_out tree "XMP pdf:Trapped" adobe "Trapped";
print_out tree "XMP pdf:Title" adobe "Title";
print_out tree "XMP pdf:Creator" adobe "Creator";
print_out tree "XMP pdf:Subject" adobe "Subject";
print_out tree "XMP pdf:Author" adobe "Author";
print_out tree "XMP pdf:CreationDate" adobe "CreationDate";
print_out tree "XMP pdf:ModDate" adobe "ModDate";
print_out tree "XMP xmp:CreateDate" xmp "CreateDate";
print_out tree "XMP xmp:CreatorTool" xmp "CreatorTool";
print_out tree "XMP xmp:MetadataDate" xmp "MetadataDate";
print_out tree "XMP xmp:ModifyDate" xmp "ModifyDate";
print_out tree "XMP dc:title" dc "title";
print_out tree "XMP dc:creator" dc "creator";
print_out tree "XMP dc:subject" dc "subject";
print_out tree "XMP dc:description" dc "description"
with
_ -> ()
(* Get XMP info equivalent of an old metadata field *)
let check = function
"/Title" -> [(adobe, "Title"); (dc, "title")]
| "/Author" -> [(adobe, "Author"); (dc, "creator")]
| "/Subject" -> [(adobe, "Subject"); (dc, "subject")]
| "/Keywords" -> [(adobe, "Keywords")]
| "/Creator" -> [(adobe, "Creator"); (xmp, "CreatorTool")]
| "/Producer" -> [(adobe, "Producer")]
| "/CreationDate" -> [(adobe, "CreationDate"); (xmp, "CreateDate")]
| "/ModDate" -> [(adobe, "ModificationDate"); (xmp, "ModifyDate")]
| _ -> failwith "Cpdf.check_name not known"
let get_xmp_info pdf name =
let tocheck = check name in
match get_metadata pdf with
None -> ""
| Some metadata ->
try
let _, tree = xmltree_of_bytes metadata in
let results = map (fun (kind, key) -> match get_data_for kind key tree with Some x -> x | None -> "") tocheck in
match lose (eq "") results with
x::_ -> x
| [] -> ""
with
_ -> ""
(* Set XMP info *)
let rec set_xml_field kind fieldname value = function
D data -> D data
| E (((n, n'), m), _ (*[D _]*)) when n = kind && n' = fieldname -> (* Replace anything inside, including nothing i.e *)
E (((n, n'), m), [D value])
| E (x, ts) -> E (x, map (set_xml_field kind fieldname value) ts)
let set_pdf_info_xml kind fieldname value xmldata pdf =
let dtd, tree = xmltree_of_bytes xmldata in
let str =
match value with
Pdf.String s -> Pdftext.utf8_of_pdfdocstring s
| Pdf.Boolean true -> "True"
| Pdf.Boolean false -> "False"
| _ -> failwith "set_pdf_info_xml: not a string"
in
let newtree = set_xml_field kind fieldname str tree in
bytes_of_xmltree (dtd, newtree)
let set_pdf_info_xml_many changes value xmldata pdf =
let xmldata = ref xmldata in
iter
(fun (kind, fieldname) ->
xmldata := set_pdf_info_xml kind fieldname value !xmldata pdf)
changes;
!xmldata
(* Set an entry in the /Info dictionary *)
(* We must parse the date to get its components, then use strftime to build the
* new string in XMP format *)
type date =
{mutable year : int;
mutable month : int; (* 1 - 12 *)
mutable day : int; (* 1 - 31 *)
mutable hour : int; (* 0 - 23 *)
mutable minute : int; (* 0 - 59 *)
mutable second : int; (* 0 - 59 *)
mutable ut_relationship : int; (* -1, 0, +1 *)
mutable offset_hours : int; (* 0 - 59 *)
mutable offset_minutes : int (* 0 - 59 *)}
let default_date () =
{year = 0;
month = 1;
day = 1;
hour = 0;
minute = 0;
second = 0;
ut_relationship = 0;
offset_hours = 0;
offset_minutes = 0}
(* XMP date format is YYYY-MM-DDThh:mm:ssTZD *)
let make_xmp_date_from_components d =
let tzd =
if d.ut_relationship = 0 && d.offset_hours = 0 && d.offset_minutes = 0 then "Z" else
(if d.ut_relationship >=0 then "+" else "-") ^
Printf.sprintf "%02i" d.offset_hours ^
":" ^
Printf.sprintf "%02i" d.offset_minutes
in
Cpdfstrftime.strftime
~time:{Cpdfstrftime._tm_sec = d.second;
Cpdfstrftime._tm_min = d.minute;
Cpdfstrftime._tm_hour = d.hour;
Cpdfstrftime._tm_mday = d.day;
Cpdfstrftime._tm_mon = d.month - 1;
Cpdfstrftime._tm_year = d.year - 1900;
Cpdfstrftime._tm_wday = 0;
Cpdfstrftime._tm_yday = 0;
Cpdfstrftime._tm_isdst = false}
"%Y-%m-%dT%H:%M:%S"
^
tzd
let xmp_date date =
let d = default_date () in
try
match explode date with
'D'::':'::r ->
begin match r with
y1::y2::y3::y4::r ->
d.year <- int_of_string (implode [y1; y2; y3; y4]);
begin match r with
m1::m2::r ->
d.month <- int_of_string (implode [m1; m2]);
begin match r with
d1::d2::r ->
d.day <- int_of_string (implode [d1; d2]);
begin match r with
h1::h2::r ->
d.hour <- int_of_string (implode [h1; h2]);
begin match r with
m1::m2::r ->
d.minute <- int_of_string (implode [m1; m2]);
begin match r with
s1::s2::r ->
d.second <- int_of_string (implode [s1; s2]);
begin match r with
o::r ->
d.ut_relationship <-
if o = '+' then 1 else
if o = '-' then -1 else
0;
begin match r with
h1::h2::'\''::r ->
d.offset_hours <- int_of_string (implode [h1; h2]);
begin match r with
m1::m2::_ ->
d.offset_minutes <- int_of_string (implode [m1; m2]);
raise Exit
| _ -> raise Exit
end
| _ -> raise Exit
end
| _ -> raise Exit
end
| _ -> raise Exit
end
| _ -> raise Exit
end
| _ -> raise Exit
end
| _ -> raise Exit
end
| _ -> raise Exit
end
| _ ->
Printf.eprintf "xmp_date: Malformed date string (no year): %s\n%!" date;
make_xmp_date_from_components d
end
| _ ->
Printf.eprintf "xmp_date: Malformed date string (no prefix): %s\n%!" date;
make_xmp_date_from_components d
with
Exit -> make_xmp_date_from_components d
let set_pdf_info ?(xmp_also=false) ?(xmp_just_set=false) (key, value, version) pdf =
let infodict =
match Pdf.lookup_direct pdf "/Info" pdf.Pdf.trailerdict with
| Some d -> d
| None -> Pdf.Dictionary []
in
let infodict' = Pdf.add_dict_entry infodict key value in
let objnum = Pdf.addobj pdf infodict' in
if not xmp_just_set then
begin
pdf.Pdf.trailerdict <-
Pdf.add_dict_entry pdf.Pdf.trailerdict "/Info" (Pdf.Indirect objnum);
pdf.Pdf.minor <-
max pdf.Pdf.minor version
end;
if xmp_also then
begin match get_metadata pdf with
None -> pdf
| Some xmldata ->
let xmp_date = function Pdf.String s -> Pdf.String (xmp_date s) | _ -> failwith "xmp_date not a string" in
let changes, value =
match key with
| "/Producer" -> [(adobe, "Producer")], value
| "/Creator" -> [(adobe, "Creator"); (xmp, "CreatorTool"); (dc, "creator")], value
| "/Author" -> [(adobe, "Author")], value
| "/Title" -> [(adobe, "Title"); (dc, "title")], value
| "/Subject" -> [(adobe, "Subject"); (dc, "subject")], value
| "/Keywords" -> [(adobe, "Keywords")], value
| "/CreationDate" -> [(adobe, "CreationDate"); (xmp, "CreateDate")], xmp_date value
| "/ModDate" -> [(adobe, "ModDate"); (xmp, "ModifyDate")], xmp_date value
| "/Trapped" -> [(adobe, "Trapped")], value
| _ -> failwith "Unknown call to set_pdf_info"
in
set_metadata_from_bytes
true
(set_pdf_info_xml_many changes value xmldata pdf)
pdf
end
else
pdf
let expand_date = function
| "now" ->
begin match Sys.getenv_opt "CPDF_REPRODUCIBLE_DATES" with
| Some "true" -> Cpdfstrftime.strftime ~time:Cpdfstrftime.dummy "D:%Y%m%d%H%M%S"
| _ -> Cpdfstrftime.strftime "D:%Y%m%d%H%M%S"
end
| x -> x
(* Set metadata date *)
let set_metadata_date pdf date =
match get_metadata pdf with
None -> pdf
| Some xmldata ->
let changes= [(xmp, "MetadataDate")] in
let value = match date with "now" -> xmp_date (expand_date "now") | x -> x in
set_metadata_from_bytes
true
(set_pdf_info_xml_many changes (Pdf.String value) xmldata pdf)
pdf
(* Copy an /ID from one file to another *)
let copy_id keepversion copyfrom copyto =
match Pdf.lookup_direct copyfrom "/ID" copyfrom.Pdf.trailerdict with
| None -> copyto (* error "Source PDF file has no /ID entry to copy from" *)
| Some id ->
copyto.Pdf.trailerdict <-
Pdf.add_dict_entry copyto.Pdf.trailerdict "/ID" id;
copyto.Pdf.minor <-
if keepversion then copyto.Pdf.minor else max copyto.Pdf.minor 1;
copyto
let replacements pdf =
let info = get_info_utf8 pdf in
[("CREATEDATE", xmp_date (let i = info "/CreationDate" in if i = "" then expand_date "now" else i));
("MODDATE", xmp_date (let i = info "/ModDate" in if i = "" then expand_date "now" else i));
("PRODUCER", info "/Producer");
("CREATOR", info "/Creator");
("TITLE", info "/Title");
("SUBJECT", info "/Subject");
("AUTHOR", info "/Author");
("KEYWORDS", info "/Keywords");
("TRAPPED", info "/Trapped");
("METADATADATE", xmp_date (expand_date "now"))]
let create_metadata pdf =
let xmp = ref xmp_template in
iter
(fun (s, r) -> xmp := string_replace_all s r !xmp)
(replacements pdf);
set_metadata_from_bytes false (bytes_of_string !xmp) pdf
(* Set viewer preferences *)
let set_viewer_preference (key, value, version) pdf =
match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with
| Some catalog ->
let viewer_preferences =
match Pdf.lookup_direct pdf "/ViewerPreferences" catalog with
| Some d -> d
| None -> Pdf.Dictionary []
in
let viewer_preferences' =
Pdf.add_dict_entry viewer_preferences key value
in
let catalog' =
Pdf.add_dict_entry catalog "/ViewerPreferences" viewer_preferences'
in
let catalognum = Pdf.addobj pdf catalog' in
let trailerdict' =
Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect catalognum)
in
{pdf with
Pdf.minor = max pdf.Pdf.minor version;
Pdf.root = catalognum;
Pdf.trailerdict = trailerdict'}
| None -> error "bad root"
(* Set page layout *)
let set_page_layout pdf s =
match s with
| "SinglePage" | "OneColumn" | "TwoColumnLeft"
| "TwoColumnRight" | "TwoPageLeft" | "TwoPageRight" ->
begin match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with
| Some catalog ->
let catalog' =
Pdf.add_dict_entry catalog "/PageLayout" (Pdf.Name ("/" ^ s))
in
let catalognum = Pdf.addobj pdf catalog' in
let trailerdict' =
Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect catalognum)
in
{pdf with
Pdf.root = catalognum;
Pdf.trailerdict = trailerdict'}
| None -> error "bad root"
end
| _ -> error "Unknown page layout"
(* Set page mode *)
let set_page_mode pdf s =
match s with
| "UseNone" | "UseOutlines" | "UseThumbs"
| "FullScreen" | "UseOC" | "UseAttachments" ->
begin match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with
| Some catalog ->
let catalog' =
Pdf.add_dict_entry catalog "/PageMode" (Pdf.Name ("/" ^ s))
in
let catalognum = Pdf.addobj pdf catalog' in
let trailerdict' =
Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect catalognum)
in
{pdf with
Pdf.root = catalognum;
Pdf.trailerdict = trailerdict'}
| None -> error "bad root"
end
| _ -> error "Unknown page mode"
(* Set open action *)
let set_open_action pdf fit pagenumber =
if pagenumber > Pdfpage.endpage pdf || pagenumber < 0 then
raise (error "set_open_action: invalid page number")
else
let pageobjectnumber = select pagenumber (Pdf.page_reference_numbers pdf) in
let destination =
if fit then
Pdf.Array [Pdf.Indirect pageobjectnumber; Pdf.Name "/Fit"]
else
Pdf.Array [Pdf.Indirect pageobjectnumber; Pdf.Name "/XYZ"; Pdf.Null; Pdf.Null; Pdf.Null]
in
let open_action =
Pdf.Dictionary [("/D", destination); ("/S", Pdf.Name "/GoTo")]
in
match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with
| Some catalog ->
let catalog' =
Pdf.add_dict_entry catalog "/OpenAction" open_action
in
let catalognum = Pdf.addobj pdf catalog' in
let trailerdict' =
Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect catalognum)
in
{pdf with Pdf.root = catalognum; Pdf.trailerdict = trailerdict'}
| None -> error "bad root"
let set_version v pdf =
pdf.Pdf.minor <- v