
4791 lines
184 KiB
Raw Normal View History

2013-08-20 15:32:57 +01:00
(* CPDF Core routines *)
open Pdfutil
open Pdfio
2021-10-02 12:22:59 +01:00
open Cpdferror
2013-08-20 15:32:57 +01:00
2021-11-15 11:17:15 -08:00
(* A colour value: a single grey level, an RGB triple, or a four-component
   colour. All components are floats. *)
type color =
  | Grey of float
  | RGB of float * float * float
  | CYMK of float * float * float * float
2014-11-24 14:31:38 +00:00
(* Global debug flag. When set, call_cpdflin echoes the command line it is
   about to run to stderr. *)
let debug = ref false
2019-07-01 14:40:22 +01:00
(* XMP metadata packet template, held as a quoted string literal.
   NOTE(review): the literal looks truncated in this view (unclosed tags,
   repeated rdf:Description lines) -- confirm against the upstream file
   before relying on it. *)
let xmp_template =
{|<?xpacket begin='' id='W5M0MpCehiHzreSzNTczkc9d'?>
<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#'
<rdf:Description about=''
<rdf:Description about=''
<rdf:Description about=''
<?xpacket end='r'?>|}
2016-04-03 12:48:44 +01:00
(* Debugging aid: drop unreferenced objects from [pdf], write it to
   "temp.pdf" in the current directory without object streams, and print the
   size of the resulting file on stdout. *)
let report_pdf_size pdf =
  Pdf.remove_unreferenced pdf;
  let filename = "temp.pdf" in
  Pdfwrite.pdf_to_file_options
    ~preserve_objstm:false ~generate_objstm:false ~compress_objstm:false
    false None false pdf filename;
  let ic = open_in_bin filename in
  let size = in_channel_length ic in
  Printf.printf "Size %i bytes\n" size;
  flush stdout;
  close_in ic
2014-11-24 12:02:36 +00:00
(* Locate the cpdflin binary. Preference order:
   a) the path given by the caller ([Some path]);
   b) a "cpdflin" or "cpdflin.exe" in the current directory (prefixed with
      "./" except on Win32);
   c) otherwise the bare program name, assumed to be installed somewhere the
      system can find it. *)
let find_cpdflin provided =
  match provided with
  | Some path -> path
  | None ->
      let windows = Sys.os_type = "Win32" in
      let local name = (if windows then "" else "./") ^ name in
      if Sys.file_exists "cpdflin" then local "cpdflin"
      else if Sys.file_exists "cpdflin.exe" then local "cpdflin.exe"
      else if windows then "cpdflin.exe"
      else "cpdflin"
2014-10-02 13:48:45 +01:00
(* Call cpdflin, given the location of the cpdflin binary, the (temp) input
   name, the output name and the password to pass on. Returns the exit code
   of the command.

   The password and the library directory are passed through Filename.quote
   so that spaces or shell metacharacters in them cannot split or alter the
   command. The binary path itself is interpolated as given. *)
let call_cpdflin cpdflin temp output best_password =
  let command =
    cpdflin ^ " --linearize " ^ " --password=" ^ Filename.quote best_password
    ^ " " ^ Filename.quote temp ^ " " ^ Filename.quote output
  in
  let run command =
    if !debug then prerr_endline command;
    Sys.command command
  in
  match Sys.os_type with
  | "Win32" ->
      (* On windows, don't use LD_LIBRARY_PATH - it will happen automatically *)
      run command
  | _ ->
      (* On other platforms, if -cpdflin was provided, or cpdflin was in the
         current folder, set up LD_LIBRARY_PATH: *)
      begin match cpdflin with
      | "cpdflin" -> run command
      | _ ->
          let libdir = Filename.quote (Filename.dirname cpdflin) in
          run
            ("DYLD_FALLBACK_LIBRARY_PATH=" ^ libdir ^ " " ^
             "LD_LIBRARY_PATH=" ^ libdir ^ " " ^
             command)
      end
2014-10-02 13:48:45 +01:00
(* Recompress anything which isn't compressed, unless it's metadata.
   The argument must be a stream object; anything else trips [assert false]. *)
let recompress_stream pdf = function
  | Pdf.Stream {contents = (dict, _)} as stream ->
      begin match
        Pdf.lookup_direct pdf "/Filter" dict,
        Pdf.lookup_direct pdf "/Type" dict
      with
      (* Metadata streams are left alone. *)
      | _, Some (Pdf.Name "/Metadata") -> ()
      (* If there is no compression, compress with /FlateDecode *)
      | (None | Some (Pdf.Array [])), _ ->
          Pdfcodec.encode_pdfstream pdf Pdfcodec.Flate stream
      (* Already has some filter: leave as is. *)
      | _ -> ()
      end
  | _ -> assert false
(* Apply recompress_stream to every stream of [pdf], unless the document is
   encrypted, in which case nothing is touched. Returns [pdf] (callers in this
   file ignore the result). *)
let recompress_pdf pdf =
  if not (Pdfcrypt.is_encrypted pdf) then
    Pdf.iter_stream (recompress_stream pdf) pdf;
  pdf
(* Decode every stream of [pdf] with Pdfcodec.decode_pdfstream_until_unknown,
   unless the document is encrypted, in which case nothing is touched.
   Returns [pdf]. *)
let decompress_pdf pdf =
  if not (Pdfcrypt.is_encrypted pdf) then
    Pdf.iter_stream (Pdfcodec.decode_pdfstream_until_unknown pdf) pdf;
  pdf
2021-10-26 16:18:09 +01:00
(* Ordering on PDF objects given by object number: look both objects up,
   call Pdf.getstream on any that are streams, then compare them
   structurally. Returns the usual negative / zero / positive result. *)
let pdfobjeq pdf x y =
  let load_stream obj =
    match obj with
    | Pdf.Stream _ -> Pdf.getstream obj
    | _ -> ()
  in
  let x = Pdf.lookup_obj pdf x
  and y = Pdf.lookup_obj pdf y in
  load_stream x;
  load_stream y;
  compare x y
(* One pass of object de-duplication: find pools of objects which compare
   equal under pdfobjeq, redirect every member of a pool to the pool's first
   member, renumber, and drop what is no longer referenced. [pdf] is updated
   in place (root, objects and trailer dictionary). *)
let really_squeeze pdf =
  let objs = ref [] in
  Pdf.objiter (fun objnum _ -> objs := objnum :: !objs) pdf;
  (* Pools of two or more equal objects. *)
  let toprocess =
    keep
      (fun x -> length x > 1)
      (collate (pdfobjeq pdf) (sort (pdfobjeq pdf) !objs))
  in
  (* Remove any pools of objects which are page objects, since Adobe Reader
   * gets confused when there are duplicate page objects. *)
  let toprocess =
    option_map
      (function
         [] -> assert false
       | h::_ as l ->
           match Pdf.lookup_direct pdf "/Type" (Pdf.lookup_obj pdf h) with
             Some (Pdf.Name "/Page") -> None
           | _ -> Some l)
      toprocess
  in
  let pdfr = ref pdf in
  let changetable = Hashtbl.create 100 in
  (* Map every non-first member of a pool to the first member. *)
  iter
    (function [] -> assert false | h::t ->
       iter (fun e -> Hashtbl.add changetable e h) t)
    toprocess;
  (* For an unknown reason, the output file is much smaller if
     Pdf.renumber is run twice. This is bizarre, since Pdf.renumber is
     an old, well-understood function in use for years -- what is
     going on? Furthermore, if we run it 3 times, it gets bigger again! *)
  pdfr := Pdf.renumber changetable !pdfr;
  pdfr := Pdf.renumber changetable !pdfr;
  Pdf.remove_unreferenced !pdfr;
  pdf.Pdf.root <- !pdfr.Pdf.root;
  pdf.Pdf.objects <- !pdfr.Pdf.objects;
  pdf.Pdf.trailerdict <- !pdfr.Pdf.trailerdict
2014-10-11 14:17:24 +01:00
(* Object numbers of the xobjects already handled by squeeze_form_xobject. *)
let xobjects_done = ref []

(* Squeeze the form xobject at objnum: parse its operators and write the
   re-serialised stream data back into the original stream, updating /Length.
   Objects whose /Subtype is not /Form are ignored. Raises
   Failure "squeeze_form_xobject" if the object or the regenerated stream
   does not have the expected shape.

   FIXME: For old PDFs (< v1.2) any resources from the page (or its ancestors in
   the page tree!) are also needed - we must merge them with the ones from the
   xobject itself. However, it is safe for now -- in the unlikely event that the
   resources actually need to be available, the parse will fail, the squeeze of
   this object will fail, and we bail out. *)
let squeeze_form_xobject pdf objnum =
  (* The whole body is guarded. Written as `if .. then () else a; rest`,
     the `rest` would run unconditionally, because `;` binds more loosely
     than `if`, and already-processed objects would be squeezed again. *)
  if not (mem objnum !xobjects_done) then begin
    xobjects_done := objnum :: !xobjects_done;
    let obj = Pdf.lookup_obj pdf objnum in
    match Pdf.lookup_direct pdf "/Subtype" obj with
    | Some (Pdf.Name "/Form") ->
        let resources =
          match Pdf.lookup_direct pdf "/Resources" obj with
          | Some d -> d
          | None -> Pdf.Dictionary []
        in
        begin match
          Pdfops.stream_of_ops
            (Pdfops.parse_operators pdf resources [Pdf.Indirect objnum])
        with
        | Pdf.Stream {contents = (_, Pdf.Got data)} ->
            (* Put replacement data in original stream, and overwrite /Length *)
            begin match obj with
            | Pdf.Stream ({contents = (d, _)} as str) ->
                str :=
                  (Pdf.add_dict_entry d "/Length" (Pdf.Integer (bytes_size data)),
                   Pdf.Got data)
            | _ -> failwith "squeeze_form_xobject"
            end
        | _ -> failwith "squeeze_form_xobject"
        end
    | _ -> ()
  end
2014-10-11 14:17:24 +01:00
(* For a list of indirects representing content streams, make sure that none of
   them are duplicated in the PDF. This indicates sharing, which parsing and
   rewriting the streams might destroy, thus making the file bigger. FIXME: The
   correct thing to do is to preserve the multiple content streams.

   Returns true when every number in [stream_numbers] occurs fewer than two
   times in [content_stream_numbers]. *)
let no_duplicates content_stream_numbers stream_numbers =
  let occurrences n =
    List.length (List.filter (fun m -> m = n) content_stream_numbers)
  in
  List.for_all (fun n -> occurrences n < 2) stream_numbers
(* Give a list of content stream numbers, given a page reference number.
   /Contents may be a single indirect reference or an array; non-indirect
   array members are dropped. Anything else yields the empty list. *)
let content_streams_of_page pdf refnum =
  match Pdf.direct pdf (Pdf.lookup_obj pdf refnum) with
  | Pdf.Dictionary dict ->
      begin match lookup "/Contents" dict with
      | Some (Pdf.Indirect i) -> [i]
      | Some (Pdf.Array x) ->
          option_map (function Pdf.Indirect i -> Some i | _ -> None) x
      | _ -> []
      end
  | _ -> []
(* For each object in the PDF marked with /Type /Page, for each /Contents
   indirect reference or array of such, decode and recode that content stream.
   A page is only rewritten when none of its content streams is shared
   (see no_duplicates). Form xobjects named in the page's own /Resources are
   squeezed too. *)
let squeeze_all_content_streams pdf =
  let page_reference_numbers = Pdf.page_reference_numbers pdf in
  let all_content_streams_in_doc =
    flatten (map (content_streams_of_page pdf) page_reference_numbers)
  in
  xobjects_done := [];
  Pdf.objiter
    (fun objnum _ ->
       match Pdf.lookup_obj pdf objnum with
       | Pdf.Dictionary dict as d
           when Pdf.lookup_direct pdf "/Type" d = Some (Pdf.Name "/Page") ->
           let resources =
             match Pdf.lookup_direct pdf "/Resources" d with
             | Some d -> d
             | None -> Pdf.Dictionary []
           in
           begin try
             let content_streams =
               match lookup "/Contents" dict with
               | Some (Pdf.Indirect i) ->
                   (* The reference may itself point at an array of streams. *)
                   begin match Pdf.direct pdf (Pdf.Indirect i) with
                   | Pdf.Array x -> x
                   | _ -> [Pdf.Indirect i]
                   end
               | Some (Pdf.Array x) -> x
               | _ -> raise Not_found
             in
             if
               no_duplicates
                 all_content_streams_in_doc
                 (map (function Pdf.Indirect i -> i | _ -> assert false) content_streams)
             then begin
               let newstream =
                 Pdfops.stream_of_ops
                   (Pdfops.parse_operators pdf resources content_streams)
               in
               let newdict =
                 Pdf.add_dict_entry
                   d "/Contents" (Pdf.Indirect (Pdf.addobj pdf newstream))
               in
               Pdf.addobj_given_num pdf (objnum, newdict);
               (* Now process all xobjects related to this page *)
               begin match Pdf.lookup_direct pdf "/XObject" resources with
               | Some (Pdf.Dictionary xobjs) ->
                   iter
                     (function
                        (_, Pdf.Indirect i) -> squeeze_form_xobject pdf i
                      | _ -> failwith "squeeze_xobject")
                     xobjs
               | _ -> ()
               end
             end
           with
             (* No /Contents, which is ok. Or a parsing failure due to
                uninherited resources. FIXME: Add support for inherited
                resources. *)
             Not_found -> ()
           end
       | _ -> ())
    pdf
2019-07-10 13:16:32 +01:00
(* Squeeze [pdf] in place. really_squeeze is run until the number of objects
   stops going down; then, optionally, page content streams and xobjects are
   re-serialised ([pagedata]) and all uncompressed streams recompressed at
   flate level 9 ([recompress]).

   Progress messages go to stdout by default, nowhere if [logto] is
   "nolog", and otherwise are appended to the file named by [logto].

   Any exception is re-raised as Pdf.PDFError carrying the original
   exception's text. *)
let squeeze ?logto ?(pagedata=true) ?(recompress=true) pdf =
  let log x =
    match logto with
    | None -> print_string x; flush stdout
    | Some "nolog" -> ()
    | Some s ->
        let fh = open_out_gen [Open_wronly; Open_creat] 0o666 s in
        (* Position at the end so that messages are appended. *)
        seek_out fh (out_channel_length fh);
        output_string fh x;
        close_out fh
  in
  try
    let n = ref (Pdf.objcard pdf) in
    log (Printf.sprintf "Beginning squeeze: %i objects\n" (Pdf.objcard pdf));
    (* Each test of the loop condition performs one squeeze pass. *)
    while !n > (ignore (really_squeeze pdf); Pdf.objcard pdf) do
      n := Pdf.objcard pdf;
      log (Printf.sprintf "Squeezing... Down to %i objects\n" (Pdf.objcard pdf))
    done;
    if pagedata then
      begin
        log (Printf.sprintf "Squeezing page data and xobjects\n");
        squeeze_all_content_streams pdf
      end;
    if recompress then
      begin
        log (Printf.sprintf "Recompressing document\n");
        Pdfcodec.flate_level := 9;
        ignore (recompress_pdf pdf)
      end
  with
    e ->
      raise
        (Pdf.PDFError
           (Printf.sprintf
              "Squeeze failed. No output written.\n Proximate error was:\n %s"
              (Printexc.to_string e)))
2013-08-20 15:32:57 +01:00
(* How text is encoded on output; see encode_output for the effect of each. *)
type encoding = Raw | UTF8 | Stripped
(* Reduce a PDFDocString to plain ASCII by discarding every codepoint
   above 127. *)
let crude_de_unicode s =
  let codepoints = Pdftext.codepoints_of_pdfdocstring s in
  let ascii_only = lose (fun c -> c > 127) codepoints in
  implode (map char_of_int ascii_only)
(* Convert the PDFDocString [s] for output according to [enc]: unchanged for
   Raw, converted to UTF8 for UTF8, and stripped to 7 bit ASCII for Stripped. *)
let encode_output enc s =
  let convert =
    match enc with
    | Raw -> (fun x -> x)
    | UTF8 -> Pdftext.utf8_of_pdfdocstring
    | Stripped -> crude_de_unicode
  in
  convert s
(* Get the number of pages in file. Doesn't need decryption. The file is
   read lazily from the input [i], using the given passwords and optional
   revision. *)
let endpage_io ?revision i user_pw owner_pw =
  let pdf = Pdfread.pdf_of_input_lazy ?revision user_pw owner_pw i in
  Pdfpage.endpage pdf
2013-08-20 15:32:57 +01:00
(* Debugging aid: print the trailer dictionary, root object number, catalog
   and page tree root of [pdf], followed by every object in the file, all on
   stdout. *)
let print_pdf_objs pdf =
  Printf.printf "Trailerdict: %s\n" (Pdfwrite.string_of_pdf pdf.Pdf.trailerdict);
  Printf.printf "Root: %i\n" pdf.Pdf.root;
  begin match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with
  | Some catalog ->
      Printf.printf "Catalog: %s\n" (Pdfwrite.string_of_pdf catalog);
      begin match Pdf.lookup_direct pdf "/Pages" catalog with
      | Some pages ->
          Printf.printf "Pages: %s\n" (Pdfwrite.string_of_pdf pages)
      | None ->
          (* The catalog exists here; it is /Pages which is missing. *)
          flprint "no pages\n"
      end
  | None ->
      flprint "No catalog!\n"
  end;
  Pdf.objiter
    (fun n obj ->
       Printf.printf "%i 0 obj:\n\n" n;
       Printf.printf "%s\n" (Pdfwrite.string_of_pdf obj))
    pdf
(* Return page label at pdf page num, or page number in arabic if no label.
   The labels are read from [pdf] and completed with Pdfpagelabels.complete
   before the lookup. *)
let pagelabel pdf num =
  Pdfpagelabels.pagelabeltext_of_pagenumber
    num
    (Pdfpagelabels.complete (Pdfpagelabels.read pdf))
2021-06-10 16:09:59 +01:00
(* Apply each (search, replacement) pair of [m] to [text] in list order using
   string_replace_all_lazy, then pass the result through
   Cpdfstrftime.strftime with the given [time]. *)
let rec process_text time text m =
  match m with
  | [] -> Cpdfstrftime.strftime ~time text
  | (s, r)::t -> process_text time (string_replace_all_lazy s r text) t
2013-08-20 15:32:57 +01:00
(* Expand the special date "now" to a PDF date string of the form
   D:YYYYMMDDHHMMSS; any other string is returned unchanged. If the
   environment variable CPDF_REPRODUCIBLE_DATES is "true",
   Cpdfstrftime.dummy is used as the time instead of the default. *)
let expand_date = function
  | "now" ->
      begin match Sys.getenv_opt "CPDF_REPRODUCIBLE_DATES" with
      | Some "true" -> Cpdfstrftime.strftime ~time:Cpdfstrftime.dummy "D:%Y%m%d%H%M%S"
      | _ -> Cpdfstrftime.strftime "D:%Y%m%d%H%M%S"
      end
  | x -> x
(* Adapter for process_pages: wraps a page function [f] which knows nothing
   about matrices so that it returns the triple process_pages expects, using
   the identity matrix. *)
let ppstub f n p =
  let page' = f n p in
  (page', n, Pdftransform.i_matrix)
2013-08-20 15:32:57 +01:00
(* Apply [f] to each page of [pdf] whose (1-based) number is in [range].
   [f n page] returns the new page, a page number and a transformation
   matrix; pages outside the range are kept with the identity matrix. The
   result is the document with its pages replaced via Pdfpage.change_pages. *)
let process_pages f pdf range =
  let pages = Pdfpage.pages_of_pagetree pdf in
  let pages', pagenumbers, matrices = (* new page objects, page number, matrix *)
    split3
      (map2
         (fun n p -> if mem n range then f n p else (p, n, Pdftransform.i_matrix))
         (ilist 1 (length pages))
         pages)
  in
  Pdfpage.change_pages ~matrices:(combine pagenumbers matrices) true pdf pages'
2013-08-20 15:32:57 +01:00
(* Call [f n page] for its side effects on each page of [pdf] whose
   (1-based) number [n] is in [range]. *)
let iter_pages f pdf range =
  let pages = Pdfpage.pages_of_pagetree pdf in
  iter2
    (fun n p -> if mem n range then f n p)
    (ilist 1 (length pages))
    pages
(* Collect [f n page] for each page of [pdf] whose (1-based) number [n] is in
   [range]; pages outside the range contribute nothing to the result. *)
let map_pages f pdf range =
  let pages = Pdfpage.pages_of_pagetree pdf in
  option_map2
    (fun n p -> if mem n range then Some (f n p) else None)
    (ilist 1 (length pages))
    pages
2016-04-03 19:46:54 +01:00
(* Add stack operators to a content stream to ensure it is composeable. On
   -fast, we don't check for Q deficit, assuming PDF is ISO.

   The content is wrapped in a new q stream before and a new Q stream after.
   When not [fast], the operators are parsed and, if there are more q than Q
   operators, the closing stream gets that many extra Q operators (a note is
   printed on stderr). Returns the new list of content stream references. *)
let protect fast pdf resources content =
  let deficit =
    if fast then 0 else
      let ops = Pdfops.parse_operators pdf resources content in
      let qs = length (keep (eq Pdfops.Op_q) ops) in
      let bigqs = length (keep (eq Pdfops.Op_Q) ops) in
      let deficit = if qs > bigqs then qs - bigqs else 0 in
      if deficit <> 0 then Printf.eprintf "Q Deficit was nonzero. Fixing. %i\n%!" deficit;
      deficit
  in
  let addstream ops = Pdf.addobj pdf (Pdfops.stream_of_ops ops) in
  let q = addstream [Pdfops.Op_q] in
  let qs = addstream (many Pdfops.Op_Q deficit @ [Pdfops.Op_Q]) in
  [Pdf.Indirect q] @ content @ [Pdf.Indirect qs]
2013-08-20 15:32:57 +01:00
2021-10-26 16:18:09 +01:00
(* If a cropbox exists, make it the mediabox. If not, change nothing.
   Applies to the pages of [pdf] in [range]. *)
let copy_cropbox_to_mediabox pdf range =
  process_pages
    (ppstub (fun _ page ->
       match Pdf.lookup_direct pdf "/CropBox" page.Pdfpage.rest with
       | Some pdfobject -> {page with Pdfpage.mediabox = Pdf.direct pdf pdfobject}
       | None -> page))
    pdf
    range
2013-08-20 15:32:57 +01:00
(* Union two resource dictionaries from the same PDF.

   For each of the known resource categories (/Font, /ExtGState, ...) the
   entries of [a] and [b] are concatenated into one sub-dictionary; a
   category empty on both sides is omitted. Entries under any other key are
   carried over from [a] and then [b] through Pdf.add_dict_entry. Arguments
   which are not dictionaries contribute no top-level entries. *)
let combine_pdf_resources pdf a b =
  let a_entries =
    match a with
    | Pdf.Dictionary entries -> entries
    | _ -> []
  in let b_entries =
    match b with
    | Pdf.Dictionary entries -> entries
    | _ -> []
  in
  let resource_keys =
    ["/Font"; "/ExtGState"; "/ColorSpace"; "/Pattern";
     "/Shading"; "/XObject"; "/Properties"]
  in
  (* Merge the sub-dictionaries stored under one known key. *)
  let combine_entries key =
    let a_entries =
      match Pdf.lookup_direct pdf key a with
      | Some (Pdf.Dictionary d) -> d
      | _ -> []
    in let b_entries =
      match Pdf.lookup_direct pdf key b with
      | Some (Pdf.Dictionary d) -> d
      | _ -> []
    in
    if a_entries = [] && b_entries = [] then
      None
    else
      Some (key, Pdf.Dictionary (a_entries @ b_entries))
  in
  let unknown_keys_a = lose (fun (k, _) -> mem k resource_keys) a_entries in
  let unknown_keys_b = lose (fun (k, _) -> mem k resource_keys) b_entries in
  let combined_known_entries = option_map combine_entries resource_keys in
  fold_left
    (fun dict (k, v) -> Pdf.add_dict_entry dict k v)
    (Pdf.Dictionary [])
    (unknown_keys_a @ unknown_keys_b @ combined_known_entries)
2013-08-20 15:32:57 +01:00
(* \section{Copy an /ID from one file to another} *)
(* Copy the trailer /ID of [copyfrom] into [copyto], in place, and return
   [copyto]. Unless [keepversion] is set, the minor version of [copyto] is
   raised to at least 1. If [copyfrom] has no /ID, [copyto] is returned
   untouched. *)
let copy_id keepversion copyfrom copyto =
  match Pdf.lookup_direct copyfrom "/ID" copyfrom.Pdf.trailerdict with
  | None -> copyto (* error "Source PDF file has no /ID entry to copy from" *)
  | Some id ->
      copyto.Pdf.trailerdict <-
        Pdf.add_dict_entry copyto.Pdf.trailerdict "/ID" id;
      copyto.Pdf.minor <-
        if keepversion then copyto.Pdf.minor else max copyto.Pdf.minor 1;
      copyto
(* \section{Remove bookmarks} *)
(* \section{Add bookmarks} *)
(* Read lines from [input] until read_line raises, and return them in the
   order read.
   NOTE(review): every exception ends the loop, not just end-of-input, so a
   genuine read error silently yields the lines read so far -- confirm this
   best-effort behaviour is intended. *)
let read_lines input =
  let lines = ref [] in
  try
    while true do
      let c = read_line input in
      lines =| c
    done; [] (* never reached; gives the loop branch a list type *)
  with
    _ -> rev !lines
(* Verify a list of bookmarks. Positive jumps of > 1 not allowed, no numbers
   smaller than 0.

   The first bookmark must be at level 0. Each following bookmark must be at
   a level no more than one deeper than its predecessor, and must target a
   page number between 0 and [endpage] inclusive.
   NOTE(review): the first bookmark's target page is not range-checked, and
   the level of the second bookmark is compared against [lastlevel] rather
   than the first bookmark's level -- confirm this is intended. *)
let verify_bookmarks pdf lastlevel endpage marks =
  let refnums = Pdf.page_reference_numbers pdf in
  (* Table for Pdfpage.pagenumber_of_target, built once for all lookups. *)
  let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
  let rec check lastlevel = function
    | [] -> true
    | {Pdfmarks.level = level; Pdfmarks.target = target}::more ->
        let page = Pdfpage.pagenumber_of_target pdf ~fastrefnums target in
        level < lastlevel + 2 &&
        level >= 0 &&
        page <= endpage &&
        page >= 0 &&
        check level more
  in
  match marks with
  | [] -> true
  | m::more -> m.Pdfmarks.level = 0 && check lastlevel more
2013-08-20 15:32:57 +01:00
(* Parse a line of the bookmarks file. *)

(* Un-escape things which are escaped. Quotes, newlines and backslashes.
   [prev] accumulates the output in reverse; call with []. The two-character
   sequences \\ , \" and \n become a backslash, a double quote and a newline
   respectively; every other character is copied through. *)
let rec fixup_characters prev = function
  | [] -> List.rev prev
  | '\\'::'\\'::t -> fixup_characters ('\\'::prev) t
  | '\\'::'"'::t -> fixup_characters ('"'::prev) t
  | '\\'::'n'::t -> fixup_characters ('\n'::prev) t
  | h::t -> fixup_characters (h::prev) t
2019-03-19 14:03:28 +00:00
(* Debugging aid: print a bookmark string on stdout with a "STR: " prefix. *)
let debug_bookmark_string s =
  print_string (Printf.sprintf "STR: %s\n" s)
(* If optionaldest = [Pdfgenlex.LexString s], we parse the string, convert the
 * integer to an indirect of the real page target, and then put it in. *)

(* Build a destination from a parsed PDF object taken from a bookmark file.
   An array starting with an integer has that integer replaced by a
   reference to the page object of page [i'] (raising Pdf.PDFError if there
   is no such page) and is then read as a destination. Null gives the null
   destination, a string is read as a destination, and anything else is
   kept as an action. *)
let target_of_markfile_obj pdf i' pdfobj =
  (*Printf.printf "Parsed %s\n" (Pdfwrite.string_of_pdf pdfobj);*)
  match pdfobj with
  | Pdf.Array (Pdf.Integer _::more) ->
      let pageobjnum = Pdfpage.page_object_number pdf i' in
      begin match pageobjnum with
      | None ->
          raise (Pdf.PDFError "bookmark_of_data: page obj num not found")
      | Some p ->
          Pdfdest.read_destination pdf (Pdf.Array (Pdf.Indirect p::more))
      end
  (* Need to deal with "null", "(string)", and "<<other thing like action" *)
  | Pdf.Null -> Pdfdest.NullDestination
  | Pdf.String s -> Pdfdest.read_destination pdf (Pdf.String s)
  | x -> Pdfdest.Action x
(* Destination for a bookmark-file line. When the optional trailing tokens
   are exactly one lexed string, that string is parsed as a PDF object and
   converted with target_of_markfile_obj; otherwise the destination is
   simply page [i']. *)
let target_of_markfile_target pdf i' tokens =
  match tokens with
  | [Pdfgenlex.LexString s] ->
      target_of_markfile_obj pdf i' (Pdfread.parse_single_object s)
  | _ ->
      Pdfpage.target_of_pagenumber pdf i'
(* Build a bookmark from the fields of one bookmark-file line: level [i],
   title [s] (UTF8, with escapes undone by fixup_characters), page number
   [i'], open flag and optional destination tokens. *)
let bookmark_of_data pdf i s i' isopen optionaldest =
  (*debug_bookmark_string s;
  debug_bookmark_string (implode (fixup_characters [] (explode s)));
  debug_bookmark_string (Pdftext.pdfdocstring_of_utf8 (implode (fixup_characters [] (explode s))));*)
  {Pdfmarks.level = i;
   Pdfmarks.text = Pdftext.pdfdocstring_of_utf8 (implode (fixup_characters [] (explode s)));
   Pdfmarks.target = target_of_markfile_target pdf i' optionaldest;
   Pdfmarks.isopen = isopen}
2021-10-27 13:16:55 +01:00
(* Destination for a JSON bookmark: convert the JSON target to a PDF object
   and hand it to target_of_markfile_obj for page [pagenumber]. *)
let target_of_json_target pdf pagenumber target =
  let pdfobj = Cpdfjson.object_of_json target in
  target_of_markfile_obj pdf pagenumber pdfobj
(* Build a bookmark from one JSON object. The object must have exactly the
   fields level, text, page, open and target, in that order, with the types
   matched below; anything else is reported through [error]. *)
let mark_of_json pdf = function
  | `Assoc [("level", `Int level);
            ("text", `String text);
            ("page", `Int pagenumber);
            ("open", `Bool openstatus);
            ("target", target)] ->
      {Pdfmarks.level = level;
       Pdfmarks.text = Pdftext.pdfdocstring_of_utf8 text;
       Pdfmarks.target = target_of_json_target pdf pagenumber target;
       Pdfmarks.isopen = openstatus}
  | _ -> error "malformed mark in mark_of_json"
(* Build the list of bookmarks from the top level of a JSON bookmark file,
   which must be a JSON list; each element goes through mark_of_json. *)
let marks_of_json pdf = function
  | `List ms -> map (mark_of_json pdf) ms
  | _ -> error "top level of JSON bookmark file not a list"
(* Parse a JSON bookmark file from the input [i] and return its bookmarks.
   The JSON is read from the underlying OCaml channel when there is one,
   otherwise from the whole contents of the input. When [verify] is set the
   bookmarks are checked with verify_bookmarks. Every failure, including a
   failed verification, is reported through [error] as a malformed JSON
   bookmark file. *)
let parse_bookmark_file_json verify pdf i =
  let module J = Cpdfyojson.Safe in
  try
    let json =
      match i.Pdfio.caml_channel with
      | Some ch -> J.from_channel ch
      | None ->
          let content = Pdfio.string_of_bytes (Pdfio.bytes_of_input i 0 i.Pdfio.in_channel_length) in
          J.from_string content
    in
    let marks = marks_of_json pdf json in
    if verify then
      if verify_bookmarks pdf 0 (Pdfpage.endpage pdf) marks then marks else
        error "Bad bookmark file (References non-existant pages or is malformed)"
    else
      marks
  with
    e ->
      error (Printf.sprintf "Malformed JSON bookmark file (%s)" (Printexc.to_string e))
2021-10-27 11:44:30 +01:00
2013-08-20 15:32:57 +01:00
(* Parse a text bookmark file from [input] and return its bookmarks in file
   order. Each non-blank line is
     level "title" pagenumber [open] [optional destination]
   When [verify] is set the bookmarks are checked with verify_bookmarks.
   Every failure is reported through [error], quoting the number of the line
   being processed when it occurred. *)
let parse_bookmark_file verify pdf input =
  (* A single counter shared by the parser and the exception handler. A
     second, inner [currline] would shadow this one and leave the handler
     always reporting line 0. *)
  let currline = ref 0 in
  try
    let lines = Pdfio.read_lines input in
    let bookmarks = ref [] in
    iter
      (function line ->
         match
           incr currline;
           Pdfgenlex.lex_string line
         with
         | Pdfgenlex.LexInt i::Pdfgenlex.LexString s::Pdfgenlex.LexInt i'::Pdfgenlex.LexName "open"::optionaldest ->
             bookmarks =| bookmark_of_data pdf i s i' true optionaldest
         | Pdfgenlex.LexInt i::Pdfgenlex.LexString s::Pdfgenlex.LexInt i'::optionaldest ->
             bookmarks =| bookmark_of_data pdf i s i' false optionaldest
         | [] -> () (* ignore blank lines *)
         | _ ->
             error ("Bad bookmark file, line " ^ (string_of_int !currline)))
      lines;
    let bookmarks = rev !bookmarks in
    if verify then
      if verify_bookmarks pdf 0 (Pdfpage.endpage pdf) bookmarks
      then bookmarks
      else
        error
          "Bad bookmark file (References non-existant pages or is malformed)"
    else
      bookmarks
  with
    e ->
      error
        (Printf.sprintf
           "Bad bookmark file (syntax) at line %i (error was %s)"
           !currline
           (Printexc.to_string e))
2013-08-20 15:32:57 +01:00
2021-10-27 11:44:30 +01:00
(* Read bookmarks from [input], as JSON when [json] is set and in the text
   format otherwise, and add them to [pdf] with Pdfmarks.add_bookmarks.
   [verify] is passed on to the parser. *)
let add_bookmarks ~json verify input pdf =
  let parsed =
    (if json then parse_bookmark_file_json else parse_bookmark_file) verify pdf input in
  (*iter (fun b -> flprint (Pdfmarks.string_of_bookmark b); flprint "\n") parsed;*)
  Pdfmarks.add_bookmarks parsed pdf
(* \section{Set page mode} *)
(* Return [pdf] with /PageMode in the catalog set to the name [s], which must
   be one of the modes listed below. The modified catalog is added as a new
   object and becomes the root. Calls [error] on an unknown mode or a file
   without a catalog. *)
let set_page_mode pdf s =
  match s with
  | "UseNone" | "UseOutlines" | "UseThumbs"
  | "FullScreen" | "UseOC" | "UseAttachments" ->
      begin match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with
      | Some catalog ->
          let catalog' =
            Pdf.add_dict_entry catalog "/PageMode" (Pdf.Name ("/" ^ s))
          in
          let catalognum = Pdf.addobj pdf catalog' in
          let trailerdict' =
            Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect catalognum)
          in
          {pdf with
            Pdf.root = catalognum;
            Pdf.trailerdict = trailerdict'}
      | None -> error "bad root"
      end
  | _ -> error "Unknown page mode"
2013-10-02 15:29:53 +01:00
(* Set open action *)
(* Return [pdf] with an /OpenAction which goes to page [pagenumber]
   (1-based) on opening: fitted to the window when [fit] is set, otherwise
   an /XYZ destination with null parameters. The modified catalog is added
   as a new object and becomes the root. Calls [error] when the page number
   is out of range or the file has no catalog. *)
let set_open_action pdf fit pagenumber =
  (* Pages are numbered from 1; select below has no page 0 to return. *)
  if pagenumber > Pdfpage.endpage pdf || pagenumber < 1 then
    error "set_open_action: invalid page number"
  else
    let pageobjectnumber = select pagenumber (Pdf.page_reference_numbers pdf) in
    let destination =
      if fit then
        Pdf.Array [Pdf.Indirect pageobjectnumber; Pdf.Name "/Fit"]
      else
        Pdf.Array [Pdf.Indirect pageobjectnumber; Pdf.Name "/XYZ"; Pdf.Null; Pdf.Null; Pdf.Null]
    in
    let open_action =
      Pdf.Dictionary [("/D", destination); ("/S", Pdf.Name "/GoTo")]
    in
    match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with
    | Some catalog ->
        let catalog' =
          Pdf.add_dict_entry catalog "/OpenAction" open_action
        in
        let catalognum = Pdf.addobj pdf catalog' in
        let trailerdict' =
          Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect catalognum)
        in
        {pdf with Pdf.root = catalognum; Pdf.trailerdict = trailerdict'}
    | None -> error "bad root"
2013-08-20 15:32:57 +01:00
(* \section{Set viewer preferences} *)
(* Return [pdf] with [key] set to [value] in the catalog's
   /ViewerPreferences dictionary (created if absent), and the minor version
   raised to at least [version]. The modified catalog is added as a new
   object and becomes the root. Calls [error] if there is no catalog. *)
let set_viewer_preference (key, value, version) pdf =
  match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with
  | Some catalog ->
      let viewer_preferences =
        match Pdf.lookup_direct pdf "/ViewerPreferences" catalog with
        | Some d -> d
        | None -> Pdf.Dictionary []
      in
      let viewer_preferences' =
        Pdf.add_dict_entry viewer_preferences key value
      in
      let catalog' =
        Pdf.add_dict_entry catalog "/ViewerPreferences" viewer_preferences'
      in
      let catalognum = Pdf.addobj pdf catalog' in
      let trailerdict' =
        Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect catalognum)
      in
      {pdf with
        Pdf.minor = max pdf.Pdf.minor version;
        Pdf.root = catalognum;
        Pdf.trailerdict = trailerdict'}
  | None -> error "bad root"
2019-06-28 15:01:28 +01:00
2013-08-20 15:32:57 +01:00
(* \section{Set page layout} *)
(* Return [pdf] with /PageLayout in the catalog set to the name [s], which
   must be one of the layouts listed below. The modified catalog is added as
   a new object and becomes the root. Calls [error] on an unknown layout or
   a file without a catalog. *)
let set_page_layout pdf s =
  match s with
  | "SinglePage" | "OneColumn" | "TwoColumnLeft"
  | "TwoColumnRight" | "TwoPageLeft" | "TwoPageRight" ->
      begin match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with
      | Some catalog ->
          let catalog' =
            Pdf.add_dict_entry catalog "/PageLayout" (Pdf.Name ("/" ^ s))
          in
          let catalognum = Pdf.addobj pdf catalog' in
          let trailerdict' =
            Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect catalognum)
          in
          {pdf with
            Pdf.root = catalognum;
            Pdf.trailerdict = trailerdict'}
      | None -> error "bad root"
      end
  | _ -> error "Unknown page layout"
(* \section{Set or replace metadata} *)
(* Return [pdf] with its catalog's /Metadata pointing at a new
   /Type /Metadata, /Subtype /XML stream holding [data]. The modified
   catalog is added as a new object and becomes the root. Unless
   [keepversion] is set, the minor version is raised to at least 4. Calls
   [error] if there is no catalog. *)
let set_metadata_from_bytes keepversion data pdf =
  let metadata_stream =
    Pdf.Stream
      {contents =
        (Pdf.Dictionary
           ["/Length", Pdf.Integer (bytes_size data);
            "/Type", Pdf.Name "/Metadata";
            "/Subtype", Pdf.Name "/XML"],
         Pdf.Got data)}
  in
  let objnum = Pdf.addobj pdf metadata_stream in
  let document_catalog =
    match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with
    | Some s -> s
    | None -> error "Malformed PDF: No root."
  in
  let document_catalog' =
    Pdf.add_dict_entry document_catalog "/Metadata" (Pdf.Indirect objnum)
  in
  let rootnum = Pdf.addobj pdf document_catalog' in
  let trailerdict =
    Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect rootnum)
  in
  {pdf with
    Pdf.trailerdict = trailerdict;
    Pdf.root = rootnum;
    Pdf.minor =
      if keepversion then pdf.Pdf.minor else max 4 pdf.Pdf.minor}
(* Read the whole of the file [filename] and install it as the metadata of
   [pdf] with set_metadata_from_bytes. The input channel is closed whether
   or not reading succeeds; exceptions from opening or reading the file
   propagate to the caller. *)
let set_metadata keepversion filename pdf =
  let ch = open_in_bin filename in
  let data =
    try
      let data = mkbytes (in_channel_length ch) in
      for x = 0 to bytes_size data - 1 do
        bset data x (input_byte ch)
      done;
      close_in ch;
      data
    with e ->
      (* Do not leak the channel if reading fails part-way. *)
      close_in_noerr ch;
      raise e
  in
  set_metadata_from_bytes keepversion data pdf
2019-07-01 14:40:22 +01:00
2013-08-20 15:32:57 +01:00
(* \section{Remove metadata} *)
(* Return [pdf] with the /Metadata entry removed from its catalog. The
   modified catalog is added as a new object and becomes the root. Calls
   [error] if there is no catalog. *)
let remove_metadata pdf =
  match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with
  | None -> error "malformed file"
  | Some root ->
      let root' = Pdf.remove_dict_entry root "/Metadata" in
      let rootnum = Pdf.addobj pdf root' in
      {pdf with
         Pdf.trailerdict =
           Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect rootnum);
         Pdf.root =
           rootnum}
(* List bookmarks *)

(* Textual form of a bookmark target for the bookmark listing: the
   destination written as a PDF object inside double quotes. When the
   destination is a non-empty array, its first element is replaced by the
   page number of the target. *)
let output_string_of_target pdf fastrefnums x =
  match Pdfdest.pdfobject_of_destination x with
  | Pdf.Array (_::more) ->
      let a =
        Pdf.Array (Pdf.Integer (Pdfpage.pagenumber_of_target ~fastrefnums pdf x)::more)
      in
      "\"" ^ Pdfwrite.string_of_pdf a ^ "\""
  | x -> "\"" ^ Pdfwrite.string_of_pdf x ^ "\""
2013-08-20 15:32:57 +01:00
2021-10-26 19:41:14 +01:00
(* JSON form of a bookmark target: the destination converted with
   Cpdfjson.json_of_object. When the destination is a non-empty array, its
   first element is replaced by the page number of the target. *)
let json_of_target pdf fastrefnums x =
  match Pdfdest.pdfobject_of_destination x with
  | Pdf.Array (_::more) ->
      let a =
        Pdf.Array (Pdf.Integer (Pdfpage.pagenumber_of_target ~fastrefnums pdf x)::more)
      in
      Cpdfjson.json_of_object pdf (fun _ -> ()) false false a
  | x -> Cpdfjson.json_of_object pdf (fun _ -> ()) false false x
(* Write [marks] to the channel [ch] as a pretty-printed JSON list. Each
   bookmark becomes an object with the fields level, text (UTF8), page (from
   [calculate_page_number]), open and target, in the order mark_of_json
   expects. *)
let output_json_marks ch calculate_page_number pdf fastrefnums marks =
  let module J = Cpdfyojson.Safe in
  let json_of_mark m =
    `Assoc
      [("level", `Int m.Pdfmarks.level);
       ("text", `String (Pdftext.utf8_of_pdfdocstring m.Pdfmarks.text));
       ("page", `Int (calculate_page_number m));
       ("open", `Bool m.Pdfmarks.isopen);
       ("target", json_of_target pdf fastrefnums m.Pdfmarks.target)]
  in
  let json = `List (map json_of_mark marks) in
  J.pretty_to_channel ch json
2021-10-27 11:44:30 +01:00
(* List the bookmarks in the given range to the given output.

   A bookmark is listed when the range covers the whole document, when its
   target is the null destination or a named destination elsewhere, or when
   its target page is in [range]. In the text format each bookmark gives one
   line
     level "title" pagenumber[ open] "destination"
   with backslashes, newlines and double quotes in the title escaped, and
   the title encoded according to [encoding] (Raw titles are written
   unescaped, as stored).
   NOTE(review): with [json] set the listing goes to stdout, not to
   [output] -- confirm that this is intended. *)
let list_bookmarks ~json encoding range pdf output =
  (* Keep only 7 bit codepoints. *)
  let process_stripped escaped =
    let b = Buffer.create 200 in
    iter
      (fun x ->
         if x <= 127 then Buffer.add_char b (char_of_int x))
      escaped;
    Buffer.contents b
  in
  let process_string s =
    (* Replace each occurrence of c by the two elements x, y. *)
    let rec replace c x y = function
      | [] -> []
      | h::t when h = c -> x::y::replace c x y t
      | h::t -> h::replace c x y t
    in
    (* Convert to UTF8, raw, or stripped, and escape backslashed and quotation marks *)
    let codepoints = Pdftext.codepoints_of_pdfdocstring s in
    let escaped =
      let bs = int_of_char '\\'
      and nl = int_of_char '\n'
      and n = int_of_char 'n'
      and q = int_of_char '\"' in
      (* Backslashes first, so that those added by the later escapes are
         not themselves escaped. *)
      replace q bs q (replace nl bs n (replace bs bs bs codepoints))
    in
    match encoding with
    | UTF8 -> Pdftext.utf8_of_codepoints escaped
    | Stripped -> process_stripped escaped
    | Raw -> s
  in
  let bookmarks = Pdfmarks.read_bookmarks pdf in
  let refnums = Pdf.page_reference_numbers pdf in
  let rangetable = hashset_of_list range in
  let range_is_all = range = ilist 1 (Pdfpage.endpage pdf) in
  let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
  (* Find the pagenumber of each bookmark target. If it is in the range,
   * keep that bookmark. Also keep the bookmark if its target is the null
   * destination. *)
  let inrange =
    keep
      (function x ->
         range_is_all ||
         x.Pdfmarks.target = Pdfdest.NullDestination ||
         (match x.Pdfmarks.target with Pdfdest.NamedDestinationElsewhere _ -> true | _ -> false) ||
         Hashtbl.mem rangetable (Pdfpage.pagenumber_of_target ~fastrefnums pdf x.Pdfmarks.target))
      bookmarks
  in
  let calculate_page_number mark =
    (* Some buggy PDFs use integers for page numbers instead of page
     * object references. Adobe Reader and Preview seem to support
     * this, for presumably historical reasons. So if we see a
     * OtherDocPageNumber (which is what Pdfdest parses these as,
     * because that's what they are legitimately, we use this as the
     * page number. It is zero based, though, and we are one-based, so
     * we add one. Pdfpage.pagenumber_of_target has been modified to support this.*)
    Pdfpage.pagenumber_of_target ~fastrefnums pdf mark.Pdfmarks.target
  in
  if json then
    output_json_marks stdout calculate_page_number pdf fastrefnums inrange
  else
    iter
      (function mark ->
         output.Pdfio.output_string
           (Printf.sprintf "%i \"%s\" %i%s %s\n"
              mark.Pdfmarks.level
              (process_string mark.Pdfmarks.text)
              (calculate_page_number mark)
              (if mark.Pdfmarks.isopen then " open" else "")
              (output_string_of_target pdf fastrefnums mark.Pdfmarks.target)))
      inrange
2013-08-20 15:32:57 +01:00
(* o is the stamp, u is the main pdf page *)
(* \section{Split at bookmarks} *)

(* Make a string safe for use in a file name: drop the characters
   / ? < > \ : * | " ^ + = and everything outside the printable ASCII range
   32..126, then drop one leading '.' if the result starts with one.
   Returns empty string on failure. Should only be used in conjunction with
   split at bookmarks code, so should never fail, by definition. *)
let remove_unsafe_characters s =
  let unsafe c =
    match c with
    | '/' | '?' | '<' | '>' | '\\' | ':' | '*' | '|' | '\"' | '^' | '+' | '=' -> true
    | c -> Char.code c < 32 || Char.code c > 126
  in
  let b = Buffer.create (String.length s) in
  String.iter (fun c -> if not (unsafe c) then Buffer.add_char b c) s;
  let cleaned = Buffer.contents b in
  (* Only a single leading dot is removed. *)
  if cleaned <> "" && cleaned.[0] = '.'
  then String.sub cleaned 1 (String.length cleaned - 1)
  else cleaned
(* File-name-safe title of the first bookmark in [marks] which targets page
   [n] at a level no deeper than [splitlevel], or "" if there is none. The
   last argument is ignored. *)
let get_bookmark_name pdf marks splitlevel n _ =
  let refnums = Pdf.page_reference_numbers pdf in
  let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
  match keep (function m -> n = Pdfpage.pagenumber_of_target ~fastrefnums pdf m.Pdfmarks.target && m.Pdfmarks.level <= splitlevel) marks with
  | {Pdfmarks.text = title}::_ -> remove_unsafe_characters title
  | _ -> ""
(* Find the stem of a filename *)
(*let stem s =
implode (rev (tail_no_fail (dropwhile (neq '.') (rev (explode (Filename.basename s))))))*)
2013-08-20 15:32:57 +01:00
(* Return list, in order, a *set* of page numbers of bookmarks at a given
   level: the target page of every bookmark of [pdf] at exactly [level], with
   duplicates removed and order of first appearance kept. *)
let bookmark_pages level pdf =
  let refnums = Pdf.page_reference_numbers pdf in
  let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
  setify_preserving_order
    (option_map
       (function l when l.Pdfmarks.level = level -> Some (Pdfpage.pagenumber_of_target ~fastrefnums pdf l.Pdfmarks.target) | _ -> None)
       (Pdfmarks.read_bookmarks pdf))
(* Called from cpdflib.ml - different from above *)
(* Split [pdf] into one document per run of pages, starting a new run at
   each page targeted by a bookmark at [level]. *)
let split_on_bookmarks pdf level =
  (* Split points are the pages before each bookmarked page; a bookmark on
     page 1 gives point 0, which is dropped. *)
  let points = bookmark_pages level pdf |> map pred |> lose (eq 0) in
  let page_numbers = indx (Pdfpage.pages_of_pagetree pdf) in
  splitat points page_numbers
  |> map (fun run -> Pdfpage.pdf_of_pages pdf run)
(* Output information for each page *)
(* For each page number in [range], print on stdout the page number, its
   label (empty if it has none), the five page boxes (empty where a box is
   absent or not a four-element array) and the rotation. *)
let output_page_info pdf range =
  let pages = Pdfpage.pages_of_pagetree pdf
  and labels = Pdfpagelabels.read pdf in
  let getbox page box =
    (* The media box lives in its own field of the page record; the other
       boxes are looked up in the rest of the page dictionary. *)
    if box = "/MediaBox" then
      (match page.Pdfpage.mediabox with
       | Pdf.Array [a; b; c; d] ->
           Printf.sprintf "%f %f %f %f"
             (Pdf.getnum a) (Pdf.getnum b) (Pdf.getnum c) (Pdf.getnum d)
       | _ -> "")
    else
      (match Pdf.lookup_direct pdf box page.Pdfpage.rest with
       | Some (Pdf.Array [a; b; c; d]) ->
           Printf.sprintf "%f %f %f %f"
             (Pdf.getnum a) (Pdf.getnum b) (Pdf.getnum c) (Pdf.getnum d)
       | _ -> "")
  and rotation page =
    Pdfpage.int_of_rotation page.Pdfpage.rotate
  in
  iter
    (fun pnum ->
       let page = select pnum pages in
       Printf.printf "Page %i:\n" pnum;
       Printf.printf "Label: %s\n"
         (try Pdfpagelabels.pagelabeltext_of_pagenumber pnum labels with Not_found -> "");
       Printf.printf "MediaBox: %s\n" (getbox page "/MediaBox");
       Printf.printf "CropBox: %s\n" (getbox page "/CropBox");
       Printf.printf "BleedBox: %s\n" (getbox page "/BleedBox");
       Printf.printf "TrimBox: %s\n" (getbox page "/TrimBox");
       Printf.printf "ArtBox: %s\n" (getbox page "/ArtBox");
       Printf.printf "Rotation: %i\n" (rotation page))
    range
2013-08-20 15:32:57 +01:00
(* Does the page have a defined box e.g "/CropBox" *)
(* [hasbox pdf page boxname] is true when page number [page] (1-based) has an
   entry called [boxname] in its remaining page dictionary.
   Raises [Failure] with the message hasbox: bad page when [page] is outside
   the document. *)
let hasbox pdf page boxname =
  let all_pages = Pdfpage.pages_of_pagetree pdf in
  let out_of_range = page > length all_pages || page < 1 in
  if out_of_range then raise (Failure "hasbox: bad page")
  else
    let target = select page all_pages in
    match Pdf.lookup_direct pdf boxname target.Pdfpage.rest with
    | None -> false
    | Some _ -> true
2014-10-13 18:16:06 +01:00
(* Print metadata *)
2013-08-20 15:32:57 +01:00
(* Return [Some data] holding the decoded bytes of the /Metadata stream found
   under the document /Root, or [None] when /Metadata is absent or is not a
   stream. Calls [error] when the trailer has no /Root.
   NOTE(review): the [end] closing the inner [begin match] is not visible in
   this listing - confirm against upstream. *)
let get_metadata pdf =
match Pdf.lookup_direct pdf "/Root" pdf.Pdf.trailerdict with
| None -> error "malformed file"
| Some root ->
match Pdf.lookup_direct pdf "/Metadata" root with
| Some ((Pdf.Stream _) as s) ->
(* Decode in place so the stream contents can be read below. *)
Pdfcodec.decode_pdfstream pdf s;
2013-08-20 15:32:57 +01:00
begin match s with
| Pdf.Stream {contents = (_, Pdf.Got data)} -> Some data
2013-08-20 15:32:57 +01:00
| _ -> assert false
2014-09-08 17:55:14 +01:00
| _ -> None
2013-08-20 15:32:57 +01:00
(* Write the metadata bytes returned by [get_metadata] to standard output, one
   character per byte; prints nothing when there is no metadata.
   NOTE(review): the [done] closing the loop is not visible in this listing -
   confirm against upstream. *)
let print_metadata pdf =
2014-09-08 17:55:14 +01:00
match get_metadata pdf with
None -> ()
| Some data ->
for x = 0 to bytes_size data - 1 do
Printf.printf "%c" (char_of_int (bget data x))
2021-11-03 13:40:54 +00:00
(* List fonts *)
2013-08-20 15:32:57 +01:00
(* Describe one font dictionary entry [(name, dict)] found on [page] as the
   tuple (page, name, subtype, basefont, encoding). Each of the last three is
   the printed form of the corresponding /Subtype, /BaseFont or /Encoding name,
   or the empty string when that entry is missing or is not a name.
   NOTE(review): the [in] before the final tuple is not visible in this
   listing - confirm against upstream. *)
let list_font pdf page (name, dict) =
let subtype =
match Pdf.lookup_direct pdf "/Subtype" dict with
2021-11-03 13:40:54 +00:00
| Some (Pdf.Name n) -> Pdfwrite.string_of_pdf (Pdf.Name n)
2013-08-20 15:32:57 +01:00
| _ -> ""
in let basefont =
match Pdf.lookup_direct pdf "/BaseFont" dict with
2021-11-03 13:40:54 +00:00
| Some (Pdf.Name n) -> Pdfwrite.string_of_pdf (Pdf.Name n)
2013-08-20 15:32:57 +01:00
| _ -> ""
in let encoding =
match Pdf.lookup_direct pdf "/Encoding" dict with
2021-11-03 13:40:54 +00:00
| Some (Pdf.Name n) -> Pdfwrite.string_of_pdf (Pdf.Name n)
2013-08-20 15:32:57 +01:00
| _ -> ""
2021-11-03 13:40:54 +00:00
(page, name, subtype, basefont, encoding)
2013-08-20 15:32:57 +01:00
2021-11-03 16:51:28 +00:00
(* Collect font descriptions (see [list_font]) for the pages of [pdf] whose
   1-based number is a member of [range]. A page contributes one entry per
   item of its /Font resource dictionary, and nothing when that dictionary is
   absent.
   NOTE(review): the outer function applied to the per-page results, the
   [else] branch and the closing [end] are not visible in this listing -
   confirm against upstream. *)
let list_fonts pdf range =
2013-08-20 15:32:57 +01:00
let pages = Pdfpage.pages_of_pagetree pdf in
(fun (num, page) ->
2021-11-03 16:51:28 +00:00
if mem num range then
begin match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
| Some (Pdf.Dictionary fontdict) ->
map (list_font pdf num) fontdict
| _ -> []
2013-08-20 15:32:57 +01:00
(combine (ilist 1 (length pages)) pages))
(* Render one font description as a single line: the page number followed by
   the four text fields, separated by single spaces and ended by a newline. *)
let string_of_font (page, name, subtype, basefont, encoding) =
  let fields = [string_of_int page; name; subtype; basefont; encoding] in
  String.concat " " fields ^ "\n"
2021-11-03 16:51:28 +00:00
(* Build the concatenation of [string_of_font] over every font found by
   [list_fonts pdf range].
   NOTE(review): the function that receives the concatenated string is not
   visible in this listing (presumably a print call) - confirm against
   upstream. *)
let print_fonts pdf range =
2013-08-20 15:32:57 +01:00
2021-11-03 16:51:28 +00:00
(fold_left ( ^ ) "" (map string_of_font (list_fonts pdf range)))
2013-08-20 15:32:57 +01:00
(* \section{Superimpose text, page numbers etc.} *)
2021-11-01 17:09:58 +00:00
(* Process UTF8 text to /WinAnsiEncoding string (for standard 14) or whatever
is in the font (for existing fonts). *)
2021-12-02 12:04:14 -08:00
(* Convert the UTF8 string [s] to a string of character codes for [font].
   Each codepoint is passed to the charcode extractor built for the font; a
   codepoint with no character code produces a warning on standard error and
   yields [None].
   NOTE(review): the function applied to the per-codepoint mapper, its list
   argument and the closing [in] are not visible in this listing - confirm
   against upstream. *)
let charcodes_of_utf8 font s =
let extractor = Pdftext.charcode_extractor_of_font_real ~debug:false font in
2021-11-01 17:09:58 +00:00
let codepoints = Pdftext.codepoints_of_utf8 s in
2021-11-11 15:05:07 -08:00
let charcodes =
(fun codepoint ->
match extractor codepoint with
| Some cc -> Some cc
| None -> Printf.eprintf "Warning: character not found in font for unicode codepoint 0x%X\n" codepoint; None)
implode (map char_of_int charcodes)
2013-08-20 15:32:57 +01:00
(* Process codepoints back to UTF8, assuming it came from UTF8 to start with *)
(* Convert the string [s] back to UTF8 by reading it as codepoints with a text
   extractor for the standard Times Roman font under WinAnsiEncoding.
   NOTE(review): the constructor of [text_extractor] and the closing [in] are
   not visible in this listing - confirm against upstream. *)
let utf8_of_winansi s =
let text_extractor =
2021-12-02 12:04:14 -08:00
(Pdftext.StandardFont (Pdftext.TimesRoman, Pdftext.WinAnsiEncoding))
2013-08-20 15:32:57 +01:00
let codepoints = Pdftext.codepoints_of_text text_extractor s in
Pdftext.utf8_of_codepoints codepoints
(* Get the width of some text in the given font *)
(* Sum the per-character widths of [text] using the metrics array of a simple
   font, indexed by character code. Fonts that are not simple fonts with
   metrics give 0.
   NOTE(review): the [with] of the [try] and its closing [end] are not visible
   in this listing; the [_ -> 0.] line is presumably the handler - confirm
   against upstream. *)
let width_of_text font text =
match font with
| Pdftext.SimpleFont {Pdftext.fontmetrics = Some fontmetrics} ->
begin try
fold_left ( +. ) 0. (map (fun c -> fontmetrics.(int_of_char c)) (explode text))
_ -> 0.
| _ -> 0.
(* One recorded piece of stamped text: the text itself, the x and y
   coordinates at which it was placed (after offsets were subtracted, see
   [ops]), and the rotation that was applied. *)
type ops_metrics =
{metrics_text : string;
metrics_x : float;
metrics_y : float;
metrics_rot : float}
(* Metrics recorded while stamping text, and the last baseline adjustment. *)
let ops_metrics : ops_metrics list ref = ref []

let ops_baseline_adjustment = ref 0.0

(* Number of recorded metrics entries. *)
let metrics_howmany () = length !ops_metrics

(* Accessors for the [n]th recorded entry, located with [select]. The text is
   passed through [utf8_of_winansi] before being returned. *)
let metrics_text n =
  let entry = select n !ops_metrics in
  utf8_of_winansi entry.metrics_text

let metrics_x n =
  let entry = select n !ops_metrics in
  entry.metrics_x

let metrics_y n =
  let entry = select n !ops_metrics in
  entry.metrics_y

let metrics_rot n =
  let entry = select n !ops_metrics in
  entry.metrics_rot

(* Current value of the baseline adjustment. *)
let metrics_baseline_adjustment () = !ops_baseline_adjustment
2021-11-15 11:17:15 -08:00
(* Operator that sets the given colour using the lower-case colour operators
   [Op_g], [Op_rg] and [Op_k]. Components are passed through in the order in
   which the constructor holds them. *)
let colour_op colour =
  match colour with
  | Grey level -> Pdfops.Op_g level
  | RGB (r, g, b) -> Pdfops.Op_rg (r, g, b)
  | CYMK (c1, c2, c3, c4) -> Pdfops.Op_k (c1, c2, c3, c4)
(* Operator that sets the given colour using the upper-case colour operators
   [Op_G], [Op_RG] and [Op_K]. Components are passed through in the order in
   which the constructor holds them. *)
let colour_op_stroke colour =
  match colour with
  | Grey level -> Pdfops.Op_G level
  | RGB (r, g, b) -> Pdfops.Op_RG (r, g, b)
  | CYMK (c1, c2, c3, c4) -> Pdfops.Op_K (c1, c2, c3, c4)
2013-08-20 15:32:57 +01:00
(* Build the operators that draw one piece of [text]. When [metrics] is set,
   the text, its position (x and y less the offsets) and rotation are recorded
   in [ops_metrics]. The visible fragments translate and rotate to the target
   position, select outline or fill rendering, set fill and stroke colour,
   optionally select an extended graphics state, choose the font and show the
   text.
   NOTE(review): this block is heavily truncated in this listing (many lines
   between the visible fragments are missing) - read it against upstream. *)
let ops longest_w metrics x y rotate hoffset voffset outline linewidth unique_fontname unique_extgstatename colour fontsize text =
if metrics then
ops_metrics :=
{metrics_text = text; metrics_x = x -. hoffset; metrics_y = y -. voffset; metrics_rot = rotate}
[Pdftransform.Translate (x -. hoffset, y -. voffset);
Pdftransform.Rotate ((0., 0.), rotate)]);
] @
(if outline then [Pdfops.Op_w linewidth; Pdfops.Op_Tr 1] else [Pdfops.Op_Tr 0]) @
2021-11-15 11:17:15 -08:00
[colour_op colour; colour_op_stroke colour]
2013-08-20 15:32:57 +01:00
(match unique_extgstatename with None -> [] | Some n -> [Pdfops.Op_gs n])
[Pdfops.Op_Tf (unique_fontname, fontsize);
Pdfops.Op_Tj text;
2021-08-12 20:38:55 +01:00
(* How a line is aligned relative to the longest line of a multi-line stamp;
   consumed by [find_justification_offsets]. *)
type justification = LeftJustify | CentreJustify | RightJustify
2013-08-20 15:32:57 +01:00
(* Find the h-offset for justification based on the longest width, the current
width, the justification and the position. *)
2021-08-12 20:38:55 +01:00
(* Horizontal offset for a line of width [w] when the longest line has width
   [longest_w], given the justification [j] and the anchor [position]. The
   offset is 0, plus or minus the full slack [longest_w - w], or plus or minus
   half of it, depending on whether the anchor is on the left, centre or right.
   Diagonal positions always give 0.
   NOTE(review): the [end] closing each inner [begin match] is not visible in
   this listing - confirm against upstream. *)
let find_justification_offsets longest_w w position j =
let open Cpdfposition in
match j with
| LeftJustify ->
begin match position with
| TopLeft _ | Left _ | PosLeft _ | BottomLeft _ -> 0.
| Top _ | PosCentre _ | Bottom _ | Centre -> (longest_w -. w) /. 2.
| TopRight _ | BottomRight _ | PosRight _ | Right _ -> longest_w -. w
| Diagonal -> 0.
| ReverseDiagonal -> 0.
| RightJustify ->
begin match position with
| TopLeft _ | Left _ | PosLeft _ | BottomLeft _ -> ~-.(longest_w -. w)
| Top _ | PosCentre _ | Bottom _ | Centre -> ~-.((longest_w -. w) /. 2.)
| TopRight _ | BottomRight _ | PosRight _ | Right _ -> 0.
| Diagonal -> 0.
| ReverseDiagonal -> 0.
| CentreJustify ->
begin match position with
| TopLeft _ | Left _ | PosLeft _ | BottomLeft _ -> ~-.((longest_w -. w) /. 2.)
| Top _ | PosCentre _ | Bottom _ | Centre -> 0.
| TopRight _ | BottomRight _ | PosRight _ | Right _ -> (longest_w -. w) /. 2.
| Diagonal -> 0.
| ReverseDiagonal -> 0.
2013-08-20 15:32:57 +01:00
2014-09-08 17:55:14 +01:00
(* Lex an integer from the table *)
(* Look up key [s] in the [header] table and lex its value as exactly one
   number, returned as a PDF integer or real.
   Raises [Failure] naming [s] when the value is not a single number, and
   [Not_found] (from [Hashtbl.find]) when the key is absent. *)
let extract_num header s =
  let raw = Hashtbl.find header s in
  match Pdfgenlex.lex_string raw with
  | [Pdfgenlex.LexReal f] -> Pdf.Real f
  | [Pdfgenlex.LexInt i] -> Pdf.Integer i
  | _ -> raise (Failure ("extract_num: " ^ s))
2014-09-08 17:55:14 +01:00
(* Look up key [s] in the [header] table and lex its value as exactly four
   numbers, returned as a list of PDF integers or reals.
   Raises [Failure] when the value does not lex to four numeric tokens.
   NOTE(review): the [in] after the local [num] function is not visible in
   this listing - confirm against upstream. *)
let extract_fontbbox header s =
let num = function
Pdfgenlex.LexInt i -> Pdf.Integer i
| Pdfgenlex.LexReal f -> Pdf.Real f
| _ -> raise (Failure "extract_fontbbox")
match Pdfgenlex.lex_string (Hashtbl.find header s) with
[a; b; c; d] -> [num a; num b; num c; num d]
| _ -> raise (Failure "extract_fontbbox")
(* Drop the leading slash from the name [s].
   Raises [Failure] with the message remove_slash when [s] is empty or does
   not begin with a slash. *)
let remove_slash s =
  let len = String.length s in
  if len > 0 && s.[0] = '/' then String.sub s 1 (len - 1)
  else raise (Failure "remove_slash")
(* Produce a width for each character code from 0 to 255. A code is mapped to
   its glyph name through the inverted [Pdfglyphlist.name_to_win] table, and
   the name (without its leading slash) is looked up in [chars_and_widths].
   NOTE(review): the mapping function, the [try], the returned [width] and the
   [with] are not visible in this listing; the [_ -> 0] line is presumably the
   handler giving width 0 on lookup failure - confirm against upstream. *)
let extract_widths chars_and_widths =
2020-03-04 10:50:32 -08:00
let win_to_name = map (fun (x, y) -> (y, x)) Pdfglyphlist.name_to_win in
(fun x ->
let name = List.assoc x win_to_name in
let width = List.assoc (remove_slash name) chars_and_widths in
_ -> 0)
(ilist 0 255)
2014-09-08 17:55:14 +01:00
(* Build the font dictionary entries for the standard font called [fontname].
   Metrics come from the AFM data of the font; missing Ascender, Descender or
   CapHeight header entries default to 0. When [embed] is set the fuller
   dictionary (widths for codes 0 to 255 and a font descriptor) is produced,
   otherwise the four-entry one.
   NOTE(review): several single-token lines (the [in] keywords, the dictionary
   constructor around each entry list, and the [else]) are not visible in this
   listing - confirm against upstream. *)
let make_font embed fontname =
2014-09-08 17:55:14 +01:00
let font = unopt (Pdftext.standard_font_of_name ("/" ^ fontname)) in
let header, width_data, _, chars_and_widths = Pdfstandard14.afm_data font in
let widths = extract_widths (list_of_hashtbl chars_and_widths) in
let flags = Pdfstandard14.flags_of_standard_font font in
let fontbbox = extract_fontbbox header "FontBBox" in
let italicangle = extract_num header "ItalicAngle" in
let ascent = try extract_num header "Ascender" with _ -> Pdf.Integer 0 in
let descent = try extract_num header "Descender" with _ -> Pdf.Integer 0 in
let capheight = try extract_num header "CapHeight" with _ -> Pdf.Integer 0 in
let stemv = Pdfstandard14.stemv_of_standard_font font in
2014-09-08 17:55:14 +01:00
let fontdescriptor =
[("/Type", Pdf.Name "/FontDescriptor");
("/FontName", Pdf.Name ("/" ^ fontname));
("/Flags", Pdf.Integer flags);
("/FontBBox", Pdf.Array fontbbox);
("/ItalicAngle", italicangle);
("/Ascent", ascent);
("/Descent", descent);
("/CapHeight", capheight);
2014-09-08 17:55:14 +01:00
("/StemV", Pdf.Integer stemv)]
(* With -no-embed-font, we use the standard encoding, and just the
* minimal stuff. Without -no-embed-font, we switch to WinAnsiEncoding,
* and fill out everything except the font file instead *)
(* NOTE(review): both entry lists below set /Encoding to /WinAnsiEncoding,
   which does not match the standard-encoding claim above - confirm which is
   intended. *)
if embed then
[("/Type", Pdf.Name "/Font");
("/Subtype", Pdf.Name "/Type1");
("/BaseFont", Pdf.Name ("/" ^ fontname));
("/Encoding", Pdf.Name "/WinAnsiEncoding");
("/FirstChar", Pdf.Integer 0);
("/LastChar", Pdf.Integer 255);
("/Widths", Pdf.Array (map (fun x -> Pdf.Integer x) widths));
("/FontDescriptor", fontdescriptor)]
[("/Type", Pdf.Name "/Font");
("/Subtype", Pdf.Name "/Type1");
("/Encoding", Pdf.Name "/WinAnsiEncoding");
("/BaseFont", Pdf.Name ("/" ^ fontname))]
2014-09-08 17:55:14 +01:00
2016-11-13 14:02:09 +00:00
(* Extract the text shown on [page] as one string. A font-selection operator
   installs a text extractor for the named font (raising [Pdf.PDFError] when
   the /Font dictionary or the font itself is missing) and records whether the
   font size equals [only_fontsize]; text-showing operators are then decoded
   through that extractor. The third argument is ignored.
   NOTE(review): this block is heavily truncated in this listing (the mapping
   over operators, the [then]/[else] branches and several [end]/[in] lines are
   missing), so the exact filtering on font size cannot be read from here -
   confirm against upstream. *)
let extract_page_text only_fontsize pdf _ page =
let text_extractor = ref None in
let right_font_size = ref false in
fold_left ( ^ ) ""
| Pdfops.Op_Tf (fontname, fontsize) ->
right_font_size :=
begin match only_fontsize with
Some x -> x = fontsize
| _ -> false
let fontdict =
match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
| None -> raise (Pdf.PDFError "Missing /Font in text extraction")
| Some d ->
match Pdf.lookup_direct pdf fontname d with
| None -> raise (Pdf.PDFError "Missing font in text extraction")
| Some d -> d
text_extractor := Some (Pdftext.text_extractor_of_font pdf fontdict);
| Pdfops.Op_Tj text when !text_extractor <> None ->
if not !right_font_size then
(Pdftext.codepoints_of_text (unopt !text_extractor) text)
| Pdfops.Op_TJ (Pdf.Array objs) when !text_extractor <> None ->
if not !right_font_size then
fold_left ( ^ ) ""
| Pdf.String text ->
(Pdftext.codepoints_of_text (unopt !text_extractor) text))
| _ -> None)
| _ -> "")
(Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content))
(* For each page, extract all the ops with text in them, and concatenate it all together *)
(* Extract the text of every page in [range] with [extract_page_text] and join
   the per-page strings. A newline is inserted between two pieces only when
   both the text accumulated so far and the next piece are non-empty. *)
let extract_text extract_text_font_size pdf range =
  let join_pages so_far page_text =
    let separator = if so_far <> "" && page_text <> "" then "\n" else "" in
    so_far ^ separator ^ page_text
  in
  let per_page = map_pages (extract_page_text extract_text_font_size pdf) pdf range in
  fold_left join_pages "" per_page
2021-08-12 20:38:55 +01:00
2013-08-20 15:32:57 +01:00
(* Stamp one line of [text] onto each page in [pages]. For every page the
   special codes listed in [replace_pairs] are substituted, the text width and
   the widths of all [lines] are computed to derive the justification offset,
   the position is calculated against the crop box (when [cropbox] is set and
   the page has one) or the media box, and the resulting operators are added
   under or over the existing content according to [underneath]. When
   [opacity] is below 1.0 an extended graphics state entry is added to the
   page resources. When [metrics] is set the pages are only visited to record
   metrics and the input document is returned.
   NOTE(review): many single-token lines ([in], [else], [end], and some
   function names) are not visible in this listing - read this block against
   upstream before relying on details. *)
let addtext
2015-07-17 16:34:47 +01:00
metrics lines linewidth outline fast colour fontname embed bates batespad fontsize font
2013-08-20 15:32:57 +01:00
underneath position hoffset voffset text pages orientation cropbox opacity
justification filename extract_text_font_size shift pdf
2013-08-20 15:32:57 +01:00
2021-06-10 16:09:59 +01:00
let time = Cpdfstrftime.current_time () in
2013-08-20 15:32:57 +01:00
let endpage = Pdfpage.endpage pdf in
2016-11-13 14:02:09 +00:00
(* Substitution table: each code maps to a thunk producing its replacement
   for page number [num]. *)
let replace_pairs pdf filename bates batespad num page =
2017-09-24 13:14:29 +01:00
"%PageDiv2", (fun () -> string_of_int ((num + 1) / 2));
"%Page", (fun () -> string_of_int num);
2016-11-13 14:02:09 +00:00
"%Roman", (fun () -> roman_upper num);
"%roman", (fun () -> roman_lower num);
"%filename", (fun () -> filename);
"%Label", (fun () -> pagelabel pdf num);
"%EndPage", (fun () -> string_of_int endpage);
"%EndLabel", (fun () -> pagelabel pdf endpage);
"%ExtractedText", (fun () -> extract_page_text extract_text_font_size pdf num page);
2015-07-17 16:34:47 +01:00
2016-11-13 14:02:09 +00:00
(fun () ->
(* Bates number for this page, left-padded with zeros to [batespad] width. *)
(let numstring = string_of_int (bates + num - 1) in
2015-07-17 16:34:47 +01:00
match batespad with
None -> numstring
| Some w ->
if String.length numstring >= w
then numstring
2016-11-13 14:02:09 +00:00
else implode (many '0' (w - String.length numstring)) ^ numstring))]
2015-07-17 16:34:47 +01:00
let shifts = Cpdfcoord.parse_coordinates pdf shift in
2013-08-20 15:32:57 +01:00
let addtext_page num page =
let shift_x, shift_y = List.nth shifts (num - 1) in
2013-08-20 15:32:57 +01:00
let resources', unique_extgstatename =
if opacity < 1.0 then
let dict =
match Pdf.lookup_direct pdf "/ExtGState" page.Pdfpage.resources with
| Some d -> d
| None -> Pdf.Dictionary []
let unique_extgstatename = Pdf.unique_key "gs" dict in
let dict' =
Pdf.add_dict_entry dict unique_extgstatename
(Pdf.Dictionary [("/ca", Pdf.Real opacity); ("/CA", Pdf.Real opacity)])
Pdf.add_dict_entry page.Pdfpage.resources "/ExtGState" dict', Some unique_extgstatename
page.Pdfpage.resources, None
let fontdict =
match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
| None -> Pdf.Dictionary []
| Some d -> d
let unique_fontname = Pdf.unique_key "F" fontdict in
let ops =
2021-06-10 16:09:59 +01:00
let text = process_text time text (replace_pairs pdf filename bates batespad num page) in
2013-08-20 15:32:57 +01:00
let calc_textwidth text =
match font with
| Some f ->
2016-11-04 15:57:28 +00:00
let rawwidth =
(if embed then Pdftext.WinAnsiEncoding else Pdftext.StandardEncoding)
2016-11-02 16:50:37 +00:00
2016-11-04 15:57:28 +00:00
(float rawwidth *. fontsize) /. 1000.
2013-08-20 15:32:57 +01:00
| None ->
let font =
match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
| Some fontdict ->
begin match Pdf.lookup_direct pdf fontname fontdict with
| Some font -> font
2021-11-03 13:40:54 +00:00
| None ->
(* For each item in the fontdict, follow its value and find the basename. If it matches, return that font *)
let font = ref None in
(fun (k, v) ->
match Pdf.lookup_direct pdf "/BaseFont" v with
| Some (Pdf.Name n) when n = fontname -> font := Some v
| _ -> ())
(match fontdict with Pdf.Dictionary d -> d | _ -> []);
match !font with Some f -> f | None -> failwith (Printf.sprintf "addtext: font %s not found" fontname)
2013-08-20 15:32:57 +01:00
2021-11-03 13:40:54 +00:00
| _ -> failwith "addtext: font not found for width"
2013-08-20 15:32:57 +01:00
let rawwidth = width_of_text (Pdftext.read_font pdf font) text in
(rawwidth *. fontsize) /. 1000.
let expanded_lines =
2016-11-13 14:02:09 +00:00
(function text ->
2021-06-10 16:09:59 +01:00
process_text time text (replace_pairs pdf filename bates batespad num page))
2016-11-13 14:02:09 +00:00
2013-08-20 15:32:57 +01:00
let textwidth = calc_textwidth text
and allwidths = map calc_textwidth expanded_lines in
(* The widest line drives the justification offset of this one. *)
let longest_w = last (sort compare allwidths) in
let joffset = find_justification_offsets longest_w textwidth position justification in
let mediabox =
if cropbox then
match Pdf.lookup_direct pdf "/CropBox" page.Pdfpage.rest with
| Some pdfobject -> Pdf.parse_rectangle (Pdf.direct pdf pdfobject)
| None -> Pdf.parse_rectangle page.Pdfpage.mediabox
Pdf.parse_rectangle page.Pdfpage.mediabox
2021-08-12 20:38:55 +01:00
let x, y, rotate = Cpdfposition.calculate_position false textwidth mediabox orientation position in
2013-08-20 15:32:57 +01:00
let hoffset, voffset =
if position = Diagonal || position = ReverseDiagonal
then -. (cos ((pi /. 2.) -. rotate) *. voffset), sin ((pi /. 2.) -. rotate) *. voffset
else hoffset, voffset
match font with
| Some f ->
ops longest_w metrics (x +. shift_x) (y +. shift_y) rotate (hoffset +. joffset) voffset outline linewidth
2013-08-20 15:32:57 +01:00
unique_fontname unique_extgstatename colour fontsize text
| None ->
ops longest_w metrics (x +. shift_x) (y +. shift_y) rotate (hoffset +. joffset) voffset outline linewidth
2013-08-20 15:32:57 +01:00
fontname None colour fontsize text
let newresources =
match font with
| Some _ ->
2014-09-08 17:55:14 +01:00
let newfontdict =
Pdf.add_dict_entry fontdict unique_fontname (make_font embed fontname)
2013-08-20 15:32:57 +01:00
2014-09-08 17:55:14 +01:00
Pdf.add_dict_entry resources' "/Font" newfontdict
2013-08-20 15:32:57 +01:00
| None -> page.Pdfpage.resources
let page = {page with Pdfpage.resources = newresources} in
if underneath
then Pdfpage.prepend_operators pdf ops ~fast:fast page
else Pdfpage.postpend_operators pdf ops ~fast:fast page
(* In metrics mode the pages are visited only for the side effect of
   recording metrics; the document itself is returned as given. *)
if metrics then
(ignore (iter_pages (fun a b -> ignore (addtext_page a b)) pdf pages); pdf)
process_pages (ppstub addtext_page) pdf pages
2013-08-20 15:32:57 +01:00
(* Prev is a list of lists of characters *)
(* Split [t] into lines at each two-character sequence backslash n. A
   backslash n preceded by another backslash is not a split point: the three
   characters are kept in the current line. [prev] holds the lines built so
   far, most recent first, each as a reversed character list.
   NOTE(review): the [in] after the inner function is not visible in this
   listing - confirm against upstream. *)
let split_at_newline t =
let rec split_at_newline_inner prev = function
| [] -> rev (map implode (map rev prev))
| '\\'::'\\'::'n'::t -> split_at_newline_inner (('n'::'\\'::'\\'::hd prev)::tl prev) t
2013-08-20 15:32:57 +01:00
| '\\'::'n'::t -> split_at_newline_inner ([]::prev) t
| h::t -> split_at_newline_inner ((h::hd prev)::tl prev) t
split_at_newline_inner [[]] (explode t)
(* Resolve backslash escapes in a character list, accumulating the output in
   reverse in [prev]:
   - backslash followed by three octal digits becomes the character with that
     code ([char_of_int] raises [Invalid_argument] for values above 255);
   - a doubled backslash becomes one backslash;
   - backslash followed by any character other than n becomes that character;
   - everything else, including backslash n, is copied unchanged. *)
let rec unescape_chars prev chars =
  match chars with
  | [] -> List.rev prev
  | '\\' :: ('0'..'7' as d1) :: ('0'..'7' as d2) :: ('0'..'7' as d3) :: rest ->
      let code = int_of_string (Printf.sprintf "0o%c%c%c" d1 d2 d3) in
      unescape_chars (char_of_int code :: prev) rest
  | '\\' :: '\\' :: rest -> unescape_chars ('\\' :: prev) rest
  | '\\' :: escaped :: rest when escaped <> 'n' ->
      unescape_chars (escaped :: prev) rest
  | plain :: rest -> unescape_chars (plain :: prev) rest
(* String-level wrapper around [unescape_chars]: explode [s], resolve the
   escapes, and implode the result. *)
let unescape_string s =
  let chars = explode s in
  let unescaped = unescape_chars [] chars in
  implode unescaped
2021-11-02 13:56:45 +00:00
(* Stamp possibly multi-line [text] on the pages in [pages]. The text is
   converted to font character codes unless [raw] is set, split into lines
   with [split_at_newline] and unescaped; each line is then stamped with
   [addtext], advancing the vertical offset by [linespacing *. fontsize] per
   line. The starting offset depends on [position], and [midline] / [topline]
   add a baseline adjustment for standard fonts. Calls [error] when [pages] is
   empty. Recorded metrics are reset at the start and reversed at the end.
   When [font] is [None] the font is looked up by name, or failing that by
   /BaseFont, in the resources of the first page of the range.
   NOTE(review): the [let] introducing this definition and many single-token
   lines ([in], [end], [else], the iteration over [lines]) are not visible in
   this listing - read this block against upstream. *)
addtexts metrics linewidth outline fast fontname (font : Pdftext.standard_font option) embed bates batespad colour position linespacing
2015-01-20 15:50:36 +00:00
fontsize underneath text pages orientation cropbox opacity justification
2021-11-03 12:15:15 +00:00
midline topline filename extract_text_font_size shift ?(raw=false) pdf
2013-08-20 15:32:57 +01:00
if pages = [] then error "addtexts: empty page range" else
2013-08-20 15:32:57 +01:00
(*flprint "addtexts:\n";
iter (Printf.printf "%C ") (explode text);
flprint "\n";
Printf.printf "\nCpdf.addtexts: metrics = %b" metrics;
flprint "\n";*)
(*Printf.printf "linewidth = %f\n" linewidth;
Printf.printf "outline = %b\n" outline;
Printf.printf "fast = %b\n" fast;
Printf.printf "fontname = %s\n" fontname;
Printf.printf "winansi text = %s\n" text;
Printf.printf "position = %s\n" (string_of_position position);
Printf.printf "bates = %i\n" bates;
Printf.printf "linespacing = %f\n" linespacing;
Printf.printf "fontsize = %f\n" fontsize;
Printf.printf "underneath = %b\n" underneath;
Printf.printf "font = %s\n" begin match font with None -> "None" | Some x -> Pdftext.string_of_standard_font x end;
Printf.printf "justification = %s\n"
begin match justification with LeftJustify -> "left" | RightJustify -> "right" | CentreJustify -> "centre" end;
Printf.printf "midline = %b\n" midline;
begin match colour with r, g, b -> Printf.printf "%f, %f, %f\n" r g b end;
Printf.printf "opacity = %f\n" opacity;
flprint "\n";
Printf.printf "relative-to-cropbox = %b" cropbox;
flprint "\n";*)
ops_metrics := [];
2021-11-03 13:40:54 +00:00
(* May be replaced below by the resource key of a font matched on /BaseFont. *)
let realfontname = ref fontname in
2021-11-02 13:56:45 +00:00
let fontpdfobj =
match font with
| Some f ->
2021-11-02 13:56:45 +00:00
make_font embed (Pdftext.string_of_standard_font f)
| None ->
2021-11-02 13:56:45 +00:00
let firstpage =
2021-11-02 16:10:30 +00:00
List.nth (Pdfpage.pages_of_pagetree pdf) (hd pages - 1)
2021-11-02 13:56:45 +00:00
match Pdf.lookup_direct pdf "/Font" firstpage.Pdfpage.resources with
| Some fontdict ->
begin match Pdf.lookup_direct pdf fontname fontdict with
| Some font -> font
2021-11-03 13:40:54 +00:00
| _ ->
(* For each item in the fontdict, follow its value and find the basename. If it matches, return that font *)
let font = ref None in
(fun (k, v) ->
match Pdf.lookup_direct pdf "/BaseFont" v with
| Some (Pdf.Name n) when n = fontname ->
font := Some v; realfontname := k
| _ -> ())
(match fontdict with Pdf.Dictionary d -> d | _ -> []);
match !font with Some f -> f | None -> failwith (Printf.sprintf "addtext: font %s not found" fontname)
2021-11-02 13:56:45 +00:00
2021-11-03 13:40:54 +00:00
| _ -> failwith "addtext: font dictionary not present"
2021-12-02 12:04:14 -08:00
let text = if raw then text else charcodes_of_utf8 (Pdftext.read_font pdf fontpdfobj) text in
let lines = map unescape_string (split_at_newline text) in
let pdf = ref pdf in
2013-08-20 15:32:57 +01:00
let voffset =
2021-08-12 20:38:55 +01:00
let open Cpdfposition in
2013-08-20 15:32:57 +01:00
match position with
| Bottom _ | BottomLeft _ | BottomRight _ ->
ref (0. -. (linespacing *. fontsize *. (float (length lines) -. 1.)))
| Left _ | Right _ ->
(* Vertically align *)
ref (0. -. (linespacing *. ((fontsize *. (float (length lines) -. 1.)) /. 2.)))
| Diagonal | ReverseDiagonal ->
(* Change so that the whole paragraph sits on the centre... *)
ref (0. -. ((linespacing *. fontsize *. (float (length lines) -. 1.)) /. 2.))
| _ -> ref 0.
if midline then
begin match font with
| Some font ->
let baseline_adjustment =
(fontsize *. float (Pdfstandard14.baseline_adjustment font)) /. 1000.
ops_baseline_adjustment := baseline_adjustment;
voffset := !voffset +. baseline_adjustment
| _ ->
ops_baseline_adjustment := 0.
2015-01-20 15:50:36 +00:00
if topline then
begin match font with
| Some font ->
let baseline_adjustment =
(fontsize *. float (Pdfstandard14.baseline_adjustment font) *. 2.0) /. 1000.
ops_baseline_adjustment := baseline_adjustment;
voffset := !voffset +. baseline_adjustment
| _ ->
ops_baseline_adjustment := 0.
2013-08-20 15:32:57 +01:00
ops_baseline_adjustment := 0.;
(fun line ->
(* For vertical orientation the running offset is applied horizontally,
   negated; otherwise it is applied vertically. *)
let voff, hoff =
2021-08-12 20:38:55 +01:00
if orientation = Cpdfposition.Vertical then 0., -.(!voffset) else !voffset, 0.
2013-08-20 15:32:57 +01:00
pdf :=
2021-11-03 13:40:54 +00:00
addtext metrics lines linewidth outline fast colour !realfontname
2015-07-17 16:34:47 +01:00
embed bates batespad fontsize font underneath position hoff voff line
pages orientation cropbox opacity justification filename
extract_text_font_size shift
2013-08-20 15:32:57 +01:00
voffset := !voffset +. (linespacing *. fontsize))
ops_metrics := rev !ops_metrics;
(* Remove stamped text from the pages in [range]: every run of operators from
   a begin-marked-content operator tagged /CPDFSTAMP up to its matching
   end-marked-content operator is deleted, and the page content is replaced by
   a single stream of the remaining operators.
   NOTE(review): the [in] keywords after the local functions are not visible
   in this listing - confirm against upstream. *)
let removetext range pdf =
(* Could fail on nesting, or other marked content inside our marked content.*)
(* Skip operators until the end marker at nesting level 1; returns what
   follows it, or the empty list if the marker is never closed. *)
let rec remove_until_last_EMC level = function
| [] -> []
| Pdfops.Op_BMC "/CPDFSTAMP"::more ->
remove_until_last_EMC (level + 1) more
| Pdfops.Op_EMC::more ->
if level = 1
then more
else remove_until_last_EMC (level - 1) more
| _::more ->
remove_until_last_EMC level more
(* Copy operators to [prev] (reversed), dropping each stamp section. *)
let rec remove_stamps prev = function
| [] -> rev prev
| Pdfops.Op_BMC "/CPDFSTAMP"::more ->
let rest = remove_until_last_EMC 1 more in
remove_stamps prev rest
| h::t -> remove_stamps (h::prev) t
let removetext_page _ page =
{page with
Pdfpage.content =
let ops = Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content in
[Pdfops.stream_of_ops (remove_stamps [] ops)]}
process_pages (ppstub removetext_page) pdf range
2013-08-20 15:32:57 +01:00
(* \section{Padding with blank pages.} *)
(* Insert [page] into [pages] at the point where [cleave pages pos] divides
   the list. *)
let insert_after pos page pages =
  match cleave pages pos with
  | (front, back) -> front @ (page :: back)
(* Insert many. *)
(* Apply a list of (position, page) insertions to [pages] in order. After each
   insertion every remaining position is incremented by one, to account for
   the page just added. *)
let rec insert_after_many pages insertions =
  match insertions with
  | [] -> pages
  | (pos, page) :: remaining ->
      let grown = insert_after pos page pages in
      let shifted = map (fun (p, pg) -> (p + 1, pg)) remaining in
      insert_after_many grown shifted
(* For each pagenum in the range, increment the count by padsize, and carry on. e.g
insert_after_many_changes false 1 0 [2] [1; 2; 3] --> [(1, 1); (2, 2); (3, 4)] *)
(* Compute the (old page number, new page number) pairs that result from
   padding. Each number [h] maps to [h + offset], plus one more when [isbefore]
   is set and [h] is in [range]; after a page that is in [range] the running
   offset grows by [padsize].
   NOTE(review): the [else] between the two recursive calls is not visible in
   this listing - confirm against upstream. *)
let rec insert_after_many_changes isbefore padsize offset range = function
[] -> []
| h::t ->
let item = (h, h + offset + if isbefore && mem h range then 1 else 0) in
if mem h range then
item::insert_after_many_changes isbefore padsize (offset + padsize) range t
item::insert_after_many_changes isbefore padsize offset range t
(* Debug helper: print each (from, to) page-number pair on its own line. *)
let print_changes changes =
  iter (fun (src, dst) -> Printf.printf "%i --> %i\n" src dst) changes
2017-12-21 14:54:01 +00:00
(* Pad [pdf] with the pages of [padfile]: the two documents are merged, the
   merged page list is cleaved back into the original pages and the padding
   pages, and for every original page whose number is in [range] the padding
   pages are placed before it (when [isbefore]) or after it.
   NOTE(review): the merge function name, several [in] keywords, the mapping
   function and the [else] branch are not visible in this listing - confirm
   against upstream. *)
let pad_with_pdf (range : int list) (pdf : Pdf.t) (isbefore : bool) (padfile : Pdf.t) =
2020-03-04 10:50:32 -08:00
(* Deduplicate and sort the requested page numbers. *)
let range = sort compare (setify range) in
2017-12-21 14:54:01 +00:00
let merged =
false false ["a"; "b"] [pdf; padfile] [ilist 1 (Pdfpage.endpage pdf); ilist 1 (Pdfpage.endpage padfile)]
let original_pages, padpages =
cleave (Pdfpage.pages_of_pagetree merged) (Pdfpage.endpage pdf)
let newpages =
2020-03-04 10:50:32 -08:00
2017-12-21 14:54:01 +00:00
(fun (pagenum, page) ->
if mem pagenum range then
(if isbefore then padpages @ [page] else [page] @ padpages)
(combine (indx original_pages) original_pages)
(* FIXME Provide ~changes here? *)
Pdfpage.change_pages false merged (flatten newpages)
2017-12-18 19:44:02 +00:00
(* Pad [pdf] at the positions in [range]. With [Some padfile] the work is
   delegated to [pad_with_pdf]. Otherwise a blank page is built per position,
   copying the media box and rotation of the neighbouring page (page [n], or
   [n + 1] when [isbefore]) and its remaining dictionary without /Annots; the
   blanks are inserted with [insert_after_many] and the page-number changes
   are passed to [Pdfpage.change_pages].
   NOTE(review): the mapping function and its [range] argument, several [in]
   keywords and the name of the function computing [changes] are not visible
   in this listing - confirm against upstream. *)
let pad padwith range pdf isbefore =
match padwith with
Some padfile -> pad_with_pdf range pdf isbefore padfile
| None ->
let i = if isbefore then 1 else 0 in
let pages = Pdfpage.pages_of_pagetree pdf in
let blankpages =
(fun n ->
{Pdfpage.content = [];
Pdfpage.mediabox = (select (n + i) pages).Pdfpage.mediabox;
Pdfpage.resources = Pdf.Dictionary [];
Pdfpage.rotate = (select (n + i) pages).Pdfpage.rotate;
Pdfpage.rest = Pdf.remove_dict_entry ((select (n + i) pages).Pdfpage.rest) "/Annots"})
2017-12-18 19:44:02 +00:00
let pages' = insert_after_many pages (combine range blankpages) in
let changes =
2020-03-04 10:50:32 -08:00
isbefore 1 0 (map (fun x -> x + i) range) (ilist 1 (length pages))
2017-12-18 19:44:02 +00:00
Pdfpage.change_pages ~changes true pdf pages'
2013-08-20 15:32:57 +01:00
2017-12-18 19:44:02 +00:00
(* Insert padding after each page in [range], using blank pages or, when
   [padwith] is given, the pages of that document.
   Raises [Failure] when [range] names a page that is not in [pdf]. *)
let padafter ?padwith range pdf =
  let page_exists n = mem n (ilist 1 (Pdfpage.endpage pdf)) in
  if List.for_all page_exists range
  then pad padwith range pdf false
  else raise (Failure "padafter: range contains pages not present in pdf")
2013-08-20 15:32:57 +01:00
2017-12-18 19:44:02 +00:00
(* Insert padding before each page in [range], using blank pages or, when
   [padwith] is given, the pages of that document. The work is done by [pad]
   on the decremented page numbers with its before-flag set.
   Raises [Failure] when [range] names a page that is not in [pdf]. *)
let padbefore ?padwith range pdf =
  let page_exists n = mem n (ilist 1 (Pdfpage.endpage pdf)) in
  if List.for_all page_exists range
  then pad padwith (map pred range) pdf true
  else raise (Failure "padbefore: range contains pages not present in pdf")
2013-08-20 15:32:57 +01:00
(* Pad [pdf] with blank pages so that its page count becomes a multiple of the
   absolute value of [n]. The blank pages copy the media box and rotation of
   the last page; they are placed at the front when [n] is negative and at the
   end otherwise.
   NOTE(review): the function replicating the blank page and its count, the
   [in] after it, and the [else] branch for the no-padding case are not
   visible in this listing - confirm against upstream. *)
let padmultiple n pdf =
2019-07-01 15:35:17 +01:00
(* [neg] remembers the sign; [n] is rebound to its absolute value. *)
let neg, n = n < 0, if n < 0 then -n else n in
2013-08-20 15:32:57 +01:00
let pages = Pdfpage.pages_of_pagetree pdf in
let len = length pages in
let pages_to_add = if len / n * n = len then 0 else n - (len mod n) in
if pages_to_add > 0 then
let blankpages =
{Pdfpage.content = [];
Pdfpage.mediabox = (select len pages).Pdfpage.mediabox;
Pdfpage.resources = Pdf.Dictionary [];
Pdfpage.rotate = (select len pages).Pdfpage.rotate;
2019-07-01 15:35:17 +01:00
Pdfpage.rest = Pdf.Dictionary []}
2013-08-20 15:32:57 +01:00
let changes = map (fun x -> (x, x)) (ilist 1 (length pages)) in
2019-07-01 15:35:17 +01:00
Pdfpage.change_pages ~changes true pdf (if neg then blankpages @ pages else pages @ blankpages)
2013-08-20 15:32:57 +01:00
(* \section{Shift page data} *)
(* Build a PDF rectangle array from the coordinates (xmin, ymin, xmax, ymax).
   Callers store the result directly as a page media box and as a box entry in
   the page dictionary, so it must be a PDF object and not a bare OCaml list:
   the four reals are wrapped in [Pdf.Array]. *)
let make_mediabox (xmin, ymin, xmax, ymax) =
  Pdf.Array [Pdf.Real xmin; Pdf.Real ymin; Pdf.Real xmax; Pdf.Real ymax]
(* Change the media box and other known boxes by the function [f] which takes
xmin, ymin, xmax, ymax as input. *)
(* Apply [f] to the media box of [page] and to each of /TrimBox, /ArtBox,
   /CropBox and /BleedBox that is present in the page's remaining dictionary.
   Each box is parsed to a rectangle, transformed by [f] and written back via
   [make_mediabox]. Absent boxes are left absent.
   NOTE(review): several single-token lines (the [in] keywords, the mapping
   and folding function names and their arguments) are not visible in this
   listing - confirm against upstream. *)
let change_boxes f pdf page =
let names = ["/TrimBox"; "/ArtBox"; "/CropBox"; "/BleedBox"]
in let getbox n =
Pdf.lookup_direct pdf n page.Pdfpage.rest
let boxes = combine names (map getbox names) in
(* Keep only the boxes that exist on this page. *)
let toreplace = lose (function (_, None) -> true | _ -> false) boxes in
let toreplace =
(function (name, Some value) -> (name, value) | _ -> assert false)
let rest' =
(fun e (k, v) ->
let v =
make_mediabox (f (Pdf.parse_rectangle v))
Pdf.replace_dict_entry e k v)
{page with
Pdfpage.mediabox =
make_mediabox (f (Pdf.parse_rectangle page.Pdfpage.mediabox));
Pdfpage.rest = rest'}
2019-07-16 13:05:06 +01:00
(* Apply the content transformer [f] to object [i] when it is a form XObject.
   The stream is loaded, passed to [f] together with [resources], and the
   stream reference is updated in place with the new data and a dictionary
   merged from the old and new entries. XObjects with any other /Subtype are
   left alone. Raises [Pdf.PDFError] when /Subtype is missing.
   NOTE(review): the [in] and [end] keywords and part of the expression
   building [dict'] are not visible in this listing - confirm against
   upstream. *)
let process_xobject f pdf resources i =
let xobj = Pdf.lookup_obj pdf i in
match Pdf.lookup_direct pdf "/Subtype" xobj with
| None -> raise (Pdf.PDFError "No /Subtype in Xobject")
| Some (Pdf.Name "/Form") ->
Pdf.getstream xobj;
begin match xobj with
| Pdf.Stream ({contents = Pdf.Dictionary dict, Pdf.Got bytes} as rf) ->
begin match f pdf resources [Pdf.Stream rf] with
| [Pdf.Stream {contents = (Pdf.Dictionary dict', data)}] ->
let dict' =
(Pdf.Dictionary (mergedict dict dict'))
rf := (dict', data)
| _ -> assert false
| _ -> assert false (* getstream would have complained already *)
| Some _ -> ()
2019-07-15 18:46:17 +01:00
2019-07-16 13:05:06 +01:00
(* Run [process_xobject f] on every entry of the /XObject resource dictionary
   of [page]. Entries must be indirect references; a direct entry raises
   [Pdf.PDFError]. Does nothing when the page has no /XObject dictionary.
   NOTE(review): the iterating function and its [elts] argument are not
   visible in this listing - confirm against upstream. *)
let process_xobjects pdf page f =
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
| Some (Pdf.Dictionary elts) ->
(fun (k, v) ->
match v with
| Pdf.Indirect i -> process_xobject f pdf page.Pdfpage.resources i
| _ -> raise (Pdf.PDFError "process_xobject"))
| _ -> ()
2013-08-20 15:32:57 +01:00
2019-07-16 13:05:06 +01:00
(* The content transformed by altering any use of [Op_cm]. But we must also
alter any /Matrix entries in pattern dictionaries *)
(* Return [resources] with every pattern in its /Pattern dictionary replaced
   by a new object whose /Matrix is the inverse of [tr] composed with the
   existing matrix. Resources without a /Pattern dictionary are returned
   unchanged. A non-invertible [tr] produces a warning on standard error.
   NOTE(review): the [try]/[with], the mapping function, several [in]/[end]
   keywords and the value returned after the warning are not visible in this
   listing - confirm against upstream. *)
let change_pattern_matrices_resources pdf tr resources =
2013-08-20 15:32:57 +01:00
begin match Pdf.lookup_direct pdf "/Pattern" resources with
2019-07-16 13:05:06 +01:00
| Some (Pdf.Dictionary patterns) ->
let entries =
(fun (name, p) ->
2019-07-18 17:26:18 +01:00
(*Printf.printf "Changing matrices of pattern %s\n" name;*)
2019-07-16 13:05:06 +01:00
let old_pattern = Pdf.direct pdf p in
let new_pattern =
let existing_tr = Pdf.parse_matrix pdf "/Matrix" old_pattern in
let new_tr = Pdftransform.matrix_compose (Pdftransform.matrix_invert tr) existing_tr in
Pdf.add_dict_entry old_pattern "/Matrix" (Pdf.make_matrix new_tr)
(* The modified pattern is added as a fresh object and referenced
   indirectly. *)
name, Pdf.Indirect (Pdf.addobj pdf new_pattern))
Pdf.add_dict_entry resources "/Pattern" (Pdf.Dictionary entries)
| _ -> resources
2019-07-16 13:05:06 +01:00
Pdftransform.NonInvertable ->
2021-07-23 16:26:07 +01:00
Printf.eprintf "Warning: noninvertible matrix\n%!";
(* Adjust pattern matrices for a whole page: first in the page resources via
   [change_pattern_matrices_resources], then in the /Resources of every form
   XObject referenced from the page /XObject dictionary, rewriting each such
   object under its existing object number. A direct (non-reference) XObject
   entry raises [Pdf.PDFError].
   NOTE(review): the iterating function, several [in]/[end] keywords and the
   value returned from the first branch are not visible in this listing -
   confirm against upstream. *)
let change_pattern_matrices_page pdf tr page =
let page =
{page with Pdfpage.resources = change_pattern_matrices_resources pdf tr page.Pdfpage.resources}
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
| Some (Pdf.Dictionary elts) ->
(fun (k, v) ->
match v with
| Pdf.Indirect i ->
(* Check if it's a form XObject. If so, rewrite its resources and add back as same number. *)
begin match Pdf.lookup_direct pdf "/Subtype" v with
| Some (Pdf.Name "/Form") ->
2019-07-18 17:26:18 +01:00
(*Printf.printf "Processing form xobject %s for patterns\n" k; *)
let form_xobject = Pdf.lookup_obj pdf i in
begin match Pdf.lookup_direct pdf "/Resources" form_xobject with
| Some resources ->
let form_xobject' =
Pdf.add_dict_entry form_xobject "/Resources" (change_pattern_matrices_resources pdf tr resources)
Pdf.addobj_given_num pdf (i, form_xobject')
| _ -> ()
| _ -> ()
| _ -> raise (Pdf.PDFError "change_pattern_matrices_page"))
2019-07-16 13:05:06 +01:00
| _ -> page
2013-08-20 15:32:57 +01:00
(* Transform the rectangle [rect] by the matrix [transform] and return the
   axis-aligned bounding box of the result as a PDF array of four reals
   (min x, min y, max x, max y). All four corners are transformed, so the box
   stays correct under rotation. *)
let transform_rect transform rect =
  let minx, miny, maxx, maxy = Pdf.parse_rectangle rect in
  let apply corner = Pdftransform.transform_matrix transform corner in
  let (ax, ay) = apply (minx, miny) in
  let (bx, by) = apply (maxx, maxy) in
  let (cx, cy) = apply (minx, maxy) in
  let (dx, dy) = apply (maxx, miny) in
  let lowest a b c d = fmin (fmin a b) (fmin c d) in
  let highest a b c d = fmax (fmax a b) (fmax c d) in
  Pdf.Array
    [Pdf.Real (lowest ax bx cx dx);
     Pdf.Real (lowest ay by cy dy);
     Pdf.Real (highest ax bx cx dx);
     Pdf.Real (highest ay by cy dy)]
2021-12-15 08:53:34 +00:00
(* Transform a single quadrilateral. The argument is a list of exactly eight
   PDF numbers x1 y1 x2 y2 x3 y3 x4 y4; each of the four points is mapped
   through [transform] and the result is the eight coordinates as Pdf.Real
   values, in the same order. A list of any other length prints a diagnostic
   on stderr. *)
let transform_quadpoint_single transform = function
| [x1; y1; x2; y2; x3; y3; x4; y4] ->
let x1, y1, x2, y2, x3, y3, x4, y4 =
Pdf.getnum x1, Pdf.getnum y1,
Pdf.getnum x2, Pdf.getnum y2,
Pdf.getnum x3, Pdf.getnum y3,
Pdf.getnum x4, Pdf.getnum y4
let (x1, y1) = Pdftransform.transform_matrix transform (x1, y1) in
let (x2, y2) = Pdftransform.transform_matrix transform (x2, y2) in
let (x3, y3) = Pdftransform.transform_matrix transform (x3, y3) in
let (x4, y4) = Pdftransform.transform_matrix transform (x4, y4) in
map (fun x -> Pdf.Real x) [x1; y1; x2; y2; x3; y3; x4; y4]
| qp ->
Printf.eprintf "Malformed /QuadPoints format: must be a multiple of 8 entries\n";
(* Transform a whole /QuadPoints value. An array is cut into groups of eight
   entries with splitinto, each group goes through transform_quadpoint_single,
   and the transformed groups are flattened back into a single array. Anything
   which is not an array prints a diagnostic, including the offending object,
   on stderr. *)
let transform_quadpoints transform = function
| Pdf.Array qps ->
Pdf.Array (flatten (map (transform_quadpoint_single transform) (splitinto 8 qps)))
| qp ->
Printf.eprintf "Unknown or malformed /QuadPoints format %s\n" (Pdfwrite.string_of_pdf qp);
(* Apply transformations to any annotations in /Annots (i.e. their /Rect and
   /QuadPoints entries). [rest] is the dictionary in which /Annots is looked
   up. For each annotation held as an indirect reference, /Rect is replaced by
   its transform_rect image (a missing /Rect raises Pdf.PDFError), /QuadPoints,
   when present, is replaced by its transform_quadpoints image, and the
   annotation is written back to [pdf] under its own object number. An entry
   which is not an indirect reference only produces a message on stderr. *)
2019-07-10 13:16:32 +01:00
let transform_annotations pdf transform rest =
match Pdf.lookup_direct pdf "/Annots" rest with
| Some (Pdf.Array annots) ->
(* Always indirect references, so alter in place *)
2020-03-04 10:50:32 -08:00
2019-07-10 13:16:32 +01:00
| Pdf.Indirect i ->
let annot = Pdf.lookup_obj pdf i in
2021-12-15 08:53:34 +00:00
let rect' =
match Pdf.lookup_direct pdf "/Rect" annot with
| Some rect -> transform_rect transform rect
| None -> raise (Pdf.PDFError "transform_annotations: no rect")
let quadpoints' =
match Pdf.lookup_direct pdf "/QuadPoints" annot with
| Some qp -> Some (transform_quadpoints transform qp)
| None -> None
2019-07-10 13:16:32 +01:00
2021-12-15 08:53:34 +00:00
let annot = Pdf.add_dict_entry annot "/Rect" rect' in
let annot =
match quadpoints' with
| Some qp -> Pdf.add_dict_entry annot "/QuadPoints" qp
| None -> annot
Pdf.addobj_given_num pdf (i, annot)
2021-07-23 16:26:07 +01:00
| _ -> Printf.eprintf "transform_annotations: not indirect\n%!")
2019-07-10 13:16:32 +01:00
| _ -> ()
2016-07-21 17:02:11 +01:00
(* Shift the content of one page. The (dx, dy) pair is the entry at index
   pnum - 1 of [dxdylist], so the list must have at least [pnum] entries or
   List.nth raises. change_pattern_matrices_page is given the translation by
   (-dx, -dy), annotations are moved by (dx, dy), and a cm operator for the
   translation is prepended to the page content. The result is the triple
   (new page, pnum, translation matrix). *)
let shift_page ?(fast=false) dxdylist pdf pnum page =
let dx, dy = List.nth dxdylist (pnum - 1) in
let transform_op =
Pdfops.Op_cm (Pdftransform.matrix_of_op (Pdftransform.Translate (dx, dy)))
2013-08-20 15:32:57 +01:00
2019-07-16 13:05:06 +01:00
let page =
change_pattern_matrices_page pdf (Pdftransform.mktranslate ~-.dx ~-.dy) page
2016-07-21 17:02:11 +01:00
transform_annotations pdf (Pdftransform.mktranslate dx dy) page.Pdfpage.rest;
2021-05-31 18:10:39 +01:00
(Pdfpage.prepend_operators pdf [transform_op] ~fast page, pnum, Pdftransform.mktranslate dx dy)
2013-08-20 15:32:57 +01:00
2016-07-21 17:02:11 +01:00
(* Shift the pages of [pdf] listed in [range], applying shift_page to each
   through process_pages. [dxdylist] is indexed by page number (see
   shift_page). *)
let shift_pdf ?(fast=false) dxdylist pdf range =
2021-05-31 18:10:39 +01:00
process_pages (shift_page ~fast dxdylist pdf) pdf range
2013-08-20 15:32:57 +01:00
(* Change a page's media box so its minimum x and y are 0, making other
operations simpler to think about. Any shift that is done is reflected in
other boxes (clip etc.): every box is moved by (-minx, -miny) through
change_boxes, where minx and miny come from the media box. When either offset
is non-zero the page content is moved by the same amount using shift_page,
called with a one-element list and page number 1; otherwise the page with
its adjusted boxes is returned as is. *)
let rectify_boxes ?(fast=false) pdf page =
let minx, miny, _, _ =
Pdf.parse_rectangle page.Pdfpage.mediabox
let f (iminx, iminy, imaxx, imaxy) =
iminx -. minx, iminy -. miny, imaxx -. minx, imaxy -. miny
let page = change_boxes f pdf page in
if minx <> 0. || miny <> 0.
2021-05-31 18:10:39 +01:00
begin let p, _, _ = shift_page ~fast [(-.minx),(-.miny)] pdf 1 page in p end
2013-08-20 15:32:57 +01:00
else page
(* \section{Flip pages} *)
2021-05-31 18:10:39 +01:00
(* Flip one page. [transform_op] is called with the media box coordinates
   minx miny maxx maxy and yields the matrix [tr]. That matrix is passed to
   change_pattern_matrices_page and transform_annotations, and prepended to
   the page content as a cm operator. The result is (new page, pnum, tr). *)
let flip_page ?(fast=false) transform_op pdf pnum page =
2013-08-20 15:32:57 +01:00
let minx, miny, maxx, maxy =
Pdf.parse_rectangle page.Pdfpage.mediabox
let tr = transform_op minx miny maxx maxy in
2019-07-16 13:05:06 +01:00
let page = change_pattern_matrices_page pdf tr page in
2019-07-10 13:16:32 +01:00
transform_annotations pdf tr page.Pdfpage.rest;
2021-05-31 18:10:39 +01:00
(Pdfpage.prepend_operators pdf [Pdfops.Op_cm tr] ~fast page, pnum, tr)
2013-08-20 15:32:57 +01:00
(* Flip the pages in [range] vertically: y is scaled by -1 about the point
   (0, (miny + maxy) / 2), i.e. the vertical midpoint of the media box, and x
   is left unscaled. *)
let vflip_pdf ?(fast=false) pdf range =
let transform_op _ miny _ maxy =
(Pdftransform.Scale ((0., ((miny +. maxy) /. 2.)), 1., -.1.))
2021-05-31 18:10:39 +01:00
process_pages (flip_page ~fast transform_op pdf) pdf range
2013-08-20 15:32:57 +01:00
(* Flip the pages in [range] horizontally: x is scaled by -1 about the point
   ((minx + maxx) / 2, 0), i.e. the horizontal midpoint of the media box, and
   y is left unscaled. *)
let hflip_pdf ?(fast=false) pdf range =
let transform_op minx _ maxx _ =
(Pdftransform.Scale (((minx +. maxx) /. 2., 0.), -.1., 1.))
2021-05-31 18:10:39 +01:00
process_pages (flip_page ~fast transform_op pdf) pdf range
2013-08-20 15:32:57 +01:00
2015-02-17 15:48:00 +00:00
(* Compute the (x, y) offset at which a stamp of size [sw] x [sh] is placed
   on a page of size [w] x [h] for the position [p]. Every y result is reduced
   by [dy], which is sh / 2 when [midline] is set, sh when only [topline] is
   set, and 0 otherwise (midline is tested first). Diagonal, ReverseDiagonal
   and Centre all centre the stamp on the page. *)
let stamp_shift_of_position topline midline sw sh w h p =
let half x = x /. 2.
and dy =
if midline then sh /. 2.
else if topline then sh
else 0.
2021-08-12 20:38:55 +01:00
let open Cpdfposition in
match p with
| PosCentre (ox, oy) -> ox -. half sw, oy -. dy
| PosLeft (ox, oy) -> ox, oy -. dy
| PosRight (ox, oy) -> ox -. sw, oy -. dy
| Top o -> half w -. half sw, h -. o -. sh -. dy
| TopLeft o -> o, h -. sh -. o -. dy
| TopRight o -> w -. sw -. o, h -. sh -. o -. dy
| Left o -> o, half h -. half sh -. dy
| BottomLeft o -> o, o -. dy
| Bottom o -> half w -. half sw, o -. dy
| BottomRight o -> w -. sw -. o, o -. dy
| Right o -> w -. sw -. o, half h -. half sh -. dy
| Diagonal | ReverseDiagonal | Centre ->
half w -. half sw, half h -. half sh -. dy
2015-02-17 15:48:00 +00:00
(* Combine Pdfpage.rest items for two PDFs. For now, we combine /Annots, and
 * copy everything else from adict. What else should we combine?
 * The annotations of adict come first, followed by those of bdict. When
 * neither dictionary yields any annotations, adict is returned unchanged.
 * NOTE(review): getannots only recognises /Annots when it is stored as a
 * direct array in the dictionary; an indirect /Annots reference falls into
 * the catch-all case and is treated as empty -- confirm this is intended. *)
let combine_page_items pdf adict bdict =
let getannots dict =
begin match dict with
Pdf.Dictionary d ->
begin match lookup "/Annots" d with
Some (Pdf.Array items) -> items
| _ -> []
| _ -> []
let a_annots = getannots adict in
let b_annots = getannots bdict in
match a_annots @ b_annots with
[] -> adict
| annots -> Pdf.add_dict_entry adict "/Annots" (Pdf.Array annots)
(* Stamp page [o] onto page [u], both of which live in [pdf]. The size of
   each page is taken from its /CropBox when present and from its media box
   otherwise. With [scale_to_fit], the stamp is scaled by a single factor
   derived from the width and height ratios and centred on the target box;
   otherwise it is translated by the offset from stamp_shift_of_position. In
   both cases [relative_to_cropbox] adds a translation by the target box
   origin, and the resulting matrix is applied to the stamp's annotations,
   content (as a prepended cm operator) and pattern matrices. The result is
   [u] with the two protected content lists joined by ( @ ) when [isover] and
   by ( @@ ) otherwise, with rest merged by combine_page_items and resources
   merged by combine_pdf_resources. *)
let do_stamp relative_to_cropbox fast position topline midline scale_to_fit isover pdf o u opdf =
(* Scale page stamp o to fit page u *)
let sxmin, symin, sxmax, symax =
(match Pdf.lookup_direct pdf "/CropBox" o.Pdfpage.rest with | Some r -> r | None -> o.Pdfpage.mediabox)
in let txmin, tymin, txmax, tymax =
(match Pdf.lookup_direct pdf "/CropBox" u.Pdfpage.rest with | Some r -> r | None -> u.Pdfpage.mediabox)
let o =
if scale_to_fit then
let xmag = (txmax -. txmin) /. (sxmax -. sxmin) in
let ymag = (tymax -. tymin) /. (symax -. symin) in
let scale =
if xmag < 0.999 && ymag < 0.999 then
if xmag > ymag then xmag else ymag
else if xmag >= 1.001 && ymag >= 1.001 then
if xmag > ymag then ymag else xmag
else if xmag >= 1.001 then ymag
else xmag
let dx = txmin +. ((txmax -. txmin) -. (sxmax -. sxmin) *. scale) /. 2. in
let dy = tymin +. ((tymax -. tymin) -. (symax -. symin) *. scale) /. 2. in
let matrix =
([Pdftransform.Translate (dx, dy)] @
(if relative_to_cropbox then [Pdftransform.Translate (txmin, tymin)] else []) @
[Pdftransform.Scale ((sxmin, symin), scale, scale)]))
transform_annotations pdf matrix o.Pdfpage.rest;
let r = Pdfpage.prepend_operators pdf [Pdfops.Op_cm matrix] ~fast o in
2019-07-16 13:05:06 +01:00
change_pattern_matrices_page pdf matrix r
2016-04-03 19:46:54 +01:00
let sw = sxmax -. sxmin and sh = symax -. symin
and w = txmax -. txmin and h = tymax -. tymin in
2015-02-17 15:48:00 +00:00
let dx, dy = stamp_shift_of_position topline midline sw sh w h position in
let matrix =
((if relative_to_cropbox then [Pdftransform.Translate (txmin, tymin)] else []) @
[Pdftransform.Translate (dx, dy)]))
transform_annotations pdf matrix o.Pdfpage.rest;
let r = Pdfpage.prepend_operators pdf [Pdfops.Op_cm matrix] ~fast o in
2019-07-16 13:05:06 +01:00
change_pattern_matrices_page pdf matrix r
{u with
Pdfpage.content =
(if isover then ( @ ) else ( @@ ))
2016-04-03 19:46:54 +01:00
(protect fast pdf u.Pdfpage.resources u.Pdfpage.content)
(protect fast pdf o.Pdfpage.resources o.Pdfpage.content);
Pdfpage.rest =
combine_page_items pdf u.Pdfpage.rest o.Pdfpage.rest;
Pdfpage.resources =
combine_pdf_resources pdf u.Pdfpage.resources o.Pdfpage.resources}
(* Alter bookmark destinations given a hash table of (old page reference
 * number, new page reference number) pairings. Only the page object number
 * is replaced: the kind of destination and its other parameters are kept.
 * Destinations which do not target a page object are returned unchanged.
 * Raises Not_found when the old page object number is not in the table. *)
let change_destination t =
  let renumber p = Pdfdest.PageObject (Hashtbl.find t p) in
  function
  | Pdfdest.XYZ (Pdfdest.PageObject p, a, b, c) ->
      Pdfdest.XYZ (renumber p, a, b, c)
  | Pdfdest.Fit (Pdfdest.PageObject p) ->
      Pdfdest.Fit (renumber p)
  | Pdfdest.FitH (Pdfdest.PageObject p, x) ->
      Pdfdest.FitH (renumber p, x)
  | Pdfdest.FitV (Pdfdest.PageObject p, x) ->
      Pdfdest.FitV (renumber p, x)
  | Pdfdest.FitR (Pdfdest.PageObject p, a, b, c, d) ->
      Pdfdest.FitR (renumber p, a, b, c, d)
  | Pdfdest.FitB (Pdfdest.PageObject p) ->
      (* Previously rebuilt as Pdfdest.Fit, which silently turned a
         fit-to-bounding-box destination into a fit-to-page one. *)
      Pdfdest.FitB (renumber p)
  | Pdfdest.FitBH (Pdfdest.PageObject p, x) ->
      Pdfdest.FitBH (renumber p, x)
  | Pdfdest.FitBV (Pdfdest.PageObject p, x) ->
      Pdfdest.FitBV (renumber p, x)
  | x -> x
(* Re-target the bookmark [m] using the page renumbering table [t]. When the
   target's page is not in the table (change_destination raises Not_found)
   the existing target is kept. *)
let change_bookmark t m =
  let target =
    match change_destination t m.Pdfmarks.target with
    | renumbered -> renumbered
    | exception Not_found -> m.Pdfmarks.target
  in
  {m with Pdfmarks.target = target}
(* Stamp the first page of [over] onto pages of [pdf]. The resource names of
   [over] are prefixed with a prefix unused in [pdf]. Bookmarks are read from
   [pdf] together with its page reference numbers before the two files are
   merged, then re-targeted with change_bookmark against the page reference
   numbers of the result and added back. Each page whose number is in [range]
   is stamped with the over page via do_stamp; every other page is passed
   through do_stamp with a blank A4 page instead. Fails with the error
   message of the empty-list case below when [over] has no pages. *)
let stamp relative_to_cropbox position topline midline fast scale_to_fit isover range over pdf =
2016-03-30 17:04:28 +01:00
let prefix = Pdfpage.shortest_unused_prefix pdf in
Pdfpage.add_prefix over prefix;
let marks = Pdfmarks.read_bookmarks pdf in
let marks_refnumbers = Pdf.page_reference_numbers pdf in
let pdf = Pdfmarks.remove_bookmarks pdf in
let over = Pdfmarks.remove_bookmarks over in
let pageseqs = ilist 1 (Pdfpage.endpage pdf) in
let over_firstpage_pdf =
match Pdfpage.pages_of_pagetree over with
| [] -> error "empty PDF"
| h::_ -> Pdfpage.change_pages ~changes:[(1, 1)] true over [h]
let merged =
false false ["a"; "b"] [pdf; over_firstpage_pdf] [pageseqs; [1]]
let merged =
{merged with Pdf.saved_encryption = pdf.Pdf.saved_encryption}
let merged = copy_id true pdf merged in
2016-03-30 17:04:28 +01:00
let merged_pages = Pdfpage.pages_of_pagetree merged in
let under_pages, over_page =
2016-03-30 17:04:28 +01:00
all_but_last merged_pages, last merged_pages
let new_pages =
(fun pageseq under_page ->
2016-03-30 17:04:28 +01:00
do_stamp relative_to_cropbox fast position topline midline scale_to_fit isover merged
(if mem pageseq range then over_page else
Pdfpage.blankpage Pdfpaper.a4)
under_page over)
let changed =
let changes =
2020-03-04 10:50:32 -08:00
map (fun x -> (x, x)) (ilist 1 (length new_pages))
Pdfpage.change_pages ~changes true merged new_pages
let new_refnumbers = Pdf.page_reference_numbers changed in
2020-03-04 10:50:32 -08:00
let changetable = hashtable_of_dictionary (combine marks_refnumbers new_refnumbers) in
let new_marks = map (change_bookmark changetable) marks in
Pdfmarks.add_bookmarks new_marks changed
(* Return [page] with its resources extended so that the /XObject dictionary
   maps [xobjname] to an indirect reference to object [xobjnum]. An empty
   /XObject dictionary is used as the starting point when the page resources
   have none. *)
let add_xobject_to_page xobjname xobjnum page pdf =
let resources' =
let xobjects =
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
| Some xobjects -> xobjects
| _ -> Pdf.Dictionary []
let new_xobjects =
Pdf.add_dict_entry xobjects xobjname (Pdf.Indirect xobjnum)
Pdf.add_dict_entry page.Pdfpage.resources "/XObject" new_xobjects
{page with Pdfpage.resources = resources'}
2020-03-19 16:23:16 +00:00
(* Add [page] to [pdf] as a form XObject and make it available under [name]
   on every page whose number is in [range]. The XObject stream data is the
   page content parsed into operators and written back out with
   Pdfops.stream_of_ops; its dictionary uses the page media box as /BBox and
   the page resources as /Resources. The pages in range are updated with
   add_xobject_to_page and the page tree is rebuilt with
   Pdfpage.change_pages. *)
let add_page_as_xobject pdf range page name =
let xobject_data =
match Pdfops.stream_of_ops (Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content) with
Pdf.Stream {contents = (_, Got b)} -> b
| _ -> assert false
2020-03-19 16:23:16 +00:00
let xobject_dict =
["/Type", Pdf.Name "/XObject";
2020-03-25 13:46:54 +00:00
"/Subtype", Pdf.Name "/Form";
"/BBox", page.Pdfpage.mediabox;
"/Resources", page.Pdfpage.resources;
"/Length", Pdf.Integer (bytes_size xobject_data)]
2020-03-19 16:23:16 +00:00
let xobject =
Pdf.Stream {contents = (Pdf.Dictionary xobject_dict, Pdf.Got xobject_data)}
let xobject_objnum = Pdf.addobj pdf xobject in
let pages = Pdfpage.pages_of_pagetree pdf in
let new_pages =
(fun page pnum ->
if mem pnum range
then add_xobject_to_page name xobject_objnum page pdf
else page)
(indx pages)
Pdfpage.change_pages true pdf new_pages
2020-03-19 17:08:15 +00:00
(* n.b. the use of change_pages here ensures no inheritable resources in the
 * stamp, therefore creation of xobject from page is as simple as expected.
 * The first page of [over] is merged into [pdf] in the same way as in stamp,
 * but instead of being drawn it is registered as a form XObject on the pages
 * in [range] via add_page_as_xobject. Bookmarks of [pdf] are preserved and
 * re-targeted with change_bookmark. The result is the pair (new pdf, name),
 * where name is a slash, an unused prefix, and the suffix CPDFXObj. *)
2020-03-19 16:23:16 +00:00
let stamp_as_xobject pdf range over =
let prefix = Pdfpage.shortest_unused_prefix pdf in
Pdfpage.add_prefix over prefix;
let marks = Pdfmarks.read_bookmarks pdf in
let marks_refnumbers = Pdf.page_reference_numbers pdf in
let pdf = Pdfmarks.remove_bookmarks pdf in
let over = Pdfmarks.remove_bookmarks over in
let pageseqs = ilist 1 (Pdfpage.endpage pdf) in
let over_firstpage_pdf =
match Pdfpage.pages_of_pagetree over with
| [] -> error "empty PDF"
| h::_ -> Pdfpage.change_pages ~changes:[(1, 1)] true over [h]
let merged =
false false ["a"; "b"] [pdf; over_firstpage_pdf] [pageseqs; [1]]
let merged =
{merged with Pdf.saved_encryption = pdf.Pdf.saved_encryption}
let merged = copy_id true pdf merged in
let merged_pages = Pdfpage.pages_of_pagetree merged in
let under_pages, over_page =
all_but_last merged_pages, last merged_pages
let new_pages = under_pages in
let changed =
let changes =
map (fun x -> (x, x)) (ilist 1 (length new_pages))
Pdfpage.change_pages ~changes true merged new_pages
let new_refnumbers = Pdf.page_reference_numbers changed in
let changetable = hashtable_of_dictionary (combine marks_refnumbers new_refnumbers) in
let new_marks = map (change_bookmark changetable) marks in
let pdf = Pdfmarks.add_bookmarks new_marks changed in
let name = "/" ^ Pdfpage.shortest_unused_prefix pdf ^ "CPDFXObj" in
2020-03-19 16:23:16 +00:00
(add_page_as_xobject pdf range over_page name, name)
(* Combine pages from two PDFs. For now, assume equal length. *)
(* If [over] has more pages than [under], chop the excess. If the converse, pad
[over] to the same length with blank A4 pages. [under] is never altered; the
result is the pair (under, over). *)
let equalize_pages under over =
2016-04-03 12:48:44 +01:00
let length_under = Pdfpage.endpage under in
let length_over = Pdfpage.endpage over in
if length_over > length_under then
let changes =
2020-03-04 10:50:32 -08:00
map (fun x -> (x, x)) (ilist 1 length_under)
~changes true over (take (Pdfpage.pages_of_pagetree over) length_under)))
else if length_under > length_over then
let changes =
2020-03-04 10:50:32 -08:00
map (fun x -> (x, x)) (ilist 1 length_over)
~changes true over
(Pdfpage.pages_of_pagetree over @
(many (Pdfpage.blankpage Pdfpaper.a4) (length_under - length_over))))
under, over
2016-11-25 17:38:19 +00:00
(* Combine two PDFs page by page: page n of [over] is stamped onto page n of
   [under] with do_stamp at BottomLeft 0, with [scaletofit] passed through and
   (not swap) deciding which goes on top. Resource names in [over] are first
   prefixed with a prefix unused in [under]. When [equalize] is set the page
   counts are evened out with equalize_pages; if the counts still differ,
   Pdf.PDFError is raised. Bookmarks of both inputs are read up front and
   added to the result, those of [under] first. The debug_pdf helper writes
   intermediate files only when the local debug_combine_pages flag is true. *)
let combine_pages fast under over scaletofit swap equalize =
let debug_combine_pages = false in
2016-11-17 18:42:40 +00:00
let debug_pdf pdf n =
if debug_combine_pages then
2016-11-25 17:38:19 +00:00
begin Pdf.remove_unreferenced pdf; Pdfwrite.pdf_to_file pdf n end
2016-11-17 18:42:40 +00:00
2016-04-03 12:48:44 +01:00
Pdfpage.add_prefix over (Pdfpage.shortest_unused_prefix under);
2016-11-25 17:38:19 +00:00
let marks_under, marks_over = Pdfmarks.read_bookmarks under, Pdfmarks.read_bookmarks over in
2016-04-03 12:48:44 +01:00
let under, over = if equalize then equalize_pages under over else under, over in
2016-11-25 17:38:19 +00:00
let under_length, over_length = Pdfpage.endpage under, Pdfpage.endpage over in
2016-04-03 12:48:44 +01:00
if under_length <> over_length then
raise (Pdf.PDFError "combine_pages: not of equal length")
let pageseqs_under = ilist 1 (Pdfpage.endpage under) in
let pageseqs_over = ilist 1 (Pdfpage.endpage over) in
let merged =
false false ["a"; "b"] [under; over] [pageseqs_under; pageseqs_over]
2016-11-17 18:42:40 +00:00
debug_pdf merged "merged.pdf";
2016-04-03 12:48:44 +01:00
let under_pages, over_pages =
cleave (Pdfpage.pages_of_pagetree merged) under_length
let new_pages =
(fun o u ->
2016-11-25 17:38:19 +00:00
false fast (BottomLeft 0.) false false scaletofit (not swap) merged o u over)
2016-04-03 12:48:44 +01:00
over_pages under_pages
2016-11-25 17:38:19 +00:00
(* Build the changes. 123456 -> 123123 *)
let changes =
2020-03-04 10:50:32 -08:00
let len = length new_pages in
2016-11-25 17:38:19 +00:00
combine (ilist 1 (len * 2)) (let x = ilist 1 len in x @ x)
let changed = Pdfpage.change_pages ~changes true merged new_pages in
let r = Pdfmarks.add_bookmarks (marks_under @ marks_over) changed in
debug_pdf r "final.pdf";
2013-08-20 15:32:57 +01:00
(* \section{Set media box} *)
2016-07-21 17:02:11 +01:00
(* Set the media box of each page in [range]. The (x, y, w, h) tuple for page
   [pnum] is the entry at index pnum - 1 of [xywhlist]; the stored rectangle
   is x, y, x + w, y + h. *)
let set_mediabox xywhlist pdf range =
let crop_page pnum page =
let x, y, w, h = List.nth xywhlist (pnum - 1) in
2013-08-20 15:32:57 +01:00
{page with
Pdfpage.mediabox =
[Pdf.Real x; Pdf.Real y;
Pdf.Real (x +. w); Pdf.Real (y +. h)])}
process_pages (ppstub crop_page) pdf range
2013-08-20 15:32:57 +01:00
2019-08-08 16:57:25 +01:00
(* Just used by cpdflib for historical reasons. Stores the rectangle
   [minx miny maxx maxy] under the key [box] on every page in [range]. Note
   the parameter order is minx maxx miny maxy, which differs from the order
   in which the values are written to the array. *)
let setBox box minx maxx miny maxy pdf range =
2013-08-20 15:32:57 +01:00
let set_box_page _ page =
{page with
Pdfpage.rest =
page.Pdfpage.rest box
(Pdf.Array [Pdf.Real minx; Pdf.Real miny; Pdf.Real maxx; Pdf.Real maxy])}
process_pages (ppstub set_box_page) pdf range
2013-08-20 15:32:57 +01:00
(* \section{Cropping} *)
(* Crop each page in [range] by updating its rest dictionary with the
   rectangle x, y, x + w, y + h. The (x, y, w, h) tuple for a page is the
   entry at index pagenum - 1 of [xywhlist]. [box] names the box concerned
   and defaults to /CropBox. *)
let crop_pdf ?(box="/CropBox") xywhlist pdf range =
2016-07-21 17:02:11 +01:00
let crop_page pagenum page =
2013-08-20 15:32:57 +01:00
{page with
Pdfpage.rest =
2016-07-21 17:02:11 +01:00
(let x, y, w, h = List.nth xywhlist (pagenum - 1) in
[Pdf.Real x; Pdf.Real y;
Pdf.Real (x +. w); Pdf.Real (y +. h)])))}
2013-08-20 15:32:57 +01:00
process_pages (ppstub crop_page) pdf range
2013-08-20 15:32:57 +01:00
2017-05-19 19:10:49 +01:00
(* Clip a page to one of its boxes, or the media box if that box is not
 * present. This is a hard clip, done by using a clipping rectangle, so that
 * the page may then be used as a stamp without extraneous material reappearing.
 * The rectangle comes from the media box when [boxname] is /MediaBox, and
 * otherwise from the named entry of the page's rest dictionary. If that entry
 * is absent, the media box is used when [mediabox_if_missing] is set, and an
 * error naming the box is reported otherwise. The clip is the operator
 * sequence re, W, n prepended to the page content. *)
let hard_box pdf range boxname mediabox_if_missing fast =
(ppstub (fun pagenum page ->
2017-05-19 19:10:49 +01:00
let minx, miny, maxx, maxy =
if boxname = "/MediaBox" then
Pdf.parse_rectangle page.Pdfpage.mediabox
match Pdf.lookup_direct pdf boxname page.Pdfpage.rest with
| Some a -> Pdf.parse_rectangle a
| _ ->
if mediabox_if_missing
then Pdf.parse_rectangle page.Pdfpage.mediabox
2019-08-01 14:10:02 +01:00
else error (Printf.sprintf "hard_box: box %s not found" boxname)
2017-05-19 19:10:49 +01:00
2017-05-20 13:23:32 +01:00
let ops = [Pdfops.Op_re (minx, miny, maxx -. minx, maxy -. miny); Pdfops.Op_W; Pdfops.Op_n] in
2021-10-26 16:18:09 +01:00
Pdfpage.prepend_operators pdf ops ~fast page))
2017-05-19 19:10:49 +01:00
2013-08-20 15:32:57 +01:00
(* Remove the /CropBox entry from the rest dictionary of every page in
   [range]. *)
let remove_cropping_pdf pdf range =
let remove_cropping_page _ page =
{page with
Pdfpage.rest =
(Pdf.remove_dict_entry page.Pdfpage.rest "/CropBox")}
process_pages (ppstub remove_cropping_page) pdf range
2013-08-20 15:32:57 +01:00
(* Remove the /TrimBox entry from the rest dictionary of every page in
   [range]. *)
let remove_trim_pdf pdf range =
let remove_trim_page _ page =
{page with
Pdfpage.rest =
(Pdf.remove_dict_entry page.Pdfpage.rest "/TrimBox")}
process_pages (ppstub remove_trim_page) pdf range
2013-08-20 15:32:57 +01:00
(* Remove the /ArtBox entry from the rest dictionary of every page in
   [range]. *)
let remove_art_pdf pdf range =
let remove_art_page _ page =
{page with
Pdfpage.rest =
(Pdf.remove_dict_entry page.Pdfpage.rest "/ArtBox")}
process_pages (ppstub remove_art_page) pdf range
2013-08-20 15:32:57 +01:00
(* Remove the /BleedBox entry from the rest dictionary of every page in
   [range]. *)
let remove_bleed_pdf pdf range =
let remove_bleed_page _ page =
{page with
Pdfpage.rest =
(Pdf.remove_dict_entry page.Pdfpage.rest "/BleedBox")}
process_pages (ppstub remove_bleed_page) pdf range
2013-08-20 15:32:57 +01:00
(* \section{Rotating pages} *)
(* Set the rotation of every page in [range] to the absolute value [r],
   converted with Pdfpage.rotation_of_int. The page content is untouched. *)
let rotate_pdf r pdf range =
let rotate_page _ page =
{page with Pdfpage.rotate =
Pdfpage.rotation_of_int r}
process_pages (ppstub rotate_page) pdf range
2013-08-20 15:32:57 +01:00
(* Rotate every page in [range] by [r] relative to its current rotation: the
   new value is (current + r) mod 360, converted with
   Pdfpage.rotation_of_int. The page content is untouched. *)
let rotate_pdf_by r pdf range =
let rotate_page_by _ page =
{page with Pdfpage.rotate =
Pdfpage.rotation_of_int ((Pdfpage.int_of_rotation page.Pdfpage.rotate + r) mod 360)}
process_pages (ppstub rotate_page_by) pdf range
2013-08-20 15:32:57 +01:00
2021-05-31 18:10:39 +01:00
(* Rotate the content of one page by [r] degrees about [rotpoint], or about
   the centre of the media box when [rotpoint] is None. Two rotations are
   built: [tr] with angle -r, which is prepended to the content as a cm
   operator and applied to annotations, and [tr2] with angle +r, which is
   passed to change_pattern_matrices_page. The result is (new page, pnum,
   tr). *)
let rotate_page_contents ~fast rotpoint r pdf pnum page =
2013-08-20 15:32:57 +01:00
let rotation_point =
match rotpoint with
| None ->
let minx, miny, maxx, maxy = Pdf.parse_rectangle page.Pdfpage.mediabox in
(minx +. maxx) /. 2., (miny +. maxy) /. 2.
| Some point -> point
let tr =
(Pdftransform.Rotate (rotation_point, -.(rad_of_deg r)))
in let tr2 =
(Pdftransform.Rotate (rotation_point, rad_of_deg r))
let transform_op = Pdfops.Op_cm tr in
2019-07-16 13:05:06 +01:00
let page = change_pattern_matrices_page pdf tr2 page in
transform_annotations pdf tr page.Pdfpage.rest;
2021-05-31 18:10:39 +01:00
(Pdfpage.prepend_operators pdf [transform_op] ~fast page, pnum, tr)
2013-08-20 15:32:57 +01:00
(* Rotate the content of every page in [range] by [r] degrees about the
   centre of its media box (rotate_page_contents with no explicit point). *)
let rotate_contents ?(fast=false) r pdf range =
2021-05-31 18:10:39 +01:00
process_pages (rotate_page_contents ~fast None r pdf) pdf range
2013-08-20 15:32:57 +01:00
(* Return the pages of [pdf] whose numbers appear in [range], in range order.
   A page number for which the lookup raises is silently dropped. *)
let select_pages range pdf =
  let all_pages = Pdfpage.pages_of_pagetree pdf in
  let pick n =
    match select n all_pages with
    | page -> Some page
    | exception _ -> None
  in
  option_map pick range
(* Upright functionality *)
(* If all pages are already upright, and the mediabox is (0,0)-based, do nothing
to save time. A page counts as upright when its rotation is Rotate0 and both
the minimum x and minimum y of its media box lie strictly within 0.001 of
zero. The result is true only when every selected page in [range] passes. *)
2013-08-20 15:32:57 +01:00
let allupright range pdf =
let page_is_upright page =
page.Pdfpage.rotate = Pdfpage.Rotate0 &&
(let (minx, miny, _, _) = Pdf.parse_rectangle page.Pdfpage.mediabox in
minx < 0.001 && miny < 0.001 && minx > ~-.0.001 && miny > ~-.0.001)
2013-08-20 15:32:57 +01:00
not (mem false (map page_is_upright (select_pages range pdf)))
(* Build the matrix which rotates [page] about the centre of its media box
   by the negation of the page's rotation value (converted from degrees to
   radians). *)
let upright_transform page =
let rotate =
Pdfpage.int_of_rotation page.Pdfpage.rotate
and cx, cy =
let minx, miny, maxx, maxy = Pdf.parse_rectangle page.Pdfpage.mediabox in
(minx +. maxx) /. 2., (miny +. maxy) /. 2.
Pdftransform.mkrotate (cx, cy) (rad_of_deg (~-.(float rotate)))
(* Transform every box of [page] by [tr] through change_boxes. Each box is
   mapped by transforming its (minx, miny) and (maxx, maxy) corners.
   NOTE(review): only two corners are transformed and the results are not
   re-ordered here, so after a rotation the first pair may no longer be the
   minimum -- confirm whether change_boxes normalises the rectangle. *)
let transform_boxes tr pdf page =
let f (minx, miny, maxx, maxy) =
let minx, miny = Pdftransform.transform_matrix tr (minx, miny)
and maxx, maxy = Pdftransform.transform_matrix tr (maxx, maxy) in
(minx, miny, maxx, maxy)
change_boxes f pdf page
(* Apply the matrix [tr] to the content of [page]: the inverse of [tr] is
   passed to change_pattern_matrices_page, [tr] itself is applied to the
   annotations, and a cm operator for [tr] is prepended to the content.
   Returns the new page. *)
let transform_contents ?(fast=false) tr pdf page =
let transform_op = Pdfops.Op_cm tr in
2019-07-16 13:05:06 +01:00
let page = change_pattern_matrices_page pdf (Pdftransform.matrix_invert tr) page in
2019-07-10 13:16:32 +01:00
transform_annotations pdf tr page.Pdfpage.rest;
2019-07-16 13:05:06 +01:00
Pdfpage.prepend_operators pdf [transform_op] ~fast page
2013-08-20 15:32:57 +01:00
(* Make the pages in [range] upright. Nothing is done when allupright already
   holds. Otherwise, for each page, the matrix from upright_transform is
   applied to the boxes and then to the content, the rotation is reset to
   Rotate0, and rectify_boxes moves the media box origin back to (0, 0). *)
let upright ?(fast=false) range pdf =
if allupright range pdf then pdf else
2021-05-31 18:10:39 +01:00
let upright_page _ pnum page =
2013-08-20 15:32:57 +01:00
let tr = upright_transform page in
let page = transform_boxes tr pdf page in
let page = transform_contents ~fast tr pdf page in
2021-05-31 18:10:39 +01:00
(rectify_boxes ~fast pdf {page with Pdfpage.rotate = Pdfpage.Rotate0}, pnum, tr)
2013-08-20 15:32:57 +01:00
2021-05-31 18:10:39 +01:00
process_pages (upright_page pdf) pdf range
2013-08-20 15:32:57 +01:00
(* \section{Scale page data} *)
2016-07-21 17:02:11 +01:00
(* Scale the pages in [range], boxes and content alike. The (sx, sy) pair for
   page [pnum] is the entry at index pnum - 1 of [sxsylist]. All box
   coordinates are multiplied by the factors, the scale about (0, 0) is
   prepended to the content as a cm operator and applied to annotations, and
   its inverse is passed to change_pattern_matrices_page. Each page yields
   (new page, pnum, matrix). *)
let scale_pdf ?(fast=false) sxsylist pdf range =
let scale_page pnum page =
let sx, sy = List.nth sxsylist (pnum - 1) in
let f (xmin, ymin, xmax, ymax) =
xmin *. sx, ymin *. sy, xmax *. sx, ymax *. sy
let page = change_boxes f pdf page
and matrix = Pdftransform.matrix_of_op (Pdftransform.Scale ((0., 0.), sx, sy)) in
let transform_op =
Pdfops.Op_cm matrix
2019-07-16 13:05:06 +01:00
and page =
change_pattern_matrices_page pdf (Pdftransform.matrix_invert matrix) page
2016-07-21 17:02:11 +01:00
2019-07-10 13:16:32 +01:00
transform_annotations pdf matrix page.Pdfpage.rest;
2021-05-28 18:00:27 +01:00
(Pdfpage.prepend_operators pdf ~fast [transform_op] page, pnum, matrix)
2016-07-21 17:02:11 +01:00
2021-05-28 18:00:27 +01:00
process_pages scale_page pdf range
2013-08-20 15:32:57 +01:00
(* Scale to fit page of size x * y. The (x, y) target for page [pnum] is the
   entry at index pnum - 1 of [xylist]. The source extent is read from the
   /CropBox when present, else from the media box. The content is scaled
   uniformly by the smaller of x / maxx and y / maxy, multiplied by
   [input_scale], then translated: horizontally flush left for Left, flush
   right for Right, centred otherwise; vertically flush top for Top, flush
   bottom for Bottom, centred otherwise. Every box becomes (0, 0, x, y).
   Fails when maxx or maxy is not positive.
   NOTE(review): the factors use maxx and maxy rather than the box width and
   height, so a box whose origin is not (0, 0) is presumably expected to have
   been rectified beforehand -- confirm against callers. *)
2019-03-19 14:03:28 +00:00
let scale_to_fit_pdf ?(fast=false) position input_scale xylist op pdf range =
2016-07-21 17:02:11 +01:00
let scale_page_to_fit pnum page =
let x, y = List.nth xylist (pnum - 1) in
2013-08-20 15:32:57 +01:00
let matrix =
let (minx, miny, maxx, maxy) =
(* Use cropbox if available *)
(match Pdf.lookup_direct pdf "/CropBox" page.Pdfpage.rest with
| Some r -> r
| None -> page.Pdfpage.mediabox)
if maxx <= 0. || maxy <= 0. then failwith "Zero-sized pages are invalid" else
let fx = x /. maxx in let fy = y /. maxy in
let scale = fmin fx fy *. input_scale in
2019-03-19 14:03:28 +00:00
let trans_x =
match position with
2021-08-12 20:38:55 +01:00
Cpdfposition.Left _ -> 0.
| Cpdfposition.Right _ -> (x -. (maxx *. scale))
2019-03-19 14:03:28 +00:00
| _ -> (x -. (maxx *. scale)) /. 2.
and trans_y =
match position with
2021-08-12 20:38:55 +01:00
| Cpdfposition.Top _ -> (y -. (maxy *. scale))
| Cpdfposition.Bottom _ -> 0.
2019-03-19 14:03:28 +00:00
| _ -> (y -. (maxy *. scale)) /. 2.
2013-08-20 15:32:57 +01:00
[Pdftransform.Translate (trans_x, trans_y);
Pdftransform.Scale ((0., 0.), scale, scale)])
let page =
2016-11-13 18:01:01 +00:00
(function (minx, miny, maxx, maxy) -> 0., 0., x, y)
2013-08-20 15:32:57 +01:00
pdf page
2019-07-10 13:16:32 +01:00
transform_annotations pdf matrix page.Pdfpage.rest;
2021-05-31 18:10:39 +01:00
(Pdfpage.prepend_operators pdf [Pdfops.Op_cm matrix] ~fast
(change_pattern_matrices_page pdf (Pdftransform.matrix_invert matrix) page), pnum, matrix)
2013-08-20 15:32:57 +01:00
2021-05-31 18:10:39 +01:00
process_pages scale_page_to_fit pdf range
2013-08-20 15:32:57 +01:00
(* Scale contents. The content of one page is scaled uniformly by [scale]
   about the point (sx, sy) that Cpdfposition.calculate_position returns for
   [position] within the /CropBox (or the media box when there is no crop
   box), then translated by (tx, ty). The translation moves the content away
   from the named edge or corner by the distance carried in the position, and
   is (0, 0) for every other position. The same transform is applied to
   pattern matrices, annotations and, as a prepended cm operator, the content.
   The result is (new page, pnum, transform). *)
2021-05-31 18:10:39 +01:00
let scale_page_contents ?(fast=false) scale position pdf pnum page =
2013-08-20 15:32:57 +01:00
let (minx, miny, maxx, maxy) as box =
(* Use cropbox if available *)
(match Pdf.lookup_direct pdf "/CropBox" page.Pdfpage.rest with
| Some r -> r
| None -> page.Pdfpage.mediabox)
2021-08-12 20:38:55 +01:00
let sx, sy, _ = Cpdfposition.calculate_position true 0. box Horizontal position in
2013-08-20 15:32:57 +01:00
let tx, ty =
2021-08-12 20:38:55 +01:00
let open Cpdfposition in
2013-08-20 15:32:57 +01:00
match position with
| Top t -> 0., -.t
| TopLeft t -> t, -.t
2021-11-22 12:20:48 -08:00
| TopRight t -> -.t, -.t
2013-08-20 15:32:57 +01:00
| Left t -> t, 0.
| BottomLeft t -> t, t
| Bottom t -> 0., t
2021-11-22 12:20:48 -08:00
| BottomRight t -> -.t, t
2013-08-20 15:32:57 +01:00
| Right t -> -.t, 0.
| _ -> 0., 0. (* centre it... FIXME: We will add a center position, eventually, for text and this... *)
let transform =
[Pdftransform.Translate (tx, ty);
Pdftransform.Scale ((sx, sy), scale, scale)]
let transform_op = Pdfops.Op_cm transform in
2019-07-16 13:05:06 +01:00
let page = change_pattern_matrices_page pdf transform page in
2019-07-10 11:18:01 +01:00
transform_annotations pdf transform page.Pdfpage.rest;
2021-05-31 18:10:39 +01:00
(Pdfpage.prepend_operators pdf [transform_op] ~fast page, pnum, transform)
2013-08-20 15:32:57 +01:00
(* Scale the content of every page in [range] with scale_page_contents. Note
   that [position] comes before [scale] here, the reverse of the order used
   by scale_page_contents. *)
let scale_contents ?(fast=false) position scale pdf range =
2021-05-31 18:10:39 +01:00
process_pages (scale_page_contents ~fast scale position pdf) pdf range
2013-08-20 15:32:57 +01:00
(* \section{List annotations} *)
(* Return the /Contents string of the annotation [annot], passed through
   encode_output with [encoding]. The empty string is returned when /Contents
   is missing or is not a string. *)
let get_annotation_string encoding pdf annot =
  let contents = Pdf.lookup_direct pdf "/Contents" annot in
  match contents with
  | Some (Pdf.String text) -> encode_output encoding text
  | Some _ | None -> ""
(* Print one annotation's text, prefixed by its page number [num] and
   followed by a newline. Nothing is printed when the text obtained from
   get_annotation_string is empty. *)
let print_annotation encoding pdf num s =
2013-08-20 15:32:57 +01:00
let s = get_annotation_string encoding pdf s in
match s with
| "" -> ()
| s ->
flprint (Printf.sprintf "Page %d: " num);
flprint s;
flprint "\n"
2013-08-20 15:32:57 +01:00
(* Print the text of every annotation in the /Annots array of [page], using
   print_annotation with page number [num]. Each annotation is resolved with
   Pdf.direct first. A page with no /Annots array prints nothing. *)
let list_page_annotations encoding pdf num page =
2013-08-20 15:32:57 +01:00
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
| Some (Pdf.Array annots) ->
iter (print_annotation encoding pdf num) (map (Pdf.direct pdf) annots)
2013-08-20 15:32:57 +01:00
| _ -> ()
2021-10-28 17:06:46 +01:00
(* Build the JSON entries for the annotations of one page. Each annotation
   in the page's /Annots array is resolved with Pdf.direct and becomes a
   two-element JSON list: the page number, then the annotation object as
   converted by Cpdfjson.json_of_object. A page with no /Annots array yields
   the empty list. *)
let annotations_json_page pdf page pagenum =
2021-10-28 15:34:03 +01:00
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
| Some (Pdf.Array annots) ->
2021-10-28 17:06:46 +01:00
(fun annot ->
`List [`Int pagenum; Cpdfjson.json_of_object pdf (fun _ -> ()) false false annot])
(map (Pdf.direct pdf) annots)
2021-10-28 15:34:03 +01:00
| _ -> []
(* Write the annotations of every page of [pdf] to stdout as one
   pretty-printed JSON list, concatenating the per-page entries from
   annotations_json_page in page order. *)
let list_annotations_json pdf =
let module J = Cpdfyojson.Safe in
2021-10-28 17:06:46 +01:00
let pages = Pdfpage.pages_of_pagetree pdf in
let pagenums = indx pages in
let json = `List (flatten (map2 (annotations_json_page pdf) pages pagenums)) in
2021-10-28 15:34:03 +01:00
J.pretty_to_channel stdout json
(* List the annotations of all pages of [pdf]: as JSON via
   list_annotations_json when [json] is set, otherwise as text via
   list_page_annotations using [encoding]. *)
let list_annotations ~json encoding pdf =
let range = Cpdfpagespec.parse_pagespec pdf "all" in
2021-10-28 15:34:03 +01:00
if json
then list_annotations_json pdf
else iter_pages (list_page_annotations encoding pdf) pdf range
2013-08-20 15:32:57 +01:00
(* Collect annotation texts as (page number, string) pairs. For each page
   with an /Annots array, every annotation is resolved with Pdf.direct and
   its text obtained from get_annotation_string; each text is paired with the
   page number. Pages without an /Annots array contribute nothing. Unlike
   print_annotation, empty strings are not filtered out here. *)
let get_annotations encoding pdf =
let pages = Pdfpage.pages_of_pagetree pdf in
(fun page pagenumber ->
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
| Some (Pdf.Array annots) ->
let strings =
map (get_annotation_string encoding pdf) (map (Pdf.direct pdf) annots)
combine (many pagenumber (length strings)) strings
| _ -> [])
(ilist 1 (length pages)))
(* Equalise the page lengths of two PDFs by chopping or extending the first one.
   When [a] is shorter than [b] it is padded with blank A4 pages; when it is
   longer, only its first Pdfpage.endpage b pages are kept. [b] is returned
   unchanged, as the second component of the pair. *)
let equalise_lengths a b =
let a' =
if Pdfpage.endpage a < Pdfpage.endpage b then
Pdfpage.change_pages false a
2013-08-20 15:32:57 +01:00
(Pdfpage.pages_of_pagetree a @
many (Pdfpage.blankpage Pdfpaper.a4) (Pdfpage.endpage b - Pdfpage.endpage a))
else if Pdfpage.endpage a > Pdfpage.endpage b then
Pdfpage.change_pages false a
2013-08-20 15:32:57 +01:00
(take (Pdfpage.pages_of_pagetree a) (Pdfpage.endpage b))
else a
2015-07-29 16:04:36 +01:00
a', b
(* Copy annotations *)
(* FIXME: Why does this chop the files to the same length? Should be able to
apply annotations from a longer file to a shorter? *)
(* Rewrite any annotation destinations to point to pages in the
destination file. This prevents pages being copied, and ensures the links are
correct. Any Indirect link inside a /Dest is rewritten if in the table. If not
inside a /Dest, nothing is rewritten. [table] maps old object numbers to new
ones; an object number missing from the table is left as it is. [indest]
records whether the walk is currently inside a /Dest entry: it becomes true
when descending into the value of a /Dest key and is otherwise passed down
unchanged through arrays and dictionaries. *)
let rec renumber_in_dest table indest = function
Pdf.Indirect i ->
try Pdf.Indirect (Hashtbl.find table i) with _ -> Pdf.Indirect i
| Pdf.Array a ->
Pdf.recurse_array (renumber_in_dest table indest) a
| Pdf.Dictionary d ->
2020-03-04 10:50:32 -08:00
("/Dest", v) -> ("/Dest", renumber_in_dest table true v)
| (k, v) -> (k, renumber_in_dest table indest v))
| x -> x
(* Apply renumber_in_dest, starting outside any /Dest, to object [objnum] of
   [pdf], pairing the rewritten object with the same object number. *)
let renumber_in_object pdf objnum table =
pdf (objnum, (renumber_in_dest table false (Pdf.lookup_obj pdf objnum)))
2015-07-29 16:04:36 +01:00
(* Copy the annotations of [frompage] (in [frompdf]) onto [topage] (in
   [topdf]). Destinations in each indirect annotation are first rewritten, in
   [frompdf], with a table built from the page reference numbers of the two
   files. Every object referenced from the annotations is then added to
   [topdf] under the same object number, so the two files must already have
   disjoint numbering. The merged /Annots array lists the copied annotations
   first, followed by those already on [topage]. The result is the pair
   (topdf, new page); the page is returned unchanged when [frompage] has no
   /Annots array.
   NOTE(review): the existing annotations of [topage] are looked up in
   [frompdf] rather than [topdf]; if that /Annots entry is an indirect
   reference it would be resolved in the wrong document -- confirm. *)
let copy_annotations_page topdf frompdf frompage topage =
match Pdf.lookup_direct frompdf "/Annots" frompage.Pdfpage.rest with
Some (Pdf.Array frompage_annots as annots) ->
let table =
(Pdf.page_reference_numbers frompdf)
(Pdf.page_reference_numbers topdf))
2020-03-04 10:50:32 -08:00
2016-11-24 19:10:53 +00:00
(* FIXME: We assume they are indirects. Must also do direct, though rare.*)
Pdf.Indirect x ->
(*Printf.printf "Copying annotation %s which is\n%s\n"
(Pdfwrite.string_of_pdf (Pdf.Indirect x))
(Pdfwrite.string_of_pdf (Pdf.direct frompdf (Pdf.Indirect
renumber_in_object frompdf x table
| _ -> ())
let objects_to_copy = Pdf.objects_referenced [] [] frompdf annots in
(fun n ->
ignore (Pdf.addobj_given_num topdf (n, Pdf.lookup_obj frompdf n)))
let topage_annots =
match Pdf.lookup_direct frompdf "/Annots" topage.Pdfpage.rest with
| Some (Pdf.Array annots) -> annots
| _ -> []
let merged_dict = Pdf.Array (frompage_annots @ topage_annots) in
let topage' =
{topage with Pdfpage.rest =
Pdf.add_dict_entry topage.Pdfpage.rest "/Annots" merged_dict}
topdf, topage'
2015-07-29 16:04:36 +01:00
| Some x -> topdf, topage
| None -> topdf, topage
2013-08-20 15:32:57 +01:00
(* Copy annotations from the pages of [frompdf] onto the same-numbered pages
   of [topdf], for page numbers in [range]. [frompdf] is first cut or padded
   to the length of [topdf] with equalise_lengths, and the two files are
   renumbered with Pdf.renumber_pdfs so that their object numbers can
   coexist. The pages are then walked in step: pages in range go through
   copy_annotations_page, the others are kept as they are, and the collected
   pages are installed with Pdfpage.change_pages. *)
let copy_annotations range frompdf topdf =
let frompdf, topdf = equalise_lengths frompdf topdf in
2015-07-29 16:04:36 +01:00
match Pdf.renumber_pdfs [frompdf; topdf] with
| [frompdf; topdf] ->
let frompdf_pages = Pdfpage.pages_of_pagetree frompdf in
let topdf_pages = Pdfpage.pages_of_pagetree topdf in
2015-07-29 16:04:36 +01:00
let pdf = ref topdf
and pages = ref []
and pnum = ref 1
and frompdf_pages = ref frompdf_pages
and topdf_pages = ref topdf_pages in
(* Go through, updating pdf and collecting new pages. *)
while not (isnull !frompdf_pages) do
let frompdf_page = hd !frompdf_pages
and topdf_page = hd !topdf_pages in
let pdf', page =
if mem !pnum range
then copy_annotations_page !pdf frompdf frompdf_page topdf_page
else !pdf, topdf_page
2013-08-20 15:32:57 +01:00
2015-07-29 16:04:36 +01:00
pdf := pdf';
pages =| page;
incr pnum;
frompdf_pages := tl !frompdf_pages;
topdf_pages := tl !topdf_pages
Pdfpage.change_pages true !pdf (rev !pages)
| _ -> assert false
2013-08-20 15:32:57 +01:00
2021-10-26 15:09:07 +01:00
(* Draw a [w] x [h] rectangle on each page in [range]. The rectangle is
   stroked with [linewidth] when [outline] is true and filled otherwise, in
   [colour]. When [opacity] is below 1 a fresh /ExtGState entry carrying /ca
   and /CA is added to the page resources and selected. The position is
   computed from [position] against the crop box (when [relative_to_cropbox]
   and the page has one) or the media box. The operators are put before the
   existing content when [underneath] is true and after it otherwise. *)
let addrectangle
  fast (w, h) colour outline linewidth opacity position relative_to_cropbox
  underneath range pdf
=
  let addrectangle_page _ page =
    let resources', unique_extgstatename =
      if opacity < 1.0 then
        let dict =
          match Pdf.lookup_direct pdf "/ExtGState" page.Pdfpage.resources with
          | Some d -> d
          | None -> Pdf.Dictionary []
        in
        let unique_extgstatename = Pdf.unique_key "gs" dict in
        let dict' =
          Pdf.add_dict_entry dict unique_extgstatename
            (Pdf.Dictionary [("/ca", Pdf.Real opacity); ("/CA", Pdf.Real opacity)])
        in
        Pdf.add_dict_entry page.Pdfpage.resources "/ExtGState" dict', Some unique_extgstatename
      else
        page.Pdfpage.resources, None
    in
    let mediabox =
      if relative_to_cropbox then
        match Pdf.lookup_direct pdf "/CropBox" page.Pdfpage.rest with
        | Some pdfobject -> Pdf.parse_rectangle (Pdf.direct pdf pdfobject)
        | None -> Pdf.parse_rectangle page.Pdfpage.mediabox
      else
        Pdf.parse_rectangle page.Pdfpage.mediabox
    in
    let x, y, _ =
      Cpdfposition.calculate_position false w mediabox Cpdfposition.Horizontal position
    in
    (* The computed point is adjusted by the rectangle height for top and
       centre anchored positions. *)
    let x, y =
      match position with
      | Cpdfposition.Top _ | Cpdfposition.TopLeft _ | Cpdfposition.TopRight _ -> (x, y -. h)
      | Cpdfposition.Centre | Cpdfposition.PosCentre _ -> (x, y -. (h /. 2.))
      | _ -> (x, y)
    in
    let ops =
      [colour_op colour;
       colour_op_stroke colour]
      @
      (if outline then [Pdfops.Op_w linewidth] else [])
      @
      (match unique_extgstatename with None -> [] | Some n -> [Pdfops.Op_gs n])
      @
      [Pdfops.Op_re (x, y, w, h);
       (if outline then Pdfops.Op_s else Pdfops.Op_f)]
    in
    let page = {page with Pdfpage.resources = resources'} in
    if underneath
    then Pdfpage.prepend_operators pdf ops ~fast:fast page
    else Pdfpage.postpend_operators pdf ops ~fast:fast page
  in
  process_pages (ppstub addrectangle_page) pdf range
2021-10-19 16:18:15 +01:00
(* Imposition *)
2013-08-20 15:32:57 +01:00
(* Union two rest dictionaries from the same PDF. *)
(* Union the rest dictionaries [a] and [b] of two pages of [pdf]. Keys listed
   in [keys_to_combine] (currently only /Annots) hold arrays, which are
   concatenated, [a] first; such a key is omitted when both arrays are empty
   or absent. Every other entry is added in order, entries of [a] first, so
   on a clash the later [Pdf.add_dict_entry] call decides the value. A
   non-dictionary argument contributes no entries. *)
let combine_pdf_rests pdf a b =
  let entries_of = function
    | Pdf.Dictionary entries -> entries
    | _ -> []
  in
  let a_entries = entries_of a in
  let b_entries = entries_of b in
  let keys_to_combine = ["/Annots"] in
  let array_at key dict =
    match Pdf.lookup_direct pdf key dict with
    | Some (Pdf.Array d) -> d
    | _ -> []
  in
  let combine_entries key =
    match array_at key a, array_at key b with
    | [], [] -> None
    | from_a, from_b -> Some (key, Pdf.Array (from_a @ from_b))
  in
  let unknown_keys_a = lose (fun (k, _) -> mem k keys_to_combine) a_entries in
  let unknown_keys_b = lose (fun (k, _) -> mem k keys_to_combine) b_entries in
  let combined_known_entries = option_map combine_entries keys_to_combine in
  fold_left
    (fun dict (k, v) -> Pdf.add_dict_entry dict k v)
    (Pdf.Dictionary [])
    (unknown_keys_a @ unknown_keys_b @ combined_known_entries)
2021-10-19 16:18:15 +01:00
(* Calculate the transformation matrices for a single imposed output page. *)
2021-10-25 15:35:35 +01:00
(* make margins by scaling for a fitted impose. *)
2021-10-21 13:51:14 +01:00
(* [make_margin output_mediabox margin tr] post-composes [tr] with a uniform
   scale about the origin and a centring translation, so that content placed
   by [tr] leaves [margin] free on each side of the output page. The scale
   factor is the smaller of the horizontal and vertical factors. A zero margin
   returns [tr] unchanged. Raises an error when the margin is half the page
   width or height or more, since no room would be left for content. *)
let make_margin output_mediabox margin tr =
  if margin = 0. then tr else
    let width, height =
      match Pdf.parse_rectangle output_mediabox with
        xmin, ymin, xmax, ymax -> xmax -. xmin, ymax -. ymin
    in
    (* >= rather than >: a margin of exactly half the page gives a scale
       factor of zero, which would collapse the content to a point. *)
    if margin >= width /. 2. || margin >= height /. 2. then error "margin would fill whole page!" else
      let hfactor = (width -. margin -. margin) /. width in
      let vfactor = (height -. margin -. margin) /. height in
      let factor = fmin hfactor vfactor in
      let scale = Pdftransform.matrix_of_op (Pdftransform.Scale ((0., 0.), factor, factor)) in
      let shift =
        Pdftransform.matrix_of_op (Pdftransform.Translate ((width -. width *. factor) /. 2.,
                                                           (height -. height *. factor) /. 2.))
      in
      Pdftransform.matrix_compose shift (Pdftransform.matrix_compose scale tr)
2021-11-12 13:59:24 -08:00
(* FIXME fixup -center for next release. For now it has been disabled. *)
2021-10-26 15:09:07 +01:00
(* Build the list of placement transforms for the [len] input pages which
   share one imposed output page.
   - width and height are taken from [mediabox].
   - [fx] and [fy] are truncated to the integer grid size x by y.
   - When [columns] is true the grid is filled column by column, otherwise row
     by row; in both cases rows are visited from y - 1 down to 0.
   - [order] mirrors the row index when [btt] is set and the column index when
     [rtl] is set.
   - Each placed page adds a Translate by (col * width, row * height), plus
     the centring offsets and the accumulated [fit_extra_hspace] /
     [fit_extra_vspace]. [len] counts down so that nothing is added once the
     pages run out.
   - The transforms are returned in placement order; when [fit] is true each
     one is passed through [make_margin output_mediabox margin].
   NOTE(review): this listing carries stray blame timestamps and appears to
   have lost some structural lines (in / else / done and the head of the
   expression pushed onto trs). The text below is left exactly as found. *)
let impose_transforms fit fx fy columns rtl btt center margin mediabox output_mediabox fit_extra_hspace fit_extra_vspace len =
2021-10-19 16:18:15 +01:00
let width, height =
match Pdf.parse_rectangle mediabox with
xmin, ymin, xmax, ymax -> xmax -. xmin, ymax -. ymin
let trs = ref [] in
2021-10-19 17:55:03 +01:00
let len = ref len in
2021-10-21 18:10:47 +01:00
let cent_extra_x = ref 0. in
let cent_extra_y = ref 0. in
2021-10-25 14:07:05 +01:00
let addtr x y row col px py =
let cex, cey =
2021-10-21 18:10:47 +01:00
(if rtl then ~-.(!cent_extra_x) else !cent_extra_x), (if btt then ~-.(!cent_extra_y) else !cent_extra_y)
2021-10-19 18:43:12 +01:00
2021-10-25 14:07:05 +01:00
let spacecol = if rtl then x - col - 1 else col in
let total_fit_extra_hspace = fit_extra_hspace *. (float_of_int spacecol +. 1.) in
2021-10-25 14:23:21 +01:00
let total_fit_extra_vspace = fit_extra_vspace *. (float_of_int row +. 1.) in
(*Printf.printf "row = %i, py = %f, ey = %f, fit_extra_vspace = %f, total_fit_extra_vspace = %f\n" row py cey fit_extra_vspace total_fit_extra_vspace;*)
2021-10-21 18:10:47 +01:00
trs :=
2021-10-25 14:07:05 +01:00
[Pdftransform.Translate (px +. cex +. total_fit_extra_hspace, py +. cey +. total_fit_extra_vspace)]
2021-10-21 18:10:47 +01:00
2021-10-19 16:56:39 +01:00
2021-10-19 17:55:03 +01:00
let x = int_of_float fx in
let y = int_of_float fy in
2021-10-21 18:36:23 +01:00
let final_full_cols = !len mod x in
2021-10-25 14:45:55 +01:00
let final_full_rows = !len mod y in
2021-10-19 16:56:39 +01:00
let order row col =
2021-10-19 17:55:03 +01:00
((if btt then y - row - 1 else row), (if rtl then x - col - 1 else col))
2021-10-19 16:56:39 +01:00
if columns then
2021-10-19 16:18:15 +01:00
for col = 0 to x - 1 do
2021-10-21 18:10:47 +01:00
if center && !len < y then if !cent_extra_y = 0. then cent_extra_y := ~-.(height *. float_of_int (y - !len)) /. 2.;
2021-10-19 16:56:39 +01:00
for row = y - 1 downto 0 do
2021-10-25 14:45:55 +01:00
let original_row = row in
2021-10-19 16:56:39 +01:00
let row, col = order row col in
2021-10-25 14:45:55 +01:00
let adjusted_row =
let final_empty_rows = y - final_full_rows in
if center && !len <= final_full_rows then original_row + (y - 1 - 1 - (final_empty_rows / 2)) else original_row
if !len > 0 then addtr x y adjusted_row col (width *. float_of_int col) (height *. float_of_int row);
len := !len - 1
2021-10-19 16:56:39 +01:00
2021-10-19 16:18:15 +01:00
2021-10-19 16:56:39 +01:00
for row = y - 1 downto 0 do
2021-10-21 18:10:47 +01:00
if center && !len < x then if !cent_extra_x = 0. then cent_extra_x := (width *. float_of_int (x - !len)) /. 2.;
2021-10-19 16:56:39 +01:00
for col = 0 to x - 1 do
2021-10-21 18:36:23 +01:00
let original_col = col in
2021-10-19 16:56:39 +01:00
let row, col = order row col in
2021-10-21 18:10:47 +01:00
let adjusted_col =
2021-10-21 18:36:23 +01:00
let final_empty_cols = x - final_full_cols in
if center && !len <= final_full_cols then original_col + (x - 1 - 1 - (final_empty_cols / 2)) else original_col
2021-10-21 18:10:47 +01:00
2021-10-25 14:45:55 +01:00
if !len > 0 then addtr x y row adjusted_col (width *. float_of_int col) (height *. float_of_int row);
len := !len - 1
2021-10-19 16:56:39 +01:00
2021-10-25 15:35:35 +01:00
map (if fit then make_margin output_mediabox margin else Fun.id) (rev !trs)
2021-10-19 16:18:15 +01:00
2019-07-08 15:42:19 +01:00
(* Combine two pages into one throughout the document. The pages have already
2021-10-19 16:18:15 +01:00
had their objects renumbered so as not to clash. *)
2021-10-26 15:09:07 +01:00
(* Combine a non-empty list of pages into a single page record.
   - The transforms are computed from the media box of the first page and the
     number of pages.
   - Pattern matrices are changed page by page with
     [change_pattern_matrices_page] before resources are combined.
   - Resources and rest dictionaries are folded together with [pair_reduce].
   - Each page content is wrapped in q, cm transform, Q. When [fast] is set
     the existing streams are kept and merely bracketed; otherwise the content
     is parsed and prefixed with the result of [Pdfpage.protect].
   - [transform_annotations] is called on each page rest for its effect.
   - The result uses [output_mediabox] and the rotation of the first page.
   An empty page list is a programming error (assert false).
   NOTE(review): this listing carries stray blame timestamps and appears to
   have lost some lines (the head of the transforms call, in / else, and the
   function which combines the per-page streams). Left exactly as found. *)
let impose_pages fit x y columns rtl btt center margin output_mediabox fast fit_extra_hspace fit_extra_vspace pdf = function
2013-08-20 15:32:57 +01:00
| [] -> assert false
| (h::_) as pages ->
let transforms =
2021-10-26 16:39:26 +01:00
fit x y columns rtl btt center margin h.Pdfpage.mediabox
output_mediabox fit_extra_hspace fit_extra_vspace (length pages)
2013-08-20 15:32:57 +01:00
(* Change the pattern matrices before combining resources *)
let pages, h =
2021-10-19 16:18:15 +01:00
let r = map2 (fun p t -> change_pattern_matrices_page pdf t p) pages transforms in
(r, List.hd r)
let resources' = pair_reduce (combine_pdf_resources pdf) (map (fun p -> p.Pdfpage.resources) pages) in
let rest' = pair_reduce (combine_pdf_rests pdf) (map (fun p -> p.Pdfpage.rest) pages) in
2013-08-20 15:32:57 +01:00
let content' =
2021-10-26 16:18:09 +01:00
let transform_stream transform contents =
(* If fast, no mismatched q/Q protection and no parsing of operators. *)
if fast then
2021-10-26 16:39:26 +01:00
[Pdfops.stream_of_ops [Pdfops.Op_q; Pdfops.Op_cm transform]] @ contents @ [Pdfops.stream_of_ops [Pdfops.Op_Q]]
2021-10-26 16:18:09 +01:00
(* If slow, use protect from Pdfpage. *)
let ops = Pdfpage.protect pdf resources' contents @ Pdfops.parse_operators pdf resources' contents in
2021-10-26 16:39:26 +01:00
[Pdfops.stream_of_ops ([Pdfops.Op_q] @ [Pdfops.Op_cm transform] @ ops @ [Pdfops.Op_Q])]
2013-08-20 15:32:57 +01:00
2020-03-04 10:50:32 -08:00
2021-10-26 16:39:26 +01:00
(fun p t -> transform_annotations pdf t p.Pdfpage.rest; transform_stream t p.Pdfpage.content)
2013-08-20 15:32:57 +01:00
2021-10-21 13:51:14 +01:00
{Pdfpage.mediabox = output_mediabox;
2013-08-20 15:32:57 +01:00
Pdfpage.rotate = h.Pdfpage.rotate;
Pdfpage.content = content';
2019-07-08 15:42:19 +01:00
Pdfpage.resources = resources';
2021-10-19 16:18:15 +01:00
Pdfpage.rest = rest'}
2021-10-25 18:48:28 +01:00
(* For fit, we scale contents, move to middle and retain page size. For xy, we
2021-10-26 16:18:09 +01:00
expand mediabox and move contents to middle. This function also does the hard boxing. *)
2021-10-25 18:48:28 +01:00
(* Prepare every page for imposition with [spacing] between pages.
   - All pages are first hard-boxed to their /MediaBox.
   - A spacing of zero returns the hard-boxed document.
   - margin is half the spacing; width and height come from the first page.
   - When [fit] is true the contents are scaled by (width - spacing) / width
     about the bottom left and shifted by (margin, margin), so the page size
     is kept.
   - Otherwise the contents are shifted by (margin, margin) and a box of
     (0, 0, width + spacing, height + spacing) is applied to every page.
   NOTE(review): the heads of the two outer calls and the else are missing
   from this listing, so which functions apply the shift and the box cannot be
   confirmed from here. Left exactly as found. *)
let make_space fit ~fast spacing pdf =
2021-10-25 18:27:40 +01:00
let endpage = Pdfpage.endpage pdf in
let all = ilist 1 endpage in
2021-10-26 16:18:09 +01:00
let pdf = hard_box pdf all "/MediaBox" false fast in
if spacing = 0. then pdf else
let margin = spacing /. 2. in
2021-10-25 18:27:40 +01:00
let firstpage = hd (Pdfpage.pages_of_pagetree pdf) in
2021-10-25 18:48:28 +01:00
let width, height =
2021-10-25 18:27:40 +01:00
match Pdf.parse_rectangle firstpage.Pdfpage.mediabox with
2021-10-25 18:48:28 +01:00
xmin, ymin, xmax, ymax -> (xmax -. xmin, ymax -. ymin)
2021-10-25 18:27:40 +01:00
2021-10-25 18:48:28 +01:00
if fit then
2021-10-26 16:18:09 +01:00
2021-10-25 18:27:40 +01:00
(many (margin, margin) endpage)
2021-10-25 18:48:28 +01:00
(scale_contents ~fast (Cpdfposition.BottomLeft 0.) ((width -. spacing) /. width) pdf all)
2021-10-26 16:18:09 +01:00
2021-10-25 18:48:28 +01:00
2021-10-26 16:18:09 +01:00
2021-10-25 18:48:28 +01:00
(many (0., 0., width +. spacing, height +. spacing) endpage)
2021-10-26 16:18:09 +01:00
(shift_pdf ~fast (many (margin, margin) endpage) pdf all) all)
2021-10-25 18:27:40 +01:00
2021-10-26 15:09:07 +01:00
(* We add the border as a thick unfilled rectangle just inside the page edge,
only if its linewidth is > 0 since, for us, 0 means none, not single-pixel
like in PDF. *)
(* Stroke a black border of [linewidth] just inside the edge of every page.
   A [linewidth] of zero means no border and returns [pdf] unchanged. The
   rectangle is sized from the first page: it is inset by half the line width
   on each side so the whole stroke falls inside the page. *)
let add_border linewidth ~fast pdf =
  if linewidth = 0. then pdf else
    let firstpage = hd (Pdfpage.pages_of_pagetree pdf) in
    (* Use the extent of the box, not its upper right corner, so a media box
       whose origin is not (0, 0) still gets a correctly sized border. *)
    let xmin, ymin, xmax, ymax = Pdf.parse_rectangle firstpage.Pdfpage.mediabox in
    let w = xmax -. xmin in
    let h = ymax -. ymin in
    addrectangle
      fast (w -. linewidth, h -. linewidth) (RGB (0., 0., 0.)) true linewidth 1. (Cpdfposition.BottomLeft (linewidth /. 2.))
      false false (ilist 1 (Pdfpage.endpage pdf)) pdf
2021-10-19 16:18:15 +01:00
(* Impose the pages of [pdf] several to a sheet.
   - Preparation: crop boxes are copied to media boxes, cropping is removed,
     pages are made upright, the border is added and spacing is made.
   - w and h are read from the first page after preparation.
   - When [fit] is true, [x] and [y] are the output page size: the number of
     pages across and down is floor (x / w) and floor (y / h), with a small
     tolerance, and the leftover space is divided evenly into across + 1 and
     down + 1 gaps. It is an error if not even one page fits.
   - Otherwise [x] and [y] are grid counts; a zero in one of them means a
     single row or column holding every page, and both zero is an error.
   - The output media box is x by y when fitting; otherwise the grid size plus
     twice [margin].
   - Pages are split into groups of n, each group renumbered and combined by
     [impose_pages], and the page tree is replaced, mapping old page k to new
     page (k + n - 1) / n.
   - When not fitting, the result is finally shifted by (margin, margin).
   NOTE(review): this listing carries stray blame timestamps and appears to
   have lost lines (in / else, part of the tuple in the fit branch, and the
   head of the call which maps impose_pages over the groups). Left exactly as
   found. *)
let impose ~x ~y ~fit ~columns ~rtl ~btt ~center ~margin ~spacing ~linewidth ~fast pdf =
2021-10-26 16:18:09 +01:00
let endpage = Pdfpage.endpage pdf in
let pagenums = ilist 1 endpage in
let pdf = copy_cropbox_to_mediabox pdf pagenums in
let pdf = remove_cropping_pdf pdf pagenums in
let pdf = upright pagenums pdf in
2021-10-26 15:09:07 +01:00
let pdf = add_border linewidth ~fast pdf in
2021-10-25 18:48:28 +01:00
let pdf = make_space fit ~fast spacing pdf in
2021-10-20 18:51:53 +01:00
let firstpage = hd (Pdfpage.pages_of_pagetree pdf) in
let _, _, w, h = Pdf.parse_rectangle firstpage.Pdfpage.mediabox in
2021-10-19 19:26:02 +01:00
let ix = int_of_float x in
let iy = int_of_float y in
2021-10-21 18:10:47 +01:00
let n, ix, iy, fit_extra_hspace, fit_extra_vspace =
2021-10-20 18:51:53 +01:00
if fit then
(* +. 0.001 ensures a page always fits on itself, or on another page of same height or width. *)
let across = int_of_float (floor (x /. w +. 0.001)) in
let down = int_of_float (floor (y /. h +. 0.001)) in
if across < 1 || down < 1 then error "Not even a single page would fit." else
2021-10-21 18:10:47 +01:00
let excess_hspace = x -. float_of_int across *. w in
let excess_vspace = y -. float_of_int down *. h in
2021-10-25 14:45:55 +01:00
(*Printf.printf "across = %i, down =%i, excess_hspace = %f, excess_hspace = %f\n" across down excess_hspace excess_vspace;*)
2021-10-21 18:10:47 +01:00
(across * down,
excess_hspace /. (float_of_int across +. 1.),
excess_vspace /. (float_of_int down +. 1.))
2021-10-20 18:51:53 +01:00
if ix = 0 && iy = 0 then error "impose-xy: both dimensions cannot be zero" else
2021-10-21 18:10:47 +01:00
if ix = 0 then (endpage, endpage, 1, 0., 0.)
else if iy = 0 then (endpage, 1, endpage, 0., 0.)
else (ix * iy, ix, iy, 0., 0.)
2021-10-19 19:26:02 +01:00
2021-10-19 16:18:15 +01:00
let mediabox' =
if fit then Pdf.Array [Pdf.Real 0.; Pdf.Real 0.; Pdf.Real x; Pdf.Real y] else
2021-10-25 15:35:35 +01:00
let m2 = margin *. 2. in
if x = 0.0 then Pdf.Array [Pdf.Real 0.; Pdf.Real 0.; Pdf.Real (w *. float_of_int endpage +. m2); Pdf.Real (h +. m2)]
else if y = 0.0 then Pdf.Array [Pdf.Real 0.; Pdf.Real 0.; Pdf.Real (w +. m2); Pdf.Real (h *. float_of_int endpage +. m2)]
else Pdf.Array [Pdf.Real 0.; Pdf.Real 0.; Pdf.Real (w *. x +. m2); Pdf.Real (h *. y +. m2)]
2021-10-19 16:18:15 +01:00
let pages = Pdfpage.pages_of_pagetree pdf in
let pagesets = splitinto n pages in
let renumbered = map (Pdfpage.renumber_pages pdf) pagesets in
2021-10-25 15:35:35 +01:00
let pages =
2021-10-20 18:51:53 +01:00
2021-10-25 15:35:35 +01:00
(impose_pages fit (float_of_int ix) (float_of_int iy) columns rtl btt
2021-10-26 15:09:07 +01:00
center margin mediabox' fast fit_extra_hspace fit_extra_vspace pdf)
2021-10-20 18:51:53 +01:00
2021-10-19 16:18:15 +01:00
let changes = map (fun x -> (x, (x + (n - 1)) / n)) pagenums in
2021-10-25 15:35:35 +01:00
let pdf = Pdfpage.change_pages ~changes true pdf pages in
if fit then pdf else shift_pdf ~fast (many (margin, margin) (length pages)) pdf (ilist 1 (Pdfpage.endpage pdf))
2013-08-20 15:32:57 +01:00
2021-10-20 18:51:53 +01:00
(* Legacy -twoup-stack. Impose 2x1 on a page twice the size then rotate. *)
2021-10-20 14:00:33 +01:00
(* Legacy -twoup-stack: impose the pages two across on a sheet twice the
   width, then rotate every resulting page by -90 degrees and make it
   upright. *)
let twoup_stack fast pdf =
  let pdf =
    impose
      ~x:2. ~y:1. ~fit:false ~columns:false ~rtl:false ~btt:false ~center:false
      ~margin:0. ~spacing:0. ~linewidth:0. ~fast pdf
  in
  let all = ilist 1 (Pdfpage.endpage pdf) in
  upright ~fast all (rotate_pdf ~-90 pdf all)
2021-10-25 14:45:55 +01:00
(* Legacy -two-up. Rotate the pages and shrink them so as to fit 2x1 on a page the same size. *)
2021-10-20 14:00:33 +01:00
(* Legacy -twoup: shrink every page so that two fit side by side, impose them
   two across, rotate the result by -90 degrees, and finally scale each page
   to fit the width and height of the original first page. *)
let twoup fast pdf =
  let firstpage = hd (Pdfpage.pages_of_pagetree pdf) in
  let width, height =
    match Pdf.parse_rectangle firstpage.Pdfpage.mediabox with
      xmin, ymin, xmax, ymax -> xmax -. xmin, ymax -. ymin
  in
  let width_exceeds_height = width > height in
  (* Scale factor chosen from the page aspect ratio so that a rotated pair
     fits on a page of the original size. *)
  let sc =
    if width_exceeds_height
    then fmin (height /. width) ((width /. 2.) /. height)
    else fmin (width /. height) ((height /. 2.) /. width)
  in
  let endpage = Pdfpage.endpage pdf in
  let all = ilist 1 endpage in
  let pdf = scale_pdf ~fast (many (sc, sc) endpage) pdf all in
  let pdf =
    impose
      ~x:2. ~y:1. ~fit:false ~columns:false ~rtl:false ~btt:false ~center:true
      ~margin:0. ~spacing:0. ~linewidth:0. ~fast pdf
  in
  (* The page count has changed, so recompute the page list. *)
  let endpage = Pdfpage.endpage pdf in
  let all = ilist 1 endpage in
  let pdf = upright all (rotate_pdf ~-90 pdf all) in
  scale_to_fit_pdf ~fast Cpdfposition.Diagonal 1. (many (width, height) endpage) () pdf all
2013-08-20 15:32:57 +01:00
(* \section{Output info} *)
(* [get_info raw pdf] returns a lookup function over the /Info dictionary of
   [pdf]. Given a key such as /Title, the function returns the string value
   (unchanged when [raw], otherwise passed through [crude_de_unicode]), True
   or False for booleans, and the empty string for anything else. A missing
   /Trapped entry reads as False. *)
let get_info raw pdf =
  let infodict =
    match Pdf.lookup_direct pdf "/Info" pdf.Pdf.trailerdict with
    | Some infodict -> infodict
    | _ -> Pdf.Dictionary []
  in
  let getstring name =
    match Pdf.lookup_direct pdf name infodict with
    | Some (Pdf.String s) ->
        if raw then s else crude_de_unicode s
    | Some (Pdf.Boolean false) -> "False"
    | Some (Pdf.Boolean true) -> "True"
    | _ -> if name = "/Trapped" then "False" else ""
  in
  getstring
2013-08-20 15:32:57 +01:00
(* [get_info_utf8 pdf] returns a lookup function over the /Info dictionary of
   [pdf] which converts string values to UTF8. Booleans read as True or False,
   anything else as the empty string, and a missing /Trapped entry reads as
   False. *)
let get_info_utf8 pdf =
  let infodict =
    match Pdf.lookup_direct pdf "/Info" pdf.Pdf.trailerdict with
    | Some infodict -> infodict
    | _ -> Pdf.Dictionary []
  in
  (function name ->
    match Pdf.lookup_direct pdf name infodict with
    | Some (Pdf.String s) -> Pdftext.utf8_of_pdfdocstring s
    | Some (Pdf.Boolean false) -> "False"
    | Some (Pdf.Boolean true) -> "True"
    | _ -> if name = "/Trapped" then "False" else "")
2013-08-20 15:32:57 +01:00
(* Select the /Info lookup function matching the requested text encoding. *)
let getstring encoding pdf =
  match encoding with
  | UTF8 -> get_info_utf8 pdf
  | Stripped -> get_info false pdf
  | Raw -> get_info true pdf
2014-09-08 17:55:14 +01:00
2013-08-20 15:32:57 +01:00
(* Print the PDF version, the page count and the standard /Info fields to
   standard output, one labelled line each, using [encoding] for the text. *)
let output_info encoding pdf =
  let lookup = getstring encoding pdf in
  Printf.printf "Version: %i.%i\n" pdf.Pdf.major pdf.Pdf.minor;
  Printf.printf "Pages: %i\n" (Pdfpage.endpage pdf);
  List.iter
    (fun (label, key) -> Printf.printf "%s: %s\n" label (lookup key))
    [("Title", "/Title");
     ("Author", "/Author");
     ("Subject", "/Subject");
     ("Keywords", "/Keywords");
     ("Creator", "/Creator");
     ("Producer", "/Producer");
     ("Created", "/CreationDate");
     ("Modified", "/ModDate");
     ("Trapped", "/Trapped")]
(* A parsed XML document: E is an element with its tag and children, D is
   character data. *)
type xmltree =
2021-10-05 17:16:13 +01:00
E of Cpdfxmlm.tag * xmltree list
| D of string
(* Parse XML held in bytes, returning what Cpdfxmlm.input_doc_tree yields:
   the optional DTD paired with the document tree. *)
let xmltree_of_bytes b =
  let input = Cpdfxmlm.make_input (`String (0, string_of_bytes b)) in
  Cpdfxmlm.input_doc_tree
    ~el:(fun tag children -> E (tag, children))
    ~data:(fun text -> D text)
    input
2019-06-28 16:11:31 +01:00
(* Serialise a (dtd, tree) pair back to bytes. This is the inverse of
   [xmltree_of_bytes]. *)
let bytes_of_xmltree t =
  let buf = Buffer.create 1024 in
  let o = Cpdfxmlm.make_output (`Buffer buf) in
  let frag = function
    | E (tag, childs) -> `El (tag, childs)
    | D d -> `Data d
  in
  Cpdfxmlm.output_doc_tree frag o t;
  bytes_of_string (Buffer.contents buf)
2014-10-15 13:14:33 +01:00
(* Debug printers for XML trees. Lists of attributes and of subtrees are
   rendered with a single space before each item. *)
let rec string_of_xmltree = function
  | D d ->
      Printf.sprintf "DATA {%s}" d
  | E (tag, trees) ->
      Printf.sprintf "ELT (%s, %s)"
        (string_of_tag tag)
        (string_of_xmltrees trees)

and string_of_tag ((n, n'), attributes) =
  Printf.sprintf
    "NAME |%s| |%s|, ATTRIBUTES {%s}" n n'
    (string_of_attributes attributes)

and string_of_attribute ((n, n'), str) =
  Printf.sprintf "ATTRNAME |%s| |%s|, STR {%s}" n n' str

and string_of_attributes attrs =
  (* Same text as folding with a space separator from the empty string, but
     built in one pass. *)
  String.concat "" (map (fun a -> " " ^ string_of_attribute a) attrs)

and string_of_xmltrees trees =
  String.concat "" (map (fun t -> " " ^ string_of_xmltree t) trees)
(* XML namespace URIs used when reading and writing XMP metadata. *)
let adobe = "http://ns.adobe.com/pdf/1.3/"
let xmp = "http://ns.adobe.com/xap/1.0/"
let dc = "http://purl.org/dc/elements/1.1/"
let rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
(* Join a list of strings. The fold inserts a comma and a space between
   items, adding the separator only once the accumulated text is non-empty.
   NOTE(review): the fold is parenthesised as an argument, but the head of the
   enclosing call is missing from this listing, so any post-processing of the
   joined text cannot be confirmed from here. *)
let combine_with_spaces strs =
2021-08-12 20:38:55 +01:00
2014-10-15 13:14:33 +01:00
(fold_left (fun x y -> x ^ (if x <> "" then ", " else "") ^ y) "" strs)
(* Collect all <li> elements inside a seq, bag, or alt. Combine with commas. If
none found, return empty string instead. *)
(* Given an rdf Alt, Seq or Bag element, gather the text of each rdf li child
   which holds exactly one data node and join the results with
   [combine_with_spaces]. Any other tree yields the empty string. *)
let collect_list_items = function
  | E (((n, n'), _), elts) when
      n = rdf && (n' = "Alt" || n' = "Seq" || n' = "Bag")
    ->
      combine_with_spaces
        (option_map
           (function
            | E (((n, n'), _), [D d]) when n = rdf && n' = "li" ->
                Some d
            | _ -> None)
           elts)
  | _ -> ""
(* Apply [collect_list_items] to the first element node among [all]; data
   nodes are skipped. Returns None when there is no element node. *)
let collect_list_items_all all =
  let is_element = function E _ -> true | D _ -> false in
  match List.find_opt is_element all with
  | Some first -> Some (collect_list_items first)
  | None -> None
(* Search a tree for the element [name] in [namespace]. When it
   holds a single data node that text is returned; otherwise its children are
   treated as an rdf list container. Elements with another name are searched
   recursively, and the first child that yields a result wins. *)
let rec get_data_for namespace name = function
  | D _ -> None
  | E (((n, n'), _), children) when n = namespace && n' = name ->
      begin match children with
      | [D d] -> Some d
      | _ -> collect_list_items_all children
      end
  | E (_, children) ->
      begin match option_map (get_data_for namespace name) children with
      | found :: _ -> Some found
      | [] -> None
      end
(* Print each XMP metadata field which is present, one labelled line per
   field, to standard output. Nothing is printed when the document has no
   metadata stream. This is best-effort reporting: if the metadata cannot be
   parsed, whatever was printed so far stands and the failure is ignored.
   [encoding] is accepted for interface compatibility and is not used. *)
let output_xmp_info encoding pdf =
  let print_out tree title namespace name =
    match get_data_for namespace name tree with
    | None -> ()
    | Some data ->
        Printf.printf "%s: " title;
        print_endline data
  in
  match get_metadata pdf with
  | None -> ()
  | Some metadata ->
      try
        let _, tree = xmltree_of_bytes metadata in
        List.iter
          (fun (title, namespace, name) -> print_out tree title namespace name)
          [("XMP pdf:Keywords", adobe, "Keywords");
           ("XMP pdf:Producer", adobe, "Producer");
           ("XMP pdf:Trapped", adobe, "Trapped");
           ("XMP pdf:Title", adobe, "Title");
           ("XMP pdf:Creator", adobe, "Creator");
           ("XMP pdf:Subject", adobe, "Subject");
           ("XMP pdf:Author", adobe, "Author");
           ("XMP pdf:CreationDate", adobe, "CreationDate");
           ("XMP pdf:ModDate", adobe, "ModDate");
           ("XMP xmp:CreateDate", xmp, "CreateDate");
           ("XMP xmp:CreatorTool", xmp, "CreatorTool");
           ("XMP xmp:MetadataDate", xmp, "MetadataDate");
           ("XMP xmp:ModifyDate", xmp, "ModifyDate");
           ("XMP dc:title", dc, "title");
           ("XMP dc:creator", dc, "creator");
           ("XMP dc:subject", dc, "subject");
           ("XMP dc:description", dc, "description")]
      with
        _ -> ()
2013-08-20 15:32:57 +01:00
2020-11-19 18:11:35 +00:00
(* Get XMP info equivalent of an old metadata field *)
2019-07-03 14:40:32 +01:00
(* Map an /Info key to the XMP (namespace, name) pairs to be consulted for
   the same field, in order. Fails on a key with no known mapping. *)
let check key =
  let equivalents =
    [("/Title", [(adobe, "Title"); (dc, "title")]);
     ("/Author", [(adobe, "Author"); (dc, "creator")]);
     ("/Subject", [(adobe, "Subject"); (dc, "subject")]);
     ("/Keywords", [(adobe, "Keywords")]);
     ("/Creator", [(adobe, "Creator"); (xmp, "CreatorTool")]);
     ("/Producer", [(adobe, "Producer")]);
     ("/CreationDate", [(adobe, "CreationDate"); (xmp, "CreateDate")]);
     ("/ModDate", [(adobe, "ModificationDate"); (xmp, "ModifyDate")])]
  in
  match List.assoc_opt key equivalents with
  | Some fields -> fields
  | None -> failwith "Cpdf.check_name not known"
2019-07-03 14:40:32 +01:00
(* Return the XMP value equivalent to the /Info field [name]: the first
   non-empty value among the fields listed by [check]. Returns the empty
   string when there is no metadata, no such value, or the metadata cannot be
   parsed. An unknown [name] fails in [check] before any metadata is read. *)
let get_xmp_info pdf name =
  let tocheck = check name in
  match get_metadata pdf with
  | None -> ""
  | Some metadata ->
      try
        let _, tree = xmltree_of_bytes metadata in
        let results = map (fun (kind, key) -> match get_data_for kind key tree with Some x -> x | None -> "") tocheck in
        match lose (eq "") results with
        | x::_ -> x
        | [] -> ""
      with
        _ -> ""
2019-06-28 16:11:31 +01:00
(* Set XMP info *)
2019-07-01 14:40:22 +01:00
(* Return a copy of the tree in which every element [fieldname] in namespace
   [kind] has its children replaced by the single data node [value]. This
   covers an empty element too. All other nodes are preserved. *)
let rec set_xml_field kind fieldname value tree =
  match tree with
  | D text -> D text
  | E (((ns, local), attrs), _) when ns = kind && local = fieldname ->
      E (((ns, local), attrs), [D value])
  | E (tag, children) ->
      E (tag, map (set_xml_field kind fieldname value) children)
2019-06-28 16:11:31 +01:00
2019-07-01 14:40:22 +01:00
(* Set one XMP field in serialised metadata. [value] must be a PDF string or
   boolean; booleans are written as True or False. Returns the new serialised
   metadata. Fails on any other kind of value. [pdf] is not used. *)
let set_pdf_info_xml kind fieldname value xmldata pdf =
  let dtd, tree = xmltree_of_bytes xmldata in
  let str =
    match value with
    | Pdf.String s -> s
    | Pdf.Boolean true -> "True"
    | Pdf.Boolean false -> "False"
    | _ -> failwith "set_pdf_info_xml: not a string"
  in
  let newtree = set_xml_field kind fieldname str tree in
  bytes_of_xmltree (dtd, newtree)
2019-07-01 14:40:22 +01:00
(* Apply [set_pdf_info_xml] with the same [value] for each
   (namespace, fieldname) pair in [changes], in order, threading the
   serialised metadata through. Returns the final metadata. *)
let set_pdf_info_xml_many changes value xmldata pdf =
  fold_left
    (fun data (kind, fieldname) ->
       set_pdf_info_xml kind fieldname value data pdf)
    xmldata
    changes
2019-06-29 15:03:22 +01:00
2019-06-28 16:11:31 +01:00
(* \section{Set an entry in the /Info dictionary} *)
2019-06-29 12:50:10 +01:00
(* We must parse the date to get its components, then use strftime to build the
* new string in XMP format *)
(* The components of a PDF date string. Fields are mutable because the parser
   fills them in one at a time as far as the input goes. *)
type date =
{mutable year : int;
mutable month : int; (* 1 - 12 *)
mutable day : int; (* 1 - 31 *)
mutable hour : int; (* 0 - 23 *)
mutable minute : int; (* 0 - 59 *)
mutable second : int; (* 0 - 59 *)
mutable ut_relationship : int; (* -1, 0, +1 *)
mutable offset_hours : int; (* 0 - 23 *)
mutable offset_minutes : int (* 0 - 59 *)}
(* A fresh date record holding the defaults used for any component which the
   input string does not supply: the first of January, year 0, at midnight,
   with no offset from UT. *)
let default_date () =
  {year = 0; month = 1; day = 1;
   hour = 0; minute = 0; second = 0;
   ut_relationship = 0; offset_hours = 0; offset_minutes = 0}
(* XMP date format is YYYY-MM-DDThh:mm:ssTZD *)
(* Build an XMP date string from date components.
   The time zone designator tzd is Z when the UT relationship and both
   offsets are zero; otherwise it is a sign (plus for a non-negative
   relationship, minus otherwise) followed by two-digit hours, a colon and
   two-digit minutes. The time record passed on converts month to zero-based
   and year to years since 1900.
   NOTE(review): the head of the formatting call, and the place where tzd is
   appended, are missing from this listing; presumably a Cpdfstrftime call
   with a date format string. Confirm against the full file. *)
let make_xmp_date_from_components d =
let tzd =
2019-06-29 15:03:22 +01:00
if d.ut_relationship = 0 && d.offset_hours = 0 && d.offset_minutes = 0 then "Z" else
(if d.ut_relationship >=0 then "+" else "-") ^
2019-06-29 15:03:22 +01:00
Printf.sprintf "%02i" d.offset_hours ^
":" ^
2019-06-29 15:03:22 +01:00
Printf.sprintf "%02i" d.offset_minutes
~time:{Cpdfstrftime._tm_sec = d.second;
Cpdfstrftime._tm_min = d.minute;
Cpdfstrftime._tm_hour = d.hour;
Cpdfstrftime._tm_mday = d.day;
Cpdfstrftime._tm_mon = d.month - 1;
Cpdfstrftime._tm_year = d.year - 1900;
Cpdfstrftime._tm_wday = 0;
Cpdfstrftime._tm_yday = 0;
Cpdfstrftime._tm_isdst = false}
2019-06-29 12:50:10 +01:00
(* Convert a PDF date string to an XMP date string.
   The input is expected to start with D: followed by a four-digit year.
   After that, two-digit month, day, hour, minute and second, a UT
   relationship character, and offset hours and minutes are each optional,
   but only as a prefix: parsing stops at the first component which is
   absent. Each parsed component overwrites the default in a fresh record
   from [default_date]. Stopping is signalled by raising Exit, whose handler
   (the last line) formats whatever has been collected.
   A string without the D: prefix or without a year is reported on standard
   error and the defaults are formatted instead.
   NOTE(review): int_of_string is applied to the digit pairs; no handler for
   its failure is visible in this listing. The listing also carries stray
   blame timestamps and appears to have lost the try / with / end lines and
   the final branch of the ut_relationship conditional. Left exactly as
   found. *)
let xmp_date date =
let d = default_date () in
match explode date with
'D'::':'::r ->
begin match r with
y1::y2::y3::y4::r ->
d.year <- int_of_string (implode [y1; y2; y3; y4]);
2019-06-29 12:50:10 +01:00
begin match r with
m1::m2::r ->
2019-06-29 16:03:29 +01:00
d.month <- int_of_string (implode [m1; m2]);
2019-06-29 12:50:10 +01:00
begin match r with
d1::d2::r ->
d.day <- int_of_string (implode [d1; d2]);
2019-06-29 12:50:10 +01:00
begin match r with
h1::h2::r ->
d.hour <- int_of_string (implode [h1; h2]);
2019-06-29 12:50:10 +01:00
begin match r with
m1::m2::r ->
d.minute <- int_of_string (implode [m1; m2]);
2019-06-29 12:50:10 +01:00
begin match r with
s1::s2::r ->
d.second <- int_of_string (implode [s1; s2]);
2019-06-29 12:50:10 +01:00
begin match r with
o::r ->
d.ut_relationship <-
if o = '+' then 1 else
if o = '-' then -1 else
2019-06-29 12:50:10 +01:00
begin match r with
h1::h2::'\''::r ->
d.offset_hours <- int_of_string (implode [h1; h2]);
2019-06-29 12:50:10 +01:00
begin match r with
m1::m2::_ ->
d.offset_minutes <- int_of_string (implode [m1; m2]);
2019-06-29 12:50:10 +01:00
raise Exit
| _ -> raise Exit
| _ -> raise Exit
| _ -> raise Exit
| _ -> raise Exit
| _ -> raise Exit
| _ -> raise Exit
| _ -> raise Exit
| _ -> raise Exit
2021-04-27 18:50:28 +01:00
| _ ->
2021-07-23 16:26:07 +01:00
Printf.eprintf "xmp_date: Malformed date string (no year): %s\n%!" date;
2021-04-27 18:50:28 +01:00
make_xmp_date_from_components d
2019-06-29 12:50:10 +01:00
2021-04-27 18:50:28 +01:00
| _ ->
2021-07-23 16:26:07 +01:00
Printf.eprintf "xmp_date: Malformed date string (no prefix): %s\n%!" date;
2021-04-27 18:50:28 +01:00
make_xmp_date_from_components d
2019-06-29 12:50:10 +01:00
Exit -> make_xmp_date_from_components d
2019-07-01 14:40:22 +01:00
(* Set one entry of the /Info dictionary and, optionally, the matching XMP
   fields.
   - (key, value, version): the /Info key, the new value, and a PDF minor
     version; the document minor version is raised to at least this number.
   - A new /Info dictionary object holding the entry is always added to the
     document. The trailer is pointed at it unless [xmp_just_set] is true.
   - When [xmp_also] is true and the document has XMP metadata, the fields
     corresponding to [key] are set to the same value. Date values are
     converted to XMP date format first. An unknown key, or a date value
     which is not a string, fails.
   NOTE(review): the end of the conditional, the in after the key match and
   the head of the final call which stores the new metadata are missing from
   this listing, as is the result returned when [xmp_also] is false. Left
   exactly as found. *)
let set_pdf_info ?(xmp_also=false) ?(xmp_just_set=false) (key, value, version) pdf =
2019-06-28 16:11:31 +01:00
let infodict =
match Pdf.lookup_direct pdf "/Info" pdf.Pdf.trailerdict with
| Some d -> d
| None -> Pdf.Dictionary []
let infodict' = Pdf.add_dict_entry infodict key value in
let objnum = Pdf.addobj pdf infodict' in
if not xmp_just_set then
pdf.Pdf.trailerdict <-
Pdf.add_dict_entry pdf.Pdf.trailerdict "/Info" (Pdf.Indirect objnum);
pdf.Pdf.minor <-
max pdf.Pdf.minor version
2019-07-01 14:40:22 +01:00
if xmp_also then
2019-06-28 16:11:31 +01:00
begin match get_metadata pdf with
None -> pdf
| Some xmldata ->
2019-06-29 12:50:10 +01:00
let xmp_date = function Pdf.String s -> Pdf.String (xmp_date s) | _ -> failwith "xmp_date not a string" in
let changes, value =
2019-06-28 17:02:59 +01:00
match key with
2019-06-29 12:50:10 +01:00
| "/Producer" -> [(adobe, "Producer")], value
| "/Creator" -> [(adobe, "Creator"); (xmp, "CreatorTool"); (dc, "creator")], value
| "/Author" -> [(adobe, "Author")], value
| "/Title" -> [(adobe, "Title"); (dc, "title")], value
| "/Subject" -> [(adobe, "Subject"); (dc, "subject")], value
| "/Keywords" -> [(adobe, "Keywords")], value
| "/CreationDate" -> [(adobe, "CreationDate"); (xmp, "CreateDate")], xmp_date value
| "/ModDate" -> [(adobe, "ModDate"); (xmp, "ModifyDate")], xmp_date value
| "/Trapped" -> [(adobe, "Trapped")], value
2019-06-28 17:02:59 +01:00
| _ -> failwith "Unknown call to set_pdf_info"
2019-06-28 16:11:31 +01:00
2019-07-01 14:40:22 +01:00
(set_pdf_info_xml_many changes value xmldata pdf)
2019-06-28 16:11:31 +01:00
2019-06-29 16:03:29 +01:00
(* Set metadata date *)
2019-07-01 14:40:22 +01:00
(* Set the xmp MetadataDate field of the XMP metadata. A [date] of now is
   expanded to the current time and converted to XMP date format; any other
   string is used as given. A document without metadata is returned
   unchanged.
   NOTE(review): the head of the call which stores the updated metadata back
   into the document is missing from this listing. Left exactly as found. *)
let set_metadata_date pdf date =
2019-06-29 16:03:29 +01:00
match get_metadata pdf with
None -> pdf
| Some xmldata ->
let changes= [(xmp, "MetadataDate")] in
let value = match date with "now" -> xmp_date (expand_date "now") | x -> x in
2019-07-01 14:40:22 +01:00
(set_pdf_info_xml_many changes (Pdf.String value) xmldata pdf)
2019-06-29 16:03:29 +01:00
2019-07-01 14:40:22 +01:00
(* The placeholder / value pairs used to fill in the XMP template, taken from
   the /Info dictionary of [pdf]. Creation and modification dates fall back to
   the current time when absent; the metadata date is always the current
   time. *)
let replacements pdf =
  let info = get_info_utf8 pdf in
  let date_or_now key =
    let i = info key in
    xmp_date (if i = "" then expand_date "now" else i)
  in
  [("CREATEDATE", date_or_now "/CreationDate");
   ("MODDATE", date_or_now "/ModDate");
   ("PRODUCER", info "/Producer");
   ("CREATOR", info "/Creator");
   ("TITLE", info "/Title");
   ("SUBJECT", info "/Subject");
   ("AUTHOR", info "/Author");
   ("KEYWORDS", info "/Keywords");
   ("TRAPPED", info "/Trapped");
   ("METADATADATE", xmp_date (expand_date "now"))]
(* Create XMP metadata for [pdf] from the built-in template: each placeholder
   listed by [replacements] is substituted in turn and the result is installed
   as the document metadata. *)
let create_metadata pdf =
  let filled =
    fold_left
      (fun text (s, r) -> string_replace_all s r text)
      xmp_template
      (replacements pdf)
  in
  set_metadata_from_bytes false (bytes_of_string filled) pdf
2013-08-20 15:32:57 +01:00
(* \section{Blacken text} *)
(* Algorithm: Change
     BT
     <ops>
     ET
   ...to...
     BT
     Op_g 0.
     <ops minus any color, shading or gs operators>
     ET
     <ops minus any text positioning or text rendering ones> *)
2021-11-15 11:17:15 -08:00
(* Rewrite a content stream so that all text is drawn in [colour].
   - [not_text] is false for text showing, positioning and text state
     operators and true for everything else.
   - [textlevel] counts nesting of BT / ET; [removed] collects operators seen
     inside a text object which must be replayed after it.
   - At BT the colour operator for [colour] is emitted right after the BT.
   - Inside a text object, colour, colour space, shading and gs operators are
     taken out of the text object and saved in [removed]; outside they are
     kept. Other non-text operators inside a text object are kept in place and
     also saved.
   - At ET the saved operators are emitted after the ET and the saved list is
     cleared.
   Returns a single new stream in a list.
   NOTE(review): this listing carries stray blame timestamps and appears to
   have lost lines (in, then begin / end, and the recursive call in the BT
   case). Left exactly as found. *)
let blacktext_ops colour pdf resources content =
2013-08-20 15:32:57 +01:00
let not_text = function
| Pdfops.Op_Tj _ | Pdfops.Op_TJ _
| Pdfops.Op_' _ | Pdfops.Op_'' (_, _, _)
| Pdfops.Op_Td (_, _) | Pdfops.Op_TD (_, _)
| Pdfops.Op_Tm _ | Pdfops.Op_T'
| Pdfops.Op_Tc _
| Pdfops.Op_Tw _
| Pdfops.Op_Tz _
| Pdfops.Op_TL _
| Pdfops.Op_Tf (_, _)
| Pdfops.Op_Tr _
| Pdfops.Op_Ts _ -> false
| _ -> true
in let textlevel = ref 0
in let removed = ref []
in let operators =
Pdfops.parse_operators pdf resources content
let rec remove_colourops prev = function
| [] -> rev prev
| Pdfops.Op_BT::more ->
incr textlevel;
2021-11-15 11:17:15 -08:00
(colour_op colour::Pdfops.Op_BT::prev)
2013-08-20 15:32:57 +01:00
| Pdfops.Op_ET::more ->
decr textlevel;
let prev' = !removed @ Pdfops.Op_ET::prev in
removed := [];
remove_colourops prev' more
| (Pdfops.Op_G _
| Pdfops.Op_g _
| Pdfops.Op_RG (_, _, _)
| Pdfops.Op_rg (_, _, _)
| Pdfops.Op_k (_, _, _, _)
| Pdfops.Op_K (_, _, _, _)
| Pdfops.Op_SCN _
| Pdfops.Op_SC _
| Pdfops.Op_scn _
| Pdfops.Op_sc _
| Pdfops.Op_SCNName (_, _)
| Pdfops.Op_scnName (_, _)
| Pdfops.Op_CS _
| Pdfops.Op_cs _
| Pdfops.Op_sh _
| Pdfops.Op_gs _)
as op::more ->
if !textlevel > 0
removed =| op;
remove_colourops prev more
else remove_colourops (op::prev) more
| op::more ->
if !textlevel > 0 && not_text op then removed =| op;
remove_colourops (op::prev) more
let operators' = remove_colourops [] operators in
[Pdfops.stream_of_ops operators']
(* Blacken a form xobject, writing it to the same object. *)
2021-11-15 11:17:15 -08:00
(* Apply blacktext_ops with colour c to the content of every page in range,
   and to the xobjects of each page via process_xobjects. Pages are visited
   with process_pages.
   NOTE(review): the in keywords closing the local bindings seem to be
   missing from this copy and revision timestamps are interleaved; verify
   against the repository. *)
let blacktext c range pdf =
2013-08-20 15:32:57 +01:00
let blacktext_page _ page =
let content' =
2021-11-15 11:17:15 -08:00
blacktext_ops c pdf page.Pdfpage.resources page.Pdfpage.content
2013-08-20 15:32:57 +01:00
2021-11-15 11:17:15 -08:00
process_xobjects pdf page (blacktext_ops c);
2013-08-20 15:32:57 +01:00
{page with Pdfpage.content = content'}
process_pages (ppstub blacktext_page) pdf range
2013-08-20 15:32:57 +01:00
(* \section{Blacken lines} *)
2021-11-15 11:17:15 -08:00
(* Rewrite one content stream so that stroking colour is forced: every CS
   operator is replaced by CS /DeviceRGB, and every SC, SCN, SCNName, G, RG
   and K operator is replaced by the operator colour_op_stroke builds from c.
   All other operators pass through unchanged. Returns a one-element list
   holding the re-serialised stream.
   NOTE(review): an in keyword before the final bindings seems to be missing
   from this copy and revision timestamps are interleaved; verify against the
   repository. *)
let blacklines_ops c pdf resources content =
2013-08-20 15:32:57 +01:00
(* prev accumulates the output in reverse order. *)
let rec blacken_strokeops prev = function
| [] -> rev prev
| Pdfops.Op_CS _::t ->
2016-11-09 18:15:23 +00:00
blacken_strokeops (Pdfops.Op_CS "/DeviceRGB"::prev) t
2013-08-20 15:32:57 +01:00
| (Pdfops.Op_SC _ | Pdfops.Op_SCN _ | Pdfops.Op_SCNName _ | Pdfops.Op_G _
| Pdfops.Op_RG _ | Pdfops.Op_K _)::t ->
2021-11-15 11:17:15 -08:00
blacken_strokeops (colour_op_stroke c::prev) t
2013-08-20 15:32:57 +01:00
| h::t -> blacken_strokeops (h::prev) t
and operators =
Pdfops.parse_operators pdf resources content
let operators' = blacken_strokeops [] operators in
[Pdfops.stream_of_ops operators']
2021-11-15 11:17:15 -08:00
(* Apply blacklines_ops with colour c to the content of every page in range,
   and to the xobjects of each page via process_xobjects. Pages are visited
   with process_pages.
   NOTE(review): the in keywords closing the local bindings seem to be
   missing from this copy and revision timestamps are interleaved; verify
   against the repository. *)
let blacklines c range pdf =
2013-08-20 15:32:57 +01:00
let blacklines_page _ page =
let content' =
2021-11-15 11:17:15 -08:00
blacklines_ops c pdf page.Pdfpage.resources page.Pdfpage.content
2013-08-20 15:32:57 +01:00
2021-11-15 11:17:15 -08:00
process_xobjects pdf page (blacklines_ops c);
2013-08-20 15:32:57 +01:00
{page with Pdfpage.content = content'}
process_pages (ppstub blacklines_page) pdf range
2013-08-20 15:32:57 +01:00
(* \section{Blacken Fills} *)
2021-11-15 11:17:15 -08:00
(* Rewrite one content stream so that fill colour is forced: every cs
   operator is replaced by cs /DeviceRGB, and every sc, scn, scnName, g, rg
   and k operator is replaced by the operator colour_op builds from c. All
   other operators pass through unchanged. Returns a one-element list holding
   the re-serialised stream.
   NOTE(review): an in keyword before the final bindings seems to be missing
   from this copy and revision timestamps are interleaved; verify against the
   repository. *)
let blackfills_ops c pdf resources content =
2013-08-20 15:32:57 +01:00
(* prev accumulates the output in reverse order. *)
let rec blacken_fillops prev = function
| [] -> rev prev
| Pdfops.Op_cs _::t ->
2016-11-09 18:15:23 +00:00
blacken_fillops (Pdfops.Op_cs "/DeviceRGB"::prev) t
2013-08-20 15:32:57 +01:00
| (Pdfops.Op_sc _ | Pdfops.Op_scn _ | Pdfops.Op_scnName _ | Pdfops.Op_g _
| Pdfops.Op_rg _ | Pdfops.Op_k _)::t ->
2021-11-15 11:17:15 -08:00
blacken_fillops (colour_op c::prev) t
2013-08-20 15:32:57 +01:00
| h::t -> blacken_fillops (h::prev) t
and operators =
Pdfops.parse_operators pdf resources content
let operators' = blacken_fillops [] operators in
[Pdfops.stream_of_ops operators']
2021-11-15 11:17:15 -08:00
(* Apply blackfills_ops with colour c to the content of every page in range,
   and to the xobjects of each page via process_xobjects. Pages are visited
   with process_pages.
   NOTE(review): the in keywords closing the local bindings seem to be
   missing from this copy and revision timestamps are interleaved; verify
   against the repository. *)
let blackfills c range pdf =
2013-08-20 15:32:57 +01:00
let blackfills_page _ page =
let content' =
2021-11-15 11:17:15 -08:00
blackfills_ops c pdf page.Pdfpage.resources page.Pdfpage.content
2013-08-20 15:32:57 +01:00
2021-11-15 11:17:15 -08:00
process_xobjects pdf page (blackfills_ops c);
2013-08-20 15:32:57 +01:00
{page with Pdfpage.content = content'}
process_pages (ppstub blackfills_page) pdf range
2013-08-20 15:32:57 +01:00
(* \section{Set a minimum line width to avoid dropout} *)
(* Enforce a minimum line width on the pages in range. The page content is
   parsed and rewritten by replace_operators:
   - a w operator keeps its width when it is at least width divided by the
     scale obtained from the current transformation matrix, otherwise it is
     replaced by that quotient;
   - cm, q and Q update ctmstack (compose into the top entry, push a copy,
     pop) so that the scale is known at each w;
   - a gs operator whose /ExtGState entry carries a numeric /LW gets a w
     operator with that value emitted next to it.
   Finally, when width exceeds 1, a w operator with width is placed at the
   start of the stream.
   NOTE(review): several short lines (in, with, end, try and the body of the
   scaleof_ctm match) seem to be missing from this copy and revision
   timestamps are interleaved; verify against the repository. *)
let thinlines range width pdf =
let thinpage _ page =
let operators =
Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content
let ctmstack = ref [ref Pdftransform.i_matrix] in
let scaleof_ctm () =
match Pdftransform.decompose (!(hd !ctmstack)) with
(scale, _, _, _, _, _) ->
2015-11-25 11:57:48 +00:00
Failure _ (*"hd"*) -> 1.
2013-08-20 15:32:57 +01:00
let rec replace_operators prev = function
| [] -> rev prev
| (Pdfops.Op_w w)::more ->
(* Alter width. *)
let width' = width /. scaleof_ctm () in
let w' =
if w >= width' then Pdfops.Op_w w else Pdfops.Op_w width'
replace_operators (w'::prev) more
| (Pdfops.Op_cm m)::more ->
(* Update CTM *)
begin try
let top = hd !ctmstack in
top := Pdftransform.matrix_compose !top m
2015-11-25 11:57:48 +00:00
Failure _ (*"hd"*) -> error "Malformed file."
2013-08-20 15:32:57 +01:00
replace_operators ((Pdfops.Op_cm m)::prev) more
| Pdfops.Op_q::more ->
(* Push stack *)
begin try
ctmstack =| ref (!(hd !ctmstack))
2015-11-25 11:57:48 +00:00
Failure _ (*"hd"*) -> error "Malformed file"
2013-08-20 15:32:57 +01:00
replace_operators (Pdfops.Op_q::prev) more
| Pdfops.Op_Q::more ->
(* Pop stack *)
begin try
ctmstack := tl !ctmstack
2015-11-25 11:57:48 +00:00
Failure _ (*"tl"*) -> error "Malformed file"
2013-08-20 15:32:57 +01:00
replace_operators (Pdfops.Op_Q::prev) more
| (Pdfops.Op_gs gsname)::more ->
(* Perhaps insert [Op_w]. *)
let opw =
match Pdf.lookup_direct pdf "/ExtGState" page.Pdfpage.resources with
| None -> []
| Some ext_state_dict ->
match Pdf.lookup_direct pdf gsname ext_state_dict with
| None -> []
| Some gdict ->
match Pdf.lookup_direct pdf "/LW" gdict with
| Some s -> (try [Pdfops.Op_w (Pdf.getnum s)] with _ -> [])
| None -> []
replace_operators (opw @ ((Pdfops.Op_gs gsname)::prev)) more
| x::more -> replace_operators (x::prev) more
let operators = replace_operators [] operators in
(* 2. Add an initial 'w' if width more than default width *)
let operators =
if width > 1. then (Pdfops.Op_w width)::operators else operators
let content' = [Pdfops.stream_of_ops operators] in
{page with Pdfpage.content = content'}
process_pages (ppstub thinpage) pdf range
2013-08-20 15:32:57 +01:00
(* \section{Remove annotations} *)
(* Remove the /Annots entry from the remaining-entries dictionary of each
   page whose number is a member of range. Pages are visited with
   process_pages.
   NOTE(review): the in keywords and the else branch of the membership test
   seem to be missing from this copy; verify against the repository. *)
let remove_annotations range pdf =
let remove_annotations_page pagenum page =
if mem pagenum range then
let rest' =
Pdf.remove_dict_entry page.Pdfpage.rest "/Annots"
{page with Pdfpage.rest = rest'}
process_pages (ppstub remove_annotations_page) pdf range
2013-08-20 15:32:57 +01:00
(* \section{Making draft documents} *)
(* Predicate on an xobject: true if an image xobject. *)
(* [isimage pdf (name, xobj)] is true exactly when the /Subtype entry of
   [xobj] is the name /Image. The first component of the pair is ignored. *)
let isimage pdf (_, xobj) =
  match Pdf.lookup_direct pdf "/Subtype" xobj with
  | Some (Pdf.Name subtype) -> subtype = "/Image"
  | Some _ | None -> false
(* Given a set of resources for a page, and the name of a resource, determine if
that name refers to an image xobject. *)
(* Look name up in the /XObject dictionary of resources and test the result
   with isimage. A resources dictionary without /XObject gives false; the
   lookup of name uses Pdf.lookup_fail, and a resources value that is not a
   dictionary fails with bad resources.
   NOTE(review): the end keyword closing the inner match seems to be missing
   from this copy; verify against the repository. *)
let xobject_isimage pdf resources name =
match resources with
| Pdf.Dictionary _ ->
begin match Pdf.lookup_direct pdf "/XObject" resources with
| Some xobjects ->
isimage pdf ("", Pdf.lookup_fail "xobject not there" pdf name xobjects)
| _ -> false
| _ -> failwith "bad resources"
(* The substitute for an image. *)
(* Operators drawn in place of a removed image when boxes is true: a
   zero-width grey-level-0 stroke setup, the unit square, and its two
   diagonals.
   NOTE(review): the list brackets, the save/restore and painting operators,
   and the else branch seem to be missing from this copy, so the exact
   operator list cannot be confirmed here; verify against the repository. *)
let substitute boxes =
if boxes then
Pdfops.Op_w 0.;
Pdfops.Op_G 0.;
Pdfops.Op_re (0., 0., 1., 1.);
Pdfops.Op_m (0., 0.);
Pdfops.Op_l (1., 1.);
Pdfops.Op_m (0., 1.);
Pdfops.Op_l (1., 0.);
(* Remove references to images from a graphics stream. *)
(* Walk a list of graphics operators, replacing image invocations by
   [substitute boxes]. [prev] accumulates the output in reverse order.
   - A Do operator is replaced when its name refers to an image xobject in
     [resources] and, if [onlyremove] is [Some n], the name equals [n].
   - An inline image is replaced only when [onlyremove] is [None].
   - Every other operator is kept. *)
let rec remove_images_stream onlyremove boxes pdf resources prev = function
  | [] -> rev prev
  | op::rest ->
      let drop =
        match op with
        | Pdfops.Op_Do name ->
            xobject_isimage pdf resources name
            && (match onlyremove with None -> true | Some target -> target = name)
        | Pdfops.InlineImage _ -> onlyremove = None
        | _ -> false
      in
      let prev' = if drop then substitute boxes @ prev else op::prev in
      remove_images_stream onlyremove boxes pdf resources prev' rest
2013-08-20 15:32:57 +01:00
(* Remove images from a form xobject. The form is wrapped in a temporary page
   record whose content is the form itself and whose resources are the
   /Resources entry of the form (an empty dictionary when absent); that page
   is handed to remove_images_page. The processed operators are then
   re-serialised into a stream that keeps the original stream dictionary
   with /Length updated. Returns the new form together with the pdf. A form
   that is not a stream raises Pdf.PDFError.
   NOTE(review): several short lines (in, end, with and the expression that
   re-serialises the operators) seem to be missing from this copy and
   revision timestamps are interleaved; verify against the repository. *)
let rec process_form_xobject onlyremove boxes pdf form =
2013-08-20 15:32:57 +01:00
let form = Pdf.direct pdf form in
let page =
{Pdfpage.content = [form];
Pdfpage.mediabox = Pdf.Null;
Pdfpage.resources =
begin match Pdf.lookup_direct pdf "/Resources" form with
| Some r -> r
| None -> Pdf.Dictionary []
Pdfpage.rotate = Pdfpage.Rotate0;
Pdfpage.rest = Pdf.Dictionary []}
let page', pdf =
remove_images_page onlyremove boxes pdf page
2013-08-20 15:32:57 +01:00
let form' =
match form with
| Pdf.Stream {contents = (dict, _)} ->
begin match
(Pdfops.parse_operators pdf (Pdf.Dictionary []) page'.Pdfpage.content)
| Pdf.Stream {contents = (_, Pdf.Got data)} ->
let dict' =
Pdf.add_dict_entry dict "/Length" (Pdf.Integer (bytes_size data))
Pdf.Stream {contents = (dict', Pdf.Got data)}
| _ -> assert false
| _ -> raise (Pdf.PDFError "not a stream")
form', pdf
(* Remove images from a page. *)
(* Remove images from one page. The /XObject entries of the page resources
   are split into form xobjects and image xobjects. Each form xobject is
   processed with process_form_xobject and added to the pdf as a new object;
   image xobjects are all dropped when onlyremove is None, otherwise only the
   one named by onlyremove is dropped. A new /XObject dictionary is built
   from the kept images and the new form object numbers. The page content is
   rewritten with remove_images_stream and Flate-encoded. Returns the new
   page together with the pdf.
   NOTE(review): several short lines (in, iter, the list arguments of the
   iterations, and the end of the content expression) seem to be missing from
   this copy and revision timestamps are interleaved; verify against the
   repository. *)
and remove_images_page onlyremove boxes pdf page =
2013-08-20 15:32:57 +01:00
let isform pdf xobj =
match Pdf.lookup_direct pdf "/Subtype" xobj with Some (Pdf.Name "/Form") -> true | _ -> false
2019-07-09 17:54:02 +01:00
let isimage pdf xobj =
match Pdf.lookup_direct pdf "/Subtype" xobj with Some (Pdf.Name "/Image") -> true | _ -> false
2013-08-20 15:32:57 +01:00
(* Remove image xobjects and look into form ones *)
2019-07-09 17:54:02 +01:00
let form_xobjects, image_xobjects =
2013-08-20 15:32:57 +01:00
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
| Some (Pdf.Dictionary elts) ->
2019-07-09 17:54:02 +01:00
keep (function (_, p) -> isform pdf p) elts,
keep (function (_, p) -> isimage pdf p) elts
| _ -> [], []
2013-08-20 15:32:57 +01:00
let resources', pdf =
let names, pointers = split form_xobjects in
let form_xobjects', pdf =
let pdf = ref pdf
in let outputs = ref [] in
(fun p ->
let p', pdf' = process_form_xobject onlyremove boxes !pdf p in
2013-08-20 15:32:57 +01:00
pdf := pdf';
outputs =| p')
rev !outputs, !pdf
let nums = ref [] in
(fun xobj ->
let objnum = Pdf.addobj pdf xobj in
nums =| objnum)
2019-07-09 17:54:02 +01:00
let image_xobjects' =
match onlyremove with
2021-02-16 19:06:26 +00:00
None -> []
2019-07-09 17:54:02 +01:00
| Some n -> option_map (function (n', _) as xobj -> if n = n' then None else Some xobj) image_xobjects
2013-08-20 15:32:57 +01:00
let newdict =
2019-07-09 17:54:02 +01:00
Pdf.Dictionary (image_xobjects' @ combine names (map (fun x -> Pdf.Indirect x) (rev !nums)))
2013-08-20 15:32:57 +01:00
Pdf.add_dict_entry page.Pdfpage.resources "/XObject" newdict, pdf
let content' =
remove_images_stream onlyremove boxes pdf page.Pdfpage.resources []
2013-08-20 15:32:57 +01:00
(Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content)
{page with
Pdfpage.content =
(let stream = Pdfops.stream_of_ops content' in
Pdfcodec.encode_pdfstream pdf Pdfcodec.Flate stream;
Pdfpage.resources = resources'}, pdf
(* Remove images from all pages in a document. *)
2019-07-09 16:31:45 +01:00
(* Produce a draft document: remove_images_page is applied to every page
   whose number is in range, other pages are kept as they are, and the
   resulting page list is installed with Pdfpage.change_pages. The pdf value
   is threaded through the pages in a reference.
   NOTE(review): the iterator line, its list arguments and some in keywords
   seem to be missing from this copy and revision timestamps are interleaved;
   verify against the repository. *)
let draft onlyremove boxes range pdf =
2013-08-20 15:32:57 +01:00
let pages = Pdfpage.pages_of_pagetree pdf in
let pagenums = indx pages in
let pdf = ref pdf
in let pages' = ref [] in
(fun p pagenum ->
let p', pdf' =
if mem pagenum range
then remove_images_page onlyremove boxes !pdf p
2013-08-20 15:32:57 +01:00
else p, !pdf
pdf := pdf';
pages' =| p')
Pdfpage.change_pages true !pdf (rev !pages')
(* Overwrite the minor version field of [pdf] with [v], in place. *)
let set_version v pdf = pdf.Pdf.minor <- v
(* Build a new document of pages blank pages, each made with
   Pdfpage.blankpage from a paper of width by height points. The pages are
   added to an empty pdf as a page tree and the root is added with
   Pdfpage.add_root.
   NOTE(review): the in keyword after the pdf_pages binding seems to be
   missing from this copy; verify against the repository. *)
let blank_document width height pages =
let pdf_pages =
map (fun () -> Pdfpage.blankpage (Pdfpaper.make Pdfunits.PdfPoint width height)) (many () pages)
let pdf, pageroot = Pdfpage.add_pagetree pdf_pages (Pdf.empty ()) in
Pdfpage.add_root pageroot [] pdf
(* Build a new document of pages blank pages, each made with
   Pdfpage.blankpage from papersize. The pages are added to an empty pdf as a
   page tree and the root is added with Pdfpage.add_root.
   NOTE(review): the in keyword after the pdf_pages binding seems to be
   missing from this copy; verify against the repository. *)
let blank_document_paper papersize pages =
let pdf_pages =
map (fun () -> Pdfpage.blankpage papersize) (many () pages)
let pdf, pageroot = Pdfpage.add_pagetree pdf_pages (Pdf.empty ()) in
Pdfpage.add_root pageroot [] pdf
2013-10-24 15:21:54 +01:00
(* Split the given range (which is in order) into multiple contiguous ones. *)
(* Split a list of integers into maximal runs of consecutive values.
   [curr] is the run being built (most recent element first) and [prev] the
   completed runs (most recent first); callers start with both empty. A new
   run begins whenever the next element is not exactly one more than the
   last element of the current run. *)
let rec ranges_of_range curr prev = function
  | [] ->
      if curr = [] then rev prev else rev (rev curr :: prev)
  | x :: xs ->
      begin match curr with
      | last :: _ when x = last + 1 -> ranges_of_range (x :: curr) prev xs
      | [] -> ranges_of_range [x] prev xs
      | _ :: _ -> ranges_of_range [x] (rev curr :: prev) xs
      end
(* Predicate which is true if at least one page range starts at page 1 *)
(* True when at least one label in [labels] has a start page of 1. *)
let page1 labels =
  List.exists (fun l -> l.Pdfpagelabels.startpage = 1) labels
(* Add page labels with the given style and prefix to the pages in range.
   The range is split into contiguous runs with ranges_of_range and each run
   reduced to its extremes. The existing labels are read from the pdf; when
   none of them starts at page 1 a decimal arabic label starting at page 1
   with value 1 is put in front. For each run a label starting at the first
   page of the run is added with Pdfpagelabels.add_label; its start value is
   startval, plus the number of pages in the preceding runs when progress is
   true. The labels are then written back to the pdf. The function asserts
   that the range yields at least one run.
   NOTE(review): several short lines (in, else, ref, the iterator over the
   runs and its list arguments) seem to be missing from this copy and
   revision timestamps are interleaved; verify against the repository. *)
let add_page_labels pdf progress style prefix startval range =
2013-10-24 15:21:54 +01:00
let ranges = map extremes (ranges_of_range [] [] range)
and labels = Pdfpagelabels.read pdf in
assert (length ranges > 0);
(* Running count of pages in the runs before each run. *)
let startval_additions =
2020-02-07 13:48:09 +00:00
let r = ref [] in
let sofar = ref 0 in
iter (fun (s, e) -> r := !sofar :: !r; sofar := e - s + 1 + !sofar) ranges;
rev !r
2013-10-24 15:21:54 +01:00
let labels =
if not (page1 labels) then
2014-09-18 14:27:07 +01:00
({Pdfpagelabels.labelstyle = Pdfpagelabels.DecimalArabic;
2013-10-24 15:21:54 +01:00
Pdfpagelabels.labelprefix = None;
Pdfpagelabels.startpage = 1;
Pdfpagelabels.startvalue = 1}::labels)
ref labels
(fun (s, e) addition ->
2013-10-24 15:21:54 +01:00
let label =
{Pdfpagelabels.labelstyle = style;
Pdfpagelabels.labelprefix = prefix;
Pdfpagelabels.startpage = s;
2020-02-07 13:48:09 +00:00
Pdfpagelabels.startvalue = startval + if progress then addition else 0}
2013-10-24 15:21:54 +01:00
2014-09-17 17:59:27 +01:00
labels := Pdfpagelabels.add_label (Pdfpage.endpage pdf) !labels label e)
2013-10-24 15:21:54 +01:00
Pdfpagelabels.write pdf !labels
2019-10-02 13:41:56 +01:00
(* Parse the new content to make sure syntactically ok, append
* as required. Rewrite the content *)
(* Parse the string s as a content stream against the resources of page, then
   add the resulting operators to the page: Pdfpage.prepend_operators when
   before is true, Pdfpage.postpend_operators otherwise. The n argument is
   not used in the visible code.
   NOTE(review): the in keyword after the ops binding seems to be missing
   from this copy; verify against the repository. *)
let append_page_content_page fast s before pdf n page =
let ops =
Pdfops.parse_stream pdf page.Pdfpage.resources [bytes_of_string s]
(if before then Pdfpage.prepend_operators else Pdfpage.postpend_operators)
pdf ops ~fast page
(* Add the content stream text [s] to every page in [range], before the
   existing content when [before] is true and after it otherwise. *)
let append_page_content s before fast range pdf =
  let per_page = append_page_content_page fast s before pdf in
  process_pages (ppstub per_page) pdf range
2019-10-02 13:41:56 +01:00
2016-11-13 14:02:09 +00:00
2020-02-27 15:14:51 +01:00
(* 1. Get list of indirects of all OCGs from the /OCProperties, and their textual names
* 2. Calculate a change list to coalesce them
* 3. Remove any changed ones from the /OCGs and /Order and /ON and /OFF in /OCProperties
* 4. Do the changes to all indirect references in the whole pdf *)
(*FIXME Pre-existing nulls - what to do? *)
(* Coalesce optional content groups which share a name. When the catalog has
   no /OCProperties nothing is done. Otherwise:
   - the object numbers in /OCGs are paired with the /Name string of each
     group (malformed entries fail with the messages below);
   - the pairs are sorted and collated by name, and every group after the
     first of each name is mapped to the first one (changes);
   - the mapped-away numbers are removed from /OCGs and from the /ON, /OFF
     and /Order arrays inside /D;
   - the new /OCProperties and a new catalog are added as objects, the
     trailer /Root and pdf root are pointed at the new catalog, and all
     objects are renumbered according to changes.
   NOTE(review): several short lines (in, end, map, begin try) seem to be
   missing from this copy; verify against the repository. *)
let ocg_coalesce pdf =
match Pdf.lookup_direct pdf "/OCProperties" (Pdf.catalog_of_pdf pdf) with
None -> ()
| Some ocpdict ->
let number_name_pairs =
match Pdf.lookup_direct pdf "/OCGs" ocpdict with
Some (Pdf.Array ocgs) ->
begin let numbers =
map (function Pdf.Indirect i -> i | _ -> failwith "Malformed /OCG entry") ocgs
let names =
(fun i ->
begin match Pdf.lookup_obj pdf i with
Pdf.Dictionary d ->
begin match Pdf.lookup_direct pdf "/Name" (Pdf.Dictionary d) with
Some (Pdf.String s) -> s
| _ -> failwith "ocg: missing name"
| _ ->
failwith "ocg: not a dictionary"
with _ -> failwith "OCG object missing")
combine numbers names
| _ -> failwith "Malformed or missing /OCGs"
(*iter (fun (num, name) -> Printf.printf "%i = %s\n" num name) number_name_pairs;*)
(* changes: (number to replace, number to keep) pairs. *)
let changes =
let cf (_, name) (_, name') = compare name name' in
let sets = collate cf (List.stable_sort cf number_name_pairs) in
flatten (option_map (function [] -> None | (hnum, _)::t -> Some (map (function (tnum, _) -> (tnum, hnum)) t)) sets)
(*Printf.printf "\nChanges are:\n";
List.iter (fun (f, t) -> Printf.printf "%i -> %i\n" f t) changes;*)
let new_ocproperties =
let remove_from_array key nums dict =
match Pdf.lookup_direct pdf key dict with
| Some (Pdf.Array elts) ->
let elts' = option_map (function Pdf.Indirect i -> if mem i nums then None else Some (Pdf.Indirect i) | _ -> None) elts in
Pdf.add_dict_entry dict key (Pdf.Array elts')
| _ -> dict
let remove_from_array_inside_d key nums dict =
match Pdf.lookup_direct pdf "/D" dict with
| Some (Pdf.Dictionary ddict) ->
begin match Pdf.lookup_direct pdf key (Pdf.Dictionary ddict) with
| Some (Pdf.Array elts) ->
let elts' = option_map (function Pdf.Indirect i -> if mem i nums then None else Some (Pdf.Indirect i) | _ -> None) elts in
Pdf.add_dict_entry dict "/D" (Pdf.add_dict_entry (Pdf.Dictionary ddict) key (Pdf.Array elts'))
| _ -> dict
| _ -> failwith "No /D dict in OCGProperties"
let nums = map fst changes in
(*Printf.printf "\nto remove:\n";
List.iter (Printf.printf "%i ") nums;*)
remove_from_array "/OCGs" nums
(remove_from_array_inside_d "/ON" nums
(remove_from_array_inside_d "/OFF" nums
(remove_from_array_inside_d "/Order" nums ocpdict)))
(*flprint (Pdfwrite.string_of_pdf new_ocproperties);*)
let ocp_objnum = Pdf.addobj pdf new_ocproperties in
let new_catalog = Pdf.addobj pdf (Pdf.add_dict_entry (Pdf.catalog_of_pdf pdf) "/OCProperties" (Pdf.Indirect ocp_objnum)) in
pdf.Pdf.trailerdict <- Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect new_catalog);
pdf.Pdf.root <- new_catalog;
Pdf.objselfmap (Pdf.renumber_object_parsed pdf (hashtable_of_dictionary changes)) pdf
2021-06-08 16:58:35 +01:00
(* Collect the /Name strings of the optional content groups listed in the
   /OCGs array of /OCProperties, in array order. Entries which are not
   indirect references, or whose object has no string /Name, are skipped. A
   document without /OCProperties or /OCGs gives the empty list.
   NOTE(review): the iterator line over elts, its function keyword, and the
   end of the begin block seem to be missing from this copy and revision
   timestamps are interleaved; verify against the repository. *)
let ocg_get_list pdf =
let l = ref [] in
begin match Pdf.lookup_direct pdf "/OCProperties" (Pdf.catalog_of_pdf pdf) with
2020-02-27 15:14:51 +01:00
None -> ()
| Some ocpdict ->
match Pdf.lookup_direct pdf "/OCGs" ocpdict with
Some (Pdf.Array elts) ->
2020-03-04 10:50:32 -08:00
2020-02-27 15:14:51 +01:00
Pdf.Indirect i ->
(match Pdf.lookup_direct pdf "/Name" (Pdf.lookup_obj pdf i) with
2021-06-08 16:58:35 +01:00
Some (Pdf.String s) -> l := s::!l | _ -> ())
2020-02-27 15:14:51 +01:00
| _ -> ())
| _ -> ()
2021-06-08 16:58:35 +01:00
rev !l
(* Print the name of each optional content group in [pdf] to standard
   output, one per line. *)
let ocg_list pdf =
  ocg_get_list pdf |> List.iter (fun name -> Printf.printf "%s\n" name)
2020-02-27 15:14:51 +01:00
(* Rename optional content groups: a dictionary whose /Type is /OCG and whose
   /Name is the string f gets its /Name replaced by the string t. Any other
   object is returned unchanged.
   NOTE(review): the line which applies this object transformation to the
   pdf, the end keywords and the closing of the function seem to be missing
   from this copy, so how the mapping is applied cannot be confirmed here;
   verify against the repository. *)
let ocg_rename f t pdf =
Pdf.Dictionary d ->
begin match Pdf.lookup_direct pdf "/Type" (Pdf.Dictionary d) with
Some (Pdf.Name "/OCG") ->
begin match Pdf.lookup_direct pdf "/Name" (Pdf.Dictionary d) with
Some (Pdf.String s) when s = f ->
Pdf.add_dict_entry (Pdf.Dictionary d) "/Name" (Pdf.String t)
| _ -> Pdf.Dictionary d
| _ -> Pdf.Dictionary d
| x -> x
(* Set the /Order entry of the /D dictionary in /OCProperties to the full
   /OCGs array. The updated /OCProperties and a new catalog referring to it
   are added as new objects, and the trailer /Root and pdf root are pointed
   at the new catalog. Nothing is done when /OCProperties, /OCGs or a
   dictionary /D is absent.
   NOTE(review): the end keyword closing the inner begin block seems to be
   missing from this copy; verify against the repository. *)
let ocg_order_all pdf =
match Pdf.lookup_direct pdf "/OCProperties" (Pdf.catalog_of_pdf pdf) with
None -> ()
| Some ocpdict ->
match Pdf.lookup_direct pdf "/OCGs" ocpdict with
Some (Pdf.Array elts) ->
begin match Pdf.lookup_direct pdf "/D" ocpdict with
Some (Pdf.Dictionary d) ->
let newd = Pdf.add_dict_entry (Pdf.Dictionary d) "/Order" (Pdf.Array elts) in
let new_ocproperties = Pdf.add_dict_entry ocpdict "/D" newd in
let ocp_objnum = Pdf.addobj pdf new_ocproperties in
let new_catalog = Pdf.addobj pdf (Pdf.add_dict_entry (Pdf.catalog_of_pdf pdf) "/OCProperties" (Pdf.Indirect ocp_objnum)) in
pdf.Pdf.trailerdict <- Pdf.add_dict_entry pdf.Pdf.trailerdict "/Root" (Pdf.Indirect new_catalog);
pdf.Pdf.root <- new_catalog
| _ -> ()
| _ -> ()
(* Add rectangles on top of pages to show Media, Crop, Art, Trim, Bleed boxes.
* We use different dash lengths and colours to help distinguish coincident
* boxes The sequence of operators is postpended to the page content,
* appropriately protected to prevent pollution of matrices.
* /MediaBox: Solid red line
* /CropBox: Dashed 7 on 7 off green line
* /ArtBox: Dashed 5 on 5 off blue line
* /TrimBox: Dashed 3 on 3 off orange line
* /BleedBox: Dashed 2 on 2 off pink line *)
(* Return the rectangle stored for the named box of page, parsed with
   Pdf.parse_rectangle, or None when the box is absent or is not a
   four-element array. /MediaBox is read from the mediabox field of the page
   record; every other box is looked up in the remaining page entries.
   NOTE(review): the else keyword between the two matches seems to be missing
   from this copy; verify against the repository. *)
let get_rectangle pdf page box =
if box = "/MediaBox" then
match page.Pdfpage.mediabox with
Pdf.Array [a; b; c; d] as r -> Some (Pdf.parse_rectangle r)
| _ -> None
match Pdf.lookup_direct pdf box page.Pdfpage.rest with
Some (Pdf.Array [a; b; c; d] as r) -> Some (Pdf.parse_rectangle r)
| _ -> None
(* Append operators to page which draw a rectangle for each of the Media,
   Crop, Art, Trim and Bleed boxes that the page has. make_ops takes a colour
   as 0 to 255 components, the dash on and off lengths (both zero meaning a
   solid line) and the box name, and gives the empty list when the box is
   absent. The operators are added with Pdfpage.postpend_operators.
   NOTE(review): the list brackets, the save, stroke and restore operators,
   and some in keywords seem to be missing from this copy, so the exact
   operator list cannot be confirmed here; verify against the repository. *)
let show_boxes_page fast pdf _ page =
let make_ops (r, g, b) on off boxname =
match get_rectangle pdf page boxname with
Some (r1, r2, r3, r4) ->
Pdfops.Op_RG (r /. 255., g /. 255., b /. 255.);
Pdfops.Op_w 1.;
Pdfops.Op_d ((if on = 0. && off = 0. then [] else [on; off]), 0.);
Pdfops.Op_re (r1, r2, r3 -. r1, r4 -. r2);
| None -> []
let ops =
make_ops (255., 0., 0.) 0. 0. "/MediaBox"
@ make_ops (0., 255., 0.) 7. 7. "/CropBox"
@ make_ops (0., 0., 255.) 5. 5. "/ArtBox"
@ make_ops (255.,150.,0.) 3. 3. "/TrimBox"
@ make_ops (255.,9.,147.) 2. 2. "/BleedBox"
Pdfpage.postpend_operators pdf ops ~fast page
(* Draw the page boxes on every page in [range]. [fast] is passed through to
   the per-page operator insertion and defaults to false. *)
let show_boxes ?(fast=false) pdf range =
  let per_page = show_boxes_page fast pdf in
  process_pages (ppstub per_page) pdf range
(* Gap, in the units of the page coordinates, left between the trim box and
   the start of each trim mark. *)
let allowance = 9.
(* Operators for a straight line from (x0, y0) to (x1, y1).
   NOTE(review): the end of this list (closing bracket and, presumably, a
   path-painting operator) seems to be missing from this copy; verify against
   the repository. *)
let line (x0, y0, x1, y1) =
[Pdfops.Op_m (x0, y0);
Pdfops.Op_l (x1, y1);
(* Add trim marks to [page]. When the page has both a /TrimBox and a
   /MediaBox, eight lines are drawn: at each corner of the trim box one
   horizontal and one vertical mark, running from the media box edge to
   [allowance] short of the trim box. The marks use CMYK (1, 1, 1, 1) and
   line width 1, are wrapped in a graphics state save and restore, and are
   added after the existing content. A page without both boxes is returned
   unchanged. [n] is the page number and is currently unused.

   The horizontal top-left mark ends at [tminx -. allowance]; using the y
   coordinate there put the mark in the wrong place whenever the trim box
   was not square to the origin. *)
let trim_marks_page fast pdf n page =
  match get_rectangle pdf page "/TrimBox", get_rectangle pdf page "/MediaBox" with
  | Some (tminx, tminy, tmaxx, tmaxy), Some (minx, miny, maxx, maxy) ->
      let ops =
        [Pdfops.Op_q;
         Pdfops.Op_K (1., 1., 1., 1.);
         Pdfops.Op_w 1.]
         @ line (minx, tmaxy, tminx -. allowance, tmaxy) (* top left *)
         @ line (tminx, tmaxy +. allowance, tminx, maxy)
         @ line (tmaxx +. allowance, tmaxy, maxx, tmaxy) (* top right *)
         @ line (tmaxx, tmaxy +. allowance, tmaxx, maxy)
         @ line (tmaxx +. allowance, tminy, maxx, tminy) (* bottom right *)
         @ line (tmaxx, tminy -. allowance, tmaxx, miny)
         @ line (tminx -. allowance, tminy, minx, tminy) (* bottom left *)
         @ line (tminx, tminy -. allowance, tminx, miny)
         @ [Pdfops.Op_Q]
      in
        Pdfpage.postpend_operators pdf ops ~fast page
  | _, _ ->
      (*Printf.eprintf "warning: no /TrimBox found on page %i\n%!" n;*)
      page
(* Add trim marks to every page in [range]. [fast] is passed through to the
   per-page operator insertion and defaults to false. *)
let trim_marks ?(fast=false) pdf range =
  let per_page = trim_marks_page fast pdf in
  process_pages (ppstub per_page) pdf range
2020-11-14 16:40:01 +00:00
(* Parse a content stream and drop every text-showing operator, as decided by
   is_textop; all other operators are kept.
   NOTE(review): the in keywords and the final expression which turns
   content' back into the result seem to be missing from this copy; verify
   against the repository. *)
let rec remove_all_text_ops pdf resources content =
let is_textop = function
Pdfops.Op_Tj _ | Pdfops.Op_' _ | Pdfops.Op_'' _ | Pdfops.Op_TJ _ -> true
| _ -> false
let content' =
let ops = Pdfops.parse_operators pdf resources content in
(option_map (function x -> if is_textop x then None else Some x) ops)
(* Remove text from one page: the xobjects of the page are processed with
   remove_all_text_ops first, then the page content itself. Returns the new
   page paired with the pdf. *)
let remove_all_text_page pdf p =
  let {Pdfpage.resources = res; Pdfpage.content = ops; _} = p in
  process_xobjects pdf p remove_all_text_ops;
  let stripped = remove_all_text_ops pdf res ops in
  ({p with Pdfpage.content = stripped}, pdf)
(* Remove text from the pages in range: remove_all_text_page is applied to
   each page whose number is in range, other pages are kept, and the
   resulting page list is installed with Pdfpage.change_pages. The pdf value
   is threaded through the pages in a reference.
   NOTE(review): the iterator line, its list arguments and an in keyword seem
   to be missing from this copy; verify against the repository. *)
let remove_all_text range pdf =
let pages = Pdfpage.pages_of_pagetree pdf in
let pagenums = indx pages in
let pdf = ref pdf in
let pages' = ref [] in
(fun p pagenum ->
let p', pdf' =
if mem pagenum range
then remove_all_text_page !pdf p
else p, !pdf
pdf := pdf';
pages' =| p')
Pdfpage.change_pages true !pdf (rev !pages')
(* 1. Extend remove_dict_entry with search term
2. Implement replace_dict_entry by analogy to remove_dict_entry *)
2021-10-29 15:09:21 +01:00
(* Apply [f] to every dictionary and stream inside an object, bottom-up.
   Dictionaries and stream dictionaries have their entries processed
   recursively before [f] is applied to the rebuilt object; arrays are
   processed element-wise without applying [f] to the array itself; any other
   object is returned unchanged. *)
let rec dict_entry_single_object f pdf obj =
  let recurse = dict_entry_single_object f pdf in
  match obj with
  | Pdf.Dictionary entries -> f (Pdf.recurse_dict recurse entries)
  | Pdf.Stream {contents = (Pdf.Dictionary entries, data)} ->
      f (Pdf.Stream {contents = (Pdf.recurse_dict recurse entries, data)})
  | Pdf.Array elts -> Pdf.recurse_array recurse elts
  | other -> other
2021-10-29 15:09:21 +01:00
(* FIXME are we sure that functional values can never appear in the equality here? *)
(* Remove the entry key from every dictionary in the document, including the
   trailer dictionary. When search is Some s the entry is removed only from
   dictionaries in which the value found for key is structurally equal to s.
   NOTE(review): the in keyword after the binding of f seems to be missing
   from this copy; verify against the repository. *)
let remove_dict_entry pdf key search =
let f d =
match search with
| None -> Pdf.remove_dict_entry d key
| Some s ->
match Pdf.lookup_direct pdf key d with
| Some v when v = s -> Pdf.remove_dict_entry d key
| _ -> d
Pdf.objselfmap (dict_entry_single_object f pdf) pdf;
pdf.Pdf.trailerdict <- dict_entry_single_object f pdf pdf.Pdf.trailerdict
2020-11-14 16:40:01 +00:00
2021-10-29 15:09:21 +01:00
(* Replace the value of the entry key by value in every dictionary in the
   document, including the trailer dictionary, using
   Pdf.replace_dict_entry. When search is Some s the replacement is made only
   in dictionaries in which the value found for key is structurally equal to
   s.
   NOTE(review): the in keyword after the binding of f seems to be missing
   from this copy; verify against the repository. *)
let replace_dict_entry pdf key value search =
let f d =
match search with
| None -> Pdf.replace_dict_entry d key value
| Some s ->
match Pdf.lookup_direct pdf key d with
| Some v when v = s -> Pdf.replace_dict_entry d key value
| _ -> d
Pdf.objselfmap (dict_entry_single_object f pdf) pdf;
pdf.Pdf.trailerdict <- dict_entry_single_object f pdf pdf.Pdf.trailerdict
(* FIXME no need to self map here, since nothing changes *)
(* Print, as JSON followed by a blank line, the value of the entry key in
   every dictionary of the document which has one, including the trailer
   dictionary. The traversal reuses dict_entry_single_object with a function
   that prints as a side effect.
   NOTE(review): the result of the Some branch (after the print) and the in
   keyword after the binding of f seem to be missing from this copy; verify
   against the repository. *)
let print_dict_entry pdf key =
let f d =
match Pdf.lookup_direct pdf key d with
| Some v ->
(* We use a double newline as a separator. *)
Printf.printf "%s\n\n" (Cpdfyojson.Safe.to_string (Cpdfjson.json_of_object pdf (fun _ -> ()) false false v));
| None -> d
Pdf.objselfmap (dict_entry_single_object f pdf) pdf;
pdf.Pdf.trailerdict <- dict_entry_single_object f pdf pdf.Pdf.trailerdict
2021-10-28 17:06:46 +01:00
2020-11-14 16:40:01 +00:00
(* Remove clipping from one content stream: each W operator immediately
   followed by n is dropped, leaving the n. All other operators, including a
   W not followed by n, are kept. Returns a one-element list holding the
   re-serialised stream.
   NOTE(review): the in keyword after the definition of process seems to be
   missing from this copy; verify against the repository. *)
let remove_clipping_ops pdf resources content =
let ops = Pdfops.parse_operators pdf resources content in
(* a accumulates the output in reverse order. *)
let rec process a = function
Pdfops.Op_W::Pdfops.Op_n::t -> process (Pdfops.Op_n::a) t
| h::t -> process (h::a) t
| [] -> rev a
[Pdfops.stream_of_ops (process [] ops)]
(* Apply remove_clipping_ops to the content of every page in range, and to
   the xobjects of each page via process_xobjects. Pages are visited with
   process_pages.
   NOTE(review): the in keywords closing the local bindings seem to be
   missing from this copy; verify against the repository. *)
let remove_clipping pdf range =
let remove_clipping_page _ page =
let content' =
remove_clipping_ops pdf page.Pdfpage.resources page.Pdfpage.content
process_xobjects pdf page remove_clipping_ops;
{page with Pdfpage.content = content'}
process_pages (ppstub remove_clipping_page) pdf range
2020-11-25 16:54:15 +00:00
(* Image resolution *)
(* An xobject found in page resources while measuring image resolution:
   either an image with its pixel dimensions, or a form with its matrix and
   two further objects (used below as the form content and its
   resources). *)
type xobj =
| Image of int * int (* width, height *)
| Form of Pdftransform.transform_matrix * Pdf.pdfobject * Pdf.pdfobject (* Will add actual data later. *)
(* Accumulator for image resolution results, most recent first. *)
let image_results = ref []

(* Push one result onto the front of [image_results]. *)
let add_image_result i = image_results := i :: !image_results
(* Given a page and a list of (pagenum, name, thing) *)
(* Walk the operators of one page, tracking the transformation matrix on a
   stack (cm composes into the top entry, q pushes a copy, Q pops). At each
   Do operator the named xobject is looked up in images:
   - for an image, the unit square is transformed by the current matrix, the
     lengths of its sides are converted from points to inches, and the pixel
     dimensions divided by those lengths are recorded with add_image_result;
   - for a form, a one-page document is built whose content is the form
     operators preceded by the composed matrix, and image_resolution is run
     on it.
   Exceptions raised during the walk are printed rather than propagated.
   NOTE(review): several short lines (the iterator over pageops, try, with,
   in and end keywords) seem to be missing from this copy and revision
   timestamps are interleaved, so details such as how dpi is used cannot be
   confirmed here; verify against the repository. *)
let rec image_resolution_page pdf page pagenum dpi (images : (int * string * xobj) list) =
let pageops = Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content
and transform = ref [ref Pdftransform.i_matrix] in
| Pdfops.Op_cm matrix ->
begin match !transform with
| [] -> raise (Failure "no transform")
| _ -> (hd !transform) := Pdftransform.matrix_compose !(hd !transform) matrix
| Pdfops.Op_Do xobject ->
let trans (x, y) =
match !transform with
| [] -> raise (Failure "no transform")
| _ -> Pdftransform.transform_matrix !(hd !transform) (x, y)
let o = trans (0., 0.)
and x = trans (1., 0.)
and y = trans (0., 1.)
(*i Printf.printf "o = %f, %f, x = %f, %f, y = %f, %f\n" (fst o) (snd o) (fst x) (snd x) (fst y) (snd y); i*)
let rec lookup_image k = function
| [] -> assert false
| (_, a, _) as h::_ when a = k -> h
| _::t -> lookup_image k t
begin match lookup_image xobject images with
| (pagenum, name, Form (xobj_matrix, content, resources)) ->
let content =
(* Add in matrix etc. *)
let total_matrix = Pdftransform.matrix_compose xobj_matrix !(hd !transform) in
let ops =
Pdfops.Op_cm total_matrix::
Pdfops.parse_operators pdf resources [content]
Pdfops.stream_of_ops ops
let page =
{Pdfpage.content = [content];
Pdfpage.mediabox = Pdfpage.rectangle_of_paper Pdfpaper.a4;
2020-11-25 16:54:15 +00:00
Pdfpage.resources = resources;
Pdfpage.rotate = Pdfpage.Rotate0;
Pdfpage.rest = Pdf.Dictionary []}
let newpdf = Pdfpage.change_pages false pdf [page] in
image_resolution newpdf [pagenum] dpi
| (pagenum, name, Image (w, h)) ->
let lx = Pdfunits.convert 0. Pdfunits.PdfPoint Pdfunits.Inch (distance_between o x)
and ly = Pdfunits.convert 0. Pdfunits.PdfPoint Pdfunits.Inch (distance_between o y) in
let wdpi = float w /. lx
and hdpi = float h /. ly in
add_image_result (pagenum, xobject, w, h, wdpi, hdpi)
(*Printf.printf "%i, %s, %i, %i, %f, %f\n" pagenum xobject w h wdpi hdpi*)
(*i else
Printf.printf "S %i, %s, %i, %i, %f, %f\n" pagenum xobject (int_of_float w) (int_of_float h) wdpi hdpi i*)
| Pdfops.Op_q ->
begin match !transform with
| [] -> raise (Failure "Unbalanced q/Q ops")
| h::t ->
let h' = ref Pdftransform.i_matrix in
h' := !h;
transform := h'::h::t
| Pdfops.Op_Q ->
begin match !transform with
| [] -> raise (Failure "Unbalanced q/Q ops")
| _ -> transform := tl !transform
| _ -> ())
e -> Printf.printf "Error %s\n" (Printexc.to_string e); flprint "\n"
(* Gather, per page, the image and form xobjects named in the page resources
   and hand them to image_resolution_page. For an image the /Width and
   /Height entries are read (1 when absent); for a form its resources (the
   page resources when the form has none), content and /Matrix (identity
   when absent or malformed) are recorded. The collected entries are grouped
   by page number before the per-page calls.
   NOTE(review): several short lines (the page iterator and its arguments,
   the binding of the form contents, in, map and iter keywords) seem to be
   missing from this copy, so how range selects pages cannot be confirmed
   here; verify against the repository. *)
and image_resolution pdf range dpi =
let images = ref [] in
(fun pagenum page ->
(* 1. Get all image names and their native resolutions from resources as string * int * int *)
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
| Some (Pdf.Dictionary xobjects) ->
(function (name, xobject) ->
match Pdf.lookup_direct pdf "/Subtype" xobject with
| Some (Pdf.Name "/Image") ->
let width =
match Pdf.lookup_direct pdf "/Width" xobject with
| Some x -> Pdf.getnum x
| None -> 1.
and height =
match Pdf.lookup_direct pdf "/Height" xobject with
| Some x -> Pdf.getnum x
| None -> 1.
images := (pagenum, name, Image (int_of_float width, int_of_float height))::!images
| Some (Pdf.Name "/Form") ->
let resources =
match Pdf.lookup_direct pdf "/Resources" xobject with
| None -> page.Pdfpage.resources (* Inherit from page or form above. *)
| Some r -> r
and contents =
and matrix =
match Pdf.lookup_direct pdf "/Matrix" xobject with
| Some (Pdf.Array [a; b; c; d; e; f]) ->
{Pdftransform.a = Pdf.getnum a; Pdftransform.b = Pdf.getnum b; Pdftransform.c = Pdf.getnum c;
Pdftransform.d = Pdf.getnum d; Pdftransform.e = Pdf.getnum e; Pdftransform.f = Pdf.getnum f}
| _ -> Pdftransform.i_matrix
images := (pagenum, name, Form (matrix, contents, resources))::!images
| _ -> ()
| _ -> ())
(* Now, split into differing pages, and call [image_resolution_page] on each one *)
let pagesplits =
(function (a, _, _)::_ as ls -> (a, ls) | _ -> assert false)
(collate (fun (a, _, _) (b, _, _) -> compare a b) (rev !images))
and pages =
Pdfpage.pages_of_pagetree pdf
(function (pagenum, images) ->
let page = select pagenum pages in
image_resolution_page pdf page pagenum dpi images)
(* Public entry point: clear the shared result accumulator, run the
   recursive image resolution scan, and return the collected results in the
   order they were recorded. *)
let image_resolution pdf range dpi =
  begin
    image_results := [];
    image_resolution pdf range dpi
  end;
  rev !image_results
2021-11-12 14:50:31 -08:00
(* Copy the contents of the box f to the box t. If mediabox_if_missing is set,
the contents of the mediabox will be used if the from box is not available. If
mediabox_if_missing is false, the page is unaltered. *)
(* Per page: when f is /MediaBox, the mediabox of the page is stored under t
   in the remaining page entries. Otherwise the box f is looked up in the
   remaining page entries; when found it is written to the mediabox field (t
   is /MediaBox) or stored under t; when not found the mediabox is stored
   under t if mediabox_if_missing is true and the page is left unchanged
   otherwise.
   NOTE(review): the line applying this per-page function, the else keyword
   after the first branch, and the trailing arguments seem to be missing
   from this copy; verify against the repository. *)
let copy_box f t mediabox_if_missing pdf range =
(ppstub (fun _ page ->
if f = "/MediaBox" then
{page with Pdfpage.rest =
(Pdf.add_dict_entry page.Pdfpage.rest t (page.Pdfpage.mediabox))}
match Pdf.lookup_direct pdf f page.Pdfpage.rest with
| Some pdfobject ->
if t = "/MediaBox"
then {page with
Pdfpage.mediabox = Pdf.direct pdf pdfobject}
else {page with Pdfpage.rest =
(Pdf.add_dict_entry page.Pdfpage.rest t (Pdf.direct pdf pdfobject))}
| None ->
if mediabox_if_missing
then {page with Pdfpage.rest = Pdf.add_dict_entry page.Pdfpage.rest t page.Pdfpage.mediabox}
else page))
(* Write one embedded file to disk. [embeddedfile] is a file specification:
   its /F string gives the file name and /EF /F the stream holding the data.
   The stream is decoded, the name is passed through
   remove_unsafe_characters, and the data is written to that name inside the
   directory [out] (or the current directory when [out] is empty).

   A file specification without a string /F is skipped. A missing or
   undecodable stream raises via [error]. A failure while writing is reported
   on standard error and does not propagate; the output channel is closed on
   that path too, so a failed write no longer leaks the descriptor. *)
let dump_attachment out pdf (_, embeddedfile) =
  match Pdf.lookup_direct pdf "/F" embeddedfile with
  | Some (Pdf.String s) ->
      let efdata =
        match Pdf.lookup_direct pdf "/EF" embeddedfile with
        | Some d ->
            let stream =
              match Pdf.lookup_direct pdf "/F" d with
              | Some s -> s
              | None -> error "Bad embedded file stream"
            in
              Pdfcodec.decode_pdfstream_until_unknown pdf stream;
              begin match stream with
              | Pdf.Stream {contents = (_, Pdf.Got b)} -> b
              | _ -> error "Bad embedded file stream"
              end
        | _ -> error "Bad embedded file stream"
      in
      let s = remove_unsafe_characters s in
      let filename = if out = "" then s else out ^ Filename.dir_sep ^ s in
        begin try
          let fh = open_out_bin filename in
            begin try
              for x = 0 to bytes_size efdata - 1 do
                output_byte fh (bget efdata x)
              done;
              close_out fh
            with e ->
              (* Release the descriptor before reporting the failure. *)
              close_out_noerr fh;
              raise e
            end
        with _ ->
          Printf.eprintf "Failed to write attachment to %s\n%!" filename
        end
  | _ -> ()
(* Dump the document-level attachments: every entry of the /EmbeddedFiles
   name tree under /Names in the document root is passed to dump_attachment.
   Nothing is done when there is no /EmbeddedFiles entry.
   NOTE(review): the in keyword after the binding of names seems to be
   missing from this copy; verify against the repository. *)
let dump_attached_document pdf out =
let root = Pdf.lookup_obj pdf pdf.Pdf.root in
let names =
match Pdf.lookup_direct pdf "/Names" root with Some n -> n | _ -> Pdf.Dictionary []
match Pdf.lookup_direct pdf "/EmbeddedFiles" names with
| Some x ->
iter (dump_attachment out pdf) (Pdf.contents_of_nametree pdf x)
| None -> ()
(* Dump the attachments of one page: the /FS entry of every annotation whose
   /Subtype is /FileAttachment is passed to dump_attachment. A page without
   an /Annots array has nothing to dump.
   NOTE(review): the filter function name, its list argument and the in
   keywords seem to be missing from this copy; verify against the
   repository. *)
let dump_attached_page pdf out page =
let annots =
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
| Some (Pdf.Array l) -> l
| _ -> []
let efannots =
(fun annot ->
match Pdf.lookup_direct pdf "/Subtype" annot with
| Some (Pdf.Name "/FileAttachment") -> true
| _ -> false)
let fsannots = option_map (Pdf.lookup_direct pdf "/FS") efannots in
iter (dump_attachment out pdf) (map (fun x -> 0, x) fsannots)
(* Dump both document-level and page-level attached files to file, using their file names *)
(* Dump document-level attachments, then the attachments of every page, into
   the directory out. An exception is turned into an error whose message
   includes the exception text.
   NOTE(review): the try and with keywords around the body seem to be missing
   from this copy; verify against the repository. *)
let dump_attached_files pdf out =
dump_attached_document pdf out;
iter (dump_attached_page pdf out) (Pdfpage.pages_of_pagetree pdf)
e -> error (Printf.sprintf "Couldn't dump attached files: %s\n" (Printexc.to_string e))
(* Remove from the /XObject dictionary of the page resources every entry
   whose name is not used by a Do operator in the page content. The page is
   returned with the pruned dictionary stored back under /XObject. The n
   argument is not used in the visible code.
   NOTE(review): the option_map line before the Do selector and some in
   keywords seem to be missing from this copy; verify against the
   repository. *)
let remove_unused_resources_page pdf n page =
let xobjects, all_names =
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
| Some (Pdf.Dictionary d) -> Pdf.Dictionary d, map fst d
| _ -> Pdf.Dictionary [], []
let names_to_keep =
(function Pdfops.Op_Do n -> Some n | _ -> None)
(Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content)
let names_to_remove = lose (mem' names_to_keep) all_names in
let xobjdict = fold_left (Pdf.remove_dict_entry) xobjects names_to_remove in
{page with Pdfpage.resources = Pdf.add_dict_entry page.Pdfpage.resources "/XObject" xobjdict}
(* Apply remove_unused_resources_page to pages 1 .. Pdfpage.endpage pdf. *)
let remove_unused_resources pdf =
  let all_pages = ilist 1 (Pdfpage.endpage pdf) in
  let per_page = ppstub (remove_unused_resources_page pdf) in
  process_pages per_page pdf all_pages
(* Print the integer [n] and the string [s], separated by a space and followed
   by a newline, on standard output. *)
let print_spot_colour n s =
  print_string (Printf.sprintf "%i %s\n" n s)
(* Print, one per line on standard output, the colourant name of every object
   of the form [/Separation name ...] where the name resolves (via Pdf.direct)
   to a PDF name. Other objects are ignored.
   NOTE(review): the traversal function and its final [pdf] argument were
   missing from this block; Pdf.objiter (iterate a function over object
   number and object) is assumed here - confirm against the Pdf module. *)
let list_spot_colours pdf =
  Pdf.objiter
    (fun _ obj ->
       match obj with
         Pdf.Array (Pdf.Name "/Separation"::x::_) ->
           begin match Pdf.direct pdf x with
             Pdf.Name col -> Printf.printf "%s\n" col
           | _ -> ()
           end
       | _ -> ())
    pdf
(* Indent bookmarks in each file by one and add a title bookmark pointing to
   the first page. When [use_title] is true the title text comes from the
   document's /Title (the XMP value, unless that is empty, in which case the
   info-dictionary value); otherwise it is the base name of [filename].
   Calls [error] if page 1 has no object number. *)
let add_bookmark_title filename use_title pdf =
  let title =
    if use_title then
      match get_info_utf8 pdf "/Title", get_xmp_info pdf "/Title" with
        "", x | x, "" | _, x -> x
    else
      Filename.basename filename
  in
  let marks = Pdfmarks.read_bookmarks pdf in
  let page1objnum =
    match Pdfpage.page_object_number pdf 1 with
      None -> error "add_bookmark_title: page not found"
    | Some x -> x
  in
  (* New level-0 bookmark first, then the existing ones one level deeper. *)
  let newmarks =
    {Pdfmarks.level = 0;
     Pdfmarks.text = title;
     Pdfmarks.target = Pdfdest.XYZ (Pdfdest.PageObject page1objnum, None, None, None);
     Pdfmarks.isopen = false}
    ::map (function m -> {m with Pdfmarks.level = m.Pdfmarks.level + 1}) marks
  in
  Pdfmarks.add_bookmarks newmarks pdf
(* Rewrite the bookmarks of [pdf] so that a bookmark is marked open exactly
   when its level is strictly less than [n]. *)
let bookmarks_open_to_level n pdf =
  let marks = Pdfmarks.read_bookmarks pdf in
  let newmarks =
    map
      (fun m -> {m with Pdfmarks.isopen = m.Pdfmarks.level < n})
      marks
  in
  Pdfmarks.add_bookmarks newmarks pdf
(* Build a new document containing [pages] copies of a blank page of size
   [pagesize]. Each page has a single empty content stream and an empty
   resources dictionary. *)
let create_pdf pages pagesize =
  let page =
    {(Pdfpage.blankpage pagesize) with
       Pdfpage.content = [Pdfops.stream_of_ops []];
       Pdfpage.resources = Pdf.Dictionary []}
  in
  let pdf, pageroot = Pdfpage.add_pagetree (many page pages) (Pdf.empty ()) in
  Pdfpage.add_root pageroot [] pdf
2021-11-15 09:30:11 -08:00
(* Remove characters which might not make good filenames. When [encoding] is
   [Raw] the string is returned unchanged. Otherwise the listed punctuation
   characters and control characters (code < 32) are removed, as are
   characters with code > 126 unless [encoding] is [Stripped]; finally a
   single leading '.' is dropped. *)
let remove_unsafe_characters encoding s =
  if encoding = Raw then s else
    let chars =
      lose
        (function x ->
           match x with
             '/' | '?' | '<' | '>' | '\\' | ':' | '*' | '|' | '\"' | '^' | '+' | '=' -> true
           | x when int_of_char x < 32 || (int_of_char x > 126 && encoding <> Stripped) -> true
           | _ -> false)
        (explode s)
    in
    match chars with
    | '.'::more -> implode more
    | chars -> implode chars
(* Text of the first bookmark in [marks] whose target is page [n] and whose
   level is at most [splitlevel], passed through remove_unsafe_characters.
   Returns "" when no bookmark qualifies. The final argument is ignored. *)
let get_bookmark_name encoding pdf marks splitlevel n _ =
  let refnums = Pdf.page_reference_numbers pdf in
  let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
  let starts_section mark =
    n = Pdfpage.pagenumber_of_target ~fastrefnums pdf mark.Pdfmarks.target
    && mark.Pdfmarks.level <= splitlevel
  in
  match keep starts_section marks with
  | [] -> ""
  | first::_ -> remove_unsafe_characters encoding first.Pdfmarks.text
(* Expand the '@' escapes in the output specification [s]:
     @F means filename without extension
     @N means sequence number with no padding
     @S means start page of this section
     @E means end page of this section
     @B means bookmark name at start page
   For @N, @S and @E, a run of k further '@' characters directly after the
   escape selects zero padding to width k (k = 0 or 1 gives no padding).
   Raises Pdf.PDFError when k is greater than 8. All other characters are
   copied unchanged. *)
let process_others encoding marks pdf splitlevel filename sequence startpage endpage s =
  (* Count the leading '@' characters; return the count and the remainder. *)
  let rec find_ats p = function
    '@'::r -> find_ats (p + 1) r
  | r -> (p, r)
  in
  let string_of_int_width w i =
    if w < 0 then raise (Pdf.PDFError "width of field too narrow")
    else if w > 8 then raise (Pdf.PDFError "width of field too broad") else
      let formats =
        [|format_of_string "%i";
          format_of_string "%i";
          format_of_string "%02i";
          format_of_string "%03i";
          format_of_string "%04i";
          format_of_string "%05i";
          format_of_string "%06i";
          format_of_string "%07i";
          format_of_string "%08i"|]
      in
      Printf.sprintf formats.(w) i
  in
  (* Shared handling of @N, @S and @E: read the width, push the formatted
     number (reversed) onto the accumulator, return what is left to scan. *)
  let numbered value t prev =
    let width, rest = find_ats 0 t in
    (rev (explode (string_of_int_width width value)) @ prev, rest)
  in
  (* [prev] accumulates the output characters in reverse order. *)
  let rec procss prev = function
    | [] -> rev prev
    | '@'::'F'::t -> procss (rev (explode filename) @ prev) t
    | '@'::'N'::t ->
        let prev, rest = numbered sequence t prev in
        procss prev rest
    | '@'::'S'::t ->
        let prev, rest = numbered startpage t prev in
        procss prev rest
    | '@'::'E'::t ->
        let prev, rest = numbered endpage t prev in
        procss prev rest
    | '@'::'B'::t -> procss (rev (explode (get_bookmark_name encoding pdf marks splitlevel startpage pdf)) @ prev) t
    | h::t -> procss (h::prev) t
  in
  implode (procss [] (explode s))
(* Build an output name from [spec]. The first run of '%' characters in
   [spec], if any, is replaced by the number [n] written in exactly as many
   characters as the run is long (left-padded with '0', or shortened with
   [drop] when it has more digits). The result is then passed to
   process_others to expand the '@' escapes. *)
let name_of_spec encoding marks (pdf : Pdf.t) splitlevel spec n filename startpage endpage =
  let fill l n =
    let chars = explode (string_of_int n) in
    if length chars > l
      then implode (drop chars (length chars - l))
      else implode ((many '0' (l - length chars)) @ chars)
  in
  let chars = explode spec in
  let before, including = cleavewhile (neq '%') chars in
  let percents, after = cleavewhile (eq '%') including in
  if percents = []
    then
      process_others encoding marks pdf splitlevel filename n startpage endpage spec
    else
      process_others encoding marks pdf splitlevel filename n startpage endpage
        (implode before ^ fill (length percents) n ^ implode after)
(* Extract Images. *)

(* Write a binary PNM ("P6") image to [channel]: the header "P6 w h 255"
   followed by a newline, then w * h * 3 bytes read consecutively from [s]
   starting at position 0. *)
let pnm_to_channel_24 channel w h s =
  let white () = output_char channel ' '
  and newline () = output_char channel '\n'
  (* Stdlib rather than Pervasives: Pervasives is deprecated and absent from
     OCaml 5. *)
  and output_string = Stdlib.output_string channel in
  output_string "P6";
  white ();
  output_string (string_of_int w);
  white ();
  output_string (string_of_int h);
  white ();
  output_string "255";
  newline ();
  let pos = ref 0 in
  for _y = 1 to h do
    for _x = 1 to w * 3 do
      output_byte channel (bget s !pos);
      incr pos
    done
  done
(* Write all the bytes of [stream] to a new binary file called [name]. If a
   write fails, the channel is closed before the exception is re-raised so
   that the file descriptor is not leaked. *)
let write_stream name stream =
  let fh = open_out_bin name in
  try
    for x = 0 to bytes_size stream - 1 do
      output_byte fh (bget stream x)
    done;
    close_out fh
  with
    e -> close_out_noerr fh; raise e
(* Write one image to disk using [name] as the file name stem.
   JPEG, JPEG2000 and JBIG2 data is written as-is to name.jpg / name.jpx /
   name.jbig2. Raw 24bpp data is written to name.pnm and then converted to
   name.png with the program at [path_to_p2p] or, if that is "", the one at
   [path_to_im]; the .pnm file is removed after the conversion attempt. If
   both paths are "", a message is printed on stderr and the .pnm is left in
   place. Any other image type only produces a message on stderr. *)
let write_image path_to_p2p path_to_im pdf resources name image =
  match Pdfimage.get_image_24bpp pdf resources image with
  | Pdfimage.JPEG (stream, _) -> write_stream (name ^ ".jpg") stream
  | Pdfimage.JPEG2000 (stream, _) -> write_stream (name ^ ".jpx") stream
  | Pdfimage.JBIG2 (stream, _) -> write_stream (name ^ ".jbig2") stream
  | Pdfimage.Raw (w, h, Pdfimage.BPP24, stream) ->
      let pnm = name ^ ".pnm" in
      let png = name ^ ".png" in
      let fh = open_out_bin pnm in
      pnm_to_channel_24 fh w h stream;
      close_out fh;
      begin match path_to_p2p with
      | "" ->
          begin match path_to_im with
            "" -> Printf.eprintf "Neither pnm2png nor imagemagick found. Specify with -p2p or -im\n%!"
          | _ ->
              begin match
                Sys.command (Filename.quote_command path_to_im [pnm; png])
              with
                0 -> Sys.remove pnm
              | _ ->
                  (* This path is selected with -im, so point the user there. *)
                  Printf.eprintf "Call to imagemagick failed: did you specify -im correctly?\n%!";
                  Sys.remove pnm
              end
          end
      | _ ->
          begin match
            Sys.command (Filename.quote_command path_to_p2p ~stdout:png ["-gamma"; "0.45"; "-quiet"; pnm])
          with
          | 0 -> Sys.remove pnm
          | _ ->
              Printf.eprintf "Call to pnmtopng failed: did you specify -p2p correctly?\n%!";
              Sys.remove pnm
          end
      end
  | _ ->
      Printf.eprintf "Unsupported image type when extracting image %s %!" name
(* Object numbers of the images recorded by the image extraction functions
   below, used to skip images which have already been seen. *)
let written : int list ref = ref []
(* Write each image in [images] with write_image. The name for an image is
   produced by name_of_spec from the spec "<stem>-p<pnum>" and the current
   value of the counter [serial], which is incremented once per image. *)
let extract_images_inner path_to_p2p path_to_im encoding serial pdf resources stem pnum images =
  let names =
    map
      (fun _ ->
         name_of_spec
           encoding [] pdf 0 (stem ^ "-p" ^ string_of_int pnum)
           (let r = !serial in serial := !serial + 1; r) "" 0 0)
      (indx images)
  in
  iter2 (write_image path_to_p2p path_to_im pdf resources) names images
(* Extract the images found directly in the /XObject dictionary of the
   /Resources of the form xobject [form]. Images referenced indirectly whose
   object number is already in !written are skipped; when [dedup] or
   [dedup_per_page] is set, the object numbers of the remaining images are
   added to !written. Forms nested inside [form] are not visited. *)
let extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum form =
  let resources =
    match Pdf.lookup_direct pdf "/Resources" form with
      Some (Pdf.Dictionary d) -> Pdf.Dictionary d
    | _ -> Pdf.Dictionary []
  in
  let images =
    let xobjects =
      match Pdf.lookup_direct pdf "/XObject" resources with
      | Some (Pdf.Dictionary elts) -> map snd elts
      | _ -> []
    in
    (* Remove any already in !written. Add any remaining to !written, if !args.dedup or !args.dedup_page *)
    let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in
    let _, images = List.partition (function Pdf.Indirect n -> mem n !written | _ -> false) images in
    if dedup || dedup_per_page then
      written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written;
    images
  in
  extract_images_inner path_to_p2p path_to_im encoding serial pdf resources stem pnum images
(* Extract the images on the pages of [pdf] whose page numbers are in
   [range], writing them with names based on [stem]. Images are taken from
   each page's /XObject resources and from the form xobjects found there.
   With [dedup], an indirectly referenced image is written at most once per
   call; with [dedup_per_page], at most once per page.
   NOTE(review): the page number used in file names is the position of the
   page within the selected pages (indx pages), not its number in the
   document - confirm this is intended. *)
let extract_images path_to_p2p path_to_im encoding dedup dedup_per_page pdf range stem =
  if dedup || dedup_per_page then written := [];
  let pdf_pages = Pdfpage.pages_of_pagetree pdf in
  let pages =
    option_map
      (function (i, pdf_pages) -> if mem i range then Some pdf_pages else None)
      (combine (indx pdf_pages) pdf_pages)
  in
  (* One counter shared by all pages and forms. *)
  let serial = ref 0 in
  iter2
    (fun page pnum ->
       if dedup_per_page then written := [];
       let xobjects =
         match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
         | Some (Pdf.Dictionary elts) -> map snd elts
         | _ -> []
       in
       let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in
       (* Drop images whose object number has already been recorded. *)
       let _, images = List.partition (function Pdf.Indirect n -> mem n !written | _ -> false) images in
       if dedup || dedup_per_page then
         written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written;
       let forms = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Form")) xobjects in
       extract_images_inner path_to_p2p path_to_im encoding serial pdf page.Pdfpage.resources stem pnum images;
       iter (extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum) forms)
    pages
    (indx pages)
2021-11-15 11:17:15 -08:00