cpdf-source/cpdf.ml

3418 lines
134 KiB
OCaml
Raw Normal View History

2013-08-20 16:32:57 +02:00
(* CPDF Core routines *)
open Pdfutil
open Pdfio
2021-10-02 13:22:59 +02:00
open Cpdferror
2013-08-20 16:32:57 +02:00
2021-11-15 20:17:15 +01:00
type color =
Grey of float
| RGB of float * float * float
| CYMK of float * float * float * float
2013-08-20 16:32:57 +02:00
(* Return page label at pdf page num, or page number in arabic if no label *)
let pagelabel pdf num =
Pdfpagelabels.pagelabeltext_of_pagenumber
num
(Pdfpagelabels.complete (Pdfpagelabels.read pdf))
2021-06-10 17:09:59 +02:00
let rec process_text time text m =
2013-08-20 16:32:57 +02:00
match m with
2021-06-10 17:09:59 +02:00
| [] -> Cpdfstrftime.strftime ~time text
| (s, r)::t -> process_text time (string_replace_all_lazy s r text) t
2013-08-20 16:32:57 +02:00
(* For uses of process_pages which don't need to deal with matrices, this
function transforms into one which returns the identity matrix *)
let ppstub f n p = (f n p, n, Pdftransform.i_matrix)
2013-08-20 16:32:57 +02:00
let process_pages f pdf range =
let pages = Pdfpage.pages_of_pagetree pdf in
let pages', pagenumbers, matrices = (* new page objects, page number, matrix *)
split3
(map2
(fun n p -> if mem n range then f n p else (p, n, Pdftransform.i_matrix))
(ilist 1 (length pages))
pages)
2013-08-20 16:32:57 +02:00
in
Pdfpage.change_pages ~matrices:(combine pagenumbers matrices) true pdf pages'
2013-08-20 16:32:57 +02:00
let iter_pages f pdf range =
let pages = Pdfpage.pages_of_pagetree pdf in
iter2
(fun n p -> if mem n range then f n p)
(ilist 1 (length pages))
pages
let map_pages f pdf range =
let pages = Pdfpage.pages_of_pagetree pdf in
option_map2
(fun n p -> if mem n range then Some (f n p) else None)
(ilist 1 (length pages))
pages
2016-04-03 20:46:54 +02:00
(* Add stack operators to a content stream to ensure it is composeable. On
-fast, we don't check for Q deficit, assuming PDF is ISO. *)
let protect fast pdf resources content =
let deficit =
if fast then 0 else
let ops = Pdfops.parse_operators pdf resources content in
let qs = length (keep (eq Pdfops.Op_q) ops) in
let bigqs = length (keep (eq Pdfops.Op_Q) ops) in
let deficit = if qs > bigqs then qs - bigqs else 0 in
2021-07-23 17:26:07 +02:00
if deficit <> 0 then Printf.eprintf "Q Deficit was nonzero. Fixing. %i\n%!" deficit;
2016-04-03 20:46:54 +02:00
deficit
in
let addstream ops = Pdf.addobj pdf (Pdfops.stream_of_ops ops) in
let q = addstream [Pdfops.Op_q] in
let qs = addstream (many Pdfops.Op_Q deficit @ [Pdfops.Op_Q]) in
[Pdf.Indirect q] @ content @ [Pdf.Indirect qs]
2013-08-20 16:32:57 +02:00
2021-10-26 17:18:09 +02:00
(* If a cropbox exists, make it the mediabox. If not, change nothing. *)
let copy_cropbox_to_mediabox pdf range =
process_pages
(ppstub (fun _ page ->
match Pdf.lookup_direct pdf "/CropBox" page.Pdfpage.rest with
| Some pdfobject -> {page with Pdfpage.mediabox = Pdf.direct pdf pdfobject}
| None -> page))
pdf
range
2013-08-20 16:32:57 +02:00
(* Union two resource dictionaries from the same PDF. *)
let combine_pdf_resources pdf a b =
let a_entries =
match a with
| Pdf.Dictionary entries -> entries
| _ -> []
in let b_entries =
match b with
| Pdf.Dictionary entries -> entries
| _ -> []
in
let resource_keys =
["/Font"; "/ExtGState"; "/ColorSpace"; "/Pattern";
"/Shading"; "/XObject"; "/Properties"]
in
let combine_entries key =
let a_entries =
match Pdf.lookup_direct pdf key a with
| Some (Pdf.Dictionary d) -> d
| _ -> []
in let b_entries =
match Pdf.lookup_direct pdf key b with
| Some (Pdf.Dictionary d) -> d
| _ -> []
in
2016-04-03 20:51:47 +02:00
if a_entries = [] && b_entries = [] then
None
else
Some (key, Pdf.Dictionary (a_entries @ b_entries))
2013-08-20 16:32:57 +02:00
in
2016-04-03 20:51:47 +02:00
let unknown_keys_a = lose (fun (k, _) -> mem k resource_keys) a_entries in
let unknown_keys_b = lose (fun (k, _) -> mem k resource_keys) b_entries in
let combined_known_entries = option_map combine_entries resource_keys in
2021-10-26 16:36:56 +02:00
fold_left
(fun dict (k, v) -> Pdf.add_dict_entry dict k v)
(Pdf.Dictionary [])
(unknown_keys_a @ unknown_keys_b @ combined_known_entries)
2013-08-20 16:32:57 +02:00
(* \section{Remove bookmarks} *)
(* \section{Add bookmarks} *)
let read_lines input =
let lines = ref [] in
try
while true do
let c = read_line input in
lines =| c
done; []
with
_ -> rev !lines
(* Verify a list of bookmarks. Positive jumps of > 1 not allowed, no numbers
smaller than 0. *)
2017-05-29 15:39:01 +02:00
let rec verify_bookmarks pdf lastlevel fastrefnums endpage = function
2013-08-20 16:32:57 +02:00
| [] -> true
| {Pdfmarks.level = level; Pdfmarks.target = target}::more ->
2017-05-29 15:39:01 +02:00
let page = Pdfpage.pagenumber_of_target pdf ~fastrefnums target in
2013-08-20 16:32:57 +02:00
level < lastlevel + 2 &&
level >= 0 &&
page <= endpage &&
page >= 0 &&
2017-05-29 15:39:01 +02:00
verify_bookmarks pdf level fastrefnums endpage more
2017-05-28 20:19:17 +02:00
let verify_bookmarks pdf lastlevel endpage marks =
let refnums = Pdf.page_reference_numbers pdf in
2017-05-29 15:39:01 +02:00
let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
2019-07-02 19:37:08 +02:00
match marks with
| [] -> true
| m::more -> m.Pdfmarks.level = 0 && verify_bookmarks pdf lastlevel fastrefnums endpage more
2013-08-20 16:32:57 +02:00
(* Parse a line of the bookmarks file. *)
(* Un-escape things which are escaped. Quotes, newlines and backslashes *)
let rec fixup_characters prev = function
| [] -> rev prev
| '\\'::'\\'::t -> fixup_characters ('\\'::prev) t
| '\\'::'"'::t -> fixup_characters ('"'::prev) t
| '\\'::'n'::t -> fixup_characters ('\n'::prev) t
2013-08-20 16:32:57 +02:00
| h::t -> fixup_characters (h::prev) t
2019-03-19 15:03:28 +01:00
let debug_bookmark_string s =
Printf.printf "STR: %s\n" s
(* If optionaldest = [Pdfgenlex.LexString s], we parse the string, convert the
* integer to an indirect of the real page target, and then put it in. *)
2021-10-27 14:16:55 +02:00
let target_of_markfile_obj pdf i' pdfobj =
(*Printf.printf "Parsed %s\n" (Pdfwrite.string_of_pdf pdfobj);*)
match pdfobj with
Pdf.Array (Pdf.Integer x::more) ->
let pageobjnum = Pdfpage.page_object_number pdf i' in
begin match pageobjnum with
None ->
raise (Pdf.PDFError "bookmark_of_data: page obj num not found")
| Some p ->
Pdfdest.read_destination pdf (Pdf.Array (Pdf.Indirect p::more))
end
(* Need to deal with "null", "(string)", and "<<other thing like action" *)
| Pdf.Null -> Pdfdest.NullDestination
| Pdf.String s -> Pdfdest.read_destination pdf (Pdf.String s)
| x -> Pdfdest.Action x
let target_of_markfile_target pdf i' = function
| [Pdfgenlex.LexString s] ->
let pdfobj = Pdfread.parse_single_object s in
target_of_markfile_obj pdf i' pdfobj
| _ -> Pdfpage.target_of_pagenumber pdf i'
let bookmark_of_data pdf i s i' isopen optionaldest =
2019-03-19 15:03:28 +01:00
(*debug_bookmark_string s;
debug_bookmark_string (implode (fixup_characters [] (explode s)));
debug_bookmark_string (Pdftext.pdfdocstring_of_utf8 (implode (fixup_characters [] (explode s))));*)
{Pdfmarks.level = i;
Pdfmarks.text = Pdftext.pdfdocstring_of_utf8 (implode (fixup_characters [] (explode s)));
2021-10-27 14:16:55 +02:00
Pdfmarks.target = target_of_markfile_target pdf i' optionaldest;
Pdfmarks.isopen = isopen}
2021-10-27 14:16:55 +02:00
let target_of_json_target pdf pagenumber target =
target_of_markfile_obj pdf pagenumber (Cpdfjson.object_of_json target)
let mark_of_json pdf = function
| `Assoc [("level", `Int level);
("text", `String text);
("page", `Int pagenumber);
("open", `Bool openstatus);
("target", target)] ->
{Pdfmarks.level = level;
2021-10-27 16:29:25 +02:00
Pdfmarks.text = Pdftext.pdfdocstring_of_utf8 text;
2021-10-27 14:16:55 +02:00
Pdfmarks.target = target_of_json_target pdf pagenumber target;
Pdfmarks.isopen = openstatus}
| _ -> error "malformed mark in mark_of_json"
let marks_of_json pdf = function
| `List ms -> map (mark_of_json pdf) ms
| _ -> error "top level of JSON boomark file not a list"
let parse_bookmark_file_json verify pdf i =
let module J = Cpdfyojson.Safe in
try
let json =
match i.Pdfio.caml_channel with
| Some ch -> J.from_channel ch
| None ->
let content = Pdfio.string_of_bytes (Pdfio.bytes_of_input i 0 i.Pdfio.in_channel_length) in
J.from_string content
in
let marks = marks_of_json pdf json in
if verify then
if verify_bookmarks pdf 0 (Pdfpage.endpage pdf) marks then marks else
error "Bad bookmark file (References non-existant pages or is malformed)"
else
marks
with
e ->
error (Printf.sprintf "Malformed JSON bookmark file (%s)" (Printexc.to_string e))
2021-10-27 12:44:30 +02:00
2013-08-20 16:32:57 +02:00
let parse_bookmark_file verify pdf input =
let currline = ref 0 in
2013-08-20 16:32:57 +02:00
try
let lines = Pdfio.read_lines input in
let currline = ref 0 in
let bookmarks = ref [] in
iter
(function line ->
match
incr currline;
Pdfgenlex.lex_string line
with
| Pdfgenlex.LexInt i::Pdfgenlex.LexString s::Pdfgenlex.LexInt i'::Pdfgenlex.LexName "open"::optionaldest ->
bookmarks =| bookmark_of_data pdf i s i' true optionaldest
| Pdfgenlex.LexInt i::Pdfgenlex.LexString s::Pdfgenlex.LexInt i'::optionaldest ->
bookmarks =| bookmark_of_data pdf i s i' false optionaldest
2013-08-20 16:32:57 +02:00
| [] -> () (* ignore blank lines *)
| _ ->
error ("Bad bookmark file, line " ^ (string_of_int !currline)))
lines;
let bookmarks = rev !bookmarks in
if verify then
if verify_bookmarks pdf 0 (Pdfpage.endpage pdf) bookmarks
then bookmarks
else
error
"Bad bookmark file (References non-existant pages or is malformed)"
else
bookmarks
with
e ->
error
(Printf.sprintf
"Bad bookmark file (syntax) at line %i (error was %s)"
!currline
(Printexc.to_string e))
2013-08-20 16:32:57 +02:00
2021-10-27 12:44:30 +02:00
let add_bookmarks ~json verify input pdf =
let parsed =
(if json then parse_bookmark_file_json else parse_bookmark_file) verify pdf input in
2013-08-20 16:32:57 +02:00
(*iter (fun b -> flprint (Pdfmarks.string_of_bookmark b); flprint "\n") parsed;*)
Pdfmarks.add_bookmarks parsed pdf
(* List bookmarks *)
let output_string_of_target pdf fastrefnums x =
match Pdfdest.pdfobject_of_destination x with
2019-07-06 16:09:25 +02:00
| Pdf.Array (_::more) ->
let a =
Pdf.Array (Pdf.Integer (Pdfpage.pagenumber_of_target ~fastrefnums pdf x)::more)
in
"\"" ^ Pdfwrite.string_of_pdf a ^ "\""
| x -> "\"" ^ Pdfwrite.string_of_pdf x ^ "\""
2013-08-20 16:32:57 +02:00
2021-10-26 20:41:14 +02:00
let json_of_target pdf fastrefnums x =
match Pdfdest.pdfobject_of_destination x with
| Pdf.Array (_::more) ->
let a =
Pdf.Array (Pdf.Integer (Pdfpage.pagenumber_of_target ~fastrefnums pdf x)::more)
in
Cpdfjson.json_of_object pdf (fun _ -> ()) false false a
| x -> Cpdfjson.json_of_object pdf (fun _ -> ()) false false x
let output_json_marks ch calculate_page_number pdf fastrefnums marks =
let module J = Cpdfyojson.Safe in
let json_of_mark m =
`Assoc
[("level", `Int m.Pdfmarks.level);
2021-10-27 16:29:25 +02:00
("text", `String (Pdftext.utf8_of_pdfdocstring m.Pdfmarks.text));
2021-10-26 20:41:14 +02:00
("page", `Int (calculate_page_number m));
("open", `Bool m.Pdfmarks.isopen);
("target", json_of_target pdf fastrefnums m.Pdfmarks.target)]
in
let json = `List (map json_of_mark marks) in
J.pretty_to_channel ch json
2021-10-27 12:44:30 +02:00
(* List the bookmarks in the given range to the given output *)
2021-10-26 20:41:14 +02:00
let list_bookmarks ~json encoding range pdf output =
2017-05-28 20:58:30 +02:00
let process_stripped escaped =
let b = Buffer.create 200 in
2020-03-04 19:50:32 +01:00
iter
2017-05-28 20:58:30 +02:00
(fun x ->
if x <= 127 then Buffer.add_char b (char_of_int x))
escaped;
Buffer.contents b
in
2013-08-20 16:32:57 +02:00
let process_string s =
2017-05-28 20:45:02 +02:00
let rec replace c x y = function
2013-08-20 16:32:57 +02:00
| [] -> []
2017-05-28 20:45:02 +02:00
| h::t when h = c -> x::y::replace c x y t
| h::t -> h::replace c x y t
2013-08-20 16:32:57 +02:00
in
(* Convert to UTF8, raw, or stripped, and escape backslashed and quotation marks *)
let codepoints = Pdftext.codepoints_of_pdfdocstring s in
let escaped =
let bs = int_of_char '\\'
and nl = int_of_char '\n'
and n = int_of_char 'n'
and q = int_of_char '\"' in
2018-04-30 18:42:43 +02:00
replace q bs q (replace nl bs n (replace bs bs bs codepoints))
2013-08-20 16:32:57 +02:00
in
match encoding with
2021-12-19 13:55:06 +01:00
| Cpdfmetadata.UTF8 -> Pdftext.utf8_of_codepoints escaped
| Cpdfmetadata.Stripped -> process_stripped escaped
| Cpdfmetadata.Raw -> s
2013-08-20 16:32:57 +02:00
in
let bookmarks = Pdfmarks.read_bookmarks pdf in
2017-05-28 20:19:17 +02:00
let refnums = Pdf.page_reference_numbers pdf in
2017-05-29 15:02:34 +02:00
let rangetable = hashset_of_list range in
let range_is_all = range = ilist 1 (Pdfpage.endpage pdf) in
2017-05-29 15:02:34 +02:00
let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
(* Find the pagenumber of each bookmark target. If it is in the range,
* keep that bookmark. Also keep the bookmark if its target is the null
* destination. *)
2013-08-20 16:32:57 +02:00
let inrange =
keep
(function x ->
range_is_all ||
x.Pdfmarks.target = Pdfdest.NullDestination ||
(match x.Pdfmarks.target with Pdfdest.NamedDestinationElsewhere _ -> true | _ -> false) ||
2017-05-29 15:02:34 +02:00
Hashtbl.mem rangetable (Pdfpage.pagenumber_of_target ~fastrefnums pdf x.Pdfmarks.target)) bookmarks
2013-08-20 16:32:57 +02:00
in
2019-07-06 16:09:25 +02:00
let calculate_page_number mark =
(* Some buggy PDFs use integers for page numbers instead of page
* object references. Adobe Reader and Preview seem to support
* this, for presumably historical reasons. So if we see a
* OtherDocPageNumber (which is what Pdfdest parses these as,
* because that's what they are legitimately, we use this as the
* page number. It is zero based, though, and we are one-based, so
* we add one. Pdfpage.pagenumber_of_target has been modified to support this.*)
Pdfpage.pagenumber_of_target ~fastrefnums pdf mark.Pdfmarks.target
in
2021-10-26 20:41:14 +02:00
if json then
output_json_marks stdout calculate_page_number pdf fastrefnums inrange
else
iter
(function mark ->
output.Pdfio.output_string
(Printf.sprintf "%i \"%s\" %i%s %s\n"
mark.Pdfmarks.level
(process_string mark.Pdfmarks.text)
(calculate_page_number mark)
(if mark.Pdfmarks.isopen then " open" else "")
(output_string_of_target pdf fastrefnums mark.Pdfmarks.target)))
inrange
2013-08-20 16:32:57 +02:00
(* o is the stamp, u is the main pdf page *)
(* \section{Split at bookmarks} *)
let get_bookmark_name pdf marks splitlevel n _ =
2017-05-28 20:19:17 +02:00
let refnums = Pdf.page_reference_numbers pdf in
2017-05-29 15:39:01 +02:00
let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
match keep (function m -> n = Pdfpage.pagenumber_of_target ~fastrefnums pdf m.Pdfmarks.target && m.Pdfmarks.level <= splitlevel) marks with
2021-12-19 14:38:27 +01:00
| {Pdfmarks.text = title}::_ -> Cpdfattach.remove_unsafe_characters Cpdfmetadata.UTF8 title
2013-08-20 16:32:57 +02:00
| _ -> ""
(* Find the stem of a filename *)
(*let stem s =
implode (rev (tail_no_fail (dropwhile (neq '.') (rev (explode (Filename.basename s))))))*)
2013-08-20 16:32:57 +02:00
(* Return list, in order, a *set* of page numbers of bookmarks at a given level *)
let bookmark_pages level pdf =
2017-05-28 20:19:17 +02:00
let refnums = Pdf.page_reference_numbers pdf in
2017-05-29 15:39:01 +02:00
let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
2013-08-20 16:32:57 +02:00
setify_preserving_order
(option_map
2017-05-29 15:39:01 +02:00
(function l when l.Pdfmarks.level = level -> Some (Pdfpage.pagenumber_of_target ~fastrefnums pdf l.Pdfmarks.target) | _ -> None)
2013-08-20 16:32:57 +02:00
(Pdfmarks.read_bookmarks pdf))
(* Called from cpdflib.ml - different from above *)
let split_on_bookmarks pdf level =
let points = lose (eq 0) (map pred (bookmark_pages level pdf))
in let pdf_pages = Pdfpage.pages_of_pagetree pdf in
let ranges = splitat points (indx pdf_pages) in
map (fun rs -> Pdfpage.pdf_of_pages pdf rs) ranges
(* Output information for each page *)
let output_page_info pdf range =
2013-08-20 16:32:57 +02:00
let pages = Pdfpage.pages_of_pagetree pdf
and labels = Pdfpagelabels.read pdf in
let getbox page box =
if box = "/MediaBox" then
match page.Pdfpage.mediabox with
| Pdf.Array [a; b; c; d] ->
Printf.sprintf "%f %f %f %f"
(Pdf.getnum a) (Pdf.getnum b) (Pdf.getnum c) (Pdf.getnum d)
| _ -> ""
else
match Pdf.lookup_direct pdf box page.Pdfpage.rest with
| Some (Pdf.Array [a; b; c; d]) ->
Printf.sprintf "%f %f %f %f"
(Pdf.getnum a) (Pdf.getnum b) (Pdf.getnum c) (Pdf.getnum d)
| _ -> ""
and rotation page =
Pdfpage.int_of_rotation page.Pdfpage.rotate
in
2020-03-04 19:50:32 +01:00
iter
(fun pnum ->
let page = select pnum pages in
Printf.printf "Page %i:\n" pnum;
Printf.printf "Label: %s\n"
(try Pdfpagelabels.pagelabeltext_of_pagenumber pnum labels with Not_found -> "");
Printf.printf "MediaBox: %s\n" (getbox page "/MediaBox");
Printf.printf "CropBox: %s\n" (getbox page "/CropBox");
Printf.printf "BleedBox: %s\n" (getbox page "/BleedBox");
Printf.printf "TrimBox: %s\n" (getbox page "/TrimBox");
Printf.printf "ArtBox: %s\n" (getbox page "/ArtBox");
Printf.printf "Rotation: %i\n" (rotation page))
range
2013-08-20 16:32:57 +02:00
(* Does the page have a defined box e.g "/CropBox" *)
let hasbox pdf page boxname =
let pages = Pdfpage.pages_of_pagetree pdf in
if page > length pages || page < 1 then raise (Failure "hasbox: bad page") else
let p = select page pages in
match Pdf.lookup_direct pdf boxname p.Pdfpage.rest with
| Some _ -> true
| _ -> false
2014-10-13 19:16:06 +02:00
2013-08-20 16:32:57 +02:00
2021-11-03 14:40:54 +01:00
(* List fonts *)
2013-08-20 16:32:57 +02:00
let list_font pdf page (name, dict) =
let subtype =
match Pdf.lookup_direct pdf "/Subtype" dict with
2021-11-03 14:40:54 +01:00
| Some (Pdf.Name n) -> Pdfwrite.string_of_pdf (Pdf.Name n)
2013-08-20 16:32:57 +02:00
| _ -> ""
in let basefont =
match Pdf.lookup_direct pdf "/BaseFont" dict with
2021-11-03 14:40:54 +01:00
| Some (Pdf.Name n) -> Pdfwrite.string_of_pdf (Pdf.Name n)
2013-08-20 16:32:57 +02:00
| _ -> ""
in let encoding =
match Pdf.lookup_direct pdf "/Encoding" dict with
2021-11-03 14:40:54 +01:00
| Some (Pdf.Name n) -> Pdfwrite.string_of_pdf (Pdf.Name n)
2013-08-20 16:32:57 +02:00
| _ -> ""
in
2021-11-03 14:40:54 +01:00
(page, name, subtype, basefont, encoding)
2013-08-20 16:32:57 +02:00
2021-11-03 17:51:28 +01:00
let list_fonts pdf range =
2013-08-20 16:32:57 +02:00
let pages = Pdfpage.pages_of_pagetree pdf in
flatten
(map
(fun (num, page) ->
2021-11-03 17:51:28 +01:00
if mem num range then
begin match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
| Some (Pdf.Dictionary fontdict) ->
map (list_font pdf num) fontdict
| _ -> []
end
else
[])
2013-08-20 16:32:57 +02:00
(combine (ilist 1 (length pages)) pages))
let string_of_font (p, n, s, b, e) =
Printf.sprintf "%i %s %s %s %s\n" p n s b e
2021-11-03 17:51:28 +01:00
let print_fonts pdf range =
2013-08-20 16:32:57 +02:00
flprint
2021-11-03 17:51:28 +01:00
(fold_left ( ^ ) "" (map string_of_font (list_fonts pdf range)))
2013-08-20 16:32:57 +02:00
(* \section{Superimpose text, page numbers etc.} *)
2021-11-01 18:09:58 +01:00
(* Process UTF8 text to /WinAnsiEncoding string (for standard 14) or whatever
is in the font (for existing fonts). *)
2021-12-02 21:04:14 +01:00
let charcodes_of_utf8 font s =
let extractor = Pdftext.charcode_extractor_of_font_real ~debug:false font in
2021-11-01 18:09:58 +01:00
let codepoints = Pdftext.codepoints_of_utf8 s in
2021-11-12 00:05:07 +01:00
let charcodes =
option_map
(fun codepoint ->
match extractor codepoint with
| Some cc -> Some cc
| None -> Printf.eprintf "Warning: character not found in font for unicode codepoint 0x%X\n" codepoint; None)
codepoints
in
implode (map char_of_int charcodes)
2013-08-20 16:32:57 +02:00
(* Process codepoints back to UTF8, assuming it came from UTF8 to start with *)
let utf8_of_winansi s =
let text_extractor =
2021-12-02 21:04:14 +01:00
Pdftext.text_extractor_of_font_real
(Pdftext.StandardFont (Pdftext.TimesRoman, Pdftext.WinAnsiEncoding))
2013-08-20 16:32:57 +02:00
in
let codepoints = Pdftext.codepoints_of_text text_extractor s in
Pdftext.utf8_of_codepoints codepoints
(* Get the width of some text in the given font *)
let width_of_text font text =
match font with
| Pdftext.SimpleFont {Pdftext.fontmetrics = Some fontmetrics} ->
begin try
fold_left ( +. ) 0. (map (fun c -> fontmetrics.(int_of_char c)) (explode text))
with
_ -> 0.
end
| _ -> 0.
type ops_metrics =
{metrics_text : string;
metrics_x : float;
metrics_y : float;
metrics_rot : float}
let ops_metrics : ops_metrics list ref = ref []
let ops_baseline_adjustment = ref 0.
let metrics_howmany () = length !ops_metrics
let metrics_text n =
utf8_of_winansi (select n !ops_metrics).metrics_text
let metrics_x n =
(select n !ops_metrics).metrics_x
let metrics_y n =
(select n !ops_metrics).metrics_y
let metrics_rot n =
(select n !ops_metrics).metrics_rot
let metrics_baseline_adjustment () = !ops_baseline_adjustment
2021-11-15 20:17:15 +01:00
let colour_op = function
| RGB (r, g, b) -> Pdfops.Op_rg (r, g, b)
| Grey g -> Pdfops.Op_g g
| CYMK (c, y, m, k) -> Pdfops.Op_k (c, y, m, k)
let colour_op_stroke = function
| RGB (r, g, b) -> Pdfops.Op_RG (r, g, b)
| Grey g -> Pdfops.Op_G g
| CYMK (c, y, m, k) -> Pdfops.Op_K (c, y, m, k)
2013-08-20 16:32:57 +02:00
let ops longest_w metrics x y rotate hoffset voffset outline linewidth unique_fontname unique_extgstatename colour fontsize text =
if metrics then
ops_metrics :=
{metrics_text = text; metrics_x = x -. hoffset; metrics_y = y -. voffset; metrics_rot = rotate}
::!ops_metrics;
[Pdfops.Op_q;
Pdfops.Op_BMC "/CPDFSTAMP";
Pdfops.Op_cm
(Pdftransform.matrix_of_transform
[Pdftransform.Translate (x -. hoffset, y -. voffset);
Pdftransform.Rotate ((0., 0.), rotate)]);
Pdfops.Op_BT;
] @
(if outline then [Pdfops.Op_w linewidth; Pdfops.Op_Tr 1] else [Pdfops.Op_Tr 0]) @
2021-11-15 20:17:15 +01:00
[colour_op colour; colour_op_stroke colour]
2013-08-20 16:32:57 +02:00
@
(match unique_extgstatename with None -> [] | Some n -> [Pdfops.Op_gs n])
@
[Pdfops.Op_Tf (unique_fontname, fontsize);
Pdfops.Op_Tj text;
Pdfops.Op_ET;
Pdfops.Op_EMC;
Pdfops.Op_Q]
2021-08-12 21:38:55 +02:00
type justification = LeftJustify | CentreJustify | RightJustify
2013-08-20 16:32:57 +02:00
(* Find the h-offset for justification based on the longest width, the current
width, the justification and the position. *)
2021-08-12 21:38:55 +02:00
let find_justification_offsets longest_w w position j =
let open Cpdfposition in
match j with
| LeftJustify ->
begin match position with
| TopLeft _ | Left _ | PosLeft _ | BottomLeft _ -> 0.
| Top _ | PosCentre _ | Bottom _ | Centre -> (longest_w -. w) /. 2.
| TopRight _ | BottomRight _ | PosRight _ | Right _ -> longest_w -. w
| Diagonal -> 0.
| ReverseDiagonal -> 0.
end
| RightJustify ->
begin match position with
| TopLeft _ | Left _ | PosLeft _ | BottomLeft _ -> ~-.(longest_w -. w)
| Top _ | PosCentre _ | Bottom _ | Centre -> ~-.((longest_w -. w) /. 2.)
| TopRight _ | BottomRight _ | PosRight _ | Right _ -> 0.
| Diagonal -> 0.
| ReverseDiagonal -> 0.
end
| CentreJustify ->
begin match position with
| TopLeft _ | Left _ | PosLeft _ | BottomLeft _ -> ~-.((longest_w -. w) /. 2.)
| Top _ | PosCentre _ | Bottom _ | Centre -> 0.
| TopRight _ | BottomRight _ | PosRight _ | Right _ -> (longest_w -. w) /. 2.
| Diagonal -> 0.
| ReverseDiagonal -> 0.
end
2013-08-20 16:32:57 +02:00
2014-09-08 18:55:14 +02:00
(* Lex an integer from the table *)
let extract_num header s =
match Pdfgenlex.lex_string (Hashtbl.find header s) with
[Pdfgenlex.LexInt i] -> Pdf.Integer i
| [Pdfgenlex.LexReal f] -> Pdf.Real f
| _ -> raise (Failure ("extract_num: " ^ s))
2014-09-08 18:55:14 +02:00
let extract_fontbbox header s =
let num = function
Pdfgenlex.LexInt i -> Pdf.Integer i
| Pdfgenlex.LexReal f -> Pdf.Real f
| _ -> raise (Failure "extract_fontbbox")
in
match Pdfgenlex.lex_string (Hashtbl.find header s) with
[a; b; c; d] -> [num a; num b; num c; num d]
| _ -> raise (Failure "extract_fontbbox")
let remove_slash s =
match explode s with
'/'::x -> implode x
| _ -> raise (Failure "remove_slash")
let extract_widths chars_and_widths =
2020-03-04 19:50:32 +01:00
let win_to_name = map (fun (x, y) -> (y, x)) Pdfglyphlist.name_to_win in
map
(fun x ->
try
let name = List.assoc x win_to_name in
let width = List.assoc (remove_slash name) chars_and_widths in
width
with
_ -> 0)
(ilist 0 255)
2014-09-08 18:55:14 +02:00
let make_font embed fontname =
2014-09-08 18:55:14 +02:00
let font = unopt (Pdftext.standard_font_of_name ("/" ^ fontname)) in
let header, width_data, _, chars_and_widths = Pdfstandard14.afm_data font in
let widths = extract_widths (list_of_hashtbl chars_and_widths) in
let flags = Pdfstandard14.flags_of_standard_font font in
let fontbbox = extract_fontbbox header "FontBBox" in
let italicangle = extract_num header "ItalicAngle" in
let ascent = try extract_num header "Ascender" with _ -> Pdf.Integer 0 in
let descent = try extract_num header "Descender" with _ -> Pdf.Integer 0 in
let capheight = try extract_num header "CapHeight" with _ -> Pdf.Integer 0 in
let stemv = Pdfstandard14.stemv_of_standard_font font in
2014-09-08 18:55:14 +02:00
let fontdescriptor =
Pdf.Dictionary
[("/Type", Pdf.Name "/FontDescriptor");
("/FontName", Pdf.Name ("/" ^ fontname));
("/Flags", Pdf.Integer flags);
("/FontBBox", Pdf.Array fontbbox);
("/ItalicAngle", italicangle);
("/Ascent", ascent);
("/Descent", descent);
("/CapHeight", capheight);
2014-09-08 18:55:14 +02:00
("/StemV", Pdf.Integer stemv)]
in
(* With -no-embed-font, we use the standard encoding, and just the
* minimal stuff. Without -no-embed-font, we switch to WinAnsiEncoding,
* and fill out everything except the font file instead *)
if embed then
Pdf.Dictionary
[("/Type", Pdf.Name "/Font");
("/Subtype", Pdf.Name "/Type1");
("/BaseFont", Pdf.Name ("/" ^ fontname));
("/Encoding", Pdf.Name "/WinAnsiEncoding");
("/FirstChar", Pdf.Integer 0);
("/LastChar", Pdf.Integer 255);
("/Widths", Pdf.Array (map (fun x -> Pdf.Integer x) widths));
("/FontDescriptor", fontdescriptor)]
else
Pdf.Dictionary
[("/Type", Pdf.Name "/Font");
("/Subtype", Pdf.Name "/Type1");
("/Encoding", Pdf.Name "/WinAnsiEncoding");
("/BaseFont", Pdf.Name ("/" ^ fontname))]
2014-09-08 18:55:14 +02:00
2016-11-13 15:02:09 +01:00
let extract_page_text only_fontsize pdf _ page =
let text_extractor = ref None in
let right_font_size = ref false in
fold_left ( ^ ) ""
(map
(function
| Pdfops.Op_Tf (fontname, fontsize) ->
right_font_size :=
begin match only_fontsize with
Some x -> x = fontsize
| _ -> false
end;
let fontdict =
match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
| None -> raise (Pdf.PDFError "Missing /Font in text extraction")
| Some d ->
match Pdf.lookup_direct pdf fontname d with
| None -> raise (Pdf.PDFError "Missing font in text extraction")
| Some d -> d
in
text_extractor := Some (Pdftext.text_extractor_of_font pdf fontdict);
""
| Pdfops.Op_Tj text when !text_extractor <> None ->
if not !right_font_size then
""
else
Pdftext.utf8_of_codepoints
(Pdftext.codepoints_of_text (unopt !text_extractor) text)
| Pdfops.Op_TJ (Pdf.Array objs) when !text_extractor <> None ->
if not !right_font_size then
""
else
fold_left ( ^ ) ""
(option_map
(function
| Pdf.String text ->
Some
(Pdftext.utf8_of_codepoints
(Pdftext.codepoints_of_text (unopt !text_extractor) text))
| _ -> None)
objs)
| _ -> "")
(Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content))
(* For each page, extract all the ops with text in them, and concatenate it all together *)
let extract_text extract_text_font_size pdf range =
fold_left (fun x y -> x ^ (if x <> "" && y <> "" then "\n" else "") ^ y) ""
(map_pages (extract_page_text extract_text_font_size pdf) pdf range)
2021-08-12 21:38:55 +02:00
2013-08-20 16:32:57 +02:00
let addtext
2015-07-17 17:34:47 +02:00
metrics lines linewidth outline fast colour fontname embed bates batespad fontsize font
2013-08-20 16:32:57 +02:00
underneath position hoffset voffset text pages orientation cropbox opacity
justification filename extract_text_font_size shift pdf
2013-08-20 16:32:57 +02:00
=
2021-06-10 17:09:59 +02:00
let time = Cpdfstrftime.current_time () in
2013-08-20 16:32:57 +02:00
let endpage = Pdfpage.endpage pdf in
2016-11-13 15:02:09 +01:00
let replace_pairs pdf filename bates batespad num page =
2017-09-24 14:14:29 +02:00
[
"%PageDiv2", (fun () -> string_of_int ((num + 1) / 2));
"%Page", (fun () -> string_of_int num);
2016-11-13 15:02:09 +01:00
"%Roman", (fun () -> roman_upper num);
"%roman", (fun () -> roman_lower num);
"%filename", (fun () -> filename);
"%Label", (fun () -> pagelabel pdf num);
"%EndPage", (fun () -> string_of_int endpage);
"%EndLabel", (fun () -> pagelabel pdf endpage);
"%ExtractedText", (fun () -> extract_page_text extract_text_font_size pdf num page);
2015-07-17 17:34:47 +02:00
"%Bates",
2016-11-13 15:02:09 +01:00
(fun () ->
(let numstring = string_of_int (bates + num - 1) in
2015-07-17 17:34:47 +02:00
match batespad with
None -> numstring
| Some w ->
if String.length numstring >= w
then numstring
2016-11-13 15:02:09 +01:00
else implode (many '0' (w - String.length numstring)) ^ numstring))]
2015-07-17 17:34:47 +02:00
in
let shifts = Cpdfcoord.parse_coordinates pdf shift in
2013-08-20 16:32:57 +02:00
let addtext_page num page =
let shift_x, shift_y = List.nth shifts (num - 1) in
2013-08-20 16:32:57 +02:00
let resources', unique_extgstatename =
if opacity < 1.0 then
let dict =
match Pdf.lookup_direct pdf "/ExtGState" page.Pdfpage.resources with
| Some d -> d
| None -> Pdf.Dictionary []
in
let unique_extgstatename = Pdf.unique_key "gs" dict in
let dict' =
Pdf.add_dict_entry dict unique_extgstatename
(Pdf.Dictionary [("/ca", Pdf.Real opacity); ("/CA", Pdf.Real opacity)])
in
Pdf.add_dict_entry page.Pdfpage.resources "/ExtGState" dict', Some unique_extgstatename
else
page.Pdfpage.resources, None
in
let fontdict =
match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
| None -> Pdf.Dictionary []
| Some d -> d
in
let unique_fontname = Pdf.unique_key "F" fontdict in
let ops =
2021-06-10 17:09:59 +02:00
let text = process_text time text (replace_pairs pdf filename bates batespad num page) in
2013-08-20 16:32:57 +02:00
let calc_textwidth text =
match font with
| Some f ->
2016-11-04 16:57:28 +01:00
let rawwidth =
Pdfstandard14.textwidth
false
(if embed then Pdftext.WinAnsiEncoding else Pdftext.StandardEncoding)
f
text
2016-11-02 17:50:37 +01:00
in
2016-11-04 16:57:28 +01:00
(float rawwidth *. fontsize) /. 1000.
2013-08-20 16:32:57 +02:00
| None ->
let font =
match Pdf.lookup_direct pdf "/Font" page.Pdfpage.resources with
| Some fontdict ->
begin match Pdf.lookup_direct pdf fontname fontdict with
| Some font -> font
2021-11-03 14:40:54 +01:00
| None ->
(* For each item in the fontdict, follow its value and find the basename. If it matches, return that font *)
let font = ref None in
iter
(fun (k, v) ->
match Pdf.lookup_direct pdf "/BaseFont" v with
| Some (Pdf.Name n) when n = fontname -> font := Some v
| _ -> ())
(match fontdict with Pdf.Dictionary d -> d | _ -> []);
match !font with Some f -> f | None -> failwith (Printf.sprintf "addtext: font %s not found" fontname)
2013-08-20 16:32:57 +02:00
end
2021-11-03 14:40:54 +01:00
| _ -> failwith "addtext: font not found for width"
2013-08-20 16:32:57 +02:00
in
let rawwidth = width_of_text (Pdftext.read_font pdf font) text in
(rawwidth *. fontsize) /. 1000.
in
let expanded_lines =
2016-11-13 15:02:09 +01:00
map
(function text ->
2021-06-10 17:09:59 +02:00
process_text time text (replace_pairs pdf filename bates batespad num page))
2016-11-13 15:02:09 +01:00
lines
2013-08-20 16:32:57 +02:00
in
let textwidth = calc_textwidth text
and allwidths = map calc_textwidth expanded_lines in
let longest_w = last (sort compare allwidths) in
let joffset = find_justification_offsets longest_w textwidth position justification in
let mediabox =
if cropbox then
match Pdf.lookup_direct pdf "/CropBox" page.Pdfpage.rest with
| Some pdfobject -> Pdf.parse_rectangle (Pdf.direct pdf pdfobject)
| None -> Pdf.parse_rectangle page.Pdfpage.mediabox
else
Pdf.parse_rectangle page.Pdfpage.mediabox
in
2021-08-12 21:38:55 +02:00
let x, y, rotate = Cpdfposition.calculate_position false textwidth mediabox orientation position in
2013-08-20 16:32:57 +02:00
let hoffset, voffset =
if position = Diagonal || position = ReverseDiagonal
then -. (cos ((pi /. 2.) -. rotate) *. voffset), sin ((pi /. 2.) -. rotate) *. voffset
else hoffset, voffset
in
match font with
| Some f ->
ops longest_w metrics (x +. shift_x) (y +. shift_y) rotate (hoffset +. joffset) voffset outline linewidth
2013-08-20 16:32:57 +02:00
unique_fontname unique_extgstatename colour fontsize text
| None ->
ops longest_w metrics (x +. shift_x) (y +. shift_y) rotate (hoffset +. joffset) voffset outline linewidth
2013-08-20 16:32:57 +02:00
fontname None colour fontsize text
in
let newresources =
match font with
| Some _ ->
2014-09-08 18:55:14 +02:00
let newfontdict =
Pdf.add_dict_entry fontdict unique_fontname (make_font embed fontname)
2013-08-20 16:32:57 +02:00
in
2014-09-08 18:55:14 +02:00
Pdf.add_dict_entry resources' "/Font" newfontdict
2013-08-20 16:32:57 +02:00
| None -> page.Pdfpage.resources
in
let page = {page with Pdfpage.resources = newresources} in
if underneath
then Pdfpage.prepend_operators pdf ops ~fast:fast page
else Pdfpage.postpend_operators pdf ops ~fast:fast page
in
if metrics then
(ignore (iter_pages (fun a b -> ignore (addtext_page a b)) pdf pages); pdf)
else
process_pages (ppstub addtext_page) pdf pages
2013-08-20 16:32:57 +02:00
(* Prev is a list of lists of characters *)
let split_at_newline t =
let rec split_at_newline_inner prev = function
| [] -> rev (map implode (map rev prev))
| '\\'::'\\'::'n'::t -> split_at_newline_inner (('n'::'\\'::'\\'::hd prev)::tl prev) t
2013-08-20 16:32:57 +02:00
| '\\'::'n'::t -> split_at_newline_inner ([]::prev) t
| h::t -> split_at_newline_inner ((h::hd prev)::tl prev) t
in
split_at_newline_inner [[]] (explode t)
let rec unescape_chars prev = function
| [] -> rev prev
2015-05-31 16:10:48 +02:00
| '\\'::('0'..'7' as a)::('0'..'7' as b)::('0'..'7' as c)::t ->
2013-08-20 16:32:57 +02:00
let chr = char_of_int (int_of_string ("0o" ^ implode [a;b;c])) in
unescape_chars (chr::prev) t
| '\\'::'\\'::t -> unescape_chars ('\\'::prev) t
2013-08-20 16:32:57 +02:00
| '\\'::c::t when c <> 'n' -> unescape_chars (c::prev) t
| h::t -> unescape_chars (h::prev) t
let unescape_string s =
implode (unescape_chars [] (explode s))
let
2021-11-02 14:56:45 +01:00
addtexts metrics linewidth outline fast fontname (font : Pdftext.standard_font option) embed bates batespad colour position linespacing
2015-01-20 16:50:36 +01:00
fontsize underneath text pages orientation cropbox opacity justification
2021-11-03 13:15:15 +01:00
midline topline filename extract_text_font_size shift ?(raw=false) pdf
2013-08-20 16:32:57 +02:00
=
if pages = [] then error "addtexts: empty page range" else
2013-08-20 16:32:57 +02:00
(*flprint "addtexts:\n";
iter (Printf.printf "%C ") (explode text);
flprint "\n";
Printf.printf "\nCpdf.addtexts: metrics = %b" metrics;
flprint "\n";*)
(*Printf.printf "linewidth = %f\n" linewidth;
Printf.printf "outline = %b\n" outline;
Printf.printf "fast = %b\n" fast;
Printf.printf "fontname = %s\n" fontname;
Printf.printf "winansi text = %s\n" text;
Printf.printf "position = %s\n" (string_of_position position);
Printf.printf "bates = %i\n" bates;
Printf.printf "linespacing = %f\n" linespacing;
Printf.printf "fontsize = %f\n" fontsize;
Printf.printf "underneath = %b\n" underneath;
Printf.printf "font = %s\n" begin match font with None -> "None" | Some x -> Pdftext.string_of_standard_font x end;
Printf.printf "justification = %s\n"
begin match justification with LeftJustify -> "left" | RightJustify -> "right" | CentreJustify -> "centre" end;
Printf.printf "midline = %b\n" midline;
begin match colour with r, g, b -> Printf.printf "%f, %f, %f\n" r g b end;
Printf.printf "opacity = %f\n" opacity;
flprint "\n";
Printf.printf "relative-to-cropbox = %b" cropbox;
flprint "\n";*)
ops_metrics := [];
2021-11-03 14:40:54 +01:00
let realfontname = ref fontname in
2021-11-02 14:56:45 +01:00
let fontpdfobj =
match font with
| Some f ->
2021-11-02 14:56:45 +01:00
make_font embed (Pdftext.string_of_standard_font f)
| None ->
2021-11-02 14:56:45 +01:00
let firstpage =
2021-11-02 17:10:30 +01:00
List.nth (Pdfpage.pages_of_pagetree pdf) (hd pages - 1)
in
2021-11-02 14:56:45 +01:00
match Pdf.lookup_direct pdf "/Font" firstpage.Pdfpage.resources with
| Some fontdict ->
begin match Pdf.lookup_direct pdf fontname fontdict with
| Some font -> font
2021-11-03 14:40:54 +01:00
| _ ->
(* For each item in the fontdict, follow its value and find the basename. If it matches, return that font *)
let font = ref None in
iter
(fun (k, v) ->
match Pdf.lookup_direct pdf "/BaseFont" v with
| Some (Pdf.Name n) when n = fontname ->
font := Some v; realfontname := k
| _ -> ())
(match fontdict with Pdf.Dictionary d -> d | _ -> []);
match !font with Some f -> f | None -> failwith (Printf.sprintf "addtext: font %s not found" fontname)
2021-11-02 14:56:45 +01:00
end
2021-11-03 14:40:54 +01:00
| _ -> failwith "addtext: font dictionary not present"
in
2021-12-02 21:04:14 +01:00
let text = if raw then text else charcodes_of_utf8 (Pdftext.read_font pdf fontpdfobj) text in
let lines = map unescape_string (split_at_newline text) in
let pdf = ref pdf in
2013-08-20 16:32:57 +02:00
let voffset =
2021-08-12 21:38:55 +02:00
let open Cpdfposition in
2013-08-20 16:32:57 +02:00
match position with
| Bottom _ | BottomLeft _ | BottomRight _ ->
ref (0. -. (linespacing *. fontsize *. (float (length lines) -. 1.)))
| Left _ | Right _ ->
(* Vertically align *)
ref (0. -. (linespacing *. ((fontsize *. (float (length lines) -. 1.)) /. 2.)))
| Diagonal | ReverseDiagonal ->
(* Change so that the whole paragraph sits on the centre... *)
ref (0. -. ((linespacing *. fontsize *. (float (length lines) -. 1.)) /. 2.))
| _ -> ref 0.
in
if midline then
begin match font with
| Some font ->
let baseline_adjustment =
(fontsize *. float (Pdfstandard14.baseline_adjustment font)) /. 1000.
in
ops_baseline_adjustment := baseline_adjustment;
voffset := !voffset +. baseline_adjustment
| _ ->
ops_baseline_adjustment := 0.
2015-01-20 16:50:36 +01:00
end
else
if topline then
begin match font with
| Some font ->
let baseline_adjustment =
(fontsize *. float (Pdfstandard14.baseline_adjustment font) *. 2.0) /. 1000.
in
ops_baseline_adjustment := baseline_adjustment;
voffset := !voffset +. baseline_adjustment
| _ ->
ops_baseline_adjustment := 0.
2013-08-20 16:32:57 +02:00
end
else
ops_baseline_adjustment := 0.;
iter
(fun line ->
let voff, hoff =
2021-08-12 21:38:55 +02:00
if orientation = Cpdfposition.Vertical then 0., -.(!voffset) else !voffset, 0.
2013-08-20 16:32:57 +02:00
in
pdf :=
2021-11-03 14:40:54 +01:00
addtext metrics lines linewidth outline fast colour !realfontname
2015-07-17 17:34:47 +02:00
embed bates batespad fontsize font underneath position hoff voff line
pages orientation cropbox opacity justification filename
extract_text_font_size shift
!pdf;
2013-08-20 16:32:57 +02:00
voffset := !voffset +. (linespacing *. fontsize))
lines;
ops_metrics := rev !ops_metrics;
!pdf
let removetext range pdf =
(* Could fail on nesting, or other marked content inside our marked content.*)
let rec remove_until_last_EMC level = function
| [] -> []
| Pdfops.Op_BMC "/CPDFSTAMP"::more ->
remove_until_last_EMC (level + 1) more
| Pdfops.Op_EMC::more ->
if level = 1
then more
else remove_until_last_EMC (level - 1) more
| _::more ->
remove_until_last_EMC level more
in
let rec remove_stamps prev = function
| [] -> rev prev
| Pdfops.Op_BMC "/CPDFSTAMP"::more ->
let rest = remove_until_last_EMC 1 more in
remove_stamps prev rest
| h::t -> remove_stamps (h::prev) t
in
let removetext_page _ page =
{page with
Pdfpage.content =
let ops = Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content in
[Pdfops.stream_of_ops (remove_stamps [] ops)]}
in
process_pages (ppstub removetext_page) pdf range
2013-08-20 16:32:57 +02:00
(* \section{Shift page data} *)
let make_mediabox (xmin, ymin, xmax, ymax) =
Pdf.Array
[Pdf.Real xmin; Pdf.Real ymin; Pdf.Real xmax; Pdf.Real ymax]
(* Change the media box and other known boxes by the function [f] which takes
xmin, xmax, ymin, ymax as input. *)
let change_boxes f pdf page =
let names = ["/TrimBox"; "/ArtBox"; "/CropBox"; "/BleedBox"]
in let getbox n =
Pdf.lookup_direct pdf n page.Pdfpage.rest
in
let boxes = combine names (map getbox names) in
let toreplace = lose (function (_, None) -> true | _ -> false) boxes in
let toreplace =
map
(function (name, Some value) -> (name, value) | _ -> assert false)
toreplace
in
let rest' =
fold_left
(fun e (k, v) ->
let v =
make_mediabox (f (Pdf.parse_rectangle v))
in
Pdf.replace_dict_entry e k v)
page.Pdfpage.rest
toreplace
in
{page with
Pdfpage.mediabox =
make_mediabox (f (Pdf.parse_rectangle page.Pdfpage.mediabox));
Pdfpage.rest = rest'}
2019-07-16 14:05:06 +02:00
let process_xobject f pdf resources i =
let xobj = Pdf.lookup_obj pdf i in
match Pdf.lookup_direct pdf "/Subtype" xobj with
| None -> raise (Pdf.PDFError "No /Subtype in Xobject")
| Some (Pdf.Name "/Form") ->
Pdf.getstream xobj;
begin match xobj with
| Pdf.Stream ({contents = Pdf.Dictionary dict, Pdf.Got bytes} as rf) ->
begin match f pdf resources [Pdf.Stream rf] with
| [Pdf.Stream {contents = (Pdf.Dictionary dict', data)}] ->
let dict' =
Pdf.remove_dict_entry
(Pdf.Dictionary (mergedict dict dict'))
"/Filter"
in
rf := (dict', data)
| _ -> assert false
end
| _ -> assert false (* getstream would have complained already *)
end
| Some _ -> ()
2019-07-15 19:46:17 +02:00
2019-07-16 14:05:06 +02:00
let process_xobjects pdf page f =
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
| Some (Pdf.Dictionary elts) ->
iter
(fun (k, v) ->
match v with
| Pdf.Indirect i -> process_xobject f pdf page.Pdfpage.resources i
| _ -> raise (Pdf.PDFError "process_xobject"))
elts
| _ -> ()
2013-08-20 16:32:57 +02:00
2019-07-16 14:05:06 +02:00
(* The content transformed by altering any use of [Op_cm]. But we must also
alter any /Matrix entries in pattern dictionaries *)
let change_pattern_matrices_resources pdf tr resources =
2013-08-20 16:32:57 +02:00
try
begin match Pdf.lookup_direct pdf "/Pattern" resources with
2019-07-16 14:05:06 +02:00
| Some (Pdf.Dictionary patterns) ->
let entries =
map
(fun (name, p) ->
2019-07-18 18:26:18 +02:00
(*Printf.printf "Changing matrices of pattern %s\n" name;*)
2019-07-16 14:05:06 +02:00
let old_pattern = Pdf.direct pdf p in
let new_pattern =
let existing_tr = Pdf.parse_matrix pdf "/Matrix" old_pattern in
let new_tr = Pdftransform.matrix_compose (Pdftransform.matrix_invert tr) existing_tr in
Pdf.add_dict_entry old_pattern "/Matrix" (Pdf.make_matrix new_tr)
in
name, Pdf.Indirect (Pdf.addobj pdf new_pattern))
patterns
in
Pdf.add_dict_entry resources "/Pattern" (Pdf.Dictionary entries)
| _ -> resources
2019-07-16 14:05:06 +02:00
end
with
Pdftransform.NonInvertable ->
2021-07-23 17:26:07 +02:00
Printf.eprintf "Warning: noninvertible matrix\n%!";
resources
let change_pattern_matrices_page pdf tr page =
let page =
{page with Pdfpage.resources = change_pattern_matrices_resources pdf tr page.Pdfpage.resources}
in
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
| Some (Pdf.Dictionary elts) ->
iter
(fun (k, v) ->
match v with
| Pdf.Indirect i ->
(* Check if it's a form XObject. If so, rewrite its resources and add back as same number. *)
begin match Pdf.lookup_direct pdf "/Subtype" v with
| Some (Pdf.Name "/Form") ->
2019-07-18 18:26:18 +02:00
(*Printf.printf "Processing form xobject %s for patterns\n" k; *)
let form_xobject = Pdf.lookup_obj pdf i in
begin match Pdf.lookup_direct pdf "/Resources" form_xobject with
| Some resources ->
let form_xobject' =
Pdf.add_dict_entry form_xobject "/Resources" (change_pattern_matrices_resources pdf tr resources)
in
Pdf.addobj_given_num pdf (i, form_xobject')
| _ -> ()
end
| _ -> ()
end;
| _ -> raise (Pdf.PDFError "change_pattern_matrices_page"))
elts;
2019-07-16 14:05:06 +02:00
page
| _ -> page
2013-08-20 16:32:57 +02:00
let transform_rect transform rect =
let minx, miny, maxx, maxy = Pdf.parse_rectangle rect in
let (x0, y0) = Pdftransform.transform_matrix transform (minx, miny) in
let (x1, y1) = Pdftransform.transform_matrix transform (maxx, maxy) in
let (x2, y2) = Pdftransform.transform_matrix transform (minx, maxy) in
let (x3, y3) = Pdftransform.transform_matrix transform (maxx, miny) in
let minx = fmin (fmin x0 x1) (fmin x2 x3) in
let miny = fmin (fmin y0 y1) (fmin y2 y3) in
let maxx = fmax (fmax x0 x1) (fmax x2 x3) in
let maxy = fmax (fmax y0 y1) (fmax y2 y3) in
Pdf.Array [Pdf.Real minx; Pdf.Real miny; Pdf.Real maxx; Pdf.Real maxy]
2021-12-15 09:53:34 +01:00
let transform_quadpoint_single transform = function
| [x1; y1; x2; y2; x3; y3; x4; y4] ->
let x1, y1, x2, y2, x3, y3, x4, y4 =
Pdf.getnum x1, Pdf.getnum y1,
Pdf.getnum x2, Pdf.getnum y2,
Pdf.getnum x3, Pdf.getnum y3,
Pdf.getnum x4, Pdf.getnum y4
in
let (x1, y1) = Pdftransform.transform_matrix transform (x1, y1) in
let (x2, y2) = Pdftransform.transform_matrix transform (x2, y2) in
let (x3, y3) = Pdftransform.transform_matrix transform (x3, y3) in
let (x4, y4) = Pdftransform.transform_matrix transform (x4, y4) in
map (fun x -> Pdf.Real x) [x1; y1; x2; y2; x3; y3; x4; y4]
| qp ->
Printf.eprintf "Malformed /QuadPoints format: must be a multiple of 8 entries\n";
qp
let transform_quadpoints transform = function
| Pdf.Array qps ->
Pdf.Array (flatten (map (transform_quadpoint_single transform) (splitinto 8 qps)))
| qp ->
Printf.eprintf "Unknown or malformed /QuadPoints format %s\n" (Pdfwrite.string_of_pdf qp);
qp
(* Apply transformations to any annotations in /Annots (i.e their /Rect and /QuadPoints entries) *)
2019-07-10 14:16:32 +02:00
let transform_annotations pdf transform rest =
match Pdf.lookup_direct pdf "/Annots" rest with
| Some (Pdf.Array annots) ->
(* Always indirect references, so alter in place *)
2020-03-04 19:50:32 +01:00
iter
2019-07-10 14:16:32 +02:00
(function
| Pdf.Indirect i ->
let annot = Pdf.lookup_obj pdf i in
2021-12-15 09:53:34 +01:00
let rect' =
match Pdf.lookup_direct pdf "/Rect" annot with
| Some rect -> transform_rect transform rect
| None -> raise (Pdf.PDFError "transform_annotations: no rect")
in
let quadpoints' =
match Pdf.lookup_direct pdf "/QuadPoints" annot with
| Some qp -> Some (transform_quadpoints transform qp)
| None -> None
2019-07-10 14:16:32 +02:00
in
2021-12-15 09:53:34 +01:00
let annot = Pdf.add_dict_entry annot "/Rect" rect' in
let annot =
match quadpoints' with
| Some qp -> Pdf.add_dict_entry annot "/QuadPoints" qp
| None -> annot
in
Pdf.addobj_given_num pdf (i, annot)
2021-07-23 17:26:07 +02:00
| _ -> Printf.eprintf "transform_annotations: not indirect\n%!")
2019-07-10 14:16:32 +02:00
annots
| _ -> ()
2016-07-21 18:02:11 +02:00
let shift_page ?(fast=false) dxdylist pdf pnum page =
let dx, dy = List.nth dxdylist (pnum - 1) in
let transform_op =
Pdfops.Op_cm (Pdftransform.matrix_of_op (Pdftransform.Translate (dx, dy)))
2013-08-20 16:32:57 +02:00
in
2019-07-16 14:05:06 +02:00
let page =
change_pattern_matrices_page pdf (Pdftransform.mktranslate ~-.dx ~-.dy) page
2016-07-21 18:02:11 +02:00
in
transform_annotations pdf (Pdftransform.mktranslate dx dy) page.Pdfpage.rest;
2021-05-31 19:10:39 +02:00
(Pdfpage.prepend_operators pdf [transform_op] ~fast page, pnum, Pdftransform.mktranslate dx dy)
2013-08-20 16:32:57 +02:00
2016-07-21 18:02:11 +02:00
let shift_pdf ?(fast=false) dxdylist pdf range =
2021-05-31 19:10:39 +02:00
process_pages (shift_page ~fast dxdylist pdf) pdf range
2013-08-20 16:32:57 +02:00
(* Change a page's media box so its minimum x and y are 0, making other
operations simpler to think about. Any shift that is done is reflected in
other boxes (clip etc.) *)
let rectify_boxes ?(fast=false) pdf page =
let minx, miny, _, _ =
Pdf.parse_rectangle page.Pdfpage.mediabox
in
let f (iminx, iminy, imaxx, imaxy) =
iminx -. minx, iminy -. miny, imaxx -. minx, imaxy -. miny
in
let page = change_boxes f pdf page in
if minx <> 0. || miny <> 0.
2021-05-31 19:10:39 +02:00
then
begin let p, _, _ = shift_page ~fast [(-.minx),(-.miny)] pdf 1 page in p end
2013-08-20 16:32:57 +02:00
else page
(* \section{Flip pages} *)
2021-05-31 19:10:39 +02:00
let flip_page ?(fast=false) transform_op pdf pnum page =
2013-08-20 16:32:57 +02:00
let minx, miny, maxx, maxy =
Pdf.parse_rectangle page.Pdfpage.mediabox
in
let tr = transform_op minx miny maxx maxy in
2019-07-16 14:05:06 +02:00
let page = change_pattern_matrices_page pdf tr page in
2019-07-10 14:16:32 +02:00
transform_annotations pdf tr page.Pdfpage.rest;
2021-05-31 19:10:39 +02:00
(Pdfpage.prepend_operators pdf [Pdfops.Op_cm tr] ~fast page, pnum, tr)
2013-08-20 16:32:57 +02:00
let vflip_pdf ?(fast=false) pdf range =
let transform_op _ miny _ maxy =
Pdftransform.matrix_of_op
(Pdftransform.Scale ((0., ((miny +. maxy) /. 2.)), 1., -.1.))
in
2021-05-31 19:10:39 +02:00
process_pages (flip_page ~fast transform_op pdf) pdf range
2013-08-20 16:32:57 +02:00
let hflip_pdf ?(fast=false) pdf range =
let transform_op minx _ maxx _ =
Pdftransform.matrix_of_op
(Pdftransform.Scale (((minx +. maxx) /. 2., 0.), -.1., 1.))
in
2021-05-31 19:10:39 +02:00
process_pages (flip_page ~fast transform_op pdf) pdf range
2013-08-20 16:32:57 +02:00
2015-02-17 16:48:00 +01:00
let stamp_shift_of_position topline midline sw sh w h p =
let half x = x /. 2.
and dy =
if midline then sh /. 2.
else if topline then sh
else 0.
in
2021-08-12 21:38:55 +02:00
let open Cpdfposition in
match p with
| PosCentre (ox, oy) -> ox -. half sw, oy -. dy
| PosLeft (ox, oy) -> ox, oy -. dy
| PosRight (ox, oy) -> ox -. sw, oy -. dy
| Top o -> half w -. half sw, h -. o -. sh -. dy
| TopLeft o -> o, h -. sh -. o -. dy
| TopRight o -> w -. sw -. o, h -. sh -. o -. dy
| Left o -> o, half h -. half sh -. dy
| BottomLeft o -> o, o -. dy
| Bottom o -> half w -. half sw, o -. dy
| BottomRight o -> w -. sw -. o, o -. dy
| Right o -> w -. sw -. o, half h -. half sh -. dy
| Diagonal | ReverseDiagonal | Centre ->
half w -. half sw, half h -. half sh -. dy
2015-02-17 16:48:00 +01:00
(* Combine Pdfpage.rest items for two PDFs. For now, we combine /Annots, and
* copy everything else from adict. What else should we combine? *)
let combine_page_items pdf adict bdict =
let getannots dict =
begin match dict with
Pdf.Dictionary d ->
begin match lookup "/Annots" d with
Some (Pdf.Array items) -> items
| _ -> []
end
| _ -> []
end
in
let a_annots = getannots adict in
let b_annots = getannots bdict in
match a_annots @ b_annots with
[] -> adict
| annots -> Pdf.add_dict_entry adict "/Annots" (Pdf.Array annots)
let do_stamp relative_to_cropbox fast position topline midline scale_to_fit isover pdf o u opdf =
(* Scale page stamp o to fit page u *)
let sxmin, symin, sxmax, symax =
Pdf.parse_rectangle
(match Pdf.lookup_direct pdf "/CropBox" o.Pdfpage.rest with | Some r -> r | None -> o.Pdfpage.mediabox)
in let txmin, tymin, txmax, tymax =
Pdf.parse_rectangle
(match Pdf.lookup_direct pdf "/CropBox" u.Pdfpage.rest with | Some r -> r | None -> u.Pdfpage.mediabox)
in
let o =
if scale_to_fit then
let xmag = (txmax -. txmin) /. (sxmax -. sxmin) in
let ymag = (tymax -. tymin) /. (symax -. symin) in
let scale =
if xmag < 0.999 && ymag < 0.999 then
if xmag > ymag then xmag else ymag
else if xmag >= 1.001 && ymag >= 1.001 then
if xmag > ymag then ymag else xmag
else if xmag >= 1.001 then ymag
else xmag
in
let dx = txmin +. ((txmax -. txmin) -. (sxmax -. sxmin) *. scale) /. 2. in
let dy = tymin +. ((tymax -. tymin) -. (symax -. symin) *. scale) /. 2. in
let matrix =
(Pdftransform.matrix_of_transform
([Pdftransform.Translate (dx, dy)] @
(if relative_to_cropbox then [Pdftransform.Translate (txmin, tymin)] else []) @
[Pdftransform.Scale ((sxmin, symin), scale, scale)]))
in
transform_annotations pdf matrix o.Pdfpage.rest;
let r = Pdfpage.prepend_operators pdf [Pdfops.Op_cm matrix] ~fast o in
2019-07-16 14:05:06 +02:00
change_pattern_matrices_page pdf matrix r
else
2016-04-03 20:46:54 +02:00
let sw = sxmax -. sxmin and sh = symax -. symin
and w = txmax -. txmin and h = tymax -. tymin in
2015-02-17 16:48:00 +01:00
let dx, dy = stamp_shift_of_position topline midline sw sh w h position in
let matrix =
(Pdftransform.matrix_of_transform
((if relative_to_cropbox then [Pdftransform.Translate (txmin, tymin)] else []) @
[Pdftransform.Translate (dx, dy)]))
in
transform_annotations pdf matrix o.Pdfpage.rest;
let r = Pdfpage.prepend_operators pdf [Pdfops.Op_cm matrix] ~fast o in
2019-07-16 14:05:06 +02:00
change_pattern_matrices_page pdf matrix r
in
{u with
Pdfpage.content =
(if isover then ( @ ) else ( @@ ))
2016-04-03 20:46:54 +02:00
(protect fast pdf u.Pdfpage.resources u.Pdfpage.content)
(protect fast pdf o.Pdfpage.resources o.Pdfpage.content);
Pdfpage.rest =
combine_page_items pdf u.Pdfpage.rest o.Pdfpage.rest;
Pdfpage.resources =
combine_pdf_resources pdf u.Pdfpage.resources o.Pdfpage.resources}
(* Alter bookmark destinations given a hash table of (old page reference
* number, new page reference number) pairings *)
let change_destination t = function
Pdfdest.XYZ (Pdfdest.PageObject p, a, b, c) ->
Pdfdest.XYZ (Pdfdest.PageObject (Hashtbl.find t p), a, b, c)
| Pdfdest.Fit (Pdfdest.PageObject p) ->
Pdfdest.Fit (Pdfdest.PageObject (Hashtbl.find t p))
| Pdfdest.FitH (Pdfdest.PageObject p, x) ->
Pdfdest.FitH (Pdfdest.PageObject (Hashtbl.find t p), x)
| Pdfdest.FitV (Pdfdest.PageObject p, x) ->
Pdfdest.FitV (Pdfdest.PageObject (Hashtbl.find t p), x)
| Pdfdest.FitR (Pdfdest.PageObject p, a, b, c, d) ->
Pdfdest.FitR (Pdfdest.PageObject (Hashtbl.find t p), a, b, c, d)
| Pdfdest.FitB (Pdfdest.PageObject p) ->
Pdfdest.Fit (Pdfdest.PageObject (Hashtbl.find t p))
| Pdfdest.FitBH (Pdfdest.PageObject p, x) ->
Pdfdest.FitBH (Pdfdest.PageObject (Hashtbl.find t p), x)
| Pdfdest.FitBV (Pdfdest.PageObject p, x) ->
Pdfdest.FitBV (Pdfdest.PageObject (Hashtbl.find t p), x)
| x -> x
let change_bookmark t m =
{m with Pdfmarks.target =
try change_destination t m.Pdfmarks.target with Not_found -> m.Pdfmarks.target}
let stamp relative_to_cropbox position topline midline fast scale_to_fit isover range over pdf =
2016-03-30 18:04:28 +02:00
let prefix = Pdfpage.shortest_unused_prefix pdf in
Pdfpage.add_prefix over prefix;
let marks = Pdfmarks.read_bookmarks pdf in
let marks_refnumbers = Pdf.page_reference_numbers pdf in
let pdf = Pdfmarks.remove_bookmarks pdf in
let over = Pdfmarks.remove_bookmarks over in
let pageseqs = ilist 1 (Pdfpage.endpage pdf) in
let over_firstpage_pdf =
match Pdfpage.pages_of_pagetree over with
| [] -> error "empty PDF"
| h::_ -> Pdfpage.change_pages ~changes:[(1, 1)] true over [h]
in
let merged =
Pdfmerge.merge_pdfs
false false ["a"; "b"] [pdf; over_firstpage_pdf] [pageseqs; [1]]
in
let merged =
{merged with Pdf.saved_encryption = pdf.Pdf.saved_encryption}
in
2021-12-19 13:55:06 +01:00
let merged = Cpdfmetadata.copy_id true pdf merged in
2016-03-30 18:04:28 +02:00
let merged_pages = Pdfpage.pages_of_pagetree merged in
let under_pages, over_page =
2016-03-30 18:04:28 +02:00
all_but_last merged_pages, last merged_pages
in
let new_pages =
map2
(fun pageseq under_page ->
2016-03-30 18:04:28 +02:00
do_stamp relative_to_cropbox fast position topline midline scale_to_fit isover merged
(if mem pageseq range then over_page else
Pdfpage.blankpage Pdfpaper.a4)
under_page over)
pageseqs
under_pages
in
let changed =
let changes =
2020-03-04 19:50:32 +01:00
map (fun x -> (x, x)) (ilist 1 (length new_pages))
in
Pdfpage.change_pages ~changes true merged new_pages
in
let new_refnumbers = Pdf.page_reference_numbers changed in
2020-03-04 19:50:32 +01:00
let changetable = hashtable_of_dictionary (combine marks_refnumbers new_refnumbers) in
let new_marks = map (change_bookmark changetable) marks in
Pdfmarks.add_bookmarks new_marks changed
let add_xobject_to_page xobjname xobjnum page pdf =
let resources' =
let xobjects =
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
| Some xobjects -> xobjects
| _ -> Pdf.Dictionary []
in
let new_xobjects =
Pdf.add_dict_entry xobjects xobjname (Pdf.Indirect xobjnum)
in
Pdf.add_dict_entry page.Pdfpage.resources "/XObject" new_xobjects
in
{page with Pdfpage.resources = resources'}
2020-03-19 17:23:16 +01:00
let add_page_as_xobject pdf range page name =
let xobject_data =
match Pdfops.stream_of_ops (Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content) with
Pdf.Stream {contents = (_, Got b)} -> b
| _ -> assert false
in
2020-03-19 17:23:16 +01:00
let xobject_dict =
["/Type", Pdf.Name "/XObject";
2020-03-25 14:46:54 +01:00
"/Subtype", Pdf.Name "/Form";
"/BBox", page.Pdfpage.mediabox;
"/Resources", page.Pdfpage.resources;
"/Length", Pdf.Integer (bytes_size xobject_data)]
2020-03-19 17:23:16 +01:00
in
let xobject =
Pdf.Stream {contents = (Pdf.Dictionary xobject_dict, Pdf.Got xobject_data)}
in
let xobject_objnum = Pdf.addobj pdf xobject in
let pages = Pdfpage.pages_of_pagetree pdf in
let new_pages =
List.map2
(fun page pnum ->
if mem pnum range
then add_xobject_to_page name xobject_objnum page pdf
else page)
pages
(indx pages)
in
Pdfpage.change_pages true pdf new_pages
2020-03-19 18:08:15 +01:00
(* n.b the use of change_pages here ensures no inheritable resources in the
* stamp, therefore creation of xobject from page is as simple as expected. *)
2020-03-19 17:23:16 +01:00
let stamp_as_xobject pdf range over =
let prefix = Pdfpage.shortest_unused_prefix pdf in
Pdfpage.add_prefix over prefix;
let marks = Pdfmarks.read_bookmarks pdf in
let marks_refnumbers = Pdf.page_reference_numbers pdf in
let pdf = Pdfmarks.remove_bookmarks pdf in
let over = Pdfmarks.remove_bookmarks over in
let pageseqs = ilist 1 (Pdfpage.endpage pdf) in
let over_firstpage_pdf =
match Pdfpage.pages_of_pagetree over with
| [] -> error "empty PDF"
| h::_ -> Pdfpage.change_pages ~changes:[(1, 1)] true over [h]
in
let merged =
Pdfmerge.merge_pdfs
false false ["a"; "b"] [pdf; over_firstpage_pdf] [pageseqs; [1]]
in
let merged =
{merged with Pdf.saved_encryption = pdf.Pdf.saved_encryption}
in
2021-12-19 13:55:06 +01:00
let merged = Cpdfmetadata.copy_id true pdf merged in
let merged_pages = Pdfpage.pages_of_pagetree merged in
let under_pages, over_page =
all_but_last merged_pages, last merged_pages
in
let new_pages = under_pages in
let changed =
let changes =
map (fun x -> (x, x)) (ilist 1 (length new_pages))
in
Pdfpage.change_pages ~changes true merged new_pages
in
let new_refnumbers = Pdf.page_reference_numbers changed in
let changetable = hashtable_of_dictionary (combine marks_refnumbers new_refnumbers) in
let new_marks = map (change_bookmark changetable) marks in
let pdf = Pdfmarks.add_bookmarks new_marks changed in
let name = "/" ^ Pdfpage.shortest_unused_prefix pdf ^ "CPDFXObj" in
2020-03-19 17:23:16 +01:00
(add_page_as_xobject pdf range over_page name, name)
(* Combine pages from two PDFs. For now, assume equal length. *)
(* If [over] has more pages than [under], chop the excess. If the converse, pad
[over] to the same length *)
let equalize_pages under over =
2016-04-03 13:48:44 +02:00
let length_under = Pdfpage.endpage under in
let length_over = Pdfpage.endpage over in
if length_over > length_under then
let changes =
2020-03-04 19:50:32 +01:00
map (fun x -> (x, x)) (ilist 1 length_under)
in
(under,
(Pdfpage.change_pages
~changes true over (take (Pdfpage.pages_of_pagetree over) length_under)))
else if length_under > length_over then
let changes =
2020-03-04 19:50:32 +01:00
map (fun x -> (x, x)) (ilist 1 length_over)
in
(under,
Pdfpage.change_pages
~changes true over
(Pdfpage.pages_of_pagetree over @
(many (Pdfpage.blankpage Pdfpaper.a4) (length_under - length_over))))
else
under, over
2016-11-25 18:38:19 +01:00
let combine_pages fast under over scaletofit swap equalize =
let debug_combine_pages = false in
2016-11-17 19:42:40 +01:00
let debug_pdf pdf n =
if debug_combine_pages then
2016-11-25 18:38:19 +01:00
begin Pdf.remove_unreferenced pdf; Pdfwrite.pdf_to_file pdf n end
2016-11-17 19:42:40 +01:00
in
2016-04-03 13:48:44 +02:00
Pdfpage.add_prefix over (Pdfpage.shortest_unused_prefix under);
2016-11-25 18:38:19 +01:00
let marks_under, marks_over = Pdfmarks.read_bookmarks under, Pdfmarks.read_bookmarks over in
2016-04-03 13:48:44 +02:00
let under, over = if equalize then equalize_pages under over else under, over in
2016-11-25 18:38:19 +01:00
let under_length, over_length = Pdfpage.endpage under, Pdfpage.endpage over in
2016-04-03 13:48:44 +02:00
if under_length <> over_length then
raise (Pdf.PDFError "combine_pages: not of equal length")
else
let pageseqs_under = ilist 1 (Pdfpage.endpage under) in
let pageseqs_over = ilist 1 (Pdfpage.endpage over) in
let merged =
Pdfmerge.merge_pdfs
false false ["a"; "b"] [under; over] [pageseqs_under; pageseqs_over]
in
2016-11-17 19:42:40 +01:00
debug_pdf merged "merged.pdf";
2016-04-03 13:48:44 +02:00
let under_pages, over_pages =
cleave (Pdfpage.pages_of_pagetree merged) under_length
in
let new_pages =
map2
(fun o u ->
do_stamp
2016-11-25 18:38:19 +01:00
false fast (BottomLeft 0.) false false scaletofit (not swap) merged o u over)
2016-04-03 13:48:44 +02:00
over_pages under_pages
in
2016-11-25 18:38:19 +01:00
(* Build the changes. 123456 -> 123123 *)
let changes =
2020-03-04 19:50:32 +01:00
let len = length new_pages in
2016-11-25 18:38:19 +01:00
combine (ilist 1 (len * 2)) (let x = ilist 1 len in x @ x)
in
let changed = Pdfpage.change_pages ~changes true merged new_pages in
let r = Pdfmarks.add_bookmarks (marks_under @ marks_over) changed in
debug_pdf r "final.pdf";
r
2013-08-20 16:32:57 +02:00
(* \section{Set media box} *)
2016-07-21 18:02:11 +02:00
let set_mediabox xywhlist pdf range =
let crop_page pnum page =
let x, y, w, h = List.nth xywhlist (pnum - 1) in
2013-08-20 16:32:57 +02:00
{page with
Pdfpage.mediabox =
(Pdf.Array
[Pdf.Real x; Pdf.Real y;
Pdf.Real (x +. w); Pdf.Real (y +. h)])}
in
process_pages (ppstub crop_page) pdf range
2013-08-20 16:32:57 +02:00
2019-08-08 17:57:25 +02:00
(* Just used by cpdflib for historical reasons *)
let setBox box minx maxx miny maxy pdf range =
2013-08-20 16:32:57 +02:00
let set_box_page _ page =
{page with
Pdfpage.rest =
Pdf.add_dict_entry
page.Pdfpage.rest box
(Pdf.Array [Pdf.Real minx; Pdf.Real miny; Pdf.Real maxx; Pdf.Real maxy])}
in
process_pages (ppstub set_box_page) pdf range
2013-08-20 16:32:57 +02:00
(* \section{Cropping} *)
let crop_pdf ?(box="/CropBox") xywhlist pdf range =
2016-07-21 18:02:11 +02:00
let crop_page pagenum page =
2013-08-20 16:32:57 +02:00
{page with
Pdfpage.rest =
(Pdf.add_dict_entry
page.Pdfpage.rest
box
2016-07-21 18:02:11 +02:00
(let x, y, w, h = List.nth xywhlist (pagenum - 1) in
(Pdf.Array
[Pdf.Real x; Pdf.Real y;
Pdf.Real (x +. w); Pdf.Real (y +. h)])))}
2013-08-20 16:32:57 +02:00
in
process_pages (ppstub crop_page) pdf range
2013-08-20 16:32:57 +02:00
2017-05-19 20:10:49 +02:00
(* Clip a page to one of its boxes, or the media box if that box is not
* present. This is a hard clip, done by using a clipping rectangle, so that
* the page may then be used as a stamp without extraneous material reapearing.
* *)
let hard_box pdf range boxname mediabox_if_missing fast =
process_pages
(ppstub (fun pagenum page ->
2017-05-19 20:10:49 +02:00
let minx, miny, maxx, maxy =
if boxname = "/MediaBox" then
Pdf.parse_rectangle page.Pdfpage.mediabox
else
match Pdf.lookup_direct pdf boxname page.Pdfpage.rest with
| Some a -> Pdf.parse_rectangle a
| _ ->
if mediabox_if_missing
then Pdf.parse_rectangle page.Pdfpage.mediabox
2019-08-01 15:10:02 +02:00
else error (Printf.sprintf "hard_box: box %s not found" boxname)
2017-05-19 20:10:49 +02:00
in
2017-05-20 14:23:32 +02:00
let ops = [Pdfops.Op_re (minx, miny, maxx -. minx, maxy -. miny); Pdfops.Op_W; Pdfops.Op_n] in
2021-10-26 17:18:09 +02:00
Pdfpage.prepend_operators pdf ops ~fast page))
2017-05-19 20:10:49 +02:00
pdf
range
2013-08-20 16:32:57 +02:00
let remove_cropping_pdf pdf range =
let remove_cropping_page _ page =
{page with
Pdfpage.rest =
(Pdf.remove_dict_entry page.Pdfpage.rest "/CropBox")}
in
process_pages (ppstub remove_cropping_page) pdf range
2013-08-20 16:32:57 +02:00
let remove_trim_pdf pdf range =
let remove_trim_page _ page =
{page with
Pdfpage.rest =
(Pdf.remove_dict_entry page.Pdfpage.rest "/TrimBox")}
in
process_pages (ppstub remove_trim_page) pdf range
2013-08-20 16:32:57 +02:00
let remove_art_pdf pdf range =
let remove_art_page _ page =
{page with
Pdfpage.rest =
(Pdf.remove_dict_entry page.Pdfpage.rest "/ArtBox")}
in
process_pages (ppstub remove_art_page) pdf range
2013-08-20 16:32:57 +02:00
let remove_bleed_pdf pdf range =
let remove_bleed_page _ page =
{page with
Pdfpage.rest =
(Pdf.remove_dict_entry page.Pdfpage.rest "/BleedBox")}
in
process_pages (ppstub remove_bleed_page) pdf range
2013-08-20 16:32:57 +02:00
(* \section{Rotating pages} *)
let rotate_pdf r pdf range =
let rotate_page _ page =
{page with Pdfpage.rotate =
Pdfpage.rotation_of_int r}
in
process_pages (ppstub rotate_page) pdf range
2013-08-20 16:32:57 +02:00
let rotate_pdf_by r pdf range =
let rotate_page_by _ page =
{page with Pdfpage.rotate =
Pdfpage.rotation_of_int ((Pdfpage.int_of_rotation page.Pdfpage.rotate + r) mod 360)}
in
process_pages (ppstub rotate_page_by) pdf range
2013-08-20 16:32:57 +02:00
2021-05-31 19:10:39 +02:00
let rotate_page_contents ~fast rotpoint r pdf pnum page =
2013-08-20 16:32:57 +02:00
let rotation_point =
match rotpoint with
| None ->
let minx, miny, maxx, maxy = Pdf.parse_rectangle page.Pdfpage.mediabox in
(minx +. maxx) /. 2., (miny +. maxy) /. 2.
| Some point -> point
in
let tr =
Pdftransform.matrix_of_op
(Pdftransform.Rotate (rotation_point, -.(rad_of_deg r)))
in let tr2 =
Pdftransform.matrix_of_op
(Pdftransform.Rotate (rotation_point, rad_of_deg r))
in
let transform_op = Pdfops.Op_cm tr in
2019-07-16 14:05:06 +02:00
let page = change_pattern_matrices_page pdf tr2 page in
transform_annotations pdf tr page.Pdfpage.rest;
2021-05-31 19:10:39 +02:00
(Pdfpage.prepend_operators pdf [transform_op] ~fast page, pnum, tr)
2013-08-20 16:32:57 +02:00
let rotate_contents ?(fast=false) r pdf range =
2021-05-31 19:10:39 +02:00
process_pages (rotate_page_contents ~fast None r pdf) pdf range
2013-08-20 16:32:57 +02:00
(* Return the pages from the pdf in the range, unordered. *)
let select_pages range pdf =
let pages = Pdfpage.pages_of_pagetree pdf in
option_map (function n -> try Some (select n pages) with _ -> None) range
(* Upright functionality *)
(* If all pages are already upright, and the mediabox is (0,0)-based, do nothing
to save time. *)
2013-08-20 16:32:57 +02:00
let allupright range pdf =
let page_is_upright page =
page.Pdfpage.rotate = Pdfpage.Rotate0 &&
(let (minx, miny, _, _) = Pdf.parse_rectangle page.Pdfpage.mediabox in
minx < 0.001 && miny < 0.001 && minx > ~-.0.001 && miny > ~-.0.001)
2013-08-20 16:32:57 +02:00
in
not (mem false (map page_is_upright (select_pages range pdf)))
let upright_transform page =
let rotate =
Pdfpage.int_of_rotation page.Pdfpage.rotate
and cx, cy =
let minx, miny, maxx, maxy = Pdf.parse_rectangle page.Pdfpage.mediabox in
(minx +. maxx) /. 2., (miny +. maxy) /. 2.
in
Pdftransform.mkrotate (cx, cy) (rad_of_deg (~-.(float rotate)))
let transform_boxes tr pdf page =
let f (minx, miny, maxx, maxy) =
let minx, miny = Pdftransform.transform_matrix tr (minx, miny)
and maxx, maxy = Pdftransform.transform_matrix tr (maxx, maxy) in
(minx, miny, maxx, maxy)
in
change_boxes f pdf page
let transform_contents ?(fast=false) tr pdf page =
let transform_op = Pdfops.Op_cm tr in
2019-07-16 14:05:06 +02:00
let page = change_pattern_matrices_page pdf (Pdftransform.matrix_invert tr) page in
2019-07-10 14:16:32 +02:00
transform_annotations pdf tr page.Pdfpage.rest;
2019-07-16 14:05:06 +02:00
Pdfpage.prepend_operators pdf [transform_op] ~fast page
2013-08-20 16:32:57 +02:00
let upright ?(fast=false) range pdf =
if allupright range pdf then pdf else
2021-05-31 19:10:39 +02:00
let upright_page _ pnum page =
2013-08-20 16:32:57 +02:00
let tr = upright_transform page in
let page = transform_boxes tr pdf page in
let page = transform_contents ~fast tr pdf page in
2021-05-31 19:10:39 +02:00
(rectify_boxes ~fast pdf {page with Pdfpage.rotate = Pdfpage.Rotate0}, pnum, tr)
2013-08-20 16:32:57 +02:00
in
2021-05-31 19:10:39 +02:00
process_pages (upright_page pdf) pdf range
2013-08-20 16:32:57 +02:00
(* \section{Scale page data} *)
2016-07-21 18:02:11 +02:00
let scale_pdf ?(fast=false) sxsylist pdf range =
let scale_page pnum page =
let sx, sy = List.nth sxsylist (pnum - 1) in
let f (xmin, ymin, xmax, ymax) =
xmin *. sx, ymin *. sy, xmax *. sx, ymax *. sy
in
let page = change_boxes f pdf page
and matrix = Pdftransform.matrix_of_op (Pdftransform.Scale ((0., 0.), sx, sy)) in
let transform_op =
Pdfops.Op_cm matrix
2019-07-16 14:05:06 +02:00
and page =
change_pattern_matrices_page pdf (Pdftransform.matrix_invert matrix) page
2016-07-21 18:02:11 +02:00
in
2019-07-10 14:16:32 +02:00
transform_annotations pdf matrix page.Pdfpage.rest;
2021-05-28 19:00:27 +02:00
(Pdfpage.prepend_operators pdf ~fast [transform_op] page, pnum, matrix)
2016-07-21 18:02:11 +02:00
in
2021-05-28 19:00:27 +02:00
process_pages scale_page pdf range
2013-08-20 16:32:57 +02:00
(* Scale to fit page of size x * y *)
2019-03-19 15:03:28 +01:00
let scale_to_fit_pdf ?(fast=false) position input_scale xylist op pdf range =
2016-07-21 18:02:11 +02:00
let scale_page_to_fit pnum page =
let x, y = List.nth xylist (pnum - 1) in
2013-08-20 16:32:57 +02:00
let matrix =
let (minx, miny, maxx, maxy) =
(* Use cropbox if available *)
Pdf.parse_rectangle
(match Pdf.lookup_direct pdf "/CropBox" page.Pdfpage.rest with
| Some r -> r
| None -> page.Pdfpage.mediabox)
in
if maxx <= 0. || maxy <= 0. then failwith "Zero-sized pages are invalid" else
let fx = x /. maxx in let fy = y /. maxy in
let scale = fmin fx fy *. input_scale in
2019-03-19 15:03:28 +01:00
let trans_x =
match position with
2021-08-12 21:38:55 +02:00
Cpdfposition.Left _ -> 0.
| Cpdfposition.Right _ -> (x -. (maxx *. scale))
2019-03-19 15:03:28 +01:00
| _ -> (x -. (maxx *. scale)) /. 2.
and trans_y =
match position with
2021-08-12 21:38:55 +02:00
| Cpdfposition.Top _ -> (y -. (maxy *. scale))
| Cpdfposition.Bottom _ -> 0.
2019-03-19 15:03:28 +01:00
| _ -> (y -. (maxy *. scale)) /. 2.
in
2013-08-20 16:32:57 +02:00
(Pdftransform.matrix_of_transform
[Pdftransform.Translate (trans_x, trans_y);
Pdftransform.Scale ((0., 0.), scale, scale)])
in
let page =
change_boxes
2016-11-13 19:01:01 +01:00
(function (minx, miny, maxx, maxy) -> 0., 0., x, y)
2013-08-20 16:32:57 +02:00
pdf page
in
2019-07-10 14:16:32 +02:00
transform_annotations pdf matrix page.Pdfpage.rest;
2021-05-31 19:10:39 +02:00
(Pdfpage.prepend_operators pdf [Pdfops.Op_cm matrix] ~fast
(change_pattern_matrices_page pdf (Pdftransform.matrix_invert matrix) page), pnum, matrix)
2013-08-20 16:32:57 +02:00
in
2021-05-31 19:10:39 +02:00
process_pages scale_page_to_fit pdf range
2013-08-20 16:32:57 +02:00
(* Scale contents *)
2021-05-31 19:10:39 +02:00
let scale_page_contents ?(fast=false) scale position pdf pnum page =
2013-08-20 16:32:57 +02:00
let (minx, miny, maxx, maxy) as box =
(* Use cropbox if available *)
Pdf.parse_rectangle
(match Pdf.lookup_direct pdf "/CropBox" page.Pdfpage.rest with
| Some r -> r
| None -> page.Pdfpage.mediabox)
in
2021-08-12 21:38:55 +02:00
let sx, sy, _ = Cpdfposition.calculate_position true 0. box Horizontal position in
2013-08-20 16:32:57 +02:00
let tx, ty =
2021-08-12 21:38:55 +02:00
let open Cpdfposition in
2013-08-20 16:32:57 +02:00
match position with
| Top t -> 0., -.t
| TopLeft t -> t, -.t
2021-11-22 21:20:48 +01:00
| TopRight t -> -.t, -.t
2013-08-20 16:32:57 +02:00
| Left t -> t, 0.
| BottomLeft t -> t, t
| Bottom t -> 0., t
2021-11-22 21:20:48 +01:00
| BottomRight t -> -.t, t
2013-08-20 16:32:57 +02:00
| Right t -> -.t, 0.
| _ -> 0., 0. (* centre it... FIXME: We will add a center position, eventually, for text and this... *)
in
let transform =
Pdftransform.matrix_of_transform
[Pdftransform.Translate (tx, ty);
Pdftransform.Scale ((sx, sy), scale, scale)]
in
let transform_op = Pdfops.Op_cm transform in
2019-07-16 14:05:06 +02:00
let page = change_pattern_matrices_page pdf transform page in
2019-07-10 12:18:01 +02:00
transform_annotations pdf transform page.Pdfpage.rest;
2021-05-31 19:10:39 +02:00
(Pdfpage.prepend_operators pdf [transform_op] ~fast page, pnum, transform)
2013-08-20 16:32:57 +02:00
let scale_contents ?(fast=false) position scale pdf range =
2021-05-31 19:10:39 +02:00
process_pages (scale_page_contents ~fast scale position pdf) pdf range
2013-08-20 16:32:57 +02:00
(* \section{List annotations} *)
let get_annotation_string encoding pdf annot =
match Pdf.lookup_direct pdf "/Contents" annot with
2021-12-19 13:55:06 +01:00
| Some (Pdf.String s) -> Cpdfmetadata.encode_output encoding s
2013-08-20 16:32:57 +02:00
| _ -> ""
let print_annotation encoding pdf num s =
2013-08-20 16:32:57 +02:00
let s = get_annotation_string encoding pdf s in
match s with
| "" -> ()
| s ->
flprint (Printf.sprintf "Page %d: " num);
flprint s;
flprint "\n"
2013-08-20 16:32:57 +02:00
let list_page_annotations encoding pdf num page =
2013-08-20 16:32:57 +02:00
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
| Some (Pdf.Array annots) ->
iter (print_annotation encoding pdf num) (map (Pdf.direct pdf) annots)
2013-08-20 16:32:57 +02:00
| _ -> ()
2021-10-28 18:06:46 +02:00
let annotations_json_page pdf page pagenum =
2021-10-28 16:34:03 +02:00
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
| Some (Pdf.Array annots) ->
2021-10-28 18:06:46 +02:00
map
(fun annot ->
`List [`Int pagenum; Cpdfjson.json_of_object pdf (fun _ -> ()) false false annot])
(map (Pdf.direct pdf) annots)
2021-10-28 16:34:03 +02:00
| _ -> []
let list_annotations_json pdf =
let module J = Cpdfyojson.Safe in
2021-10-28 18:06:46 +02:00
let pages = Pdfpage.pages_of_pagetree pdf in
let pagenums = indx pages in
let json = `List (flatten (map2 (annotations_json_page pdf) pages pagenums)) in
2021-10-28 16:34:03 +02:00
J.pretty_to_channel stdout json
let list_annotations ~json encoding pdf =
let range = Cpdfpagespec.parse_pagespec pdf "all" in
2021-10-28 16:34:03 +02:00
if json
then list_annotations_json pdf
else iter_pages (list_page_annotations encoding pdf) pdf range
2013-08-20 16:32:57 +02:00
let get_annotations encoding pdf =
let pages = Pdfpage.pages_of_pagetree pdf in
flatten
(map2
(fun page pagenumber ->
match Pdf.lookup_direct pdf "/Annots" page.Pdfpage.rest with
| Some (Pdf.Array annots) ->
let strings =
map (get_annotation_string encoding pdf) (map (Pdf.direct pdf) annots)
in
combine (many pagenumber (length strings)) strings
| _ -> [])
pages
(ilist 1 (length pages)))
(* Equalise the page lengths of two PDFs by chopping or extending the first one.
*)
let equalise_lengths a b =
let a' =
if Pdfpage.endpage a < Pdfpage.endpage b then
Pdfpage.change_pages false a
2013-08-20 16:32:57 +02:00
(Pdfpage.pages_of_pagetree a @
many (Pdfpage.blankpage Pdfpaper.a4) (Pdfpage.endpage b - Pdfpage.endpage a))
else if Pdfpage.endpage a > Pdfpage.endpage b then
Pdfpage.change_pages false a
2013-08-20 16:32:57 +02:00
(take (Pdfpage.pages_of_pagetree a) (Pdfpage.endpage b))
else a
in
2015-07-29 17:04:36 +02:00
a', b
(* Copy annotations *)
(* FIXME: Why does this chop the files to the same length? Should be able to
apply annotations from a longer file to a shorter? *)
(* Rewrite any annotation destinations to point to pages in the
destination file. This prevents pages being copied, and ensures the links are
correct Any Indirect link inside a /Dest is rewritten if in the table. If not
inside a /Dest, nothing is rewritten. *)
let rec renumber_in_dest table indest = function
Pdf.Indirect i ->
begin
try Pdf.Indirect (Hashtbl.find table i) with _ -> Pdf.Indirect i
end
| Pdf.Array a ->
Pdf.recurse_array (renumber_in_dest table indest) a
| Pdf.Dictionary d ->
Pdf.Dictionary
2020-03-04 19:50:32 +01:00
(map
(function
("/Dest", v) -> ("/Dest", renumber_in_dest table true v)
| (k, v) -> (k, renumber_in_dest table indest v))
d)
| x -> x
let renumber_in_object pdf objnum table =
Pdf.addobj_given_num
pdf (objnum, (renumber_in_dest table false (Pdf.lookup_obj pdf objnum)))
2015-07-29 17:04:36 +02:00
let copy_annotations_page topdf frompdf frompage topage =
match Pdf.lookup_direct frompdf "/Annots" frompage.Pdfpage.rest with
Some (Pdf.Array frompage_annots as annots) ->
let table =
hashtable_of_dictionary
(combine
(Pdf.page_reference_numbers frompdf)
(Pdf.page_reference_numbers topdf))
in
2020-03-04 19:50:32 +01:00
iter
(function
2016-11-24 20:10:53 +01:00
(* FIXME: We assume they are indirects. Must also do direct, though rare.*)
Pdf.Indirect x ->
(*Printf.printf "Copying annotation %s which is\n%s\n"
(Pdfwrite.string_of_pdf (Pdf.Indirect x))
(Pdfwrite.string_of_pdf (Pdf.direct frompdf (Pdf.Indirect
x)));*)
renumber_in_object frompdf x table
| _ -> ())
frompage_annots;
let objects_to_copy = Pdf.objects_referenced [] [] frompdf annots in
iter
(fun n ->
ignore (Pdf.addobj_given_num topdf (n, Pdf.lookup_obj frompdf n)))
objects_to_copy;
let topage_annots =
match Pdf.lookup_direct frompdf "/Annots" topage.Pdfpage.rest with
| Some (Pdf.Array annots) -> annots
| _ -> []
in
let merged_dict = Pdf.Array (frompage_annots @ topage_annots) in
let topage' =
{topage with Pdfpage.rest =
Pdf.add_dict_entry topage.Pdfpage.rest "/Annots" merged_dict}
in
topdf, topage'
2015-07-29 17:04:36 +02:00
| Some x -> topdf, topage
| None -> topdf, topage
2013-08-20 16:32:57 +02:00
let copy_annotations range frompdf topdf =
let frompdf, topdf = equalise_lengths frompdf topdf in
2015-07-29 17:04:36 +02:00
match Pdf.renumber_pdfs [frompdf; topdf] with
| [frompdf; topdf] ->
let frompdf_pages = Pdfpage.pages_of_pagetree frompdf in
let topdf_pages = Pdfpage.pages_of_pagetree topdf in
2015-07-29 17:04:36 +02:00
let pdf = ref topdf
and pages = ref []
and pnum = ref 1
and frompdf_pages = ref frompdf_pages
and topdf_pages = ref topdf_pages in
(* Go through, updating pdf and collecting new pages. *)
while not (isnull !frompdf_pages) do
let frompdf_page = hd !frompdf_pages
and topdf_page = hd !topdf_pages in
let pdf', page =
if mem !pnum range
then copy_annotations_page !pdf frompdf frompdf_page topdf_page
else !pdf, topdf_page
2013-08-20 16:32:57 +02:00
in
2015-07-29 17:04:36 +02:00
pdf := pdf';
pages =| page;
incr pnum;
frompdf_pages := tl !frompdf_pages;
topdf_pages := tl !topdf_pages
done;
Pdfpage.change_pages true !pdf (rev !pages)
| _ -> assert false
2013-08-20 16:32:57 +02:00
2021-10-26 16:09:07 +02:00
let addrectangle
fast (w, h) colour outline linewidth opacity position relative_to_cropbox
underneath range pdf
=
let addrectangle_page _ page =
let resources', unique_extgstatename =
if opacity < 1.0 then
let dict =
match Pdf.lookup_direct pdf "/ExtGState" page.Pdfpage.resources with
| Some d -> d
| None -> Pdf.Dictionary []
in
let unique_extgstatename = Pdf.unique_key "gs" dict in
let dict' =
Pdf.add_dict_entry dict unique_extgstatename
(Pdf.Dictionary [("/ca", Pdf.Real opacity); ("/CA", Pdf.Real opacity)])
in
Pdf.add_dict_entry page.Pdfpage.resources "/ExtGState" dict', Some unique_extgstatename
else
page.Pdfpage.resources, None
in
let mediabox =
if relative_to_cropbox then
match Pdf.lookup_direct pdf "/CropBox" page.Pdfpage.rest with
| Some pdfobject -> Pdf.parse_rectangle (Pdf.direct pdf pdfobject)
| None -> Pdf.parse_rectangle page.Pdfpage.mediabox
else
Pdf.parse_rectangle page.Pdfpage.mediabox
in
let x, y, _ =
Cpdfposition.calculate_position false w mediabox Cpdfposition.Horizontal position
in
let x, y =
match position with
Cpdfposition.Top _ | Cpdfposition.TopLeft _ | Cpdfposition.TopRight _ -> (x, y -. h)
| Cpdfposition.Centre | Cpdfposition.PosCentre _ -> (x, y -. (h /. 2.))
| _ -> (x, y)
in
let ops =
[
Pdfops.Op_q;
Pdfops.Op_BMC "/CPDFSTAMP";
2021-11-15 20:17:15 +01:00
colour_op colour;
colour_op_stroke colour;
2021-10-26 16:09:07 +02:00
]
@
(if outline then [Pdfops.Op_w linewidth] else [])
@
(match unique_extgstatename with None -> [] | Some n -> [Pdfops.Op_gs n])
@
[
Pdfops.Op_re (x, y, w, h);
(if outline then Pdfops.Op_s else Pdfops.Op_f);
Pdfops.Op_EMC;
Pdfops.Op_Q
]
in
let page = {page with Pdfpage.resources = resources'} in
if underneath
then Pdfpage.prepend_operators pdf ops ~fast:fast page
else Pdfpage.postpend_operators pdf ops ~fast:fast page
in
process_pages (ppstub addrectangle_page) pdf range
2021-10-19 17:18:15 +02:00
(* Imposition *)
2013-08-20 16:32:57 +02:00
(* Union two rest dictionaries from the same PDF. *)
let combine_pdf_rests pdf a b =
let a_entries =
match a with
| Pdf.Dictionary entries -> entries
| _ -> []
in let b_entries =
match b with
| Pdf.Dictionary entries -> entries
| _ -> []
in
2021-10-26 16:36:56 +02:00
let keys_to_combine = ["/Annots"] in
let combine_entries key =
let a_entries =
match Pdf.lookup_direct pdf key a with
| Some (Pdf.Array d) -> d
| _ -> []
in let b_entries =
match Pdf.lookup_direct pdf key b with
| Some (Pdf.Array d) -> d
| _ -> []
in
if a_entries = [] && b_entries = [] then
None
else
Some (key, Pdf.Array (a_entries @ b_entries))
in
let unknown_keys_a = lose (fun (k, _) -> mem k keys_to_combine) a_entries in
let unknown_keys_b = lose (fun (k, _) -> mem k keys_to_combine) b_entries in
let combined_known_entries = option_map combine_entries keys_to_combine in
2021-10-25 19:27:40 +02:00
fold_left
(fun dict (k, v) -> Pdf.add_dict_entry dict k v)
(Pdf.Dictionary [])
(unknown_keys_a @ unknown_keys_b @ combined_known_entries)
2021-10-19 17:18:15 +02:00
(* Calculate the transformation matrices for a single imposed output page. *)
2021-10-25 16:35:35 +02:00
(* make margins by scaling for a fitted impose. *)
2021-10-21 14:51:14 +02:00
let make_margin output_mediabox margin tr =
if margin = 0. then tr else
let width, height =
match Pdf.parse_rectangle output_mediabox with
xmin, ymin, xmax, ymax -> xmax -. xmin, ymax -. ymin
in
if margin > width /. 2. || margin > height /. 2. then error "margin would fill whole page!" else
2021-10-21 19:10:47 +02:00
let hfactor = (width -. margin -. margin) /. width in
let vfactor = (height -. margin -. margin) /. height in
let factor = fmin hfactor vfactor in
2021-10-21 14:51:14 +02:00
let scale = Pdftransform.matrix_of_op (Pdftransform.Scale ((0., 0.), factor, factor)) in
2021-10-21 19:10:47 +02:00
let shift =
Pdftransform.matrix_of_op (Pdftransform.Translate ((width -. width *. factor) /. 2.,
(height -. height *. factor) /. 2.))
in
2021-10-21 14:51:14 +02:00
(Pdftransform.matrix_compose shift (Pdftransform.matrix_compose scale tr))
2021-11-12 22:59:24 +01:00
(* FIXME fixup -center for next release. For now it has been disabled. *)
2021-10-26 16:09:07 +02:00
let impose_transforms fit fx fy columns rtl btt center margin mediabox output_mediabox fit_extra_hspace fit_extra_vspace len =
2021-10-19 17:18:15 +02:00
let width, height =
match Pdf.parse_rectangle mediabox with
xmin, ymin, xmax, ymax -> xmax -. xmin, ymax -. ymin
in
let trs = ref [] in
2021-10-19 18:55:03 +02:00
let len = ref len in
2021-10-21 19:10:47 +02:00
let cent_extra_x = ref 0. in
let cent_extra_y = ref 0. in
2021-10-25 15:07:05 +02:00
let addtr x y row col px py =
let cex, cey =
2021-10-21 19:10:47 +02:00
(if rtl then ~-.(!cent_extra_x) else !cent_extra_x), (if btt then ~-.(!cent_extra_y) else !cent_extra_y)
2021-10-19 19:43:12 +02:00
in
2021-10-25 15:07:05 +02:00
let spacecol = if rtl then x - col - 1 else col in
let total_fit_extra_hspace = fit_extra_hspace *. (float_of_int spacecol +. 1.) in
2021-10-25 15:23:21 +02:00
let total_fit_extra_vspace = fit_extra_vspace *. (float_of_int row +. 1.) in
(*Printf.printf "row = %i, py = %f, ey = %f, fit_extra_vspace = %f, total_fit_extra_vspace = %f\n" row py cey fit_extra_vspace total_fit_extra_vspace;*)
2021-10-21 19:10:47 +02:00
trs :=
Pdftransform.matrix_of_transform
2021-10-25 15:07:05 +02:00
[Pdftransform.Translate (px +. cex +. total_fit_extra_hspace, py +. cey +. total_fit_extra_vspace)]
2021-10-21 19:10:47 +02:00
::!trs
2021-10-19 17:56:39 +02:00
in
2021-10-19 18:55:03 +02:00
let x = int_of_float fx in
let y = int_of_float fy in
2021-10-21 19:36:23 +02:00
let final_full_cols = !len mod x in
2021-10-25 15:45:55 +02:00
let final_full_rows = !len mod y in
2021-10-19 17:56:39 +02:00
let order row col =
2021-10-19 18:55:03 +02:00
((if btt then y - row - 1 else row), (if rtl then x - col - 1 else col))
2021-10-19 17:56:39 +02:00
in
if columns then
2021-10-19 17:18:15 +02:00
for col = 0 to x - 1 do
2021-10-21 19:10:47 +02:00
if center && !len < y then if !cent_extra_y = 0. then cent_extra_y := ~-.(height *. float_of_int (y - !len)) /. 2.;
2021-10-19 17:56:39 +02:00
for row = y - 1 downto 0 do
2021-10-25 15:45:55 +02:00
let original_row = row in
2021-10-19 17:56:39 +02:00
let row, col = order row col in
2021-10-25 15:45:55 +02:00
let adjusted_row =
let final_empty_rows = y - final_full_rows in
if center && !len <= final_full_rows then original_row + (y - 1 - 1 - (final_empty_rows / 2)) else original_row
in
if !len > 0 then addtr x y adjusted_row col (width *. float_of_int col) (height *. float_of_int row);
len := !len - 1
2021-10-19 17:56:39 +02:00
done
2021-10-19 17:18:15 +02:00
done
2021-10-19 17:56:39 +02:00
else
for row = y - 1 downto 0 do
2021-10-21 19:10:47 +02:00
if center && !len < x then if !cent_extra_x = 0. then cent_extra_x := (width *. float_of_int (x - !len)) /. 2.;
2021-10-19 17:56:39 +02:00
for col = 0 to x - 1 do
2021-10-21 19:36:23 +02:00
let original_col = col in
2021-10-19 17:56:39 +02:00
let row, col = order row col in
2021-10-21 19:10:47 +02:00
let adjusted_col =
2021-10-21 19:36:23 +02:00
let final_empty_cols = x - final_full_cols in
if center && !len <= final_full_cols then original_col + (x - 1 - 1 - (final_empty_cols / 2)) else original_col
2021-10-21 19:10:47 +02:00
in
2021-10-25 15:45:55 +02:00
if !len > 0 then addtr x y row adjusted_col (width *. float_of_int col) (height *. float_of_int row);
len := !len - 1
2021-10-19 17:56:39 +02:00
done
done;
2021-10-25 16:35:35 +02:00
map (if fit then make_margin output_mediabox margin else Fun.id) (rev !trs)
2021-10-19 17:18:15 +02:00
2019-07-08 16:42:19 +02:00
(* Combine two pages into one throughout the document. The pages have already
2021-10-19 17:18:15 +02:00
had their objects renumbered so as not to clash. *)
2021-10-26 16:09:07 +02:00
let impose_pages fit x y columns rtl btt center margin output_mediabox fast fit_extra_hspace fit_extra_vspace pdf = function
2013-08-20 16:32:57 +02:00
| [] -> assert false
| (h::_) as pages ->
let transforms =
2021-10-26 17:39:26 +02:00
impose_transforms
fit x y columns rtl btt center margin h.Pdfpage.mediabox
output_mediabox fit_extra_hspace fit_extra_vspace (length pages)
2013-08-20 16:32:57 +02:00
in
(* Change the pattern matrices before combining resources *)
let pages, h =
2021-10-19 17:18:15 +02:00
let r = map2 (fun p t -> change_pattern_matrices_page pdf t p) pages transforms in
(r, List.hd r)
in
let resources' = pair_reduce (combine_pdf_resources pdf) (map (fun p -> p.Pdfpage.resources) pages) in
let rest' = pair_reduce (combine_pdf_rests pdf) (map (fun p -> p.Pdfpage.rest) pages) in
2013-08-20 16:32:57 +02:00
let content' =
2021-10-26 17:18:09 +02:00
let transform_stream transform contents =
(* If fast, no mismatched q/Q protection and no parsing of operators. *)
if fast then
2021-10-26 17:39:26 +02:00
[Pdfops.stream_of_ops [Pdfops.Op_q; Pdfops.Op_cm transform]] @ contents @ [Pdfops.stream_of_ops [Pdfops.Op_Q]]
2021-10-26 17:18:09 +02:00
else
(* If slow, use protect from Pdfpage. *)
let ops = Pdfpage.protect pdf resources' contents @ Pdfops.parse_operators pdf resources' contents in
2021-10-26 17:39:26 +02:00
[Pdfops.stream_of_ops ([Pdfops.Op_q] @ [Pdfops.Op_cm transform] @ ops @ [Pdfops.Op_Q])]
2013-08-20 16:32:57 +02:00
in
2020-03-04 19:50:32 +01:00
flatten
(map2
2021-10-26 17:39:26 +02:00
(fun p t -> transform_annotations pdf t p.Pdfpage.rest; transform_stream t p.Pdfpage.content)
pages
transforms)
2013-08-20 16:32:57 +02:00
in
2021-10-21 14:51:14 +02:00
{Pdfpage.mediabox = output_mediabox;
2013-08-20 16:32:57 +02:00
Pdfpage.rotate = h.Pdfpage.rotate;
Pdfpage.content = content';
2019-07-08 16:42:19 +02:00
Pdfpage.resources = resources';
2021-10-19 17:18:15 +02:00
Pdfpage.rest = rest'}
2021-10-25 19:48:28 +02:00
(* For fit, we scale contents, move to middle and retain page size. For xy, we
2021-10-26 17:18:09 +02:00
expand mediabox and move contents to middle. This function also does the hard boxing. *)
2021-10-25 19:48:28 +02:00
let make_space fit ~fast spacing pdf =
2021-10-25 19:27:40 +02:00
let endpage = Pdfpage.endpage pdf in
let all = ilist 1 endpage in
2021-10-26 17:18:09 +02:00
let pdf = hard_box pdf all "/MediaBox" false fast in
if spacing = 0. then pdf else
let margin = spacing /. 2. in
2021-10-25 19:27:40 +02:00
let firstpage = hd (Pdfpage.pages_of_pagetree pdf) in
2021-10-25 19:48:28 +02:00
let width, height =
2021-10-25 19:27:40 +02:00
match Pdf.parse_rectangle firstpage.Pdfpage.mediabox with
2021-10-25 19:48:28 +02:00
xmin, ymin, xmax, ymax -> (xmax -. xmin, ymax -. ymin)
2021-10-25 19:27:40 +02:00
in
2021-10-25 19:48:28 +02:00
if fit then
2021-10-26 17:18:09 +02:00
(shift_pdf
2021-10-25 19:27:40 +02:00
~fast
(many (margin, margin) endpage)
2021-10-25 19:48:28 +02:00
(scale_contents ~fast (Cpdfposition.BottomLeft 0.) ((width -. spacing) /. width) pdf all)
2021-10-26 17:18:09 +02:00
all)
2021-10-25 19:48:28 +02:00
else
2021-10-26 17:18:09 +02:00
(set_mediabox
2021-10-25 19:48:28 +02:00
(many (0., 0., width +. spacing, height +. spacing) endpage)
2021-10-26 17:18:09 +02:00
(shift_pdf ~fast (many (margin, margin) endpage) pdf all) all)
2021-10-25 19:27:40 +02:00
2021-10-26 16:09:07 +02:00
(* We add the border as a thick unfilled rectangle just inside the page edge,
only if its linewidth is > 0 since, for us, 0 means none, not single-pixel
like in PDF. *)
let add_border linewidth ~fast pdf =
if linewidth = 0. then pdf else
let firstpage = hd (Pdfpage.pages_of_pagetree pdf) in
let _, _, w, h = Pdf.parse_rectangle firstpage.Pdfpage.mediabox in
addrectangle
2021-11-15 20:17:15 +01:00
fast (w -. linewidth, h -. linewidth) (RGB (0., 0., 0.)) true linewidth 1. (Cpdfposition.BottomLeft (linewidth /. 2.))
2021-10-26 16:09:07 +02:00
false false (ilist 1 (Pdfpage.endpage pdf)) pdf
2021-10-19 17:18:15 +02:00
let impose ~x ~y ~fit ~columns ~rtl ~btt ~center ~margin ~spacing ~linewidth ~fast pdf =
2021-10-26 17:18:09 +02:00
let endpage = Pdfpage.endpage pdf in
let pagenums = ilist 1 endpage in
let pdf = copy_cropbox_to_mediabox pdf pagenums in
let pdf = remove_cropping_pdf pdf pagenums in
let pdf = upright pagenums pdf in
2021-10-26 16:09:07 +02:00
let pdf = add_border linewidth ~fast pdf in
2021-10-25 19:48:28 +02:00
let pdf = make_space fit ~fast spacing pdf in
2021-10-20 19:51:53 +02:00
let firstpage = hd (Pdfpage.pages_of_pagetree pdf) in
let _, _, w, h = Pdf.parse_rectangle firstpage.Pdfpage.mediabox in
2021-10-19 20:26:02 +02:00
let ix = int_of_float x in
let iy = int_of_float y in
2021-10-21 19:10:47 +02:00
let n, ix, iy, fit_extra_hspace, fit_extra_vspace =
2021-10-20 19:51:53 +02:00
if fit then
(* +. 0.001 ensures a page always fits on itself, or on another page of same height or width. *)
let across = int_of_float (floor (x /. w +. 0.001)) in
let down = int_of_float (floor (y /. h +. 0.001)) in
if across < 1 || down < 1 then error "Not even a single page would fit." else
2021-10-21 19:10:47 +02:00
let excess_hspace = x -. float_of_int across *. w in
let excess_vspace = y -. float_of_int down *. h in
2021-10-25 15:45:55 +02:00
(*Printf.printf "across = %i, down =%i, excess_hspace = %f, excess_hspace = %f\n" across down excess_hspace excess_vspace;*)
2021-10-21 19:10:47 +02:00
(across * down,
across,
down,
excess_hspace /. (float_of_int across +. 1.),
excess_vspace /. (float_of_int down +. 1.))
2021-10-20 19:51:53 +02:00
else
if ix = 0 && iy = 0 then error "impose-xy: both dimensions cannot be zero" else
2021-10-21 19:10:47 +02:00
if ix = 0 then (endpage, endpage, 1, 0., 0.)
else if iy = 0 then (endpage, 1, endpage, 0., 0.)
else (ix * iy, ix, iy, 0., 0.)
2021-10-19 20:26:02 +02:00
in
2021-10-19 17:18:15 +02:00
let mediabox' =
if fit then Pdf.Array [Pdf.Real 0.; Pdf.Real 0.; Pdf.Real x; Pdf.Real y] else
2021-10-25 16:35:35 +02:00
let m2 = margin *. 2. in
if x = 0.0 then Pdf.Array [Pdf.Real 0.; Pdf.Real 0.; Pdf.Real (w *. float_of_int endpage +. m2); Pdf.Real (h +. m2)]
else if y = 0.0 then Pdf.Array [Pdf.Real 0.; Pdf.Real 0.; Pdf.Real (w +. m2); Pdf.Real (h *. float_of_int endpage +. m2)]
else Pdf.Array [Pdf.Real 0.; Pdf.Real 0.; Pdf.Real (w *. x +. m2); Pdf.Real (h *. y +. m2)]
2021-10-19 17:18:15 +02:00
in
let pages = Pdfpage.pages_of_pagetree pdf in
let pagesets = splitinto n pages in
let renumbered = map (Pdfpage.renumber_pages pdf) pagesets in
2021-10-25 16:35:35 +02:00
let pages =
2021-10-20 19:51:53 +02:00
map
2021-10-25 16:35:35 +02:00
(impose_pages fit (float_of_int ix) (float_of_int iy) columns rtl btt
2021-10-26 16:09:07 +02:00
center margin mediabox' fast fit_extra_hspace fit_extra_vspace pdf)
2021-10-20 19:51:53 +02:00
renumbered
in
2021-10-19 17:18:15 +02:00
let changes = map (fun x -> (x, (x + (n - 1)) / n)) pagenums in
2021-10-25 16:35:35 +02:00
let pdf = Pdfpage.change_pages ~changes true pdf pages in
if fit then pdf else shift_pdf ~fast (many (margin, margin) (length pages)) pdf (ilist 1 (Pdfpage.endpage pdf))
2013-08-20 16:32:57 +02:00
2021-10-20 19:51:53 +02:00
(* Legacy -twoup-stack. Impose 2x1 on a page twice the size then rotate. *)
2021-10-20 15:00:33 +02:00
let twoup_stack fast pdf =
let pdf =
impose
~x:2. ~y:1. ~fit:false ~columns:false ~rtl:false ~btt:false ~center:false
~margin:0. ~spacing:0. ~linewidth:0. ~fast pdf
in
let all = ilist 1 (Pdfpage.endpage pdf) in
upright ~fast all (rotate_pdf ~-90 pdf all)
2021-10-25 15:45:55 +02:00
(* Legacy -two-up. Rotate the pages and shrink them so as to fit 2x1 on a page the same size. *)
2021-10-20 15:00:33 +02:00
let twoup fast pdf =
let firstpage = hd (Pdfpage.pages_of_pagetree pdf) in
let width, height =
match Pdf.parse_rectangle firstpage.Pdfpage.mediabox with
xmin, ymin, xmax, ymax -> xmax -. xmin, ymax -. ymin
in
let width_exceeds_height = width > height in
let sc =
if width_exceeds_height
then fmin (height /. width) ((width /. 2.) /. height)
else fmin (width /. height) ((height /. 2.) /. width)
in
let endpage = Pdfpage.endpage pdf in
let all = ilist 1 endpage in
let pdf = scale_pdf ~fast (many (sc, sc) endpage) pdf all in
let pdf =
impose
~x:2. ~y:1. ~fit:false ~columns:false ~rtl:false ~btt:false ~center:true
~margin:0. ~spacing:0. ~linewidth:0. ~fast pdf
in
let endpage = Pdfpage.endpage pdf in
let all = ilist 1 endpage in
let pdf = upright all (rotate_pdf ~-90 pdf all) in
scale_to_fit_pdf ~fast Cpdfposition.Diagonal 1. (many (width, height) endpage) () pdf all
2013-08-20 16:32:57 +02:00
(* \section{Blacken text} *)
(*
\begin{verbatim}
Algorithm: Change
BT
<ops>
ET
...to...
BT
Op_g 0.
<ops minus any color, shading or gs operators>
ET
<ops minus any text positioning or text rendering ones>
\end{verbatim}
*)
2021-11-15 20:17:15 +01:00
let blacktext_ops colour pdf resources content =
2013-08-20 16:32:57 +02:00
let not_text = function
| Pdfops.Op_Tj _ | Pdfops.Op_TJ _
| Pdfops.Op_' _ | Pdfops.Op_'' (_, _, _)
| Pdfops.Op_Td (_, _) | Pdfops.Op_TD (_, _)
| Pdfops.Op_Tm _ | Pdfops.Op_T'
| Pdfops.Op_Tc _
| Pdfops.Op_Tw _
| Pdfops.Op_Tz _
| Pdfops.Op_TL _
| Pdfops.Op_Tf (_, _)
| Pdfops.Op_Tr _
| Pdfops.Op_Ts _ -> false
| _ -> true
in let textlevel = ref 0
in let removed = ref []
in let operators =
Pdfops.parse_operators pdf resources content
in
let rec remove_colourops prev = function
| [] -> rev prev
| Pdfops.Op_BT::more ->
incr textlevel;
remove_colourops
2021-11-15 20:17:15 +01:00
(colour_op colour::Pdfops.Op_BT::prev)
2013-08-20 16:32:57 +02:00
more
| Pdfops.Op_ET::more ->
decr textlevel;
let prev' = !removed @ Pdfops.Op_ET::prev in
removed := [];
remove_colourops prev' more
| (Pdfops.Op_G _
| Pdfops.Op_g _
| Pdfops.Op_RG (_, _, _)
| Pdfops.Op_rg (_, _, _)
| Pdfops.Op_k (_, _, _, _)
| Pdfops.Op_K (_, _, _, _)
| Pdfops.Op_SCN _
| Pdfops.Op_SC _
| Pdfops.Op_scn _
| Pdfops.Op_sc _
| Pdfops.Op_SCNName (_, _)
| Pdfops.Op_scnName (_, _)
| Pdfops.Op_CS _
| Pdfops.Op_cs _
| Pdfops.Op_sh _
| Pdfops.Op_gs _)
as op::more ->
if !textlevel > 0
then
begin
removed =| op;
remove_colourops prev more
end
else remove_colourops (op::prev) more
| op::more ->
if !textlevel > 0 && not_text op then removed =| op;
remove_colourops (op::prev) more
in
let operators' = remove_colourops [] operators in
[Pdfops.stream_of_ops operators']
(* Blacken a form xobject, writing it to the same object. *)
2021-11-15 20:17:15 +01:00
let blacktext c range pdf =
2013-08-20 16:32:57 +02:00
let blacktext_page _ page =
let content' =
2021-11-15 20:17:15 +01:00
blacktext_ops c pdf page.Pdfpage.resources page.Pdfpage.content
2013-08-20 16:32:57 +02:00
in
2021-11-15 20:17:15 +01:00
process_xobjects pdf page (blacktext_ops c);
2013-08-20 16:32:57 +02:00
{page with Pdfpage.content = content'}
in
process_pages (ppstub blacktext_page) pdf range
2013-08-20 16:32:57 +02:00
(* \section{Blacken lines} *)
2021-11-15 20:17:15 +01:00
let blacklines_ops c pdf resources content =
2013-08-20 16:32:57 +02:00
let rec blacken_strokeops prev = function
| [] -> rev prev
| Pdfops.Op_CS _::t ->
2016-11-09 19:15:23 +01:00
blacken_strokeops (Pdfops.Op_CS "/DeviceRGB"::prev) t
2013-08-20 16:32:57 +02:00
| (Pdfops.Op_SC _ | Pdfops.Op_SCN _ | Pdfops.Op_SCNName _ | Pdfops.Op_G _
| Pdfops.Op_RG _ | Pdfops.Op_K _)::t ->
2021-11-15 20:17:15 +01:00
blacken_strokeops (colour_op_stroke c::prev) t
2013-08-20 16:32:57 +02:00
| h::t -> blacken_strokeops (h::prev) t
and operators =
Pdfops.parse_operators pdf resources content
in
let operators' = blacken_strokeops [] operators in
[Pdfops.stream_of_ops operators']
2021-11-15 20:17:15 +01:00
let blacklines c range pdf =
2013-08-20 16:32:57 +02:00
let blacklines_page _ page =
let content' =
2021-11-15 20:17:15 +01:00
blacklines_ops c pdf page.Pdfpage.resources page.Pdfpage.content
2013-08-20 16:32:57 +02:00
in
2021-11-15 20:17:15 +01:00
process_xobjects pdf page (blacklines_ops c);
2013-08-20 16:32:57 +02:00
{page with Pdfpage.content = content'}
in
process_pages (ppstub blacklines_page) pdf range
2013-08-20 16:32:57 +02:00
(* \section{Blacken Fills} *)
2021-11-15 20:17:15 +01:00
let blackfills_ops c pdf resources content =
2013-08-20 16:32:57 +02:00
let rec blacken_fillops prev = function
| [] -> rev prev
| Pdfops.Op_cs _::t ->
2016-11-09 19:15:23 +01:00
blacken_fillops (Pdfops.Op_cs "/DeviceRGB"::prev) t
2013-08-20 16:32:57 +02:00
| (Pdfops.Op_sc _ | Pdfops.Op_scn _ | Pdfops.Op_scnName _ | Pdfops.Op_g _
| Pdfops.Op_rg _ | Pdfops.Op_k _)::t ->
2021-11-15 20:17:15 +01:00
blacken_fillops (colour_op c::prev) t
2013-08-20 16:32:57 +02:00
| h::t -> blacken_fillops (h::prev) t
and operators =
Pdfops.parse_operators pdf resources content
in
let operators' = blacken_fillops [] operators in
[Pdfops.stream_of_ops operators']
2021-11-15 20:17:15 +01:00
let blackfills c range pdf =
2013-08-20 16:32:57 +02:00
let blackfills_page _ page =
let content' =
2021-11-15 20:17:15 +01:00
blackfills_ops c pdf page.Pdfpage.resources page.Pdfpage.content
2013-08-20 16:32:57 +02:00
in
2021-11-15 20:17:15 +01:00
process_xobjects pdf page (blackfills_ops c);
2013-08-20 16:32:57 +02:00
{page with Pdfpage.content = content'}
in
process_pages (ppstub blackfills_page) pdf range
2013-08-20 16:32:57 +02:00
(* \section{Set a minimum line width to avoid dropout} *)
let thinlines range width pdf =
let thinpage _ page =
let operators =
Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content
in
let ctmstack = ref [ref Pdftransform.i_matrix] in
let scaleof_ctm () =
try
match Pdftransform.decompose (!(hd !ctmstack)) with
(scale, _, _, _, _, _) ->
scale
with
2015-11-25 12:57:48 +01:00
Failure _ (*"hd"*) -> 1.
2013-08-20 16:32:57 +02:00
in
let rec replace_operators prev = function
| [] -> rev prev
| (Pdfops.Op_w w)::more ->
(* Alter width. *)
let width' = width /. scaleof_ctm () in
let w' =
if w >= width' then Pdfops.Op_w w else Pdfops.Op_w width'
in
replace_operators (w'::prev) more
| (Pdfops.Op_cm m)::more ->
(* Update CTM *)
begin try
let top = hd !ctmstack in
top := Pdftransform.matrix_compose !top m
with
2015-11-25 12:57:48 +01:00
Failure _ (*"hd"*) -> error "Malformed file."
2013-08-20 16:32:57 +02:00
end;
replace_operators ((Pdfops.Op_cm m)::prev) more
| Pdfops.Op_q::more ->
(* Push stack *)
begin try
ctmstack =| ref (!(hd !ctmstack))
with
2015-11-25 12:57:48 +01:00
Failure _ (*"hd"*) -> error "Malformed file"
2013-08-20 16:32:57 +02:00
end;
replace_operators (Pdfops.Op_q::prev) more
| Pdfops.Op_Q::more ->
(* Pop stack *)
begin try
ctmstack := tl !ctmstack
with
2015-11-25 12:57:48 +01:00
Failure _ (*"tl"*) -> error "Malformed file"
2013-08-20 16:32:57 +02:00
end;
replace_operators (Pdfops.Op_Q::prev) more
| (Pdfops.Op_gs gsname)::more ->
(* Perhaps insert [Op_w]. *)
let opw =
match Pdf.lookup_direct pdf "/ExtGState" page.Pdfpage.resources with
| None -> []
| Some ext_state_dict ->
match Pdf.lookup_direct pdf gsname ext_state_dict with
| None -> []
| Some gdict ->
match Pdf.lookup_direct pdf "/LW" gdict with
| Some s -> (try [Pdfops.Op_w (Pdf.getnum s)] with _ -> [])
| None -> []
in
replace_operators (opw @ ((Pdfops.Op_gs gsname)::prev)) more
| x::more -> replace_operators (x::prev) more
in
let operators = replace_operators [] operators in
(* 2. Add an initial 'w' if width more than default width *)
let operators =
if width > 1. then (Pdfops.Op_w width)::operators else operators
in
let content' = [Pdfops.stream_of_ops operators] in
{page with Pdfpage.content = content'}
in
process_pages (ppstub thinpage) pdf range
2013-08-20 16:32:57 +02:00
(* \section{Remove annotations} *)
let remove_annotations range pdf =
let remove_annotations_page pagenum page =
if mem pagenum range then
let rest' =
Pdf.remove_dict_entry page.Pdfpage.rest "/Annots"
in
{page with Pdfpage.rest = rest'}
else
page
in
process_pages (ppstub remove_annotations_page) pdf range
2013-08-20 16:32:57 +02:00
(* \section{Making draft documents} *)
(* Predicate on an xobject: true if an image xobject. *)
let isimage pdf (_, xobj) =
match Pdf.lookup_direct pdf "/Subtype" xobj with
| Some (Pdf.Name "/Image") -> true
| _ -> false
(* Given a set of resources for a page, and the name of a resource, determine if
that name refers to an image xobject. *)
let xobject_isimage pdf resources name =
match resources with
| Pdf.Dictionary _ ->
begin match Pdf.lookup_direct pdf "/XObject" resources with
| Some xobjects ->
isimage pdf ("", Pdf.lookup_fail "xobject not there" pdf name xobjects)
| _ -> false
end
| _ -> failwith "bad resources"
(* The subsitute for an image. *)
let substitute boxes =
if boxes then
rev
[Pdfops.Op_q;
Pdfops.Op_w 0.;
Pdfops.Op_G 0.;
Pdfops.Op_re (0., 0., 1., 1.);
Pdfops.Op_m (0., 0.);
Pdfops.Op_l (1., 1.);
Pdfops.Op_m (0., 1.);
Pdfops.Op_l (1., 0.);
Pdfops.Op_S;
Pdfops.Op_Q]
else
[]
(* Remove references to images from a graphics stream. *)
let rec remove_images_stream onlyremove boxes pdf resources prev = function
2013-08-20 16:32:57 +02:00
| [] -> rev prev
| (Pdfops.Op_Do name) as h::t ->
if xobject_isimage pdf resources name && (match onlyremove with None -> true | Some x -> x = name)
2019-07-09 18:54:02 +02:00
then remove_images_stream onlyremove boxes pdf resources (substitute boxes @ prev) t
else remove_images_stream onlyremove boxes pdf resources (h::prev) t
| Pdfops.InlineImage _ as h::t ->
if onlyremove <> None
then remove_images_stream onlyremove boxes pdf resources (h::prev) t
else remove_images_stream onlyremove boxes pdf resources (substitute boxes @ prev) t
2013-08-20 16:32:57 +02:00
| h::t ->
remove_images_stream onlyremove boxes pdf resources (h::prev) t
2013-08-20 16:32:57 +02:00
let rec process_form_xobject onlyremove boxes pdf form =
2013-08-20 16:32:57 +02:00
let form = Pdf.direct pdf form in
let page =
{Pdfpage.content = [form];
Pdfpage.mediabox = Pdf.Null;
Pdfpage.resources =
begin match Pdf.lookup_direct pdf "/Resources" form with
| Some r -> r
| None -> Pdf.Dictionary []
end;
Pdfpage.rotate = Pdfpage.Rotate0;
Pdfpage.rest = Pdf.Dictionary []}
in
let page', pdf =
remove_images_page onlyremove boxes pdf page
2013-08-20 16:32:57 +02:00
in
let form' =
match form with
| Pdf.Stream {contents = (dict, _)} ->
begin match
Pdfops.stream_of_ops
(Pdfops.parse_operators pdf (Pdf.Dictionary []) page'.Pdfpage.content)
with
| Pdf.Stream {contents = (_, Pdf.Got data)} ->
let dict' =
Pdf.add_dict_entry dict "/Length" (Pdf.Integer (bytes_size data))
in
Pdf.Stream {contents = (dict', Pdf.Got data)}
| _ -> assert false
end
| _ -> raise (Pdf.PDFError "not a stream")
in
form', pdf
(* Remove images from a page. *)
and remove_images_page onlyremove boxes pdf page =
2013-08-20 16:32:57 +02:00
let isform pdf xobj =
match Pdf.lookup_direct pdf "/Subtype" xobj with Some (Pdf.Name "/Form") -> true | _ -> false
in
2019-07-09 18:54:02 +02:00
let isimage pdf xobj =
match Pdf.lookup_direct pdf "/Subtype" xobj with Some (Pdf.Name "/Image") -> true | _ -> false
in
2013-08-20 16:32:57 +02:00
(* Remove image xobjects and look into form ones *)
2019-07-09 18:54:02 +02:00
let form_xobjects, image_xobjects =
2013-08-20 16:32:57 +02:00
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
| Some (Pdf.Dictionary elts) ->
2019-07-09 18:54:02 +02:00
keep (function (_, p) -> isform pdf p) elts,
keep (function (_, p) -> isimage pdf p) elts
| _ -> [], []
2013-08-20 16:32:57 +02:00
in
let resources', pdf =
let names, pointers = split form_xobjects in
let form_xobjects', pdf =
let pdf = ref pdf
in let outputs = ref [] in
iter
(fun p ->
let p', pdf' = process_form_xobject onlyremove boxes !pdf p in
2013-08-20 16:32:57 +02:00
pdf := pdf';
outputs =| p')
pointers;
rev !outputs, !pdf
in
let nums = ref [] in
iter
(fun xobj ->
let objnum = Pdf.addobj pdf xobj in
nums =| objnum)
form_xobjects';
2019-07-09 18:54:02 +02:00
let image_xobjects' =
match onlyremove with
2021-02-16 20:06:26 +01:00
None -> []
2019-07-09 18:54:02 +02:00
| Some n -> option_map (function (n', _) as xobj -> if n = n' then None else Some xobj) image_xobjects
in
2013-08-20 16:32:57 +02:00
let newdict =
2019-07-09 18:54:02 +02:00
Pdf.Dictionary (image_xobjects' @ combine names (map (fun x -> Pdf.Indirect x) (rev !nums)))
2013-08-20 16:32:57 +02:00
in
Pdf.add_dict_entry page.Pdfpage.resources "/XObject" newdict, pdf
in
let content' =
remove_images_stream onlyremove boxes pdf page.Pdfpage.resources []
2013-08-20 16:32:57 +02:00
(Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content)
in
{page with
Pdfpage.content =
(let stream = Pdfops.stream_of_ops content' in
Pdfcodec.encode_pdfstream pdf Pdfcodec.Flate stream;
[stream]);
Pdfpage.resources = resources'}, pdf
(* Remove images from all pages in a document. *)
2019-07-09 17:31:45 +02:00
let draft onlyremove boxes range pdf =
2013-08-20 16:32:57 +02:00
let pages = Pdfpage.pages_of_pagetree pdf in
let pagenums = indx pages in
let pdf = ref pdf
in let pages' = ref [] in
iter2
(fun p pagenum ->
let p', pdf' =
if mem pagenum range
then remove_images_page onlyremove boxes !pdf p
2013-08-20 16:32:57 +02:00
else p, !pdf
in
pdf := pdf';
pages' =| p')
pages
pagenums;
Pdfpage.change_pages true !pdf (rev !pages')
2019-10-02 14:41:56 +02:00
(* Parse the new content to make sure syntactically ok, append
* as required. Rewrite the content *)
let append_page_content_page fast s before pdf n page =
let ops =
Pdfops.parse_stream pdf page.Pdfpage.resources [bytes_of_string s]
in
(if before then Pdfpage.prepend_operators else Pdfpage.postpend_operators)
pdf ops ~fast page
let append_page_content s before fast range pdf =
process_pages (ppstub (append_page_content_page fast s before pdf)) pdf range
2019-10-02 14:41:56 +02:00
(* Add rectangles on top of pages to show Media, Crop, Art, Trim, Bleed boxes.
*
* We use different dash lengths and colours to help distinguish coincident
* boxes The sequence of operators is postpended to the page content,
* appropriately protected to prevent pollution of matrices.
*
* /MediaBox: Solid red line
* /CropBox: Dashed 7 on 7 off green line
* /ArtBox: Dashed 5 on 5 off blue line
* /TrimBox: Dashed 3 on 3 off orange line
* /BleedBox: Dashed 2 on 2 off pink line *)
let get_rectangle pdf page box =
if box = "/MediaBox" then
match page.Pdfpage.mediabox with
Pdf.Array [a; b; c; d] as r -> Some (Pdf.parse_rectangle r)
| _ -> None
else
match Pdf.lookup_direct pdf box page.Pdfpage.rest with
Some (Pdf.Array [a; b; c; d] as r) -> Some (Pdf.parse_rectangle r)
| _ -> None
let show_boxes_page fast pdf _ page =
let make_ops (r, g, b) on off boxname =
match get_rectangle pdf page boxname with
Some (r1, r2, r3, r4) ->
[Pdfops.Op_q;
Pdfops.Op_RG (r /. 255., g /. 255., b /. 255.);
Pdfops.Op_w 1.;
Pdfops.Op_d ((if on = 0. && off = 0. then [] else [on; off]), 0.);
Pdfops.Op_re (r1, r2, r3 -. r1, r4 -. r2);
Pdfops.Op_S;
Pdfops.Op_Q]
| None -> []
in
let ops =
make_ops (255., 0., 0.) 0. 0. "/MediaBox"
@ make_ops (0., 255., 0.) 7. 7. "/CropBox"
@ make_ops (0., 0., 255.) 5. 5. "/ArtBox"
@ make_ops (255.,150.,0.) 3. 3. "/TrimBox"
@ make_ops (255.,9.,147.) 2. 2. "/BleedBox"
in
Pdfpage.postpend_operators pdf ops ~fast page
let show_boxes ?(fast=false) pdf range =
process_pages (ppstub (show_boxes_page fast pdf)) pdf range
let allowance = 9.
let line (x0, y0, x1, y1) =
[Pdfops.Op_m (x0, y0);
Pdfops.Op_l (x1, y1);
Pdfops.Op_s]
let trim_marks_page fast pdf n page =
match get_rectangle pdf page "/TrimBox", get_rectangle pdf page "/MediaBox" with
| Some (tminx, tminy, tmaxx, tmaxy), Some (minx, miny, maxx, maxy) ->
let ops =
[Pdfops.Op_q;
Pdfops.Op_K (1., 1., 1., 1.);
Pdfops.Op_w 1.]
@ line (minx, tmaxy, tminy -. allowance, tmaxy) (* top left *)
@ line (tminx, tmaxy +. allowance, tminx, maxy)
@ line (tmaxx +. allowance, tmaxy, maxx, tmaxy) (* top right *)
@ line (tmaxx, tmaxy +. allowance, tmaxx, maxy)
@ line (tmaxx +. allowance, tminy, maxx, tminy) (* bottom right *)
@ line (tmaxx, tminy -. allowance, tmaxx, miny)
@ line (tminx -. allowance, tminy, minx, tminy) (* bottom left *)
@ line (tminx, tminy -. allowance, tminx, miny)
@ [Pdfops.Op_Q]
in
Pdfpage.postpend_operators pdf ops ~fast page
2021-07-23 17:26:07 +02:00
| _, _ ->
(*Printf.eprintf "warning: no /TrimBox found on page %i\n%!" n;*)
page
let trim_marks ?(fast=false) pdf range =
process_pages (ppstub (trim_marks_page fast pdf)) pdf range
2020-11-14 17:40:01 +01:00
let rec remove_all_text_ops pdf resources content =
let is_textop = function
Pdfops.Op_Tj _ | Pdfops.Op_' _ | Pdfops.Op_'' _ | Pdfops.Op_TJ _ -> true
| _ -> false
in
let content' =
let ops = Pdfops.parse_operators pdf resources content in
Pdfops.stream_of_ops
(option_map (function x -> if is_textop x then None else Some x) ops)
in
[content']
let remove_all_text_page pdf p =
let resources = p.Pdfpage.resources in
let content = p.Pdfpage.content in
process_xobjects pdf p remove_all_text_ops;
{p with Pdfpage.content = remove_all_text_ops pdf resources content}, pdf
let remove_all_text range pdf =
let pages = Pdfpage.pages_of_pagetree pdf in
let pagenums = indx pages in
let pdf = ref pdf in
let pages' = ref [] in
iter2
(fun p pagenum ->
let p', pdf' =
if mem pagenum range
then remove_all_text_page !pdf p
else p, !pdf
in
pdf := pdf';
pages' =| p')
pages
pagenums;
Pdfpage.change_pages true !pdf (rev !pages')
(* 1. Extend remove_dict_entry with search term
2. Implement replace_dict_entry by analogy to remove_dict_entry *)
2021-10-29 16:09:21 +02:00
let rec dict_entry_single_object f pdf = function
| (Pdf.Dictionary d) -> f (Pdf.recurse_dict (dict_entry_single_object f pdf) d)
| (Pdf.Stream {contents = (Pdf.Dictionary dict, data)}) ->
2021-10-29 16:09:21 +02:00
f (Pdf.Stream {contents = (Pdf.recurse_dict (dict_entry_single_object f pdf) dict, data)})
| Pdf.Array a -> Pdf.recurse_array (dict_entry_single_object f pdf) a
| x -> x
2021-10-29 16:09:21 +02:00
(* FIXME are we sure that functional values can never appear in the equality here? *)
let remove_dict_entry pdf key search =
let f d =
match search with
| None -> Pdf.remove_dict_entry d key
| Some s ->
match Pdf.lookup_direct pdf key d with
| Some v when v = s -> Pdf.remove_dict_entry d key
| _ -> d
in
Pdf.objselfmap (dict_entry_single_object f pdf) pdf;
pdf.Pdf.trailerdict <- dict_entry_single_object f pdf pdf.Pdf.trailerdict
2020-11-14 17:40:01 +01:00
2021-10-29 16:09:21 +02:00
let replace_dict_entry pdf key value search =
let f d =
match search with
| None -> Pdf.replace_dict_entry d key value
| Some s ->
match Pdf.lookup_direct pdf key d with
| Some v when v = s -> Pdf.replace_dict_entry d key value
| _ -> d
in
Pdf.objselfmap (dict_entry_single_object f pdf) pdf;
pdf.Pdf.trailerdict <- dict_entry_single_object f pdf pdf.Pdf.trailerdict
(* FIXME no need to self map here, since nothing changes *)
let print_dict_entry pdf key =
let f d =
match Pdf.lookup_direct pdf key d with
| Some v ->
(* We use a double newline as a separator. *)
Printf.printf "%s\n\n" (Cpdfyojson.Safe.to_string (Cpdfjson.json_of_object pdf (fun _ -> ()) false false v));
d
| None -> d
in
Pdf.objselfmap (dict_entry_single_object f pdf) pdf;
pdf.Pdf.trailerdict <- dict_entry_single_object f pdf pdf.Pdf.trailerdict
2021-10-28 18:06:46 +02:00
2020-11-14 17:40:01 +01:00
let remove_clipping_ops pdf resources content =
let ops = Pdfops.parse_operators pdf resources content in
let rec process a = function
Pdfops.Op_W::Pdfops.Op_n::t -> process (Pdfops.Op_n::a) t
| h::t -> process (h::a) t
| [] -> rev a
in
[Pdfops.stream_of_ops (process [] ops)]
let remove_clipping pdf range =
let remove_clipping_page _ page =
let content' =
remove_clipping_ops pdf page.Pdfpage.resources page.Pdfpage.content
in
process_xobjects pdf page remove_clipping_ops;
{page with Pdfpage.content = content'}
in
process_pages (ppstub remove_clipping_page) pdf range
2020-11-25 17:54:15 +01:00
(* Image resolution *)
type xobj =
| Image of int * int (* width, height *)
| Form of Pdftransform.transform_matrix * Pdf.pdfobject * Pdf.pdfobject (* Will add actual data later. *)
let image_results = ref []
let add_image_result i =
image_results := i::!image_results
(* Given a page and a list of (pagenum, name, thing) *)
let rec image_resolution_page pdf page pagenum dpi (images : (int * string * xobj) list) =
try
let pageops = Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content
and transform = ref [ref Pdftransform.i_matrix] in
iter
(function
| Pdfops.Op_cm matrix ->
begin match !transform with
| [] -> raise (Failure "no transform")
| _ -> (hd !transform) := Pdftransform.matrix_compose !(hd !transform) matrix
end
| Pdfops.Op_Do xobject ->
let trans (x, y) =
match !transform with
| [] -> raise (Failure "no transform")
| _ -> Pdftransform.transform_matrix !(hd !transform) (x, y)
in
let o = trans (0., 0.)
and x = trans (1., 0.)
and y = trans (0., 1.)
in
(*i Printf.printf "o = %f, %f, x = %f, %f, y = %f, %f\n" (fst o) (snd o) (fst x) (snd x) (fst y) (snd y); i*)
let rec lookup_image k = function
| [] -> assert false
| (_, a, _) as h::_ when a = k -> h
| _::t -> lookup_image k t
in
begin match lookup_image xobject images with
| (pagenum, name, Form (xobj_matrix, content, resources)) ->
let content =
(* Add in matrix etc. *)
let total_matrix = Pdftransform.matrix_compose xobj_matrix !(hd !transform) in
let ops =
Pdfops.Op_cm total_matrix::
Pdfops.parse_operators pdf resources [content]
in
Pdfops.stream_of_ops ops
in
let page =
{Pdfpage.content = [content];
Pdfpage.mediabox = Pdfpage.rectangle_of_paper Pdfpaper.a4;
2020-11-25 17:54:15 +01:00
Pdfpage.resources = resources;
Pdfpage.rotate = Pdfpage.Rotate0;
Pdfpage.rest = Pdf.Dictionary []}
in
let newpdf = Pdfpage.change_pages false pdf [page] in
image_resolution newpdf [pagenum] dpi
| (pagenum, name, Image (w, h)) ->
let lx = Pdfunits.convert 0. Pdfunits.PdfPoint Pdfunits.Inch (distance_between o x)
and ly = Pdfunits.convert 0. Pdfunits.PdfPoint Pdfunits.Inch (distance_between o y) in
let wdpi = float w /. lx
and hdpi = float h /. ly in
add_image_result (pagenum, xobject, w, h, wdpi, hdpi)
(*Printf.printf "%i, %s, %i, %i, %f, %f\n" pagenum xobject w h wdpi hdpi*)
(*i else
Printf.printf "S %i, %s, %i, %i, %f, %f\n" pagenum xobject (int_of_float w) (int_of_float h) wdpi hdpi i*)
end
| Pdfops.Op_q ->
begin match !transform with
| [] -> raise (Failure "Unbalanced q/Q ops")
| h::t ->
let h' = ref Pdftransform.i_matrix in
h' := !h;
transform := h'::h::t
end
| Pdfops.Op_Q ->
begin match !transform with
| [] -> raise (Failure "Unbalanced q/Q ops")
| _ -> transform := tl !transform
end
| _ -> ())
pageops
with
e -> Printf.printf "Error %s\n" (Printexc.to_string e); flprint "\n"
and image_resolution pdf range dpi =
let images = ref [] in
iter_pages
(fun pagenum page ->
(* 1. Get all image names and their native resolutions from resources as string * int * int *)
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
| Some (Pdf.Dictionary xobjects) ->
iter
(function (name, xobject) ->
match Pdf.lookup_direct pdf "/Subtype" xobject with
| Some (Pdf.Name "/Image") ->
let width =
match Pdf.lookup_direct pdf "/Width" xobject with
| Some x -> Pdf.getnum x
| None -> 1.
and height =
match Pdf.lookup_direct pdf "/Height" xobject with
| Some x -> Pdf.getnum x
| None -> 1.
in
images := (pagenum, name, Image (int_of_float width, int_of_float height))::!images
| Some (Pdf.Name "/Form") ->
let resources =
match Pdf.lookup_direct pdf "/Resources" xobject with
| None -> page.Pdfpage.resources (* Inherit from page or form above. *)
| Some r -> r
and contents =
xobject
and matrix =
match Pdf.lookup_direct pdf "/Matrix" xobject with
| Some (Pdf.Array [a; b; c; d; e; f]) ->
{Pdftransform.a = Pdf.getnum a; Pdftransform.b = Pdf.getnum b; Pdftransform.c = Pdf.getnum c;
Pdftransform.d = Pdf.getnum d; Pdftransform.e = Pdf.getnum e; Pdftransform.f = Pdf.getnum f}
| _ -> Pdftransform.i_matrix
in
images := (pagenum, name, Form (matrix, contents, resources))::!images
| _ -> ()
)
xobjects
| _ -> ())
pdf
range;
(* Now, split into differing pages, and call [image_resolution_page] on each one *)
let pagesplits =
map
(function (a, _, _)::_ as ls -> (a, ls) | _ -> assert false)
(collate (fun (a, _, _) (b, _, _) -> compare a b) (rev !images))
and pages =
Pdfpage.pages_of_pagetree pdf
in
iter
(function (pagenum, images) ->
let page = select pagenum pages in
image_resolution_page pdf page pagenum dpi images)
pagesplits
let image_resolution pdf range dpi =
image_results := [];
image_resolution pdf range dpi;
rev !image_results
2021-11-12 23:50:31 +01:00
(* copy the contents of the box f to the box t. If mediabox_if_missing is set,
the contents of the mediabox will be used if the from fox is not available. If
mediabox_is_missing is false, the page is unaltered. *)
let copy_box f t mediabox_if_missing pdf range =
process_pages
(ppstub (fun _ page ->
if f = "/MediaBox" then
{page with Pdfpage.rest =
(Pdf.add_dict_entry page.Pdfpage.rest t (page.Pdfpage.mediabox))}
else
match Pdf.lookup_direct pdf f page.Pdfpage.rest with
| Some pdfobject ->
if t = "/MediaBox"
then {page with
Pdfpage.mediabox = Pdf.direct pdf pdfobject}
else {page with Pdfpage.rest =
(Pdf.add_dict_entry page.Pdfpage.rest t (Pdf.direct pdf pdfobject))}
| None ->
if mediabox_if_missing
then {page with Pdfpage.rest = Pdf.add_dict_entry page.Pdfpage.rest t page.Pdfpage.mediabox}
else page))
pdf
range
let remove_unused_resources_page pdf n page =
let xobjects, all_names =
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
| Some (Pdf.Dictionary d) -> Pdf.Dictionary d, map fst d
| _ -> Pdf.Dictionary [], []
in
let names_to_keep =
option_map
(function Pdfops.Op_Do n -> Some n | _ -> None)
(Pdfops.parse_operators pdf page.Pdfpage.resources page.Pdfpage.content)
in
let names_to_remove = lose (mem' names_to_keep) all_names in
let xobjdict = fold_left (Pdf.remove_dict_entry) xobjects names_to_remove in
{page with Pdfpage.resources = Pdf.add_dict_entry page.Pdfpage.resources "/XObject" xobjdict}
let remove_unused_resources pdf =
process_pages (ppstub (remove_unused_resources_page pdf)) pdf (ilist 1 (Pdfpage.endpage pdf))
(* Indent bookmarks in each file by one and add a title bookmark pointing to the first page. *)
let add_bookmark_title filename use_title pdf =
let title =
if use_title then
2021-12-19 13:55:06 +01:00
match Cpdfmetadata.get_info_utf8 pdf "/Title", Cpdfmetadata.get_xmp_info pdf "/Title" with
2021-11-12 23:50:31 +01:00
"", x | x, "" | _, x -> x
else
Filename.basename filename
in
let marks = Pdfmarks.read_bookmarks pdf in
let page1objnum =
match Pdfpage.page_object_number pdf 1 with
None -> error "add_bookmark_title: page not found"
| Some x -> x
in
let newmarks =
{Pdfmarks.level = 0;
Pdfmarks.text = title;
Pdfmarks.target = Pdfdest.XYZ (Pdfdest.PageObject page1objnum, None, None, None);
Pdfmarks.isopen = false}
::map (function m -> {m with Pdfmarks.level = m.Pdfmarks.level + 1}) marks
in
Pdfmarks.add_bookmarks newmarks pdf
let bookmarks_open_to_level n pdf =
let marks = Pdfmarks.read_bookmarks pdf in
let newmarks =
map
(fun m -> {m with Pdfmarks.isopen = m.Pdfmarks.level < n})
marks
in
Pdfmarks.add_bookmarks newmarks pdf
let create_pdf pages pagesize =
let page =
{(Pdfpage.blankpage pagesize) with
Pdfpage.content = [Pdfops.stream_of_ops []];
Pdfpage.resources = Pdf.Dictionary []}
in
let pdf, pageroot = Pdfpage.add_pagetree (many page pages) (Pdf.empty ()) in
Pdfpage.add_root pageroot [] pdf
2021-11-15 18:30:11 +01:00
let get_bookmark_name encoding pdf marks splitlevel n _ =
let refnums = Pdf.page_reference_numbers pdf in
let fastrefnums = hashtable_of_dictionary (combine refnums (indx refnums)) in
match keep (function m -> n = Pdfpage.pagenumber_of_target ~fastrefnums pdf m.Pdfmarks.target && m.Pdfmarks.level <= splitlevel) marks with
2021-12-19 14:38:27 +01:00
| {Pdfmarks.text = title}::_ -> Cpdfattach.remove_unsafe_characters encoding title
2021-11-15 18:30:11 +01:00
| _ -> ""
(* @F means filename without extension *)
(* @N means sequence number with no padding *)
(* @S means start page of this section *)
(* @E means end page of this section *)
(* @B means bookmark name at start page *)
let process_others encoding marks pdf splitlevel filename sequence startpage endpage s =
let rec find_ats p = function
'@'::r -> find_ats (p + 1) r
| r -> (p, r)
in
let string_of_int_width w i =
if w < 0 then raise (Pdf.PDFError "width of field too narrow")
else if w > 8 then raise (Pdf.PDFError "width of field too broad") else
let formats =
[|format_of_string "%i";
format_of_string "%i";
format_of_string "%02i";
format_of_string "%03i";
format_of_string "%04i";
format_of_string "%05i";
format_of_string "%06i";
format_of_string "%07i";
format_of_string "%08i"|]
in
Printf.sprintf formats.(w) i
in
let rec procss prev = function
| [] -> rev prev
| '@'::'F'::t -> procss (rev (explode filename) @ prev) t
| '@'::'N'::t ->
let width, rest = find_ats 0 t in
procss (rev (explode (string_of_int_width width sequence)) @ prev) rest
| '@'::'S'::t ->
let width, rest = find_ats 0 t in
procss (rev (explode (string_of_int_width width startpage)) @ prev) rest
| '@'::'E'::t ->
let width, rest = find_ats 0 t in
procss (rev (explode (string_of_int_width width endpage)) @ prev) rest
| '@'::'B'::t -> procss (rev (explode (get_bookmark_name encoding pdf marks splitlevel startpage pdf)) @ prev) t
| h::t -> procss (h::prev) t
in
implode (procss [] (explode s))
let name_of_spec encoding marks (pdf : Pdf.t) splitlevel spec n filename startpage endpage =
let fill l n =
let chars = explode (string_of_int n) in
if length chars > l
then implode (drop chars (length chars - l))
else implode ((many '0' (l - length chars)) @ chars)
in
let chars = explode spec in
let before, including = cleavewhile (neq '%') chars in
let percents, after = cleavewhile (eq '%') including in
if percents = []
then
process_others encoding marks pdf splitlevel filename n startpage endpage spec
else
process_others encoding marks pdf splitlevel filename n startpage endpage
(implode before ^ fill (length percents) n ^ implode after)
(* Extract Images. *)
let pnm_to_channel_24 channel w h s =
let white () = output_char channel ' '
and newline () = output_char channel '\n'
and output_string = Pervasives.output_string channel in
output_string "P6";
white ();
output_string (string_of_int w);
white ();
output_string (string_of_int h);
white ();
output_string "255";
newline ();
let pos = ref 0 in
for y = 1 to h do
for x = 1 to w * 3 do
output_byte channel (bget s !pos);
incr pos
done
done
let write_stream name stream =
let fh = open_out_bin name in
for x = 0 to bytes_size stream - 1 do
output_byte fh (bget stream x)
done;
close_out fh
let write_image path_to_p2p path_to_im pdf resources name image =
match Pdfimage.get_image_24bpp pdf resources image with
| Pdfimage.JPEG (stream, _) -> write_stream (name ^ ".jpg") stream
| Pdfimage.JPEG2000 (stream, _) -> write_stream (name ^ ".jpx") stream
| Pdfimage.JBIG2 (stream, _) -> write_stream (name ^ ".jbig2") stream
| Pdfimage.Raw (w, h, Pdfimage.BPP24, stream) ->
let pnm = name ^ ".pnm" in
let png = name ^ ".png" in
let fh = open_out_bin pnm in
pnm_to_channel_24 fh w h stream;
close_out fh;
begin match path_to_p2p with
| "" ->
begin match path_to_im with
"" -> Printf.eprintf "Neither pnm2png nor imagemagick found. Specify with -p2p or -im\n%!"
| _ ->
begin match
Sys.command (Filename.quote_command path_to_im [pnm; png])
with
0 -> Sys.remove pnm
| _ ->
Printf.eprintf "Call to imagemagick failed: did you specify -p2p correctly?\n%!";
Sys.remove pnm
end
end
| _ ->
begin match
Sys.command (Filename.quote_command path_to_p2p ~stdout:png ["-gamma"; "0.45"; "-quiet"; pnm])
with
| 0 -> Sys.remove pnm
| _ ->
Printf.eprintf "Call to pnmtopng failed: did you specify -p2p correctly?\n%!";
Sys.remove pnm
end
end
| _ ->
Printf.eprintf "Unsupported image type when extracting image %s %!" name
let written = ref []
let extract_images_inner path_to_p2p path_to_im encoding serial pdf resources stem pnum images =
let names = map
(fun _ ->
name_of_spec
encoding [] pdf 0 (stem ^ "-p" ^ string_of_int pnum)
(let r = !serial in serial := !serial + 1; r) "" 0 0) (indx images)
in
iter2 (write_image path_to_p2p path_to_im pdf resources) names images
let rec extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum form =
let resources =
match Pdf.lookup_direct pdf "/Resources" form with
Some (Pdf.Dictionary d) -> Pdf.Dictionary d
| _ -> Pdf.Dictionary []
in
let images =
let xobjects =
match Pdf.lookup_direct pdf "/XObject" resources with
| Some (Pdf.Dictionary elts) -> map snd elts
| _ -> []
in
(* Remove any already in !written. Add any remaining to !written, if !args.dedup or !args.dedup_page *)
let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in
let already_written, images = List.partition (function Pdf.Indirect n -> mem n !written | _ -> false) images in
if dedup || dedup_per_page then
written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written;
images
in
extract_images_inner path_to_p2p path_to_im encoding serial pdf resources stem pnum images
let extract_images path_to_p2p path_to_im encoding dedup dedup_per_page pdf range stem =
if dedup || dedup_per_page then written := [];
let pdf_pages = Pdfpage.pages_of_pagetree pdf in
let pages =
option_map
(function (i, pdf_pages) -> if mem i range then Some pdf_pages else None)
(combine (indx pdf_pages) pdf_pages)
in
let serial = ref 0 in
iter2
(fun page pnum ->
if dedup_per_page then written := [];
let xobjects =
match Pdf.lookup_direct pdf "/XObject" page.Pdfpage.resources with
| Some (Pdf.Dictionary elts) -> map snd elts
| _ -> []
in
let images = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Image")) xobjects in
let already_written, images = List.partition (function Pdf.Indirect n -> mem n !written | _ -> false) images in
if dedup || dedup_per_page then
written := (option_map (function Pdf.Indirect n -> Some n | _ -> None) images) @ !written;
let forms = keep (fun o -> Pdf.lookup_direct pdf "/Subtype" o = Some (Pdf.Name "/Form")) xobjects in
extract_images_inner path_to_p2p path_to_im encoding serial pdf page.Pdfpage.resources stem pnum images;
iter (extract_images_form_xobject path_to_p2p path_to_im encoding dedup dedup_per_page pdf serial stem pnum) forms)
pages
(indx pages)
2021-11-15 20:17:15 +01:00