cpdf-source/cpdfdraw.ml

744 lines
28 KiB
OCaml
Raw Normal View History

2022-12-15 13:41:19 +01:00
open Pdfutil
2023-05-11 16:55:48 +02:00
open Cpdferror
2022-12-15 13:41:19 +01:00
2024-09-17 15:03:34 +02:00
let do_add_artifacts = ref true
2024-09-16 19:34:44 +02:00
let do_auto_tag = ref true
2022-12-22 17:20:00 +01:00
type colspec =
2022-12-15 13:41:19 +01:00
NoCol
| RGB of float * float * float
| Grey of float
| CYMK of float * float * float * float
2024-09-12 17:08:05 +02:00
type justification =
Left | Right | Centre
2022-12-15 13:41:19 +01:00
type drawops =
2022-12-21 17:09:04 +01:00
| Rect of float * float * float * float
| Bezier of float * float * float * float * float * float
2023-05-12 20:01:59 +02:00
| Bezier23 of float * float * float * float
| Bezier13 of float * float * float * float
2022-12-15 13:41:19 +01:00
| To of float * float
| Line of float * float
2022-12-16 14:13:55 +01:00
| ClosePath
2022-12-22 17:20:00 +01:00
| SetFill of colspec
| SetStroke of colspec
2022-12-15 15:20:41 +01:00
| SetLineThickness of float
| SetLineCap of int
| SetLineJoin of int
| SetMiterLimit of float
| SetDashPattern of float list * float
2022-12-16 13:13:38 +01:00
| Matrix of Pdftransform.transform_matrix
2023-05-09 15:30:30 +02:00
| Qq of drawops list
2022-12-16 14:13:55 +01:00
| Fill
| FillEvenOdd
| Stroke
| FillStroke
| FillStrokeEvenOdd
2022-12-21 17:40:13 +01:00
| Clip
| ClipEvenOdd
2023-05-04 14:53:49 +02:00
| FormXObject of float * float * float * float * string * drawops list
2023-05-03 15:19:55 +02:00
| Use of string
2022-12-22 21:42:55 +01:00
| ImageXObject of string * Pdf.pdfobject
2024-09-11 14:57:57 +02:00
| Image of string * string option
2023-04-27 20:14:58 +02:00
| NewPage
| Opacity of float
| SOpacity of float
| FontPack of string * Cpdfembed.cpdffont * (int, unit) Hashtbl.t
| Font of string * float
2023-05-09 15:30:30 +02:00
| TextSection of drawops list
2023-04-27 20:14:58 +02:00
| Text of string
2023-05-02 15:47:18 +02:00
| SpecialText of string
2024-09-19 15:41:51 +02:00
| Para of justification * float * string list
2023-04-28 20:03:10 +02:00
| Newline
2023-05-01 15:39:42 +02:00
| Leading of float
| CharSpace of float
| WordSpace of float
| TextScale of float
| RenderMode of int
| Rise of float
2024-09-16 19:34:44 +02:00
| Tag of string
| EndTag
| STag of string
| EndSTag
| BeginArtifact
| EndArtifact
2024-09-18 16:31:24 +02:00
| Namespace of string
2022-12-22 17:20:00 +01:00
2024-02-23 17:17:12 +01:00
(*let rec string_of_drawop = function
2023-05-11 16:31:10 +02:00
| Qq o -> "Qq (" ^ string_of_drawops o ^ ")"
| FormXObject (_, _, _, _, _, o) -> "FormXObject (" ^ string_of_drawops o ^ ")"
| TextSection o -> "TextSection (" ^ string_of_drawops o ^ ")"
2023-05-12 20:01:59 +02:00
| Rect _ -> "Rect" | Bezier _ -> "Bezier" | Bezier23 _ -> "Bezier23"
| Bezier13 _ -> "Bezier13" | To _ -> "To" | Line _ -> "Line"
2023-05-11 16:31:10 +02:00
| ClosePath -> "ClosePath" | SetFill _ -> "SetFill" | SetStroke _ -> "SetStroke"
| SetLineThickness _ -> "SetLineThickness" | SetLineCap _ -> "SetLineCap"
| SetLineJoin _ -> "SetLineJoin" | SetMiterLimit _ -> "SetMiterLimit"
| SetDashPattern _ -> "SetDashPattern" | Matrix _ -> "SetMatrix"
| Fill -> "Fill" | FillEvenOdd -> "FillEvenOdd" | Stroke -> "Stroke"
| FillStroke -> "FillStroke" | FillStrokeEvenOdd -> "FillStrokeEvenOdd"
| Clip -> "Clip" | ClipEvenOdd -> "ClipEvenOdd" | Use _ -> "Use"
| ImageXObject _ -> "ImageXObject" | Image _ -> "Image" | NewPage -> "NewPage"
2023-07-14 15:40:59 +02:00
| Opacity _ -> "Opacity" | SOpacity _ -> "SOpacity" | FontPack (n, _, _) -> "FontPack " ^ n ^ " "
| Font (f, _) -> "Font " ^ f ^ " " | Text _ -> "Text" | SpecialText _ -> "SpecialText"
| Newline -> "Newline" | Leading _ -> "Leading" | CharSpace _ -> "CharSpace"
| WordSpace _ -> "WordSpace" | TextScale _ -> "TextScale"
2023-05-11 16:31:10 +02:00
| RenderMode _ -> "RenderMode" | Rise _ -> "Rise"
and string_of_drawops l =
2024-02-23 17:17:12 +01:00
fold_left (fun x y -> x ^ " " ^ y) "" (map string_of_drawop l)*)
2023-05-11 16:31:10 +02:00
2023-05-12 23:54:08 +02:00
(* Per page / xobject resources *)
2023-05-04 16:01:12 +02:00
type res =
{images : (string, (string * int)) Hashtbl.t; (* (name, (pdf name, objnum)) *)
2023-05-05 17:17:35 +02:00
extgstates : ((string * float), string) Hashtbl.t; (* (kind, value), name *)
2023-07-20 15:48:49 +02:00
fonts : (string * int, (string * int)) Hashtbl.t; (* (font, (objnum, pdf name)) *)
2023-05-05 14:42:47 +02:00
form_xobjects : (string, (string * int)) Hashtbl.t; (* (name, (pdf name, objnum)) *)
2023-05-07 17:40:02 +02:00
mutable page_names : string list;
2023-05-04 16:01:12 +02:00
mutable time : Cpdfstrftime.t;
2023-07-17 14:38:35 +02:00
mutable current_fontpack : string * Cpdfembed.t;
2023-07-13 16:57:31 +02:00
mutable current_fontpack_codepoints : (int, unit) Hashtbl.t;
mutable font_size : float;
2023-05-04 16:01:12 +02:00
mutable num : int}
2023-05-03 16:49:14 +02:00
2023-07-13 16:57:31 +02:00
let default_fontpack =
Cpdfembed.fontpack_of_standardfont
(Pdftext.StandardFont (Pdftext.TimesRoman, Pdftext.WinAnsiEncoding))
2023-07-22 17:08:03 +02:00
let fontpacks = ref (null_hash ())
2023-05-10 18:03:53 +02:00
let empty_res () =
2023-05-04 16:01:12 +02:00
{images = null_hash ();
extgstates = null_hash ();
fonts = null_hash ();
form_xobjects = null_hash ();
2023-05-07 17:40:02 +02:00
page_names = [];
2023-05-04 16:01:12 +02:00
time = Cpdfstrftime.dummy;
2023-07-17 14:38:35 +02:00
current_fontpack = ("Times-Roman", default_fontpack);
2023-07-13 16:57:31 +02:00
current_fontpack_codepoints = null_hash ();
font_size = 12.;
2023-05-04 16:01:12 +02:00
num = 0}
2022-12-22 17:20:00 +01:00
2023-05-08 16:13:17 +02:00
let resstack =
2023-05-10 18:03:53 +02:00
ref [empty_res ()]
2023-05-08 16:13:17 +02:00
2023-05-08 17:58:19 +02:00
let rescopy r =
{r with
images = Hashtbl.copy r.images;
fonts = Hashtbl.copy r.fonts;
extgstates = Hashtbl.copy r.extgstates;
form_xobjects = Hashtbl.copy r.form_xobjects}
2023-07-13 17:22:50 +02:00
let res () =
try hd !resstack with _ -> error "graphics stack empty"
2023-05-08 16:48:18 +02:00
let respush () =
2023-05-08 17:58:19 +02:00
resstack := (rescopy (res ()))::!resstack
2023-05-08 16:48:18 +02:00
let respop () =
2023-05-11 16:55:48 +02:00
let n = (res ()).num in
2023-05-09 13:36:45 +02:00
resstack := tl !resstack;
(* not necessary, since names are isolated in the xobject, but it makes
manual debugging of PDF files easier if we don't re-use numbers *)
(res ()).num <- max n (res ()).num
2023-05-08 16:48:18 +02:00
2023-05-04 16:01:12 +02:00
let fresh_name s =
2023-05-08 16:13:17 +02:00
(res ()).num <- (res ()).num + 1;
s ^ string_of_int (res ()).num
2022-12-16 13:13:38 +01:00
2023-05-07 17:40:02 +02:00
(* At end of page, we keep things for which we have indirects - but ExtGStates
aren't indirect, so they go. *)
2023-05-03 14:43:57 +02:00
let reset_state () =
2023-07-22 17:08:03 +02:00
Hashtbl.clear (res ()).extgstates(*;
(res ()).page_names <- []*)
2023-05-01 17:53:28 +02:00
2023-05-01 20:00:28 +02:00
let process_specials pdf endpage filename bates batespad num page s =
let pairs =
Cpdfaddtext.replace_pairs pdf endpage None filename bates batespad num page
in
2023-05-08 16:13:17 +02:00
Cpdfaddtext.process_text (res ()).time s pairs
2023-05-01 20:00:28 +02:00
(* FIXME cache (just for paragraph) *)
let font_widths f fontsize =
match f with
| Pdftext.StandardFont (sf, encoding) ->
Array.init
256
(fun x ->
fontsize
*. float_of_int
(Pdfstandard14.textwidth false encoding sf (string_of_char (char_of_int x)))
/. 1000.)
| Pdftext.SimpleFont {fontmetrics = Some m} ->
Array.map (fun x -> fontsize *. x /. 1000. ) m
| _ -> raise (Pdf.PDFError "Cpdfdraw: Unsupported font")
let runs_of_utf8 s =
2023-07-17 14:38:35 +02:00
let identifier, fontpack = (res ()).current_fontpack in
let codepoints = Pdftext.codepoints_of_utf8 s in
let triples = option_map (Cpdfembed.get_char fontpack) codepoints in
let collated = Cpdfembed.collate_runs triples in
(* FIXME Efficiency: runs, cacheing *)
let w =
fold_left ( +. ) 0.
(map
(fun (charcode, _, font) ->
let widths = font_widths font (res ()).font_size in
widths.(charcode))
triples)
in
let output =
flatten
(map
(fun l ->
if l = [] then [] else
2023-07-20 15:48:49 +02:00
let f, n = match l with (_, n, f)::_ -> f, n | _ -> assert false in
let fontname = fst (Hashtbl.find (res ()).fonts (identifier, n)) in
let charcodes = map (fun (c, _, _) -> char_of_int c) l in
[Pdfops.Op_Tf (fontname, (res ()).font_size);
Pdfops.Op_Tj (implode charcodes)])
collated)
in
(output, w)
2023-05-05 17:17:35 +02:00
let extgstate kind v =
2023-05-08 16:13:17 +02:00
try Hashtbl.find (res ()).extgstates (kind, v) with
2023-05-05 17:17:35 +02:00
Not_found ->
2023-05-08 17:58:19 +02:00
let n = fresh_name "/G" in
2023-07-17 14:38:35 +02:00
Hashtbl.replace (res ()).extgstates (kind, v) n;
2023-05-05 17:17:35 +02:00
n
2023-05-08 17:29:03 +02:00
let read_resource pdf n res =
match Pdf.lookup_direct pdf n res with
| Some (Pdf.Dictionary d) -> d
| _ -> []
let update_resources pdf old_resources =
2023-05-12 17:24:36 +02:00
let gss_resources = map (fun ((kind, v), n) -> (n, Pdf.Dictionary [(kind, Pdf.Real v)])) (list_of_hashtbl (res ()).extgstates) in
2023-05-08 17:29:03 +02:00
let select_resources t =
option_map (fun (_, (n, o)) -> if mem n (res ()).page_names then Some (n, Pdf.Indirect o) else None) (list_of_hashtbl t)
in
let update = fold_right (fun (k, v) d -> add k v d) in
let new_gss = update gss_resources (read_resource pdf "/ExtGState" old_resources) in
let new_xobjects = update (select_resources (res ()).form_xobjects @ select_resources (res ()).images) (read_resource pdf "/XObject" old_resources) in
let new_fonts = update (select_resources (res ()).fonts) (read_resource pdf "/Font" old_resources) in
let add_if_non_empty dict name newdict =
if newdict = Pdf.Dictionary [] then dict else
Pdf.add_dict_entry dict name newdict
in
add_if_non_empty
(add_if_non_empty
(add_if_non_empty old_resources "/XObject" (Pdf.Dictionary new_xobjects))
"/ExtGState"
(Pdf.Dictionary new_gss))
"/Font"
(Pdf.Dictionary new_fonts)
2024-09-06 17:02:20 +02:00
let mcidr = ref ~-1
let mcid () = (incr mcidr; !mcidr)
let mcpage = ref ~-1
2024-09-06 17:02:20 +02:00
2024-09-18 16:31:24 +02:00
let standard_namespace = "http://iso.org/pdf/ssn"
let pdf2_namespace = "http://iso.org/pdf2/ssn"
2024-09-18 17:32:29 +02:00
(* namespace, object number pair. *)
let namespaces = null_hash ()
(* Add the object, add its number and this namespace to the hash. *)
let add_namespace pdf s =
if s = standard_namespace then () else
match Hashtbl.find_opt namespaces s with
| Some _ -> ()
| None ->
let objnum = Pdf.addobj pdf (Pdf.Dictionary [("/NS", Pdf.String s)]) in
Hashtbl.add namespaces s objnum
2024-09-18 16:31:24 +02:00
(* The structure data, as it is created, in flat form. Later on, this will be
reconstructed into a structure tree. *)
2024-09-06 17:02:20 +02:00
type structdata =
| StDataBeginTree of string
| StDataEndTree
2024-09-11 14:57:57 +02:00
| StDataMCID of string * int * string option
| StDataPage of int
2024-09-18 17:32:29 +02:00
| StDataNamespace of string
2024-09-06 17:02:20 +02:00
let structdata = ref []
(* TODO: Use Uuseg for proper unicode segmentation. *)
let format_paragraph j w s =
let ss = String.split_on_char ' ' s in
let rs_and_widths = ref (map runs_of_utf8 ss) in
let space_runs, space_width = runs_of_utf8 " " in
let remaining = ref w in
2024-09-15 16:33:00 +02:00
let allops = ref [] in
2024-09-13 18:08:06 +02:00
let ops = ref [] in
2024-09-15 16:33:00 +02:00
let justify ops =
match j with
| Left -> ops
| Right -> [Pdfops.Op_Td (~-.(!remaining), 0.)] @ ops @ [Pdfops.Op_Td (!remaining, 0.)]
| Centre -> [Pdfops.Op_Td (~-.(!remaining) /. 2., 0.)] @ ops @ [Pdfops.Op_Td (!remaining /. 2., 0.)]
in
while !rs_and_widths <> [] do
2024-09-13 18:08:06 +02:00
let word, word_width = hd !rs_and_widths in
if !remaining = w then
(* If current line empty, output word. *)
begin
ops := rev word @ !ops;
remaining := !remaining -. word_width;
rs_and_widths := tl !rs_and_widths
end
else if word_width +. space_width <= !remaining then
(* If current line not empty, and space for space char and word, emit them. *)
begin
ops := rev space_runs @ !ops;
ops := rev word @ !ops;
remaining := !remaining -. word_width -. space_width;
rs_and_widths := tl !rs_and_widths
end
else
(* If current line not empty, and not enough space, emit newline. *)
begin
2024-09-15 16:33:00 +02:00
allops =| rev (Pdfops.Op_T'::justify !ops);
ops := [];
2024-09-13 18:08:06 +02:00
remaining := w
end
done;
2024-09-15 16:33:00 +02:00
allops =| rev (Pdfops.Op_T'::justify !ops);
flatten (rev !allops)
2024-09-11 18:05:20 +02:00
let rec ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num page = function
| Qq ops ->
2024-09-11 18:05:20 +02:00
[Pdfops.Op_q] @ ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page ops @ [Pdfops.Op_Q]
2022-12-16 13:13:38 +01:00
| Matrix m -> [Pdfops.Op_cm m]
2022-12-15 13:41:19 +01:00
| Rect (x, y, w, h) -> [Pdfops.Op_re (x, y, w, h)]
2022-12-21 17:09:04 +01:00
| Bezier (a, b, c, d, e, f) -> [Pdfops.Op_c (a, b, c, d, e, f)]
2023-05-12 20:01:59 +02:00
| Bezier23 (a, b, c, d) -> [Pdfops.Op_v (a, b, c, d)]
| Bezier13 (a, b, c, d) -> [Pdfops.Op_y (a, b, c, d)]
2022-12-15 13:41:19 +01:00
| To (x, y) -> [Pdfops.Op_m (x, y)]
| Line (x, y) -> [Pdfops.Op_l (x, y)]
2022-12-16 14:13:55 +01:00
| SetFill x ->
2022-12-15 13:41:19 +01:00
begin match x with
| RGB (r, g, b) -> [Op_rg (r, g, b)]
| Grey g -> [Op_g g]
| CYMK (c, y, m, k) -> [Op_k (c, y, m, k)]
| NoCol -> []
end
2022-12-16 14:13:55 +01:00
| SetStroke x ->
2022-12-15 13:41:19 +01:00
begin match x with
| RGB (r, g, b) -> [Op_RG (r, g, b)]
| Grey g -> [Op_G g]
| CYMK (c, y, m, k) -> [Op_K (c, y, m, k)]
| NoCol -> []
end
2023-05-12 16:33:28 +02:00
| ClosePath -> [Pdfops.Op_h]
2022-12-16 14:13:55 +01:00
| Fill -> [Pdfops.Op_f]
| FillEvenOdd -> [Pdfops.Op_f']
| Stroke -> [Pdfops.Op_S]
| FillStroke -> [Pdfops.Op_B]
| FillStrokeEvenOdd -> [Pdfops.Op_B']
2022-12-21 17:40:13 +01:00
| Clip -> [Pdfops.Op_W; Pdfops.Op_n]
2023-05-12 20:36:53 +02:00
| ClipEvenOdd -> [Pdfops.Op_W'; Pdfops.Op_n]
2023-05-12 16:33:28 +02:00
| SetLineThickness t -> [Pdfops.Op_w t]
2022-12-16 14:13:55 +01:00
| SetLineCap c -> [Pdfops.Op_J c]
| SetLineJoin j -> [Pdfops.Op_j j]
| SetMiterLimit m -> [Pdfops.Op_M m]
| SetDashPattern (x, y) -> [Pdfops.Op_d (x, y)]
2023-05-11 16:31:10 +02:00
| FormXObject (a, b, c, d, n, ops) ->
2024-09-11 18:05:20 +02:00
create_form_xobject struct_tree dryrun a b c d pdf endpage filename bates batespad num page n ops;
2023-05-11 16:31:10 +02:00
[]
2023-05-07 17:40:02 +02:00
| Use n ->
2023-05-11 16:55:48 +02:00
let pdfname = try fst (Hashtbl.find (res ()).form_xobjects n) with _ -> error ("Form XObject not found: " ^ n) in
2023-05-08 16:13:17 +02:00
(res ()).page_names <- pdfname::(res ()).page_names;
2023-05-07 17:40:02 +02:00
[Pdfops.Op_Do pdfname]
2024-09-11 14:57:57 +02:00
| Image (s, t) ->
let m = mcid () in
if not dryrun then structdata := StDataMCID ("/Figure", m, t)::!structdata;
2023-05-11 16:55:48 +02:00
let pdfname = try fst (Hashtbl.find (res ()).images s) with _ -> error ("Image not found: " ^ s) in
2023-05-08 16:13:17 +02:00
(res ()).page_names <- pdfname::(res ()).page_names;
2024-09-16 19:34:44 +02:00
(if struct_tree && !do_auto_tag then [Pdfops.Op_BDC ("/Figure", Pdf.Dictionary ["/MCID", Pdf.Integer m])] else [])
2024-09-11 18:05:20 +02:00
@ [Pdfops.Op_Do pdfname]
2024-09-16 19:34:44 +02:00
@ (if struct_tree && !do_auto_tag then [Pdfops.Op_EMC] else [])
2022-12-22 21:42:55 +01:00
| ImageXObject (s, obj) ->
2023-07-17 14:38:35 +02:00
Hashtbl.replace (res ()).images s (fresh_name "/I", Pdf.addobj pdf obj);
2022-12-22 17:20:00 +01:00
[]
2023-04-27 20:14:58 +02:00
| NewPage -> Pdfe.log ("NewPage remaining in graphic stream"); assert false
2023-05-05 17:17:35 +02:00
| Opacity v -> [Pdfops.Op_gs (extgstate "/ca" v)]
| SOpacity v -> [Pdfops.Op_gs (extgstate "/CA" v)]
| FontPack (identifier, cpdffont, codepoints) ->
(*Printf.printf "FontPack op: |%s|\n%!" identifier;*)
2023-07-17 14:38:35 +02:00
let fontpack =
2023-07-22 17:08:03 +02:00
match Hashtbl.find !fontpacks identifier with
2023-07-17 14:38:35 +02:00
| (fontpack, _) ->
(*Printf.printf "Cpdfdraw FontPack op: using existing fontpack |%s|\n%!" identifier;*)
2023-07-17 14:38:35 +02:00
fontpack
| exception Not_found ->
(*Printf.printf "Cpdfdraw FontPack op: storing new fontpack |%s|\n%!" identifier;*)
2023-07-17 14:38:35 +02:00
let fontpack =
match cpdffont with
| PreMadeFontPack fp ->
(*Printf.printf "it's a pre-made font pack\n%!";*)
2023-07-17 14:38:35 +02:00
fp
| EmbedInfo {fontfile; fontname; encoding} ->
let codepoints = map fst (list_of_hashtbl codepoints) in
(*Printf.printf "%i codepoints to embed\n%!" (length codepoints);*)
2023-07-17 14:38:35 +02:00
if codepoints = [] then default_fontpack else
Cpdfembed.embed_truetype pdf ~fontfile ~fontname ~codepoints ~encoding
| ExistingNamedFont ->
error "-draw does not support using an existing named font"
in
2023-07-22 17:08:03 +02:00
Hashtbl.replace !fontpacks identifier (fontpack, codepoints);
2023-07-17 14:38:35 +02:00
fontpack
in
let ns =
2023-07-20 15:48:49 +02:00
map2
(fun font n ->
try fst (Hashtbl.find (res ()).fonts (identifier, n)) with
2023-07-17 14:38:35 +02:00
Not_found ->
let o = if dryrun then 0 else Pdftext.write_font pdf font in
2023-07-20 15:48:49 +02:00
let name = fresh_name "/F" in
(*Printf.printf "Adding font %s as %s\n%!" identifier name;*)
2023-07-20 15:48:49 +02:00
Hashtbl.replace (res ()).fonts (identifier, n) (name, o);
name)
2023-07-17 14:38:35 +02:00
(fst fontpack)
2023-07-20 15:48:49 +02:00
(indx0 (fst fontpack))
2023-07-17 14:38:35 +02:00
in
(res ()).page_names <- ns @ (res ()).page_names;
[]
| Font (identifier, size) ->
(*Printf.printf "Cpdfdraw Font op: Changing to stored font %s\n%!" identifier;*)
2023-07-22 17:08:03 +02:00
let fontpack, codepoints = Hashtbl.find !fontpacks identifier in
2023-07-17 14:38:35 +02:00
(res ()).current_fontpack <- (identifier, fontpack);
if dryrun then (res ()).current_fontpack_codepoints <- codepoints;
(res ()).font_size <- size;
[]
2024-09-06 17:02:20 +02:00
| TextSection ops ->
let m = mcid () in
2024-09-11 14:57:57 +02:00
if not dryrun then structdata := StDataMCID ("/P", m, None)::!structdata;
2024-09-16 19:34:44 +02:00
(if struct_tree && !do_auto_tag then [Pdfops.Op_BDC ("/P", Pdf.Dictionary ["/MCID", Pdf.Integer m])] else [])
2024-09-11 18:05:20 +02:00
@ [Pdfops.Op_BT]
@ ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page ops
@ [Pdfops.Op_ET]
2024-09-16 19:34:44 +02:00
@ (if struct_tree && !do_auto_tag then [Pdfops.Op_EMC] else [])
2023-07-13 16:57:31 +02:00
| Text s ->
if dryrun then iter (fun c -> Hashtbl.replace (res ()).current_fontpack_codepoints c ()) (Pdftext.codepoints_of_utf8 s);
fst (runs_of_utf8 s)
2023-07-13 16:57:31 +02:00
| SpecialText s ->
let s = process_specials pdf endpage filename bates batespad num page s in
if dryrun then iter (fun c -> Hashtbl.replace (res ()).current_fontpack_codepoints c ()) (Pdftext.codepoints_of_utf8 s);
fst (runs_of_utf8 s)
2024-09-12 17:08:05 +02:00
| Para (j, w, s) ->
2024-09-19 16:41:56 +02:00
if dryrun then iter (iter (fun c -> Hashtbl.replace (res ()).current_fontpack_codepoints c ())) (map Pdftext.codepoints_of_utf8 s);
let first = ref true in
flatten
(map
(function para ->
if not !first then ([Pdfops.Op_T']) else (clear first; []) @ format_paragraph j w para)
s)
2023-05-01 15:39:42 +02:00
| Leading f -> [Pdfops.Op_TL f]
| CharSpace f -> [Pdfops.Op_Tc f]
| WordSpace f -> [Pdfops.Op_Tw f]
| TextScale f -> [Pdfops.Op_Tz f]
| RenderMode i -> [Pdfops.Op_Tr i]
| Rise f -> [Pdfops.Op_Ts f]
| Newline -> [Pdfops.Op_T']
2024-09-16 19:34:44 +02:00
| Tag s ->
let m = mcid () in
if not dryrun then structdata := StDataMCID (s, m, None)::!structdata;
[Pdfops.Op_BDC (s, Pdf.Dictionary ["/MCID", Pdf.Integer m])]
| EndTag -> [Pdfops.Op_EMC]
2024-09-18 16:15:07 +02:00
| STag s -> if not dryrun then structdata =| StDataBeginTree s; []
| EndSTag -> if not dryrun then structdata =| StDataEndTree; []
2024-09-17 15:32:22 +02:00
| BeginArtifact -> [Pdfops.Op_BMC "/BeginArtifact"]
| EndArtifact -> [Pdfops.Op_BMC "/EndArtifact"]
2024-09-18 17:32:29 +02:00
| Namespace s ->
if not dryrun then
begin
add_namespace pdf s;
structdata =| StDataNamespace s
end;
[]
2024-09-12 17:20:38 +02:00
and ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page drawops =
flatten (map (ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num page) drawops)
2022-12-15 13:41:19 +01:00
2024-09-11 18:05:20 +02:00
and create_form_xobject struct_tree dryrun a b c d pdf endpage filename bates batespad num page n ops =
2023-05-08 16:48:18 +02:00
respush ();
2023-05-08 17:29:03 +02:00
reset_state ();
2023-05-03 20:01:25 +02:00
let data =
2024-09-11 18:05:20 +02:00
Pdfio.bytes_of_string (Pdfops.string_of_ops (ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page ops))
2023-05-03 20:01:25 +02:00
in
let obj =
Pdf.Stream
{contents =
(Pdf.Dictionary
[("/Length", Pdf.Integer (Pdfio.bytes_size data));
("/Subtype", Pdf.Name "/Form");
2023-05-08 17:29:03 +02:00
("/Resources", update_resources pdf (Pdf.Dictionary []));
2023-05-04 14:53:49 +02:00
("/BBox", Pdf.Array [Pdf.Real a; Pdf.Real b; Pdf.Real c; Pdf.Real d])
2023-05-03 20:01:25 +02:00
],
Pdf.Got data)}
in
2023-05-08 17:58:19 +02:00
respop ();
2023-07-17 14:38:35 +02:00
Hashtbl.replace (res ()).form_xobjects n (fresh_name "/X", (if dryrun then 0 else Pdf.addobj pdf obj))
2023-05-03 20:01:25 +02:00
2023-05-04 19:57:08 +02:00
let minimum_resource_number pdf range =
2023-05-05 15:46:51 +02:00
let pages = Pdfpage.pages_of_pagetree pdf in
let pages_in_range =
option_map2 (fun p n -> if mem n range then Some p else None) pages (indx pages) in
let number_of_name s =
match implode (rev (takewhile (function '0'..'9' -> true | _ -> false) (rev (explode s)))) with
| "" -> None
| s -> Some (int_of_string s)
in
let resource_names_page p =
let names n =
match Pdf.lookup_direct pdf n p.Pdfpage.resources with
| Some (Pdf.Dictionary d) -> map fst d
| _ -> []
in
names "/XObject" @ names "/ExtGState" @ names "/Font"
in
match
sort
(fun a b -> compare b a)
(option_map number_of_name (flatten (map resource_names_page pages_in_range)))
with
| [] -> 0
| n::_ -> n + 1
2023-05-04 19:57:08 +02:00
2023-05-12 15:04:14 +02:00
let rec contains_specials_drawop = function
| SpecialText _ -> true
| Qq l | TextSection l | FormXObject (_, _, _, _, _, l) -> contains_specials l
| _ -> false
and contains_specials l =
List.exists contains_specials_drawop l
2023-05-05 17:27:41 +02:00
2023-07-13 17:22:50 +02:00
let save_whole_stack () =
map (fun r -> rescopy r) !resstack
let restore_whole_stack r =
resstack := r
2024-09-11 16:15:35 +02:00
(* Mark as an artifact anything not already marked. *)
let add_artifacts ops =
let content = ref false in
let artifact = ref false in
let rec loop a = function
| [] ->
(* The end. Must end artifact if in artifact. *)
if !artifact then rev (Pdfops.Op_EMC::a) else rev a
2024-09-17 15:32:22 +02:00
| Pdfops.Op_BMC "/BeginArtifact"::t ->
(* Convert back-channel artifact beginning. *)
set artifact;
loop (Pdfops.Op_BMC "/Artifact"::a) t
| Pdfops.Op_BMC "/EndArtifact"::t ->
(* Convert back-channel artifact ending. *)
clear artifact;
loop (Pdfops.Op_EMC::a) t
2024-09-11 16:15:35 +02:00
| Pdfops.Op_BDC _ as h::t ->
(* Entering content. If in artifact, must end artifact. *)
let a' = if !artifact then h::Pdfops.Op_EMC::a else h::a in
set content; clear artifact; loop a' t
| Pdfops.Op_EMC as h::t ->
(* Exiting content. *)
clear content;
loop (h::a) t
| h::t ->
(* A normal operation. If not in content or artifact must start artifact. *)
let a' =
if not (!content || !artifact) then (set artifact; h::Pdfops.Op_BMC "/Artifact"::a) else h::a
in
loop a' t
in
loop [] ops
2024-09-11 18:05:20 +02:00
let draw_single ~struct_tree ~fast ~underneath ~filename ~bates ~batespad range pdf drawops =
2023-05-08 16:13:17 +02:00
(res ()).num <- max (res ()).num (minimum_resource_number pdf range);
2023-05-02 15:47:18 +02:00
let endpage = Pdfpage.endpage pdf in
let pages = Pdfpage.pages_of_pagetree pdf in
2023-05-11 20:18:14 +02:00
let ops =
2023-05-08 17:29:03 +02:00
if contains_specials drawops
then None
2024-09-11 18:05:20 +02:00
else Some (ops_of_drawops struct_tree false pdf endpage filename bates batespad 0 (hd pages) drawops)
2023-05-05 17:27:41 +02:00
in
2023-05-02 15:47:18 +02:00
let ss =
map2
2023-05-05 14:42:47 +02:00
(fun n p ->
2023-05-08 17:29:03 +02:00
if mem n range
2023-07-13 16:57:31 +02:00
then
(match ops with
| Some x -> x
2024-09-11 18:05:20 +02:00
| None -> ops_of_drawops struct_tree false pdf endpage filename bates batespad n p drawops)
2023-05-11 20:18:14 +02:00
else [])
2023-05-02 15:47:18 +02:00
(ilist 1 endpage)
pages
in
2023-05-04 16:51:03 +02:00
let pages =
2023-05-11 20:18:14 +02:00
map3
(fun n p ops ->
if not (mem n range) then p else
2024-09-16 17:44:57 +02:00
let ops = if struct_tree && !do_add_artifacts then add_artifacts ops else ops in
2023-05-11 20:18:14 +02:00
let page = {p with Pdfpage.resources = update_resources pdf p.Pdfpage.resources} in
(if underneath then Pdfpage.prepend_operators else Pdfpage.postpend_operators) pdf ops ~fast page)
2023-05-08 15:15:03 +02:00
(ilist 1 endpage)
2023-05-04 16:51:03 +02:00
(Pdfpage.pages_of_pagetree pdf)
2023-05-11 20:18:14 +02:00
ss
2023-05-04 16:51:03 +02:00
in
Pdfpage.change_pages true pdf pages
2023-05-03 14:43:57 +02:00
2023-10-23 17:53:27 +02:00
(* Do a dry run of all the drawing to collect subset information. *)
2024-09-11 18:05:20 +02:00
let dryrun ~struct_tree ~filename ~bates ~batespad range pdf chunks =
2023-10-23 17:53:27 +02:00
let endpage = Pdfpage.endpage pdf in
let pages = Pdfpage.pages_of_pagetree pdf in
let r = save_whole_stack () in
let saved_fontpacks = Hashtbl.copy !fontpacks in
let pagenum = ref (hd range) in
iter
(fun chunk ->
2024-09-11 18:05:20 +02:00
ignore (ops_of_drawops struct_tree true pdf endpage filename bates batespad !pagenum (hd pages) chunk);
2023-10-23 17:53:27 +02:00
match range with
| [x] when endpage > x -> pagenum := x + 1
| _ -> pagenum := endpage + 1)
chunks;
restore_whole_stack r;
fontpacks := saved_fontpacks
type st =
StMCID of int
2024-09-18 17:32:29 +02:00
| StItem of {kind : string; namespace : string; pageobjnum : int option; alt : string option; children : st list}
(* Build a tree from the MCIDs and structure tree instructions gathered *)
2024-09-17 17:09:29 +02:00
let rec find_tree_contents a level = function
| [] -> error "not enough -end-stag"
| StDataBeginTree _ as h::t ->
find_tree_contents (h::a) (level + 1) t
| StDataEndTree::t ->
if level = 1 then (rev a, t) else find_tree_contents a (level - 1) t
| h::t -> find_tree_contents (h::a) level t
2024-09-18 17:32:29 +02:00
let rec make_structure_tree pageobjnums pdf pagenum namespace = function
| [] -> []
| StDataMCID (n, mcid, alt)::t ->
2024-09-18 17:32:29 +02:00
StItem {kind = n; namespace = !namespace; alt; pageobjnum = lookup !pagenum pageobjnums; children = [StMCID mcid]}::make_structure_tree pageobjnums pdf pagenum namespace t
| StDataPage n::t ->
pagenum := n;
2024-09-18 17:32:29 +02:00
make_structure_tree pageobjnums pdf pagenum namespace t
| StDataNamespace s::t ->
namespace := s;
make_structure_tree pageobjnums pdf pagenum namespace t
| StDataBeginTree s::t ->
2024-09-17 17:09:29 +02:00
let tree_contents, rest = find_tree_contents [] 1 t in
2024-09-18 17:32:29 +02:00
[StItem {kind = s; namespace = !namespace; alt = None; pageobjnum = None; children = make_structure_tree pageobjnums pdf pagenum namespace rest}]
| StDataEndTree::t ->
2024-09-17 17:09:29 +02:00
error "Too many -end-tags"
let make_structure_tree pdf items =
let pageobjnums =
let objnums = Pdf.page_reference_numbers pdf in
combine (indx objnums) objnums
in
2024-09-18 17:32:29 +02:00
make_structure_tree pageobjnums pdf (ref 0) (ref standard_namespace) items
2024-09-18 15:04:50 +02:00
(* Write such a structure tree to a PDF. *)
let write_structure_tree pdf st =
2024-09-09 17:39:32 +02:00
let parentmap = ref [] in
2024-09-10 17:40:33 +02:00
let add_parentmap pon this_objnum =
match lookup pon !parentmap with
| None -> parentmap =| (pon, [this_objnum])
| Some objnums -> parentmap := add pon (this_objnum::objnums) !parentmap
in
2024-09-09 19:05:12 +02:00
let struct_tree_root = Pdf.addobj pdf Pdf.Null in
2024-09-18 15:04:50 +02:00
let rec mktree struct_tree_parent = function
2024-09-18 17:32:29 +02:00
| StItem {kind; namespace; pageobjnum; alt; children} ->
2024-09-18 15:04:50 +02:00
let this_objnum = Pdf.addobj pdf Pdf.Null in
begin match pageobjnum with
| Some p -> add_parentmap p this_objnum
| _ -> ()
end;
let alt =
match alt with
| Some s -> [("/Alt", Pdf.String s)]
| None -> []
in
let page =
match pageobjnum with
| Some i -> [("/Pg", Pdf.Indirect i)]
| None -> []
in
2024-09-18 17:32:29 +02:00
let namespace =
if namespace = standard_namespace then [] else
[("/NS", Pdf.Indirect (Hashtbl.find namespaces namespace))]
in
2024-09-18 15:04:50 +02:00
let this_obj =
Pdf.Dictionary
(alt
@ page
2024-09-18 17:32:29 +02:00
@ namespace
2024-09-18 15:04:50 +02:00
@ [("/S", Pdf.Name kind);
("/P", Pdf.Indirect struct_tree_parent);
("/K", Pdf.Array (map (mktree this_objnum) children))])
in
Pdf.addobj_given_num pdf (this_objnum, this_obj);
Pdf.Indirect this_objnum
| StMCID x ->
Pdf.Integer x
in
2024-09-18 15:04:50 +02:00
let items = map (mktree struct_tree_root) st in
2024-09-10 17:40:33 +02:00
iter
(fun (pon, _) ->
2024-09-10 17:41:18 +02:00
Pdf.addobj_given_num pdf (pon, Pdf.add_dict_entry (Pdf.lookup_obj pdf pon) "/StructParents" (Pdf.Integer pon)))
2024-09-10 17:40:33 +02:00
!parentmap;
let parentmap =
map (fun (pon, items) -> (string_of_int pon, Pdf.Array (map (fun x -> Pdf.Indirect x) (rev items)))) !parentmap
in
let st =
2024-09-18 17:32:29 +02:00
let namespaces =
match list_of_hashtbl namespaces with
| [] -> []
| ns -> [("/Namespaces", Pdf.Array (map (function (_, objnum) -> Pdf.Indirect objnum) ns))]
in
Pdf.Dictionary
(namespaces @
[("/Type", Pdf.Name "/StructTreeRoot");
("/ParentTree", Pdf.Indirect (Pdf.addobj pdf (Pdftree.build_name_tree true pdf parentmap)));
("/K", Pdf.Array items)])
in
2024-09-09 19:05:12 +02:00
Pdf.addobj_given_num pdf (struct_tree_root, st);
Pdf.replace_chain pdf ["/Root"] ("/StructTreeRoot", (Pdf.Indirect struct_tree_root))
2024-09-06 15:04:17 +02:00
let draw ~struct_tree ~fast ~underneath ~filename ~bates ~batespad range pdf drawops =
2023-10-23 17:53:27 +02:00
(*Printf.printf "%s\n" (string_of_drawops drawops);*)
2024-09-17 15:03:34 +02:00
if not struct_tree then clear do_add_artifacts;
2023-05-10 18:03:53 +02:00
resstack := [empty_res ()];
2023-07-22 17:08:03 +02:00
Hashtbl.clear !fontpacks;
2023-05-08 16:13:17 +02:00
(res ()).time <- Cpdfstrftime.current_time ();
2023-05-03 14:43:57 +02:00
let pdf = ref pdf in
let range = ref range in
2023-05-10 20:11:31 +02:00
(* Double up a trailing NewPage so it actually does something... *)
2023-05-11 15:39:37 +02:00
let drawops = match rev drawops with NewPage::t -> rev (NewPage::NewPage::t) | _ -> drawops in
2023-05-03 14:43:57 +02:00
let chunks = ref (split_around (eq NewPage) drawops) in
2024-09-11 18:05:20 +02:00
dryrun ~struct_tree ~filename ~bates ~batespad !range !pdf !chunks;
mcpage := 0;
2023-05-03 14:43:57 +02:00
while !chunks <> [] do
mcidr := -1;
mcpage += 1;
structdata =| StDataPage !mcpage;
2023-05-03 14:43:57 +02:00
reset_state ();
2024-09-11 18:05:20 +02:00
if hd !chunks <> [] then pdf := draw_single ~struct_tree ~fast ~underneath ~filename ~bates ~batespad !range !pdf (hd !chunks);
2023-05-03 14:43:57 +02:00
chunks := tl !chunks;
if !chunks <> [] then begin
(* If the range is just a single page, and there is a next page, move to it. Otherwise,
add a blank page at the end of the document. *)
2023-05-03 14:43:57 +02:00
let endpage = Pdfpage.endpage !pdf in
match !range with
| [x] when endpage > x -> range := [x + 1]
| _ ->
pdf := Cpdfpad.padafter [endpage] !pdf;
range := [endpage + 1]
2023-05-03 14:43:57 +02:00
end
done;
2024-09-09 19:05:12 +02:00
if struct_tree then write_structure_tree !pdf (make_structure_tree !pdf (rev !structdata));
2023-05-03 14:43:57 +02:00
!pdf