Obet -draw-struct-trees properly

This commit is contained in:
John Whitington 2024-09-11 17:05:20 +01:00
parent 9ed77b6950
commit eeb64d48df
1 changed files with 23 additions and 21 deletions

View File

@ -207,9 +207,9 @@ type structdata =
let structdata = ref [] let structdata = ref []
let rec ops_of_drawop dryrun pdf endpage filename bates batespad num page = function let rec ops_of_drawop struct_tree dryrun pdf endpage filename bates batespad num page = function
| Qq ops -> | Qq ops ->
[Pdfops.Op_q] @ ops_of_drawops dryrun pdf endpage filename bates batespad num page ops @ [Pdfops.Op_Q] [Pdfops.Op_q] @ ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page ops @ [Pdfops.Op_Q]
| Matrix m -> [Pdfops.Op_cm m] | Matrix m -> [Pdfops.Op_cm m]
| Rect (x, y, w, h) -> [Pdfops.Op_re (x, y, w, h)] | Rect (x, y, w, h) -> [Pdfops.Op_re (x, y, w, h)]
| Bezier (a, b, c, d, e, f) -> [Pdfops.Op_c (a, b, c, d, e, f)] | Bezier (a, b, c, d, e, f) -> [Pdfops.Op_c (a, b, c, d, e, f)]
@ -245,7 +245,7 @@ let rec ops_of_drawop dryrun pdf endpage filename bates batespad num page = func
| SetMiterLimit m -> [Pdfops.Op_M m] | SetMiterLimit m -> [Pdfops.Op_M m]
| SetDashPattern (x, y) -> [Pdfops.Op_d (x, y)] | SetDashPattern (x, y) -> [Pdfops.Op_d (x, y)]
| FormXObject (a, b, c, d, n, ops) -> | FormXObject (a, b, c, d, n, ops) ->
create_form_xobject dryrun a b c d pdf endpage filename bates batespad num page n ops; create_form_xobject struct_tree dryrun a b c d pdf endpage filename bates batespad num page n ops;
[] []
| Use n -> | Use n ->
let pdfname = try fst (Hashtbl.find (res ()).form_xobjects n) with _ -> error ("Form XObject not found: " ^ n) in let pdfname = try fst (Hashtbl.find (res ()).form_xobjects n) with _ -> error ("Form XObject not found: " ^ n) in
@ -256,7 +256,9 @@ let rec ops_of_drawop dryrun pdf endpage filename bates batespad num page = func
if not dryrun then structdata := StDataMCID ("/Figure", m, t)::!structdata; if not dryrun then structdata := StDataMCID ("/Figure", m, t)::!structdata;
let pdfname = try fst (Hashtbl.find (res ()).images s) with _ -> error ("Image not found: " ^ s) in let pdfname = try fst (Hashtbl.find (res ()).images s) with _ -> error ("Image not found: " ^ s) in
(res ()).page_names <- pdfname::(res ()).page_names; (res ()).page_names <- pdfname::(res ()).page_names;
[Pdfops.Op_BDC ("/Figure", Pdf.Dictionary ["/MCID", Pdf.Integer m]); Pdfops.Op_Do pdfname; Pdfops.Op_EMC] (if struct_tree then [Pdfops.Op_BDC ("/Figure", Pdf.Dictionary ["/MCID", Pdf.Integer m])] else [])
@ [Pdfops.Op_Do pdfname]
@ (if struct_tree then [Pdfops.Op_EMC] else [])
| ImageXObject (s, obj) -> | ImageXObject (s, obj) ->
Hashtbl.replace (res ()).images s (fresh_name "/I", Pdf.addobj pdf obj); Hashtbl.replace (res ()).images s (fresh_name "/I", Pdf.addobj pdf obj);
[] []
@ -313,11 +315,11 @@ let rec ops_of_drawop dryrun pdf endpage filename bates batespad num page = func
| TextSection ops -> | TextSection ops ->
let m = mcid () in let m = mcid () in
if not dryrun then structdata := StDataMCID ("/P", m, None)::!structdata; if not dryrun then structdata := StDataMCID ("/P", m, None)::!structdata;
[Pdfops.Op_BDC ("/P", Pdf.Dictionary ["/MCID", Pdf.Integer m]); (if struct_tree then [Pdfops.Op_BDC ("/P", Pdf.Dictionary ["/MCID", Pdf.Integer m])] else [])
Pdfops.Op_BT] @ [Pdfops.Op_BT]
@ ops_of_drawops dryrun pdf endpage filename bates batespad num page ops @ @ ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page ops
[Pdfops.Op_ET; @ [Pdfops.Op_ET]
Pdfops.Op_EMC] @ (if struct_tree then [Pdfops.Op_EMC] else [])
| Text s -> | Text s ->
if dryrun then iter (fun c -> Hashtbl.replace (res ()).current_fontpack_codepoints c ()) (Pdftext.codepoints_of_utf8 s); if dryrun then iter (fun c -> Hashtbl.replace (res ()).current_fontpack_codepoints c ()) (Pdftext.codepoints_of_utf8 s);
runs_of_utf8 s runs_of_utf8 s
@ -333,14 +335,14 @@ let rec ops_of_drawop dryrun pdf endpage filename bates batespad num page = func
| Rise f -> [Pdfops.Op_Ts f] | Rise f -> [Pdfops.Op_Ts f]
| Newline -> [Pdfops.Op_T'] | Newline -> [Pdfops.Op_T']
and ops_of_drawops dryrun pdf endpage filename bates batespad num page drawops = and ops_of_drawops draw_struct dryrun pdf endpage filename bates batespad num page drawops =
flatten (map (ops_of_drawop dryrun pdf endpage filename bates batespad num page) drawops) flatten (map (ops_of_drawop draw_struct dryrun pdf endpage filename bates batespad num page) drawops)
and create_form_xobject dryrun a b c d pdf endpage filename bates batespad num page n ops = and create_form_xobject struct_tree dryrun a b c d pdf endpage filename bates batespad num page n ops =
respush (); respush ();
reset_state (); reset_state ();
let data = let data =
Pdfio.bytes_of_string (Pdfops.string_of_ops (ops_of_drawops dryrun pdf endpage filename bates batespad num page ops)) Pdfio.bytes_of_string (Pdfops.string_of_ops (ops_of_drawops struct_tree dryrun pdf endpage filename bates batespad num page ops))
in in
let obj = let obj =
Pdf.Stream Pdf.Stream
@ -420,14 +422,14 @@ let add_artifacts ops =
in in
loop [] ops loop [] ops
let draw_single ~fast ~underneath ~filename ~bates ~batespad range pdf drawops = let draw_single ~struct_tree ~fast ~underneath ~filename ~bates ~batespad range pdf drawops =
(res ()).num <- max (res ()).num (minimum_resource_number pdf range); (res ()).num <- max (res ()).num (minimum_resource_number pdf range);
let endpage = Pdfpage.endpage pdf in let endpage = Pdfpage.endpage pdf in
let pages = Pdfpage.pages_of_pagetree pdf in let pages = Pdfpage.pages_of_pagetree pdf in
let ops = let ops =
if contains_specials drawops if contains_specials drawops
then None then None
else Some (ops_of_drawops false pdf endpage filename bates batespad 0 (hd pages) drawops) else Some (ops_of_drawops struct_tree false pdf endpage filename bates batespad 0 (hd pages) drawops)
in in
let ss = let ss =
map2 map2
@ -436,7 +438,7 @@ let draw_single ~fast ~underneath ~filename ~bates ~batespad range pdf drawops =
then then
(match ops with (match ops with
| Some x -> x | Some x -> x
| None -> ops_of_drawops false pdf endpage filename bates batespad n p drawops) | None -> ops_of_drawops struct_tree false pdf endpage filename bates batespad n p drawops)
else []) else [])
(ilist 1 endpage) (ilist 1 endpage)
pages pages
@ -445,7 +447,7 @@ let draw_single ~fast ~underneath ~filename ~bates ~batespad range pdf drawops =
map3 map3
(fun n p ops -> (fun n p ops ->
if not (mem n range) then p else if not (mem n range) then p else
let ops = add_artifacts ops in let ops = if struct_tree then add_artifacts ops else ops in
let page = {p with Pdfpage.resources = update_resources pdf p.Pdfpage.resources} in let page = {p with Pdfpage.resources = update_resources pdf p.Pdfpage.resources} in
(if underneath then Pdfpage.prepend_operators else Pdfpage.postpend_operators) pdf ops ~fast page) (if underneath then Pdfpage.prepend_operators else Pdfpage.postpend_operators) pdf ops ~fast page)
(ilist 1 endpage) (ilist 1 endpage)
@ -455,7 +457,7 @@ let draw_single ~fast ~underneath ~filename ~bates ~batespad range pdf drawops =
Pdfpage.change_pages true pdf pages Pdfpage.change_pages true pdf pages
(* Do a dry run of all the drawing to collect subset information. *) (* Do a dry run of all the drawing to collect subset information. *)
let dryrun ~filename ~bates ~batespad range pdf chunks = let dryrun ~struct_tree ~filename ~bates ~batespad range pdf chunks =
let endpage = Pdfpage.endpage pdf in let endpage = Pdfpage.endpage pdf in
let pages = Pdfpage.pages_of_pagetree pdf in let pages = Pdfpage.pages_of_pagetree pdf in
let r = save_whole_stack () in let r = save_whole_stack () in
@ -463,7 +465,7 @@ let dryrun ~filename ~bates ~batespad range pdf chunks =
let pagenum = ref (hd range) in let pagenum = ref (hd range) in
iter iter
(fun chunk -> (fun chunk ->
ignore (ops_of_drawops true pdf endpage filename bates batespad !pagenum (hd pages) chunk); ignore (ops_of_drawops struct_tree true pdf endpage filename bates batespad !pagenum (hd pages) chunk);
match range with match range with
| [x] when endpage > x -> pagenum := x + 1 | [x] when endpage > x -> pagenum := x + 1
| _ -> pagenum := endpage + 1) | _ -> pagenum := endpage + 1)
@ -554,14 +556,14 @@ let draw ~struct_tree ~fast ~underneath ~filename ~bates ~batespad range pdf dra
(* Double up a trailing NewPage so it actually does something... *) (* Double up a trailing NewPage so it actually does something... *)
let drawops = match rev drawops with NewPage::t -> rev (NewPage::NewPage::t) | _ -> drawops in let drawops = match rev drawops with NewPage::t -> rev (NewPage::NewPage::t) | _ -> drawops in
let chunks = ref (split_around (eq NewPage) drawops) in let chunks = ref (split_around (eq NewPage) drawops) in
dryrun ~filename ~bates ~batespad !range !pdf !chunks; dryrun ~struct_tree ~filename ~bates ~batespad !range !pdf !chunks;
mcpage := 0; mcpage := 0;
while !chunks <> [] do while !chunks <> [] do
mcidr := -1; mcidr := -1;
mcpage += 1; mcpage += 1;
structdata =| StDataPage !mcpage; structdata =| StDataPage !mcpage;
reset_state (); reset_state ();
if hd !chunks <> [] then pdf := draw_single ~fast ~underneath ~filename ~bates ~batespad !range !pdf (hd !chunks); if hd !chunks <> [] then pdf := draw_single ~struct_tree ~fast ~underneath ~filename ~bates ~batespad !range !pdf (hd !chunks);
chunks := tl !chunks; chunks := tl !chunks;
if !chunks <> [] then begin if !chunks <> [] then begin
(* If the range is just a single page, and there is a next page, move to it. Otherwise, (* If the range is just a single page, and there is a next page, move to it. Otherwise,