From 686318718df00cc58cc4bdf88d0f4814608a78c4 Mon Sep 17 00:00:00 2001
From: John Whitington
Date: Thu, 3 Oct 2024 16:18:09 +0100
Subject: [PATCH] More structure tree work

cpdftexttopdf.ml: call Cpdftype.typeset before the PDF/UA-2 structure
tree block so that the per-page tag lists are available there, print
debug output for the tags on each page, build (tag number, page number,
mcid) triples, add a still-empty pages_and_mcids placeholder, and set
each page's /StructParents to its zero-based page index instead of the
constant 1.

cpdftype.ml: number tags from 0 rather than 1, and clear the
accumulated tag list after it has been pushed onto tagsout.

---
 cpdftexttopdf.ml | 40 ++++++++++++++++++++++++++++++++--------
 cpdftype.ml      |  5 +++--
 2 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/cpdftexttopdf.ml b/cpdftexttopdf.ml
index 9922761..e6789bd 100644
--- a/cpdftexttopdf.ml
+++ b/cpdftexttopdf.ml
@@ -94,6 +94,30 @@ let typeset ~process_struct_tree ?subformat ?title ~papersize ~font ~fontsize te
       let firstfont = hd (keep (function Cpdftype.Font _ -> true | _ -> false) tagged) in
         [firstfont; Cpdftype.BeginDocument] @ tagged
   in
+  let pages, tags = Cpdftype.typeset ~process_struct_tree margin margin margin margin papersize pdf instrs in
+  iter (fun x -> Printf.printf "PAGE\n"; iter (fun (_, i) -> Printf.printf "Tag number %i\n" i) x) tags;
+  (* We make (tag number, page number, mcid) triples *)
+  let tagtriples =
+    flatten
+      (map2
+        (fun pn tags ->
+           map2 (fun (_, tagnum) mcid -> (tagnum, pn, mcid)) tags (indx0 tags))
+        (indx0 tags)
+        tags)
+  in
+  Printf.printf "(tag number, page number, mcid) triples:\n";
+  iter (fun (tagnum, pn, mcid) -> Printf.printf "%i, %i, %i\n" tagnum pn mcid) tagtriples;
+  (* Now work out the nodes and what each /K and /Pg in them is *)
+  let pages_and_mcids =
+    []
+  in
+  Printf.printf "Pages and their MCIDs\n";
+  iter
+    (fun (page, mcids) ->
+      Printf.printf "Page %i\n" page;
+      iter (Printf.printf "%i ") mcids;
+      Printf.printf "\n")
+    pages_and_mcids;
   if subformat = Some Cpdfua.PDFUA2 then
     begin
       let str = Pdf.addobj pdf Pdf.Null in
@@ -118,11 +142,11 @@ let typeset ~process_struct_tree ?subformat ?title ~papersize ~font ~fontsize te
       Pdf.addobj_given_num pdf (str, Pdf.Dictionary [("/Type", Pdf.Name "/StructTreeRoot"); ("/K", Pdf.Array [Pdf.Indirect p]); ("/ParentTree", Pdf.Indirect parent_tree)]);
       Pdf.replace_chain pdf ["/Root"] ("/StructTreeRoot", (Pdf.Indirect str))
     end;
-  let pages, tags = Cpdftype.typeset ~process_struct_tree margin margin margin margin papersize pdf instrs in
-  let pages =
-    map
-      (fun p -> if process_struct_tree then {p with Pdfpage.rest = Pdf.add_dict_entry p.Pdfpage.rest "/StructParents" (Pdf.Integer 1)} else p)
-      pages
-  in
-    let pdf, pageroot = Pdfpage.add_pagetree pages pdf in
-      Pdfpage.add_root pageroot [] pdf
+  let pages =
+    map2
+      (fun pn p -> if process_struct_tree then {p with Pdfpage.rest = Pdf.add_dict_entry p.Pdfpage.rest "/StructParents" (Pdf.Integer pn)} else p)
+      (indx0 pages)
+      pages
+  in
+    let pdf, pageroot = Pdfpage.add_pagetree pages pdf in
+      Pdfpage.add_root pageroot [] pdf
diff --git a/cpdftype.ml b/cpdftype.ml
index 4fc061a..552ff11 100644
--- a/cpdftype.ml
+++ b/cpdftype.ml
@@ -246,7 +246,7 @@ let typeset ~process_struct_tree lmargin rmargin tmargin bmargin papersize pdf i
   Hashtbl.clear width_table_cache;
   let debug = false in
   if debug then (print_endline "***input:\n\n"; print_endline (to_string i));
-  let i = number_tags 1 i in
+  let i = number_tags 0 i in
   let i = layout lmargin rmargin papersize i in
   if debug then (print_endline "***after layout:\n\n"; print_endline (to_string i));
   let i = paginate tmargin bmargin papersize i in
@@ -275,7 +275,8 @@ let typeset ~process_struct_tree lmargin rmargin tmargin bmargin papersize pdf i
        Pdfpage.rest = make_annotations pdf !thispageannotations}
     in
       pages := page::!pages;
-      tagsout := rev !tags::!tagsout
+      tagsout := rev !tags::!tagsout;
+      tags := []
   in
   let rec typeset_element = function
     | Text cps ->