Scaffolding for struct trees in typesetter

This commit is contained in:
John Whitington 2024-10-02 13:27:57 +01:00
parent a7d0fcad56
commit c827f7b410
7 changed files with 55 additions and 41 deletions

View File

@ -4549,7 +4549,8 @@ let go () =
| Some (Typeset filename) ->
let text = Pdfio.bytes_of_input_channel (open_in_bin filename) in
let cpdffont = embed_font () in
let pdf = Cpdftexttopdf.typeset ~subformat:args.subformat ~title:args.title ~font:cpdffont ~papersize:args.createpdf_pagesize ~fontsize:args.fontsize text in
let pdf = Cpdftexttopdf.typeset ~process_struct_tree:args.process_struct_trees
?subformat:args.subformat ?title:args.title ~font:cpdffont ~papersize:args.createpdf_pagesize ~fontsize:args.fontsize text in
write_pdf false pdf
| Some (TextWidth s) ->
let rawwidth =

View File

@ -559,39 +559,6 @@ let save_whole_stack () =
let restore_whole_stack r =
resstack := r
(* Mark as an artifact anything not already marked. *)
let add_artifacts ops =
let content = ref false in
let artifact = ref false in
let rec loop a = function
| [] ->
(* The end. Must end artifact if in artifact. *)
if !artifact then rev (Pdfops.Op_EMC::a) else rev a
| Pdfops.Op_BMC "/BeginArtifact"::t ->
(* Convert back-channel artifact beginning. *)
set artifact;
loop (Pdfops.Op_BMC "/Artifact"::a) t
| Pdfops.Op_BMC "/EndArtifact"::t ->
(* Convert back-channel artifact ending. *)
clear artifact;
loop (Pdfops.Op_EMC::a) t
| Pdfops.Op_BDC _ as h::t ->
(* Entering content. If in artifact, must end artifact. *)
let a' = if !artifact then h::Pdfops.Op_EMC::a else h::a in
set content; clear artifact; loop a' t
| Pdfops.Op_EMC as h::t ->
(* Exiting content. *)
clear content;
loop (h::a) t
| h::t ->
(* A normal operation. If not in content or artifact must start artifact. *)
let a' =
if not (!content || !artifact) then (set artifact; h::Pdfops.Op_BMC "/Artifact"::a) else h::a
in
loop a' t
in
loop [] ops
(* When no automatic artifacting, we still need to fix our backchannel manual artifacts. *)
let fixup_manual_artifacts =
map (function Pdfops.Op_BMC "/BeginArtifact" -> Pdfops.Op_BMC "/Artifact"
@ -623,7 +590,7 @@ let draw_single ~struct_tree ~fast ~underneath ~filename ~bates ~batespad range
map3
(fun n p ops ->
if not (mem n range) then p else
let ops = if struct_tree && !do_add_artifacts then add_artifacts ops else fixup_manual_artifacts ops in
let ops = if struct_tree && !do_add_artifacts then Cpdftype.add_artifacts ops else fixup_manual_artifacts ops in
let page = {p with Pdfpage.resources = update_resources pdf p.Pdfpage.resources} in
(if underneath then Pdfpage.prepend_operators else Pdfpage.postpend_operators) pdf ops ~fast page)
(ilist 1 endpage)

View File

@ -43,7 +43,10 @@ let of_utf8_with_newlines fontpack fontsize t =
if c <> [] then process_codepoints c;
rev !items
let typeset ?subformat ?title ~papersize ~font ~fontsize text =
let typeset ~process_struct_tree ?subformat ?title ~papersize ~font ~fontsize text =
let process_struct_tree =
process_struct_tree || subformat = Some Cpdfua.PDFUA1 || subformat = Some Cpdfua.PDFUA2
in
let pdf = Pdf.empty () in
let codepoints = setify (Pdftext.codepoints_of_utf8 (Pdfio.string_of_bytes text)) in
let fontpack =
@ -61,6 +64,6 @@ let typeset ?subformat ?title ~papersize ~font ~fontsize text =
let firstfont = hd (keep (function Cpdftype.Font _ -> true | _ -> false) instrs) in
[firstfont; Cpdftype.BeginDocument] @ instrs
in
let pages = Cpdftype.typeset margin margin margin margin papersize pdf instrs in
let pages = Cpdftype.typeset ~process_struct_tree margin margin margin margin papersize pdf instrs in
let pdf, pageroot = Pdfpage.add_pagetree pages pdf in
Pdfpage.add_root pageroot [] pdf

View File

@ -1,4 +1,4 @@
(** Text to PDF *)
(** Typeset a text file as a PDF. *)
val typeset : ?subformat:Cpdfua.subformat option -> ?title:string option -> papersize:Pdfpaper.t -> font:Cpdfembed.cpdffont -> fontsize:float -> Pdfio.bytes -> Pdf.t
val typeset : process_struct_tree:bool -> ?subformat:Cpdfua.subformat -> ?title:string -> papersize:Pdfpaper.t -> font:Cpdfembed.cpdffont -> fontsize:float -> Pdfio.bytes -> Pdf.t

View File

@ -175,7 +175,7 @@ let typeset_table_of_contents ~font ~fontsize ~title ~bookmark pdf =
let firstfont =
hd (keep (function Cpdftype.Font _ -> true | _ -> false) (title @ flatten lines))
in
Cpdftype.typeset lm rm tm bm firstpage_papersize pdf
Cpdftype.typeset ~process_struct_tree:false lm rm tm bm firstpage_papersize pdf
([firstfont; Cpdftype.BeginDocument] @ title @ flatten lines)
in
let toc_pages =

View File

@ -18,6 +18,8 @@ type element =
| BeginDest of Pdfdest.t
| EndDest
| BeginDocument
| Tag of string
| EndTag
let to_string_elt = function
| Text t -> implode t
@ -29,6 +31,8 @@ let to_string_elt = function
| BeginDest _ -> "BeginDest"
| EndDest -> "EndDest"
| BeginDocument -> "BeginDocument"
| Tag s -> "Tag " ^ s
| EndTag -> "EndTag"
let to_string es = fold_left (fun a b -> a ^ "\n" ^ b) "" (map to_string_elt es)
@ -54,6 +58,39 @@ let initial_state () =
ypos = 0.;
dest = None}
(* Mark as an artifact anything not already marked. *)
let add_artifacts ops =
let content = ref false in
let artifact = ref false in
let rec loop a = function
| [] ->
(* The end. Must end artifact if in artifact. *)
if !artifact then rev (Pdfops.Op_EMC::a) else rev a
| Pdfops.Op_BMC "/BeginArtifact"::t ->
(* Convert back-channel artifact beginning. *)
set artifact;
loop (Pdfops.Op_BMC "/Artifact"::a) t
| Pdfops.Op_BMC "/EndArtifact"::t ->
(* Convert back-channel artifact ending. *)
clear artifact;
loop (Pdfops.Op_EMC::a) t
| Pdfops.Op_BDC _ as h::t ->
(* Entering content. If in artifact, must end artifact. *)
let a' = if !artifact then h::Pdfops.Op_EMC::a else h::a in
set content; clear artifact; loop a' t
| Pdfops.Op_EMC as h::t ->
(* Exiting content. *)
clear content;
loop (h::a) t
| h::t ->
(* A normal operation. If not in content or artifact must start artifact. *)
let a' =
if not (!content || !artifact) then (set artifact; h::Pdfops.Op_BMC "/Artifact"::a) else h::a
in
loop a' t
in
loop [] ops
let font_widths id f fontsize =
match Hashtbl.find width_table_cache (id, fontsize) with
| x -> x
@ -189,7 +226,7 @@ let make_annotations pdf annots =
(* At this stage, just Font and Text and HGlue 0. and VGlue 0. and Newline and
NewPage elements. Split on NewPages, typeset each page, add font
dictionaries. New page only creates a page when that page has content. *)
let typeset lmargin rmargin tmargin bmargin papersize pdf i =
let typeset ~process_struct_tree lmargin rmargin tmargin bmargin papersize pdf i =
Hashtbl.clear width_table_cache;
let debug = false in
if debug then (print_endline "***input:\n\n"; print_endline (to_string i));
@ -286,3 +323,5 @@ let typeset lmargin rmargin tmargin bmargin papersize pdf i =
iter typeset_element i;
write_page ();
rev !pages

View File

@ -9,6 +9,8 @@ type element =
| BeginDest of Pdfdest.t
| EndDest
| BeginDocument
| Tag of string
| EndTag
type t = element list
@ -22,4 +24,6 @@ val font_widths : string -> Pdftext.font -> float -> float array
val width_of_string : float array -> char list -> float
(** [typeset lmargin rmargin tmargin bmargin papersize pdf contents] builds a list of pages of typset content. *)
val typeset : float -> float -> float -> float -> Pdfpaper.t -> Pdf.t -> t -> Pdfpage.t list
val typeset : process_struct_tree:bool -> float -> float -> float -> float -> Pdfpaper.t -> Pdf.t -> t -> Pdfpage.t list
val add_artifacts : Pdfops.t list -> Pdfops.t list