Scaffolding for struct trees in typesetter

This commit is contained in:
John Whitington 2024-10-02 13:27:57 +01:00
parent a7d0fcad56
commit c827f7b410
7 changed files with 55 additions and 41 deletions

View File

@ -4549,7 +4549,8 @@ let go () =
| Some (Typeset filename) -> | Some (Typeset filename) ->
let text = Pdfio.bytes_of_input_channel (open_in_bin filename) in let text = Pdfio.bytes_of_input_channel (open_in_bin filename) in
let cpdffont = embed_font () in let cpdffont = embed_font () in
let pdf = Cpdftexttopdf.typeset ~subformat:args.subformat ~title:args.title ~font:cpdffont ~papersize:args.createpdf_pagesize ~fontsize:args.fontsize text in let pdf = Cpdftexttopdf.typeset ~process_struct_tree:args.process_struct_trees
?subformat:args.subformat ?title:args.title ~font:cpdffont ~papersize:args.createpdf_pagesize ~fontsize:args.fontsize text in
write_pdf false pdf write_pdf false pdf
| Some (TextWidth s) -> | Some (TextWidth s) ->
let rawwidth = let rawwidth =

View File

@ -559,39 +559,6 @@ let save_whole_stack () =
let restore_whole_stack r = let restore_whole_stack r =
resstack := r resstack := r
(* Mark as an artifact anything not already marked. *)
let add_artifacts ops =
let content = ref false in
let artifact = ref false in
let rec loop a = function
| [] ->
(* The end. Must end artifact if in artifact. *)
if !artifact then rev (Pdfops.Op_EMC::a) else rev a
| Pdfops.Op_BMC "/BeginArtifact"::t ->
(* Convert back-channel artifact beginning. *)
set artifact;
loop (Pdfops.Op_BMC "/Artifact"::a) t
| Pdfops.Op_BMC "/EndArtifact"::t ->
(* Convert back-channel artifact ending. *)
clear artifact;
loop (Pdfops.Op_EMC::a) t
| Pdfops.Op_BDC _ as h::t ->
(* Entering content. If in artifact, must end artifact. *)
let a' = if !artifact then h::Pdfops.Op_EMC::a else h::a in
set content; clear artifact; loop a' t
| Pdfops.Op_EMC as h::t ->
(* Exiting content. *)
clear content;
loop (h::a) t
| h::t ->
(* A normal operation. If not in content or artifact must start artifact. *)
let a' =
if not (!content || !artifact) then (set artifact; h::Pdfops.Op_BMC "/Artifact"::a) else h::a
in
loop a' t
in
loop [] ops
(* When no automatic artifacting, we still need to fix our backchannel manual artifacts. *) (* When no automatic artifacting, we still need to fix our backchannel manual artifacts. *)
let fixup_manual_artifacts = let fixup_manual_artifacts =
map (function Pdfops.Op_BMC "/BeginArtifact" -> Pdfops.Op_BMC "/Artifact" map (function Pdfops.Op_BMC "/BeginArtifact" -> Pdfops.Op_BMC "/Artifact"
@ -623,7 +590,7 @@ let draw_single ~struct_tree ~fast ~underneath ~filename ~bates ~batespad range
map3 map3
(fun n p ops -> (fun n p ops ->
if not (mem n range) then p else if not (mem n range) then p else
let ops = if struct_tree && !do_add_artifacts then add_artifacts ops else fixup_manual_artifacts ops in let ops = if struct_tree && !do_add_artifacts then Cpdftype.add_artifacts ops else fixup_manual_artifacts ops in
let page = {p with Pdfpage.resources = update_resources pdf p.Pdfpage.resources} in let page = {p with Pdfpage.resources = update_resources pdf p.Pdfpage.resources} in
(if underneath then Pdfpage.prepend_operators else Pdfpage.postpend_operators) pdf ops ~fast page) (if underneath then Pdfpage.prepend_operators else Pdfpage.postpend_operators) pdf ops ~fast page)
(ilist 1 endpage) (ilist 1 endpage)

View File

@ -43,7 +43,10 @@ let of_utf8_with_newlines fontpack fontsize t =
if c <> [] then process_codepoints c; if c <> [] then process_codepoints c;
rev !items rev !items
let typeset ?subformat ?title ~papersize ~font ~fontsize text = let typeset ~process_struct_tree ?subformat ?title ~papersize ~font ~fontsize text =
let process_struct_tree =
process_struct_tree || subformat = Some Cpdfua.PDFUA1 || subformat = Some Cpdfua.PDFUA2
in
let pdf = Pdf.empty () in let pdf = Pdf.empty () in
let codepoints = setify (Pdftext.codepoints_of_utf8 (Pdfio.string_of_bytes text)) in let codepoints = setify (Pdftext.codepoints_of_utf8 (Pdfio.string_of_bytes text)) in
let fontpack = let fontpack =
@ -61,6 +64,6 @@ let typeset ?subformat ?title ~papersize ~font ~fontsize text =
let firstfont = hd (keep (function Cpdftype.Font _ -> true | _ -> false) instrs) in let firstfont = hd (keep (function Cpdftype.Font _ -> true | _ -> false) instrs) in
[firstfont; Cpdftype.BeginDocument] @ instrs [firstfont; Cpdftype.BeginDocument] @ instrs
in in
let pages = Cpdftype.typeset margin margin margin margin papersize pdf instrs in let pages = Cpdftype.typeset ~process_struct_tree margin margin margin margin papersize pdf instrs in
let pdf, pageroot = Pdfpage.add_pagetree pages pdf in let pdf, pageroot = Pdfpage.add_pagetree pages pdf in
Pdfpage.add_root pageroot [] pdf Pdfpage.add_root pageroot [] pdf

View File

@ -1,4 +1,4 @@
(** Text to PDF *) (** Text to PDF *)
(** Typeset a text file as a PDF. *) (** Typeset a text file as a PDF. *)
val typeset : ?subformat:Cpdfua.subformat option -> ?title:string option -> papersize:Pdfpaper.t -> font:Cpdfembed.cpdffont -> fontsize:float -> Pdfio.bytes -> Pdf.t val typeset : process_struct_tree:bool -> ?subformat:Cpdfua.subformat -> ?title:string -> papersize:Pdfpaper.t -> font:Cpdfembed.cpdffont -> fontsize:float -> Pdfio.bytes -> Pdf.t

View File

@ -175,7 +175,7 @@ let typeset_table_of_contents ~font ~fontsize ~title ~bookmark pdf =
let firstfont = let firstfont =
hd (keep (function Cpdftype.Font _ -> true | _ -> false) (title @ flatten lines)) hd (keep (function Cpdftype.Font _ -> true | _ -> false) (title @ flatten lines))
in in
Cpdftype.typeset lm rm tm bm firstpage_papersize pdf Cpdftype.typeset ~process_struct_tree:false lm rm tm bm firstpage_papersize pdf
([firstfont; Cpdftype.BeginDocument] @ title @ flatten lines) ([firstfont; Cpdftype.BeginDocument] @ title @ flatten lines)
in in
let toc_pages = let toc_pages =

View File

@ -18,6 +18,8 @@ type element =
| BeginDest of Pdfdest.t | BeginDest of Pdfdest.t
| EndDest | EndDest
| BeginDocument | BeginDocument
| Tag of string
| EndTag
let to_string_elt = function let to_string_elt = function
| Text t -> implode t | Text t -> implode t
@ -29,6 +31,8 @@ let to_string_elt = function
| BeginDest _ -> "BeginDest" | BeginDest _ -> "BeginDest"
| EndDest -> "EndDest" | EndDest -> "EndDest"
| BeginDocument -> "BeginDocument" | BeginDocument -> "BeginDocument"
| Tag s -> "Tag " ^ s
| EndTag -> "EndTag"
let to_string es = fold_left (fun a b -> a ^ "\n" ^ b) "" (map to_string_elt es) let to_string es = fold_left (fun a b -> a ^ "\n" ^ b) "" (map to_string_elt es)
@ -54,6 +58,39 @@ let initial_state () =
ypos = 0.; ypos = 0.;
dest = None} dest = None}
(* Mark as an artifact anything not already marked. *)
let add_artifacts ops =
let content = ref false in
let artifact = ref false in
let rec loop a = function
| [] ->
(* The end. Must end artifact if in artifact. *)
if !artifact then rev (Pdfops.Op_EMC::a) else rev a
| Pdfops.Op_BMC "/BeginArtifact"::t ->
(* Convert back-channel artifact beginning. *)
set artifact;
loop (Pdfops.Op_BMC "/Artifact"::a) t
| Pdfops.Op_BMC "/EndArtifact"::t ->
(* Convert back-channel artifact ending. *)
clear artifact;
loop (Pdfops.Op_EMC::a) t
| Pdfops.Op_BDC _ as h::t ->
(* Entering content. If in artifact, must end artifact. *)
let a' = if !artifact then h::Pdfops.Op_EMC::a else h::a in
set content; clear artifact; loop a' t
| Pdfops.Op_EMC as h::t ->
(* Exiting content. *)
clear content;
loop (h::a) t
| h::t ->
(* A normal operation. If not in content or artifact must start artifact. *)
let a' =
if not (!content || !artifact) then (set artifact; h::Pdfops.Op_BMC "/Artifact"::a) else h::a
in
loop a' t
in
loop [] ops
let font_widths id f fontsize = let font_widths id f fontsize =
match Hashtbl.find width_table_cache (id, fontsize) with match Hashtbl.find width_table_cache (id, fontsize) with
| x -> x | x -> x
@ -189,7 +226,7 @@ let make_annotations pdf annots =
(* At this stage, just Font and Text and HGlue 0. and VGlue 0. and Newline and (* At this stage, just Font and Text and HGlue 0. and VGlue 0. and Newline and
NewPage elements. Split on NewPages, typeset each page, add font NewPage elements. Split on NewPages, typeset each page, add font
dictionaries. New page only creates a page when that page has content. *) dictionaries. New page only creates a page when that page has content. *)
let typeset lmargin rmargin tmargin bmargin papersize pdf i = let typeset ~process_struct_tree lmargin rmargin tmargin bmargin papersize pdf i =
Hashtbl.clear width_table_cache; Hashtbl.clear width_table_cache;
let debug = false in let debug = false in
if debug then (print_endline "***input:\n\n"; print_endline (to_string i)); if debug then (print_endline "***input:\n\n"; print_endline (to_string i));
@ -286,3 +323,5 @@ let typeset lmargin rmargin tmargin bmargin papersize pdf i =
iter typeset_element i; iter typeset_element i;
write_page (); write_page ();
rev !pages rev !pages

View File

@ -9,6 +9,8 @@ type element =
| BeginDest of Pdfdest.t | BeginDest of Pdfdest.t
| EndDest | EndDest
| BeginDocument | BeginDocument
| Tag of string
| EndTag
type t = element list type t = element list
@ -22,4 +24,6 @@ val font_widths : string -> Pdftext.font -> float -> float array
val width_of_string : float array -> char list -> float val width_of_string : float array -> char list -> float
(** [typeset lmargin rmargin tmargin bmargin papersize pdf contents] builds a list of pages of typset content. *) (** [typeset lmargin rmargin tmargin bmargin papersize pdf contents] builds a list of pages of typset content. *)
val typeset : float -> float -> float -> float -> Pdfpaper.t -> Pdf.t -> t -> Pdfpage.t list val typeset : process_struct_tree:bool -> float -> float -> float -> float -> Pdfpaper.t -> Pdf.t -> t -> Pdfpage.t list
val add_artifacts : Pdfops.t list -> Pdfops.t list