From c827f7b41041932c6ca253d53a0fa81c36392871 Mon Sep 17 00:00:00 2001 From: John Whitington Date: Wed, 2 Oct 2024 13:27:57 +0100 Subject: [PATCH] Scaffolding for struct trees in typesetter --- cpdfcommand.ml | 3 ++- cpdfdraw.ml | 35 +---------------------------------- cpdftexttopdf.ml | 7 +++++-- cpdftexttopdf.mli | 2 +- cpdftoc.ml | 2 +- cpdftype.ml | 41 ++++++++++++++++++++++++++++++++++++++++- cpdftype.mli | 6 +++++- 7 files changed, 55 insertions(+), 41 deletions(-) diff --git a/cpdfcommand.ml b/cpdfcommand.ml index 8ca4802..82b6956 100644 --- a/cpdfcommand.ml +++ b/cpdfcommand.ml @@ -4549,7 +4549,8 @@ let go () = | Some (Typeset filename) -> let text = Pdfio.bytes_of_input_channel (open_in_bin filename) in let cpdffont = embed_font () in - let pdf = Cpdftexttopdf.typeset ~subformat:args.subformat ~title:args.title ~font:cpdffont ~papersize:args.createpdf_pagesize ~fontsize:args.fontsize text in + let pdf = Cpdftexttopdf.typeset ~process_struct_tree:args.process_struct_trees + ?subformat:args.subformat ?title:args.title ~font:cpdffont ~papersize:args.createpdf_pagesize ~fontsize:args.fontsize text in write_pdf false pdf | Some (TextWidth s) -> let rawwidth = diff --git a/cpdfdraw.ml b/cpdfdraw.ml index b840a43..c85ffb4 100644 --- a/cpdfdraw.ml +++ b/cpdfdraw.ml @@ -559,39 +559,6 @@ let save_whole_stack () = let restore_whole_stack r = resstack := r -(* Mark as an artifact anything not already marked. *) -let add_artifacts ops = - let content = ref false in - let artifact = ref false in - let rec loop a = function - | [] -> - (* The end. Must end artifact if in artifact. *) - if !artifact then rev (Pdfops.Op_EMC::a) else rev a - | Pdfops.Op_BMC "/BeginArtifact"::t -> - (* Convert back-channel artifact beginning. *) - set artifact; - loop (Pdfops.Op_BMC "/Artifact"::a) t - | Pdfops.Op_BMC "/EndArtifact"::t -> - (* Convert back-channel artifact ending. *) - clear artifact; - loop (Pdfops.Op_EMC::a) t - | Pdfops.Op_BDC _ as h::t -> - (* Entering content. If in artifact, must end artifact. *) - let a' = if !artifact then h::Pdfops.Op_EMC::a else h::a in - set content; clear artifact; loop a' t - | Pdfops.Op_EMC as h::t -> - (* Exiting content. *) - clear content; - loop (h::a) t - | h::t -> - (* A normal operation. If not in content or artifact must start artifact. *) - let a' = - if not (!content || !artifact) then (set artifact; h::Pdfops.Op_BMC "/Artifact"::a) else h::a - in - loop a' t - in - loop [] ops - (* When no automatic artifacting, we still need to fix our backchannel manual artifacts. *) let fixup_manual_artifacts = map (function Pdfops.Op_BMC "/BeginArtifact" -> Pdfops.Op_BMC "/Artifact" @@ -623,7 +590,7 @@ let draw_single ~struct_tree ~fast ~underneath ~filename ~bates ~batespad range map3 (fun n p ops -> if not (mem n range) then p else - let ops = if struct_tree && !do_add_artifacts then add_artifacts ops else fixup_manual_artifacts ops in + let ops = if struct_tree && !do_add_artifacts then Cpdftype.add_artifacts ops else fixup_manual_artifacts ops in let page = {p with Pdfpage.resources = update_resources pdf p.Pdfpage.resources} in (if underneath then Pdfpage.prepend_operators else Pdfpage.postpend_operators) pdf ops ~fast page) (ilist 1 endpage) diff --git a/cpdftexttopdf.ml b/cpdftexttopdf.ml index ec9c8cf..7effe22 100644 --- a/cpdftexttopdf.ml +++ b/cpdftexttopdf.ml @@ -43,7 +43,10 @@ let of_utf8_with_newlines fontpack fontsize t = if c <> [] then process_codepoints c; rev !items -let typeset ?subformat ?title ~papersize ~font ~fontsize text = +let typeset ~process_struct_tree ?subformat ?title ~papersize ~font ~fontsize text = + let process_struct_tree = + process_struct_tree || subformat = Some Cpdfua.PDFUA1 || subformat = Some Cpdfua.PDFUA2 + in let pdf = Pdf.empty () in let codepoints = setify (Pdftext.codepoints_of_utf8 (Pdfio.string_of_bytes text)) in let fontpack = @@ -61,6 +64,6 @@ let typeset ?subformat ?title ~papersize ~font ~fontsize text = let firstfont = hd (keep (function Cpdftype.Font _ -> true | _ -> false) instrs) in [firstfont; Cpdftype.BeginDocument] @ instrs in - let pages = Cpdftype.typeset margin margin margin margin papersize pdf instrs in + let pages = Cpdftype.typeset ~process_struct_tree margin margin margin margin papersize pdf instrs in let pdf, pageroot = Pdfpage.add_pagetree pages pdf in Pdfpage.add_root pageroot [] pdf diff --git a/cpdftexttopdf.mli b/cpdftexttopdf.mli index 2902cd4..852f9b9 100644 --- a/cpdftexttopdf.mli +++ b/cpdftexttopdf.mli @@ -1,4 +1,4 @@ (** Text to PDF *) (** Typeset a text file as a PDF. *) -val typeset : ?subformat:Cpdfua.subformat option -> ?title:string option -> papersize:Pdfpaper.t -> font:Cpdfembed.cpdffont -> fontsize:float -> Pdfio.bytes -> Pdf.t +val typeset : process_struct_tree:bool -> ?subformat:Cpdfua.subformat -> ?title:string -> papersize:Pdfpaper.t -> font:Cpdfembed.cpdffont -> fontsize:float -> Pdfio.bytes -> Pdf.t diff --git a/cpdftoc.ml b/cpdftoc.ml index caf5a98..81d818f 100644 --- a/cpdftoc.ml +++ b/cpdftoc.ml @@ -175,7 +175,7 @@ let typeset_table_of_contents ~font ~fontsize ~title ~bookmark pdf = let firstfont = hd (keep (function Cpdftype.Font _ -> true | _ -> false) (title @ flatten lines)) in - Cpdftype.typeset lm rm tm bm firstpage_papersize pdf + Cpdftype.typeset ~process_struct_tree:false lm rm tm bm firstpage_papersize pdf ([firstfont; Cpdftype.BeginDocument] @ title @ flatten lines) in let toc_pages = diff --git a/cpdftype.ml b/cpdftype.ml index 8812a8b..ed8cdc6 100644 --- a/cpdftype.ml +++ b/cpdftype.ml @@ -18,6 +18,8 @@ type element = | BeginDest of Pdfdest.t | EndDest | BeginDocument +| Tag of string +| EndTag let to_string_elt = function | Text t -> implode t @@ -29,6 +31,8 @@ let to_string_elt = function | BeginDest _ -> "BeginDest" | EndDest -> "EndDest" | BeginDocument -> "BeginDocument" + | Tag s -> "Tag " ^ s + | EndTag -> "EndTag" let to_string es = fold_left (fun a b -> a ^ "\n" ^ b) "" (map to_string_elt es) @@ -54,6 +58,39 @@ let initial_state () = ypos = 0.; dest = None} +(* Mark as an artifact anything not already marked. *) +let add_artifacts ops = + let content = ref false in + let artifact = ref false in + let rec loop a = function + | [] -> + (* The end. Must end artifact if in artifact. *) + if !artifact then rev (Pdfops.Op_EMC::a) else rev a + | Pdfops.Op_BMC "/BeginArtifact"::t -> + (* Convert back-channel artifact beginning. *) + set artifact; + loop (Pdfops.Op_BMC "/Artifact"::a) t + | Pdfops.Op_BMC "/EndArtifact"::t -> + (* Convert back-channel artifact ending. *) + clear artifact; + loop (Pdfops.Op_EMC::a) t + | Pdfops.Op_BDC _ as h::t -> + (* Entering content. If in artifact, must end artifact. *) + let a' = if !artifact then h::Pdfops.Op_EMC::a else h::a in + set content; clear artifact; loop a' t + | Pdfops.Op_EMC as h::t -> + (* Exiting content. *) + clear content; + loop (h::a) t + | h::t -> + (* A normal operation. If not in content or artifact must start artifact. *) + let a' = + if not (!content || !artifact) then (set artifact; h::Pdfops.Op_BMC "/Artifact"::a) else h::a + in + loop a' t + in + loop [] ops + let font_widths id f fontsize = match Hashtbl.find width_table_cache (id, fontsize) with | x -> x @@ -189,7 +226,7 @@ let make_annotations pdf annots = (* At this stage, just Font and Text and HGlue 0. and VGlue 0. and Newline and NewPage elements. Split on NewPages, typeset each page, add font dictionaries. New page only creates a page when that page has content. *) -let typeset lmargin rmargin tmargin bmargin papersize pdf i = +let typeset ~process_struct_tree lmargin rmargin tmargin bmargin papersize pdf i = Hashtbl.clear width_table_cache; let debug = false in if debug then (print_endline "***input:\n\n"; print_endline (to_string i)); @@ -286,3 +323,5 @@ let typeset lmargin rmargin tmargin bmargin papersize pdf i = iter typeset_element i; write_page (); rev !pages + + diff --git a/cpdftype.mli b/cpdftype.mli index 069a130..1af6bb6 100644 --- a/cpdftype.mli +++ b/cpdftype.mli @@ -9,6 +9,8 @@ type element = | BeginDest of Pdfdest.t | EndDest | BeginDocument +| Tag of string +| EndTag type t = element list @@ -22,4 +24,6 @@ val font_widths : string -> Pdftext.font -> float -> float array val width_of_string : float array -> char list -> float (** [typeset lmargin rmargin tmargin bmargin papersize pdf contents] builds a list of pages of typset content. *) -val typeset : float -> float -> float -> float -> Pdfpaper.t -> Pdf.t -> t -> Pdfpage.t list +val typeset : process_struct_tree:bool -> float -> float -> float -> float -> Pdfpaper.t -> Pdf.t -> t -> Pdfpage.t list + +val add_artifacts : Pdfops.t list -> Pdfops.t list