2021-12-07 00:55:46 +01:00
|
|
|
open Pdfutil
|
2024-10-02 15:20:51 +02:00
|
|
|
open Cpdferror
|
2021-12-07 00:55:46 +01:00
|
|
|
|
2024-02-27 16:57:31 +01:00
|
|
|
(* Convert the UTF-8 string [t] into a list of [Cpdftype] instructions.

   The text is decoded to codepoints and split into lines at codepoint 10
   ('\n'); codepoint 13 ('\r') is dropped. Each non-empty line is looked up
   glyph by glyph with [Cpdfembed.get_char fontpack]. Whenever the font number
   returned differs from the one currently selected, a [Cpdftype.Font]
   instruction (carrying [fontsize]) is emitted before the character. Runs of
   characters become [Cpdftype.Text] instructions and every newline becomes a
   [Cpdftype.NewLine]. Codepoints for which [get_char] returns [None] are
   reported on stdout and skipped. *)
let of_utf8_with_newlines fontpack fontsize t =
  let codepoints = Pdftext.codepoints_of_utf8 t in
  (* Instructions are accumulated in reverse and flipped once at the end. *)
  let out = ref [] in
  let emit instr = out := instr::!out in
  (* Number of the font selected most recently; -1 means none yet. *)
  let active_font = ref ~-1 in
  (* Characters of the text run being built, most recent first. *)
  let pending = ref [] in
  let process_codepoints line =
    iter
      (fun u ->
         match Cpdfembed.get_char fontpack u with
         | None -> Printf.printf "No glyph for unicode U+%04X in this font\n" u
         | Some (c, n, f) ->
             if n <> !active_font then
               begin
                 (* Font change: close the current run, then switch font. *)
                 if !pending <> [] then emit (Cpdftype.Text (rev !pending));
                 pending := [];
                 active_font := n;
                 emit (Cpdftype.Font (string_of_int n, f, fontsize))
               end;
             pending := char_of_int c::!pending)
      line;
    (* The run still open at the end of the line is always emitted, even if
       it is empty. [pending] is cleared by the newline handler, not here. *)
    emit (Cpdftype.Text (rev !pending))
  in
  (* Typeset the codepoints buffered for one line (held in reverse order);
     an empty buffer produces nothing. *)
  let flush_line reversed =
    match rev reversed with
    | [] -> ()
    | line -> process_codepoints line
  in
  let buf = ref [] in
  List.iter
    (fun codepoint ->
       match codepoint with
       | 10 (*'\n'*) ->
           flush_line !buf;
           emit Cpdftype.NewLine;
           pending := [];
           buf := []
       | 13 (*'\r'*) -> ()
       | other -> buf := other::!buf)
    codepoints;
  (* The last line may not be newline-terminated. *)
  flush_line !buf;
  rev !out
|
2021-12-07 00:55:46 +01:00
|
|
|
|
2024-10-02 15:45:40 +02:00
|
|
|
(* Post process, adding Tag / EndTag around paragraphs.

   Every pair of consecutive [Cpdftype.NewLine] instructions is treated as a
   paragraph break: it is rewritten to
   [EndTag; NewLine; NewLine; Tag ("P", 0)]. All other instructions are copied
   through unchanged, and a final [Cpdftype.EndTag] is appended to close the
   last paragraph. The opening [Tag] of the first paragraph is NOT added here.

   The walk uses an accumulator so that stack use does not grow with the
   length of the instruction list. *)
let tag_paragraphs instrs =
  let rec go acc = function
    | Cpdftype.NewLine::Cpdftype.NewLine::t ->
        (* [acc] is in reverse order, so the four instructions are pushed in
           the opposite order to how they appear in the result. *)
        go (Cpdftype.Tag ("P", 0)::Cpdftype.NewLine::Cpdftype.NewLine::Cpdftype.EndTag::acc) t
    | x::t -> go (x::acc) t
    | [] -> rev (Cpdftype.EndTag::acc)
  in
    go [] instrs
|
|
|
|
|
|
|
|
(* Entry point for paragraph tagging. This definition is not recursive: the
   [tag_paragraphs] called in its body is the earlier definition, which it
   shadows. It prepends the opening [Tag ("P", 0)] of the first paragraph to
   that function's result. *)
let tag_paragraphs instrs =
  let opening = Cpdftype.Tag ("P", 0) in
    opening::tag_paragraphs instrs
|
2024-10-02 15:45:40 +02:00
|
|
|
|
2024-10-02 14:27:57 +02:00
|
|
|
(* [typeset ~process_struct_tree ?subformat ?title ~papersize ~font ~fontsize text]
   typesets the UTF-8 bytes [text] into a new PDF document.

   - [process_struct_tree]: request a structure tree. It is forced to [true]
     whenever [subformat] is [Cpdfua.PDFUA1] or [Cpdfua.PDFUA2].
   - [subformat]: when absent, the document starts from [Pdf.empty ()];
     otherwise it starts from [Cpdfua.create_pdfua1] / [Cpdfua.create_pdfua2],
     which need a title.
   - [title]: required when [subformat] is given.
   - [papersize], [fontsize]: passed on to the typesetter.
   - [font]: a pre-made font pack, or the information needed to embed a
     TrueType font covering the codepoints found in [text].

   Calls [error "no -title given"] when [subformat] is given without [title].
   Raises [Pdf.PDFError] when [font] is [Cpdfembed.ExistingNamedFont].

   The result is whatever [Pdfpage.add_root] returns for the finished page
   tree.

   NOTE(review): the [Printf.printf] calls below write tag / MCID diagnostics
   to stdout on every call. They look like temporary debugging output and are
   kept as found. Confirm whether they should stay. *)
let typeset ~process_struct_tree ?subformat ?title ~papersize ~font ~fontsize text =
  let process_struct_tree =
    match process_struct_tree, subformat with
    | _, (Some Cpdfua.PDFUA1 | Some Cpdfua.PDFUA2) | true, _ -> true
    | _ -> false
  in
  let pdf, title =
    match subformat with
    | None -> Pdf.empty (), begin match title with Some x -> x | None -> "" end
    | Some Cpdfua.PDFUA1 ->
        begin match title with
        | None -> error "no -title given"
        | Some title -> Cpdfua.create_pdfua1 title papersize 1, title
        end
    | Some Cpdfua.PDFUA2 ->
        begin match title with
        | None -> error "no -title given"
        | Some title -> Cpdfua.create_pdfua2 title papersize 1, title
        end
  in
  (* Decode the input once; it is needed both for the set of codepoints to
     embed and for the typesetting instructions. *)
  let utf8 = Pdfio.string_of_bytes text in
  let codepoints = setify (Pdftext.codepoints_of_utf8 utf8) in
  let fontpack =
    match font with
    | Cpdfembed.PreMadeFontPack t -> t
    | Cpdfembed.EmbedInfo {fontfile; fontname; encoding} ->
        Cpdfembed.embed_truetype pdf ~fontfile ~fontname ~codepoints ~encoding
    | Cpdfembed.ExistingNamedFont ->
        raise (Pdf.PDFError "Can't use existing named font for text-to-PDF")
  in
  let instrs = of_utf8_with_newlines fontpack fontsize utf8 in
  (*flprint (Cpdftype.to_string instrs);
  flprint "------------------------------";*)
  let tagged = tag_paragraphs instrs in
  (*flprint (Cpdftype.to_string tagged);*)
  (* One fifteenth of the page width, in points; used for all four of the
     measurements handed to [Cpdftype.typeset] below. *)
  let margin = Pdfunits.points (Pdfpaper.width papersize) (Pdfpaper.unit papersize) /. 15. in
  let instrs =
    (* [tagged] always contains at least the paragraph Tag / EndTag pair, so
       testing it for emptiness cannot detect empty input. Look for the first
       font instead: when the text produced no Font instruction (empty text,
       or newlines only) there is nothing to set, and taking the head of the
       filtered list would fail. *)
    match keep (function Cpdftype.Font _ -> true | _ -> false) tagged with
    | [] -> []
    | firstfont::_ -> [firstfont; Cpdftype.BeginDocument] @ tagged
  in
  let pages, tags = Cpdftype.typeset ~process_struct_tree margin margin margin margin papersize pdf instrs in
  iter (fun x -> Printf.printf "PAGE\n"; iter (fun (_, i) -> Printf.printf "Tag number %i\n" i) x) tags;
  (* We make (tag number, page number, mcid) triples *)
  let tagtriples =
    flatten
      (map2
         (fun pn page_tags ->
            map2 (fun (_, tagnum) mcid -> (tagnum, pn, mcid)) page_tags (indx0 page_tags))
         (indx0 tags)
         tags)
  in
  Printf.printf "(tag number, page number, mcid) triples:\n";
  iter (fun (tagnum, pn, mcid) -> Printf.printf "%i, %i, %i\n" tagnum pn mcid) tagtriples;
  (* Now work out the nodes and what each /K and /Pg in them is *)
  let pages_and_mcids =
    []
  in
  Printf.printf "Pages and their MCIDs\n";
  iter
    (fun (page, mcids) ->
       (* The page number must be supplied to the format; without it the
          printf is only partially applied and prints nothing. *)
       Printf.printf "Page %i\n" page;
       iter (Printf.printf "%i ") mcids;
       Printf.printf "\n")
    pages_and_mcids;
  if subformat = Some Cpdfua.PDFUA2 then
    begin
      (* Reserve object numbers first so the dictionaries can refer to one
         another, then fill each one in. *)
      let str = Pdf.addobj pdf Pdf.Null in
      let p = Pdf.addobj pdf Pdf.Null in
      let parent_tree = Pdf.addobj pdf Pdf.Null in
      let namespace = Pdf.addobj pdf (Pdf.Dictionary [("/NS", Pdf.String "http://iso.org/pdf2/ssn")]) in
      let document = Pdf.addobj pdf Pdf.Null in
        Pdf.addobj_given_num pdf (document, Pdf.Dictionary [("/K", Pdf.Array [Pdf.Indirect p]); ("/P", Pdf.Indirect str); ("/S", Pdf.Name "/Document"); ("/NS", Pdf.Indirect namespace)]);
        Pdf.addobj_given_num pdf (parent_tree, Pdf.Dictionary [("/Nums", Pdf.Array [Pdf.Integer 1; Pdf.Array [Pdf.Indirect p]])]);
        Pdf.addobj_given_num pdf (p, Pdf.Dictionary [("/K", Pdf.Array [Pdf.Integer 0]); ("/P", Pdf.Indirect document); ("/S", Pdf.Name "/P")]);
        Pdf.addobj_given_num pdf (str, Pdf.Dictionary [("/Namespaces", Pdf.Array [Pdf.Indirect namespace]); ("/Type", Pdf.Name "/StructTreeRoot");
                                                       ("/K", Pdf.Array [Pdf.Indirect document]); ("/ParentTree", Pdf.Indirect parent_tree)]);
        Pdf.replace_chain pdf ["/Root"] ("/StructTreeRoot", (Pdf.Indirect str))
    end
  else if process_struct_tree then
    (* [process_struct_tree] is already true for PDF/UA-1 (see the top of this
       function), so no separate subformat test is needed here. *)
    begin
      let str = Pdf.addobj pdf Pdf.Null in
      let p = Pdf.addobj pdf Pdf.Null in
      let parent_tree = Pdf.addobj pdf Pdf.Null in
        Pdf.addobj_given_num pdf (parent_tree, Pdf.Dictionary [("/Nums", Pdf.Array [Pdf.Integer 1; Pdf.Array [Pdf.Indirect p]])]);
        Pdf.addobj_given_num pdf (p, Pdf.Dictionary [("/K", Pdf.Array [Pdf.Integer 0]); ("/P", Pdf.Indirect str); ("/S", Pdf.Name "/P")]);
        Pdf.addobj_given_num pdf (str, Pdf.Dictionary [("/Type", Pdf.Name "/StructTreeRoot"); ("/K", Pdf.Array [Pdf.Indirect p]); ("/ParentTree", Pdf.Indirect parent_tree)]);
        Pdf.replace_chain pdf ["/Root"] ("/StructTreeRoot", (Pdf.Indirect str))
    end;
  (* With a structure tree, each page dictionary gets a /StructParents entry
     holding the page's index as given by [indx0]. *)
  let pages =
    map2
      (fun pn p -> if process_struct_tree then {p with Pdfpage.rest = Pdf.add_dict_entry p.Pdfpage.rest "/StructParents" (Pdf.Integer pn)} else p)
      (indx0 pages)
      pages
  in
  let pdf, pageroot = Pdfpage.add_pagetree pages pdf in
    Pdfpage.add_root pageroot [] pdf
|