cpdf-source/cpdftexttopdf.ml

177 lines
7.5 KiB
OCaml
Raw Permalink Normal View History

2021-12-07 00:55:46 +01:00
open Pdfutil
2024-10-02 15:20:51 +02:00
open Cpdferror
2021-12-07 00:55:46 +01:00
2024-02-27 16:57:31 +01:00
(* Convert the UTF-8 string [t] into a list of typesetting instructions.

   The text is decoded to unicode codepoints and split at line feeds (10);
   carriage returns (13) are dropped. For every codepoint the font pack is
   consulted with [Cpdfembed.get_char]:

   - when the glyph lives in a different font from the one currently
     selected, any pending text is emitted, followed by a [Cpdftype.Font]
     instruction (named after the font number, at size [fontsize]);
   - when there is no glyph at all, a warning is written to stderr and the
     codepoint is skipped.

   Each non-empty line ends with a [Cpdftype.Text] instruction holding the
   characters gathered since the last font switch, and each line feed yields a
   [Cpdftype.NewLine]. The selected font is remembered across lines, so a
   [Cpdftype.Font] is only emitted when the font actually changes. *)
let of_utf8_with_newlines fontpack fontsize t =
  (* Instructions produced so far, most recent first. *)
  let items = ref [] in
  (* Number of the currently selected font; -1 means "none selected yet". *)
  let currfont = ref ~-1 in
  (* Characters of the text run being built, most recent first. *)
  let currtext = ref [] in
  let process_codepoints cs =
    iter
      (fun u ->
         match Cpdfembed.get_char fontpack u with
         | Some (c, n, f) ->
             if n <> !currfont then
               begin
                 (* Font change: close the pending run, then select the font. *)
                 if !currtext <> [] then items := Cpdftype.Text (rev !currtext)::!items;
                 currtext := [];
                 currfont := n;
                 items := Cpdftype.Font (string_of_int n, f, fontsize)::!items
               end;
             currtext := char_of_int c::!currtext
         | None ->
             (* Diagnostics go to stderr (flushed immediately) so that they
                cannot get mixed into anything written to stdout. *)
             Printf.eprintf "No glyph for unicode U+%04X in this font\n%!" u)
      cs;
    items := Cpdftype.Text (rev !currtext)::!items
  in
  let codepoints = Pdftext.codepoints_of_utf8 t in
  (* Codepoints of the line being read, most recent first. *)
  let buf = ref [] in
  let flush_line () =
    let c = rev !buf in
    if c <> [] then process_codepoints c;
    buf := []
  in
  List.iter
    (function
     | 10 (*'\n'*) ->
         flush_line ();
         items := Cpdftype.NewLine::!items;
         currtext := []
     | 13 (*'\r'*) -> ()
     | x -> buf := x::!buf)
    codepoints;
  (* The final line need not be terminated by a line feed. *)
  flush_line ();
  rev !items
2021-12-07 00:55:46 +01:00
2024-10-02 15:45:40 +02:00
(* Post process, adding Tag / EndTag around paragraphs.

   The whole list is wrapped in [Tag ("P", 0)] ... [EndTag], and every pair of
   consecutive [NewLine]s is treated as a paragraph break: an [EndTag] is
   inserted before the pair and a fresh [Tag ("P", 0)] after it. Pairs are
   consumed left to right, so a third consecutive [NewLine] ends up inside the
   following paragraph.

   The traversal uses an accumulator so that stack use does not grow with the
   length of the instruction list. *)
let tag_paragraphs l =
  let open_paragraph = Cpdftype.Tag ("P", 0) in
  (* [acc] holds the output built so far, in reverse order. *)
  let rec go acc = function
    | Cpdftype.NewLine::Cpdftype.NewLine::rest ->
        go (open_paragraph::Cpdftype.NewLine::Cpdftype.NewLine::Cpdftype.EndTag::acc) rest
    | x::rest -> go (x::acc) rest
    | [] -> List.rev (Cpdftype.EndTag::acc)
  in
    go [open_paragraph] l
2024-10-02 15:45:40 +02:00
(* Typeset the UTF-8 bytes [text] into a new PDF document and return it.

   [~process_struct_tree] requests a structure tree with one /P element per
   paragraph; it is forced on when [?subformat] is PDF/UA-1 or PDF/UA-2.

   [?subformat] selects the starting document: [None] starts from
   [Pdf.empty ()], the PDF/UA variants start from [Cpdfua.create_pdfua1] /
   [Cpdfua.create_pdfua2].

   [?title] is required for the PDF/UA subformats ([error] is called with
   "no -title given" otherwise) and is not consulted when no subformat is
   given.

   [~font] is either a ready-made font pack, or the information needed to
   embed a TrueType font covering the codepoints used in [text]. An existing
   named font cannot be used and raises [Pdf.PDFError].

   All four margins are one fifteenth of the paper width. *)
let typeset ~process_struct_tree ?subformat ?title ~papersize ~font ~fontsize text =
  let process_struct_tree =
    match process_struct_tree, subformat with
    | _, (Some Cpdfua.PDFUA1 | Some Cpdfua.PDFUA2) | true, _ -> true
    | _ -> false
  in
  let is_pdfua2 = (subformat = Some Cpdfua.PDFUA2) in
  let required_title () =
    match title with
    | Some title -> title
    | None -> error "no -title given"
  in
  let pdf =
    match subformat with
    | None -> Pdf.empty ()
    | Some Cpdfua.PDFUA1 -> Cpdfua.create_pdfua1 (required_title ()) papersize 1
    | Some Cpdfua.PDFUA2 -> Cpdfua.create_pdfua2 (required_title ()) papersize 1
  in
  let utf8 = Pdfio.string_of_bytes text in
  let codepoints = setify (Pdftext.codepoints_of_utf8 utf8) in
  let fontpack =
    match font with
    | Cpdfembed.PreMadeFontPack t -> t
    | Cpdfembed.EmbedInfo {fontfile; fontname; encoding} ->
        Cpdfembed.embed_truetype pdf ~fontfile ~fontname ~codepoints ~encoding
    | Cpdfembed.ExistingNamedFont ->
        raise (Pdf.PDFError "Can't use existing named font for text-to-PDF")
  in
  let instrs = of_utf8_with_newlines fontpack fontsize utf8 in
  let tagged = if process_struct_tree then tag_paragraphs instrs else instrs in
  let margin = Pdfunits.points (Pdfpaper.width papersize) (Pdfpaper.unit papersize) /. 15. in
  let instrs =
    (* The first font used is selected up front, before BeginDocument. When
       there is no Font instruction at all (empty text, only newlines, or no
       glyph found for any character) there is nothing to typeset, so the
       empty instruction list is used, as it already is for empty untagged
       input.
       NOTE(review): assumes Cpdftype.typeset treats [] as a blank document
       in the tagged case too -- confirm. *)
    match keep (function Cpdftype.Font _ -> true | _ -> false) tagged with
    | [] -> []
    | firstfont::_ -> firstfont::Cpdftype.BeginDocument::tagged
  in
  let pages, tags = Cpdftype.typeset ~process_struct_tree margin margin margin margin papersize pdf instrs in
  (* We make (tag number, page number, mcid) triples: the page number is the
     position of the page in [tags], the mcid is the position of the tag
     within its page. *)
  let tagtriples =
    flatten
      (map2
         (fun pn page_tags ->
            map2 (fun (_, tagnum) mcid -> (tagnum, pn, mcid)) page_tags (indx0 page_tags))
         (indx0 tags)
         tags)
  in
  (* Now work out the nodes and which MCIDs in which pages they point to. Each
     paragraph may point to 1 or more nodes. Consecutive triples with the same
     paragraph number are grouped together; groups are built in reverse and
     put back in order at the end. *)
  let rec find_nodes (a : ((int * int * int) list) list) = function
    | ((para, _, _) as triple)::more ->
        begin match a with
        | (((para', _, _)::_) as group)::rest when para = para' ->
            find_nodes ((triple::group)::rest) more
        | (_::_)::_ ->
            find_nodes ([triple]::a) more
        | []::rest ->
            find_nodes ([triple]::rest) more
        | [] -> assert false
        end
    | [] -> rev (map rev a)
  in
  let nodes = find_nodes [[]] tagtriples in
  let pages =
    map2
      (fun pn p ->
         if process_struct_tree then
           {p with Pdfpage.rest = Pdf.add_dict_entry p.Pdfpage.rest "/StructParents" (Pdf.Integer pn)}
         else
           p)
      (indx0 pages)
      pages
  in
  let pdf, pageroot = Pdfpage.add_pagetree pages pdf in
  let pdf = Pdfpage.add_root pageroot [] pdf in
  (* Association list from 0-based page number to page object number. *)
  let refnums = let ns = Pdf.page_reference_numbers pdf in combine (indx0 ns) ns in
  (* [process_struct_tree] is already true for both PDF/UA subformats. *)
  if process_struct_tree then
    begin
      (* Objects are reserved in this order: namespace and /Document element
         (PDF/UA-2 only; 0 is a placeholder otherwise), then the structure
         tree root, which is filled in at the end. *)
      let namespace = if is_pdfua2 then Pdf.addobj pdf (Pdf.Dictionary [("/NS", Pdf.String "http://iso.org/pdf2/ssn")]) else 0 in
      let document = if is_pdfua2 then Pdf.addobj pdf Pdf.Null else 0 in
      let str = Pdf.addobj pdf Pdf.Null in
      (* Paragraph elements hang off the /Document element under PDF/UA-2 and
         directly off the structure tree root otherwise. *)
      let struct_parent = if is_pdfua2 then document else str in
      let topks =
        map
          (fun parts_of_para ->
             let ks =
               map
                 (fun (_, pagenumber, mcid) ->
                    Pdf.Dictionary
                      [("/Type", Pdf.Name "/MCR");
                       ("/Pg", Pdf.Indirect (unopt (lookup pagenumber refnums)));
                       ("/MCID", Pdf.Integer mcid)])
                 parts_of_para
             in
               Pdf.Indirect
                 (Pdf.addobj pdf
                    (Pdf.Dictionary
                       [("/K", Pdf.Array ks);
                        ("/P", Pdf.Indirect struct_parent);
                        ("/S", Pdf.Name "/P")])))
          nodes
      in
      if is_pdfua2 then
        Pdf.addobj_given_num pdf
          (document,
           Pdf.Dictionary
             [("/K", Pdf.Array topks);
              ("/P", Pdf.Indirect str);
              ("/S", Pdf.Name "/Document");
              ("/NS", Pdf.Indirect namespace)]);
      (* Paragraph elements indexed by paragraph number, for constant-time
         lookup while building the parent tree. *)
      let topk_of_para = Array.of_list topks in
      let parent_tree =
        let pairs =
          map
            (fun pn ->
               let this_page_triples = keep (fun (_, page, _) -> page = pn) tagtriples in
                 (string_of_int pn,
                  Pdf.Array (map (fun (para, _, _) -> topk_of_para.(para)) this_page_triples)))
            (indx0 pages)
        in
          Pdf.addobj pdf (Pdftree.build_name_tree true pdf pairs)
      in
      let stns =
        if is_pdfua2 then [("/Namespaces", Pdf.Array [Pdf.Indirect namespace])] else []
      in
      let k =
        if is_pdfua2 then Pdf.Indirect document else Pdf.Array topks
      in
      Pdf.addobj_given_num pdf
        (str,
         Pdf.Dictionary
           (stns @ [("/Type", Pdf.Name "/StructTreeRoot"); ("/K", k); ("/ParentTree", Pdf.Indirect parent_tree)]));
      Pdf.replace_chain pdf ["/Root"; "/StructTreeRoot"] (Pdf.Indirect str)
    end;
  pdf