From 64f0339b71fbb1d77ffa646143850754db9f8cc5 Mon Sep 17 00:00:00 2001 From: John Whitington Date: Sun, 11 Sep 2022 14:52:08 +0100 Subject: [PATCH] Include TrueType embedder --- Makefile | 27 ++-- cpdfembed.ml | 144 ++++++++++++++++++++ cpdfembed.mli | 1 + cpdftruetype.ml | 332 +++++++++++++++++++++++++++++++++++++++++++++++ cpdftruetype.mli | 24 ++++ 5 files changed, 513 insertions(+), 15 deletions(-) create mode 100644 cpdfembed.ml create mode 100644 cpdfembed.mli create mode 100644 cpdftruetype.ml create mode 100644 cpdftruetype.mli diff --git a/Makefile b/Makefile index ca8bedb..3445c27 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,15 @@ # Build the cpdf command line tools and top level -MODS = cpdfyojson cpdfxmlm cpdfutil \ - cpdfunicodedata cpdferror cpdfdebug cpdfjson cpdfstrftime cpdfcoord \ - cpdfattach cpdfpagespec cpdfposition cpdfpresent cpdfmetadata \ - cpdfbookmarks cpdfpage cpdfaddtext cpdfimage cpdffont cpdftype \ - cpdftexttopdf cpdftoc cpdfpad cpdfocg cpdfsqueeze cpdfdraft cpdfspot \ - cpdfpagelabels cpdfcreate cpdfannot cpdfxobject cpdfimpose cpdftweak \ - cpdfcommand +NONDOC = cpdfyojson cpdfxmlm cpdfutil + +DOC = cpdfunicodedata cpdferror cpdfdebug cpdfjson cpdfstrftime cpdfcoord \ + cpdfattach cpdfpagespec cpdfposition cpdfpresent cpdfmetadata \ + cpdfbookmarks cpdfpage cpdfaddtext cpdfimage cpdffont cpdftype \ + cpdftexttopdf cpdftoc cpdfpad cpdfocg cpdfsqueeze cpdfdraft cpdfspot \ + cpdfpagelabels cpdfcreate cpdfannot cpdfxobject cpdfimpose cpdftweak \ + cpdftruetype cpdfembed \ + cpdfcommand + +MODS = $(NONDOC) $(DOC) SOURCES = $(foreach x,$(MODS),$(x).ml $(x).mli) cpdfcommandrun.ml @@ -36,14 +40,7 @@ clean :: rm -rf doc foo foo2 out.pdf out2.pdf foo.pdf decomp.pdf *.cmt *.cmti \ *.json test/*.pdf debug/*.pdf *.ps *.aux *.idx *.log *.out *.toc *.cut -DOC_FILES = cpdfunicodedata.mli cpdferror.mli cpdfdebug.mli cpdfjson.mli \ - cpdfstrftime.mli cpdfcoord.mli cpdfattach.mli cpdfpagespec.mli \ - cpdfposition.mli cpdfpresent.mli cpdfmetadata.mli \ - cpdfbookmarks.mli cpdfpage.mli cpdfaddtext.mli cpdfimage.mli \ - cpdffont.mli cpdftype.mli cpdftexttopdf.mli cpdftoc.mli \ - cpdfpad.mli cpdfocg.mli cpdfsqueeze.mli cpdfdraft.mli \ - cpdfspot.mli cpdfpagelabels.mli cpdfcreate.mli cpdfannot.mli \ - cpdfxobject.mli cpdfimpose.mli cpdftweak.mli cpdfcommand.mli +DOC_FILES = $(foreach x,$(DOC),$(x).mli ) install : libinstall diff --git a/cpdfembed.ml b/cpdfembed.ml new file mode 100644 index 0000000..de9978d --- /dev/null +++ b/cpdfembed.ml @@ -0,0 +1,144 @@ +(* Truetype font embedding example *) +open Pdfutil + +(* For the first stage of our embedder, we are only allowing standard encodings, and we don't actually subset. + a) Get a list of Unicode codepoints; + b) See which of them are in the glyph list; + c) See which of those are in (StdEncoding|MacRomanEncoding|WinAnsiEncoding), and get their codes; + d) Build a font to do just those; + e) We put question marks for any character not in the encoding + (* FUTURE *) + 1) Actually subset the font to save size + 2) Allow characters not in the standard encodings by builing one or more secondary subsets *) + +(* UTF8 Input text *) +let text = "Noto Sans Black Àë" +let encoding = Pdftext.MacRomanEncoding + +let unicodepoints = Pdftext.codepoints_of_utf8 text + +let glyphlist_table = Pdfglyphlist.reverse_glyph_hashes () + +let encoding_table = Pdftext.reverse_table_of_encoding encoding + +let () = + iter + (fun u -> + Printf.printf "unicode %i --> " u; + let glyphname = Hashtbl.find glyphlist_table [u] in + Printf.printf "glyph name %s --> " glyphname; + let pdfcode = Hashtbl.find encoding_table glyphname in + Printf.printf "pdf code %i\n" pdfcode) + unicodepoints + +let pdfcode_of_unicode_codepoint u = + try + Some (Hashtbl.find encoding_table (Hashtbl.find glyphlist_table [u])) + with + Not_found -> None + +let tj_text = + implode + (map + (fun x -> match pdfcode_of_unicode_codepoint x with Some c -> char_of_int c | None -> '?') + unicodepoints) + +let calc_accepted_unicodepoints codepoints = + setify + (option_map + (fun u -> match pdfcode_of_unicode_codepoint u with Some _ -> Some u | None -> None) + codepoints) + +let accepted_unicodepoints = + map + (fun u -> (u, pdfcode_of_unicode_codepoint u)) + (calc_accepted_unicodepoints unicodepoints) + +let contents_of_file filename = + let ch = open_in_bin filename in + let s = really_input_string ch (in_channel_length ch) in + close_in ch; + s + +let fontname = "NotoSans-Black" +let fontstr = contents_of_file (fontname ^ ".ttf") + +let f = + Cpdftruetype.parse ~subset:accepted_unicodepoints (Pdfio.bytes_of_string fontstr) + +let contents = + "1 0 0 1 50 770 cm BT/TT1 36 Tf(" ^ tj_text ^ ")Tj ET" + +let widths = + Pdf.Array (map (fun x -> Pdf.Integer x) (Array.to_list f.Cpdftruetype.widths)) + +let fontnum = ref 0 + +let basename () = + incr fontnum; + "AAAAA" ^ string_of_char (char_of_int (!fontnum + 65)) + +let name_1 = basename () + +let string_of_encoding = function + | Pdftext.WinAnsiEncoding -> "/WinAnsiEncoding" + | Pdftext.MacRomanEncoding -> "/MacRomanEncoding" + | Pdftext.StandardEncoding -> "/StandardEncoding" + | _ -> failwith "unknown encoding" + +let font = + Pdf.add_dict_entry + (Pdfread.parse_single_object + (Printf.sprintf "<>" name_1 fontname (string_of_encoding encoding) f.Cpdftruetype.firstchar f.Cpdftruetype.lastchar)) + "/Widths" + widths + +let fontdescriptor = + Pdfread.parse_single_object + (Printf.sprintf "<>" + name_1 fontname f.Cpdftruetype.flags f.Cpdftruetype.minx f.Cpdftruetype.miny f.Cpdftruetype.maxx f.Cpdftruetype.maxy f.Cpdftruetype.italicangle + f.Cpdftruetype.ascent f.Cpdftruetype.descent f.Cpdftruetype.capheight f.Cpdftruetype.stemv f.Cpdftruetype.xheight f.Cpdftruetype.avgwidth f.Cpdftruetype.maxwidth) + +let fontfile = + let len = String.length fontstr in + Pdf.Stream + {contents = + (Pdf.Dictionary [("/Length", Pdf.Integer len); ("/Length1", Pdf.Integer len)], + Pdf.Got (Pdfio.bytes_of_string fontstr))} + +let objects = + [(1, Pdfread.parse_single_object "<>"); + (2, Pdfread.parse_single_object "<>"); + (3, Pdfread.parse_single_object "<>>>/Parent 2 0 R/MediaBox[0 0 595 842]/Rotate 0/Contents[4 0 R]>>"); + (4, Pdf.Stream + {contents = (Pdf.Dictionary [("/Length", Pdf.Integer (String.length contents))], + (Pdf.Got (Pdfio.bytes_of_string contents)))}); + (5, font); + (6, fontdescriptor); + (7, fontfile); + ] + +let root = 1 + +let trailerdict = + Pdfread.parse_single_object (Printf.sprintf "<>" (length objects + 1)) + +let pdf = + let pdf = + {Pdf.major = 2; + Pdf.minor = 0; + Pdf.root = root; + Pdf.objects = + {Pdf.maxobjnum = 0; + Pdf.parse = None; + Pdf.pdfobjects = Pdf.pdfobjmap_empty (); + Pdf.object_stream_ids = null_hash ()}; + Pdf.trailerdict = trailerdict; + Pdf.was_linearized = false; + Pdf.saved_encryption = None} + in + iter (Pdf.addobj_given_num pdf) objects; + pdf + +let () = + Pdfwrite.pdf_to_file pdf "subset.pdf" diff --git a/cpdfembed.mli b/cpdfembed.mli new file mode 100644 index 0000000..948db8f --- /dev/null +++ b/cpdfembed.mli @@ -0,0 +1 @@ +(* *) diff --git a/cpdftruetype.ml b/cpdftruetype.ml new file mode 100644 index 0000000..c1c6799 --- /dev/null +++ b/cpdftruetype.ml @@ -0,0 +1,332 @@ +(* Truetype font parsing and subsetting *) +open Pdfutil +open Pdfio + +type t = + {flags : int; + minx : int; + miny : int; + maxx : int; + maxy : int; + italicangle : int; + ascent : int; + descent : int; + capheight : int; + stemv : int; + xheight : int; + avgwidth : int; + maxwidth : int; + firstchar : int; + lastchar : int; + widths : int array; + subset : Pdfio.bytes} + +let dbg = ref true (* text-based debug *) + +(* 32-bit signed fixed-point number (16.16) returned as two ints *) +let read_fixed b = + let a = getval_31 b 16 in + let b = getval_31 b 16 in + a, b + +(* 16-bit unsigned integer *) +let read_ushort b = getval_31 b 16 + +(* 32-bit unsigned integer *) +let read_ulong b = getval_32 b 32 + +(* Signed byte *) +let read_byte b = getval_31 b 8 + +(* Signed short *) +let read_short b = sign_extend 16 (getval_31 b 16) + +(* fword *) +let read_fword = read_short +let read_ufword = read_ushort + +(* f2dot14 - 2 bit signed integer part, 14 bit unsigned fraction *) +let read_f2dot14 b = + let v = read_ushort b in + float_of_int (sign_extend 2 (v lsr 14)) +. (float_of_int (v land 0x3FFF) /. 16384.) + +(* discard n bytes *) +let discard_bytes b n = + for x = 1 to n do ignore (getval_31 b 8) done + +let string_of_tag t = + Printf.sprintf "%c%c%c%c" + (char_of_int (i32toi (Int32.shift_right t 24))) + (char_of_int (i32toi (Int32.logand 0x000000FFl (Int32.shift_right t 16)))) + (char_of_int (i32toi (Int32.logand 0x000000FFl (Int32.shift_right t 8)))) + (char_of_int (i32toi (Int32.logand 0x000000FFl t))) + +let read_format_6_encoding_table b = + let firstCode = read_ushort b in + let entryCount = read_ushort b in + let t = null_hash () in + (* FIXME: This format can address glyphs > 255, but we don't support that + elsewhere yet --- but we read the whole format table nonethless *) + try + for x = firstCode to firstCode + entryCount - 1 do + Hashtbl.add t x (read_ushort b) + done; + t + with + e -> failwith ("bad format 6 table: " ^ Printexc.to_string e ^ "\n") + +(* fixme might need indexToLocFormat here, to undo the "clever" formula. *) +let read_format_4_encoding_table b = + let t = null_hash () in + let segCountX2 = read_ushort b in + let segCount = segCountX2 / 2 in + let searchRange = read_ushort b in + let entrySelector = read_ushort b in + let rangeShift = read_ushort b in + let endCodes = Array.init segCount (fun _ -> read_ushort b) in + let _ (* reservedPad *) = read_ushort b in + let startCodes = Array.init segCount (fun _ -> read_ushort b) in + let idDelta = Array.init segCount (fun _ -> read_ushort b) in + let idRangeOffset = Array.init segCount (fun _ -> read_ushort b) in + if !dbg then + begin + Printf.printf "segCount = %i, searchRange = %i, entrySelector = %i, rangeShift = %i\n" segCount searchRange entrySelector rangeShift; + Printf.printf "endCodes\n"; + print_ints (Array.to_list endCodes); + Printf.printf "startCodes\n"; + print_ints (Array.to_list startCodes); + Printf.printf "idDelta\n"; + print_ints (Array.to_list idDelta); + Printf.printf "idRangeOffset\n"; + print_ints (Array.to_list idRangeOffset); + end; + for seg = 0 to segCount - 1 do + let ec = endCodes.(seg) in + let sc = startCodes.(seg) in + let del = idDelta.(seg) in + let ro = idRangeOffset.(seg) in + for c = sc to ec do + if ro = 0 then + Hashtbl.add t c ((c + del) mod 65536) + else + let sum = (c - sc) + del in + () + done + done; + t + +let read_encoding_table fmt length version b = + match fmt with + | 0 -> + let t = null_hash () in + for x = 0 to 255 do Hashtbl.add t x (read_byte b) done; + t + | 4 -> read_format_4_encoding_table b + | 6 -> read_format_6_encoding_table b + | n -> raise (Pdf.PDFError "read_encoding_table: format %i not known\n%!") + +let read_loca_table indexToLocFormat numGlyphs b = + let fix_empties arr = + for x = 1 to Array.length arr - 1 do + if arr.(x) = arr.(x - 1) then arr.(x - 1) <- -1l + done; + if arr <> [||] then arr.(Array.length arr - 1) <- -1l + in + match indexToLocFormat with + | 0 -> + let arr = Array.init (numGlyphs + 1) (function _ -> i32ofi (read_ushort b * 2)) in + fix_empties arr; arr + | 1 -> + let arr = Array.init (numGlyphs + 1) (function _ -> read_ulong b) in + fix_empties arr; arr + | _ -> raise (Pdf.PDFError "Unknown indexToLocFormat in read_loca_table") + +let read_os2_table b blength = + let version = read_ushort b in + if !dbg then Printf.printf "OS/2 table blength = %i bytes, version number = %i\n" blength version; + if version < 2 then failwith "read_os2_table: version number too low" else + let xAvgCharWidth = read_short b in + discard_bytes b 64; (* discard 14 entries usWeightClass...fsLastCharIndex *) + (* -- end of original OS/2 Version 0 Truetype table. Must check length before reading now. *) + let sTypoAscender = read_short b in + let sTypoDescender = read_short b in + discard_bytes b 6; (* discard sTypoLineGap...usWinDescent *) + (* -- end of OpenType version 0 table *) + discard_bytes b 8; (* discard ulCodePageRange1, ulCodePageRange2 *) + (* -- end of OpenType version 1 table *) + let sxHeight = read_short b in + let sCapHeight = read_short b in + (sTypoAscender, sTypoDescender, sCapHeight, sxHeight, xAvgCharWidth) + +let read_post_table b = + discard_bytes b 4; (* discard version *) + let italicangle, n = read_fixed b in + italicangle + +(* Eventually: +Set bit 6 for non symbolic. (nb bit 1 is actualy bit 0 etc.) +Set bit 7 if italicangle <> 0 +Set bit 2 if serif ? +Set bit 1 if fixed pitch (calculate from widths) *) +let calculate_flags italicangle = + let italic = if italicangle <> 0 then 1 else 0 in + 32 lor italic lsl 6 + +let calculate_limits subset = + if subset = [] then (0, 255) else + extremes (sort compare subset) + +let calculate_stemv () = 80 + +let read_hhea_table b = + discard_bytes b 34; + read_ushort b (* numOfLongHorMetrics *) + +let read_hmtx_table numOfLongHorMetrics b = + Array.init + numOfLongHorMetrics + (fun _ -> let r = read_ushort b in ignore (read_short b); r) + +let calculate_widths firstchar lastchar subset (cmapdata : (int, int) Hashtbl.t) (hmtxdata : int array) = + if lastchar < firstchar then failwith "lastchar < firschar" else + List.iter (fun (a, b) -> Printf.printf "%i -> %i\n" a b) (sort compare (list_of_hashtbl cmapdata)); + Array.init + (lastchar - firstchar + 1) + (fun pos -> + let code = pos + firstchar in + Printf.printf "code %i --> " code; + if not (mem code subset) then 0 else + try + let glyphnum = Hashtbl.find cmapdata code in + Printf.printf "glyph number %i --> " glyphnum; + let width = hmtxdata.(glyphnum) in + Printf.printf "width %i\n" width; + width + with e -> Printf.printf "no width for %i\n" code; 0) + +let calculate_maxwidth hmtxdata = + hd (sort (fun a b -> compare b a) (Array.to_list hmtxdata)) + +let parse ?(subset=[]) data = + let subset = map fst subset in + let mk_b byte_offset = bitbytes_of_input (let i = input_of_bytes data in i.seek_in byte_offset; i) in + let b = mk_b 0 in + let major, minor = read_fixed b in + if !dbg then Printf.printf "Truetype font version %i.%i\n" major minor; + let numTables = read_ushort b in + let searchRange = read_ushort b in + let entrySelector = read_ushort b in + let rangeShift = read_ushort b in + if !dbg then Printf.printf "numTables = %i, searchRange = %i, entrySelector = %i, rangeShift = %i\n" + numTables searchRange entrySelector rangeShift; + let tables = ref [] in + for x = 1 to numTables do + let tag = read_ulong b in + let checkSum = read_ulong b in + let offset = read_ulong b in + let ttlength = read_ulong b in + if !dbg then Printf.printf "tag = %li = %s, checkSum = %li, offset = %li, ttlength = %li\n" + tag (string_of_tag tag) checkSum offset ttlength; + tables =| (tag, checkSum, offset, ttlength); + done; + let os2 = + match keep (function (t, _, _, _) -> string_of_tag t = "OS/2") !tables with + | (_, _, o, l)::_ -> Some (o, l) + | [] -> None + in + let ascent, descent, capheight, xheight, avgwidth = + match os2 with + | None -> raise (Pdf.PDFError "No os/2 table found in truetype font") + | Some (o, l) -> let b = mk_b (i32toi o) in read_os2_table b (i32toi l) + in + let italicangle = + match keep (function (t, _, _, _) -> string_of_tag t = "post") !tables with + | (_, _, o, _)::_ -> read_post_table (mk_b (i32toi o)) + | _ -> 0 + in + if !dbg then + Printf.printf "ascent %i descent %i capheight %i xheight %i avgwidth %i\n" + ascent descent capheight xheight avgwidth; + let cmap = + match keep (function (t, _, _, _) -> string_of_tag t = "cmap") !tables with + | (_, _, o, l)::_ -> Some (o, l) + | [] -> None + in + let glyphcodes = ref (null_hash ()) in + begin match cmap with + | None -> + let t = null_hash () in + for x = 0 to 255 do Hashtbl.add t x x done; + glyphcodes := t + | Some (cmapoffset, cmaplength) -> + let b = mk_b (i32toi cmapoffset) in + let cmap_version = read_ushort b in + let num_encoding_tables = read_ushort b in + if !dbg then Printf.printf "cmap version %i. There are %i encoding tables\n" + cmap_version num_encoding_tables; + for x = 1 to num_encoding_tables do + let platform_id = read_ushort b in + let encoding_id = read_ushort b in + let subtable_offset = read_ulong b in + if !dbg then Printf.printf "subtable %i. platform_id = %i, encoding_id = %i, subtable_offset = %li\n" + x platform_id encoding_id subtable_offset; + let b = mk_b (i32toi cmapoffset + i32toi subtable_offset) in + let fmt = read_ushort b in + let lngth = read_ushort b in + let version = read_ushort b in + if !dbg then Printf.printf "subtable has format %i, length %i, version %i\n" fmt lngth version; + let got_glyphcodes = read_encoding_table fmt length version b in + glyphcodes := got_glyphcodes + done; + end; + let maxpoffset, maxplength = + match keep (function (t, _, _, _) -> string_of_tag t = "maxp") !tables with + | (_, _, o, l)::_ -> o, l + | [] -> raise (Pdf.PDFError "No maxp table found in TrueType font") + in + let b = mk_b (i32toi maxpoffset) in + let major, minor = read_fixed b in + let numGlyphs = read_ushort b in + if !dbg then Printf.printf "maxp table version %i.%i: This font has %i glyphs\n" major minor numGlyphs; + let headoffset, headlength = + match keep (function (t, _, _, _) -> string_of_tag t = "head") !tables with + | (_, _, o, l)::_ -> o, l + | [] -> raise (Pdf.PDFError "No maxp table found in TrueType font") + in + let b = mk_b (i32toi headoffset) in + discard_bytes b 36; + let minx = read_fword b in + let miny = read_fword b in + let maxx = read_fword b in + let maxy = read_fword b in + discard_bytes b 6; + let indexToLocFormat = read_short b in + let _ (*glyphDataFormat*) = read_short b in + if !dbg then Printf.printf "head table: indexToLocFormat is %i\n" indexToLocFormat; + if !dbg then Printf.printf "box %i %i %i %i\n" minx miny maxx maxy; + let locaoffset, localength = + match keep (function (t, _, _, _) -> string_of_tag t = "loca") !tables with + | (_, _, o, l)::_ -> o, l + | [] -> raise (Pdf.PDFError "No loca table found in TrueType font") + in + let flags = calculate_flags italicangle in + let firstchar, lastchar = calculate_limits subset in + let numOfLongHorMetrics = + match keep (function (t, _, _, _) -> string_of_tag t = "hhea") !tables with + | (_, _, o, l)::_ -> let b = mk_b (i32toi o) in read_hhea_table b + | _ -> 0 + in + let hmtxdata = + match keep (function (t, _, _, _) -> string_of_tag t = "hmtx") !tables with + | (_, _, o, _)::_ -> read_hmtx_table numOfLongHorMetrics (mk_b (i32toi o)) + | [] -> raise (Pdf.PDFError "No hmtx table found in TrueType font") + in + let widths = calculate_widths firstchar lastchar subset !glyphcodes hmtxdata in + let maxwidth = calculate_maxwidth hmtxdata in + let stemv = calculate_stemv () in + let b = mk_b (i32toi locaoffset) in + let offsets = read_loca_table indexToLocFormat numGlyphs b in + let subset = data in + {flags; minx; miny; maxx; maxy; italicangle; ascent; descent; + capheight; stemv; xheight; avgwidth; maxwidth; firstchar; lastchar; + widths; subset} diff --git a/cpdftruetype.mli b/cpdftruetype.mli new file mode 100644 index 0000000..cb5142f --- /dev/null +++ b/cpdftruetype.mli @@ -0,0 +1,24 @@ +(* Parse a TrueType font *) +type t = + {flags : int; + minx : int; + miny : int; + maxx : int; + maxy : int; + italicangle : int; + ascent : int; + descent : int; + capheight : int; + stemv : int; + xheight : int; + avgwidth : int; + maxwidth : int; + firstchar : int; + lastchar : int; + widths : int array; + subset : Pdfio.bytes} + +(* Parse the font, given the list of Unicode codepoints required for the subset + and optionally their PDF codepoint too. Returns the information required for + embedding this font in a PDF. *) +val parse : ?subset:(int * int option) list -> Pdfio.bytes -> t