From 64f0339b71fbb1d77ffa646143850754db9f8cc5 Mon Sep 17 00:00:00 2001
From: John Whitington <john@coherentgraphics.co.uk>
Date: Sun, 11 Sep 2022 14:52:08 +0100
Subject: [PATCH] Include TrueType embedder

---
 Makefile         |  27 ++--
 cpdfembed.ml     | 144 ++++++++++++++++++++
 cpdfembed.mli    |   1 +
 cpdftruetype.ml  | 332 +++++++++++++++++++++++++++++++++++++++++++++++
 cpdftruetype.mli |  24 ++++
 5 files changed, 513 insertions(+), 15 deletions(-)
 create mode 100644 cpdfembed.ml
 create mode 100644 cpdfembed.mli
 create mode 100644 cpdftruetype.ml
 create mode 100644 cpdftruetype.mli

diff --git a/Makefile b/Makefile
index ca8bedb..3445c27 100644
--- a/Makefile
+++ b/Makefile
@@ -1,11 +1,15 @@
 # Build the cpdf command line tools and top level
-MODS = cpdfyojson cpdfxmlm cpdfutil \
-       cpdfunicodedata cpdferror cpdfdebug cpdfjson cpdfstrftime cpdfcoord \
-       cpdfattach cpdfpagespec cpdfposition cpdfpresent cpdfmetadata \
-       cpdfbookmarks cpdfpage cpdfaddtext cpdfimage cpdffont cpdftype \
-       cpdftexttopdf cpdftoc cpdfpad cpdfocg cpdfsqueeze cpdfdraft cpdfspot \
-       cpdfpagelabels cpdfcreate cpdfannot cpdfxobject cpdfimpose cpdftweak \
-       cpdfcommand
+NONDOC = cpdfyojson cpdfxmlm cpdfutil
+
+DOC = cpdfunicodedata cpdferror cpdfdebug cpdfjson cpdfstrftime cpdfcoord \
+      cpdfattach cpdfpagespec cpdfposition cpdfpresent cpdfmetadata \
+      cpdfbookmarks cpdfpage cpdfaddtext cpdfimage cpdffont cpdftype \
+      cpdftexttopdf cpdftoc cpdfpad cpdfocg cpdfsqueeze cpdfdraft cpdfspot \
+      cpdfpagelabels cpdfcreate cpdfannot cpdfxobject cpdfimpose cpdftweak \
+      cpdftruetype cpdfembed \
+      cpdfcommand
+
+MODS = $(NONDOC) $(DOC)
 
 SOURCES = $(foreach x,$(MODS),$(x).ml $(x).mli) cpdfcommandrun.ml
 
@@ -36,14 +40,7 @@ clean ::
 	rm -rf doc foo foo2 out.pdf out2.pdf foo.pdf decomp.pdf *.cmt *.cmti \
 	*.json test/*.pdf debug/*.pdf *.ps *.aux *.idx *.log *.out *.toc *.cut
 
-DOC_FILES = cpdfunicodedata.mli cpdferror.mli cpdfdebug.mli cpdfjson.mli \
-            cpdfstrftime.mli cpdfcoord.mli cpdfattach.mli cpdfpagespec.mli \
-	    cpdfposition.mli cpdfpresent.mli cpdfmetadata.mli \
-	    cpdfbookmarks.mli cpdfpage.mli cpdfaddtext.mli cpdfimage.mli \
-	    cpdffont.mli cpdftype.mli cpdftexttopdf.mli cpdftoc.mli \
-	    cpdfpad.mli cpdfocg.mli cpdfsqueeze.mli cpdfdraft.mli \
-            cpdfspot.mli cpdfpagelabels.mli cpdfcreate.mli cpdfannot.mli \
-	    cpdfxobject.mli cpdfimpose.mli cpdftweak.mli cpdfcommand.mli
+DOC_FILES = $(foreach x,$(DOC),$(x).mli )
 
 install : libinstall
 
diff --git a/cpdfembed.ml b/cpdfembed.ml
new file mode 100644
index 0000000..de9978d
--- /dev/null
+++ b/cpdfembed.ml
@@ -0,0 +1,144 @@
+(* Truetype font embedding example *)
+open Pdfutil
+
+(* For the first stage of our embedder, we are only allowing standard encodings, and we don't actually subset.
+  a) Get a list of Unicode codepoints;
+  b) See which of them are in the glyph list;
+  c) See which of those are in (StdEncoding|MacRomanEncoding|WinAnsiEncoding), and get their codes;
+  d) Build a font to do just those;
+  e) We put question marks for any character not in the encoding
+  (* FUTURE *)
+  1) Actually subset the font to save size
+  2) Allow characters not in the standard encodings by builing one or more secondary subsets *)
+
+(* UTF8 Input text *)
+let text = "Noto Sans Black Àë"
+let encoding = Pdftext.MacRomanEncoding
+
+let unicodepoints = Pdftext.codepoints_of_utf8 text
+
+let glyphlist_table = Pdfglyphlist.reverse_glyph_hashes ()
+  
+let encoding_table = Pdftext.reverse_table_of_encoding encoding
+
+let () =
+  iter
+    (fun u ->
+       Printf.printf "unicode %i --> " u;
+       let glyphname = Hashtbl.find glyphlist_table [u] in
+         Printf.printf "glyph name %s --> " glyphname;
+         let pdfcode = Hashtbl.find encoding_table glyphname in
+         Printf.printf "pdf code %i\n" pdfcode)
+    unicodepoints
+ 
+let pdfcode_of_unicode_codepoint u =
+  try
+    Some (Hashtbl.find encoding_table (Hashtbl.find glyphlist_table [u]))
+  with
+    Not_found -> None
+
+let tj_text =
+  implode
+    (map
+      (fun x -> match pdfcode_of_unicode_codepoint x with Some c -> char_of_int c | None -> '?')
+      unicodepoints)
+
+let calc_accepted_unicodepoints codepoints =
+  setify
+    (option_map
+       (fun u -> match pdfcode_of_unicode_codepoint u with Some _ -> Some u | None -> None)
+       codepoints)
+
+let accepted_unicodepoints =
+  map
+    (fun u -> (u, pdfcode_of_unicode_codepoint u))
+    (calc_accepted_unicodepoints unicodepoints)
+
+let contents_of_file filename =
+  let ch = open_in_bin filename in
+  let s = really_input_string ch (in_channel_length ch) in
+    close_in ch;
+    s
+
+let fontname = "NotoSans-Black"
+let fontstr = contents_of_file (fontname ^ ".ttf")
+
+let f =
+  Cpdftruetype.parse ~subset:accepted_unicodepoints (Pdfio.bytes_of_string fontstr)
+
+let contents =
+  "1 0 0 1 50 770 cm BT/TT1 36 Tf(" ^ tj_text ^ ")Tj ET"
+
+let widths =
+  Pdf.Array (map (fun x -> Pdf.Integer x) (Array.to_list f.Cpdftruetype.widths))
+
+let fontnum = ref 0
+
+let basename () =
+  incr fontnum;
+  "AAAAA" ^ string_of_char (char_of_int (!fontnum + 65))
+
+let name_1 = basename ()
+
+let string_of_encoding = function
+  | Pdftext.WinAnsiEncoding -> "/WinAnsiEncoding"
+  | Pdftext.MacRomanEncoding -> "/MacRomanEncoding"
+  | Pdftext.StandardEncoding -> "/StandardEncoding"
+  | _ -> failwith "unknown encoding"
+
+let font =
+  Pdf.add_dict_entry
+    (Pdfread.parse_single_object
+      (Printf.sprintf "<</Type/Font/Subtype/TrueType/BaseFont/%s+%s/FontDescriptor 6 0 R/Encoding%s/FirstChar %i/LastChar %i>>" name_1 fontname (string_of_encoding encoding) f.Cpdftruetype.firstchar f.Cpdftruetype.lastchar))
+    "/Widths"
+    widths
+
+let fontdescriptor =
+  Pdfread.parse_single_object
+    (Printf.sprintf "<</Type/FontDescriptor/FontName/%s+%s/Flags %i/FontBBox[%i %i %i %i]/ItalicAngle %i/Ascent %i/Descent %i/CapHeight %i/StemV %i/XHeight %i/AvgWidth %i/MaxWidth %i/FontFile2 7 0 R>>"
+    name_1 fontname f.Cpdftruetype.flags f.Cpdftruetype.minx f.Cpdftruetype.miny f.Cpdftruetype.maxx f.Cpdftruetype.maxy f.Cpdftruetype.italicangle
+    f.Cpdftruetype.ascent f.Cpdftruetype.descent f.Cpdftruetype.capheight f.Cpdftruetype.stemv f.Cpdftruetype.xheight f.Cpdftruetype.avgwidth f.Cpdftruetype.maxwidth)
+
+let fontfile =
+  let len = String.length fontstr in
+  Pdf.Stream
+    {contents =
+       (Pdf.Dictionary [("/Length", Pdf.Integer len); ("/Length1", Pdf.Integer len)],
+        Pdf.Got (Pdfio.bytes_of_string fontstr))}
+
+let objects =
+  [(1, Pdfread.parse_single_object "<</Type/Catalog/Pages 2 0 R>>");
+   (2, Pdfread.parse_single_object "<</Type/Pages/Kids[3 0 R]/Count 1>>");
+   (3, Pdfread.parse_single_object "<</Type/Page/Resources<</Font <</TT1 5 0 R>>>>/Parent 2 0 R/MediaBox[0 0 595 842]/Rotate 0/Contents[4 0 R]>>");
+   (4, Pdf.Stream
+         {contents = (Pdf.Dictionary [("/Length", Pdf.Integer (String.length contents))],
+                     (Pdf.Got (Pdfio.bytes_of_string contents)))});
+   (5, font);
+   (6, fontdescriptor);
+   (7, fontfile);
+  ]
+
+let root = 1
+
+let trailerdict =
+  Pdfread.parse_single_object (Printf.sprintf "<</Root 1 0 R/Size %i>>" (length objects + 1))
+
+let pdf =
+  let pdf =
+   {Pdf.major = 2;
+    Pdf.minor = 0;
+    Pdf.root = root;
+    Pdf.objects =
+      {Pdf.maxobjnum = 0;
+       Pdf.parse = None;
+       Pdf.pdfobjects = Pdf.pdfobjmap_empty ();
+       Pdf.object_stream_ids = null_hash ()};
+    Pdf.trailerdict = trailerdict;
+    Pdf.was_linearized = false;
+    Pdf.saved_encryption = None}
+  in
+    iter (Pdf.addobj_given_num pdf) objects;
+    pdf
+
+let () =
+  Pdfwrite.pdf_to_file pdf "subset.pdf"
diff --git a/cpdfembed.mli b/cpdfembed.mli
new file mode 100644
index 0000000..948db8f
--- /dev/null
+++ b/cpdfembed.mli
@@ -0,0 +1 @@
+(* *)
diff --git a/cpdftruetype.ml b/cpdftruetype.ml
new file mode 100644
index 0000000..c1c6799
--- /dev/null
+++ b/cpdftruetype.ml
@@ -0,0 +1,332 @@
+(* Truetype font parsing and subsetting *)
+open Pdfutil
+open Pdfio
+
+type t =
+  {flags : int;
+   minx : int;
+   miny : int;
+   maxx : int;
+   maxy : int;
+   italicangle : int;
+   ascent : int;
+   descent : int;
+   capheight : int;
+   stemv : int;
+   xheight : int;
+   avgwidth : int;
+   maxwidth : int;
+   firstchar : int;
+   lastchar : int;
+   widths : int array;
+   subset : Pdfio.bytes}
+
+let dbg = ref true (* text-based debug *)
+
+(* 32-bit signed fixed-point number (16.16) returned as two ints *)
+let read_fixed b =
+  let a = getval_31 b 16 in
+    let b = getval_31 b 16 in
+      a, b
+
+(* 16-bit unsigned integer *)
+let read_ushort b = getval_31 b 16
+
+(* 32-bit unsigned integer *)
+let read_ulong b = getval_32 b 32
+
+(* Signed byte *)
+let read_byte b = getval_31 b 8
+
+(* Signed short *)
+let read_short b = sign_extend 16 (getval_31 b 16)
+
+(* fword *)
+let read_fword = read_short
+let read_ufword = read_ushort
+
+(* f2dot14 - 2 bit signed integer part, 14 bit unsigned fraction *)
+let read_f2dot14 b =
+  let v = read_ushort b in
+    float_of_int (sign_extend 2 (v lsr 14)) +. (float_of_int (v land 0x3FFF) /. 16384.)
+
+(* discard n bytes *)
+let discard_bytes b n =
+  for x = 1 to n do ignore (getval_31 b 8) done
+
+let string_of_tag t =
+  Printf.sprintf "%c%c%c%c"
+    (char_of_int (i32toi (Int32.shift_right t 24)))
+    (char_of_int (i32toi (Int32.logand 0x000000FFl (Int32.shift_right t 16))))
+    (char_of_int (i32toi (Int32.logand 0x000000FFl (Int32.shift_right t 8))))
+    (char_of_int (i32toi (Int32.logand 0x000000FFl t)))
+
+let read_format_6_encoding_table b =
+  let firstCode = read_ushort b in
+  let entryCount = read_ushort b in
+  let t = null_hash () in
+    (* FIXME: This format can address glyphs > 255, but we don't support that
+    elsewhere yet --- but we read the whole format table nonethless *)
+    try
+      for x = firstCode to firstCode + entryCount - 1 do
+        Hashtbl.add t x (read_ushort b)
+      done;
+      t
+    with
+      e -> failwith ("bad format 6 table: " ^ Printexc.to_string e ^ "\n")
+
+(* fixme might need indexToLocFormat here, to undo the "clever" formula. *)
+let read_format_4_encoding_table b =
+  let t = null_hash () in
+  let segCountX2 = read_ushort b in
+  let segCount = segCountX2 / 2 in
+  let searchRange = read_ushort b in
+  let entrySelector = read_ushort b in
+  let rangeShift = read_ushort b in
+  let endCodes = Array.init segCount (fun _ -> read_ushort b) in
+  let _ (* reservedPad *) = read_ushort b in
+  let startCodes = Array.init segCount (fun _ -> read_ushort b) in
+  let idDelta = Array.init segCount (fun _ -> read_ushort b) in
+  let idRangeOffset = Array.init segCount (fun _ -> read_ushort b) in
+    if !dbg then
+    begin
+    Printf.printf "segCount = %i, searchRange = %i, entrySelector = %i, rangeShift = %i\n" segCount searchRange entrySelector rangeShift;
+    Printf.printf "endCodes\n";
+    print_ints (Array.to_list endCodes);
+    Printf.printf "startCodes\n";
+    print_ints (Array.to_list startCodes);
+    Printf.printf "idDelta\n";
+    print_ints (Array.to_list idDelta);
+    Printf.printf "idRangeOffset\n";
+    print_ints (Array.to_list idRangeOffset);
+    end;
+    for seg = 0 to segCount - 1 do
+      let ec = endCodes.(seg) in
+      let sc = startCodes.(seg) in
+      let del = idDelta.(seg) in
+      let ro = idRangeOffset.(seg) in
+        for c = sc to ec do
+          if ro = 0 then
+            Hashtbl.add t c ((c + del) mod 65536)
+          else
+            let sum = (c - sc) + del in
+              ()
+        done
+    done;
+    t
+
+let read_encoding_table fmt length version b =
+  match fmt with
+  | 0 ->
+      let t = null_hash () in
+        for x = 0 to 255 do Hashtbl.add t x (read_byte b) done;
+        t
+  | 4 -> read_format_4_encoding_table b
+  | 6 -> read_format_6_encoding_table b
+  | n -> raise (Pdf.PDFError "read_encoding_table: format %i not known\n%!")
+
+let read_loca_table indexToLocFormat numGlyphs b =
+  let fix_empties arr =
+    for x = 1 to Array.length arr - 1 do
+      if arr.(x) = arr.(x - 1) then arr.(x - 1) <- -1l
+    done;
+    if arr <> [||] then arr.(Array.length arr - 1) <- -1l
+  in
+    match indexToLocFormat with
+    | 0 ->
+        let arr = Array.init (numGlyphs + 1) (function _ -> i32ofi (read_ushort b * 2)) in
+          fix_empties arr; arr
+    | 1 ->
+        let arr = Array.init (numGlyphs + 1) (function _ -> read_ulong b) in
+          fix_empties arr; arr
+    | _ -> raise (Pdf.PDFError "Unknown indexToLocFormat in read_loca_table")
+
+let read_os2_table b blength =
+  let version = read_ushort b in
+  if !dbg then Printf.printf "OS/2 table blength = %i bytes, version number = %i\n" blength version;
+  if version < 2 then failwith "read_os2_table: version number too low" else
+  let xAvgCharWidth = read_short b in
+  discard_bytes b 64; (* discard 14 entries usWeightClass...fsLastCharIndex *)
+  (* -- end of original OS/2 Version 0 Truetype table. Must check length before reading now. *)
+  let sTypoAscender = read_short b in
+  let sTypoDescender = read_short b in
+  discard_bytes b 6; (* discard sTypoLineGap...usWinDescent *)
+  (* -- end of OpenType version 0 table *)
+  discard_bytes b 8; (* discard ulCodePageRange1, ulCodePageRange2 *)
+  (* -- end of OpenType version 1 table *)
+  let sxHeight = read_short b in
+  let sCapHeight = read_short b in
+    (sTypoAscender, sTypoDescender, sCapHeight, sxHeight, xAvgCharWidth)
+
+let read_post_table b =
+  discard_bytes b 4; (* discard version *)
+  let italicangle, n = read_fixed b in
+    italicangle
+
+(* Eventually:
+Set bit 6 for non symbolic. (nb bit 1 is actualy bit 0 etc.)
+Set bit 7 if italicangle <> 0
+Set bit 2 if serif ?
+Set bit 1 if fixed pitch (calculate from widths) *)
+let calculate_flags italicangle =
+  let italic = if italicangle <> 0 then 1 else 0 in 
+    32 lor italic lsl 6
+
+let calculate_limits subset =
+  if subset = [] then (0, 255) else
+    extremes (sort compare subset)
+
+let calculate_stemv () = 80
+
+let read_hhea_table b =
+  discard_bytes b 34;
+  read_ushort b (* numOfLongHorMetrics *)
+
+let read_hmtx_table numOfLongHorMetrics b =
+  Array.init
+    numOfLongHorMetrics
+    (fun _ -> let r = read_ushort b in ignore (read_short b); r)
+
+let calculate_widths firstchar lastchar subset (cmapdata : (int, int) Hashtbl.t) (hmtxdata : int array) =
+  if lastchar < firstchar then failwith "lastchar < firschar" else
+  List.iter (fun (a, b) -> Printf.printf "%i -> %i\n" a b) (sort compare (list_of_hashtbl cmapdata));
+  Array.init
+    (lastchar - firstchar + 1)
+    (fun pos ->
+       let code = pos + firstchar in
+         Printf.printf "code %i --> " code;
+       if not (mem code subset) then 0 else
+       try
+         let glyphnum = Hashtbl.find cmapdata code in
+           Printf.printf "glyph number %i --> " glyphnum;
+           let width = hmtxdata.(glyphnum) in
+             Printf.printf "width %i\n" width;
+             width
+       with e -> Printf.printf "no width for %i\n" code; 0)
+
+let calculate_maxwidth hmtxdata =
+  hd (sort (fun a b -> compare b a) (Array.to_list hmtxdata))
+
+let parse ?(subset=[]) data =
+  let subset = map fst subset in
+  let mk_b byte_offset = bitbytes_of_input (let i = input_of_bytes data in i.seek_in byte_offset; i) in
+  let b = mk_b 0 in
+  let major, minor = read_fixed b in
+    if !dbg then Printf.printf "Truetype font version %i.%i\n" major minor;
+    let numTables = read_ushort b in
+    let searchRange = read_ushort b in
+    let entrySelector = read_ushort b in
+    let rangeShift = read_ushort b in
+      if !dbg then Printf.printf "numTables = %i, searchRange = %i, entrySelector = %i, rangeShift = %i\n"
+        numTables searchRange entrySelector rangeShift;
+      let tables = ref [] in
+        for x = 1 to numTables do
+          let tag = read_ulong b in
+          let checkSum = read_ulong b in
+          let offset = read_ulong b in
+          let ttlength = read_ulong b in
+            if !dbg then Printf.printf "tag = %li = %s, checkSum = %li, offset = %li, ttlength = %li\n"
+            tag (string_of_tag tag) checkSum offset ttlength;
+            tables =| (tag, checkSum, offset, ttlength);
+        done;
+        let os2 =
+          match keep (function (t, _, _, _) -> string_of_tag t = "OS/2") !tables with
+          | (_, _, o, l)::_ -> Some (o, l)
+          | [] -> None
+        in
+        let ascent, descent, capheight, xheight, avgwidth =
+          match os2 with
+          | None -> raise (Pdf.PDFError "No os/2 table found in truetype font")
+          | Some (o, l) -> let b = mk_b (i32toi o) in read_os2_table b (i32toi l)
+        in
+        let italicangle =
+          match keep (function (t, _, _, _) -> string_of_tag t = "post") !tables with
+          | (_, _, o, _)::_ -> read_post_table (mk_b (i32toi o))
+          | _ -> 0
+        in
+        if !dbg then
+          Printf.printf "ascent %i descent %i capheight %i xheight %i avgwidth %i\n"
+            ascent descent capheight xheight avgwidth;
+        let cmap =
+          match keep (function (t, _, _, _) -> string_of_tag t = "cmap") !tables with
+          | (_, _, o, l)::_ -> Some (o, l)
+          | [] -> None
+        in
+        let glyphcodes = ref (null_hash ()) in
+          begin match cmap with
+          | None ->
+              let t = null_hash () in
+                for x = 0 to 255 do Hashtbl.add t x x done;
+                glyphcodes := t
+          | Some (cmapoffset, cmaplength) -> 
+              let b = mk_b (i32toi cmapoffset) in
+                let cmap_version = read_ushort b in
+                let num_encoding_tables = read_ushort b in
+                  if !dbg then Printf.printf "cmap version %i. There are %i encoding tables\n"
+                    cmap_version num_encoding_tables;
+                  for x = 1 to num_encoding_tables do
+                    let platform_id = read_ushort b in
+                    let encoding_id = read_ushort b in
+                    let subtable_offset = read_ulong b in
+                      if !dbg then Printf.printf "subtable %i. platform_id = %i, encoding_id = %i, subtable_offset = %li\n"
+                        x platform_id encoding_id subtable_offset;
+                      let b = mk_b (i32toi cmapoffset + i32toi subtable_offset) in
+                        let fmt = read_ushort b in
+                        let lngth = read_ushort b in
+                        let version = read_ushort b in
+                          if !dbg then Printf.printf "subtable has format %i, length %i, version %i\n" fmt lngth version;
+                          let got_glyphcodes = read_encoding_table fmt length version b in
+                            glyphcodes := got_glyphcodes
+                  done;
+          end;
+          let maxpoffset, maxplength =
+            match keep (function (t, _, _, _) -> string_of_tag t = "maxp") !tables with
+            | (_, _, o, l)::_ -> o, l
+            | [] -> raise (Pdf.PDFError "No maxp table found in TrueType font")
+          in
+          let b = mk_b (i32toi maxpoffset) in
+            let major, minor = read_fixed b in
+            let numGlyphs = read_ushort b in
+              if !dbg then Printf.printf "maxp table version %i.%i: This font has %i glyphs\n" major minor numGlyphs;
+          let headoffset, headlength =
+            match keep (function (t, _, _, _) -> string_of_tag t = "head") !tables with
+            | (_, _, o, l)::_ -> o, l
+            | [] -> raise (Pdf.PDFError "No maxp table found in TrueType font")
+          in
+            let b = mk_b (i32toi headoffset) in
+              discard_bytes b 36;
+              let minx = read_fword b in
+              let miny = read_fword b in
+              let maxx = read_fword b in
+              let maxy = read_fword b in
+              discard_bytes b 6;
+              let indexToLocFormat = read_short b in
+              let _ (*glyphDataFormat*) = read_short b in
+                if !dbg then Printf.printf "head table: indexToLocFormat is %i\n" indexToLocFormat;
+                if !dbg then Printf.printf "box %i %i %i %i\n" minx miny maxx maxy;
+          let locaoffset, localength =
+            match keep (function (t, _, _, _) -> string_of_tag t = "loca") !tables with
+            | (_, _, o, l)::_ -> o, l
+            | [] -> raise (Pdf.PDFError "No loca table found in TrueType font")
+          in
+            let flags = calculate_flags italicangle in
+            let firstchar, lastchar = calculate_limits subset in
+            let numOfLongHorMetrics =
+              match keep (function (t, _, _, _) -> string_of_tag t = "hhea") !tables with
+              | (_, _, o, l)::_ -> let b = mk_b (i32toi o) in read_hhea_table b
+              | _ -> 0
+            in
+            let hmtxdata =
+              match keep (function (t, _, _, _) -> string_of_tag t = "hmtx") !tables with
+              | (_, _, o, _)::_ -> read_hmtx_table numOfLongHorMetrics (mk_b (i32toi o))
+              | [] -> raise (Pdf.PDFError "No hmtx table found in TrueType font")
+            in
+            let widths = calculate_widths firstchar lastchar subset !glyphcodes hmtxdata in
+            let maxwidth = calculate_maxwidth hmtxdata in
+            let stemv = calculate_stemv () in
+            let b = mk_b (i32toi locaoffset) in
+            let offsets = read_loca_table indexToLocFormat numGlyphs b in
+            let subset = data in
+              {flags; minx; miny; maxx; maxy; italicangle; ascent; descent;
+              capheight; stemv; xheight; avgwidth; maxwidth; firstchar; lastchar;
+              widths; subset}
diff --git a/cpdftruetype.mli b/cpdftruetype.mli
new file mode 100644
index 0000000..cb5142f
--- /dev/null
+++ b/cpdftruetype.mli
@@ -0,0 +1,24 @@
+(* Parse a TrueType font *)
+type t =
+  {flags : int;
+   minx : int;
+   miny : int;
+   maxx : int;
+   maxy : int;
+   italicangle : int;
+   ascent : int;
+   descent : int;
+   capheight : int;
+   stemv : int;
+   xheight : int;
+   avgwidth : int;
+   maxwidth : int;
+   firstchar : int;
+   lastchar : int;
+   widths : int array;
+   subset : Pdfio.bytes}
+
+(* Parse the font, given the list of Unicode codepoints required for the subset
+   and optionally their PDF codepoint too. Returns the information required for
+   embedding this font in a PDF. *)
+val parse : ?subset:(int * int option) list -> Pdfio.bytes -> t