cpdf-source/cpdftruetype.ml

350 lines
14 KiB
OCaml
Raw Normal View History

2022-09-11 15:52:08 +02:00
(* Truetype font parsing and subsetting *)
open Pdfutil
open Pdfio
(* The result of [parse]: the metrics needed to describe a TrueType font,
   plus the font data itself. Values read from the font in design units are
   converted with [pdf_unit] before being stored here. *)
type t =
{flags : int; (* font flags, as computed by [calculate_flags] *)
minx : int; (* bounding box, read from the 'head' table *)
miny : int;
maxx : int;
maxy : int;
italicangle : int; (* integer part of the 'post' table italic angle; 0 if there is no 'post' table *)
ascent : int; (* sTypoAscender from the OS/2 table *)
descent : int; (* sTypoDescender from the OS/2 table *)
capheight : int; (* sCapHeight from the OS/2 table; 0 when the table version is below 2 *)
stemv : int; (* result of [calculate_stemv], which currently always yields 0 *)
xheight : int; (* sxHeight from the OS/2 table; 0 when the table version is below 2 *)
avgwidth : int; (* xAvgCharWidth from the OS/2 table *)
maxwidth : int; (* largest advance width found in the 'hmtx' table *)
firstchar : int; (* lowest code in the requested subset; 0 when no subset is given *)
lastchar : int; (* highest code in the requested subset; 255 when no subset is given *)
widths : int array; (* one width per code from firstchar to lastchar inclusive *)
subset : Pdfio.bytes} (* font program; [parse] currently stores the whole input data unchanged *)
2022-09-19 15:30:32 +02:00
(* When set, the parsing functions print a textual trace to standard output. *)
let dbg : bool ref = ref false
2022-09-11 15:52:08 +02:00
2022-09-28 14:04:20 +02:00
(* Four-character tags of the TrueType tables listed as required. Note that
   the "cvt " tag carries a trailing space. *)
let required_tables =
  [ "head";
    "hhea";
    "loca";
    "cmap";
    "maxp";
    "cvt ";
    "glyf";
    "prep";
    "hmtx";
    "fpgm" ]
2022-09-11 15:52:08 +02:00
(* 32-bit signed fixed-point number (16.16), returned as two ints: the
   integer part, then the fractional part.

   The integer part is the high 16 bits and carries the sign, so it is
   sign-extended in the same way as [read_short]. Without this a negative
   value such as an italic angle of -12 would be returned as 65524. The
   fractional part is the low 16 bits, returned as read. *)
let read_fixed b =
  let whole = sign_extend 16 (getval_31 b 16) in
  let fraction = getval_31 b 16 in
  whole, fraction
(* Read the next 16 bits from [b] as an unsigned integer. *)
let read_ushort b =
  let bits = 16 in
  getval_31 b bits
(* Read the next 32 bits from [b]. [getval_32] is used because the value may
   not fit the 31-bit reader; callers in this file treat the result as an
   int32. *)
let read_ulong b =
  let bits = 32 in
  getval_32 b bits
(* Read the next 8 bits from [b] as a single byte. No sign extension is
   applied here (compare [read_short]), so the value is returned exactly as
   the bit reader delivers it. *)
let read_byte b =
  let bits = 8 in
  getval_31 b bits
(* Read the next 16 bits from [b] and sign-extend them, giving a signed
   short. *)
let read_short b =
  let raw = getval_31 b 16 in
  sign_extend 16 raw
(* FWORD: a signed 16-bit quantity in font design units. *)
let read_fword b = read_short b

(* UFWORD: an unsigned 16-bit quantity in font design units. *)
let read_ufword b = read_ushort b
(* F2DOT14: the top 2 bits are a signed integer part, the low 14 bits an
   unsigned fraction counted in 1/16384ths. The result is their sum as a
   float. *)
let read_f2dot14 b =
  let raw = read_ushort b in
  let whole = sign_extend 2 (raw lsr 14) in
  let fraction = raw land 0x3FFF in
  float_of_int whole +. float_of_int fraction /. 16384.
(* Skip over [n] bytes of [b] by reading them one at a time and throwing the
   values away. Does nothing when [n] is zero or negative. *)
let discard_bytes b n =
  for _i = 1 to n do
    let _skipped = getval_31 b 8 in
    ()
  done
2022-09-26 20:12:44 +02:00
(* Convert [x], a measurement in font design units, to the 1000-units-per-em
   scale, rounding to the nearest integer.

   Rounding is done with [floor (v +. 0.5)]. Truncating [v +. 0.5] with
   [int_of_float] alone rounds towards zero, which moves every negative
   value (descents, negative bounding box coordinates) up by one: -200
   would become -199. *)
let pdf_unit unitsPerEm x =
  let scaled = float_of_int x *. 1000. /. float_of_int unitsPerEm in
  int_of_float (floor (scaled +. 0.5))
2022-09-26 20:12:44 +02:00
2022-09-11 15:52:08 +02:00
(* Render a 32-bit table tag as its four-character string, most significant
   byte first.

   Every byte, including the top one, is extracted with a logical shift and
   masked to 0..255. An arithmetic shift of a tag whose high bit is set
   gives a negative number, which cannot be converted to a character, so a
   malformed tag would otherwise raise instead of simply failing to match
   any known table name. *)
let string_of_tag t =
  let byte shift =
    Char.chr (Int32.to_int (Int32.logand 0xFFl (Int32.shift_right_logical t shift)))
  in
  Printf.sprintf "%c%c%c%c" (byte 24) (byte 16) (byte 8) (byte 0)
(* Read a format 6 cmap subtable body from [b]: a first character code, an
   entry count, then one glyph index per consecutive code. Returns a hash
   table from character code to glyph index. Any exception raised while
   filling the table is re-raised as [Failure] with a descriptive message. *)
let read_format_6_encoding_table b =
  let first_code = read_ushort b in
  let entry_count = read_ushort b in
  let table = null_hash () in
  let rec fill code remaining =
    if remaining > 0 then begin
      Hashtbl.add table code (read_ushort b);
      fill (code + 1) (remaining - 1)
    end
  in
  try
    fill first_code entry_count;
    table
  with
    e -> failwith ("bad format 6 table: " ^ Printexc.to_string e ^ "\n")
(* Read a format 4 (segment mapping) cmap subtable body from [b] and return
   a hash table from character code to glyph index.

   The subtable holds [segCount] segments, each described by an end code, a
   start code, an idDelta and an idRangeOffset. For a code [c] in segment
   [seg]:

   - if idRangeOffset is 0, the glyph is [(c + idDelta) mod 65536];
   - otherwise the glyph is looked up in the glyphIdArray, which follows the
     idRangeOffset array directly. The format expresses the position as a
     byte offset from the segment's own idRangeOffset entry; as an index
     into the glyphIdArray that is
       [seg + idRangeOffset / 2 + (c - startCode) - segCount].
     A stored value of 0 means "missing glyph" and is kept as glyph 0; any
     other value has idDelta added, modulo 65536.

   The subtable length is not available here, so only as many glyphIdArray
   entries are read as the segments actually refer to. Code 0xFFFF is
   skipped on the idRangeOffset path: it belongs to the closing segment of
   the table, whose idRangeOffset is not always meaningful. *)
let read_format_4_encoding_table b =
  let t = null_hash () in
  let segCountX2 = read_ushort b in
  let segCount = segCountX2 / 2 in
  let searchRange = read_ushort b in
  let entrySelector = read_ushort b in
  let rangeShift = read_ushort b in
  let endCodes = Array.init segCount (fun _ -> read_ushort b) in
  let _ (* reservedPad *) = read_ushort b in
  let startCodes = Array.init segCount (fun _ -> read_ushort b) in
  let idDelta = Array.init segCount (fun _ -> read_ushort b) in
  let idRangeOffset = Array.init segCount (fun _ -> read_ushort b) in
  if !dbg then
    begin
      Printf.printf "segCount = %i, searchRange = %i, entrySelector = %i, rangeShift = %i\n" segCount searchRange entrySelector rangeShift;
      Printf.printf "endCodes\n";
      print_ints (Array.to_list endCodes);
      Printf.printf "startCodes\n";
      print_ints (Array.to_list startCodes);
      Printf.printf "idDelta\n";
      print_ints (Array.to_list idDelta);
      Printf.printf "idRangeOffset\n";
      print_ints (Array.to_list idRangeOffset);
    end;
  (* Index into the glyphIdArray for code [c] of segment [seg]. *)
  let glyph_array_index seg c =
    seg + idRangeOffset.(seg) / 2 + (c - startCodes.(seg)) - segCount
  in
  (* How many glyphIdArray entries the segments refer to. *)
  let needed = ref 0 in
  for seg = 0 to segCount - 1 do
    if idRangeOffset.(seg) <> 0
       && startCodes.(seg) <> 0xFFFF
       && endCodes.(seg) >= startCodes.(seg)
    then
      needed := max !needed (glyph_array_index seg endCodes.(seg) + 1)
  done;
  let glyphIdArray = Array.init !needed (fun _ -> read_ushort b) in
  for seg = 0 to segCount - 1 do
    let ec = endCodes.(seg) in
    let sc = startCodes.(seg) in
    let del = idDelta.(seg) in
    let ro = idRangeOffset.(seg) in
    for c = sc to ec do
      if ro = 0 then
        Hashtbl.add t c ((c + del) mod 65536)
      else if c <> 0xFFFF then
        begin
          let i = glyph_array_index seg c in
          (* Ignore references that fall outside the entries we read. *)
          if i >= 0 && i < Array.length glyphIdArray then
            match glyphIdArray.(i) with
            | 0 -> Hashtbl.add t c 0
            | g -> Hashtbl.add t c ((g + del) mod 65536)
        end
    done
  done;
  t
(* Read the body of a cmap subtable of format [fmt] from [b], returning a
   hash table from character code to glyph index. Formats 0, 4 and 6 are
   supported. [length] and [version] are accepted for the caller's
   convenience but are not used.

   Raises [Pdf.PDFError] for any other format. The message now contains the
   actual format number; previously the format string was never applied, so
   the text held a literal "%i". *)
let read_encoding_table fmt length version b =
  match fmt with
  | 0 ->
      (* Format 0: a flat array of 256 single-byte glyph indices. *)
      let t = null_hash () in
      for x = 0 to 255 do Hashtbl.add t x (read_byte b) done;
      t
  | 4 -> read_format_4_encoding_table b
  | 6 -> read_format_6_encoding_table b
  | n ->
      raise
        (Pdf.PDFError
           (Printf.sprintf "read_encoding_table: format %i not known\n" n))
(* Read the 'loca' table from [b]: [numGlyphs + 1] offsets, returned as an
   int32 array. With [indexToLocFormat] 0 each entry is a 16-bit value that
   is doubled; with 1 each entry is a 32-bit value used as is. Any other
   format raises [Pdf.PDFError] before anything is read.

   After reading, an entry equal to its successor is replaced by -1, and the
   final entry is always replaced by -1. *)
let read_loca_table indexToLocFormat numGlyphs b =
  let read_offset =
    match indexToLocFormat with
    | 0 -> (fun () -> i32ofi (read_ushort b * 2))
    | 1 -> (fun () -> read_ulong b)
    | _ -> raise (Pdf.PDFError "Unknown indexToLocFormat in read_loca_table")
  in
  let offsets = Array.init (numGlyphs + 1) (fun _ -> read_offset ()) in
  let count = Array.length offsets in
  for i = 1 to count - 1 do
    if offsets.(i) = offsets.(i - 1) then offsets.(i - 1) <- -1l
  done;
  if count > 0 then offsets.(count - 1) <- -1l;
  offsets
2022-09-26 20:12:44 +02:00
(* Read the metrics we need from an OS/2 table. [b] is positioned at the
   start of the table and [blength] is the table length in bytes. Returns
   (sTypoAscender, sTypoDescender, sCapHeight, sxHeight, xAvgCharWidth), each
   converted with [pdf_unit]. The typographic ascender and descender are 0
   unless [blength] exceeds 68; the x-height and cap height are 0 when the
   table version is below 2. *)
let read_os2_table unitsPerEm b blength =
  (* Next signed 16-bit field, converted from design units. *)
  let scaled_short () = pdf_unit unitsPerEm (read_short b) in
  let version = read_ushort b in
  if !dbg then Printf.printf "OS/2 table blength = %i bytes, version number = %i\n" blength version;
  let xAvgCharWidth = scaled_short () in
  (* Skip the fields from usWeightClass up to and including fsLastCharIndex.
     That is the end of the original version 0 TrueType table, so the length
     is checked before reading further. *)
  discard_bytes b 64;
  let has_typo_metrics = blength > 68 in
  let sTypoAscender = if has_typo_metrics then scaled_short () else 0 in
  let sTypoDescender = if has_typo_metrics then scaled_short () else 0 in
  (* Skip sTypoLineGap..usWinDescent (6 bytes, ending the OpenType version 0
     table) and ulCodePageRange1/2 (8 bytes, ending the version 1 table). *)
  discard_bytes b 14;
  let has_v2_fields = version >= 2 in
  let sxHeight = if has_v2_fields then scaled_short () else 0 in
  let sCapHeight = if has_v2_fields then scaled_short () else 0 in
  (sTypoAscender, sTypoDescender, sCapHeight, sxHeight, xAvgCharWidth)
(* Read the italic angle from a 'post' table positioned at [b]: skip the
   4-byte version, read the following fixed-point number and return only its
   integer part. *)
let read_post_table b =
  discard_bytes b 4;
  fst (read_fixed b)
(* Compute the font flags word. Flag bits are numbered from 1 in the plan
   below, so "bit 6" is the value 1 lsl 5.

   Implemented now:
     bit 6 (nonsymbolic) is always set;
     bit 7 (italic) is set when [italicangle] is not zero.
   Still to do:
     bit 2 if the font is serif;
     bit 1 if the font is fixed pitch (to be derived from the widths). *)
let calculate_flags italicangle =
  let nonsymbolic = 1 lsl 5 in
  let italic = 1 lsl 6 in
  if italicangle = 0 then nonsymbolic else nonsymbolic lor italic
(* The (first, last) character codes covered by [subset]: the extremes of
   the sorted list, or (0, 255) when no subset was requested. *)
let calculate_limits subset =
  match subset with
  | [] -> (0, 255)
  | codes -> extremes (sort compare codes)
2022-09-26 20:12:44 +02:00
(* Stem width for the font descriptor. Nothing is measured yet: the result
   is always 0. *)
let calculate_stemv : unit -> int = fun () -> 0
2022-09-11 15:52:08 +02:00
(* Read numOfLongHorMetrics from an 'hhea' table positioned at [b]: it is
   the unsigned 16-bit field that follows the first 34 bytes. *)
let read_hhea_table b =
  let () = discard_bytes b 34 in
  let numOfLongHorMetrics = read_ushort b in
  numOfLongHorMetrics
(* Read [numOfLongHorMetrics] entries from an 'hmtx' table positioned at
   [b]. Each entry is an unsigned 16-bit value followed by a signed 16-bit
   value; the first is kept and the second is read only to step past it. *)
let read_hmtx_table numOfLongHorMetrics b =
  let read_entry _ =
    let advance = read_ushort b in
    let _bearing = read_short b in
    advance
  in
  Array.init numOfLongHorMetrics read_entry
2022-09-19 17:21:14 +02:00
(* Widths are looked up by unicode codepoint, not by the raw byte found in
   the PDF. Translate code [p] by looking it up in [encoding_table], then
   looking that result up in [glyphlist_table], and taking the first element
   of the list found there. Yields 0 if either lookup fails. *)
let unicode_codepoint_of_pdfcode encoding_table glyphlist_table p =
  try
    let key = Hashtbl.find encoding_table p in
    let codepoints = Hashtbl.find glyphlist_table key in
    hd codepoints
  with
    Not_found -> 0
2022-09-25 17:00:45 +02:00
(* Build the widths array for character codes [firstchar]..[lastchar]
   inclusive. Each code is translated to a unicode codepoint through
   [encoding], then to a glyph index through [cmapdata], then to an advance
   width through [hmtxdata], and finally converted with [pdf_unit].

   A code gets width 0 when a non-empty [subset] does not contain its
   codepoint, or when no glyph or width can be found for it.

   [hmtxdata] holds only the long horizontal metrics. The TrueType format
   gives every glyph whose index is beyond that array the advance width of
   the last entry (as in monospaced fonts), so the index is clamped rather
   than being allowed to fall off the end and yield 0.

   Raises [Failure] if [lastchar < firstchar]. *)
let calculate_widths unitsPerEm encoding firstchar lastchar subset (cmapdata : (int, int) Hashtbl.t) (hmtxdata : int array) =
  if lastchar < firstchar then failwith "lastchar < firschar";
  if !dbg then
    List.iter
      (fun (a, b) -> Printf.printf "%i -> %i\n" a b)
      (sort compare (list_of_hashtbl cmapdata));
  let encoding_table = Pdftext.table_of_encoding encoding in
  let glyphlist_table = Pdfglyphlist.glyph_hashes () in
  (* -1 when there are no metrics at all; the lookup below then fails and
     the width falls back to 0. *)
  let last_metric = Array.length hmtxdata - 1 in
  Array.init
    (lastchar - firstchar + 1)
    (fun pos ->
      let code = pos + firstchar in
      if !dbg then Printf.printf "code %i --> " code;
      let code = unicode_codepoint_of_pdfcode encoding_table glyphlist_table code in
      if !dbg then Printf.printf "unicode %i --> " code;
      if subset <> [] && not (mem code subset) then 0 else
        begin
          try
            let glyphnum = Hashtbl.find cmapdata code in
            if !dbg then Printf.printf "glyph number %i --> " glyphnum;
            let width = hmtxdata.(min glyphnum last_metric) in
            if !dbg then Printf.printf "width %i\n" width;
            pdf_unit unitsPerEm width
          with
            (* No cmap entry, or no usable metric. *)
            Not_found | Invalid_argument _ ->
              if !dbg then Printf.printf "no width for %i\n" code;
              0
        end)
2022-09-11 15:52:08 +02:00
2022-09-26 20:12:44 +02:00
(* The largest advance width in [hmtxdata], converted with [pdf_unit].

   The maximum is found with a single fold rather than by sorting the whole
   array. The fold starts from 0, so an empty array gives 0 instead of
   raising on the head of an empty list. The entries are read as unsigned
   values, so starting from 0 does not change the result for non-empty
   input. *)
let calculate_maxwidth unitsPerEm hmtxdata =
  pdf_unit unitsPerEm (Array.fold_left max 0 hmtxdata)
2022-09-11 15:52:08 +02:00
2022-09-19 17:21:14 +02:00
(* [parse ?subset data ~encoding] reads the TrueType font held in [data] and
   returns the metrics record [t].

   [subset] is a list of pairs of which only the first components are used:
   they select which codes get widths and determine firstchar and lastchar.
   When it is empty, codes 0..255 are covered. [encoding] is used to
   translate codes to unicode codepoints when computing widths.

   Tables are visited in this order: head, OS/2, post, cmap, maxp, loca
   (lookup only), hhea, hmtx, then the loca data. 'head', 'OS/2', 'maxp',
   'loca' and 'hmtx' are mandatory and [Pdf.PDFError] is raised when one is
   absent. A missing 'post' table gives italic angle 0, a missing 'cmap'
   gives an identity mapping for 0..255, and a missing 'hhea' gives zero
   horizontal metrics.

   The returned [subset] field is currently the whole of [data]. *)
let parse ?(subset=[]) data ~encoding =
  let subset = map fst subset in
  (* A fresh bit reader over [data], positioned at [byte_offset]. *)
  let mk_b byte_offset =
    bitbytes_of_input (let i = input_of_bytes data in i.seek_in byte_offset; i)
  in
  (* Offset table and table directory. *)
  let b = mk_b 0 in
  let major, minor = read_fixed b in
  if !dbg then Printf.printf "Truetype font version %i.%i\n" major minor;
  let numTables = read_ushort b in
  let searchRange = read_ushort b in
  let entrySelector = read_ushort b in
  let rangeShift = read_ushort b in
  if !dbg then
    Printf.printf "numTables = %i, searchRange = %i, entrySelector = %i, rangeShift = %i\n"
      numTables searchRange entrySelector rangeShift;
  let tables = ref [] in
  for _i = 1 to numTables do
    let tag = read_ulong b in
    let checkSum = read_ulong b in
    let offset = read_ulong b in
    let ttlength = read_ulong b in
    if !dbg then
      Printf.printf "tag = %li = %s, checkSum = %li, offset = %li, ttlength = %li\n"
        tag (string_of_tag tag) checkSum offset ttlength;
    tables =| (tag, checkSum, offset, ttlength)
  done;
  (* Offset and length of the table called [name], if the directory has one. *)
  let find_table name =
    match keep (function (t, _, _, _) -> string_of_tag t = name) !tables with
    | (_, _, o, l)::_ -> Some (o, l)
    | [] -> None
  in
  (* As [find_table], but the table is mandatory. *)
  let require_table name =
    match find_table name with
    | Some found -> found
    | None -> raise (Pdf.PDFError ("No " ^ name ^ " table found in TrueType font"))
  in
  (* head: units per em, bounding box and the loca format. *)
  let headoffset, _ = require_table "head" in
  let b = mk_b (i32toi headoffset) in
  discard_bytes b 18;
  let unitsPerEm = read_ushort b in
  discard_bytes b 16;
  let minx = pdf_unit unitsPerEm (read_fword b) in
  let miny = pdf_unit unitsPerEm (read_fword b) in
  let maxx = pdf_unit unitsPerEm (read_fword b) in
  let maxy = pdf_unit unitsPerEm (read_fword b) in
  discard_bytes b 6;
  let indexToLocFormat = read_short b in
  let _ (*glyphDataFormat*) = read_short b in
  if !dbg then Printf.printf "head table: indexToLocFormat is %i\n" indexToLocFormat;
  if !dbg then Printf.printf "box %i %i %i %i\n" minx miny maxx maxy;
  (* OS/2: vertical metrics and average width. *)
  let ascent, descent, capheight, xheight, avgwidth =
    match find_table "OS/2" with
    | None -> raise (Pdf.PDFError "No os/2 table found in truetype font")
    | Some (o, l) ->
        let b = mk_b (i32toi o) in
        read_os2_table unitsPerEm b (i32toi l)
  in
  (* post: italic angle, optional. *)
  let italicangle =
    match find_table "post" with
    | Some (o, _) -> read_post_table (mk_b (i32toi o))
    | None -> 0
  in
  if !dbg then
    Printf.printf "ascent %i descent %i capheight %i xheight %i avgwidth %i\n"
      ascent descent capheight xheight avgwidth;
  (* cmap: character code to glyph index. Every encoding subtable is read in
     turn and the last one read is the one kept. *)
  let glyphcodes = ref (null_hash ()) in
  begin match find_table "cmap" with
  | None ->
      let t = null_hash () in
      for x = 0 to 255 do Hashtbl.add t x x done;
      glyphcodes := t
  | Some (cmapoffset, _) ->
      let b = mk_b (i32toi cmapoffset) in
      let cmap_version = read_ushort b in
      let num_encoding_tables = read_ushort b in
      if !dbg then
        Printf.printf "cmap version %i. There are %i encoding tables\n"
          cmap_version num_encoding_tables;
      for x = 1 to num_encoding_tables do
        let platform_id = read_ushort b in
        let encoding_id = read_ushort b in
        let subtable_offset = read_ulong b in
        if !dbg then
          Printf.printf "subtable %i. platform_id = %i, encoding_id = %i, subtable_offset = %li\n"
            x platform_id encoding_id subtable_offset;
        (* The subtable gets its own reader, so the directory reader [b]
           stays positioned at the next encoding record. *)
        let sub = mk_b (i32toi cmapoffset + i32toi subtable_offset) in
        let fmt = read_ushort sub in
        let lngth = read_ushort sub in
        let version = read_ushort sub in
        if !dbg then
          Printf.printf "subtable has format %i, length %i, version %i\n" fmt lngth version;
        glyphcodes := read_encoding_table fmt lngth version sub
      done
  end;
  (* maxp: number of glyphs. *)
  let maxpoffset, _ = require_table "maxp" in
  let b = mk_b (i32toi maxpoffset) in
  let major, minor = read_fixed b in
  let numGlyphs = read_ushort b in
  if !dbg then
    Printf.printf "maxp table version %i.%i: This font has %i glyphs\n" major minor numGlyphs;
  let locaoffset, _ = require_table "loca" in
  let flags = calculate_flags italicangle in
  let firstchar, lastchar = calculate_limits subset in
  (* hhea and hmtx: advance widths. *)
  let numOfLongHorMetrics =
    match find_table "hhea" with
    | Some (o, _) -> read_hhea_table (mk_b (i32toi o))
    | None -> 0
  in
  let hmtxoffset, _ = require_table "hmtx" in
  let hmtxdata = read_hmtx_table numOfLongHorMetrics (mk_b (i32toi hmtxoffset)) in
  let widths =
    calculate_widths unitsPerEm encoding firstchar lastchar subset !glyphcodes hmtxdata
  in
  let maxwidth = calculate_maxwidth unitsPerEm hmtxdata in
  let stemv = calculate_stemv () in
  (* The glyph offsets are not used yet, but the table is still read so that
     an unknown indexToLocFormat is reported. *)
  let _offsets = read_loca_table indexToLocFormat numGlyphs (mk_b (i32toi locaoffset)) in
  let subset = data in
  {flags; minx; miny; maxx; maxy; italicangle; ascent; descent;
   capheight; stemv; xheight; avgwidth; maxwidth; firstchar; lastchar;
   widths; subset}