(* Copyright (c) 2007 Daniel C. Bünzli. All rights reserved.
   Distributed under the ISC license, see terms at the end of the file. *)
(* Aliases for the standard library's string and buffer modules and types.
   They are taken here because the names [String], [Buffer] and [string]
   are reused further down in this file. *)
module Std_string = String
module Std_buffer = Buffer
type std_string = string
type std_buffer = Buffer.t
(* Signature of the string implementation expected by the [Make] functor.
   The closing [end] of the signature is restored here. *)
module type String = sig
  (* The type for strings. *)
  type t

  (* The empty string. *)
  val empty : t

  (* Length of a string. *)
  val length : t -> int

  (* [append s s'] is the concatenation of [s] and [s']. *)
  val append : t -> t -> t

  (* Lowercased version of a string. *)
  val lowercase : t -> t

  (* [iter f s] applies [f] to the integers making up [s]; the functor
     feeds these integers to [Buffer.add_uchar], i.e. they are treated as
     Unicode code points. *)
  val iter : (int -> unit) -> t -> unit

  (* Conversion from a standard OCaml string. *)
  val of_string : std_string -> t

  (* [to_utf_8 f acc s] folds [f] over standard string chunks of [s],
     starting with [acc]. *)
  val to_utf_8 : ('a -> std_string -> 'a) -> 'a -> t -> 'a

  (* Total order on strings. *)
  val compare : t -> t -> int
end
(* Signature of the buffer implementation expected by the [Make] functor.
   The closing [end] of the signature is restored here. *)
module type Buffer = sig
  (* The type of the strings produced by the buffer. *)
  type string

  (* The type for buffers. *)
  type t

  (* Raised by an implementation that cannot grow any further; the functor
     reports it as [`Max_buffer_size]. *)
  exception Full

  (* [create n] is a fresh buffer; [n] is a size hint. *)
  val create : int -> t

  (* [add_uchar b u] appends the code point [u] to [b]. *)
  val add_uchar : t -> int -> unit

  (* Empties the buffer. *)
  val clear : t -> unit

  (* Current contents of the buffer as a string. *)
  val contents : t -> string

  (* Current length of the buffer. *)
  val length : t -> int
end
(* Output signature of the [Make] functor: streaming XML input and output
   over an abstract string type. The closing [end] is restored here. *)
module type S = sig
  (* The type for character sequences. *)
  type string

  (* Encodings an input can be told to expect. *)
  type encoding = [
    | `UTF_8 | `UTF_16 | `UTF_16BE | `UTF_16LE | `ISO_8859_1 | `US_ASCII ]

  (* Optional document type declaration text. *)
  type dtd = string option

  (* A name is a pair of strings; after namespace expansion the first
     component is a namespace name, the second a local name. *)
  type name = string * string
  type attribute = name * string
  type tag = name * attribute list

  (* Signals exchanged with the parser and the printer. *)
  type signal = [ `Dtd of dtd | `El_start of tag | `El_end | `Data of string ]

  (* Namespace names bound to the [xml] and [xmlns] prefixes. *)
  val ns_xml : string
  val ns_xmlns : string

  (* Input *)

  (* Line and column of an input position. *)
  type pos = int * int

  (* Input errors. *)
  type error = [
    | `Max_buffer_size
    | `Unexpected_eoi
    | `Malformed_char_stream
    | `Unknown_encoding of string
    | `Unknown_entity_ref of string
    | `Unknown_ns_prefix of string
    | `Illegal_char_ref of string
    | `Illegal_char_seq of string
    | `Expected_char_seqs of string list * string
    | `Expected_root_element ]

  (* Raised by the input functions, with the position of the error. *)
  exception Error of pos * error

  (* Human readable message for an error. *)
  val error_message : error -> string

  (* Byte sources. For [`String (pos, s)] reading starts at index [pos]
     of [s]; a [`Fun] source signals the end of input by raising
     [End_of_file]. *)
  type source = [
    | `Channel of in_channel
    | `String of int * std_string
    | `Fun of (unit -> int) ]

  (* The type for input abstractions. *)
  type input

  (* [make_input ~enc ~strip ~ns ~entity src] is an input reading from
     [src]. [ns] and [entity] are called for namespace prefixes and entity
     references that are not otherwise known. *)
  val make_input : ?enc:encoding option -> ?strip:bool ->
    ?ns:(string -> string option) ->
    ?entity: (string -> string option) -> source -> input

  (* Reads and returns the next signal. Raises [Error]. *)
  val input : input -> signal

  (* Reads a whole element or data signal and folds it with [el] and
     [data]. Raises [Invalid_argument] on any other first signal. *)
  val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) ->
    input -> 'a

  (* Same as [input_tree] but first reads the [`Dtd] signal. *)
  val input_doc_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) ->
    input -> (dtd * 'a)

  (* Returns the next signal without consuming it. *)
  val peek : input -> signal

  (* [true] when the end of input is reached. *)
  val eoi : input -> bool

  (* Current position of the input. *)
  val pos : input -> pos

  (* Output *)

  (* One level of a tree to output. *)
  type 'a frag = [ `El of tag * 'a list | `Data of string ]

  (* Byte destinations. *)
  type dest = [
    | `Channel of out_channel | `Buffer of std_buffer | `Fun of (int -> unit) ]

  (* The type for output abstractions. *)
  type output

  (* [make_output ~decl ~nl ~indent ~ns_prefix dest] is an output writing
     to [dest]. [ns_prefix] is called for namespace names that have no
     bound prefix. *)
  val make_output : ?decl:bool -> ?nl:bool -> ?indent:int option ->
    ?ns_prefix:(string -> string option) -> dest -> output

  (* Current element nesting depth of the output. *)
  val output_depth : output -> int

  (* Writes a signal. *)
  val output : output -> signal -> unit

  (* Writes the signals of a tree described by the fragment function. *)
  val output_tree : ('a -> 'a frag) -> output -> 'a -> unit

  (* Same as [output_tree] but first writes the [`Dtd] signal. *)
  val output_doc_tree : ('a -> 'a frag) -> output -> (dtd * 'a) -> unit
end
(* Unicode character lexers *)
(* Raised by the lexers below on an invalid byte or code unit sequence;
   the input functions turn it into [`Malformed_char_stream]. *)
exception Malformed (* for character stream, internal only. *)
(* Byte length of a UTF-8 encoded character, indexed by its first byte.
   A length of 0 marks a byte that cannot start a character. *)
let utf8_len =
  Array.init 256 (fun first_byte ->
      if first_byte <= 0x7F then 1        (* 0x00 - 0x7F *)
      else if first_byte <= 0xC1 then 0   (* 0x80 - 0xC1 *)
      else if first_byte <= 0xDF then 2   (* 0xC2 - 0xDF *)
      else if first_byte <= 0xEF then 3   (* 0xE0 - 0xEF *)
      else if first_byte <= 0xF4 then 4   (* 0xF0 - 0xF4 *)
      else 0)                             (* 0xF5 - 0xFF *)
(* [uchar_utf8 i] decodes one UTF-8 encoded code point, pulling bytes from
   [i]. Raises [Malformed] on an invalid sequence; whatever [i] raises
   (e.g. at the end of input) is propagated.
   The [end] tokens closing the nested matches are restored, and integers
   are compared with [<>] rather than physical inequality. *)
let uchar_utf8 i =
  let b0 = i () in
  begin match utf8_len.(b0) with
  | 0 -> raise Malformed
  | 1 -> b0
  | 2 ->
      let b1 = i () in
      if b1 lsr 6 <> 0b10 then raise Malformed else
      ((b0 land 0x1F) lsl 6) lor (b1 land 0x3F)
  | 3 ->
      let b1 = i () in
      let b2 = i () in
      if b2 lsr 6 <> 0b10 then raise Malformed else
      begin match b0 with
      (* Restricted second byte ranges exclude overlong forms (0xE0) and
         surrogates (0xED). *)
      | 0xE0 -> if b1 < 0xA0 || 0xBF < b1 then raise Malformed else ()
      | 0xED -> if b1 < 0x80 || 0x9F < b1 then raise Malformed else ()
      | _ -> if b1 lsr 6 <> 0b10 then raise Malformed else ()
      end;
      ((b0 land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F)
  | 4 ->
      let b1 = i () in
      let b2 = i () in
      let b3 = i () in
      if b3 lsr 6 <> 0b10 || b2 lsr 6 <> 0b10 then raise Malformed else
      begin match b0 with
      (* Restricted second byte ranges exclude overlong forms (0xF0) and
         values above 0x10FFFF (0xF4). *)
      | 0xF0 -> if b1 < 0x90 || 0xBF < b1 then raise Malformed else ()
      | 0xF4 -> if b1 < 0x80 || 0x8F < b1 then raise Malformed else ()
      | _ -> if b1 lsr 6 <> 0b10 then raise Malformed else ()
      end;
      ((b0 land 0x07) lsl 18) lor ((b1 land 0x3F) lsl 12) lor
      ((b2 land 0x3F) lsl 6) lor (b3 land 0x3F)
  | _ -> assert false
  end
(* Reads two bytes from [i] and combines them as a big endian 16-bit
   integer (first byte is the most significant one). *)
let int16_be i =
  let hi = i () in
  let lo = i () in
  lo lor (hi lsl 8)
(* Reads two bytes from [i] and combines them as a little endian 16-bit
   integer (first byte is the least significant one). *)
let int16_le i =
  let lo = i () in
  let hi = i () in
  lo lor (hi lsl 8)
(* [uchar_utf16 int16 i] decodes one UTF-16 encoded code point; [int16]
   reads one 16-bit code unit from the byte source [i].
   Raises [Malformed] on a lone low surrogate and on a high surrogate that
   is not followed by a low surrogate. The latter was previously accepted
   and silently combined into an arbitrary code point. *)
let uchar_utf16 int16 i =
  let c0 = int16 i in
  if c0 < 0xD800 || c0 > 0xDFFF then c0 else
  if c0 > 0xDBFF then raise Malformed else
  let c1 = int16 i in
  (* The second unit of a pair must be a low surrogate. *)
  if c1 < 0xDC00 || c1 > 0xDFFF then raise Malformed else
  (((c0 land 0x3FF) lsl 10) lor (c1 land 0x3FF)) + 0x10000
(* Character lexers for the remaining encodings. Each takes the byte
   source and returns one code point. *)
let uchar_utf16be next_byte = uchar_utf16 int16_be next_byte
let uchar_utf16le next_byte = uchar_utf16 int16_le next_byte

(* Raw bytes and ISO-8859-1 are returned unchanged, one byte at a time. *)
let uchar_byte next_byte = next_byte ()
let uchar_iso_8859_1 next_byte = next_byte ()

(* US-ASCII rejects any byte above 127. *)
let uchar_ascii next_byte =
  match next_byte () with
  | b when b > 127 -> raise Malformed
  | b -> b
(* Functorized streaming XML IO *)
(* [Make] builds the XML input/output code over the given string and
   buffer implementations; the buffer must produce the given strings. *)
module Make (String : String) (Buffer : Buffer with type string = String.t) =
(* From here on [string] denotes the functor's string type. *)
type string = String.t
(* Converts a standard OCaml string (typically a literal) to [string]. *)
let str = String.of_string
(* Equality and emptiness tests, both expressed with [compare].
   NOTE(review): no [compare] is defined locally above, so this looks like
   the polymorphic comparison rather than [String.compare] - confirm. *)
let str_eq s s' = (compare s s') = 0
let str_empty s = (compare s String.empty) = 0
(* Short name for string concatenation. *)
let cat = String.append
(* [str_of_char u] is the string made of the single code point [u],
   built through a temporary buffer. *)
let str_of_char u =
  let tmp = Buffer.create 4 in
  let () = Buffer.add_uchar tmp u in
  Buffer.contents tmp
(* Hash tables keyed by [string], using [str_eq] for equality and the
   generic [Hashtbl.hash] for hashing. *)
module Ht = Hashtbl.Make (struct
  type t = string
  let equal a b = str_eq a b
  let hash key = Hashtbl.hash key
end)
(* Code points of the characters the parser and printer test against. *)
let u_nl = Char.code '\n'        (* 0x000A, newline *)
let u_cr = Char.code '\r'        (* 0x000D, carriage return *)
let u_space = Char.code ' '      (* 0x0020, space *)
let u_quot = Char.code '"'       (* 0x0022, quote *)
let u_sharp = Char.code '#'      (* 0x0023 *)
let u_amp = Char.code '&'        (* 0x0026 *)
let u_apos = Char.code '\''      (* 0x0027 *)
let u_minus = Char.code '-'      (* 0x002D *)
let u_slash = Char.code '/'      (* 0x002F *)
let u_colon = Char.code ':'      (* 0x003A *)
let u_scolon = Char.code ';'     (* 0x003B *)
let u_lt = Char.code '<'         (* 0x003C *)
let u_eq = Char.code '='         (* 0x003D *)
let u_gt = Char.code '>'         (* 0x003E *)
let u_qmark = Char.code '?'      (* 0x003F *)
let u_emark = Char.code '!'      (* 0x0021 *)
let u_lbrack = Char.code '['     (* 0x005B *)
let u_rbrack = Char.code ']'     (* 0x005D *)
let u_x = Char.code 'x'          (* 0x0078 *)
let u_bom = 0xFEFF               (* BOM, not an ASCII character *)
let u_9 = Char.code '9'          (* 0x0039 *)
let u_F = Char.code 'F'          (* 0x0046 *)
let u_D = Char.code 'D'          (* 0x0044 *)
(* String constants, converted once to the functor's string type. *)
(* Text expected after "<![" to open a CDATA section. *)
let s_cdata = str "CDATA["
(* Namespace names bound to the [xml] and [xmlns] prefixes. *)
let ns_xml = str "http://www.w3.org/XML/1998/namespace"
let ns_xmlns = str "http://www.w3.org/2000/xmlns/"
(* Names with a special meaning for the parser. *)
let n_xml = str "xml"
let n_xmlns = str "xmlns"
let n_space = str "space"
let n_version = str "version"
let n_encoding = str "encoding"
let n_standalone = str "standalone"
(* Attribute and pseudo-attribute values the parser compares against. *)
let v_yes = str "yes"
let v_no = str "no"
let v_preserve = str "preserve"
let v_default = str "default"
let v_version_1_0 = str "1.0"
let v_version_1_1 = str "1.1"
(* Encoding names, in lowercase. *)
let v_utf_8 = str "utf-8"
let v_utf_16 = str "utf-16"
let v_utf_16be = str "utf-16be"
let v_utf_16le = str "utf-16le"
let v_iso_8859_1 = str "iso-8859-1"
let v_us_ascii = str "us-ascii"
let v_ascii = str "ascii"
(* [name_str (p, l)] renders a name as "p:l", or just "l" when the first
   component is empty. *)
let name_str (p, l) =
  if str_empty p then l
  else
    let colon_local = cat (str ":") l in
    cat p colon_local
(* Basic types and values *)
(* Encodings an input can be told to expect. *)
type encoding = [
| `UTF_8 | `UTF_16 | `UTF_16BE | `UTF_16LE | `ISO_8859_1 | `US_ASCII ]
(* Optional document type declaration text. *)
type dtd = string option
(* Names are pairs of strings: (prefix or namespace name, local name). *)
type name = string * string
type attribute = name * string
type tag = name * attribute list
(* Signals exchanged with the parser and the printer. *)
type signal = [ `Dtd of dtd | `El_start of tag | `El_end | `Data of string ]
(* Input *)
(* Line and column of an input position. *)
type pos = int * int
(* Input errors; [error_message] gives the text for each case. *)
type error = [
| `Max_buffer_size
| `Unexpected_eoi
| `Malformed_char_stream
| `Unknown_encoding of string
| `Unknown_entity_ref of string
| `Unknown_ns_prefix of string
| `Illegal_char_ref of string
| `Illegal_char_seq of string
| `Expected_char_seqs of string list * string
| `Expected_root_element ]
(* Raised by the input functions with the position of the error. *)
exception Error of pos * error
(* [error_message e] is a human readable description of [e], as a value of
   the functor's string type. The [in] closing the local [exps] binding is
   restored. *)
let error_message e =
  (* [bracket l v r] is [v] surrounded by the literals [l] and [r]. *)
  let bracket l v r = cat (str l) (cat v (str r)) in
  match e with
  | `Expected_root_element -> str "expected root element"
  | `Max_buffer_size -> str "maximal buffer size exceeded"
  | `Unexpected_eoi -> str "unexpected end of input"
  | `Malformed_char_stream -> str "malformed character stream"
  | `Unknown_encoding e -> bracket "unknown encoding (" e ")"
  | `Unknown_entity_ref e -> bracket "unknown entity reference (" e ")"
  | `Unknown_ns_prefix e -> bracket "unknown namespace prefix (" e ")"
  | `Illegal_char_ref s -> bracket "illegal character reference (#" s ")"
  | `Illegal_char_seq s ->
      bracket "character sequence illegal here (\"" s "\")"
  | `Expected_char_seqs (exps, fnd) ->
      (* Each expected sequence is quoted and followed by a comma. *)
      let exps =
        let exp acc v = cat acc (bracket "\"" v "\", ") in
        List.fold_left exp String.empty exps
      in
      cat (str "expected one of these character sequence: ")
        (cat exps (bracket "found \"" fnd "\""))
(* Kind of construct found at the current input position, as computed by
   [p_limit]. Each comment gives what has been consumed from the input
   when the limit is set. *)
type limit = (* XML is odd to parse. *)
| Stag of name (* '<' qname *)
| Etag of name (* '</' qname whitespace* *)
| Pi of name (* '<?' qname *)
| Comment (* '<!--' *)
| Cdata (* '<![CDATA[' *)
| Dtd (* '<!' *)
| Text (* other character *)
| Eoi (* End of input *)
(* Byte sources: an input channel, a standard string read from the given
   index, or a function returning one byte per call. *)
type source = [
| `Channel of in_channel
| `String of int * std_string
| `Fun of (unit -> int) ]
(* Parser state. The immutable fields are set by [make_input]; the
   mutable ones are updated as the input is consumed. *)
type input =
{ enc : encoding option; (* Expected encoding. *)
strip : bool; (* Whitespace stripping default behaviour. *)
fun_ns : string -> string option; (* Namespace callback. *)
fun_entity : string -> string option; (* Entity reference callback. *)
i : unit -> int; (* Byte level input. *)
mutable uchar : (unit -> int) -> int; (* Unicode character lexer. *)
mutable c : int; (* Character lookahead. *)
mutable cr : bool; (* True if last u was '\r'. *)
mutable line : int; (* Current line number. *)
mutable col : int; (* Current column number. *)
mutable limit : limit; (* Last parsed limit. *)
mutable peek : signal; (* Signal lookahead. *)
mutable stripping : bool; (* True if stripping whitespace. *)
mutable last_white : bool; (* True if last char was white. *)
mutable scopes : (name * string list * bool) list;
(* Stack of qualified el. name, bound prefixes and strip behaviour. *)
ns : string Ht.t; (* prefix -> uri bindings. *)
ident : Buffer.t; (* Buffer for names and entity refs. *)
data : Buffer.t; } (* Buffer for character and attribute data. *)
(* Messages for the [Invalid_argument] raised by the tree readers. *)
let err_input_tree = "input signal not `El_start or `Data"
let err_input_doc_tree = "input signal not `Dtd"

(* [err i e] raises [Error] for [e] at the current position of [i]. *)
let err i e =
  let position = (i.line, i.col) in
  raise (Error (position, e))

(* Error helpers for the common cases. *)
let err_illegal_char i u =
  let seq = str_of_char u in
  err i (`Illegal_char_seq seq)
let err_expected_seqs i exps s = err i (`Expected_char_seqs (exps, s))
let err_expected_chars i exps =
  err i (`Expected_char_seqs (List.map str_of_char exps, str_of_char i.c))

(* Sentinel values stored in the character lookahead; they lie far above
   any code point. *)
let u_eoi = max_int
let u_start_doc = pred u_eoi
let u_end_doc = pred u_start_doc

(* Initial value of the signal lookahead. *)
let signal_start_stream = `Data String.empty
(* [make_input ~enc ~strip ~ns ~entity src] creates the parser state for
   the byte source [src].
   - [enc]: expected encoding, [None] to detect it.
   - [strip]: default whitespace stripping behaviour.
   - [ns], [entity]: callbacks for unknown prefixes and entity references.
   The tokens closing the local bindings are restored. The [`String]
   source now keeps raising [End_of_file] once the end is reached; it used
   to raise [Invalid_argument] when read again after the end. *)
let make_input ?(enc = None) ?(strip = false) ?(ns = fun _ -> None)
    ?(entity = fun _ -> None) src =
  let i = match src with
  | `Fun f -> f
  | `Channel ic -> fun () -> input_byte ic
  | `String (pos, s) ->
      let len = Std_string.length s in
      let pos = ref pos in
      fun () ->
        if !pos >= len then raise End_of_file else
        let byte = Char.code (Std_string.get s !pos) in
        incr pos;
        byte
  in
  (* Initial prefix bindings: the empty prefix, [xml] and [xmlns]. *)
  let bindings =
    let h = Ht.create 15 in
    Ht.add h String.empty String.empty;
    Ht.add h n_xml ns_xml;
    Ht.add h n_xmlns ns_xmlns;
    h
  in
  { enc = enc; strip = strip; fun_ns = ns; fun_entity = entity;
    i = i; uchar = uchar_byte; c = u_start_doc; cr = false;
    line = 1; col = 0; limit = Text; peek = signal_start_stream;
    stripping = strip; last_white = true; scopes = []; ns = bindings;
    ident = Buffer.create 64; data = Buffer.create 1024; }
(* Bracketed non-terminals in comments refer to XML 1.0 non terminals *)

(* [r u a b] is [true] iff [u] lies in the inclusive range [a]..[b]. *)
let r (u : int) (a : int) (b : int) : bool = not (u < a || b < u)

(* Space, tab, carriage return and newline. *)
let is_white u = u = 0x0020 || u = 0x0009 || u = 0x000D || u = 0x000A

let is_char u = (* {Char} *)
  r u 0x0020 0xD7FF
  || u = 0x0009 || u = 0x000A || u = 0x000D
  || r u 0xE000 0xFFFD
  || r u 0x10000 0x10FFFF

let is_digit u = 0x0030 <= u && u <= 0x0039

let is_hex_digit u = is_digit u || r u 0x0041 0x0046 || r u 0x0061 0x0066

let comm_range u = (* common to functions below *)
  let within lo hi = r u lo hi in
  within 0x00C0 0x00D6 || within 0x00D8 0x00F6 || within 0x00F8 0x02FF
  || within 0x0370 0x037D || within 0x037F 0x1FFF || within 0x200C 0x200D
  || within 0x2070 0x218F || within 0x2C00 0x2FEF || within 0x3001 0xD7FF
  || within 0xF900 0xFDCF || within 0xFDF0 0xFFFD || within 0x10000 0xEFFFF

(* ASCII letters are tested first, then whitespace is rejected before the
   remaining ranges are tried. *)
let is_ascii_letter u = r u 0x0061 0x007A || r u 0x0041 0x005A

let is_name_start_char u = (* {NameStartChar} - ':' (XML 1.1) *)
  is_ascii_letter u
  || (not (is_white u) && (u = 0x005F (* '_' *) || comm_range u))

let is_name_char u = (* {NameChar} - ':' (XML 1.1) *)
  is_ascii_letter u
  || (not (is_white u)
      && (is_digit u
          || u = 0x005F || u = 0x002D || u = 0x002E || u = 0x00B7
          || comm_range u || r u 0x0300 0x036F || r u 0x203F 0x2040))
(* [nextc i] advances the character lookahead [i.c] by one character.
   Raises [Error] with [`Unexpected_eoi] if the lookahead is already the
   end of input sentinel, and [Malformed] if the decoded value is not an
   XML character; exceptions of the byte source propagate. *)
let rec nextc i =
if i.c = u_eoi then err i `Unexpected_eoi;
(* Position bookkeeping is based on the character being left. *)
if i.c = u_nl then (i.line <- i.line + 1; i.col <- 1)
else i.col <- i.col + 1;
i.c <- i.uchar i.i;
if not (is_char i.c) then raise Malformed;
(* A newline directly following a carriage return is skipped, and a
   carriage return is itself reported as a newline. *)
if i.cr && i.c = u_nl then i.c <- i.uchar i.i; (* cr nl business *)
if i.c = u_cr then (i.cr <- true; i.c <- u_nl) else i.cr <- false
(* Like [nextc], but an [End_of_file] from the byte source sets the
   lookahead to the end of input sentinel instead of propagating. *)
let nextc_eof i =
  match nextc i with
  | () -> ()
  | exception End_of_file -> i.c <- u_eoi

(* Skip whitespace; the [_eof] variant tolerates the end of input. *)
let rec skip_white i = if is_white i.c then (nextc i; skip_white i)
let rec skip_white_eof i =
  if is_white i.c then (nextc_eof i; skip_white_eof i)

(* [accept i c] consumes the lookahead if it is [c] and raises an
   expected character error otherwise. *)
let accept i c =
  if i.c <> c then err_expected_chars i [ c ] else nextc i
(* Shorthands on the two buffers of the parser state: [ident] holds names
   and entity references, [data] holds character and attribute data. *)
let clear_ident { ident; _ } = Buffer.clear ident
let clear_data { data; _ } = Buffer.clear data
let addc_ident { ident; _ } c = Buffer.add_uchar ident c
let addc_data { data; _ } c = Buffer.add_uchar data c
(* [addc_data_strip i c] adds [c] to the data buffer with whitespace
   stripping: whitespace is only remembered, and a single space is
   inserted before the next non-white character unless the buffer is
   still empty.
   The [begin]/[end] around the non-white branch is restored; without it
   the character was appended even when it was whitespace. *)
let addc_data_strip i c =
  if is_white c then i.last_white <- true else
  begin
    if i.last_white && Buffer.length i.data <> 0 then addc_data i u_space;
    i.last_white <- false;
    addc_data i c
  end
(* [expand_name i (prefix, local)] replaces [prefix] by the namespace name
   it is bound to. Prefixes without a usable binding are passed to the
   [fun_ns] callback; if it returns [None], [`Unknown_ns_prefix] is raised.
   The [in] and [try] tokens are restored. *)
let expand_name i (prefix, local) =
  let external_ prefix = match i.fun_ns prefix with
  | None -> err i (`Unknown_ns_prefix prefix)
  | Some uri -> uri
  in
  try
    let uri = Ht.find i.ns prefix in
    if not (str_empty uri) then (uri, local) else
    if str_empty prefix then String.empty, local else
    (external_ prefix), local (* unbound with xmlns:prefix="" *)
  with Not_found -> external_ prefix, local
(* [find_encoding i] selects the character lexer of [i] from the expected
   encoding or, failing that, from a byte order mark. Returns [true] if an
   encoding given later by the XML declaration must be ignored, [false] if
   the declaration should be consulted.
   The boolean results and the [end] tokens are restored. *)
let find_encoding i = (* Encoding mess. *)
  (* Installs a lexer, resets the column and reads the first character. *)
  let reset uchar i = i.uchar <- uchar; i.col <- 0; nextc i in
  match i.enc with
  | None -> (* User doesn't know encoding. *)
      begin match nextc i; i.c with
      | 0xFE -> (* UTF-16BE BOM. *)
          nextc i; if i.c <> 0xFF then err i `Malformed_char_stream;
          reset uchar_utf16be i;
          true
      | 0xFF -> (* UTF-16LE BOM. *)
          nextc i; if i.c <> 0xFE then err i `Malformed_char_stream;
          reset uchar_utf16le i;
          true
      | 0xEF -> (* UTF-8 BOM. *)
          nextc i; if i.c <> 0xBB then err i `Malformed_char_stream;
          nextc i; if i.c <> 0xBF then err i `Malformed_char_stream;
          reset uchar_utf8 i;
          true
      | 0x3C | _ -> (* UTF-8 or other, try declaration. *)
          i.uchar <- uchar_utf8;
          false
      end
  | Some e -> (* User knows encoding. *)
      begin match e with
      | `US_ASCII -> reset uchar_ascii i
      | `ISO_8859_1 -> reset uchar_iso_8859_1 i
      | `UTF_8 -> (* Skip BOM if present. *)
          reset uchar_utf8 i; if i.c = u_bom then (i.col <- 0; nextc i)
      | `UTF_16 -> (* Which UTF-16 ? look BOM. *)
          let b0 = nextc i; i.c in
          let b1 = nextc i; i.c in
          begin match b0, b1 with
          | 0xFE, 0xFF -> reset uchar_utf16be i
          | 0xFF, 0xFE -> reset uchar_utf16le i
          | _ -> err i `Malformed_char_stream;
          end
      | `UTF_16BE -> (* Skip BOM if present. *)
          reset uchar_utf16be i; if i.c = u_bom then (i.col <- 0; nextc i)
      | `UTF_16LE ->
          reset uchar_utf16le i; if i.c = u_bom then (i.col <- 0; nextc i)
      end;
      true (* Ignore xml declaration. *)
(* Parses an {NCName} (Namespace 1.1) into the identifier buffer and
   returns it. Raises an illegal character error if the lookahead cannot
   start a name. *)
let p_ncname i =
  clear_ident i;
  if not (is_name_start_char i.c) then err_illegal_char i i.c;
  addc_ident i i.c;
  nextc i;
  while is_name_char i.c do
    addc_ident i i.c;
    nextc i
  done;
  Buffer.contents i.ident

(* Parses a {QName} (Namespace 1.1) and returns it as (prefix, local
   name); the prefix is empty when the name has no colon. *)
let p_qname i =
  let first = p_ncname i in
  if i.c = u_colon then begin
    nextc i;
    let local = p_ncname i in
    (first, local)
  end else (String.empty, first)
(* Parses a {CharRef}, '&' was eaten and the lookahead is '#'. Returns the
   referenced character as a string, or raises [`Illegal_char_ref] with
   the text of the reference.
   Besides restoring the missing structural tokens this version:
   - no longer adds the first offending character twice to the text
     reported in the error;
   - stops accumulating once the value is past the last code point, so
     that a very long reference cannot overflow back to a valid one. *)
let p_charref i =
  let c = ref 0 in
  let add_digit base d = if !c <= 0x10FFFF then c := !c * base + d in
  clear_ident i;
  nextc i;
  if i.c = u_scolon then err i (`Illegal_char_ref String.empty) else
  begin
    try
      if i.c = u_x then
        begin
          (* Hexadecimal reference. *)
          addc_ident i i.c;
          nextc i;
          while (i.c <> u_scolon) do
            addc_ident i i.c;
            if not (is_hex_digit i.c) then raise Exit else
            add_digit 16 (if i.c <= u_9 then i.c - 48 else
                          if i.c <= u_F then i.c - 55 else
                          i.c - 87);
            nextc i;
          done
        end
      else
        (* Decimal reference. *)
        while (i.c <> u_scolon) do
          addc_ident i i.c;
          if not (is_digit i.c) then raise Exit else
          add_digit 10 (i.c - 48);
          nextc i
        done
    with Exit ->
      (* The offending character is already in the buffer: step over it,
         then collect the rest of the reference for the error message. *)
      c := -1;
      nextc i;
      while i.c <> u_scolon do addc_ident i i.c; nextc i done
  end;
  nextc i;
  if is_char !c then (clear_ident i; addc_ident i !c; Buffer.contents i.ident)
  else err i (`Illegal_char_ref (Buffer.contents i.ident))
(* Table mapping the names of the five predefined entities to their
   replacement text. The table itself is now returned as the value of the
   definition (the final [h] was missing). *)
let predefined_entities =
  let h = Ht.create 5 in
  let e k v = Ht.add h (str k) (str v) in
  e "lt" "<"; e "gt" ">"; e "amp" "&"; e "apos" "'"; e "quot" "\"";
  h
(* Parses an {EntityRef}, '&' was eaten. Predefined entities are looked up
   first, then the [fun_entity] callback; [`Unknown_entity_ref] is raised
   if neither knows the name. *)
let p_entity_ref i =
  let ent = p_ncname i in
  accept i u_scolon;
  match Ht.find predefined_entities ent with
  | replacement -> replacement
  | exception Not_found ->
      begin match i.fun_entity ent with
      | Some replacement -> replacement
      | None -> err i (`Unknown_entity_ref ent)
      end

(* Parses a {Reference}: skips the '&' and dispatches on '#'. *)
let p_reference i =
  nextc i;
  if i.c <> u_sharp then p_entity_ref i else p_charref i
(* Parses {S}? {AttValue} and returns the attribute value. Whitespace in
   the value is always stripped and references are expanded. Raises an
   error if the value is not quoted or contains '<'.
   The [in] and [done;] tokens are restored. *)
let p_attr_value i =
  skip_white i;
  let delim =
    if i.c = u_quot || i.c = u_apos then i.c else
    err_expected_chars i [ u_quot; u_apos]
  in
  nextc i;
  skip_white i;
  clear_data i;
  i.last_white <- true;
  while (i.c <> delim) do
    if i.c = u_lt then err_illegal_char i u_lt else
    if i.c = u_amp then String.iter (addc_data_strip i) (p_reference i)
    else (addc_data_strip i i.c; nextc i)
  done;
  nextc i; (* closing delimiter *)
  Buffer.contents i.data
(* Parses ({S} {Attribute})* {S}? and returns the prefixes bound by
   namespace declarations together with the attributes, both in reverse
   order of appearance. Namespace declarations update the prefix table
   and [xml:space] updates the stripping behaviour as a side effect.
   The [begin]/[end]/[else]/[in] tokens are restored. *)
let p_attributes i =
  let rec aux i pre_acc acc =
    if not (is_white i.c) then pre_acc, acc else
    begin
      skip_white i;
      if i.c = u_slash || i.c = u_gt then pre_acc, acc else
      begin
        let (prefix, local) as n = p_qname i in
        let v = skip_white i; accept i u_eq; p_attr_value i in
        let att = n, v in
        if str_empty prefix && str_eq local n_xmlns then
          begin (* xmlns *)
            Ht.add i.ns String.empty v;
            aux i (String.empty :: pre_acc) (att :: acc)
          end
        else if str_eq prefix n_xmlns then
          begin (* xmlns:local *)
            Ht.add i.ns local v;
            aux i (local :: pre_acc) (att :: acc)
          end
        else if str_eq prefix n_xml && str_eq local n_space then
          begin (* xml:space *)
            if str_eq v v_preserve then i.stripping <- false else
            if str_eq v v_default then i.stripping <- i.strip else ();
            aux i pre_acc (att :: acc)
          end
        else
          aux i pre_acc (att :: acc)
      end
    end
  in
  aux i [] [] (* Returns a list of bound prefixes and attributes *)
(* Parses a markup limit: looks at the start of the next construct,
   consumes its opening characters and records its kind in [i.limit].
   The [begin]/[end]/[else] tokens are restored. *)
let p_limit i =
  i.limit <-
    if i.c = u_eoi then Eoi else
    if i.c <> u_lt then Text else
    begin
      nextc i;
      if i.c = u_qmark then (nextc i; Pi (p_qname i)) else
      if i.c = u_slash then
        begin
          nextc i;
          let n = p_qname i in
          skip_white i;
          Etag n
        end
      else if i.c = u_emark then
        begin
          nextc i;
          if i.c = u_minus then (nextc i; accept i u_minus; Comment) else
          if i.c = u_D then Dtd else
          if i.c = u_lbrack then
            begin
              (* The six characters after "<![" must spell "CDATA[". *)
              nextc i;
              clear_ident i;
              for k = 1 to 6 do (addc_ident i i.c; nextc i) done;
              let cdata = Buffer.contents i.ident in
              if str_eq cdata s_cdata then Cdata else
              err_expected_seqs i [ s_cdata ] cdata
            end
          else
            err i (`Illegal_char_seq (cat (str "<!") (str_of_char i.c)))
        end
      else
        Stag (p_qname i)
    end
(* Skips a {Comment}, '<!--' was eaten. Raises an error if "--" is not
   followed by '>'.
   The [begin]/[end] around the final branch is restored; without it the
   closing sequence was parsed again after the recursive call returned. *)
let rec skip_comment i =
  while (i.c <> u_minus) do nextc i done;
  nextc i;
  if i.c <> u_minus then skip_comment i else
  begin
    nextc i;
    if i.c <> u_gt then err_expected_chars i [ u_gt ];
    nextc_eof i
  end
(* Skips a {PI}, '<?' qname was eaten: reads up to and including the
   closing "?>". *)
let rec skip_pi i =
  let rec to_qmark () = if i.c <> u_qmark then (nextc i; to_qmark ()) in
  to_qmark ();
  nextc i;
  if i.c = u_gt then nextc_eof i else skip_pi i
(* Skips {Misc}*: processing instructions, comments and whitespace.
   It stops on a processing instruction named "xml" (in any letter case),
   which is an error unless [allow_xmlpi] is [true]. *)
let rec skip_misc i ~allow_xmlpi =
  let continue () = p_limit i; skip_misc i ~allow_xmlpi in
  match i.limit with
  | Pi (p, l) ->
      if str_empty p && str_eq n_xml (String.lowercase l) then begin
        if not allow_xmlpi then err i (`Illegal_char_seq l)
      end else (skip_pi i; continue ())
  | Comment -> skip_comment i; continue ()
  | Text -> if is_white i.c then (skip_white_eof i; continue ())
  | _ -> ()
(* Parses {CharData}* ({Reference}{Chardata})* up to the next '<', feeding
   every character to [addc]. Raises [`Illegal_char_seq] on "]]>".
   The [begin]/[end]/[else]/[done] tokens are restored. *)
let p_chardata addc i =
  while (i.c <> u_lt) do
    if i.c = u_amp then String.iter (addc i) (p_reference i)
    else if i.c = u_rbrack then
      begin
        addc i i.c;
        nextc i;
        if i.c = u_rbrack then begin
          addc i i.c;
          nextc i; (* detects ']'*']]>' *)
          while (i.c = u_rbrack) do addc i i.c; nextc i done;
          if i.c = u_gt then err i (`Illegal_char_seq (str "]]>"));
        end
      end
    else
      (addc i i.c; nextc i)
  done
(* Parses {CData} {CDEnd}: feeds the section's characters to [addc] until
   the closing "]]>" is consumed. [Exit] is used internally to leave the
   loop. The [done;] and [end;] tokens are restored. *)
let rec p_cdata addc i =
  try while (true) do
    if i.c = u_rbrack then begin
      nextc i;
      while i.c = u_rbrack do
        nextc i;
        if i.c = u_gt then (nextc i; raise Exit);
        addc i u_rbrack
      done;
      addc i u_rbrack;
    end;
    addc i i.c;
    nextc i;
  done with Exit -> ()
(* Parses {XMLDecl}? when the current limit is the "xml" processing
   instruction; does nothing otherwise.
   - [ignore_enc]: do not switch lexer on the declared encoding.
   - [ignore_utf16]: accept a declared "utf-16" without raising.
   Raises an error on unexpected pseudo-attributes or values and
   [`Unknown_encoding] on an unsupported encoding name.
   The [in]/[else]/[end] tokens are restored. *)
let p_xml_decl i ~ignore_enc ~ignore_utf16 =
  let yes_no = [v_yes; v_no] in
  let p_val i = skip_white i; accept i u_eq; skip_white i; p_attr_value i in
  let p_val_exp i exp =
    let v = p_val i in
    if not (List.exists (str_eq v) exp) then err_expected_seqs i exp v
  in
  match i.limit with
  | Pi (p, l) when (str_empty p && str_eq l n_xml) ->
      let v = skip_white i; p_ncname i in
      if not (str_eq v n_version) then err_expected_seqs i [ n_version ] v;
      p_val_exp i [v_version_1_0; v_version_1_1];
      skip_white i;
      if i.c <> u_qmark then begin
        let n = p_ncname i in
        if str_eq n n_encoding then begin
          let enc = String.lowercase (p_val i) in
          if not ignore_enc then begin
            if str_eq enc v_utf_8 then i.uchar <- uchar_utf8 else
            if str_eq enc v_utf_16be then i.uchar <- uchar_utf16be else
            if str_eq enc v_utf_16le then i.uchar <- uchar_utf16le else
            if str_eq enc v_iso_8859_1 then i.uchar <- uchar_iso_8859_1 else
            if str_eq enc v_us_ascii then i.uchar <- uchar_ascii else
            if str_eq enc v_ascii then i.uchar <- uchar_ascii else
            if str_eq enc v_utf_16 then
              if ignore_utf16 then () else (err i `Malformed_char_stream)
              (* A BOM should have been found. *)
            else
              err i (`Unknown_encoding enc)
          end;
          skip_white i;
          if i.c <> u_qmark then begin
            let n = p_ncname i in
            if str_eq n n_standalone then p_val_exp i yes_no else
            err_expected_seqs i [ n_standalone; str "?>" ] n
          end
        end
        else if str_eq n n_standalone then
          p_val_exp i yes_no
        else
          err_expected_seqs i [ n_encoding; n_standalone; str "?>" ] n
      end;
      skip_white i;
      accept i u_qmark;
      accept i u_gt;
      p_limit i
  | _ -> ()
(* Parses {Misc}* {doctypedecl} {Misc}* and returns the [`Dtd] signal.
   The declaration, if any, is copied verbatim (comments excepted) into
   the data buffer, tracking the nesting of '<' and '>'.
   The [begin]/[end]/[else]/[done;] tokens are restored. *)
let p_dtd_signal i =
  skip_misc i ~allow_xmlpi:false;
  if i.limit <> Dtd then `Dtd None else
  begin
    let buf = addc_data i in
    let nest = ref 1 in (* number of '<' not yet closed *)
    clear_data i;
    buf u_lt; buf u_emark; (* add eaten "<!" *)
    while (!nest > 0) do
      if i.c = u_lt then
        begin
          nextc i;
          if i.c <> u_emark then
            (buf u_lt; incr nest)
          else
            begin
              nextc i;
              if i.c <> u_minus then (* Careful with comments ! *)
                (buf u_lt; buf u_emark; incr nest)
              else
                begin
                  nextc i;
                  if i.c <> u_minus then
                    (buf u_lt; buf u_emark; buf u_minus; incr nest)
                  else
                    (nextc i; skip_comment i)
                end
            end
        end
      else if i.c = u_quot || i.c = u_apos then
        begin
          (* Quoted literals are copied without interpreting '<' or '>'. *)
          let c = i.c in
          buf c; nextc i;
          while (i.c <> c) do (buf i.c; nextc i) done;
          buf c; nextc i
        end
      else if i.c = u_gt then (buf u_gt; nextc i; decr nest)
      else (buf i.c; nextc i)
    done;
    let dtd = Buffer.contents i.data in
    p_limit i;
    skip_misc i ~allow_xmlpi:false;
    `Dtd (Some dtd);
  end
(* Collects character data and CDATA sections up to the next start or end
   tag, skipping processing instructions and comments, and returns the
   collected string. Whitespace is stripped if [i.stripping] is set.
   The [in] and the returned value are restored. *)
let p_data i =
  let rec bufferize addc i = match i.limit with
  | Text -> p_chardata addc i; p_limit i; bufferize addc i
  | Cdata -> p_cdata addc i; p_limit i; bufferize addc i
  | (Stag _ | Etag _) -> ()
  | Pi _ -> skip_pi i; p_limit i; bufferize addc i
  | Comment -> skip_comment i; p_limit i; bufferize addc i
  | Dtd -> err i (`Illegal_char_seq (str "<!D"))
  | Eoi -> err i `Unexpected_eoi
  in
  clear_data i;
  i.last_white <- true;
  bufferize (if i.stripping then addc_data_strip else addc_data) i;
  Buffer.contents i.data
(* Parses the attributes of the start tag named [n], pushes a new scope
   and returns the [`El_start] signal with expanded names.
   The [in] closing [expand_att] is restored. *)
let p_el_start_signal i n =
  let expand_att (((prefix, local) as n, v) as att) =
    if not (str_eq prefix String.empty) then expand_name i n, v else
    if str_eq local n_xmlns then (ns_xmlns, n_xmlns), v else
    att (* default namespaces do not influence attributes. *)
  in
  let strip = i.stripping in (* save it here, p_attributes may change it. *)
  let prefixes, atts = p_attributes i in
  i.scopes <- (n, prefixes, strip) :: i.scopes;
  `El_start ((expand_name i n), List.rev_map expand_att atts)
(* Closes the element named [n]: checks that it matches the innermost open
   element, pops its scope, restores the stripping behaviour, unbinds its
   prefixes and returns [`El_end]. When the root element is closed the
   lookahead is set to the end of document sentinel.
   The [`El_end] result of the first case is restored. *)
let p_el_end_signal i n = match i.scopes with
| (n', prefixes, strip) :: scopes ->
    if i.c <> u_gt then err_expected_chars i [ u_gt ];
    if not (str_eq n n') then err_expected_seqs i [name_str n'] (name_str n);
    i.scopes <- scopes;
    i.stripping <- strip;
    List.iter (Ht.remove i.ns) prefixes;
    if scopes = [] then i.c <- u_end_doc else (nextc i; p_limit i);
    `El_end
| _ -> assert false
(* Parses and returns the next signal. Outside of any element only a
   start tag is accepted ([`Expected_root_element] otherwise). Inside an
   element the previously returned start tag is first completed, then the
   next non-empty construct is returned.
   The [else]/[in]/[begin]/[end] tokens are restored. *)
let p_signal i =
  if i.scopes = [] then
    begin match i.limit with
    | Stag n -> p_el_start_signal i n
    | _ -> err i `Expected_root_element
    end
  else
    let rec find i = match i.limit with
    | Stag n -> p_el_start_signal i n
    | Etag n -> p_el_end_signal i n
    | Text | Cdata ->
        let d = p_data i in
        if str_empty d then find i else `Data d
    | Pi _ -> skip_pi i; p_limit i; find i
    | Comment -> skip_comment i; p_limit i; find i
    | Dtd -> err i (`Illegal_char_seq (str "<!D"))
    | Eoi -> err i `Unexpected_eoi
    in
    begin match i.peek with
    | `El_start (n, _) -> (* finish to input start el. *)
        skip_white i;
        if i.c = u_gt then (accept i u_gt; p_limit i) else
        if i.c = u_slash then
          begin
            (* Empty element tag: pretend an end tag was read. *)
            let tag = match i.scopes with
            | (tag, _, _) :: _ -> tag | _ -> assert false
            in
            (nextc i; i.limit <- Etag tag)
          end
        else
          err_expected_chars i [ u_slash; u_gt ]
    | _ -> ()
    end;
    find i
(* [eoi i] is [true] iff the end of input is reached. At the start of a
   document it also parses the prolog and stores the resulting [`Dtd]
   signal in the lookahead. Buffer, decoding and end of file exceptions
   are converted to [Error].
   The [try]/[with], the boolean results and the grouping tokens are
   restored. *)
let eoi i =
  try
    if i.c = u_eoi then true else
    if i.c <> u_start_doc then false else (* In a document. *)
    if i.peek <> `El_end then (* Start of document sequence. *)
      begin
        let ignore_enc = find_encoding i in
        p_limit i;
        p_xml_decl i ~ignore_enc ~ignore_utf16:false;
        i.peek <- p_dtd_signal i;
        false
      end
    else (* Subsequent documents. *)
      begin
        nextc_eof i;
        p_limit i;
        if i.c = u_eoi then true else
        begin
          skip_misc i ~allow_xmlpi:true;
          if i.c = u_eoi then true else
          begin
            p_xml_decl i ~ignore_enc:false ~ignore_utf16:true;
            i.peek <- p_dtd_signal i;
            false
          end
        end
      end
  with
  | Buffer.Full -> err i `Max_buffer_size
  | Malformed -> err i `Malformed_char_stream
  | End_of_file -> err i `Unexpected_eoi
(* [peek i] is the next signal, without consuming it. Raises [Error] with
   [`Unexpected_eoi] at the end of input. *)
let peek i =
  match eoi i with
  | true -> err i `Unexpected_eoi
  | false -> i.peek
(* [input i] returns the next signal and advances the lookahead. After the
   end of a document the pending signal is returned and the input is reset
   for a following document. Buffer, decoding and end of file exceptions
   are converted to [Error].
   The [try]/[with] and the returned signal [s] are restored. *)
let input i =
  try
    if i.c = u_end_doc then (i.c <- u_start_doc; i.peek) else
    let s = peek i in
    i.peek <- p_signal i;
    s
  with
  | Buffer.Full -> err i `Max_buffer_size
  | Malformed -> err i `Malformed_char_stream
  | End_of_file -> err i `Unexpected_eoi
(* [input_tree ~el ~data i] reads a data signal or a whole element and
   builds a value from it: [data] is applied to character data and [el]
   to a tag and the list of its already built children.
   Raises [Invalid_argument] if the next signal is neither [`El_start]
   nor [`Data].
   The [end] and [in] tokens are restored. *)
let input_tree ~el ~data i = match input i with
| `Data d -> data d
| `El_start tag ->
    (* [tags] is the stack of open tags, [context] the matching stack of
       child lists, each in reverse order. *)
    let rec aux i tags context = match input i with
    | `El_start tag -> aux i (tag :: tags) ([] :: context)
    | `El_end ->
        begin match tags, context with
        | tag :: tags', childs :: context' ->
            let el = el tag (List.rev childs) in
            begin match context' with
            | parent :: context'' -> aux i tags' ((el :: parent) :: context'')
            | [] -> el
            end
        | _ -> assert false
        end
    | `Data d ->
        begin match context with
        | childs :: context' -> aux i tags (((data d) :: childs) :: context')
        | [] -> assert false
        end
    | `Dtd _ -> assert false
    in
    aux i (tag :: []) ([] :: [])
| _ -> invalid_arg err_input_tree
(* [input_doc_tree ~el ~data i] reads a [`Dtd] signal followed by a tree
   (see [input_tree]). Raises [Invalid_argument] if the next signal is not
   [`Dtd]. *)
let input_doc_tree ~el ~data i =
  let first = input i in
  match first with
  | `Dtd dtd ->
      let tree = input_tree ~el ~data i in
      (dtd, tree)
  | _ -> invalid_arg err_input_doc_tree

(* Current (line, column) of the input. *)
let pos i = (i.line, i.col)
(* Output *)
(* One level of a tree to output: an element with its children, or data. *)
type 'a frag = [ `El of tag * 'a list | `Data of string ]
(* Byte destinations: an output channel, a standard buffer, or a function
   called with one byte at a time. *)
type dest = [
| `Channel of out_channel | `Buffer of std_buffer | `Fun of (int -> unit) ]
(* Printer state. The immutable fields are set by [make_output]. *)
type output =
{ decl : bool; (* True if the XML declaration should be output. *)
nl : bool; (* True if a newline is output at the end. *)
indent : int option; (* Optional indentation. *)
fun_prefix : string -> string option; (* Prefix callback. *)
prefixes : string Ht.t; (* uri -> prefix bindings. *)
outs : std_string -> int -> int -> unit; (* String output. *)
outc : char -> unit; (* character output. *)
mutable last_el_start : bool; (* True if last signal was `El_start *)
mutable scopes : (name * (string list)) list;
(* Qualified el. name and bound uris. *)
mutable depth : int; } (* Scope depth. *)
(* Messages of the exceptions raised by the output functions when they are
   misused. *)
let err_prefix uri = Printf.sprintf "unbound namespace (%s)" uri
let err_dtd = "dtd signal not allowed here"
let err_el_start = "start signal not allowed here"
let err_el_end = "end signal without matching start signal"
let err_data = "data signal not allowed here"
(* [make_output ~decl ~nl ~indent ~ns_prefix d] creates the printer state
   for the destination [d].
   - [decl]: output the XML declaration.
   - [nl]: output a newline after the root element.
   - [indent]: number of spaces per nesting level, [None] for no
     indentation.
   - [ns_prefix]: callback for namespace names without a bound prefix.
   The [in] tokens and the result [h] of the table are restored. *)
let make_output ?(decl = true) ?(nl = false) ?(indent = None)
    ?(ns_prefix = fun _ ->None) d =
  let outs, outc = match d with
  | `Channel c -> (output_substring c), (output_char c)
  | `Buffer b -> (Std_buffer.add_substring b), (Std_buffer.add_char b)
  | `Fun f ->
      (* A function destination receives the bytes one at a time. *)
      let os s p l =
        for i = p to p + l - 1 do f (Char.code (Std_string.get s i)) done
      in
      let oc c = f (Char.code c) in
      os, oc
  in
  (* Initial namespace name -> prefix bindings. *)
  let prefixes =
    let h = Ht.create 10 in
    Ht.add h String.empty String.empty;
    Ht.add h ns_xml n_xml;
    Ht.add h ns_xmlns n_xmlns;
    h
  in
  { decl = decl; outs = outs; outc = outc; nl = nl; indent = indent;
    last_el_start = false; prefixes = prefixes; scopes = []; depth = -1;
    fun_prefix = ns_prefix; }
(* Current nesting depth of the output. *)
let output_depth { depth; _ } = depth

(* [outs o s] writes the whole standard string [s]. *)
let outs o s =
  let len = Std_string.length s in
  o.outs s 0 len

(* [str_utf_8 s] converts [s] to a standard string; only the last chunk
   handed out by [String.to_utf_8] is kept. *)
let str_utf_8 s =
  let keep_last _ chunk = chunk in
  String.to_utf_8 keep_last "" s

(* [out_utf_8 o s] writes every chunk of [s] to the output. *)
let out_utf_8 o s =
  let write dst chunk = outs dst chunk; dst in
  ignore (String.to_utf_8 write o s)
(* [prefix_name o (ns, local)] replaces the namespace name [ns] by the
   prefix bound to it. Unbound namespace names are passed to the
   [fun_prefix] callback; [Invalid_argument] is raised if it returns
   [None]. The missing [try] is restored. *)
let prefix_name o (ns, local) =
  try
    if str_eq ns ns_xmlns && str_eq local n_xmlns then (String.empty, n_xmlns)
    else (Ht.find o.prefixes ns, local)
  with Not_found ->
    match o.fun_prefix ns with
    | None -> invalid_arg (err_prefix (str_utf_8 ns))
    | Some prefix -> prefix, local
(* [bind_prefixes o atts] records the prefix bindings made by the
   namespace declaration attributes found in [atts] and returns the
   namespace names that were bound.
   The [begin]/[end] and [in] tokens are restored. *)
let bind_prefixes o atts =
  let add acc ((ns, local), uri) =
    if not (str_eq ns ns_xmlns) then acc else
    begin
      (* The attribute named [xmlns] binds the empty prefix. *)
      let prefix = if str_eq local n_xmlns then String.empty else local in
      Ht.add o.prefixes uri prefix;
      uri :: acc
    end
  in
  List.fold_left add [] atts
(* [out_data o s] writes [s] as character or attribute data: markup
   delimiters are replaced by the predefined entity references and
   control characters other than newline, tab and carriage return by
   U+FFFD.
   The replacement strings had degraded to the raw characters, which left
   markup unescaped (and the double quote case did not even lex). They are
   spelled with an escaped ampersand (\x26) so that they cannot be
   mistaken for entities by tools processing this file. The missing
   [in]/[done;] tokens are restored. *)
let out_data o s =
  let out () s =
    let len = Std_string.length s in
    let start = ref 0 in (* start of the pending unescaped run *)
    let last = ref 0 in (* index of the character being examined *)
    (* Flushes the pending run, writes [e] instead of the current
       character and restarts the run after it. *)
    let escape e =
      o.outs s !start (!last - !start);
      outs o e;
      incr last;
      start := !last
    in
    while (!last < len) do match Std_string.get s !last with
    | '<' -> escape "\x26lt;" (* Escape markup delimiters. *)
    | '>' -> escape "\x26gt;"
    | '&' -> escape "\x26amp;"
    (* '\'' needs no escape, \x22 is used to delimit attributes. *)
    | '\x22' -> escape "\x26quot;"
    | '\n' | '\t' | '\r' -> incr last
    | c when c < ' ' -> escape "\xEF\xBF\xBD" (* illegal, subst. by U+FFFD *)
    | _ -> incr last
    done;
    o.outs s !start (!last - !start)
  in
  String.to_utf_8 out () s
(* [out_qname o (p, l)] writes "p:l", or just "l" if [p] is empty. *)
let out_qname o (p, l) =
  if str_empty p then out_utf_8 o l
  else begin
    out_utf_8 o p;
    o.outc ':';
    out_utf_8 o l
  end

(* [out_attribute o (n, v)] writes a space, the prefixed attribute name
   and the escaped value between double quotes. *)
let out_attribute o (n, v) =
  o.outc ' ';
  let qn = prefix_name o n in
  out_qname o qn;
  outs o "=\x22";
  out_data o v;
  o.outc '\x22'
(* [output o s] writes the signal [s]. A [`Dtd] signal is required first
   (depth -1); elements and data follow until the root element is closed,
   after which a new [`Dtd] signal is expected.
   Raises [Invalid_argument] on a signal that is not allowed in the
   current state, except for a misplaced [`Dtd], which raises [Failure].
   The [in]/[end]/[else] tokens are restored. *)
let output o s =
  let indent o = match o.indent with
  | None -> ()
  | Some c -> for i = 1 to (o.depth * c) do o.outc ' ' done
  in
  (* With indentation enabled every construct ends with a newline. *)
  let unindent o = match o.indent with None -> () | Some _ -> o.outc '\n' in
  if o.depth = -1 then
    begin match s with
    | `Dtd d ->
        if o.decl then outs o "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
        begin match d with
        | Some dtd -> out_utf_8 o dtd; o.outc '\n'
        | None -> ()
        end;
        o.depth <- 0
    | `Data _ -> invalid_arg err_data
    | `El_start _ -> invalid_arg err_el_start
    | `El_end -> invalid_arg err_el_end
    end
  else
    begin match s with
    | `El_start (n, atts) ->
        (* The '>' of a start tag is written lazily, so that an empty
           element can be closed with "/>". *)
        if o.last_el_start then (outs o ">"; unindent o);
        indent o;
        let uris = bind_prefixes o atts in
        let qn = prefix_name o n in
        o.outc '<'; out_qname o qn; List.iter (out_attribute o) atts;
        o.scopes <- (qn, uris) :: o.scopes;
        o.depth <- o.depth + 1;
        o.last_el_start <- true
    | `El_end ->
        begin match o.scopes with
        | (n, uris) :: scopes' ->
            o.depth <- o.depth - 1;
            if o.last_el_start then outs o "/>" else
            begin
              indent o;
              outs o "</"; out_qname o n; o.outc '>';
            end;
            o.scopes <- scopes';
            List.iter (Ht.remove o.prefixes) uris;
            o.last_el_start <- false;
            if o.depth = 0 then (if o.nl then o.outc '\n'; o.depth <- -1;)
            else unindent o
        | [] -> invalid_arg err_el_end
        end
    | `Data d ->
        if o.last_el_start then (outs o ">"; unindent o);
        indent o;
        out_data o d;
        unindent o;
        o.last_el_start <- false
    | `Dtd _ -> failwith err_dtd
    end
(* [output_tree frag o v] writes the signals of the tree [v]; [frag]
   exposes one level of the tree at a time. The traversal is iterative
   over a stack of pending sibling lists.
   The [end] and [in] tokens are restored. *)
let output_tree frag o v =
  let rec aux o = function
  | (v :: rest) :: context ->
      begin match frag v with
      | `El (tag, childs) ->
          output o (`El_start tag);
          aux o (childs :: rest :: context)
      | (`Data d) as signal ->
          output o signal;
          aux o (rest :: context)
      end
  | [] :: [] -> ()
  | [] :: context -> output o `El_end; aux o context
  | [] -> assert false
  in
  aux o ([v] :: [])
(* [output_doc_tree frag o (dtd, v)] writes the [`Dtd] signal for [dtd]
   and then the tree [v] (see [output_tree]). *)
let output_doc_tree frag o (dtd, v) =
  let () = output o (`Dtd dtd) in
  output_tree frag o v
(* Default streaming XML IO *)
(* Default string implementation: standard OCaml strings, which [iter]
   decodes as UTF-8. The [in] closing the local reader and the [end] of
   the module are restored. *)
module String = struct
  type t = string
  let empty = ""
  let length = String.length
  let append = ( ^ )
  let lowercase = String.lowercase_ascii

  (* [iter f s] applies [f] to each code point decoded from [s]. [Exit]
     is used internally to stop at the end of the string. *)
  let iter f s =
    let len = Std_string.length s in
    let pos = ref ~-1 in
    let i () =
      incr pos;
      if !pos = len then raise Exit else
      Char.code (Std_string.get s !pos)
    in
    try while true do f (uchar_utf8 i) done with Exit -> ()

  let of_string s = s

  (* The string is handed to [f] in a single chunk. *)
  let to_utf_8 f v x = f v x
  let compare = String.compare
end
(* Default buffer implementation: standard buffers filled with UTF-8.
   The [try], the last [else] and the [end] of the module are restored. *)
module Buffer = struct
  type string = String.t
  type t = Buffer.t
  exception Full
  let create = Buffer.create

  (* [add_uchar b u] appends the UTF-8 encoding of [u] to [b]. A
     [Failure] from the underlying buffer is reported as [Full]. *)
  let add_uchar b u =
    try
      (* UTF-8 encodes an uchar in the buffer, assumes u is valid code point. *)
      let buf c = Buffer.add_char b (Char.chr c) in
      if u <= 0x007F then
        (buf u)
      else if u <= 0x07FF then
        (buf (0xC0 lor (u lsr 6));
         buf (0x80 lor (u land 0x3F)))
      else if u <= 0xFFFF then
        (buf (0xE0 lor (u lsr 12));
         buf (0x80 lor ((u lsr 6) land 0x3F));
         buf (0x80 lor (u land 0x3F)))
      else
        (buf (0xF0 lor (u lsr 18));
         buf (0x80 lor ((u lsr 12) land 0x3F));
         buf (0x80 lor ((u lsr 6) land 0x3F));
         buf (0x80 lor (u land 0x3F)))
    with Failure _ -> raise Full

  let clear b = Buffer.clear b
  let contents = Buffer.contents
  let length = Buffer.length
end
(* Applies the functor to the [String] and [Buffer] modules in scope and
   includes the result at the top level of this file. *)
include Make(String) (Buffer)
(* Pretty printers *)
(* Short name for [Format.fprintf]. *)
let pp = Format.fprintf

(* [pp_list ~pp_sep pp_v ppf l] prints the elements of [l] with [pp_v],
   separated by [pp_sep] (a cut hint by default). *)
let rec pp_list ?(pp_sep = Format.pp_print_cut) pp_v ppf = function
  | [] -> ()
  | [ v ] -> pp_v ppf v
  | v :: vs ->
      pp_v ppf v;
      pp_sep ppf ();
      pp_list ~pp_sep pp_v ppf vs

(* Prints a name as "p:l", or "l" if the first component is empty. *)
let pp_name ppf (p, l) =
  match p with
  | "" -> pp ppf "%s" l
  | _ -> pp ppf "%s:%s" p l

(* Prints an attribute as a (name, quoted value) pair. *)
let pp_attribute ppf (n, v) = pp ppf "@[<1>(%a,@,%S)@]" pp_name n v

(* Prints a tag as a (name, attribute list) pair. *)
let pp_tag ppf (name, atts) =
  let semi ppf () = pp ppf ";@ " in
  let pp_atts = pp_list ~pp_sep:semi pp_attribute in
  pp ppf "@[<1>(%a,@,@[<1>[%a]@])@]" pp_name name pp_atts atts

(* Prints an optional declaration using OCaml option syntax. *)
let pp_dtd ppf dtd =
  match dtd with
  | Some text -> pp ppf "@[<1>(Some@ %S)@]" text
  | None -> pp ppf "None"

(* Prints a signal using OCaml polymorphic variant syntax. *)
let pp_signal ppf = function
  | `Dtd dtd -> pp ppf "@[`Dtd %a@]" pp_dtd dtd
  | `El_start tag -> pp ppf "@[`El_start %a@]" pp_tag tag
  | `El_end -> pp ppf "`El_end"
  | `Data s -> pp ppf "@[`Data %S@]" s
(* Copyright (c) 2007 Daniel C. Bünzli

   Permission to use, copy, modify, and/or distribute this software for any
   purpose with or without fee is hereby granted, provided that the above
   copyright notice and this permission notice appear in all copies. *)