diff --git a/Makefile b/Makefile index 96aec69..44b800e 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Build the cpdf command line tools and top level -MODS = cpdfstrftime cpdf cpdfcommand +MODS = xmlm cpdfstrftime cpdf cpdfcommand SOURCES = $(foreach x,$(MODS),$(x).ml $(x).mli) cpdfcommandrun.ml diff --git a/xmlm.ml b/xmlm.ml new file mode 100755 index 0000000..9f78b4e --- /dev/null +++ b/xmlm.ml @@ -0,0 +1,1232 @@ +(*--------------------------------------------------------------------------- + Copyright 2007 Daniel C. Bünzli. All rights reserved. + Distributed under a BSD3 license, see license at the end of the file. + %%NAME%% release %%VERSION%% + ---------------------------------------------------------------------------*) + +module Std_string = String +module Std_buffer = Buffer + +type std_string = string +type std_buffer = Buffer.t + +module type String = sig + type t + val empty : t + val length : t -> int + val append : t -> t -> t + val lowercase : t -> t + val iter : (int -> unit) -> t -> unit + val of_string : std_string -> t + val to_utf_8 : ('a -> std_string -> 'a) -> 'a -> t -> 'a + val compare : t -> t -> int +end + +module type Buffer = sig + type string + type t + exception Full + val create : int -> t + val add_uchar : t -> int -> unit + val clear : t -> unit + val contents : t -> string + val length : t -> int +end + +module type S = sig + type string + type encoding = [ + | `UTF_8 | `UTF_16 | `UTF_16BE | `UTF_16LE | `ISO_8859_1 | `US_ASCII ] + type dtd = string option + type name = string * string + type attribute = name * string + type tag = name * attribute list + type signal = [ `Dtd of dtd | `El_start of tag | `El_end | `Data of string ] + + val ns_xml : string + val ns_xmlns : string + + type pos = int * int + type error = [ + | `Max_buffer_size + | `Unexpected_eoi + | `Malformed_char_stream + | `Unknown_encoding of string + | `Unknown_entity_ref of string + | `Unknown_ns_prefix of string + | `Illegal_char_ref of string + | `Illegal_char_seq of string + | `Expected_char_seqs of string list * string + | `Expected_root_element ] + + exception Error of pos * error + val error_message : error -> string + + type source = [ + | `Channel of in_channel + | `String of int * std_string + | `Fun of (unit -> int) ] + + type input + + val make_input : ?enc:encoding option -> ?strip:bool -> + ?ns:(string -> string option) -> + ?entity: (string -> string option) -> source -> input + + val input : input -> signal + + val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) -> + input -> 'a + + val input_doc_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) -> + input -> (dtd * 'a) + + val peek : input -> signal + val eoi : input -> bool + val pos : input -> pos + + type 'a frag = [ `El of tag * 'a list | `Data of string ] + type dest = [ + | `Channel of out_channel | `Buffer of std_buffer | `Fun of (int -> unit) ] + + type output + val make_output : ?decl:bool -> ?nl:bool -> ?indent:int option -> + ?ns_prefix:(string -> string option) -> dest -> output + + val output_depth : output -> int + val output : output -> signal -> unit + val output_tree : ('a -> 'a frag) -> output -> 'a -> unit + val output_doc_tree : ('a -> 'a frag) -> output -> (dtd * 'a) -> unit +end + + +(* Unicode character lexers *) + +exception Malformed (* for character stream, internal only. *) + +let utf8_len = [| (* Char byte length according to first UTF-8 byte. *) + 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; + 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; + 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; + 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; + 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; 1; + 1; 1; 1; 1; 1; 1; 1; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; + 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; + 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; + 0; 0; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; 2; + 2; 2; 2; 2; 2; 2; 2; 2; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; 3; + 4; 4; 4; 4; 4; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0 |] + +let uchar_utf8 i = + let b0 = i () in + begin match utf8_len.(b0) with + | 0 -> raise Malformed + | 1 -> b0 + | 2 -> + let b1 = i () in + if b1 lsr 6 != 0b10 then raise Malformed else + ((b0 land 0x1F) lsl 6) lor (b1 land 0x3F) + | 3 -> + let b1 = i () in + let b2 = i () in + if b2 lsr 6 != 0b10 then raise Malformed else + begin match b0 with + | 0xE0 -> if b1 < 0xA0 || 0xBF < b1 then raise Malformed else () + | 0xED -> if b1 < 0x80 || 0x9F < b1 then raise Malformed else () + | _ -> if b1 lsr 6 != 0b10 then raise Malformed else () + end; + ((b0 land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F) + | 4 -> + let b1 = i () in + let b2 = i () in + let b3 = i () in + if b3 lsr 6 != 0b10 || b2 lsr 6 != 0b10 then raise Malformed else + begin match b0 with + | 0xF0 -> if b1 < 0x90 || 0xBF < b1 then raise Malformed else () + | 0xF4 -> if b1 < 0x80 || 0x8F < b1 then raise Malformed else () + | _ -> if b1 lsr 6 != 0b10 then raise Malformed else () + end; + ((b0 land 0x07) lsl 18) lor ((b1 land 0x3F) lsl 12) lor + ((b2 land 0x3F) lsl 6) lor (b3 land 0x3F) + | _ -> assert false + end + +let int16_be i = + let b0 = i () in + let b1 = i () in + (b0 lsl 8) lor b1 + +let int16_le i = + let b0 = i () in + let b1 = i () in + (b1 lsl 8) lor b0 + +let uchar_utf16 int16 i = + let c0 = int16 i in + if c0 < 0xD800 || c0 > 0xDFFF then c0 else + if c0 > 0xDBFF then raise Malformed else + let c1 = int16 i in + (((c0 land 0x3FF) lsl 10) lor (c1 land 0x3FF)) + 0x10000 + +let uchar_utf16be = uchar_utf16 int16_be +let uchar_utf16le = uchar_utf16 int16_le +let uchar_byte i = i () +let uchar_iso_8859_1 i = i () +let uchar_ascii i = let b = i () in if b > 127 then raise Malformed else b + +(* Functorized streaming XML IO *) + +module Make (String : String) (Buffer : Buffer with type string = String.t) = +struct + type string = String.t + + let str = String.of_string + let str_eq s s' = (compare s s') = 0 + let str_empty s = (compare s String.empty) = 0 + let cat = String.append + let str_of_char u = + let b = Buffer.create 4 in + Buffer.add_uchar b u; + Buffer.contents b + + module Ht = Hashtbl.Make (struct type t = string + let equal = str_eq + let hash = Hashtbl.hash end) + + let u_nl = 0x000A (* newline *) + let u_cr = 0x000D (* carriage return *) + let u_space = 0x0020 (* space *) + let u_quot = 0x0022 (* quote *) + let u_sharp = 0x0023 (* # *) + let u_amp = 0x0026 (* & *) + let u_apos = 0x0027 (* ' *) + let u_minus = 0x002D (* - *) + let u_slash = 0x002F (* / *) + let u_colon = 0x003A (* : *) + let u_scolon = 0x003B (* ; *) + let u_lt = 0x003C (* < *) + let u_eq = 0x003D (* = *) + let u_gt = 0x003E (* > *) + let u_qmark = 0x003F (* ? *) + let u_emark = 0x0021 (* ! *) + let u_lbrack = 0x005B (* [ *) + let u_rbrack = 0x005D (* ] *) + let u_x = 0x0078 (* x *) + let u_bom = 0xFEFF (* BOM *) + let u_9 = 0x0039 (* 9 *) + let u_F = 0x0046 (* F *) + let u_D = 0X0044 (* D *) + + let s_cdata = str "CDATA[" + let ns_xml = str "http://www.w3.org/XML/1998/namespace" + let ns_xmlns = str "http://www.w3.org/2000/xmlns/" + let n_xml = str "xml" + let n_xmlns = str "xmlns" + let n_space = str "space" + let n_version = str "version" + let n_encoding = str "encoding" + let n_standalone = str "standalone" + let v_yes = str "yes" + let v_no = str "no" + let v_preserve = str "preserve" + let v_default = str "default" + let v_version_1_0 = str "1.0" + let v_version_1_1 = str "1.1" + let v_utf_8 = str "utf-8" + let v_utf_16 = str "utf-16" + let v_utf_16be = str "utf-16be" + let v_utf_16le = str "utf-16le" + let v_iso_8859_1 = str "iso-8859-1" + let v_us_ascii = str "us-ascii" + let v_ascii = str "ascii" + + let name_str (p,l) = if str_empty p then l else cat p (cat (str ":") l) + + (* Basic types and values *) + + type encoding = [ + | `UTF_8 | `UTF_16 | `UTF_16BE | `UTF_16LE | `ISO_8859_1 | `US_ASCII ] + type dtd = string option + type name = string * string + type attribute = name * string + type tag = name * attribute list + type signal = [ `Dtd of dtd | `El_start of tag | `El_end | `Data of string ] + + (* Input *) + + type pos = int * int + type error = [ + | `Max_buffer_size + | `Unexpected_eoi + | `Malformed_char_stream + | `Unknown_encoding of string + | `Unknown_entity_ref of string + | `Unknown_ns_prefix of string + | `Illegal_char_ref of string + | `Illegal_char_seq of string + | `Expected_char_seqs of string list * string + | `Expected_root_element ] + + exception Error of pos * error + + let error_message e = + let bracket l v r = cat (str l) (cat v (str r)) in + match e with + | `Expected_root_element -> str "expected root element" + | `Max_buffer_size -> str "maximal buffer size exceeded" + | `Unexpected_eoi -> str "unexpected end of input" + | `Malformed_char_stream -> str "malformed character stream" + | `Unknown_encoding e -> bracket "unknown encoding (" e ")" + | `Unknown_entity_ref e -> bracket "unknown entity reference (" e ")" + | `Unknown_ns_prefix e -> bracket "unknown namespace prefix (" e ")" + | `Illegal_char_ref s -> bracket "illegal character reference (#" s ")" + | `Illegal_char_seq s -> + bracket "character sequence illegal here (\"" s "\")" + | `Expected_char_seqs (exps, fnd) -> + let exps = + let exp acc v = cat acc (bracket "\"" v "\", ") in + List.fold_left exp String.empty exps + in + cat (str "expected one of these character sequence: ") + (cat exps (bracket "found \"" fnd "\"")) + + type limit = (* XML is odd to parse. *) + | Stag of name (* '<' qname *) + | Etag of name (* ' int) ] + + type input = + { enc : encoding option; (* Expected encoding. *) + strip : bool; (* Whitespace stripping default behaviour. *) + fun_ns : string -> string option; (* Namespace callback. *) + fun_entity : string -> string option; (* Entity reference callback. *) + i : unit -> int; (* Byte level input. *) + mutable uchar : (unit -> int) -> int; (* Unicode character lexer. *) + mutable c : int; (* Character lookahead. *) + mutable cr : bool; (* True if last u was '\r'. *) + mutable line : int; (* Current line number. *) + mutable col : int; (* Current column number. *) + mutable limit : limit; (* Last parsed limit. *) + mutable peek : signal; (* Signal lookahead. *) + mutable stripping : bool; (* True if stripping whitespace. *) + mutable last_white : bool; (* True if last char was white. *) + mutable scopes : (name * string list * bool) list; + (* Stack of qualified el. name, bound prefixes and strip behaviour. *) + ns : string Ht.t; (* prefix -> uri bindings. *) + ident : Buffer.t; (* Buffer for names and entity refs. *) + data : Buffer.t; } (* Buffer for character and attribute data. *) + + let err_input_tree = "input signal not `El_start or `Data" + let err_input_doc_tree = "input signal not `Dtd" + let err i e = raise (Error ((i.line, i.col), e)) + let err_illegal_char i u = err i (`Illegal_char_seq (str_of_char u)) + let err_expected_seqs i exps s = err i (`Expected_char_seqs (exps, s)) + let err_expected_chars i exps = + err i (`Expected_char_seqs (List.map str_of_char exps, str_of_char i.c)) + + let u_eoi = max_int + let u_start_doc = u_eoi - 1 + let u_end_doc = u_start_doc - 1 + let signal_start_stream = `Data String.empty + + let make_input ?(enc = None) ?(strip = false) ?(ns = fun _ -> None) + ?(entity = fun _ -> None) src = + let i = match src with + | `Fun f -> f + | `Channel ic -> fun () -> input_byte ic + | `String (pos, s) -> + let len = Std_string.length s in + let pos = ref (pos - 1) in + fun () -> + incr pos; + if !pos = len then raise End_of_file else + Char.code (Std_string.get s !pos) + in + let bindings = + let h = Ht.create 15 in + Ht.add h String.empty String.empty; + Ht.add h n_xml ns_xml; + Ht.add h n_xmlns ns_xmlns; + h + in + { enc = enc; strip = strip; fun_ns = ns; fun_entity = entity; + i = i; uchar = uchar_byte; c = u_start_doc; cr = false; + line = 1; col = 0; limit = Text; peek = signal_start_stream; + stripping = strip; last_white = true; scopes = []; ns = bindings; + ident = Buffer.create 64; data = Buffer.create 1024; } + + (* Bracketed non-terminals in comments refer to XML 1.0 non terminals *) + + let r : int -> int -> int -> bool = fun u a b -> a <= u && u <= b + let is_white = function 0x0020 | 0x0009 | 0x000D | 0x000A -> true | _ -> false + + let is_char = function (* {Char} *) + | u when r u 0x0020 0xD7FF -> true + | 0x0009 | 0x000A | 0x000D -> true + | u when r u 0xE000 0xFFFD || r u 0x10000 0x10FFFF -> true + | _ -> false + + let is_digit u = r u 0x0030 0x0039 + let is_hex_digit u = + r u 0x0030 0x0039 || r u 0x0041 0x0046 || r u 0x0061 0x0066 + + let comm_range u = (* common to functions below *) + r u 0x00C0 0x00D6 || r u 0x00D8 0x00F6 || r u 0x00F8 0x02FF || + r u 0x0370 0x037D || r u 0x037F 0x1FFF || r u 0x200C 0x200D || + r u 0x2070 0x218F || r u 0x2C00 0x2FEF || r u 0x3001 0xD7FF || + r u 0xF900 0xFDCF || r u 0xFDF0 0xFFFD || r u 0x10000 0xEFFFF + + let is_name_start_char = function (* {NameStartChar} - ':' (XML 1.1) *) + | u when r u 0x0061 0x007A || r u 0x0041 0x005A -> true (* [a-z] | [A-Z] *) + | u when is_white u -> false + | 0x005F -> true (* '_' *) + | u when comm_range u -> true + | _ -> false + + let is_name_char = function (* {NameChar} - ':' (XML 1.1) *) + | u when r u 0x0061 0x007A || r u 0x0041 0x005A -> true (* [a-z] | [A-Z] *) + | u when is_white u -> false + | u when r u 0x0030 0x0039 -> true (* [0-9] *) + | 0x005F | 0x002D | 0x002E | 0x00B7 -> true (* '_' '-' '.' *) + | u when comm_range u || r u 0x0300 0x036F || r u 0x203F 0x2040 -> true + | _ -> false + + let rec nextc i = + if i.c = u_eoi then err i `Unexpected_eoi; + if i.c = u_nl then (i.line <- i.line + 1; i.col <- 1) + else i.col <- i.col + 1; + i.c <- i.uchar i.i; + if not (is_char i.c) then raise Malformed; + if i.cr && i.c = u_nl then i.c <- i.uchar i.i; (* cr nl business *) + if i.c = u_cr then (i.cr <- true; i.c <- u_nl) else i.cr <- false + + let nextc_eof i = try nextc i with End_of_file -> i.c <- u_eoi + let skip_white i = while (is_white i.c) do nextc i done + let skip_white_eof i = while (is_white i.c) do nextc_eof i done + let accept i c = if i.c = c then nextc i else err_expected_chars i [ c ] + + let clear_ident i = Buffer.clear i.ident + let clear_data i = Buffer.clear i.data + let addc_ident i c = Buffer.add_uchar i.ident c + let addc_data i c = Buffer.add_uchar i.data c + + let addc_data_strip i c = + if is_white c then i.last_white <- true else + begin + if i.last_white && Buffer.length i.data <> 0 then addc_data i u_space; + i.last_white <- false; + addc_data i c + end + + let expand_name i (prefix, local) = + let external_ prefix = match i.fun_ns prefix with + | None -> err i (`Unknown_ns_prefix prefix) + | Some uri -> uri + in + try + let uri = Ht.find i.ns prefix in + if not (str_empty uri) then (uri, local) else + if str_empty prefix then String.empty, local else + (external_ prefix), local (* unbound with xmlns:prefix="" *) + with Not_found -> external_ prefix, local + + let find_encoding i = (* Encoding mess. *) + let reset uchar i = i.uchar <- uchar; i.col <- 0; nextc i in + match i.enc with + | None -> (* User doesn't know encoding. *) + begin match nextc i; i.c with + | 0xFE -> (* UTF-16BE BOM. *) + nextc i; if i.c <> 0xFF then err i `Malformed_char_stream; + reset uchar_utf16be i; + true + | 0xFF -> (* UTF-16LE BOM. *) + nextc i; if i.c <> 0xFE then err i `Malformed_char_stream; + reset uchar_utf16le i; + true + | 0xEF -> (* UTF-8 BOM. *) + nextc i; if i.c <> 0xBB then err i `Malformed_char_stream; + nextc i; if i.c <> 0xBF then err i `Malformed_char_stream; + reset uchar_utf8 i; + true + | 0x3C | _ -> (* UTF-8 or other, try declaration. *) + i.uchar <- uchar_utf8; + false + end + | Some e -> (* User knows encoding. *) + begin match e with + | `US_ASCII -> reset uchar_ascii i + | `ISO_8859_1 -> reset uchar_iso_8859_1 i + | `UTF_8 -> (* Skip BOM if present. *) + reset uchar_utf8 i; if i.c = u_bom then (i.col <- 0; nextc i) + | `UTF_16 -> (* Which UTF-16 ? look BOM. *) + let b0 = nextc i; i.c in + let b1 = nextc i; i.c in + begin match b0, b1 with + | 0xFE, 0xFF -> reset uchar_utf16be i + | 0xFF, 0xFE -> reset uchar_utf16le i + | _ -> err i `Malformed_char_stream; + end + | `UTF_16BE -> (* Skip BOM if present. *) + reset uchar_utf16be i; if i.c = u_bom then (i.col <- 0; nextc i) + | `UTF_16LE -> + reset uchar_utf16le i; if i.c = u_bom then (i.col <- 0; nextc i) + end; + true (* Ignore xml declaration. *) + + + let p_ncname i = (* {NCName} (Namespace 1.1) *) + clear_ident i; + if not (is_name_start_char i.c) then err_illegal_char i i.c else + begin + addc_ident i i.c; nextc i; + while is_name_char i.c do addc_ident i i.c; nextc i done; + Buffer.contents i.ident + end + + let p_qname i = (* {QName} (Namespace 1.1) *) + let n = p_ncname i in + if i.c <> u_colon then (String.empty, n) else (nextc i; (n, p_ncname i)) + + let p_charref i = (* {CharRef}, '&' was eaten. *) + let c = ref 0 in + clear_ident i; + nextc i; + if i.c = u_scolon then err i (`Illegal_char_ref String.empty) else + begin + try + if i.c = u_x then + begin + addc_ident i i.c; + nextc i; + while (i.c <> u_scolon) do + addc_ident i i.c; + if not (is_hex_digit i.c) then raise Exit else + c := !c * 16 + (if i.c <= u_9 then i.c - 48 else + if i.c <= u_F then i.c - 55 else + i.c - 87); + nextc i; + done + end + else + while (i.c <> u_scolon) do + addc_ident i i.c; + if not (is_digit i.c) then raise Exit else + c := !c * 10 + (i.c - 48); + nextc i + done + with Exit -> + c := -1; while i.c <> u_scolon do addc_ident i i.c; nextc i done + end; + nextc i; + if is_char !c then (clear_ident i; addc_ident i !c; Buffer.contents i.ident) + else err i (`Illegal_char_ref (Buffer.contents i.ident)) + + let predefined_entities = + let h = Ht.create 5 in + let e k v = Ht.add h (str k) (str v) in + e "lt" "<"; e "gt" ">"; e "amp" "&"; e "apos" "'"; e "quot" "\""; + h + + let p_entity_ref i = (* {EntityRef}, '&' was eaten. *) + let ent = p_ncname i in + accept i u_scolon; + try Ht.find predefined_entities ent with Not_found -> + match i.fun_entity ent with + | Some s -> s + | None -> err i (`Unknown_entity_ref ent) + + let p_reference i = (* {Reference} *) + nextc i; if i.c = u_sharp then p_charref i else p_entity_ref i + + let p_attr_value i = (* {S}? {AttValue} *) + skip_white i; + let delim = + if i.c = u_quot || i.c = u_apos then i.c else + err_expected_chars i [ u_quot; u_apos] + in + nextc i; + skip_white i; + clear_data i; + i.last_white <- true; + while (i.c <> delim) do + if i.c = u_lt then err_illegal_char i u_lt else + if i.c = u_amp then String.iter (addc_data_strip i) (p_reference i) + else (addc_data_strip i i.c; nextc i) + done; + nextc i; + Buffer.contents i.data + + let p_attributes i = (* ({S} {Attribute})* {S}? *) + let rec aux i pre_acc acc = + if not (is_white i.c) then pre_acc, acc else + begin + skip_white i; + if i.c = u_slash || i.c = u_gt then pre_acc, acc else + begin + let (prefix, local) as n = p_qname i in + let v = skip_white i; accept i u_eq; p_attr_value i in + let att = n, v in + if str_empty prefix && str_eq local n_xmlns then + begin (* xmlns *) + Ht.add i.ns String.empty v; + aux i (String.empty :: pre_acc) (att :: acc) + end + else if str_eq prefix n_xmlns then + begin (* xmlns:local *) + Ht.add i.ns local v; + aux i (local :: pre_acc) (att :: acc) + end + else if str_eq prefix n_xml && str_eq local n_space then + begin (* xml:space *) + if str_eq v v_preserve then i.stripping <- false else + if str_eq v v_default then i.stripping <- i.strip else (); + aux i pre_acc (att :: acc) + end + else + aux i pre_acc (att :: acc) + end + end + in + aux i [] [] (* Returns a list of bound prefixes and attributes *) + + let p_limit i = (* Parses a markup limit *) + i.limit <- + if i.c = u_eoi then Eoi else + if i.c <> u_lt then Text else + begin + nextc i; + if i.c = u_qmark then (nextc i; Pi (p_qname i)) else + if i.c = u_slash then + begin + nextc i; + let n = p_qname i in + skip_white i; + Etag n + end + else if i.c = u_emark then + begin + nextc i; + if i.c = u_minus then (nextc i; accept i u_minus; Comment) else + if i.c = u_D then Dtd else + if i.c = u_lbrack then + begin + nextc i; + clear_ident i; + for k = 1 to 6 do (addc_ident i i.c; nextc i) done; + let cdata = Buffer.contents i.ident in + if str_eq cdata s_cdata then Cdata else + err_expected_seqs i [ s_cdata ] cdata + end + else + err i (`Illegal_char_seq (cat (str " u_minus) do nextc i done; + nextc i; + if i.c <> u_minus then skip_comment i else + begin + nextc i; + if i.c <> u_gt then err_expected_chars i [ u_gt ]; + nextc_eof i + end + + let rec skip_pi i = (* {PI}, ' u_qmark) do nextc i done; + nextc i; + if i.c <> u_gt then skip_pi i else nextc_eof i + + let rec skip_misc i ~allow_xmlpi = match i.limit with (* {Misc}* *) + | Pi (p,l) when (str_empty p && str_eq n_xml (String.lowercase l)) -> + if allow_xmlpi then () else err i (`Illegal_char_seq l) + | Pi _ -> skip_pi i; p_limit i; skip_misc i ~allow_xmlpi + | Comment -> skip_comment i; p_limit i; skip_misc i ~allow_xmlpi + | Text when is_white i.c -> + skip_white_eof i; p_limit i; skip_misc i ~allow_xmlpi + | _ -> () + + let p_chardata addc i = (* {CharData}* ({Reference}{Chardata})* *) + while (i.c <> u_lt) do + if i.c = u_amp then String.iter (addc i) (p_reference i) + else if i.c = u_rbrack then + begin + addc i i.c; + nextc i; + if i.c = u_rbrack then begin + addc i i.c; + nextc i; (* detects ']'*']]>' *) + while (i.c = u_rbrack) do addc i i.c; nextc i done; + if i.c = u_gt then err i (`Illegal_char_seq (str "]]>")); + end + end + else + (addc i i.c; nextc i) + done + + let rec p_cdata addc i = (* {CData} {CDEnd} *) + try while (true) do + if i.c = u_rbrack then begin + nextc i; + while i.c = u_rbrack do + nextc i; + if i.c = u_gt then (nextc i; raise Exit); + addc i u_rbrack + done; + addc i u_rbrack; + end; + addc i i.c; + nextc i; + done with Exit -> () + + let p_xml_decl i ~ignore_enc ~ignore_utf16 = (* {XMLDecl}? *) + let yes_no = [v_yes; v_no] in + let p_val i = skip_white i; accept i u_eq; skip_white i; p_attr_value i in + let p_val_exp i exp = + let v = p_val i in + if not (List.exists (str_eq v) exp) then err_expected_seqs i exp v + in + match i.limit with + | Pi (p, l) when (str_empty p && str_eq l n_xml) -> + let v = skip_white i; p_ncname i in + if not (str_eq v n_version) then err_expected_seqs i [ n_version ] v; + p_val_exp i [v_version_1_0; v_version_1_1]; + skip_white i; + if i.c <> u_qmark then begin + let n = p_ncname i in + if str_eq n n_encoding then begin + let enc = String.lowercase (p_val i) in + if not ignore_enc then begin + if str_eq enc v_utf_8 then i.uchar <- uchar_utf8 else + if str_eq enc v_utf_16be then i.uchar <- uchar_utf16be else + if str_eq enc v_utf_16le then i.uchar <- uchar_utf16le else + if str_eq enc v_iso_8859_1 then i.uchar <- uchar_iso_8859_1 else + if str_eq enc v_us_ascii then i.uchar <- uchar_ascii else + if str_eq enc v_ascii then i.uchar <- uchar_ascii else + if str_eq enc v_utf_16 then + if ignore_utf16 then () else (err i `Malformed_char_stream) + (* A BOM should have been found. *) + else + err i (`Unknown_encoding enc) + end; + skip_white i; + if i.c <> u_qmark then begin + let n = p_ncname i in + if str_eq n n_standalone then p_val_exp i yes_no else + err_expected_seqs i [ n_standalone; str "?>" ] n + end + end + else if str_eq n n_standalone then + p_val_exp i yes_no + else + err_expected_seqs i [ n_encoding; n_standalone; str "?>" ] n + end; + skip_white i; + accept i u_qmark; + accept i u_gt; + p_limit i + | _ -> () + + let p_dtd_signal i =(* {Misc}* {doctypedecl} {Misc}* *) + skip_misc i ~allow_xmlpi:false; + if i.limit <> Dtd then `Dtd None else + begin + let buf = addc_data i in + let nest = ref 1 in + clear_data i; + buf u_lt; buf u_emark; (* add eaten " 0) do + if i.c = u_lt then + begin + nextc i; + if i.c <> u_emark then + (buf u_lt; incr nest) + else + begin + nextc i; + if i.c <> u_minus then (* Carefull with comments ! *) + (buf u_lt; buf u_emark; incr nest) + else + begin + nextc i; + if i.c <> u_minus then + (buf u_lt; buf u_emark; buf u_minus; incr nest) + else + (nextc i; skip_comment i) + end + end + end + else if i.c = u_quot || i.c = u_apos then + begin + let c = i.c in + buf c; nextc i; + while (i.c <> c) do (buf i.c; nextc i) done; + buf c; nextc i + end + else if i.c = u_gt then (buf u_gt; nextc i; decr nest) + else (buf i.c; nextc i) + done; + let dtd = Buffer.contents i.data in + p_limit i; + skip_misc i ~allow_xmlpi:false; + `Dtd (Some dtd); + end + + let p_data i = + let rec bufferize addc i = match i.limit with + | Text -> p_chardata addc i; p_limit i; bufferize addc i + | Cdata -> p_cdata addc i; p_limit i; bufferize addc i + | (Stag _ | Etag _) -> () + | Pi _ -> skip_pi i; p_limit i; bufferize addc i + | Comment -> skip_comment i; p_limit i; bufferize addc i + | Dtd -> err i (`Illegal_char_seq (str " err i `Unexpected_eoi + in + clear_data i; + i.last_white <- true; + bufferize (if i.stripping then addc_data_strip else addc_data) i; + let d = Buffer.contents i.data in + d + + let p_el_start_signal i n = + let expand_att (((prefix, local) as n, v) as att) = + if not (str_eq prefix String.empty) then expand_name i n, v else + if str_eq local n_xmlns then (ns_xmlns, n_xmlns), v else + att (* default namespaces do not influence attributes. *) + in + let strip = i.stripping in (* save it here, p_attributes may change it. *) + let prefixes, atts = p_attributes i in + i.scopes <- (n, prefixes, strip) :: i.scopes; + `El_start ((expand_name i n), List.rev_map expand_att atts) + + let p_el_end_signal i n = match i.scopes with + | (n', prefixes, strip) :: scopes -> + if i.c <> u_gt then err_expected_chars i [ u_gt ]; + if not (str_eq n n') then err_expected_seqs i [name_str n'] (name_str n); + i.scopes <- scopes; + i.stripping <- strip; + List.iter (Ht.remove i.ns) prefixes; + if scopes = [] then i.c <- u_end_doc else (nextc i; p_limit i); + `El_end + | _ -> assert false + + let p_signal i = + if i.scopes = [] then + match i.limit with + | Stag n -> p_el_start_signal i n + | _ -> err i `Expected_root_element + else + let rec find i = match i.limit with + | Stag n -> p_el_start_signal i n + | Etag n -> p_el_end_signal i n + | Text | Cdata -> + let d = p_data i in + if str_empty d then find i else `Data d + | Pi _ -> skip_pi i; p_limit i; find i + | Comment -> skip_comment i; p_limit i; find i + | Dtd -> err i (`Illegal_char_seq (str " err i `Unexpected_eoi + in + begin match i.peek with + | `El_start (n, _) -> (* finish to input start el. *) + skip_white i; + if i.c = u_gt then (accept i u_gt; p_limit i) else + if i.c = u_slash then + begin + let tag = match i.scopes with + | (tag, _, _) :: _ -> tag | _ -> assert false + in + (nextc i; i.limit <- Etag tag) + end + else + err_expected_chars i [ u_slash; u_gt ] + | _ -> () + end; + find i + + let eoi i = + try + if i.c = u_eoi then true else + if i.c <> u_start_doc then false else (* In a document. *) + if i.peek <> `El_end then (* Start of document sequence. *) + begin + let ignore_enc = find_encoding i in + p_limit i; + p_xml_decl i ~ignore_enc ~ignore_utf16:false; + i.peek <- p_dtd_signal i; + false + end + else (* Subsequent documents. *) + begin + nextc_eof i; + p_limit i; + if i.c = u_eoi then true else + begin + skip_misc i ~allow_xmlpi:true; + if i.c = u_eoi then true else + begin + p_xml_decl i ~ignore_enc:false ~ignore_utf16:true; + i.peek <- p_dtd_signal i; + false + end + end + end + with + | Buffer.Full -> err i `Max_buffer_size + | Malformed -> err i `Malformed_char_stream + | End_of_file -> err i `Unexpected_eoi + + let peek i = if eoi i then err i `Unexpected_eoi else i.peek + + let input i = + try + if i.c = u_end_doc then (i.c <- u_start_doc; i.peek) else + let s = peek i in + i.peek <- p_signal i; + s + with + | Buffer.Full -> err i `Max_buffer_size + | Malformed -> err i `Malformed_char_stream + | End_of_file -> err i `Unexpected_eoi + + let input_tree ~el ~data i = match input i with + | `Data d -> data d + | `El_start tag -> + let rec aux i tags context = match input i with + | `El_start tag -> aux i (tag :: tags) ([] :: context) + | `El_end -> + begin match tags, context with + | tag :: tags', childs :: context' -> + let el = el tag (List.rev childs) in + begin match context' with + | parent :: context'' -> aux i tags' ((el :: parent) :: context'') + | [] -> el + end + | _ -> assert false + end + | `Data d -> + begin match context with + | childs :: context' -> aux i tags (((data d) :: childs) :: context') + | [] -> assert false + end + | `Dtd _ -> assert false + in + aux i (tag :: []) ([] :: []) + | _ -> invalid_arg err_input_tree + + + let input_doc_tree ~el ~data i = match input i with + | `Dtd d -> d, input_tree ~el ~data i + | _ -> invalid_arg err_input_doc_tree + + let pos i = i.line, i.col + + (* Output *) + + type 'a frag = [ `El of tag * 'a list | `Data of string ] + type dest = [ + | `Channel of out_channel | `Buffer of std_buffer | `Fun of (int -> unit) ] + + type output = + { decl : bool; (* True if the XML declaration should be output. *) + nl : bool; (* True if a newline is output at the end. *) + indent : int option; (* Optional indentation. *) + fun_prefix : string -> string option; (* Prefix callback. *) + prefixes : string Ht.t; (* uri -> prefix bindings. *) + outs : std_string -> int -> int -> unit; (* String output. *) + outc : char -> unit; (* character output. *) + mutable last_el_start : bool; (* True if last signal was `El_start *) + mutable scopes : (name * (string list)) list; + (* Qualified el. name and bound uris. *) + mutable depth : int; } (* Scope depth. *) + + let err_prefix uri = "unbound namespace (" ^ uri ^ ")" + let err_dtd = "dtd signal not allowed here" + let err_el_start = "start signal not allowed here" + let err_el_end = "end signal without matching start signal" + let err_data = "data signal not allowed here" + + let make_output ?(decl = true) ?(nl = false) ?(indent = None) + ?(ns_prefix = fun _ ->None) d = + let outs, outc = match d with + | `Channel c -> (output c), (output_char c) + | `Buffer b -> (Std_buffer.add_substring b), (Std_buffer.add_char b) + | `Fun f -> + let os s p l = + for i = p to p + l - 1 do f (Char.code (Std_string.get s i)) done + in + let oc c = f (Char.code c) in + os, oc + in + let prefixes = + let h = Ht.create 10 in + Ht.add h String.empty String.empty; + Ht.add h ns_xml n_xml; + Ht.add h ns_xmlns n_xmlns; + h + in + { decl = decl; outs = outs; outc = outc; nl = nl; indent = indent; + last_el_start = false; prefixes = prefixes; scopes = []; depth = -1; + fun_prefix = ns_prefix; } + + let output_depth o = o.depth + let outs o s = o.outs s 0 (Std_string.length s) + let str_utf_8 s = String.to_utf_8 (fun _ s -> s) "" s + let out_utf_8 o s = ignore (String.to_utf_8 (fun o s -> outs o s; o) o s) + + let prefix_name o (ns, local) = + try + if str_eq ns ns_xmlns && str_eq local n_xmlns then (String.empty, n_xmlns) + else (Ht.find o.prefixes ns, local) + with Not_found -> + match o.fun_prefix ns with + | None -> invalid_arg (err_prefix (str_utf_8 ns)) + | Some prefix -> prefix, local + + let bind_prefixes o atts = + let add acc ((ns, local), uri) = + if not (str_eq ns ns_xmlns) then acc else + begin + let prefix = if str_eq local n_xmlns then String.empty else local in + Ht.add o.prefixes uri prefix; + uri :: acc + end + in + List.fold_left add [] atts + + let out_data o s = + let out () s = + let len = Std_string.length s in + let start = ref 0 in + let last = ref 0 in + let escape e = + o.outs s !start (!last - !start); + outs o e; + incr last; + start := !last + in + while (!last < len) do match Std_string.get s !last with + | '<' -> escape "<" (* Escape markup delimiters. *) + | '>' -> escape ">" + | '&' -> escape "&" + (* | '\'' -> escape "'" *) (* Not needed we use \x22 for attributes. *) + | '\x22' -> escape """ + | '\n' | '\t' | '\r' -> incr last + | c when c < ' ' -> escape "\xEF\xBF\xBD" (* illegal, subst. by U+FFFD *) + | _ -> incr last + done; + o.outs s !start (!last - !start) + in + String.to_utf_8 out () s + + let out_qname o (p, l) = + if not (str_empty p) then (out_utf_8 o p; o.outc ':'); + out_utf_8 o l + + let out_attribute o (n, v) = + o.outc ' '; out_qname o (prefix_name o n); outs o "=\x22"; + out_data o v; + o.outc '\x22' + + let output o s = + let indent o = match o.indent with + | None -> () + | Some c -> for i = 1 to (o.depth * c) do o.outc ' ' done + in + let unindent o = match o.indent with None -> () | Some _ -> o.outc '\n' in + if o.depth = -1 then + begin match s with + | `Dtd d -> + if o.decl then outs o "\n"; + begin match d with + | Some dtd -> out_utf_8 o dtd; o.outc '\n' + | None -> () + end; + o.depth <- 0 + | `Data _ -> invalid_arg err_data + | `El_start _ -> invalid_arg err_el_start + | `El_end -> invalid_arg err_el_end + end + else + begin match s with + | `El_start (n, atts) -> + if o.last_el_start then (outs o ">"; unindent o); + indent o; + let uris = bind_prefixes o atts in + let qn = prefix_name o n in + o.outc '<'; out_qname o qn; List.iter (out_attribute o) atts; + o.scopes <- (qn, uris) :: o.scopes; + o.depth <- o.depth + 1; + o.last_el_start <- true + | `El_end -> + begin match o.scopes with + | (n, uris) :: scopes' -> + o.depth <- o.depth - 1; + if o.last_el_start then outs o "/>" else + begin + indent o; + outs o "'; + end; + o.scopes <- scopes'; + List.iter (Ht.remove o.prefixes) uris; + o.last_el_start <- false; + if o.depth = 0 then (if o.nl then o.outc '\n'; o.depth <- -1;) + else unindent o + | [] -> invalid_arg err_el_end + end + | `Data d -> + if o.last_el_start then (outs o ">"; unindent o); + indent o; + out_data o d; + unindent o; + o.last_el_start <- false + | `Dtd _ -> failwith err_dtd + end + + let output_tree frag o v = + let rec aux o = function + | (v :: rest) :: context -> + begin match frag v with + | `El (tag, childs) -> + output o (`El_start tag); + aux o (childs :: rest :: context) + | (`Data d) as signal -> + output o signal; + aux o (rest :: context) + end + | [] :: [] -> () + | [] :: context -> output o `El_end; aux o context + | [] -> assert false + in + aux o ([v] :: []) + + let output_doc_tree frag o (dtd, v) = + output o (`Dtd dtd); + output_tree frag o v + +end + +(* Default streaming XML IO *) + +module String = struct + type t = string + let empty = "" + let length = String.length + let append = ( ^ ) + let lowercase = String.lowercase + let iter f s = + let len = Std_string.length s in + let pos = ref ~-1 in + let i () = + incr pos; + if !pos = len then raise Exit else + Char.code (Std_string.get s !pos) + in + try while true do f (uchar_utf8 i) done with Exit -> () + + let of_string s = s + let to_utf_8 f v x = f v x + let compare = String.compare +end + +module Buffer = struct + type string = String.t + type t = Buffer.t + exception Full + let create = Buffer.create + let add_uchar b u = + try + (* UTF-8 encodes an uchar in the buffer, assumes u is valid code point. *) + let buf c = Buffer.add_char b (Char.chr c) in + if u <= 0x007F then + (buf u) + else if u <= 0x07FF then + (buf (0xC0 lor (u lsr 6)); + buf (0x80 lor (u land 0x3F))) + else if u <= 0xFFFF then + (buf (0xE0 lor (u lsr 12)); + buf (0x80 lor ((u lsr 6) land 0x3F)); + buf (0x80 lor (u land 0x3F))) + else + (buf (0xF0 lor (u lsr 18)); + buf (0x80 lor ((u lsr 12) land 0x3F)); + buf (0x80 lor ((u lsr 6) land 0x3F)); + buf (0x80 lor (u land 0x3F))) + with Failure _ -> raise Full + + let clear b = Buffer.clear b + let contents = Buffer.contents + let length = Buffer.length +end + +include Make(String) (Buffer) + +(* Pretty printers *) + +let pp = Format.fprintf +let rec pp_list ?(pp_sep = Format.pp_print_cut) pp_v ppf = function +| [] -> () +| v :: vs -> + pp_v ppf v; if vs <> [] then (pp_sep ppf (); pp_list ~pp_sep pp_v ppf vs) + +let pp_name ppf (p, l) = if p <> "" then pp ppf "%s:%s" p l else pp ppf "%s" l +let pp_attribute ppf (n, v) = pp ppf "@[<1>(%a,@,%S)@]" pp_name n v +let pp_tag ppf (name, atts) = + let pp_sep ppf () = pp ppf ";@ " in + pp ppf "@[<1>(%a,@,@[<1>[%a]@])@]" + pp_name name (pp_list ~pp_sep pp_attribute) atts + +let pp_dtd ppf = function +| None -> pp ppf "None" +| Some dtd -> pp ppf "@[<1>(Some@ %S)@]" dtd + +let pp_signal ppf = function +| `Data s -> pp ppf "@[`Data %S@]" s +| `El_end -> pp ppf "`El_end" +| `El_start tag -> pp ppf "@[`El_start %a@]" pp_tag tag +| `Dtd dtd -> pp ppf "@[`Dtd %a@]" pp_dtd dtd + +(*---------------------------------------------------------------------------- + Copyright 2007 Daniel C. Bünzli + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. + + 3. Neither the name of Daniel C. Bünzli nor the names of + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ---------------------------------------------------------------------------*) diff --git a/xmlm.mli b/xmlm.mli new file mode 100755 index 0000000..c38ee8d --- /dev/null +++ b/xmlm.mli @@ -0,0 +1,833 @@ +(*--------------------------------------------------------------------------- + Copyright 2007 Daniel C. Bünzli. All rights reserved. + Distributed under a BSD3 license, see license at the end of the file. + %%NAME%% release %%VERSION%% + ---------------------------------------------------------------------------*) + +(** Streaming XML codec. + + A well-formed sequence of {{!signal}signals} represents an + {{:http://www.w3.org/TR/REC-xml}XML} document tree traversal in + depth first order (this has nothing to do with XML + well-formedness). Input pulls a well-formed sequence of signals + from a data source and output pushes a well-formed sequence of + signals to a data destination. Functions are provided to easily + transform sequences of signals to/from arborescent data structures. + + Consult the {{!io}features and limitations} and {{!ex}examples} + of use. + + {e Release %%VERSION%% — %%MAINTAINER%% } + + {3 References} + {ul + {- Tim Bray. + {e {{:http://www.xml.com/axml/axml.html}The annotated XML Specification}}, + 1998.} + {- Tim Bray et al. + {e {{:http://www.w3.org/TR/xml-names11}Namespaces in XML 1.1 (2nd ed.)}}, + 2006.}} *) + +(** {1 Basic types and values} *) + +(** The type for character encodings. For [`UTF_16], endianness is + determined from the + {{:http://www.unicode.org/unicode/faq/utf_bom.html#BOM}BOM}. *) +type encoding = [ + | `UTF_8 + | `UTF_16 + (** Endianness determined from the + {{:http://www.unicode.org/unicode/faq/utf_bom.html#BOM}BOM}. *) + | `UTF_16BE + | `UTF_16LE + | `ISO_8859_1 + | `US_ASCII ] + +type dtd = string option +(** The type for the optional + {{:http://www.w3.org/TR/REC-xml/#dt-doctype}DTD}. *) + +type name = string * string +(** The type for attribute and element's + {{:http://www.w3.org/TR/xml-names11/#dt-expname}expanded names} + [(uri,local)]. An empty [uri] represents a name without a + namespace name, i.e. an unprefixed name + that is not under the scope of a default namespace. *) + +type attribute = name * string +(** The type for attributes. Name and attribute data. *) + +type tag = name * attribute list +(** The type for an element tag. Tag name and attribute list. *) + +type signal = [ `Dtd of dtd | `El_start of tag | `El_end | `Data of string ] +(** The type for signals. A {e well-formed} sequence of signals belongs + to the language of the [doc] grammar : + {[doc ::= `Dtd tree +tree ::= `El_start child `El_end +child ::= `Data | tree | epsilon ]} + Input and output deal only with well-formed sequences or + exceptions are raised. +*) + +val ns_xml : string +(** Namespace name {{:http://www.w3.org/XML/1998/namespace}value} bound to the + reserved ["xml"] prefix. *) + +val ns_xmlns : string +(** Namespace name {{:http://www.w3.org/2000/xmlns/}value} bound to the + reserved ["xmlns"] prefix. *) + +val pp_dtd : Format.formatter -> dtd -> unit +(** [pp_dtd ppf dtd] prints an unspecified representation of [dtd] on [ppf]. *) + +val pp_name : Format.formatter -> name -> unit +(** [pp_name ppf name] prints an unspecified representation of [name] on + [ppf]. *) + +val pp_attribute : Format.formatter -> attribute -> unit +(** [pp_attribute ppf att] prints an unspecified representation of [att] on + [ppf]. *) + +val pp_tag : Format.formatter -> tag -> unit +(** [pp_tag ppf tag] prints an unspecified representation of [tag] on + [ppf]. *) + +val pp_signal : Format.formatter -> signal -> unit +(** [pp_signal ppf s] prints an unspecified representation of [s] on + [ppf]. *) + +(** {1 Input} *) + +type pos = int * int +(** The type for input positions. Line and column number, both start + with 1. *) + +(** The type for input errors. *) +type error = [ + | `Max_buffer_size + (** Maximal buffer size exceeded ([Sys.max_string_length]). *) + | `Unexpected_eoi + (** Unexpected end of input. *) + | `Malformed_char_stream + (** Malformed underlying character stream. *) + | `Unknown_encoding of string + (** Unknown encoding. *) + | `Unknown_entity_ref of string + (** Unknown entity reference, {{!inentity} details}. *) + | `Unknown_ns_prefix of string + (** Unknown namespace prefix {{!inns} details} *) + | `Illegal_char_ref of string + (** Illegal character reference. *) + | `Illegal_char_seq of string + (** Illegal character sequence. *) + | `Expected_char_seqs of string list * string + (** Expected one of the character sequences in the list but found another. *) + | `Expected_root_element + (** Expected the document's root element. *) ] + +val error_message : error -> string +(** Converts the error to an english error message. *) + +exception Error of pos * error +(** Raised on input errors. *) + +type source = [ + | `Channel of in_channel | `String of int * string | `Fun of (unit -> int) ] +(** The type for input sources. For [`String] starts reading at the + given integer position. For [`Fun] the function must return the + next {e byte} as an [int] and raise [End_of_file] if there is no + such byte. *) + +type input +(** The type for input abstractions. *) + +val make_input : ?enc:encoding option -> ?strip:bool -> + ?ns:(string -> string option) -> + ?entity: (string -> string option) -> source -> input +(** Returns a new input abstraction reading from the given source. + {ul + {- [enc], character encoding of the document, {{!inenc} details}. + Defaults to [None].} + {- [strip], strips whitespace in character data, {{!inwspace} details}. + Defaults to [false].} + {- [ns] is called to bind undeclared namespace prefixes, + {{!inns} details}. Default returns always [None].} + {- [entity] is called to resolve non predefined entity references, + {{!inentity} details}. Default returns always [None].}} *) + +val input : input -> signal +(** Inputs a signal. Repeated invocation of the function with the same + input abstraction will generate a {{!signal}well-formed} sequence + of signals or an {!Error} is raised. Furthermore there will be no + two consecutive [`Data] signals in the sequence and their string + is always non empty. + + {b Deprecated} After a well-formed sequence was input another may + be input, see {!eoi} and {{!iseq}details}. + + {b Raises} {!Error} on input errors. *) + +val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) -> + input -> 'a +(** If the next signal is a : + {ul + {- [`Data] signal, inputs it and invokes [data] with the character data.} + {- [`El_start] signal, inputs the sequence of signals until its + matching [`El_end] and invokes [el] and [data] as follows + {ul + {- [el], is called on each [`El_end] signals with the corresponding + [`El_start] tag and the result of the callback invocation for the + element's children.} + {- [data], is called on each [`Data] signals with the character data. + This function won't be called twice consecutively or with the empty + string.}}} + {- Other signals, raises [Invalid_argument].}} + + {b Raises} {!Error} on input errors and [Invalid_argument] + if the next signal is not [`El_start] or [`Data]. *) + +val input_doc_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) -> + input -> (dtd * 'a) +(** Same as {!input_tree} but reads a complete {{!signal}well-formed} + sequence of signals. + + {b Raises} {!Error} on input errors and [Invalid_argument] + if the next signal is not [`Dtd]. *) + +val peek : input -> signal +(** Same as {!input} but doesn't remove the signal from the sequence. + + {b Raises} {!Error} on input errors. *) + +val eoi : input -> bool +(** Returns [true] if the end of input is reached. See {{!iseq}details}. + + {b Raises} {!Error} on input errors. *) + +val pos : input -> pos +(** Current position in the input abstraction. *) + +(** {1 Output} *) + +type 'a frag = [ `El of tag * 'a list | `Data of string ] +(** The type for deconstructing data structures of type ['a]. *) + +type dest = [ `Channel of out_channel | `Buffer of Buffer.t + | `Fun of (int -> unit) ] +(** The type for output destinations. For [`Buffer], the buffer won't + be cleared. For [`Fun] the function is called with the output {e + bytes} as [int]s. *) + +type output +(** The type for output abstractions. *) + +val make_output : ?decl:bool -> ?nl:bool -> ?indent:int option -> + ?ns_prefix:(string -> string option) -> dest -> output +(** Returns a new output abstraction writing to the given destination. + {ul + {- [decl], if [true] the {{:http://www.w3.org/TR/REC-xml/#NT-XMLDecl} XML + declaration} is output (defaults to [true]).} + {- [nl], if [true] a newline is output when the root's element [`El_end] + signal is output. + Defaults to [false].} + {- [indent], identation behaviour, see {{!outindent} details}. Defaults to + [None].} + {- [ns_prefix], undeclared namespace prefix bindings, + see {{!outns}details}. Default returns always [None].}} *) + +val output : output -> signal -> unit +(** Outputs a signal. + + {b Deprecated.} After a well-formed sequence of signals was output + a new well-formed sequence can be output. + + {b Raises} [Invalid_argument] if the resulting signal sequence on + the output abstraction is not {{!signal}well-formed} or if a + namespace name could not be bound to a prefix. *) + +val output_depth : output -> int +(** [output_depth o] is [o]'s current element nesting level (undefined + before the first [`El_start] and after the last [`El_end]). *) + +val output_tree : ('a -> 'a frag) -> output -> 'a -> unit +(** Outputs signals corresponding to a value by recursively + applying the given value deconstructor. + + {b Raises} see {!output}. *) + +val output_doc_tree : ('a -> 'a frag) -> output -> (dtd * 'a) -> unit +(** Same as {!output_tree} but outputs a complete {{!signal}well-formed} + sequence of signals. + + {b Raises} see {!output}. *) + +(** {1:sto Functorial interface (deprecated)} + + {b WARNING.} The functioral interface is deprecated and will be + removed. + + {!Make} allows client to specify types for strings and internal + buffers. Among other things this can be used to perform + hash-consing or to process the character stream, e.g. to normalize + unicode characters or to convert to a custom encoding. *) + +type std_string = string +type std_buffer = Buffer.t + +(** Input signature for strings. *) +module type String = sig + + type t + (** The type for strings. *) + + val empty : t + (** The empty string. *) + + val length : t -> int + (** Returns the length of the string. *) + + val append : t -> t -> t + (** Concatenates two strings. *) + + val lowercase : t -> t + (** New string with uppercase letter translated + to lowercase (correctness is only needed for ASCII + {{:http://www.unicode.org/glossary/#code_point}code point}). *) + + val iter : (int -> unit) -> t -> unit + (** Iterates over the unicode + {{:http://www.unicode.org/glossary/#code_point}code point} + of the given string. *) + + val of_string : std_string -> t + (** String from an OCaml string. *) + + val to_utf_8 : ('a -> std_string -> 'a) -> 'a -> t -> 'a + (** [to_utf_8 f v s], is [f (... (f (f v s1) s2) ...) sn]. Where the + concatenation of [s1], [s2], ... [sn] is [s] as an UTF-8 stream. *) + + val compare : t -> t -> int + (** String comparison. Binary comparison is sufficent. *) +end + +(** Input signature for internal buffers. *) +module type Buffer = sig + + type string + (** The type for strings. *) + + type t + (** The type for buffers. *) + + exception Full + (** Raised if the buffer cannot be grown. *) + + val create : int -> t + (** Creates a buffer of the given size. *) + + val add_uchar : t -> int -> unit + (** Adds the given (guaranteed valid) unicode + {{:http://www.unicode.org/glossary/#code_point}code point} to a + buffer. + + {b Raises} {!Full} if the buffer cannot be grown. *) + + val clear : t -> unit + (** Clears the buffer. *) + + val contents : t -> string + (** Returns the buffer contents. *) + + val length : t -> int + (** Returns the number of characters contained in the buffer. *) +end + +(** Output signature of {!Make}. *) +module type S = sig + + (** {1 Basic types and values} *) + + type string + + type encoding = [ + | `UTF_8 | `UTF_16 | `UTF_16BE | `UTF_16LE | `ISO_8859_1| `US_ASCII ] + type dtd = string option + type name = string * string + type attribute = name * string + type tag = name * attribute list + type signal = [ `Dtd of dtd | `El_start of tag | `El_end | `Data of string ] + + val ns_xml : string + val ns_xmlns : string + + (** {1 Input} *) + + type pos = int * int + type error = [ + | `Max_buffer_size + | `Unexpected_eoi + | `Malformed_char_stream + | `Unknown_encoding of string + | `Unknown_entity_ref of string + | `Unknown_ns_prefix of string + | `Illegal_char_ref of string + | `Illegal_char_seq of string + | `Expected_char_seqs of string list * string + | `Expected_root_element ] + + exception Error of pos * error + val error_message : error -> string + + type source = [ + | `Channel of in_channel + | `String of int * std_string + | `Fun of (unit -> int) ] + + type input + + val make_input : ?enc:encoding option -> ?strip:bool -> + ?ns:(string -> string option) -> + ?entity: (string -> string option) -> source -> input + + val input : input -> signal + + val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) -> + input -> 'a + + val input_doc_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) -> + input -> (dtd * 'a) + + val peek : input -> signal + val eoi : input -> bool + val pos : input -> pos + + (** {1 Output} *) + + type 'a frag = [ `El of tag * 'a list | `Data of string ] + type dest = [ + | `Channel of out_channel | `Buffer of std_buffer | `Fun of (int -> unit) ] + + type output + val make_output : ?decl:bool -> ?nl:bool -> ?indent:int option -> + ?ns_prefix:(string -> string option) -> dest -> output + + val output_depth : output -> int + val output : output -> signal -> unit + val output_tree : ('a -> 'a frag) -> output -> 'a -> unit + val output_doc_tree : ('a -> 'a frag) -> output -> (dtd * 'a) -> unit +end + +(** Functor building streaming XML IO with the given strings and buffers. *) +module Make (String : String) (Buffer : Buffer with type string = String.t) : S + with type string = String.t + +(** {1:io Features and limitations} + + The module assumes strings are immutable, thus strings + the client gives or receives {e during} the input and output process + must not be modified. + {2:input Input} + {3:inenc Encoding} + + The parser supports ASCII, US-ASCII, + {{:http://www.faqs.org/rfcs/rfc3629.html} UTF-8}, + {{:http://www.faqs.org/rfcs/rfc2781.html} UTF-16}, + {{:http://www.faqs.org/rfcs/rfc2781.html} UTF-16LE}, + {{:http://www.faqs.org/rfcs/rfc2781.html} UTF-16BE} and + {{:http://anubis.dkuug.dk/JTC1/SC2/WG3/docs/n411.pdf}ISO-8559-1} + (Latin-1) encoded documents. But strings returned by + the library are {b always} UTF-8 encoded. + + The encoding can be specified explicitly using the optional + argument [enc]. Otherwise the parser uses UTF-16 or UTF-8 if there is a + {{:http://www.unicode.org/unicode/faq/utf_bom.html#BOM}BOM} at the + beginning of the document. If there is no BOM it uses the encoding + specified in the {{:http://www.w3.org/TR/REC-xml/#NT-XMLDecl} XML + declaration}. Finally, if there is no XML declaration UTF-8 is assumed. + {3:inwspace White space handling} + + The parser performs + {{:http://www.w3.org/TR/REC-xml/#AVNormalize}attribute data + normalization} on {e every} attribute data. This means that + attribute data does not have leading and trailling white space and that + any white space is collapsed and transformed to a single space + character ([U+0020]). + + White space handling of character data depends on the [strip] + argument. If [strip] is [true], character data is treated like + attribute data, white space before and after elements is removed + and any white space is collapsed and transformed to a single + space character ([U+0020]), except if the data is under the scope of a {e + xml:space} attribute whose value is {e preserve}. If [strip] is + [false] all white space data is preserved as present in the + document (however all kinds of + {{:http://www.w3.org/TR/REC-xml/#sec-line-ends}line ends} are + translated to the newline character ([U+000A]). {3:inns Namespaces} + + Xmlm's {{!name}names} are + {{:http://www.w3.org/TR/xml-names11/#dt-expname}expanded names}. + The parser automatically handles the document's namespace + declarations. Undeclared namespace prefixes can be bound via the + callback [ns], which must return a namespace name. If [ns] returns + [None] an [`Unknown_ns_prefix] error is raised. + + Attributes used for namespace declarations are preserved by the + parser. They are in the {!ns_xmlns} namespace. Default namespace + declarations made with {i xmlns} have the attribute name + [(Xmlm.ns_xmlns, "xmlns")]. Prefix declarations have the prefix as + the local name, for example {i xmlns:ex} results in the attribute name + [(Xmlm.ns_xmlns, "ex")]. + + Regarding constraints on the usage of the {i xml} and {i xmlns} + prefixes by documents, the parser does not report errors on violations + of the {i must} constraints listed in + {{:http://www.w3.org/TR/xml-names11/#xmlReserved}this paragraph}. + + {3:inentity Character and entity references} + + {{:http://www.w3.org/TR/REC-xml/#dt-charref}Character references} + and {{:http://www.w3.org/TR/REC-xml/#sec-predefined-ent}predefined + entities} are automatically resolved. Other entity references can + be resolved by the callback [entity], which must return an UTF-8 + string corresponding to the + replacement character data. The replacement data is {e not} + analysed for further references, it is added to the data as such + modulo white space stripping. If [entity] returns [None] the error + [`Unknown_entity_ref] is returned. + + {3:iseq Sequences of documents (deprecated)} + + {b WARNING.} This feature is deprecated and will be removed. + + When a well-formed sequence of signals is input, no data is consumed beyond + the closing ['>'] of the document's root element. + + If you want to parse a document as + {{:http://www.w3.org/TR/REC-xml/#NT-document}defined} in the XML + specification, call {!eoi} after a well-formed sequence of + signals, it must return [true]. If you expect another document on + the same input abstraction a new well-formed sequence of signals + can be {!input}. Use {!eoi} to check if a document follows (this + may consume data). + + Invoking {!eoi} after a well-formed sequence of signals skips + whitespaces, comments and processing instructions until it gets to + either an {{:http://www.w3.org/TR/REC-xml/#NT-XMLDecl} XML + declaration} or a {{:http://www.w3.org/TR/REC-xml/#dt-doctype}DTD} + or the start of a new element or the end of input (in which case + {!eoi} returns [true]). If there is a new document but there is no + XML declaration or the declaration specifies UTF-16, the same + encoding as for the previous document is used. + + {3:inmisc Miscellaneous} + {ul + {- Parses the more liberal and simpler XML 1.1 + {{:http://www.w3.org/TR/xml11/#NT-Name}Name} definition (minus [':'] because + of namespaces).} + {- The {{:http://www.w3.org/TR/REC-xml/#dt-doctype}DTD} is parsed + roughly (no guarantee it is well formed) and its information is ignored.} + {- The parser drops + {{:http://www.w3.org/TR/REC-xml/#dt-comment}comments}, + {{:http://www.w3.org/TR/REC-xml/#dt-pi}processing instructions}, and + {{:http://www.w3.org/TR/REC-xml/#sec-rmd}standalone declaration}.} + {- Element attributes are not checked for uniqueness.} + {- Attribute and character data chunks are limited by + [Sys.max_string_length]. + The error [`Max_buffer_size] is raised if the limit is hit.} + {- Tail recursive.} + {- Non validating.} + } + + {2:output Output} + {3:outenc Encoding} + + Outputs only {{:http://www.faqs.org/rfcs/rfc3629.html} UTF-8} + encoded documents. Strings given to + output functions {b must be} UTF-8 encoded, no checks are + performed. Unicode characters that are not legal XML + {{:http://www.w3.org/TR/REC-xml/#NT-Char}characters} are replaced + by the {{:http://unicode.org/glossary/#replacement_character}Unicode + replacement character}. + + {3:outns Namespaces} + + Xmlm's {{:#TYPEname}names} are + {{:http://www.w3.org/TR/xml-names11/#dt-expname}expanded names}. + Expanded names are automatically converted to + {{:http://www.w3.org/TR/xml-names11/#dt-qualname}qualified + names} by the output abstraction. There is no particular api to specify + prefixes and default namespaces, + the actual result depends solely on the output + of attributes belonging to the {!ns_xmlns} namespace. For example to set + the default namespace of an element to {i http://example.org/myns}, + use the following attribute : + {[(* xmlns='http://example.org/myns' *) +let default_ns = (Xmlm.ns_xmlns, "xmlns"), "http://example.org/myns"]} + To bind the prefix ["ex"] to {i http://example.org/ex}, use the + following attribute : + {[(* xmlns:ex='http://example.org/ex' *) +let ex_ns = (Xmlm.ns_xmlns, "ex"), "http://example.org/ex"]} + Note that outputing input signals without + touching namespace declaration attributes will preserve existing + prefixes and bindings provided the same namespace name is not + bound to different prefixes in a given context. + + The callback [ns_prefix] of an output abstraction can be used to + give a prefix to a namespace name lacking a prefix binding in the + current output scope. Given a namespace name the function must return + the prefix to use. Note that this + will {b not} add any namespace declaration attribute to the + output. If the function returns [None], {!output} will raise + [Invalid_argument]. The default function returns always [None]. + {3:outindent Indentation} + + Output can be indented by specifying the [indent] argument when an + output abstraction is created. If [indent] is [None] (default) + signal output does not introduce any extra white space. If + [ident] is [Some c], each {!signal} is output on its own line + (for empty elements [`El_start] and [`El_end] are collapsed on a single + line) and nested elements are indented with [c] space + characters. + + {3:oseq Sequences of documents (deprecated)} + + {b WARNING.} This feature is deprecated and will be removed. + + After a well-formed sequence of signals was output, the output + abstraction can be reused to output a new well-formed sequence of + signals. + + {3:outmisc Miscellaneous} + {ul + {- Output on a channel does not flush it.} + {- In attribute and character data you provide, markup + delimiters ['<'],['>'],['&'], and ['\"'] are + automatically escaped to + {{:http://www.w3.org/TR/REC-xml/#sec-predefined-ent}predefined + entities}.} + {- No checks are peformed on the prefix and local part of output + names to verify they are + {{:http://www.w3.org/TR/xml-names11/#NT-NCName}NCName}s. + For example using the tag name [("","dip d")] will produce + a non well-formed document because of the space character.} + {- Tail recursive.}} + + {2 Tips} + {ul + {- The best options to do an input/output round trip + and preserve as much information as possible is to + input with [strip = false] and output with [indent = None].} + {- Complete whitespace control on output is achieved + with [indent = None] and suitable [`Data] signals}} + + {1:ex Examples} + + {2:exseq Sequential processing} + + Sequential processing has the advantage that you don't need to get + the whole document tree in memory to process it. + + The following function reads a {e single} document on an + input channel and outputs it. +{[let id ic oc = + let i = Xmlm.make_input (`Channel ic) in + let o = Xmlm.make_output (`Channel oc) in + let rec pull i o depth = + Xmlm.output o (Xmlm.peek i); + match Xmlm.input i with + | `El_start _ -> pull i o (depth + 1) + | `El_end -> if depth = 1 then () else pull i o (depth - 1) + | `Data _ -> pull i o depth + | `Dtd _ -> assert false + in + Xmlm.output o (Xmlm.input i); (* `Dtd *) + pull i o 0; + if not (Xmlm.eoi i) then invalid_arg "document not well-formed"]} + + The following function reads a {e sequence} of documents on an + input channel and outputs it. +{[let id_seq ic oc = + let i = Xmlm.make_input (`Channel ic) in + let o = Xmlm.make_output ~nl:true (`Channel oc) in + while not (Xmlm.eoi i) do Xmlm.output o (Xmlm.input i) done]} + The following function reads a {e sequence} of documents on the + input channel. In each document's tree it prunes non root elements + whose name belongs to [prune_list]. +{[let prune_docs prune_list ic oc = + let i = Xmlm.make_input (`Channel ic) in + let o = Xmlm.make_output ~nl:true (`Channel oc) in + let copy i o = Xmlm.output o (Xmlm.input i) in + let prune (name, _) = List.mem name prune_list in + let rec process i o d = + let rec skip i d = match Xmlm.input i with + | `El_start _ -> skip i (d + 1) + | `El_end -> if d = 1 then () else skip i (d - 1) + | s -> skip i d + in + match Xmlm.peek i with + | `El_start tag when prune tag -> skip i 0; process i o d + | `El_start _ -> copy i o; process i o (d + 1) + | `El_end -> copy i o; if d = 0 then () else process i o (d - 1) + | `Data _ -> copy i o; process i o d + | `Dtd _ -> assert false + in + let rec docs i o = + copy i o; (* `Dtd *) + copy i o; (* root start *) + process i o 0; + if Xmlm.eoi i then () else docs i o + in + docs i o]} + + {2:extree Tree processing} + + A document's sequence of signals can be easily converted + to an arborescent data structure. Assume your trees are defined by : + {[type tree = E of Xmlm.tag * tree list | D of string]} + The following functions input/output xml documents from/to abstractions + as value of type [tree]. +{[let in_tree i = + let el tag childs = E (tag, childs) in + let data d = D d in + Xmlm.input_doc_tree ~el ~data i + +let out_tree o t = + let frag = function + | E (tag, childs) -> `El (tag, childs) + | D d -> `Data d + in + Xmlm.output_doc_tree frag o t]} + + {2:exrow Tabular data processing} + + We show how to process XML data that represents tabular data (some + people like do that). + + The file we need to deal with represents nominal data about + {{:http://www.w3.org/}W3C bureaucrats}. There are no namespaces + and attributes are ignored. The element structure of the document + is : + {ul {- + {ul {- represents a W3C bureaucrat + (zero or more). + + A bureaucrat contains the following elements, in order. + {ul {- its name (mandatory, string).} + {- its surname (mandatory, string).} + {- present iff he implemented one of its spec + (optional, empty).} + {- its grade on the + open scale of obfuscation (mandatory, float).} + {- (zero or more, string), technical reports he + worked on.}}}}}} + + In OCaml we represent a W3C bureaucrat by this type : +{[type w3c_bureaucrat = { + name : string; + surname : string; + honest : bool; + obfuscation_level : float; + trs : string list; }]} + The following functions input and output W3C bureaucrats as lists + of values of type [w3c_bureaucrat]. +{[let in_w3c_bureaucrats src = + let i = Xmlm.make_input ~strip:true src in + let tag n = ("", n), [] in + let error () = invalid_arg "parse error" in + let accept s i = if Xmlm.input i = s then () else error () in + let rec i_seq el acc i = match Xmlm.peek i with + | `El_start _ -> i_seq el ((el i) :: acc) i + | `El_end -> List.rev acc + | _ -> error () + in + let i_el n i = + accept (`El_start (tag n)) i; + let d = match Xmlm.peek i with + | `Data d -> ignore (Xmlm.input i); d + | `El_end -> "" + | _ -> error () + in + accept (`El_end) i; + d + in + let i_bureaucrat i = + try + accept (`El_start (tag "bureaucrat")) i; + let name = i_el "name" i in + let surname = i_el "surname" i in + let honest = match Xmlm.peek i with + | `El_start (("", "honest"), []) -> ignore (i_el "honest" i); true + | _ -> false + in + let obf = float_of_string (i_el "obfuscation_level" i) in + let trs = i_seq (i_el "tr") [] i in + accept (`El_end) i; + { name = name; surname = surname; honest = honest; + obfuscation_level = obf; trs = trs } + with + | Failure _ -> error () (* float_of_string *) + in + accept (`Dtd None) i; + accept (`El_start (tag "list")) i; + let bl = i_seq i_bureaucrat [] i in + accept (`El_end) i; + if not (Xmlm.eoi i) then invalid_arg "more than one document"; + bl + +let out_w3c_bureaucrats dst bl = + let tag n = ("", n), [] in + let o = Xmlm.make_output ~nl:true ~indent:(Some 2) dst in + let out = Xmlm.output o in + let o_el n d = + out (`El_start (tag n)); + if d <> "" then out (`Data d); + out `El_end + in + let o_bureaucrat b = + out (`El_start (tag "bureaucrat")); + o_el "name" b.name; + o_el "surname" b.surname; + if b.honest then o_el "honest" ""; + o_el "obfuscation_level" (string_of_float b.obfuscation_level); + List.iter (o_el "tr") b.trs; + out `El_end + in + out (`Dtd None); + out (`El_start (tag "list")); + List.iter o_bureaucrat bl; + out (`El_end)]} +*) + +(*--------------------------------------------------------------------------- + Copyright 2007 Daniel C. Bünzli + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. + + 3. Neither the name of Daniel C. Bünzli nor the names of + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ---------------------------------------------------------------------------*)