This commit is contained in:
John Whitington 2016-11-13 18:04:13 +00:00
parent 2caa9de061
commit 2ec3f672c0
2 changed files with 552 additions and 596 deletions

723
xmlm.ml Normal file → Executable file

File diff suppressed because it is too large Load Diff

425
xmlm.mli Normal file → Executable file
View File

@ -4,54 +4,54 @@
%%NAME%% release %%VERSION%% %%NAME%% release %%VERSION%%
---------------------------------------------------------------------------*) ---------------------------------------------------------------------------*)
(** Streaming XML codec. (** Streaming XML codec.
A well-formed sequence of {{!signal}signals} represents an A well-formed sequence of {{!signal}signals} represents an
{{:http://www.w3.org/TR/REC-xml}XML} document tree traversal in {{:http://www.w3.org/TR/REC-xml}XML} document tree traversal in
depth first order (this has nothing to do with XML depth first order (this has nothing to do with XML
well-formedness). Input pulls a well-formed sequence of signals well-formedness). Input pulls a well-formed sequence of signals
from a data source and output pushes a well-formed sequence of from a data source and output pushes a well-formed sequence of
signals to a data destination. Functions are provided to easily signals to a data destination. Functions are provided to easily
transform sequences of signals to/from arborescent data structures. transform sequences of signals to/from arborescent data structures.
Consult the {{!io}features and limitations} and {{!ex}examples} Consult the {{!io}features and limitations} and {{!ex}examples}
of use. of use.
{e Release %%VERSION%% %%MAINTAINER%% } {e Release %%VERSION%% %%MAINTAINER%% }
{3 References} {3 References}
{ul {ul
{- Tim Bray. {- Tim Bray.
{e {{:http://www.xml.com/axml/axml.html}The annotated XML Specification}}, {e {{:http://www.xml.com/axml/axml.html}The annotated XML Specification}},
1998.} 1998.}
{- Tim Bray et al. {- Tim Bray et al.
{e {{:http://www.w3.org/TR/xml-names11}Namespaces in XML 1.1 (2nd ed.)}}, {e {{:http://www.w3.org/TR/xml-names11}Namespaces in XML 1.1 (2nd ed.)}},
2006.}} *) 2006.}} *)
(** {1 Basic types and values} *) (** {1 Basic types and values} *)
(** The type for character encodings. For [`UTF_16], endianness is (** The type for character encodings. For [`UTF_16], endianness is
determined from the determined from the
{{:http://www.unicode.org/unicode/faq/utf_bom.html#BOM}BOM}. *) {{:http://www.unicode.org/unicode/faq/utf_bom.html#BOM}BOM}. *)
type encoding = [ type encoding = [
| `UTF_8 | `UTF_8
| `UTF_16 | `UTF_16
(** Endianness determined from the (** Endianness determined from the
{{:http://www.unicode.org/unicode/faq/utf_bom.html#BOM}BOM}. *) {{:http://www.unicode.org/unicode/faq/utf_bom.html#BOM}BOM}. *)
| `UTF_16BE | `UTF_16BE
| `UTF_16LE | `UTF_16LE
| `ISO_8859_1 | `ISO_8859_1
| `US_ASCII ] | `US_ASCII ]
type dtd = string option type dtd = string option
(** The type for the optional (** The type for the optional
{{:http://www.w3.org/TR/REC-xml/#dt-doctype}DTD}. *) {{:http://www.w3.org/TR/REC-xml/#dt-doctype}DTD}. *)
type name = string * string type name = string * string
(** The type for attribute and element's (** The type for attribute and element's
{{:http://www.w3.org/TR/xml-names11/#dt-expname}expanded names} {{:http://www.w3.org/TR/xml-names11/#dt-expname}expanded names}
[(uri,local)]. An empty [uri] represents a name without a [(uri,local)]. An empty [uri] represents a name without a
namespace name, i.e. an unprefixed name namespace name, i.e. an unprefixed name
that is not under the scope of a default namespace. *) that is not under the scope of a default namespace. *)
type attribute = name * string type attribute = name * string
@ -69,61 +69,42 @@ child ::= `Data | tree | epsilon ]}
Input and output deal only with well-formed sequences or Input and output deal only with well-formed sequences or
exceptions are raised. exceptions are raised.
*) *)
val ns_xml : string val ns_xml : string
(** Namespace name {{:http://www.w3.org/XML/1998/namespace}value} bound to the (** Namespace name {{:http://www.w3.org/XML/1998/namespace}value} bound to the
reserved ["xml"] prefix. *) reserved ["xml"] prefix. *)
val ns_xmlns : string val ns_xmlns : string
(** Namespace name {{:http://www.w3.org/2000/xmlns/}value} bound to the (** Namespace name {{:http://www.w3.org/2000/xmlns/}value} bound to the
reserved ["xmlns"] prefix. *) reserved ["xmlns"] prefix. *)
val pp_dtd : Format.formatter -> dtd -> unit
(** [pp_dtd ppf dtd] prints an unspecified representation of [dtd] on [ppf]. *)
val pp_name : Format.formatter -> name -> unit
(** [pp_name ppf name] prints an unspecified representation of [name] on
[ppf]. *)
val pp_attribute : Format.formatter -> attribute -> unit
(** [pp_attribute ppf att] prints an unspecified representation of [att] on
[ppf]. *)
val pp_tag : Format.formatter -> tag -> unit
(** [pp_tag ppf tag] prints an unspecified representation of [tag] on
[ppf]. *)
val pp_signal : Format.formatter -> signal -> unit
(** [pp_signal ppf s] prints an unspecified representation of [s] on
[ppf]. *)
(** {1 Input} *) (** {1 Input} *)
type pos = int * int type pos = int * int
(** The type for input positions. Line and column number, both start (** The type for input positions. Line and column number, both start
with 1. *) with 1. *)
(** The type for input errors. *) (** The type for input errors. *)
type error = [ type error = [
| `Max_buffer_size | `Max_buffer_size
(** Maximal buffer size exceeded ([Sys.max_string_length]). *) (** Maximal buffer size exceeded ([Sys.max_string_length]). *)
| `Unexpected_eoi | `Unexpected_eoi
(** Unexpected end of input. *) (** Unexpected end of input. *)
| `Malformed_char_stream | `Malformed_char_stream
(** Malformed underlying character stream. *) (** Malformed underlying character stream. *)
| `Unknown_encoding of string | `Unknown_encoding of string
(** Unknown encoding. *) (** Unknown encoding. *)
| `Unknown_entity_ref of string | `Unknown_entity_ref of string
(** Unknown entity reference, {{!inentity} details}. *) (** Unknown entity reference, {{!inentity} details}. *)
| `Unknown_ns_prefix of string | `Unknown_ns_prefix of string
(** Unknown namespace prefix {{!inns} details} *) (** Unknown namespace prefix {{!inns} details} *)
| `Illegal_char_ref of string | `Illegal_char_ref of string
(** Illegal character reference. *) (** Illegal character reference. *)
| `Illegal_char_seq of string | `Illegal_char_seq of string
(** Illegal character sequence. *) (** Illegal character sequence. *)
| `Expected_char_seqs of string list * string | `Expected_char_seqs of string list * string
(** Expected one of the character sequences in the list but found another. *) (** Expected one of the character sequences in the list but found another. *)
| `Expected_root_element | `Expected_root_element
(** Expected the document's root element. *) ] (** Expected the document's root element. *) ]
val error_message : error -> string val error_message : error -> string
@ -132,7 +113,7 @@ val error_message : error -> string
exception Error of pos * error exception Error of pos * error
(** Raised on input errors. *) (** Raised on input errors. *)
type source = [ type source = [
| `Channel of in_channel | `String of int * string | `Fun of (unit -> int) ] | `Channel of in_channel | `String of int * string | `Fun of (unit -> int) ]
(** The type for input sources. For [`String] starts reading at the (** The type for input sources. For [`String] starts reading at the
given integer position. For [`Fun] the function must return the given integer position. For [`Fun] the function must return the
@ -142,70 +123,70 @@ type source = [
type input type input
(** The type for input abstractions. *) (** The type for input abstractions. *)
val make_input : ?enc:encoding option -> ?strip:bool -> val make_input : ?enc:encoding option -> ?strip:bool ->
?ns:(string -> string option) -> ?ns:(string -> string option) ->
?entity: (string -> string option) -> source -> input ?entity: (string -> string option) -> source -> input
(** Returns a new input abstraction reading from the given source. (** Returns a new input abstraction reading from the given source.
{ul {ul
{- [enc], character encoding of the document, {{!inenc} details}. {- [enc], character encoding of the document, {{!inenc} details}.
Defaults to [None].} Defaults to [None].}
{- [strip], strips whitespace in character data, {{!inwspace} details}. {- [strip], strips whitespace in character data, {{!inwspace} details}.
Defaults to [false].} Defaults to [false].}
{- [ns] is called to bind undeclared namespace prefixes, {- [ns] is called to bind undeclared namespace prefixes,
{{!inns} details}. Default returns always [None].} {{!inns} details}. Default returns always [None].}
{- [entity] is called to resolve non predefined entity references, {- [entity] is called to resolve non predefined entity references,
{{!inentity} details}. Default returns always [None].}} *) {{!inentity} details}. Default returns always [None].}} *)
val input : input -> signal val input : input -> signal
(** Inputs a signal. Repeated invocation of the function with the same (** Inputs a signal. Repeated invocation of the function with the same
input abstraction will generate a {{!signal}well-formed} sequence input abstraction will generate a {{!signal}well-formed} sequence
of signals or an {!Error} is raised. Furthermore there will be no of signals or an {!Error} is raised. Furthermore there will be no
two consecutive [`Data] signals in the sequence and their string two consecutive [`Data] signals in the sequence and their string
is always non empty. is always non empty.
{b Deprecated} After a well-formed sequence was input another may {b Deprecated} After a well-formed sequence was input another may
be input, see {!eoi} and {{!iseq}details}. be input, see {!eoi} and {{!iseq}details}.
{b Raises} {!Error} on input errors. *) {b Raises} {!Error} on input errors. *)
val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) -> val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) ->
input -> 'a input -> 'a
(** If the next signal is a : (** If the next signal is a :
{ul {ul
{- [`Data] signal, inputs it and invokes [data] with the character data.} {- [`Data] signal, inputs it and invokes [data] with the character data.}
{- [`El_start] signal, inputs the sequence of signals until its {- [`El_start] signal, inputs the sequence of signals until its
matching [`El_end] and invokes [el] and [data] as follows matching [`El_end] and invokes [el] and [data] as follows
{ul {ul
{- [el], is called on each [`El_end] signals with the corresponding {- [el], is called on each [`El_end] signals with the corresponding
[`El_start] tag and the result of the callback invocation for the [`El_start] tag and the result of the callback invocation for the
element's children.} element's children.}
{- [data], is called on each [`Data] signals with the character data. {- [data], is called on each [`Data] signals with the character data.
This function won't be called twice consecutively or with the empty This function won't be called twice consecutively or with the empty
string.}}} string.}}}
{- Other signals, raises [Invalid_argument].}} {- Other signals, raises [Invalid_argument].}}
{b Raises} {!Error} on input errors and [Invalid_argument] {b Raises} {!Error} on input errors and [Invalid_argument]
if the next signal is not [`El_start] or [`Data]. *) if the next signal is not [`El_start] or [`Data]. *)
val input_doc_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) -> val input_doc_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) ->
input -> (dtd * 'a) input -> (dtd * 'a)
(** Same as {!input_tree} but reads a complete {{!signal}well-formed} (** Same as {!input_tree} but reads a complete {{!signal}well-formed}
sequence of signals. sequence of signals.
{b Raises} {!Error} on input errors and [Invalid_argument] {b Raises} {!Error} on input errors and [Invalid_argument]
if the next signal is not [`Dtd]. *) if the next signal is not [`Dtd]. *)
val peek : input -> signal val peek : input -> signal
(** Same as {!input} but doesn't remove the signal from the sequence. (** Same as {!input} but doesn't remove the signal from the sequence.
{b Raises} {!Error} on input errors. *) {b Raises} {!Error} on input errors. *)
val eoi : input -> bool val eoi : input -> bool
(** Returns [true] if the end of input is reached. See {{!iseq}details}. (** Returns [true] if the end of input is reached. See {{!iseq}details}.
{b Raises} {!Error} on input errors. *) {b Raises} {!Error} on input errors. *)
val pos : input -> pos val pos : input -> pos
(** Current position in the input abstraction. *) (** Current position in the input abstraction. *)
(** {1 Output} *) (** {1 Output} *)
@ -213,31 +194,31 @@ val pos : input -> pos
type 'a frag = [ `El of tag * 'a list | `Data of string ] type 'a frag = [ `El of tag * 'a list | `Data of string ]
(** The type for deconstructing data structures of type ['a]. *) (** The type for deconstructing data structures of type ['a]. *)
type dest = [ `Channel of out_channel | `Buffer of Buffer.t type dest = [ `Channel of out_channel | `Buffer of Buffer.t
| `Fun of (int -> unit) ] | `Fun of (int -> unit) ]
(** The type for output destinations. For [`Buffer], the buffer won't (** The type for output destinations. For [`Buffer], the buffer won't
be cleared. For [`Fun] the function is called with the output {e be cleared. For [`Fun] the function is called with the output {e
bytes} as [int]s. *) bytes} as [int]s. *)
type output type output
(** The type for output abstractions. *) (** The type for output abstractions. *)
val make_output : ?decl:bool -> ?nl:bool -> ?indent:int option -> val make_output : ?decl:bool -> ?nl:bool -> ?indent:int option ->
?ns_prefix:(string -> string option) -> dest -> output ?ns_prefix:(string -> string option) -> dest -> output
(** Returns a new output abstraction writing to the given destination. (** Returns a new output abstraction writing to the given destination.
{ul {ul
{- [decl], if [true] the {{:http://www.w3.org/TR/REC-xml/#NT-XMLDecl} XML {- [decl], if [true] the {{:http://www.w3.org/TR/REC-xml/#NT-XMLDecl} XML
declaration} is output (defaults to [true]).} declaration} is output (defaults to [true]).}
{- [nl], if [true] a newline is output when the root's element [`El_end] {- [nl], if [true] a newline is output when the root's element [`El_end]
signal is output. signal is output.
Defaults to [false].} Defaults to [false].}
{- [indent], identation behaviour, see {{!outindent} details}. Defaults to {- [indent], identation behaviour, see {{!outindent} details}. Defaults to
[None].} [None].}
{- [ns_prefix], undeclared namespace prefix bindings, {- [ns_prefix], undeclared namespace prefix bindings,
see {{!outns}details}. Default returns always [None].}} *) see {{!outns}details}. Default returns always [None].}} *)
val output : output -> signal -> unit val output : output -> signal -> unit
(** Outputs a signal. (** Outputs a signal.
{b Deprecated.} After a well-formed sequence of signals was output {b Deprecated.} After a well-formed sequence of signals was output
a new well-formed sequence can be output. a new well-formed sequence can be output.
@ -249,22 +230,22 @@ val output : output -> signal -> unit
val output_depth : output -> int val output_depth : output -> int
(** [output_depth o] is [o]'s current element nesting level (undefined (** [output_depth o] is [o]'s current element nesting level (undefined
before the first [`El_start] and after the last [`El_end]). *) before the first [`El_start] and after the last [`El_end]). *)
val output_tree : ('a -> 'a frag) -> output -> 'a -> unit val output_tree : ('a -> 'a frag) -> output -> 'a -> unit
(** Outputs signals corresponding to a value by recursively (** Outputs signals corresponding to a value by recursively
applying the given value deconstructor. applying the given value deconstructor.
{b Raises} see {!output}. *) {b Raises} see {!output}. *)
val output_doc_tree : ('a -> 'a frag) -> output -> (dtd * 'a) -> unit val output_doc_tree : ('a -> 'a frag) -> output -> (dtd * 'a) -> unit
(** Same as {!output_tree} but outputs a complete {{!signal}well-formed} (** Same as {!output_tree} but outputs a complete {{!signal}well-formed}
sequence of signals. sequence of signals.
{b Raises} see {!output}. *) {b Raises} see {!output}. *)
(** {1:sto Functorial interface (deprecated)} (** {1:sto Functorial interface (deprecated)}
{b WARNING.} The functioral interface is deprecated and will be {b WARNING.} The functioral interface is deprecated and will be
removed. removed.
{!Make} allows client to specify types for strings and internal {!Make} allows client to specify types for strings and internal
@ -277,16 +258,16 @@ type std_buffer = Buffer.t
(** Input signature for strings. *) (** Input signature for strings. *)
module type String = sig module type String = sig
type t type t
(** The type for strings. *) (** The type for strings. *)
val empty : t val empty : t
(** The empty string. *) (** The empty string. *)
val length : t -> int val length : t -> int
(** Returns the length of the string. *) (** Returns the length of the string. *)
val append : t -> t -> t val append : t -> t -> t
(** Concatenates two strings. *) (** Concatenates two strings. *)
@ -296,7 +277,7 @@ module type String = sig
{{:http://www.unicode.org/glossary/#code_point}code point}). *) {{:http://www.unicode.org/glossary/#code_point}code point}). *)
val iter : (int -> unit) -> t -> unit val iter : (int -> unit) -> t -> unit
(** Iterates over the unicode (** Iterates over the unicode
{{:http://www.unicode.org/glossary/#code_point}code point} {{:http://www.unicode.org/glossary/#code_point}code point}
of the given string. *) of the given string. *)
@ -316,129 +297,129 @@ module type Buffer = sig
type string type string
(** The type for strings. *) (** The type for strings. *)
type t type t
(** The type for buffers. *) (** The type for buffers. *)
exception Full exception Full
(** Raised if the buffer cannot be grown. *) (** Raised if the buffer cannot be grown. *)
val create : int -> t val create : int -> t
(** Creates a buffer of the given size. *) (** Creates a buffer of the given size. *)
val add_uchar : t -> int -> unit val add_uchar : t -> int -> unit
(** Adds the given (guaranteed valid) unicode (** Adds the given (guaranteed valid) unicode
{{:http://www.unicode.org/glossary/#code_point}code point} to a {{:http://www.unicode.org/glossary/#code_point}code point} to a
buffer. buffer.
{b Raises} {!Full} if the buffer cannot be grown. *) {b Raises} {!Full} if the buffer cannot be grown. *)
val clear : t -> unit val clear : t -> unit
(** Clears the buffer. *) (** Clears the buffer. *)
val contents : t -> string val contents : t -> string
(** Returns the buffer contents. *) (** Returns the buffer contents. *)
val length : t -> int val length : t -> int
(** Returns the number of characters contained in the buffer. *) (** Returns the number of characters contained in the buffer. *)
end end
(** Output signature of {!Make}. *) (** Output signature of {!Make}. *)
module type S = sig module type S = sig
(** {1 Basic types and values} *)
(** {1 Basic types and values} *) type string
type string type encoding = [
type encoding = [
| `UTF_8 | `UTF_16 | `UTF_16BE | `UTF_16LE | `ISO_8859_1| `US_ASCII ] | `UTF_8 | `UTF_16 | `UTF_16BE | `UTF_16LE | `ISO_8859_1| `US_ASCII ]
type dtd = string option type dtd = string option
type name = string * string type name = string * string
type attribute = name * string type attribute = name * string
type tag = name * attribute list type tag = name * attribute list
type signal = [ `Dtd of dtd | `El_start of tag | `El_end | `Data of string ] type signal = [ `Dtd of dtd | `El_start of tag | `El_end | `Data of string ]
val ns_xml : string val ns_xml : string
val ns_xmlns : string val ns_xmlns : string
(** {1 Input} *)
(** {1 Input} *) type pos = int * int
type pos = int * int
type error = [ type error = [
| `Max_buffer_size | `Max_buffer_size
| `Unexpected_eoi | `Unexpected_eoi
| `Malformed_char_stream | `Malformed_char_stream
| `Unknown_encoding of string | `Unknown_encoding of string
| `Unknown_entity_ref of string | `Unknown_entity_ref of string
| `Unknown_ns_prefix of string | `Unknown_ns_prefix of string
| `Illegal_char_ref of string | `Illegal_char_ref of string
| `Illegal_char_seq of string | `Illegal_char_seq of string
| `Expected_char_seqs of string list * string | `Expected_char_seqs of string list * string
| `Expected_root_element ] | `Expected_root_element ]
exception Error of pos * error exception Error of pos * error
val error_message : error -> string val error_message : error -> string
type source = [ type source = [
| `Channel of in_channel | `Channel of in_channel
| `String of int * std_string | `String of int * std_string
| `Fun of (unit -> int) ] | `Fun of (unit -> int) ]
type input type input
val make_input : ?enc:encoding option -> ?strip:bool -> val make_input : ?enc:encoding option -> ?strip:bool ->
?ns:(string -> string option) -> ?ns:(string -> string option) ->
?entity: (string -> string option) -> source -> input ?entity: (string -> string option) -> source -> input
val input : input -> signal val input : input -> signal
val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) -> val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) ->
input -> 'a input -> 'a
val input_doc_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) -> val input_doc_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) ->
input -> (dtd * 'a) input -> (dtd * 'a)
val peek : input -> signal val peek : input -> signal
val eoi : input -> bool val eoi : input -> bool
val pos : input -> pos val pos : input -> pos
(** {1 Output} *) (** {1 Output} *)
type 'a frag = [ `El of tag * 'a list | `Data of string ] type 'a frag = [ `El of tag * 'a list | `Data of string ]
type dest = [ type dest = [
| `Channel of out_channel | `Buffer of std_buffer | `Fun of (int -> unit) ] | `Channel of out_channel | `Buffer of std_buffer | `Fun of (int -> unit) ]
type output type output
val make_output : ?decl:bool -> ?nl:bool -> ?indent:int option -> val make_output : ?decl:bool -> ?nl:bool -> ?indent:int option ->
?ns_prefix:(string -> string option) -> dest -> output ?ns_prefix:(string -> string option) -> dest -> output
val output_depth : output -> int val output_depth : output -> int
val output : output -> signal -> unit val output : output -> signal -> unit
val output_tree : ('a -> 'a frag) -> output -> 'a -> unit val output_tree : ('a -> 'a frag) -> output -> 'a -> unit
val output_doc_tree : ('a -> 'a frag) -> output -> (dtd * 'a) -> unit val output_doc_tree : ('a -> 'a frag) -> output -> (dtd * 'a) -> unit
end end
(** Functor building streaming XML IO with the given strings and buffers. *) (** Functor building streaming XML IO with the given strings and buffers. *)
module Make (String : String) (Buffer : Buffer with type string = String.t) : S module Make (String : String) (Buffer : Buffer with type string = String.t) : S
with type string = String.t with type string = String.t
(** {1:io Features and limitations} (** {1:io Features and limitations}
The module assumes strings are immutable, thus strings The module assumes strings are immutable, thus strings
the client gives or receives {e during} the input and output process the client gives or receives {e during} the input and output process
must not be modified. must not be modified.
{2:input Input} {2:input Input}
{3:inenc Encoding} {3:inenc Encoding}
The parser supports ASCII, US-ASCII, The parser supports ASCII, US-ASCII,
{{:http://www.faqs.org/rfcs/rfc3629.html} UTF-8}, {{:http://www.faqs.org/rfcs/rfc3629.html} UTF-8},
{{:http://www.faqs.org/rfcs/rfc2781.html} UTF-16}, {{:http://www.faqs.org/rfcs/rfc2781.html} UTF-16},
{{:http://www.faqs.org/rfcs/rfc2781.html} UTF-16LE}, {{:http://www.faqs.org/rfcs/rfc2781.html} UTF-16LE},
{{:http://www.faqs.org/rfcs/rfc2781.html} UTF-16BE} and {{:http://www.faqs.org/rfcs/rfc2781.html} UTF-16BE} and
{{:http://anubis.dkuug.dk/JTC1/SC2/WG3/docs/n411.pdf}ISO-8559-1} {{:http://anubis.dkuug.dk/JTC1/SC2/WG3/docs/n411.pdf}ISO-8559-1}
(Latin-1) encoded documents. But strings returned by (Latin-1) encoded documents. But strings returned by
the library are {b always} UTF-8 encoded. the library are {b always} UTF-8 encoded.
The encoding can be specified explicitly using the optional The encoding can be specified explicitly using the optional
argument [enc]. Otherwise the parser uses UTF-16 or UTF-8 if there is a argument [enc]. Otherwise the parser uses UTF-16 or UTF-8 if there is a
{{:http://www.unicode.org/unicode/faq/utf_bom.html#BOM}BOM} at the {{:http://www.unicode.org/unicode/faq/utf_bom.html#BOM}BOM} at the
@ -450,8 +431,8 @@ module Make (String : String) (Buffer : Buffer with type string = String.t) : S
The parser performs The parser performs
{{:http://www.w3.org/TR/REC-xml/#AVNormalize}attribute data {{:http://www.w3.org/TR/REC-xml/#AVNormalize}attribute data
normalization} on {e every} attribute data. This means that normalization} on {e every} attribute data. This means that
attribute data does not have leading and trailling white space and that attribute data does not have leading and trailling white space and that
any white space is collapsed and transformed to a single space any white space is collapsed and transformed to a single space
character ([U+0020]). character ([U+0020]).
White space handling of character data depends on the [strip] White space handling of character data depends on the [strip]
@ -480,9 +461,9 @@ module Make (String : String) (Buffer : Buffer with type string = String.t) : S
[(Xmlm.ns_xmlns, "ex")]. [(Xmlm.ns_xmlns, "ex")].
Regarding constraints on the usage of the {i xml} and {i xmlns} Regarding constraints on the usage of the {i xml} and {i xmlns}
prefixes by documents, the parser does not report errors on violations prefixes by documents, the parser does not report errors on violations
of the {i must} constraints listed in of the {i must} constraints listed in
{{:http://www.w3.org/TR/xml-names11/#xmlReserved}this paragraph}. {{:http://www.w3.org/TR/xml-names11/#xmlReserved}this paragraph}.
{3:inentity Character and entity references} {3:inentity Character and entity references}
@ -494,14 +475,14 @@ module Make (String : String) (Buffer : Buffer with type string = String.t) : S
replacement character data. The replacement data is {e not} replacement character data. The replacement data is {e not}
analysed for further references, it is added to the data as such analysed for further references, it is added to the data as such
modulo white space stripping. If [entity] returns [None] the error modulo white space stripping. If [entity] returns [None] the error
[`Unknown_entity_ref] is returned. [`Unknown_entity_ref] is returned.
{3:iseq Sequences of documents (deprecated)} {3:iseq Sequences of documents (deprecated)}
{b WARNING.} This feature is deprecated and will be removed. {b WARNING.} This feature is deprecated and will be removed.
When a well-formed sequence of signals is input, no data is consumed beyond When a well-formed sequence of signals is input, no data is consumed beyond
the closing ['>'] of the document's root element. the closing ['>'] of the document's root element.
If you want to parse a document as If you want to parse a document as
{{:http://www.w3.org/TR/REC-xml/#NT-document}defined} in the XML {{:http://www.w3.org/TR/REC-xml/#NT-document}defined} in the XML
@ -522,32 +503,32 @@ module Make (String : String) (Buffer : Buffer with type string = String.t) : S
{3:inmisc Miscellaneous} {3:inmisc Miscellaneous}
{ul {ul
{- Parses the more liberal and simpler XML 1.1 {- Parses the more liberal and simpler XML 1.1
{{:http://www.w3.org/TR/xml11/#NT-Name}Name} definition (minus [':'] because {{:http://www.w3.org/TR/xml11/#NT-Name}Name} definition (minus [':'] because
of namespaces).} of namespaces).}
{- The {{:http://www.w3.org/TR/REC-xml/#dt-doctype}DTD} is parsed {- The {{:http://www.w3.org/TR/REC-xml/#dt-doctype}DTD} is parsed
roughly (no guarantee it is well formed) and its information is ignored.} roughly (no guarantee it is well formed) and its information is ignored.}
{- The parser drops {- The parser drops
{{:http://www.w3.org/TR/REC-xml/#dt-comment}comments}, {{:http://www.w3.org/TR/REC-xml/#dt-comment}comments},
{{:http://www.w3.org/TR/REC-xml/#dt-pi}processing instructions}, and {{:http://www.w3.org/TR/REC-xml/#dt-pi}processing instructions}, and
{{:http://www.w3.org/TR/REC-xml/#sec-rmd}standalone declaration}.} {{:http://www.w3.org/TR/REC-xml/#sec-rmd}standalone declaration}.}
{- Element attributes are not checked for uniqueness.} {- Element attributes are not checked for uniqueness.}
{- Attribute and character data chunks are limited by {- Attribute and character data chunks are limited by
[Sys.max_string_length]. [Sys.max_string_length].
The error [`Max_buffer_size] is raised if the limit is hit.} The error [`Max_buffer_size] is raised if the limit is hit.}
{- Tail recursive.} {- Tail recursive.}
{- Non validating.} {- Non validating.}
} }
{2:output Output} {2:output Output}
{3:outenc Encoding} {3:outenc Encoding}
Outputs only {{:http://www.faqs.org/rfcs/rfc3629.html} UTF-8} Outputs only {{:http://www.faqs.org/rfcs/rfc3629.html} UTF-8}
encoded documents. Strings given to encoded documents. Strings given to
output functions {b must be} UTF-8 encoded, no checks are output functions {b must be} UTF-8 encoded, no checks are
performed. Unicode characters that are not legal XML performed. Unicode characters that are not legal XML
{{:http://www.w3.org/TR/REC-xml/#NT-Char}characters} are replaced {{:http://www.w3.org/TR/REC-xml/#NT-Char}characters} are replaced
by the {{:http://unicode.org/glossary/#replacement_character}Unicode by the {{:http://unicode.org/glossary/#replacement_character}Unicode
replacement character}. replacement character}.
{3:outns Namespaces} {3:outns Namespaces}
@ -556,15 +537,15 @@ module Make (String : String) (Buffer : Buffer with type string = String.t) : S
{{:http://www.w3.org/TR/xml-names11/#dt-expname}expanded names}. {{:http://www.w3.org/TR/xml-names11/#dt-expname}expanded names}.
Expanded names are automatically converted to Expanded names are automatically converted to
{{:http://www.w3.org/TR/xml-names11/#dt-qualname}qualified {{:http://www.w3.org/TR/xml-names11/#dt-qualname}qualified
names} by the output abstraction. There is no particular api to specify names} by the output abstraction. There is no particular api to specify
prefixes and default namespaces, prefixes and default namespaces,
the actual result depends solely on the output the actual result depends solely on the output
of attributes belonging to the {!ns_xmlns} namespace. For example to set of attributes belonging to the {!ns_xmlns} namespace. For example to set
the default namespace of an element to {i http://example.org/myns}, the default namespace of an element to {i http://example.org/myns},
use the following attribute : use the following attribute :
{[(* xmlns='http://example.org/myns' *) {[(* xmlns='http://example.org/myns' *)
let default_ns = (Xmlm.ns_xmlns, "xmlns"), "http://example.org/myns"]} let default_ns = (Xmlm.ns_xmlns, "xmlns"), "http://example.org/myns"]}
To bind the prefix ["ex"] to {i http://example.org/ex}, use the To bind the prefix ["ex"] to {i http://example.org/ex}, use the
following attribute : following attribute :
{[(* xmlns:ex='http://example.org/ex' *) {[(* xmlns:ex='http://example.org/ex' *)
let ex_ns = (Xmlm.ns_xmlns, "ex"), "http://example.org/ex"]} let ex_ns = (Xmlm.ns_xmlns, "ex"), "http://example.org/ex"]}
@ -575,7 +556,7 @@ let ex_ns = (Xmlm.ns_xmlns, "ex"), "http://example.org/ex"]}
The callback [ns_prefix] of an output abstraction can be used to The callback [ns_prefix] of an output abstraction can be used to
give a prefix to a namespace name lacking a prefix binding in the give a prefix to a namespace name lacking a prefix binding in the
current output scope. Given a namespace name the function must return current output scope. Given a namespace name the function must return
the prefix to use. Note that this the prefix to use. Note that this
will {b not} add any namespace declaration attribute to the will {b not} add any namespace declaration attribute to the
output. If the function returns [None], {!output} will raise output. If the function returns [None], {!output} will raise
@ -590,7 +571,7 @@ let ex_ns = (Xmlm.ns_xmlns, "ex"), "http://example.org/ex"]}
line) and nested elements are indented with [c] space line) and nested elements are indented with [c] space
characters. characters.
{3:oseq Sequences of documents (deprecated)} {3:oseq Sequences of documents (deprecated)}
{b WARNING.} This feature is deprecated and will be removed. {b WARNING.} This feature is deprecated and will be removed.
@ -601,44 +582,44 @@ let ex_ns = (Xmlm.ns_xmlns, "ex"), "http://example.org/ex"]}
{3:outmisc Miscellaneous} {3:outmisc Miscellaneous}
{ul {ul
{- Output on a channel does not flush it.} {- Output on a channel does not flush it.}
{- In attribute and character data you provide, markup {- In attribute and character data you provide, markup
delimiters ['<'],['>'],['&'], and ['\"'] are delimiters ['<'],['>'],['&'], and ['\"'] are
automatically escaped to automatically escaped to
{{:http://www.w3.org/TR/REC-xml/#sec-predefined-ent}predefined {{:http://www.w3.org/TR/REC-xml/#sec-predefined-ent}predefined
entities}.} entities}.}
{- No checks are peformed on the prefix and local part of output {- No checks are peformed on the prefix and local part of output
names to verify they are names to verify they are
{{:http://www.w3.org/TR/xml-names11/#NT-NCName}NCName}s. {{:http://www.w3.org/TR/xml-names11/#NT-NCName}NCName}s.
For example using the tag name [("","dip d")] will produce For example using the tag name [("","dip d")] will produce
a non well-formed document because of the space character.} a non well-formed document because of the space character.}
{- Tail recursive.}} {- Tail recursive.}}
{2 Tips} {2 Tips}
{ul {ul
{- The best options to do an input/output round trip {- The best options to do an input/output round trip
and preserve as much information as possible is to and preserve as much information as possible is to
input with [strip = false] and output with [indent = None].} input with [strip = false] and output with [indent = None].}
{- Complete whitespace control on output is achieved {- Complete whitespace control on output is achieved
with [indent = None] and suitable [`Data] signals}} with [indent = None] and suitable [`Data] signals}}
{1:ex Examples} {1:ex Examples}
{2:exseq Sequential processing} {2:exseq Sequential processing}
Sequential processing has the advantage that you don't need to get Sequential processing has the advantage that you don't need to get
the whole document tree in memory to process it. the whole document tree in memory to process it.
The following function reads a {e single} document on an The following function reads a {e single} document on an
input channel and outputs it. input channel and outputs it.
{[let id ic oc = {[let id ic oc =
let i = Xmlm.make_input (`Channel ic) in let i = Xmlm.make_input (`Channel ic) in
let o = Xmlm.make_output (`Channel oc) in let o = Xmlm.make_output (`Channel oc) in
let rec pull i o depth = let rec pull i o depth =
Xmlm.output o (Xmlm.peek i); Xmlm.output o (Xmlm.peek i);
match Xmlm.input i with match Xmlm.input i with
| `El_start _ -> pull i o (depth + 1) | `El_start _ -> pull i o (depth + 1)
| `El_end -> if depth = 1 then () else pull i o (depth - 1) | `El_end -> if depth = 1 then () else pull i o (depth - 1)
| `Data _ -> pull i o depth | `Data _ -> pull i o depth
| `Dtd _ -> assert false | `Dtd _ -> assert false
in in
Xmlm.output o (Xmlm.input i); (* `Dtd *) Xmlm.output o (Xmlm.input i); (* `Dtd *)
@ -647,32 +628,32 @@ let ex_ns = (Xmlm.ns_xmlns, "ex"), "http://example.org/ex"]}
The following function reads a {e sequence} of documents on an The following function reads a {e sequence} of documents on an
input channel and outputs it. input channel and outputs it.
{[let id_seq ic oc = {[let id_seq ic oc =
let i = Xmlm.make_input (`Channel ic) in let i = Xmlm.make_input (`Channel ic) in
let o = Xmlm.make_output ~nl:true (`Channel oc) in let o = Xmlm.make_output ~nl:true (`Channel oc) in
while not (Xmlm.eoi i) do Xmlm.output o (Xmlm.input i) done]} while not (Xmlm.eoi i) do Xmlm.output o (Xmlm.input i) done]}
The following function reads a {e sequence} of documents on the The following function reads a {e sequence} of documents on the
input channel. In each document's tree it prunes non root elements input channel. In each document's tree it prunes non root elements
whose name belongs to [prune_list]. whose name belongs to [prune_list].
{[let prune_docs prune_list ic oc = {[let prune_docs prune_list ic oc =
let i = Xmlm.make_input (`Channel ic) in let i = Xmlm.make_input (`Channel ic) in
let o = Xmlm.make_output ~nl:true (`Channel oc) in let o = Xmlm.make_output ~nl:true (`Channel oc) in
let copy i o = Xmlm.output o (Xmlm.input i) in let copy i o = Xmlm.output o (Xmlm.input i) in
let prune (name, _) = List.mem name prune_list in let prune (name, _) = List.mem name prune_list in
let rec process i o d = let rec process i o d =
let rec skip i d = match Xmlm.input i with let rec skip i d = match Xmlm.input i with
| `El_start _ -> skip i (d + 1) | `El_start _ -> skip i (d + 1)
| `El_end -> if d = 1 then () else skip i (d - 1) | `El_end -> if d = 1 then () else skip i (d - 1)
| s -> skip i d | s -> skip i d
in in
match Xmlm.peek i with match Xmlm.peek i with
| `El_start tag when prune tag -> skip i 0; process i o d | `El_start tag when prune tag -> skip i 0; process i o d
| `El_start _ -> copy i o; process i o (d + 1) | `El_start _ -> copy i o; process i o (d + 1)
| `El_end -> copy i o; if d = 0 then () else process i o (d - 1) | `El_end -> copy i o; if d = 0 then () else process i o (d - 1)
| `Data _ -> copy i o; process i o d | `Data _ -> copy i o; process i o d
| `Dtd _ -> assert false | `Dtd _ -> assert false
in in
let rec docs i o = let rec docs i o =
copy i o; (* `Dtd *) copy i o; (* `Dtd *)
copy i o; (* root start *) copy i o; (* root start *)
process i o 0; process i o 0;
@ -680,24 +661,24 @@ let ex_ns = (Xmlm.ns_xmlns, "ex"), "http://example.org/ex"]}
in in
docs i o]} docs i o]}
{2:extree Tree processing} {2:extree Tree processing}
A document's sequence of signals can be easily converted A document's sequence of signals can be easily converted
to an arborescent data structure. Assume your trees are defined by : to an arborescent data structure. Assume your trees are defined by :
{[type tree = E of Xmlm.tag * tree list | D of string]} {[type tree = E of Xmlm.tag * tree list | D of string]}
The following functions input/output xml documents from/to abstractions The following functions input/output xml documents from/to abstractions
as value of type [tree]. as value of type [tree].
{[let in_tree i = {[let in_tree i =
let el tag childs = E (tag, childs) in let el tag childs = E (tag, childs) in
let data d = D d in let data d = D d in
Xmlm.input_doc_tree ~el ~data i Xmlm.input_doc_tree ~el ~data i
let out_tree o t = let out_tree o t =
let frag = function let frag = function
| E (tag, childs) -> `El (tag, childs) | E (tag, childs) -> `El (tag, childs)
| D d -> `Data d | D d -> `Data d
in in
Xmlm.output_doc_tree frag o t]} Xmlm.output_doc_tree frag o t]}
{2:exrow Tabular data processing} {2:exrow Tabular data processing}
@ -715,7 +696,7 @@ let out_tree o t =
A bureaucrat contains the following elements, in order. A bureaucrat contains the following elements, in order.
{ul {- <name> its name (mandatory, string).} {ul {- <name> its name (mandatory, string).}
{- <surname> its surname (mandatory, string).} {- <surname> its surname (mandatory, string).}
{- <honest> present iff he implemented one of its spec {- <honest> present iff he implemented one of its spec
(optional, empty).} (optional, empty).}
{- <obfuscation_level> its grade on the {- <obfuscation_level> its grade on the
open scale of obfuscation (mandatory, float).} open scale of obfuscation (mandatory, float).}
@ -723,25 +704,25 @@ let out_tree o t =
worked on.}}}}}} worked on.}}}}}}
In OCaml we represent a W3C bureaucrat by this type : In OCaml we represent a W3C bureaucrat by this type :
{[type w3c_bureaucrat = { {[type w3c_bureaucrat = {
name : string; name : string;
surname : string; surname : string;
honest : bool; honest : bool;
obfuscation_level : float; obfuscation_level : float;
trs : string list; }]} trs : string list; }]}
The following functions input and output W3C bureaucrats as lists The following functions input and output W3C bureaucrats as lists
of values of type [w3c_bureaucrat]. of values of type [w3c_bureaucrat].
{[let in_w3c_bureaucrats src = {[let in_w3c_bureaucrats src =
let i = Xmlm.make_input ~strip:true src in let i = Xmlm.make_input ~strip:true src in
let tag n = ("", n), [] in let tag n = ("", n), [] in
let error () = invalid_arg "parse error" in let error () = invalid_arg "parse error" in
let accept s i = if Xmlm.input i = s then () else error () in let accept s i = if Xmlm.input i = s then () else error () in
let rec i_seq el acc i = match Xmlm.peek i with let rec i_seq el acc i = match Xmlm.peek i with
| `El_start _ -> i_seq el ((el i) :: acc) i | `El_start _ -> i_seq el ((el i) :: acc) i
| `El_end -> List.rev acc | `El_end -> List.rev acc
| _ -> error () | _ -> error ()
in in
let i_el n i = let i_el n i =
accept (`El_start (tag n)) i; accept (`El_start (tag n)) i;
let d = match Xmlm.peek i with let d = match Xmlm.peek i with
| `Data d -> ignore (Xmlm.input i); d | `Data d -> ignore (Xmlm.input i); d
@ -751,7 +732,7 @@ let out_tree o t =
accept (`El_end) i; accept (`El_end) i;
d d
in in
let i_bureaucrat i = let i_bureaucrat i =
try try
accept (`El_start (tag "bureaucrat")) i; accept (`El_start (tag "bureaucrat")) i;
let name = i_el "name" i in let name = i_el "name" i in
@ -763,7 +744,7 @@ let out_tree o t =
let obf = float_of_string (i_el "obfuscation_level" i) in let obf = float_of_string (i_el "obfuscation_level" i) in
let trs = i_seq (i_el "tr") [] i in let trs = i_seq (i_el "tr") [] i in
accept (`El_end) i; accept (`El_end) i;
{ name = name; surname = surname; honest = honest; { name = name; surname = surname; honest = honest;
obfuscation_level = obf; trs = trs } obfuscation_level = obf; trs = trs }
with with
| Failure _ -> error () (* float_of_string *) | Failure _ -> error () (* float_of_string *)
@ -775,16 +756,16 @@ let out_tree o t =
if not (Xmlm.eoi i) then invalid_arg "more than one document"; if not (Xmlm.eoi i) then invalid_arg "more than one document";
bl bl
let out_w3c_bureaucrats dst bl = let out_w3c_bureaucrats dst bl =
let tag n = ("", n), [] in let tag n = ("", n), [] in
let o = Xmlm.make_output ~nl:true ~indent:(Some 2) dst in let o = Xmlm.make_output ~nl:true ~indent:(Some 2) dst in
let out = Xmlm.output o in let out = Xmlm.output o in
let o_el n d = let o_el n d =
out (`El_start (tag n)); out (`El_start (tag n));
if d <> "" then out (`Data d); if d <> "" then out (`Data d);
out `El_end out `El_end
in in
let o_bureaucrat b = let o_bureaucrat b =
out (`El_start (tag "bureaucrat")); out (`El_start (tag "bureaucrat"));
o_el "name" b.name; o_el "name" b.name;
o_el "surname" b.surname; o_el "surname" b.surname;
@ -806,7 +787,7 @@ let out_w3c_bureaucrats dst bl =
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
met: met:
1. Redistributions of source code must retain the above copyright 1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer. notice, this list of conditions and the following disclaimer.