This commit is contained in:
John Whitington 2016-11-13 18:04:13 +00:00
parent 2caa9de061
commit 2ec3f672c0
2 changed files with 552 additions and 596 deletions

723
xmlm.ml Normal file → Executable file

File diff suppressed because it is too large Load Diff

425
xmlm.mli Normal file → Executable file
View File

@ -4,54 +4,54 @@
%%NAME%% release %%VERSION%%
---------------------------------------------------------------------------*)
(** Streaming XML codec.
(** Streaming XML codec.
A well-formed sequence of {{!signal}signals} represents an
{{:http://www.w3.org/TR/REC-xml}XML} document tree traversal in
depth first order (this has nothing to do with XML
well-formedness). Input pulls a well-formed sequence of signals
from a data source and output pushes a well-formed sequence of
signals to a data destination. Functions are provided to easily
signals to a data destination. Functions are provided to easily
transform sequences of signals to/from arborescent data structures.
Consult the {{!io}features and limitations} and {{!ex}examples}
Consult the {{!io}features and limitations} and {{!ex}examples}
of use.
{e Release %%VERSION%% %%MAINTAINER%% }
{3 References}
{ul
{- Tim Bray.
{e {{:http://www.xml.com/axml/axml.html}The annotated XML Specification}},
1998.}
{- Tim Bray et al.
{- Tim Bray.
{e {{:http://www.xml.com/axml/axml.html}The annotated XML Specification}},
1998.}
{- Tim Bray et al.
{e {{:http://www.w3.org/TR/xml-names11}Namespaces in XML 1.1 (2nd ed.)}},
2006.}} *)
(** {1 Basic types and values} *)
(** The type for character encodings. For [`UTF_16], endianness is
determined from the
determined from the
{{:http://www.unicode.org/unicode/faq/utf_bom.html#BOM}BOM}. *)
type encoding = [
| `UTF_8
| `UTF_16
(** Endianness determined from the
type encoding = [
| `UTF_8
| `UTF_16
(** Endianness determined from the
{{:http://www.unicode.org/unicode/faq/utf_bom.html#BOM}BOM}. *)
| `UTF_16BE
| `UTF_16LE
| `UTF_16BE
| `UTF_16LE
| `ISO_8859_1
| `US_ASCII ]
type dtd = string option
(** The type for the optional
{{:http://www.w3.org/TR/REC-xml/#dt-doctype}DTD}. *)
type name = string * string
type name = string * string
(** The type for attribute and element's
{{:http://www.w3.org/TR/xml-names11/#dt-expname}expanded names}
{{:http://www.w3.org/TR/xml-names11/#dt-expname}expanded names}
[(uri,local)]. An empty [uri] represents a name without a
namespace name, i.e. an unprefixed name
namespace name, i.e. an unprefixed name
that is not under the scope of a default namespace. *)
type attribute = name * string
@ -69,61 +69,42 @@ child ::= `Data | tree | epsilon ]}
Input and output deal only with well-formed sequences or
exceptions are raised.
*)
val ns_xml : string
(** Namespace name {{:http://www.w3.org/XML/1998/namespace}value} bound to the
val ns_xml : string
(** Namespace name {{:http://www.w3.org/XML/1998/namespace}value} bound to the
reserved ["xml"] prefix. *)
val ns_xmlns : string
(** Namespace name {{:http://www.w3.org/2000/xmlns/}value} bound to the
(** Namespace name {{:http://www.w3.org/2000/xmlns/}value} bound to the
reserved ["xmlns"] prefix. *)
val pp_dtd : Format.formatter -> dtd -> unit
(** [pp_dtd ppf dtd] prints an unspecified representation of [dtd] on [ppf]. *)
val pp_name : Format.formatter -> name -> unit
(** [pp_name ppf name] prints an unspecified representation of [name] on
[ppf]. *)
val pp_attribute : Format.formatter -> attribute -> unit
(** [pp_attribute ppf att] prints an unspecified representation of [att] on
[ppf]. *)
val pp_tag : Format.formatter -> tag -> unit
(** [pp_tag ppf tag] prints an unspecified representation of [tag] on
[ppf]. *)
val pp_signal : Format.formatter -> signal -> unit
(** [pp_signal ppf s] prints an unspecified representation of [s] on
[ppf]. *)
(** {1 Input} *)
type pos = int * int
type pos = int * int
(** The type for input positions. Line and column number, both start
with 1. *)
(** The type for input errors. *)
type error = [
| `Max_buffer_size
| `Max_buffer_size
(** Maximal buffer size exceeded ([Sys.max_string_length]). *)
| `Unexpected_eoi
| `Unexpected_eoi
(** Unexpected end of input. *)
| `Malformed_char_stream
| `Malformed_char_stream
(** Malformed underlying character stream. *)
| `Unknown_encoding of string
| `Unknown_encoding of string
(** Unknown encoding. *)
| `Unknown_entity_ref of string
| `Unknown_entity_ref of string
(** Unknown entity reference, {{!inentity} details}. *)
| `Unknown_ns_prefix of string
| `Unknown_ns_prefix of string
(** Unknown namespace prefix {{!inns} details} *)
| `Illegal_char_ref of string
| `Illegal_char_ref of string
(** Illegal character reference. *)
| `Illegal_char_seq of string
| `Illegal_char_seq of string
(** Illegal character sequence. *)
| `Expected_char_seqs of string list * string
(** Expected one of the character sequences in the list but found another. *)
| `Expected_root_element
| `Expected_root_element
(** Expected the document's root element. *) ]
val error_message : error -> string
@ -132,7 +113,7 @@ val error_message : error -> string
exception Error of pos * error
(** Raised on input errors. *)
type source = [
type source = [
| `Channel of in_channel | `String of int * string | `Fun of (unit -> int) ]
(** The type for input sources. For [`String] starts reading at the
given integer position. For [`Fun] the function must return the
@ -142,70 +123,70 @@ type source = [
type input
(** The type for input abstractions. *)
val make_input : ?enc:encoding option -> ?strip:bool ->
?ns:(string -> string option) ->
val make_input : ?enc:encoding option -> ?strip:bool ->
?ns:(string -> string option) ->
?entity: (string -> string option) -> source -> input
(** Returns a new input abstraction reading from the given source.
{ul
{- [enc], character encoding of the document, {{!inenc} details}.
{ul
{- [enc], character encoding of the document, {{!inenc} details}.
Defaults to [None].}
{- [strip], strips whitespace in character data, {{!inwspace} details}.
Defaults to [false].}
Defaults to [false].}
{- [ns] is called to bind undeclared namespace prefixes,
{{!inns} details}. Default returns always [None].}
{- [entity] is called to resolve non predefined entity references,
{{!inentity} details}. Default returns always [None].}} *)
val input : input -> signal
(** Inputs a signal. Repeated invocation of the function with the same
input abstraction will generate a {{!signal}well-formed} sequence
of signals or an {!Error} is raised. Furthermore there will be no
two consecutive [`Data] signals in the sequence and their string
is always non empty.
{b Deprecated} After a well-formed sequence was input another may
is always non empty.
{b Deprecated} After a well-formed sequence was input another may
be input, see {!eoi} and {{!iseq}details}.
{b Raises} {!Error} on input errors. *)
val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) ->
val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) ->
input -> 'a
(** If the next signal is a :
{ul
{- [`Data] signal, inputs it and invokes [data] with the character data.}
{- [`El_start] signal, inputs the sequence of signals until its
{- [`El_start] signal, inputs the sequence of signals until its
matching [`El_end] and invokes [el] and [data] as follows
{ul
{- [el], is called on each [`El_end] signals with the corresponding
[`El_start] tag and the result of the callback invocation for the
{- [el], is called on each [`El_end] signals with the corresponding
[`El_start] tag and the result of the callback invocation for the
element's children.}
{- [data], is called on each [`Data] signals with the character data.
This function won't be called twice consecutively or with the empty
{- [data], is called on each [`Data] signals with the character data.
This function won't be called twice consecutively or with the empty
string.}}}
{- Other signals, raises [Invalid_argument].}}
{b Raises} {!Error} on input errors and [Invalid_argument]
if the next signal is not [`El_start] or [`Data]. *)
val input_doc_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) ->
val input_doc_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) ->
input -> (dtd * 'a)
(** Same as {!input_tree} but reads a complete {{!signal}well-formed}
sequence of signals.
(** Same as {!input_tree} but reads a complete {{!signal}well-formed}
sequence of signals.
{b Raises} {!Error} on input errors and [Invalid_argument]
if the next signal is not [`Dtd]. *)
val peek : input -> signal
(** Same as {!input} but doesn't remove the signal from the sequence.
(** Same as {!input} but doesn't remove the signal from the sequence.
{b Raises} {!Error} on input errors. *)
val eoi : input -> bool
(** Returns [true] if the end of input is reached. See {{!iseq}details}.
{b Raises} {!Error} on input errors. *)
val pos : input -> pos
val pos : input -> pos
(** Current position in the input abstraction. *)
(** {1 Output} *)
@ -213,31 +194,31 @@ val pos : input -> pos
type 'a frag = [ `El of tag * 'a list | `Data of string ]
(** The type for deconstructing data structures of type ['a]. *)
type dest = [ `Channel of out_channel | `Buffer of Buffer.t
type dest = [ `Channel of out_channel | `Buffer of Buffer.t
| `Fun of (int -> unit) ]
(** The type for output destinations. For [`Buffer], the buffer won't
be cleared. For [`Fun] the function is called with the output {e
bytes} as [int]s. *)
type output
(** The type for output abstractions. *)
val make_output : ?decl:bool -> ?nl:bool -> ?indent:int option ->
val make_output : ?decl:bool -> ?nl:bool -> ?indent:int option ->
?ns_prefix:(string -> string option) -> dest -> output
(** Returns a new output abstraction writing to the given destination.
{ul
{ul
{- [decl], if [true] the {{:http://www.w3.org/TR/REC-xml/#NT-XMLDecl} XML
declaration} is output (defaults to [true]).}
{- [nl], if [true] a newline is output when the root's element [`El_end]
{- [nl], if [true] a newline is output when the root's element [`El_end]
signal is output.
Defaults to [false].}
{- [indent], identation behaviour, see {{!outindent} details}. Defaults to
[None].}
{- [ns_prefix], undeclared namespace prefix bindings,
{- [ns_prefix], undeclared namespace prefix bindings,
see {{!outns}details}. Default returns always [None].}} *)
val output : output -> signal -> unit
(** Outputs a signal.
(** Outputs a signal.
{b Deprecated.} After a well-formed sequence of signals was output
a new well-formed sequence can be output.
@ -249,22 +230,22 @@ val output : output -> signal -> unit
val output_depth : output -> int
(** [output_depth o] is [o]'s current element nesting level (undefined
before the first [`El_start] and after the last [`El_end]). *)
val output_tree : ('a -> 'a frag) -> output -> 'a -> unit
(** Outputs signals corresponding to a value by recursively
applying the given value deconstructor.
{b Raises} see {!output}. *)
val output_doc_tree : ('a -> 'a frag) -> output -> (dtd * 'a) -> unit
(** Same as {!output_tree} but outputs a complete {{!signal}well-formed}
val output_doc_tree : ('a -> 'a frag) -> output -> (dtd * 'a) -> unit
(** Same as {!output_tree} but outputs a complete {{!signal}well-formed}
sequence of signals.
{b Raises} see {!output}. *)
(** {1:sto Functorial interface (deprecated)}
(** {1:sto Functorial interface (deprecated)}
{b WARNING.} The functioral interface is deprecated and will be
{b WARNING.} The functioral interface is deprecated and will be
removed.
{!Make} allows client to specify types for strings and internal
@ -277,16 +258,16 @@ type std_buffer = Buffer.t
(** Input signature for strings. *)
module type String = sig
type t
(** The type for strings. *)
val empty : t
(** The empty string. *)
val length : t -> int
(** Returns the length of the string. *)
val append : t -> t -> t
(** Concatenates two strings. *)
@ -296,7 +277,7 @@ module type String = sig
{{:http://www.unicode.org/glossary/#code_point}code point}). *)
val iter : (int -> unit) -> t -> unit
(** Iterates over the unicode
(** Iterates over the unicode
{{:http://www.unicode.org/glossary/#code_point}code point}
of the given string. *)
@ -316,129 +297,129 @@ module type Buffer = sig
type string
(** The type for strings. *)
type t
type t
(** The type for buffers. *)
exception Full
(** Raised if the buffer cannot be grown. *)
val create : int -> t
(** Creates a buffer of the given size. *)
val add_uchar : t -> int -> unit
(** Adds the given (guaranteed valid) unicode
{{:http://www.unicode.org/glossary/#code_point}code point} to a
buffer.
buffer.
{b Raises} {!Full} if the buffer cannot be grown. *)
val clear : t -> unit
(** Clears the buffer. *)
val contents : t -> string
(** Returns the buffer contents. *)
val length : t -> int
(** Returns the number of characters contained in the buffer. *)
end
(** Output signature of {!Make}. *)
module type S = sig
module type S = sig
(** {1 Basic types and values} *)
(** {1 Basic types and values} *)
type string
type encoding = [
type string
type encoding = [
| `UTF_8 | `UTF_16 | `UTF_16BE | `UTF_16LE | `ISO_8859_1| `US_ASCII ]
type dtd = string option
type name = string * string
type name = string * string
type attribute = name * string
type tag = name * attribute list
type signal = [ `Dtd of dtd | `El_start of tag | `El_end | `Data of string ]
val ns_xml : string
val ns_xml : string
val ns_xmlns : string
(** {1 Input} *)
(** {1 Input} *)
type pos = int * int
type pos = int * int
type error = [
| `Max_buffer_size
| `Max_buffer_size
| `Unexpected_eoi
| `Malformed_char_stream
| `Unknown_encoding of string
| `Unknown_entity_ref of string
| `Unknown_ns_prefix of string
| `Illegal_char_ref of string
| `Illegal_char_seq of string
| `Unknown_entity_ref of string
| `Unknown_ns_prefix of string
| `Illegal_char_ref of string
| `Illegal_char_seq of string
| `Expected_char_seqs of string list * string
| `Expected_root_element ]
exception Error of pos * error
val error_message : error -> string
type source = [
| `Channel of in_channel
type source = [
| `Channel of in_channel
| `String of int * std_string
| `Fun of (unit -> int) ]
type input
val make_input : ?enc:encoding option -> ?strip:bool ->
?ns:(string -> string option) ->
val make_input : ?enc:encoding option -> ?strip:bool ->
?ns:(string -> string option) ->
?entity: (string -> string option) -> source -> input
val input : input -> signal
val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) ->
val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) ->
input -> 'a
val input_doc_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) ->
val input_doc_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) ->
input -> (dtd * 'a)
val peek : input -> signal
val eoi : input -> bool
val pos : input -> pos
val pos : input -> pos
(** {1 Output} *)
type 'a frag = [ `El of tag * 'a list | `Data of string ]
type dest = [
type dest = [
| `Channel of out_channel | `Buffer of std_buffer | `Fun of (int -> unit) ]
type output
val make_output : ?decl:bool -> ?nl:bool -> ?indent:int option ->
val make_output : ?decl:bool -> ?nl:bool -> ?indent:int option ->
?ns_prefix:(string -> string option) -> dest -> output
val output_depth : output -> int
val output : output -> signal -> unit
val output_tree : ('a -> 'a frag) -> output -> 'a -> unit
val output_doc_tree : ('a -> 'a frag) -> output -> (dtd * 'a) -> unit
val output_doc_tree : ('a -> 'a frag) -> output -> (dtd * 'a) -> unit
end
(** Functor building streaming XML IO with the given strings and buffers. *)
module Make (String : String) (Buffer : Buffer with type string = String.t) : S
with type string = String.t
(** {1:io Features and limitations}
The module assumes strings are immutable, thus strings
the client gives or receives {e during} the input and output process
the client gives or receives {e during} the input and output process
must not be modified.
{2:input Input}
{3:inenc Encoding}
{3:inenc Encoding}
The parser supports ASCII, US-ASCII,
The parser supports ASCII, US-ASCII,
{{:http://www.faqs.org/rfcs/rfc3629.html} UTF-8},
{{:http://www.faqs.org/rfcs/rfc2781.html} UTF-16},
{{:http://www.faqs.org/rfcs/rfc2781.html} UTF-16LE},
{{:http://www.faqs.org/rfcs/rfc2781.html} UTF-16BE} and
{{:http://anubis.dkuug.dk/JTC1/SC2/WG3/docs/n411.pdf}ISO-8559-1}
{{:http://anubis.dkuug.dk/JTC1/SC2/WG3/docs/n411.pdf}ISO-8559-1}
(Latin-1) encoded documents. But strings returned by
the library are {b always} UTF-8 encoded.
the library are {b always} UTF-8 encoded.
The encoding can be specified explicitly using the optional
argument [enc]. Otherwise the parser uses UTF-16 or UTF-8 if there is a
{{:http://www.unicode.org/unicode/faq/utf_bom.html#BOM}BOM} at the
@ -450,8 +431,8 @@ module Make (String : String) (Buffer : Buffer with type string = String.t) : S
The parser performs
{{:http://www.w3.org/TR/REC-xml/#AVNormalize}attribute data
normalization} on {e every} attribute data. This means that
attribute data does not have leading and trailling white space and that
any white space is collapsed and transformed to a single space
attribute data does not have leading and trailling white space and that
any white space is collapsed and transformed to a single space
character ([U+0020]).
White space handling of character data depends on the [strip]
@ -480,9 +461,9 @@ module Make (String : String) (Buffer : Buffer with type string = String.t) : S
[(Xmlm.ns_xmlns, "ex")].
Regarding constraints on the usage of the {i xml} and {i xmlns}
prefixes by documents, the parser does not report errors on violations
prefixes by documents, the parser does not report errors on violations
of the {i must} constraints listed in
{{:http://www.w3.org/TR/xml-names11/#xmlReserved}this paragraph}.
{{:http://www.w3.org/TR/xml-names11/#xmlReserved}this paragraph}.
{3:inentity Character and entity references}
@ -494,14 +475,14 @@ module Make (String : String) (Buffer : Buffer with type string = String.t) : S
replacement character data. The replacement data is {e not}
analysed for further references, it is added to the data as such
modulo white space stripping. If [entity] returns [None] the error
[`Unknown_entity_ref] is returned.
[`Unknown_entity_ref] is returned.
{3:iseq Sequences of documents (deprecated)}
{b WARNING.} This feature is deprecated and will be removed.
When a well-formed sequence of signals is input, no data is consumed beyond
the closing ['>'] of the document's root element.
the closing ['>'] of the document's root element.
If you want to parse a document as
{{:http://www.w3.org/TR/REC-xml/#NT-document}defined} in the XML
@ -522,32 +503,32 @@ module Make (String : String) (Buffer : Buffer with type string = String.t) : S
{3:inmisc Miscellaneous}
{ul
{- Parses the more liberal and simpler XML 1.1
{- Parses the more liberal and simpler XML 1.1
{{:http://www.w3.org/TR/xml11/#NT-Name}Name} definition (minus [':'] because
of namespaces).}
{- The {{:http://www.w3.org/TR/REC-xml/#dt-doctype}DTD} is parsed
roughly (no guarantee it is well formed) and its information is ignored.}
{- The parser drops
{{:http://www.w3.org/TR/REC-xml/#dt-comment}comments},
{{:http://www.w3.org/TR/REC-xml/#dt-pi}processing instructions}, and
{- The parser drops
{{:http://www.w3.org/TR/REC-xml/#dt-comment}comments},
{{:http://www.w3.org/TR/REC-xml/#dt-pi}processing instructions}, and
{{:http://www.w3.org/TR/REC-xml/#sec-rmd}standalone declaration}.}
{- Element attributes are not checked for uniqueness.}
{- Attribute and character data chunks are limited by
[Sys.max_string_length].
{- Attribute and character data chunks are limited by
[Sys.max_string_length].
The error [`Max_buffer_size] is raised if the limit is hit.}
{- Tail recursive.}
{- Non validating.}
}
{2:output Output}
{3:outenc Encoding}
{2:output Output}
{3:outenc Encoding}
Outputs only {{:http://www.faqs.org/rfcs/rfc3629.html} UTF-8}
encoded documents. Strings given to
output functions {b must be} UTF-8 encoded, no checks are
performed. Unicode characters that are not legal XML
{{:http://www.w3.org/TR/REC-xml/#NT-Char}characters} are replaced
by the {{:http://unicode.org/glossary/#replacement_character}Unicode
by the {{:http://unicode.org/glossary/#replacement_character}Unicode
replacement character}.
{3:outns Namespaces}
@ -556,15 +537,15 @@ module Make (String : String) (Buffer : Buffer with type string = String.t) : S
{{:http://www.w3.org/TR/xml-names11/#dt-expname}expanded names}.
Expanded names are automatically converted to
{{:http://www.w3.org/TR/xml-names11/#dt-qualname}qualified
names} by the output abstraction. There is no particular api to specify
prefixes and default namespaces,
names} by the output abstraction. There is no particular api to specify
prefixes and default namespaces,
the actual result depends solely on the output
of attributes belonging to the {!ns_xmlns} namespace. For example to set
the default namespace of an element to {i http://example.org/myns},
of attributes belonging to the {!ns_xmlns} namespace. For example to set
the default namespace of an element to {i http://example.org/myns},
use the following attribute :
{[(* xmlns='http://example.org/myns' *)
let default_ns = (Xmlm.ns_xmlns, "xmlns"), "http://example.org/myns"]}
To bind the prefix ["ex"] to {i http://example.org/ex}, use the
To bind the prefix ["ex"] to {i http://example.org/ex}, use the
following attribute :
{[(* xmlns:ex='http://example.org/ex' *)
let ex_ns = (Xmlm.ns_xmlns, "ex"), "http://example.org/ex"]}
@ -575,7 +556,7 @@ let ex_ns = (Xmlm.ns_xmlns, "ex"), "http://example.org/ex"]}
The callback [ns_prefix] of an output abstraction can be used to
give a prefix to a namespace name lacking a prefix binding in the
current output scope. Given a namespace name the function must return
current output scope. Given a namespace name the function must return
the prefix to use. Note that this
will {b not} add any namespace declaration attribute to the
output. If the function returns [None], {!output} will raise
@ -590,7 +571,7 @@ let ex_ns = (Xmlm.ns_xmlns, "ex"), "http://example.org/ex"]}
line) and nested elements are indented with [c] space
characters.
{3:oseq Sequences of documents (deprecated)}
{3:oseq Sequences of documents (deprecated)}
{b WARNING.} This feature is deprecated and will be removed.
@ -601,44 +582,44 @@ let ex_ns = (Xmlm.ns_xmlns, "ex"), "http://example.org/ex"]}
{3:outmisc Miscellaneous}
{ul
{- Output on a channel does not flush it.}
{- In attribute and character data you provide, markup
delimiters ['<'],['>'],['&'], and ['\"'] are
automatically escaped to
{- In attribute and character data you provide, markup
delimiters ['<'],['>'],['&'], and ['\"'] are
automatically escaped to
{{:http://www.w3.org/TR/REC-xml/#sec-predefined-ent}predefined
entities}.}
{- No checks are peformed on the prefix and local part of output
names to verify they are
{{:http://www.w3.org/TR/xml-names11/#NT-NCName}NCName}s.
For example using the tag name [("","dip d")] will produce
For example using the tag name [("","dip d")] will produce
a non well-formed document because of the space character.}
{- Tail recursive.}}
{2 Tips}
{ul
{2 Tips}
{ul
{- The best options to do an input/output round trip
and preserve as much information as possible is to
and preserve as much information as possible is to
input with [strip = false] and output with [indent = None].}
{- Complete whitespace control on output is achieved
{- Complete whitespace control on output is achieved
with [indent = None] and suitable [`Data] signals}}
{1:ex Examples}
{1:ex Examples}
{2:exseq Sequential processing}
{2:exseq Sequential processing}
Sequential processing has the advantage that you don't need to get
the whole document tree in memory to process it.
The following function reads a {e single} document on an
input channel and outputs it.
{[let id ic oc =
let i = Xmlm.make_input (`Channel ic) in
let o = Xmlm.make_output (`Channel oc) in
let rec pull i o depth =
{[let id ic oc =
let i = Xmlm.make_input (`Channel ic) in
let o = Xmlm.make_output (`Channel oc) in
let rec pull i o depth =
Xmlm.output o (Xmlm.peek i);
match Xmlm.input i with
match Xmlm.input i with
| `El_start _ -> pull i o (depth + 1)
| `El_end -> if depth = 1 then () else pull i o (depth - 1)
| `Data _ -> pull i o depth
| `Data _ -> pull i o depth
| `Dtd _ -> assert false
in
Xmlm.output o (Xmlm.input i); (* `Dtd *)
@ -647,32 +628,32 @@ let ex_ns = (Xmlm.ns_xmlns, "ex"), "http://example.org/ex"]}
The following function reads a {e sequence} of documents on an
input channel and outputs it.
{[let id_seq ic oc =
let i = Xmlm.make_input (`Channel ic) in
let o = Xmlm.make_output ~nl:true (`Channel oc) in
{[let id_seq ic oc =
let i = Xmlm.make_input (`Channel ic) in
let o = Xmlm.make_output ~nl:true (`Channel oc) in
while not (Xmlm.eoi i) do Xmlm.output o (Xmlm.input i) done]}
The following function reads a {e sequence} of documents on the
The following function reads a {e sequence} of documents on the
input channel. In each document's tree it prunes non root elements
whose name belongs to [prune_list].
{[let prune_docs prune_list ic oc =
{[let prune_docs prune_list ic oc =
let i = Xmlm.make_input (`Channel ic) in
let o = Xmlm.make_output ~nl:true (`Channel oc) in
let copy i o = Xmlm.output o (Xmlm.input i) in
let prune (name, _) = List.mem name prune_list in
let rec process i o d =
let rec process i o d =
let rec skip i d = match Xmlm.input i with
| `El_start _ -> skip i (d + 1)
| `El_end -> if d = 1 then () else skip i (d - 1)
| s -> skip i d
in
match Xmlm.peek i with
match Xmlm.peek i with
| `El_start tag when prune tag -> skip i 0; process i o d
| `El_start _ -> copy i o; process i o (d + 1)
| `El_end -> copy i o; if d = 0 then () else process i o (d - 1)
| `Data _ -> copy i o; process i o d
| `Dtd _ -> assert false
in
let rec docs i o =
let rec docs i o =
copy i o; (* `Dtd *)
copy i o; (* root start *)
process i o 0;
@ -680,24 +661,24 @@ let ex_ns = (Xmlm.ns_xmlns, "ex"), "http://example.org/ex"]}
in
docs i o]}
{2:extree Tree processing}
{2:extree Tree processing}
A document's sequence of signals can be easily converted
to an arborescent data structure. Assume your trees are defined by :
{[type tree = E of Xmlm.tag * tree list | D of string]}
The following functions input/output xml documents from/to abstractions
The following functions input/output xml documents from/to abstractions
as value of type [tree].
{[let in_tree i =
{[let in_tree i =
let el tag childs = E (tag, childs) in
let data d = D d in
Xmlm.input_doc_tree ~el ~data i
let out_tree o t =
let out_tree o t =
let frag = function
| E (tag, childs) -> `El (tag, childs)
| D d -> `Data d
| E (tag, childs) -> `El (tag, childs)
| D d -> `Data d
in
Xmlm.output_doc_tree frag o t]}
Xmlm.output_doc_tree frag o t]}
{2:exrow Tabular data processing}
@ -715,7 +696,7 @@ let out_tree o t =
A bureaucrat contains the following elements, in order.
{ul {- <name> its name (mandatory, string).}
{- <surname> its surname (mandatory, string).}
{- <honest> present iff he implemented one of its spec
{- <honest> present iff he implemented one of its spec
(optional, empty).}
{- <obfuscation_level> its grade on the
open scale of obfuscation (mandatory, float).}
@ -723,25 +704,25 @@ let out_tree o t =
worked on.}}}}}}
In OCaml we represent a W3C bureaucrat by this type :
{[type w3c_bureaucrat = {
name : string;
surname : string;
honest : bool;
{[type w3c_bureaucrat = {
name : string;
surname : string;
honest : bool;
obfuscation_level : float;
trs : string list; }]}
The following functions input and output W3C bureaucrats as lists
of values of type [w3c_bureaucrat].
{[let in_w3c_bureaucrats src =
{[let in_w3c_bureaucrats src =
let i = Xmlm.make_input ~strip:true src in
let tag n = ("", n), [] in
let error () = invalid_arg "parse error" in
let accept s i = if Xmlm.input i = s then () else error () in
let rec i_seq el acc i = match Xmlm.peek i with
let rec i_seq el acc i = match Xmlm.peek i with
| `El_start _ -> i_seq el ((el i) :: acc) i
| `El_end -> List.rev acc
| _ -> error ()
in
let i_el n i =
let i_el n i =
accept (`El_start (tag n)) i;
let d = match Xmlm.peek i with
| `Data d -> ignore (Xmlm.input i); d
@ -751,7 +732,7 @@ let out_tree o t =
accept (`El_end) i;
d
in
let i_bureaucrat i =
let i_bureaucrat i =
try
accept (`El_start (tag "bureaucrat")) i;
let name = i_el "name" i in
@ -763,7 +744,7 @@ let out_tree o t =
let obf = float_of_string (i_el "obfuscation_level" i) in
let trs = i_seq (i_el "tr") [] i in
accept (`El_end) i;
{ name = name; surname = surname; honest = honest;
{ name = name; surname = surname; honest = honest;
obfuscation_level = obf; trs = trs }
with
| Failure _ -> error () (* float_of_string *)
@ -775,16 +756,16 @@ let out_tree o t =
if not (Xmlm.eoi i) then invalid_arg "more than one document";
bl
let out_w3c_bureaucrats dst bl =
let out_w3c_bureaucrats dst bl =
let tag n = ("", n), [] in
let o = Xmlm.make_output ~nl:true ~indent:(Some 2) dst in
let out = Xmlm.output o in
let o_el n d =
out (`El_start (tag n));
if d <> "" then out (`Data d);
out `El_end
let o_el n d =
out (`El_start (tag n));
if d <> "" then out (`Data d);
out `El_end
in
let o_bureaucrat b =
let o_bureaucrat b =
out (`El_start (tag "bureaucrat"));
o_el "name" b.name;
o_el "surname" b.surname;
@ -806,7 +787,7 @@ let out_w3c_bureaucrats dst bl =
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.