From 18075b84ce76853ad182e400f3974bfdbb1f95ef Mon Sep 17 00:00:00 2001 From: John Whitington Date: Mon, 30 Jul 2018 13:58:25 +0100 Subject: [PATCH] -safe-string --- Makefile | 4 +- cpdfmanual.tex | 1 + xmlm.ml | 2 +- xmlm.mli | 488 +++++++++++++++++++++++++------------------------ 4 files changed, 251 insertions(+), 244 deletions(-) diff --git a/Makefile b/Makefile index 52b9eeb..0f90244 100644 --- a/Makefile +++ b/Makefile @@ -7,8 +7,8 @@ RESULT = cpdf ANNOTATE = true PACKS = camlpdf -OCAMLNCFLAGS = -g -unsafe-string -w -3 -annot -OCAMLBCFLAGS = -g -unsafe-string -w -3 -annot +OCAMLNCFLAGS = -g -safe-string -w -3 -annot +OCAMLBCFLAGS = -g -safe-string -w -3 -annot OCAMLLDFLAGS = -g all : native-code native-code-library byte-code-library top htdoc diff --git a/cpdfmanual.tex b/cpdfmanual.tex index 58f2c2c..113df47 100644 --- a/cpdfmanual.tex +++ b/cpdfmanual.tex @@ -1,3 +1,4 @@ +%FIXME: Finish and document -dump-attachments. %FIXME: Mention that OpenAction supersedes PageLayout so use -remove-dict-option to get rid of it %FIXME: Document new -hard-box option %FIXME: Document that -upright also shifts the page to 0,0 diff --git a/xmlm.ml b/xmlm.ml index 4cf4887..3430e91 100644 --- a/xmlm.ml +++ b/xmlm.ml @@ -961,7 +961,7 @@ struct let make_output ?(decl = true) ?(nl = false) ?(indent = None) ?(ns_prefix = fun _ ->None) d = let outs, outc = match d with - | `Channel c -> (output c), (output_char c) + | `Channel c -> (output_substring c), (output_char c) | `Buffer b -> (Std_buffer.add_substring b), (Std_buffer.add_char b) | `Fun f -> let os s p l = diff --git a/xmlm.mli b/xmlm.mli index 9bd7f09..b8339ab 100755 --- a/xmlm.mli +++ b/xmlm.mli @@ -1,57 +1,57 @@ (*--------------------------------------------------------------------------- - Copyright 2007 Daniel C. Bünzli. All rights reserved. - Distributed under a BSD3 license, see license at the end of the file. - %%NAME%% release %%VERSION%% + Copyright (c) 2007 Daniel C. Bünzli. All rights reserved. + Distributed under the ISC license, see terms at the end of the file. + %%NAME%% %%VERSION%% ---------------------------------------------------------------------------*) -(** Streaming XML codec. +(** Streaming XML codec. A well-formed sequence of {{!signal}signals} represents an {{:http://www.w3.org/TR/REC-xml}XML} document tree traversal in depth first order (this has nothing to do with XML well-formedness). Input pulls a well-formed sequence of signals from a data source and output pushes a well-formed sequence of - signals to a data destination. Functions are provided to easily + signals to a data destination. Functions are provided to easily transform sequences of signals to/from arborescent data structures. - Consult the {{!io}features and limitations} and {{!ex}examples} + Consult the {{!io}features and limitations} and {{!ex}examples} of use. - {e Release %%VERSION%% — %%MAINTAINER%% } + {e %%VERSION%% — {{:%%PKG_HOMEPAGE%% }homepage}} {3 References} {ul - {- Tim Bray. - {e {{:http://www.xml.com/axml/axml.html}The annotated XML Specification}}, - 1998.} - {- Tim Bray et al. + {- Tim Bray. + {e {{:http://www.xml.com/axml/axml.html}The annotated XML Specification}}, + 1998.} + {- Tim Bray et al. {e {{:http://www.w3.org/TR/xml-names11}Namespaces in XML 1.1 (2nd ed.)}}, 2006.}} *) (** {1 Basic types and values} *) (** The type for character encodings. For [`UTF_16], endianness is - determined from the + determined from the {{:http://www.unicode.org/unicode/faq/utf_bom.html#BOM}BOM}. *) -type encoding = [ - | `UTF_8 - | `UTF_16 - (** Endianness determined from the +type encoding = [ + | `UTF_8 + | `UTF_16 + (** Endianness determined from the {{:http://www.unicode.org/unicode/faq/utf_bom.html#BOM}BOM}. *) - | `UTF_16BE - | `UTF_16LE + | `UTF_16BE + | `UTF_16LE | `ISO_8859_1 | `US_ASCII ] - + type dtd = string option (** The type for the optional {{:http://www.w3.org/TR/REC-xml/#dt-doctype}DTD}. *) -type name = string * string +type name = string * string (** The type for attribute and element's - {{:http://www.w3.org/TR/xml-names11/#dt-expname}expanded names} + {{:http://www.w3.org/TR/xml-names11/#dt-expname}expanded names} [(uri,local)]. An empty [uri] represents a name without a - namespace name, i.e. an unprefixed name + namespace name, i.e. an unprefixed name that is not under the scope of a default namespace. *) type attribute = name * string @@ -65,46 +65,69 @@ type signal = [ `Dtd of dtd | `El_start of tag | `El_end | `Data of string ] to the language of the [doc] grammar : {[doc ::= `Dtd tree tree ::= `El_start child `El_end -child ::= `Data | tree | epsilon ]} +child ::= `Data trees | trees +trees ::= tree child | epsilon]} + The [trees] production is used to expresses the fact that there will + never be two consecutive `Data signals in the children of an element. + Input and output deal only with well-formed sequences or - exceptions are raised. -*) - -val ns_xml : string -(** Namespace name {{:http://www.w3.org/XML/1998/namespace}value} bound to the + exceptions are raised. However on output consecutive [`Data] + signals are allowed. *) + +val ns_xml : string +(** Namespace name {{:http://www.w3.org/XML/1998/namespace}value} bound to the reserved ["xml"] prefix. *) - + val ns_xmlns : string -(** Namespace name {{:http://www.w3.org/2000/xmlns/}value} bound to the +(** Namespace name {{:http://www.w3.org/2000/xmlns/}value} bound to the reserved ["xmlns"] prefix. *) +val pp_dtd : Format.formatter -> dtd -> unit +(** [pp_dtd ppf dtd] prints an unspecified representation of [dtd] on [ppf]. *) + +val pp_name : Format.formatter -> name -> unit +(** [pp_name ppf name] prints an unspecified representation of [name] on + [ppf]. *) + +val pp_attribute : Format.formatter -> attribute -> unit +(** [pp_attribute ppf att] prints an unspecified representation of [att] on + [ppf]. *) + +val pp_tag : Format.formatter -> tag -> unit +(** [pp_tag ppf tag] prints an unspecified representation of [tag] on + [ppf]. *) + +val pp_signal : Format.formatter -> signal -> unit +(** [pp_signal ppf s] prints an unspecified representation of [s] on + [ppf]. *) + (** {1 Input} *) -type pos = int * int +type pos = int * int (** The type for input positions. Line and column number, both start with 1. *) (** The type for input errors. *) type error = [ - | `Max_buffer_size + | `Max_buffer_size (** Maximal buffer size exceeded ([Sys.max_string_length]). *) - | `Unexpected_eoi + | `Unexpected_eoi (** Unexpected end of input. *) - | `Malformed_char_stream + | `Malformed_char_stream (** Malformed underlying character stream. *) - | `Unknown_encoding of string + | `Unknown_encoding of string (** Unknown encoding. *) - | `Unknown_entity_ref of string + | `Unknown_entity_ref of string (** Unknown entity reference, {{!inentity} details}. *) - | `Unknown_ns_prefix of string + | `Unknown_ns_prefix of string (** Unknown namespace prefix {{!inns} details} *) - | `Illegal_char_ref of string + | `Illegal_char_ref of string (** Illegal character reference. *) - | `Illegal_char_seq of string + | `Illegal_char_seq of string (** Illegal character sequence. *) | `Expected_char_seqs of string list * string (** Expected one of the character sequences in the list but found another. *) - | `Expected_root_element + | `Expected_root_element (** Expected the document's root element. *) ] val error_message : error -> string @@ -113,7 +136,7 @@ val error_message : error -> string exception Error of pos * error (** Raised on input errors. *) -type source = [ +type source = [ | `Channel of in_channel | `String of int * string | `Fun of (unit -> int) ] (** The type for input sources. For [`String] starts reading at the given integer position. For [`Fun] the function must return the @@ -123,70 +146,70 @@ type source = [ type input (** The type for input abstractions. *) -val make_input : ?enc:encoding option -> ?strip:bool -> - ?ns:(string -> string option) -> +val make_input : ?enc:encoding option -> ?strip:bool -> + ?ns:(string -> string option) -> ?entity: (string -> string option) -> source -> input (** Returns a new input abstraction reading from the given source. - {ul - {- [enc], character encoding of the document, {{!inenc} details}. + {ul + {- [enc], character encoding of the document, {{!inenc} details}. Defaults to [None].} {- [strip], strips whitespace in character data, {{!inwspace} details}. - Defaults to [false].} + Defaults to [false].} {- [ns] is called to bind undeclared namespace prefixes, {{!inns} details}. Default returns always [None].} {- [entity] is called to resolve non predefined entity references, {{!inentity} details}. Default returns always [None].}} *) - + val input : input -> signal (** Inputs a signal. Repeated invocation of the function with the same input abstraction will generate a {{!signal}well-formed} sequence of signals or an {!Error} is raised. Furthermore there will be no two consecutive [`Data] signals in the sequence and their string - is always non empty. - - {b Deprecated} After a well-formed sequence was input another may + is always non empty. + + {b Deprecated} After a well-formed sequence was input another may be input, see {!eoi} and {{!iseq}details}. - + {b Raises} {!Error} on input errors. *) - -val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) -> + +val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) -> input -> 'a (** If the next signal is a : {ul {- [`Data] signal, inputs it and invokes [data] with the character data.} - {- [`El_start] signal, inputs the sequence of signals until its + {- [`El_start] signal, inputs the sequence of signals until its matching [`El_end] and invokes [el] and [data] as follows {ul - {- [el], is called on each [`El_end] signals with the corresponding - [`El_start] tag and the result of the callback invocation for the + {- [el], is called on each [`El_end] signals with the corresponding + [`El_start] tag and the result of the callback invocation for the element's children.} - {- [data], is called on each [`Data] signals with the character data. - This function won't be called twice consecutively or with the empty + {- [data], is called on each [`Data] signals with the character data. + This function won't be called twice consecutively or with the empty string.}}} {- Other signals, raises [Invalid_argument].}} {b Raises} {!Error} on input errors and [Invalid_argument] if the next signal is not [`El_start] or [`Data]. *) -val input_doc_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) -> +val input_doc_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) -> input -> (dtd * 'a) -(** Same as {!input_tree} but reads a complete {{!signal}well-formed} - sequence of signals. - +(** Same as {!input_tree} but reads a complete {{!signal}well-formed} + sequence of signals. + {b Raises} {!Error} on input errors and [Invalid_argument] if the next signal is not [`Dtd]. *) - + val peek : input -> signal -(** Same as {!input} but doesn't remove the signal from the sequence. - - {b Raises} {!Error} on input errors. *) - -val eoi : input -> bool -(** Returns [true] if the end of input is reached. See {{!iseq}details}. - +(** Same as {!input} but doesn't remove the signal from the sequence. + {b Raises} {!Error} on input errors. *) -val pos : input -> pos +val eoi : input -> bool +(** Returns [true] if the end of input is reached. See {{!iseq}details}. + + {b Raises} {!Error} on input errors. *) + +val pos : input -> pos (** Current position in the input abstraction. *) (** {1 Output} *) @@ -194,31 +217,31 @@ val pos : input -> pos type 'a frag = [ `El of tag * 'a list | `Data of string ] (** The type for deconstructing data structures of type ['a]. *) -type dest = [ `Channel of out_channel | `Buffer of Buffer.t +type dest = [ `Channel of out_channel | `Buffer of Buffer.t | `Fun of (int -> unit) ] (** The type for output destinations. For [`Buffer], the buffer won't be cleared. For [`Fun] the function is called with the output {e bytes} as [int]s. *) - + type output (** The type for output abstractions. *) - -val make_output : ?decl:bool -> ?nl:bool -> ?indent:int option -> + +val make_output : ?decl:bool -> ?nl:bool -> ?indent:int option -> ?ns_prefix:(string -> string option) -> dest -> output (** Returns a new output abstraction writing to the given destination. - {ul + {ul {- [decl], if [true] the {{:http://www.w3.org/TR/REC-xml/#NT-XMLDecl} XML declaration} is output (defaults to [true]).} - {- [nl], if [true] a newline is output when the root's element [`El_end] + {- [nl], if [true] a newline is output when the root's element [`El_end] signal is output. Defaults to [false].} {- [indent], identation behaviour, see {{!outindent} details}. Defaults to [None].} - {- [ns_prefix], undeclared namespace prefix bindings, + {- [ns_prefix], undeclared namespace prefix bindings, see {{!outns}details}. Default returns always [None].}} *) val output : output -> signal -> unit -(** Outputs a signal. +(** Outputs a signal. {b Deprecated.} After a well-formed sequence of signals was output a new well-formed sequence can be output. @@ -230,22 +253,22 @@ val output : output -> signal -> unit val output_depth : output -> int (** [output_depth o] is [o]'s current element nesting level (undefined before the first [`El_start] and after the last [`El_end]). *) - + val output_tree : ('a -> 'a frag) -> output -> 'a -> unit (** Outputs signals corresponding to a value by recursively applying the given value deconstructor. - + {b Raises} see {!output}. *) - -val output_doc_tree : ('a -> 'a frag) -> output -> (dtd * 'a) -> unit -(** Same as {!output_tree} but outputs a complete {{!signal}well-formed} + +val output_doc_tree : ('a -> 'a frag) -> output -> (dtd * 'a) -> unit +(** Same as {!output_tree} but outputs a complete {{!signal}well-formed} sequence of signals. {b Raises} see {!output}. *) -(** {1:sto Functorial interface (deprecated)} +(** {1:sto Functorial interface (deprecated)} - {b WARNING.} The functioral interface is deprecated and will be + {b WARNING.} The functioral interface is deprecated and will be removed. {!Make} allows client to specify types for strings and internal @@ -258,16 +281,16 @@ type std_buffer = Buffer.t (** Input signature for strings. *) module type String = sig - + type t (** The type for strings. *) - + val empty : t (** The empty string. *) - + val length : t -> int (** Returns the length of the string. *) - + val append : t -> t -> t (** Concatenates two strings. *) @@ -277,7 +300,7 @@ module type String = sig {{:http://www.unicode.org/glossary/#code_point}code point}). *) val iter : (int -> unit) -> t -> unit - (** Iterates over the unicode + (** Iterates over the unicode {{:http://www.unicode.org/glossary/#code_point}code point} of the given string. *) @@ -297,129 +320,129 @@ module type Buffer = sig type string (** The type for strings. *) - - type t + + type t (** The type for buffers. *) exception Full (** Raised if the buffer cannot be grown. *) - + val create : int -> t (** Creates a buffer of the given size. *) val add_uchar : t -> int -> unit (** Adds the given (guaranteed valid) unicode {{:http://www.unicode.org/glossary/#code_point}code point} to a - buffer. + buffer. {b Raises} {!Full} if the buffer cannot be grown. *) val clear : t -> unit (** Clears the buffer. *) - + val contents : t -> string (** Returns the buffer contents. *) - + val length : t -> int (** Returns the number of characters contained in the buffer. *) end (** Output signature of {!Make}. *) -module type S = sig - - (** {1 Basic types and values} *) +module type S = sig - type string - - type encoding = [ + (** {1 Basic types and values} *) + + type string + + type encoding = [ | `UTF_8 | `UTF_16 | `UTF_16BE | `UTF_16LE | `ISO_8859_1| `US_ASCII ] type dtd = string option - type name = string * string + type name = string * string type attribute = name * string type tag = name * attribute list type signal = [ `Dtd of dtd | `El_start of tag | `El_end | `Data of string ] - - val ns_xml : string - val ns_xmlns : string - - (** {1 Input} *) - type pos = int * int + val ns_xml : string + val ns_xmlns : string + + (** {1 Input} *) + + type pos = int * int type error = [ - | `Max_buffer_size + | `Max_buffer_size | `Unexpected_eoi | `Malformed_char_stream | `Unknown_encoding of string - | `Unknown_entity_ref of string - | `Unknown_ns_prefix of string - | `Illegal_char_ref of string - | `Illegal_char_seq of string + | `Unknown_entity_ref of string + | `Unknown_ns_prefix of string + | `Illegal_char_ref of string + | `Illegal_char_seq of string | `Expected_char_seqs of string list * string | `Expected_root_element ] - + exception Error of pos * error val error_message : error -> string - type source = [ - | `Channel of in_channel + type source = [ + | `Channel of in_channel | `String of int * std_string | `Fun of (unit -> int) ] type input - - val make_input : ?enc:encoding option -> ?strip:bool -> - ?ns:(string -> string option) -> + + val make_input : ?enc:encoding option -> ?strip:bool -> + ?ns:(string -> string option) -> ?entity: (string -> string option) -> source -> input - + val input : input -> signal - - val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) -> + + val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) -> input -> 'a - - val input_doc_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) -> + + val input_doc_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) -> input -> (dtd * 'a) - + val peek : input -> signal val eoi : input -> bool - val pos : input -> pos - + val pos : input -> pos + (** {1 Output} *) - + type 'a frag = [ `El of tag * 'a list | `Data of string ] - type dest = [ + type dest = [ | `Channel of out_channel | `Buffer of std_buffer | `Fun of (int -> unit) ] - + type output - val make_output : ?decl:bool -> ?nl:bool -> ?indent:int option -> + val make_output : ?decl:bool -> ?nl:bool -> ?indent:int option -> ?ns_prefix:(string -> string option) -> dest -> output - + val output_depth : output -> int val output : output -> signal -> unit val output_tree : ('a -> 'a frag) -> output -> 'a -> unit - val output_doc_tree : ('a -> 'a frag) -> output -> (dtd * 'a) -> unit + val output_doc_tree : ('a -> 'a frag) -> output -> (dtd * 'a) -> unit end (** Functor building streaming XML IO with the given strings and buffers. *) module Make (String : String) (Buffer : Buffer with type string = String.t) : S with type string = String.t - + (** {1:io Features and limitations} - + The module assumes strings are immutable, thus strings - the client gives or receives {e during} the input and output process + the client gives or receives {e during} the input and output process must not be modified. {2:input Input} - {3:inenc Encoding} + {3:inenc Encoding} - The parser supports ASCII, US-ASCII, + The parser supports ASCII, US-ASCII, {{:http://www.faqs.org/rfcs/rfc3629.html} UTF-8}, {{:http://www.faqs.org/rfcs/rfc2781.html} UTF-16}, {{:http://www.faqs.org/rfcs/rfc2781.html} UTF-16LE}, {{:http://www.faqs.org/rfcs/rfc2781.html} UTF-16BE} and - {{:http://anubis.dkuug.dk/JTC1/SC2/WG3/docs/n411.pdf}ISO-8559-1} + {{:http://anubis.dkuug.dk/JTC1/SC2/WG3/docs/n411.pdf}ISO-8559-1} (Latin-1) encoded documents. But strings returned by - the library are {b always} UTF-8 encoded. - + the library are {b always} UTF-8 encoded. + The encoding can be specified explicitly using the optional argument [enc]. Otherwise the parser uses UTF-16 or UTF-8 if there is a {{:http://www.unicode.org/unicode/faq/utf_bom.html#BOM}BOM} at the @@ -431,8 +454,8 @@ module Make (String : String) (Buffer : Buffer with type string = String.t) : S The parser performs {{:http://www.w3.org/TR/REC-xml/#AVNormalize}attribute data normalization} on {e every} attribute data. This means that - attribute data does not have leading and trailling white space and that - any white space is collapsed and transformed to a single space + attribute data does not have leading and trailling white space and that + any white space is collapsed and transformed to a single space character ([U+0020]). White space handling of character data depends on the [strip] @@ -461,9 +484,9 @@ module Make (String : String) (Buffer : Buffer with type string = String.t) : S [(Xmlm.ns_xmlns, "ex")]. Regarding constraints on the usage of the {i xml} and {i xmlns} - prefixes by documents, the parser does not report errors on violations + prefixes by documents, the parser does not report errors on violations of the {i must} constraints listed in - {{:http://www.w3.org/TR/xml-names11/#xmlReserved}this paragraph}. + {{:http://www.w3.org/TR/xml-names11/#xmlReserved}this paragraph}. {3:inentity Character and entity references} @@ -475,14 +498,14 @@ module Make (String : String) (Buffer : Buffer with type string = String.t) : S replacement character data. The replacement data is {e not} analysed for further references, it is added to the data as such modulo white space stripping. If [entity] returns [None] the error - [`Unknown_entity_ref] is returned. + [`Unknown_entity_ref] is returned. {3:iseq Sequences of documents (deprecated)} {b WARNING.} This feature is deprecated and will be removed. When a well-formed sequence of signals is input, no data is consumed beyond - the closing ['>'] of the document's root element. + the closing ['>'] of the document's root element. If you want to parse a document as {{:http://www.w3.org/TR/REC-xml/#NT-document}defined} in the XML @@ -503,32 +526,32 @@ module Make (String : String) (Buffer : Buffer with type string = String.t) : S {3:inmisc Miscellaneous} {ul - {- Parses the more liberal and simpler XML 1.1 + {- Parses the more liberal and simpler XML 1.1 {{:http://www.w3.org/TR/xml11/#NT-Name}Name} definition (minus [':'] because of namespaces).} {- The {{:http://www.w3.org/TR/REC-xml/#dt-doctype}DTD} is parsed roughly (no guarantee it is well formed) and its information is ignored.} - {- The parser drops - {{:http://www.w3.org/TR/REC-xml/#dt-comment}comments}, - {{:http://www.w3.org/TR/REC-xml/#dt-pi}processing instructions}, and + {- The parser drops + {{:http://www.w3.org/TR/REC-xml/#dt-comment}comments}, + {{:http://www.w3.org/TR/REC-xml/#dt-pi}processing instructions}, and {{:http://www.w3.org/TR/REC-xml/#sec-rmd}standalone declaration}.} {- Element attributes are not checked for uniqueness.} - {- Attribute and character data chunks are limited by - [Sys.max_string_length]. + {- Attribute and character data chunks are limited by + [Sys.max_string_length]. The error [`Max_buffer_size] is raised if the limit is hit.} {- Tail recursive.} {- Non validating.} } - - {2:output Output} - {3:outenc Encoding} + + {2:output Output} + {3:outenc Encoding} Outputs only {{:http://www.faqs.org/rfcs/rfc3629.html} UTF-8} encoded documents. Strings given to output functions {b must be} UTF-8 encoded, no checks are performed. Unicode characters that are not legal XML {{:http://www.w3.org/TR/REC-xml/#NT-Char}characters} are replaced - by the {{:http://unicode.org/glossary/#replacement_character}Unicode + by the {{:http://unicode.org/glossary/#replacement_character}Unicode replacement character}. {3:outns Namespaces} @@ -537,15 +560,15 @@ module Make (String : String) (Buffer : Buffer with type string = String.t) : S {{:http://www.w3.org/TR/xml-names11/#dt-expname}expanded names}. Expanded names are automatically converted to {{:http://www.w3.org/TR/xml-names11/#dt-qualname}qualified - names} by the output abstraction. There is no particular api to specify - prefixes and default namespaces, + names} by the output abstraction. There is no particular api to specify + prefixes and default namespaces, the actual result depends solely on the output - of attributes belonging to the {!ns_xmlns} namespace. For example to set - the default namespace of an element to {i http://example.org/myns}, + of attributes belonging to the {!ns_xmlns} namespace. For example to set + the default namespace of an element to {i http://example.org/myns}, use the following attribute : {[(* xmlns='http://example.org/myns' *) let default_ns = (Xmlm.ns_xmlns, "xmlns"), "http://example.org/myns"]} - To bind the prefix ["ex"] to {i http://example.org/ex}, use the + To bind the prefix ["ex"] to {i http://example.org/ex}, use the following attribute : {[(* xmlns:ex='http://example.org/ex' *) let ex_ns = (Xmlm.ns_xmlns, "ex"), "http://example.org/ex"]} @@ -556,7 +579,7 @@ let ex_ns = (Xmlm.ns_xmlns, "ex"), "http://example.org/ex"]} The callback [ns_prefix] of an output abstraction can be used to give a prefix to a namespace name lacking a prefix binding in the - current output scope. Given a namespace name the function must return + current output scope. Given a namespace name the function must return the prefix to use. Note that this will {b not} add any namespace declaration attribute to the output. If the function returns [None], {!output} will raise @@ -571,7 +594,7 @@ let ex_ns = (Xmlm.ns_xmlns, "ex"), "http://example.org/ex"]} line) and nested elements are indented with [c] space characters. - {3:oseq Sequences of documents (deprecated)} + {3:oseq Sequences of documents (deprecated)} {b WARNING.} This feature is deprecated and will be removed. @@ -582,44 +605,44 @@ let ex_ns = (Xmlm.ns_xmlns, "ex"), "http://example.org/ex"]} {3:outmisc Miscellaneous} {ul {- Output on a channel does not flush it.} - {- In attribute and character data you provide, markup - delimiters ['<'],['>'],['&'], and ['\"'] are - automatically escaped to + {- In attribute and character data you provide, markup + delimiters ['<'],['>'],['&'], and ['\"'] are + automatically escaped to {{:http://www.w3.org/TR/REC-xml/#sec-predefined-ent}predefined entities}.} {- No checks are peformed on the prefix and local part of output names to verify they are {{:http://www.w3.org/TR/xml-names11/#NT-NCName}NCName}s. - For example using the tag name [("","dip d")] will produce + For example using the tag name [("","dip d")] will produce a non well-formed document because of the space character.} {- Tail recursive.}} - {2 Tips} - {ul + {2 Tips} + {ul {- The best options to do an input/output round trip - and preserve as much information as possible is to + and preserve as much information as possible is to input with [strip = false] and output with [indent = None].} - {- Complete whitespace control on output is achieved + {- Complete whitespace control on output is achieved with [indent = None] and suitable [`Data] signals}} - {1:ex Examples} + {1:ex Examples} - {2:exseq Sequential processing} + {2:exseq Sequential processing} Sequential processing has the advantage that you don't need to get the whole document tree in memory to process it. The following function reads a {e single} document on an input channel and outputs it. -{[let id ic oc = - let i = Xmlm.make_input (`Channel ic) in - let o = Xmlm.make_output (`Channel oc) in - let rec pull i o depth = +{[let id ic oc = + let i = Xmlm.make_input (`Channel ic) in + let o = Xmlm.make_output (`Channel oc) in + let rec pull i o depth = Xmlm.output o (Xmlm.peek i); - match Xmlm.input i with + match Xmlm.input i with | `El_start _ -> pull i o (depth + 1) | `El_end -> if depth = 1 then () else pull i o (depth - 1) - | `Data _ -> pull i o depth + | `Data _ -> pull i o depth | `Dtd _ -> assert false in Xmlm.output o (Xmlm.input i); (* `Dtd *) @@ -628,32 +651,32 @@ let ex_ns = (Xmlm.ns_xmlns, "ex"), "http://example.org/ex"]} The following function reads a {e sequence} of documents on an input channel and outputs it. -{[let id_seq ic oc = - let i = Xmlm.make_input (`Channel ic) in - let o = Xmlm.make_output ~nl:true (`Channel oc) in +{[let id_seq ic oc = + let i = Xmlm.make_input (`Channel ic) in + let o = Xmlm.make_output ~nl:true (`Channel oc) in while not (Xmlm.eoi i) do Xmlm.output o (Xmlm.input i) done]} - The following function reads a {e sequence} of documents on the + The following function reads a {e sequence} of documents on the input channel. In each document's tree it prunes non root elements whose name belongs to [prune_list]. -{[let prune_docs prune_list ic oc = +{[let prune_docs prune_list ic oc = let i = Xmlm.make_input (`Channel ic) in let o = Xmlm.make_output ~nl:true (`Channel oc) in let copy i o = Xmlm.output o (Xmlm.input i) in let prune (name, _) = List.mem name prune_list in - let rec process i o d = + let rec process i o d = let rec skip i d = match Xmlm.input i with | `El_start _ -> skip i (d + 1) | `El_end -> if d = 1 then () else skip i (d - 1) | s -> skip i d in - match Xmlm.peek i with + match Xmlm.peek i with | `El_start tag when prune tag -> skip i 0; process i o d | `El_start _ -> copy i o; process i o (d + 1) | `El_end -> copy i o; if d = 0 then () else process i o (d - 1) | `Data _ -> copy i o; process i o d | `Dtd _ -> assert false in - let rec docs i o = + let rec docs i o = copy i o; (* `Dtd *) copy i o; (* root start *) process i o 0; @@ -661,24 +684,24 @@ let ex_ns = (Xmlm.ns_xmlns, "ex"), "http://example.org/ex"]} in docs i o]} - {2:extree Tree processing} + {2:extree Tree processing} A document's sequence of signals can be easily converted to an arborescent data structure. Assume your trees are defined by : {[type tree = E of Xmlm.tag * tree list | D of string]} - The following functions input/output xml documents from/to abstractions + The following functions input/output xml documents from/to abstractions as value of type [tree]. -{[let in_tree i = +{[let in_tree i = let el tag childs = E (tag, childs) in let data d = D d in Xmlm.input_doc_tree ~el ~data i -let out_tree o t = +let out_tree o t = let frag = function - | E (tag, childs) -> `El (tag, childs) - | D d -> `Data d + | E (tag, childs) -> `El (tag, childs) + | D d -> `Data d in - Xmlm.output_doc_tree frag o t]} + Xmlm.output_doc_tree frag o t]} {2:exrow Tabular data processing} @@ -696,7 +719,7 @@ let out_tree o t = A bureaucrat contains the following elements, in order. {ul {- its name (mandatory, string).} {- its surname (mandatory, string).} - {- present iff he implemented one of its spec + {- present iff he implemented one of its spec (optional, empty).} {- its grade on the open scale of obfuscation (mandatory, float).} @@ -704,25 +727,25 @@ let out_tree o t = worked on.}}}}}} In OCaml we represent a W3C bureaucrat by this type : -{[type w3c_bureaucrat = { - name : string; - surname : string; - honest : bool; +{[type w3c_bureaucrat = { + name : string; + surname : string; + honest : bool; obfuscation_level : float; trs : string list; }]} The following functions input and output W3C bureaucrats as lists of values of type [w3c_bureaucrat]. -{[let in_w3c_bureaucrats src = +{[let in_w3c_bureaucrats src = let i = Xmlm.make_input ~strip:true src in let tag n = ("", n), [] in let error () = invalid_arg "parse error" in let accept s i = if Xmlm.input i = s then () else error () in - let rec i_seq el acc i = match Xmlm.peek i with + let rec i_seq el acc i = match Xmlm.peek i with | `El_start _ -> i_seq el ((el i) :: acc) i | `El_end -> List.rev acc | _ -> error () in - let i_el n i = + let i_el n i = accept (`El_start (tag n)) i; let d = match Xmlm.peek i with | `Data d -> ignore (Xmlm.input i); d @@ -732,7 +755,7 @@ let out_tree o t = accept (`El_end) i; d in - let i_bureaucrat i = + let i_bureaucrat i = try accept (`El_start (tag "bureaucrat")) i; let name = i_el "name" i in @@ -744,7 +767,7 @@ let out_tree o t = let obf = float_of_string (i_el "obfuscation_level" i) in let trs = i_seq (i_el "tr") [] i in accept (`El_end) i; - { name = name; surname = surname; honest = honest; + { name = name; surname = surname; honest = honest; obfuscation_level = obf; trs = trs } with | Failure _ -> error () (* float_of_string *) @@ -756,16 +779,16 @@ let out_tree o t = if not (Xmlm.eoi i) then invalid_arg "more than one document"; bl -let out_w3c_bureaucrats dst bl = +let out_w3c_bureaucrats dst bl = let tag n = ("", n), [] in let o = Xmlm.make_output ~nl:true ~indent:(Some 2) dst in let out = Xmlm.output o in - let o_el n d = - out (`El_start (tag n)); - if d <> "" then out (`Data d); - out `El_end + let o_el n d = + out (`El_start (tag n)); + if d <> "" then out (`Data d); + out `El_end in - let o_bureaucrat b = + let o_bureaucrat b = out (`El_start (tag "bureaucrat")); o_el "name" b.name; o_el "surname" b.surname; @@ -781,34 +804,17 @@ let out_w3c_bureaucrats dst bl = *) (*--------------------------------------------------------------------------- - Copyright 2007 Daniel C. Bünzli - All rights reserved. + Copyright (c) 2007 Daniel C. Bünzli - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the - distribution. - - 3. Neither the name of Daniel C. Bünzli nor the names of - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ---------------------------------------------------------------------------*)