1 /*
2  * Copyright (c) 2004 World Wide Web Consortium,
3  *
4  * (Massachusetts Institute of Technology, European Research Consortium for
5  * Informatics and Mathematics, Keio University). All Rights Reserved. This
6  * work is distributed under the W3C(r) Software License [1] in the hope that
7  * it will be useful, but WITHOUT ANY WARRANTY; without even the implied
8  * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
9  *
10  * [1] http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231
11  */
12 
13 package org.w3c.dom.ls;
14 
15 import org.w3c.dom.DOMConfiguration;
16 import org.w3c.dom.DOMException;
17 import org.w3c.dom.Node;
18 
/**
 * A <code>LSSerializer</code> provides an API for serializing (writing) a
 * DOM document out into XML. The XML data is written to a string or an
 * output stream. Any changes or fixups made during the serialization affect
 * only the serialized data. The <code>Document</code> object and its
 * children are never altered by the serialization operation.
 * <p> During serialization of XML data, namespace fixup is done as defined in
 * [<a href='http://www.w3.org/TR/2004/REC-DOM-Level-3-Core-20040407'>DOM Level 3 Core</a>],
 * Appendix B.
 * [<a href='http://www.w3.org/TR/2000/REC-DOM-Level-2-Core-20001113'>DOM Level 2 Core</a>]
 * allows empty strings as a real namespace URI. If the
 * <code>namespaceURI</code> of a <code>Node</code> is the empty string, the
 * serialization will treat it as <code>null</code>, ignoring the prefix
 * if any.
 * <p> <code>LSSerializer</code> accepts any node type for serialization. For
 * nodes of type <code>Document</code> or <code>Entity</code>, well-formed
 * XML will be created when possible (well-formedness is guaranteed if the
 * document or entity comes from a parse operation and is unchanged since it
 * was created). The serialized output for these node types is either an
 * XML document or an External XML Entity, respectively, and is acceptable
 * input for an XML parser. For all other types of nodes the serialized form
 * is implementation dependent.
 * <p>Within a <code>Document</code>, <code>DocumentFragment</code>, or
 * <code>Entity</code> being serialized, <code>Node</code>s are processed as
 * follows:
 * <ul>
 * <li> <code>Document</code> nodes are written, including the XML
 * declaration (unless the parameter "xml-declaration" is set to
 * <code>false</code>) and a DTD subset, if one exists in the DOM. Writing a
 * <code>Document</code> node serializes the entire document.
 * </li>
 * <li>
 * <code>Entity</code> nodes, when written directly by
 * <code>LSSerializer.write</code>, output the entity expansion but no
 * namespace fixup is done. The resulting output will be valid as an
 * external entity.
 * </li>
 * <li> If the parameter
 * "<a href='http://www.w3.org/TR/DOM-Level-3-Core/core.html#parameter-entities'>entities</a>"
 * is set to <code>true</code>, <code>EntityReference</code> nodes are
 * serialized as an entity reference of the form
 * "<code>&amp;entityName;</code>" in the output. Child nodes (the expansion)
 * of the entity reference are ignored. If the parameter
 * "<a href='http://www.w3.org/TR/DOM-Level-3-Core/core.html#parameter-entities'>entities</a>"
 * is set to <code>false</code>, only the children of the entity reference
 * are serialized. <code>EntityReference</code> nodes with no children (no
 * corresponding <code>Entity</code> node or the corresponding
 * <code>Entity</code> nodes have no children) are always serialized.
 * </li>
 * <li>
 * <code>CDATASection</code>s containing content characters that cannot be
 * represented in the specified output encoding are handled according to the
 * "<a href='http://www.w3.org/TR/DOM-Level-3-Core/core.html#parameter-split-cdata-sections'>split-cdata-sections</a>"
 * parameter. If the parameter is set to <code>true</code>,
 * <code>CDATASection</code>s are split, and the unrepresentable characters
 * are serialized as numeric character references in ordinary content. The
 * exact position and number of splits is not specified. If the parameter
 * is set to <code>false</code>, unrepresentable characters in a
 * <code>CDATASection</code> are reported as
 * <code>"wf-invalid-character"</code> errors if the parameter
 * "<a href='http://www.w3.org/TR/DOM-Level-3-Core/core.html#parameter-well-formed'>well-formed</a>"
 * is set to <code>true</code>. The error is not recoverable - there is no
 * mechanism for supplying alternative characters and continuing with the
 * serialization.
 * </li>
 * <li> <code>DocumentFragment</code> nodes are serialized by
 * serializing the children of the document fragment in the order they
 * appear in the document fragment.
 * </li>
 * <li> All other node types (Element, Text,
 * etc.) are serialized to their corresponding XML source form.
 * </li>
 * </ul>
 * <p><b>Note:</b> The serialization of a <code>Node</code> does not always
 * generate a well-formed XML document, i.e. a <code>LSParser</code> might
 * throw fatal errors when parsing the resulting serialization.
 * <p> Within the character data of a document (outside of markup), any
 * characters that cannot be represented directly are replaced with
 * character references. Occurrences of '&lt;' and '&amp;' are replaced by
 * the predefined entities &amp;lt; and &amp;amp;. The other predefined
 * entities (&amp;gt;, &amp;apos;, and &amp;quot;) might not be used, except
 * where needed (e.g. using &amp;gt; in cases such as ']]&gt;'). Any
 * characters that cannot be represented directly in the output character
 * encoding are serialized as numeric character references (and since
 * character encoding standards commonly use hexadecimal representations of
 * characters, using the hexadecimal representation when serializing
 * character references is encouraged).
 * <p> To allow attribute values to contain both single and double quotes, the
 * apostrophe or single-quote character (') may be represented as
 * "&amp;apos;", and the double-quote character (") as "&amp;quot;". New
 * line characters and other characters that cannot be represented directly
 * in attribute values in the output character encoding are serialized as a
 * numeric character reference.
 * <p> Within markup, but outside of attributes, any occurrence of a character
 * that cannot be represented in the output character encoding is reported
 * as a <code>DOMError</code> fatal error. An example would be serializing
 * the element &lt;LaCa\u00f1ada/&gt; with <code>encoding="us-ascii"</code>.
 * This will result in the generation of a <code>DOMError</code>
 * "wf-invalid-character-in-node-name" (as proposed in
 * "<a href='http://www.w3.org/TR/DOM-Level-3-Core/core.html#parameter-well-formed'>well-formed</a>").
 * <p> When requested by setting the parameter
 * "<a href='http://www.w3.org/TR/DOM-Level-3-Core/core.html#parameter-normalize-characters'>normalize-characters</a>"
 * on <code>LSSerializer</code> to true, character normalization is
 * performed according to the definition of
 * <a href='http://www.w3.org/TR/2004/REC-xml11-20040204/#dt-fullnorm'>fully
 * normalized</a> characters included in appendix E of
 * [<a href='http://www.w3.org/TR/2004/REC-xml11-20040204/'>XML 1.1</a>] on all
 * data to be serialized, both markup and character data. The character
 * normalization process affects only the data as it is being written; it
 * does not alter the DOM's view of the document after serialization has
 * completed.
 * <p> Implementations are required to support the encodings "UTF-8",
 * "UTF-16", "UTF-16BE", and "UTF-16LE" to guarantee that data is
 * serializable in all encodings that are required to be supported by all
 * XML parsers. When the encoding is UTF-8, whether or not a byte order mark
 * is serialized, or if the output is big-endian or little-endian, is
 * implementation dependent. When the encoding is UTF-16, whether or not the
 * output is big-endian or little-endian is implementation dependent, but a
 * Byte Order Mark must be generated for non-character outputs, such as
 * <code>LSOutput.byteStream</code> or <code>LSOutput.systemId</code>. If
 * the Byte Order Mark is not generated, a "byte-order-mark-needed" warning
 * is reported. When the encoding is UTF-16LE or UTF-16BE, the output is
 * big-endian (UTF-16BE) or little-endian (UTF-16LE) and the Byte Order Mark
 * is not generated. In all cases, the encoding declaration, if
 * generated, will correspond to the encoding used during the serialization
 * (e.g. <code>encoding="UTF-16"</code> will appear if UTF-16 was
 * requested).
 * <p> Namespaces are fixed up during serialization; the serialization process
 * will verify that namespace declarations, namespace prefixes and the
 * namespace URI associated with elements and attributes are consistent. If
 * inconsistencies are found, the serialized form of the document will be
 * altered to remove them. The method used for doing the namespace fixup
 * while serializing a document is the algorithm defined in Appendix B.1,
 * "Namespace normalization", of
 * [<a href='http://www.w3.org/TR/2004/REC-DOM-Level-3-Core-20040407'>DOM Level 3 Core</a>].
 * <p> While serializing a document, the parameter "discard-default-content"
 * controls whether or not non-specified data is serialized.
 * <p> While serializing, errors and warnings are reported to the application
 * through the error handler (<code>LSSerializer.domConfig</code>'s
 * "<a href='http://www.w3.org/TR/DOM-Level-3-Core/core.html#parameter-error-handler'>error-handler</a>"
 * parameter). This specification does in no way try to define all possible
 * errors and warnings that can occur while serializing a DOM node, but some
 * common error and warning cases are defined. The types
 * (<code>DOMError.type</code>) of errors and warnings defined by this
 * specification are:
 * <dl>
 * <dt><code>"no-output-specified" [fatal]</code></dt>
 * <dd> Raised when
 * writing to a <code>LSOutput</code> if no output is specified in the
 * <code>LSOutput</code>. </dd>
 * <dt>
 * <code>"unbound-prefix-in-entity-reference" [fatal]</code> </dt>
 * <dd> Raised if the configuration parameter
 * "<a href='http://www.w3.org/TR/DOM-Level-3-Core/core.html#parameter-namespaces'>namespaces</a>"
 * is set to <code>true</code> and an entity whose replacement text
 * contains unbound namespace prefixes is referenced in a location where
 * there are no bindings for the namespace prefixes. </dd>
 * <dt>
 * <code>"unsupported-encoding" [fatal]</code></dt>
 * <dd> Raised if an unsupported
 * encoding is encountered. </dd>
 * </dl>
 * <p> In addition to raising the defined errors and warnings, implementations
 * are expected to raise implementation specific errors and warnings for any
 * other error and warning cases such as IO errors (file not found,
 * permission denied,...) and so on.
 * <p>See also the
 * <a href='http://www.w3.org/TR/2004/REC-DOM-Level-3-LS-20040407'>Document Object Model (DOM) Level 3 Load
 * and Save Specification</a>.
 */
public interface LSSerializer {
    /**
     * The <code>DOMConfiguration</code> object used by the
     * <code>LSSerializer</code> when serializing a DOM node.
     * <br> In addition to the parameters recognized by the
     * <a href='http://www.w3.org/TR/DOM-Level-3-Core/core.html#DOMConfiguration'>DOMConfiguration</a>
     * interface defined in
     * [<a href='http://www.w3.org/TR/2004/REC-DOM-Level-3-Core-20040407'>DOM Level 3 Core</a>],
     * the <code>DOMConfiguration</code> object for
     * <code>LSSerializer</code> adds, or modifies, the following
     * parameters:
     * <dl>
     * <dt><code>"canonical-form"</code></dt>
     * <dd>
     * <dl>
     * <dt><code>true</code></dt>
     * <dd>[<em>optional</em>] Writes the document according to the rules specified in
     * [<a href='http://www.w3.org/TR/2001/REC-xml-c14n-20010315'>Canonical XML</a>].
     * In addition to the behavior described in
     * "<a href='http://www.w3.org/TR/DOM-Level-3-Core/core.html#parameter-canonical-form'>canonical-form</a>"
     * [<a href='http://www.w3.org/TR/2004/REC-DOM-Level-3-Core-20040407'>DOM Level 3 Core</a>],
     * setting this parameter to <code>true</code> will set the parameters
     * "format-pretty-print", "discard-default-content", and
     * "xml-declaration" to <code>false</code>. Setting one of those parameters to
     * <code>true</code> will set this parameter to <code>false</code>.
     * Serializing an XML 1.1 document when "canonical-form" is
     * <code>true</code> will generate a fatal error. </dd>
     * <dt><code>false</code></dt>
     * <dd>[<em>required</em>] (<em>default</em>) Do not canonicalize the output. </dd>
     * </dl></dd>
     * <dt><code>"discard-default-content"</code></dt>
     * <dd>
     * <dl>
     * <dt>
     * <code>true</code></dt>
     * <dd>[<em>required</em>] (<em>default</em>) Use the <code>Attr.specified</code>
     * attribute to decide what attributes
     * should be discarded. Note that some implementations might use
     * whatever information is available to the implementation (i.e. XML
     * schema, DTD, the <code>Attr.specified</code> attribute, and so on) to
     * determine what attributes and content to discard if this parameter is
     * set to <code>true</code>. </dd>
     * <dt><code>false</code></dt>
     * <dd>[<em>required</em>] Keep all attributes and all content.</dd>
     * </dl></dd>
     * <dt><code>"format-pretty-print"</code></dt>
     * <dd>
     * <dl>
     * <dt>
     * <code>true</code></dt>
     * <dd>[<em>optional</em>] Formatting the output by adding whitespace to produce
     * a pretty-printed,
     * indented, human-readable form. The exact form of the transformations
     * is not specified by this specification. Pretty-printing changes the
     * content of the document and may affect the validity of the document;
     * validating implementations should preserve validity. </dd>
     * <dt>
     * <code>false</code></dt>
     * <dd>[<em>required</em>] (<em>default</em>) Don't pretty-print the result. </dd>
     * </dl></dd>
     * <dt>
     * <code>"ignore-unknown-character-denormalizations"</code> </dt>
     * <dd>
     * <dl>
     * <dt>
     * <code>true</code></dt>
     * <dd>[<em>required</em>] (<em>default</em>) If, while verifying full normalization when
     * [<a href='http://www.w3.org/TR/2004/REC-xml11-20040204/'>XML 1.1</a>] is
     * supported, a character is encountered for which the normalization
     * properties cannot be determined, then raise a
     * <code>"unknown-character-denormalization"</code> warning (instead of
     * raising an error, if this parameter is not set) and ignore any
     * possible denormalizations caused by these characters. </dd>
     * <dt>
     * <code>false</code></dt>
     * <dd>[<em>optional</em>] Report a fatal error if a character is encountered for which the
     * processor cannot determine the normalization properties. </dd>
     * </dl></dd>
     * <dt>
     * <code>"normalize-characters"</code></dt>
     * <dd> This parameter is equivalent to
     * the one defined by <code>DOMConfiguration</code> in
     * [<a href='http://www.w3.org/TR/2004/REC-DOM-Level-3-Core-20040407'>DOM Level 3 Core</a>].
     * Unlike in the Core, the default value for this parameter is
     * <code>true</code>. While DOM implementations are not required to
     * support <a href='http://www.w3.org/TR/2004/REC-xml11-20040204/#dt-fullnorm'>fully
     * normalizing</a> the characters in the document according to appendix E of
     * [<a href='http://www.w3.org/TR/2004/REC-xml11-20040204/'>XML 1.1</a>], this
     * parameter must be activated by default if supported. </dd>
     * <dt>
     * <code>"xml-declaration"</code></dt>
     * <dd>
     * <dl>
     * <dt><code>true</code></dt>
     * <dd>[<em>required</em>] (<em>default</em>) If a <code>Document</code>,
     * <code>Element</code>, or <code>Entity</code>
     * node is serialized, the XML declaration, or text declaration, should
     * be included. The version (<code>Document.xmlVersion</code> if the
     * document is a Level 3 document and the version is non-null, otherwise
     * use the value "1.0"), and the output encoding (see
     * <code>LSSerializer.write</code> for details on how to find the output
     * encoding) are specified in the serialized XML declaration. </dd>
     * <dt>
     * <code>false</code></dt>
     * <dd>[<em>required</em>] Do not serialize the XML and text declarations. Report a
     * <code>"xml-declaration-needed"</code> warning if this will cause
     * problems (i.e. the serialized data is of an XML version other than
     * [<a href='http://www.w3.org/TR/2004/REC-xml-20040204'>XML 1.0</a>], or an
     * encoding would be needed to be able to re-parse the serialized data). </dd>
     * </dl></dd>
     * </dl>
     *
     * @return the <code>DOMConfiguration</code> object used by this
     *   <code>LSSerializer</code> when serializing a DOM node.
     */
    public DOMConfiguration getDomConfig();

    /**
     * The end-of-line sequence of characters to be used in the XML being
     * written out. Any string is supported, but XML treats only a certain
     * set of character sequences as end-of-line (see section 2.11,
     * "End-of-Line Handling" in
     * [<a href='http://www.w3.org/TR/2004/REC-xml-20040204'>XML 1.0</a>], if the
     * serialized content is XML 1.0 or section 2.11, "End-of-Line Handling"
     * in [<a href='http://www.w3.org/TR/2004/REC-xml11-20040204/'>XML 1.1</a>], if the
     * serialized content is XML 1.1). Using other character sequences than
     * the recommended ones can result in a document that is either not
     * serializable or not well-formed.
     * <br> On retrieval, the default value of this attribute is the
     * implementation specific default end-of-line sequence. DOM
     * implementations should choose the default to match the usual
     * convention for text files in the environment being used.
     * Implementations must choose a default sequence that matches one of
     * those allowed by XML 1.0 or XML 1.1, depending on the serialized
     * content. Setting this attribute to <code>null</code> will reset its
     * value to the default value.
     *
     * @return the end-of-line sequence of characters to be used in the XML
     *   being written out.
     */
    public String getNewLine();

    /**
     * The end-of-line sequence of characters to be used in the XML being
     * written out. Any string is supported, but XML treats only a certain
     * set of character sequences as end-of-line (see section 2.11,
     * "End-of-Line Handling" in
     * [<a href='http://www.w3.org/TR/2004/REC-xml-20040204'>XML 1.0</a>], if the
     * serialized content is XML 1.0 or section 2.11, "End-of-Line Handling"
     * in [<a href='http://www.w3.org/TR/2004/REC-xml11-20040204/'>XML 1.1</a>], if the
     * serialized content is XML 1.1). Using other character sequences than
     * the recommended ones can result in a document that is either not
     * serializable or not well-formed.
     * <br> On retrieval, the default value of this attribute is the
     * implementation specific default end-of-line sequence. DOM
     * implementations should choose the default to match the usual
     * convention for text files in the environment being used.
     * Implementations must choose a default sequence that matches one of
     * those allowed by XML 1.0 or XML 1.1, depending on the serialized
     * content. Setting this attribute to <code>null</code> will reset its
     * value to the default value.
     *
     * @param newLine the end-of-line sequence of characters to be used in
     *   the XML being written out, or <code>null</code> to reset the
     *   attribute to its default value.
     */
    public void setNewLine(String newLine);

    /**
     * When the application provides a filter, the serializer will call out
     * to the filter before serializing each Node. The filter implementation
     * can choose to remove the node from the stream or to terminate the
     * serialization early.
     * <br> The filter is invoked after the operations requested by the
     * <code>DOMConfiguration</code> parameters have been applied. For
     * example, CDATA sections won't be passed to the filter if
     * "<a href='http://www.w3.org/TR/DOM-Level-3-Core/core.html#parameter-cdata-sections'>cdata-sections</a>"
     * is set to <code>false</code>.
     *
     * @return the filter the serializer calls out to before serializing
     *   each node.
     *
     * @hide
     */
    public LSSerializerFilter getFilter();

    /**
     * When the application provides a filter, the serializer will call out
     * to the filter before serializing each Node. The filter implementation
     * can choose to remove the node from the stream or to terminate the
     * serialization early.
     * <br> The filter is invoked after the operations requested by the
     * <code>DOMConfiguration</code> parameters have been applied. For
     * example, CDATA sections won't be passed to the filter if
     * "<a href='http://www.w3.org/TR/DOM-Level-3-Core/core.html#parameter-cdata-sections'>cdata-sections</a>"
     * is set to <code>false</code>.
     *
     * @param filter the filter the serializer calls out to before
     *   serializing each node.
     *
     * @hide
     */
    public void setFilter(LSSerializerFilter filter);

    /**
     * Serialize the specified node as described above in the general
     * description of the <code>LSSerializer</code> interface. The output is
     * written to the supplied <code>LSOutput</code>.
     * <br> When writing to a <code>LSOutput</code>, the encoding is found by
     * looking at the encoding information that is reachable through the
     * <code>LSOutput</code> and the item to be written (or its owner
     * document) in this order:
     * <ol>
     * <li> <code>LSOutput.encoding</code>,
     * </li>
     * <li>
     * <code>Document.inputEncoding</code>,
     * </li>
     * <li>
     * <code>Document.xmlEncoding</code>.
     * </li>
     * </ol>
     * <br> If no encoding is reachable through the above properties, a
     * default encoding of "UTF-8" will be used. If the specified encoding
     * is not supported an "unsupported-encoding" fatal error is raised.
     * <br> If no output is specified in the <code>LSOutput</code>, a
     * "no-output-specified" fatal error is raised.
     * <br> The implementation is responsible for associating the appropriate
     * media type with the serialized data.
     * <br> When writing to a HTTP URI, a HTTP PUT is performed. When writing
     * to other types of URIs, the mechanism for writing the data to the URI
     * is implementation dependent.
     * @param nodeArg  The node to serialize.
     * @param destination The destination for the serialized DOM.
     * @return  Returns <code>true</code> if <code>nodeArg</code> was
     *   successfully serialized. Returns <code>false</code> in case the
     *   normal processing stopped but the implementation kept serializing
     *   the document; the result of the serialization is then
     *   implementation dependent.
     * @exception LSException
     *    SERIALIZE_ERR: Raised if the <code>LSSerializer</code> was unable to
     *   serialize the node. DOM applications should attach a
     *   <code>DOMErrorHandler</code> using the parameter
     *   "<a href='http://www.w3.org/TR/DOM-Level-3-Core/core.html#parameter-error-handler'>error-handler</a>"
     *   if they wish to get details on the error.
     */
    public boolean write(Node nodeArg,
                         LSOutput destination)
                         throws LSException;

    /**
     * A convenience method that acts as if <code>LSSerializer.write</code>
     * was called with a <code>LSOutput</code> with no encoding specified
     * and <code>LSOutput.systemId</code> set to the <code>uri</code>
     * argument.
     * @param nodeArg  The node to serialize.
     * @param uri The URI to write to.
     * @return  Returns <code>true</code> if <code>nodeArg</code> was
     *   successfully serialized. Returns <code>false</code> in case the
     *   normal processing stopped but the implementation kept serializing
     *   the document; the result of the serialization is then
     *   implementation dependent.
     * @exception LSException
     *    SERIALIZE_ERR: Raised if the <code>LSSerializer</code> was unable to
     *   serialize the node. DOM applications should attach a
     *   <code>DOMErrorHandler</code> using the parameter
     *   "<a href='http://www.w3.org/TR/DOM-Level-3-Core/core.html#parameter-error-handler'>error-handler</a>"
     *   if they wish to get details on the error.
     */
    public boolean writeToURI(Node nodeArg,
                              String uri)
                              throws LSException;

    /**
     * Serialize the specified node as described above in the general
     * description of the <code>LSSerializer</code> interface. The output is
     * written to a <code>DOMString</code> that is returned to the caller.
     * The encoding used is the encoding of the <code>DOMString</code> type,
     * i.e. UTF-16. Note that no Byte Order Mark is generated in a
     * <code>DOMString</code> object.
     * @param nodeArg  The node to serialize.
     * @return  Returns the serialized data.
     * @exception DOMException
     *    DOMSTRING_SIZE_ERR: Raised if the resulting string is too long to
     *   fit in a <code>DOMString</code>.
     * @exception LSException
     *    SERIALIZE_ERR: Raised if the <code>LSSerializer</code> was unable to
     *   serialize the node. DOM applications should attach a
     *   <code>DOMErrorHandler</code> using the parameter
     *   "<a href='http://www.w3.org/TR/DOM-Level-3-Core/core.html#parameter-error-handler'>error-handler</a>"
     *   if they wish to get details on the error.
     */
    public String writeToString(Node nodeArg)
                                throws DOMException, LSException;

}
441