1 /* 2 * Copyright (C) 2007 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package org.apache.harmony.xml.parsers; 18 19 import com.android.org.kxml2.io.KXmlParser; 20 import java.io.IOException; 21 import java.net.URL; 22 import java.net.URLConnection; 23 import javax.xml.parsers.DocumentBuilder; 24 import libcore.io.IoUtils; 25 import org.apache.harmony.xml.dom.CDATASectionImpl; 26 import org.apache.harmony.xml.dom.DOMImplementationImpl; 27 import org.apache.harmony.xml.dom.DocumentImpl; 28 import org.apache.harmony.xml.dom.DocumentTypeImpl; 29 import org.apache.harmony.xml.dom.TextImpl; 30 import org.w3c.dom.Attr; 31 import org.w3c.dom.DOMImplementation; 32 import org.w3c.dom.Document; 33 import org.w3c.dom.DocumentType; 34 import org.w3c.dom.Element; 35 import org.w3c.dom.Node; 36 import org.w3c.dom.Text; 37 import org.xml.sax.EntityResolver; 38 import org.xml.sax.ErrorHandler; 39 import org.xml.sax.InputSource; 40 import org.xml.sax.SAXException; 41 import org.xml.sax.SAXParseException; 42 import org.xml.sax.helpers.LocatorImpl; 43 import org.xmlpull.v1.XmlPullParser; 44 import org.xmlpull.v1.XmlPullParserException; 45 46 /** 47 * Builds a DOM using KXmlParser. 48 */ 49 class DocumentBuilderImpl extends DocumentBuilder { 50 51 private static DOMImplementationImpl dom = DOMImplementationImpl.getInstance(); 52 53 private boolean coalescing; 54 private EntityResolver entityResolver; 55 private ErrorHandler errorHandler; 56 private boolean ignoreComments; 57 private boolean ignoreElementContentWhitespace; 58 private boolean namespaceAware; 59 // adding a new field? don't forget to update reset(). 60 reset()61 @Override public void reset() { 62 coalescing = false; 63 entityResolver = null; 64 errorHandler = null; 65 ignoreComments = false; 66 ignoreElementContentWhitespace = false; 67 namespaceAware = false; 68 } 69 70 @Override getDOMImplementation()71 public DOMImplementation getDOMImplementation() { 72 return dom; 73 } 74 75 @Override isNamespaceAware()76 public boolean isNamespaceAware() { 77 return namespaceAware; 78 } 79 80 @Override isValidating()81 public boolean isValidating() { 82 return false; 83 } 84 85 @Override newDocument()86 public Document newDocument() { 87 return dom.createDocument(null, null, null); 88 } 89 90 @Override parse(InputSource source)91 public Document parse(InputSource source) throws SAXException, IOException { 92 if (source == null) { 93 throw new IllegalArgumentException("source == null"); 94 } 95 96 String namespaceURI = null; 97 String qualifiedName = null; 98 DocumentType doctype = null; 99 String inputEncoding = source.getEncoding(); 100 String systemId = source.getSystemId(); 101 DocumentImpl document = new DocumentImpl( 102 dom, namespaceURI, qualifiedName, doctype, inputEncoding); 103 document.setDocumentURI(systemId); 104 105 KXmlParser parser = new KXmlParser(); 106 try { 107 parser.keepNamespaceAttributes(); 108 parser.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES, namespaceAware); 109 110 if (source.getByteStream() != null) { 111 parser.setInput(source.getByteStream(), inputEncoding); 112 } else if (source.getCharacterStream() != null) { 113 parser.setInput(source.getCharacterStream()); 114 } else if (systemId != null) { 115 URL url = new URL(systemId); 116 URLConnection urlConnection = url.openConnection(); 117 urlConnection.connect(); 118 // TODO: if null, extract the inputEncoding from the Content-Type header? 119 parser.setInput(urlConnection.getInputStream(), inputEncoding); 120 } else { 121 throw new SAXParseException("InputSource needs a stream, reader or URI", null); 122 } 123 124 if (parser.nextToken() == XmlPullParser.END_DOCUMENT) { 125 throw new SAXParseException("Unexpected end of document", null); 126 } 127 128 parse(parser, document, document, XmlPullParser.END_DOCUMENT); 129 130 parser.require(XmlPullParser.END_DOCUMENT, null, null); 131 } catch (XmlPullParserException ex) { 132 Throwable detail = ex.getDetail(); 133 if (detail instanceof IOException) { 134 throw (IOException) detail; 135 } 136 if (detail instanceof RuntimeException) { 137 throw (RuntimeException) detail; 138 } 139 140 LocatorImpl locator = new LocatorImpl(); 141 142 locator.setPublicId(source.getPublicId()); 143 locator.setSystemId(systemId); 144 locator.setLineNumber(ex.getLineNumber()); 145 locator.setColumnNumber(ex.getColumnNumber()); 146 147 SAXParseException newEx = new SAXParseException(ex.getMessage(), locator); 148 149 if (errorHandler != null) { 150 errorHandler.error(newEx); 151 } 152 153 throw newEx; 154 } finally { 155 IoUtils.closeQuietly(parser); 156 } 157 158 return document; 159 } 160 161 /** 162 * Implements the whole parsing of the XML document. The XML pull parser is 163 * actually more of a tokenizer, and we are doing a classical recursive 164 * descent parsing (the method invokes itself for XML elements). Our 165 * approach to parsing does accept some illegal documents (more than one 166 * root element, for example). The assumption is that the DOM implementation 167 * throws the proper exceptions in these cases. 168 * 169 * @param parser The XML pull parser we're reading from. 170 * @param document The document we're building. 171 * @param node The node we're currently on (initially the document itself). 172 * @param endToken The token that will end this recursive call. Either 173 * XmlPullParser.END_DOCUMENT or XmlPullParser.END_TAG. 174 * 175 * @throws XmlPullParserException If a parsing error occurs. 176 * @throws IOException If a general IO error occurs. 177 */ parse(KXmlParser parser, DocumentImpl document, Node node, int endToken)178 private void parse(KXmlParser parser, DocumentImpl document, Node node, 179 int endToken) throws XmlPullParserException, IOException { 180 181 int token = parser.getEventType(); 182 183 /* 184 * The main parsing loop. The precondition is that we are already on the 185 * token to be processed. This holds for each iteration of the loop, so 186 * the inner statements have to ensure that (in particular the recursive 187 * call). 188 */ 189 while (token != endToken && token != XmlPullParser.END_DOCUMENT) { 190 if (token == XmlPullParser.PROCESSING_INSTRUCTION) { 191 /* 192 * Found a processing instructions. We need to split the token 193 * text at the first whitespace character. 194 */ 195 String text = parser.getText(); 196 197 int dot = text.indexOf(' '); 198 199 String target = (dot != -1 ? text.substring(0, dot) : text); 200 String data = (dot != -1 ? text.substring(dot + 1) : ""); 201 202 node.appendChild(document.createProcessingInstruction(target, 203 data)); 204 } else if (token == XmlPullParser.DOCDECL) { 205 String name = parser.getRootElementName(); 206 String publicId = parser.getPublicId(); 207 String systemId = parser.getSystemId(); 208 document.appendChild(new DocumentTypeImpl(document, name, publicId, systemId)); 209 210 } else if (token == XmlPullParser.COMMENT) { 211 /* 212 * Found a comment. We simply take the token text, but we only 213 * create a node if the client wants to see comments at all. 214 */ 215 if (!ignoreComments) { 216 node.appendChild(document.createComment(parser.getText())); 217 } 218 } else if (token == XmlPullParser.IGNORABLE_WHITESPACE) { 219 /* 220 * Found some ignorable whitespace. We only add it if the client 221 * wants to see whitespace. Whitespace before and after the 222 * document element is always ignored. 223 */ 224 if (!ignoreElementContentWhitespace && document != node) { 225 appendText(document, node, token, parser.getText()); 226 } 227 } else if (token == XmlPullParser.TEXT || token == XmlPullParser.CDSECT) { 228 /* 229 * Found a piece of text (possibly encoded as a CDATA section). 230 * That's the easiest case. We simply take it and create a new text node, 231 * or merge with an adjacent text node. 232 */ 233 appendText(document, node, token, parser.getText()); 234 } else if (token == XmlPullParser.ENTITY_REF) { 235 /* 236 * Found an entity reference. If an entity resolver is 237 * installed, we replace it by text (if possible). Otherwise we 238 * add an entity reference node. 239 */ 240 String entity = parser.getName(); 241 242 if (entityResolver != null) { 243 // TODO Implement this... 244 } 245 246 String resolved = resolvePredefinedOrCharacterEntity(entity); 247 if (resolved != null) { 248 appendText(document, node, token, resolved); 249 } else { 250 node.appendChild(document.createEntityReference(entity)); 251 } 252 } else if (token == XmlPullParser.START_TAG) { 253 /* 254 * Found an element start tag. We create an element node with 255 * the proper info and attributes. We then invoke parse() 256 * recursively to handle the next level of nesting. When we 257 * return from this call, we check that we are on the proper 258 * element end tag. The whole handling differs somewhat 259 * depending on whether the parser is namespace-aware or not. 260 */ 261 if (namespaceAware) { 262 // Collect info for element node 263 String namespace = parser.getNamespace(); 264 String name = parser.getName(); 265 String prefix = parser.getPrefix(); 266 267 if ("".equals(namespace)) { 268 namespace = null; 269 } 270 271 // Create element node and wire it correctly 272 Element element = document.createElementNS(namespace, name); 273 element.setPrefix(prefix); 274 node.appendChild(element); 275 276 for (int i = 0; i < parser.getAttributeCount(); i++) { 277 // Collect info for a single attribute node 278 String attrNamespace = parser.getAttributeNamespace(i); 279 String attrPrefix = parser.getAttributePrefix(i); 280 String attrName = parser.getAttributeName(i); 281 String attrValue = parser.getAttributeValue(i); 282 283 if ("".equals(attrNamespace)) { 284 attrNamespace = null; 285 } 286 287 // Create attribute node and wire it correctly 288 Attr attr = document.createAttributeNS(attrNamespace, attrName); 289 attr.setPrefix(attrPrefix); 290 attr.setValue(attrValue); 291 element.setAttributeNodeNS(attr); 292 } 293 294 // Recursive descent 295 token = parser.nextToken(); 296 parse(parser, document, element, XmlPullParser.END_TAG); 297 298 // Expect the element's end tag here 299 parser.require(XmlPullParser.END_TAG, namespace, name); 300 301 } else { 302 // Collect info for element node 303 String name = parser.getName(); 304 305 // Create element node and wire it correctly 306 Element element = document.createElement(name); 307 node.appendChild(element); 308 309 for (int i = 0; i < parser.getAttributeCount(); i++) { 310 // Collect info for a single attribute node 311 String attrName = parser.getAttributeName(i); 312 String attrValue = parser.getAttributeValue(i); 313 314 // Create attribute node and wire it correctly 315 Attr attr = document.createAttribute(attrName); 316 attr.setValue(attrValue); 317 element.setAttributeNode(attr); 318 } 319 320 // Recursive descent 321 token = parser.nextToken(); 322 parse(parser, document, element, XmlPullParser.END_TAG); 323 324 // Expect the element's end tag here 325 parser.require(XmlPullParser.END_TAG, "", name); 326 } 327 } 328 329 token = parser.nextToken(); 330 } 331 } 332 333 /** 334 * @param token the XML pull parser token type, such as XmlPullParser.CDSECT 335 * or XmlPullParser.ENTITY_REF. 336 */ appendText(DocumentImpl document, Node parent, int token, String text)337 private void appendText(DocumentImpl document, Node parent, int token, String text) { 338 // Ignore empty runs. 339 if (text.isEmpty()) { 340 return; 341 } 342 // Merge with any previous text node if possible. 343 if (coalescing || token != XmlPullParser.CDSECT) { 344 Node lastChild = parent.getLastChild(); 345 if (lastChild != null && lastChild.getNodeType() == Node.TEXT_NODE) { 346 Text textNode = (Text) lastChild; 347 textNode.appendData(text); 348 return; 349 } 350 } 351 // Okay, we really do need a new text node 352 parent.appendChild(token == XmlPullParser.CDSECT 353 ? new CDATASectionImpl(document, text) 354 : new TextImpl(document, text)); 355 } 356 357 @Override setEntityResolver(EntityResolver resolver)358 public void setEntityResolver(EntityResolver resolver) { 359 entityResolver = resolver; 360 } 361 362 @Override setErrorHandler(ErrorHandler handler)363 public void setErrorHandler(ErrorHandler handler) { 364 errorHandler = handler; 365 } 366 367 /** 368 * Controls whether this DocumentBuilder ignores comments. 369 */ setIgnoreComments(boolean value)370 public void setIgnoreComments(boolean value) { 371 ignoreComments = value; 372 } 373 setCoalescing(boolean value)374 public void setCoalescing(boolean value) { 375 coalescing = value; 376 } 377 378 /** 379 * Controls whether this DocumentBuilder ignores element content whitespace. 380 */ setIgnoreElementContentWhitespace(boolean value)381 public void setIgnoreElementContentWhitespace(boolean value) { 382 ignoreElementContentWhitespace = value; 383 } 384 385 /** 386 * Controls whether this DocumentBuilder is namespace-aware. 387 */ setNamespaceAware(boolean value)388 public void setNamespaceAware(boolean value) { 389 namespaceAware = value; 390 } 391 392 /** 393 * Returns the replacement text or null if {@code entity} isn't predefined. 394 */ resolvePredefinedOrCharacterEntity(String entityName)395 private String resolvePredefinedOrCharacterEntity(String entityName) { 396 // Character references, section 4.1 of the XML specification. 397 if (entityName.startsWith("#x")) { 398 return resolveCharacterReference(entityName.substring(2), 16); 399 } else if (entityName.startsWith("#")) { 400 return resolveCharacterReference(entityName.substring(1), 10); 401 } 402 // Predefined entities, section 4.6 of the XML specification. 403 if ("lt".equals(entityName)) { 404 return "<"; 405 } else if ("gt".equals(entityName)) { 406 return ">"; 407 } else if ("amp".equals(entityName)) { 408 return "&"; 409 } else if ("apos".equals(entityName)) { 410 return "'"; 411 } else if ("quot".equals(entityName)) { 412 return "\""; 413 } else { 414 return null; 415 } 416 } 417 resolveCharacterReference(String value, int base)418 private String resolveCharacterReference(String value, int base) { 419 try { 420 int codePoint = Integer.parseInt(value, base); 421 if (Character.isBmpCodePoint(codePoint)) { 422 return String.valueOf((char) codePoint); 423 } else { 424 char[] surrogatePair = Character.toChars(codePoint); 425 return new String(surrogatePair); 426 } 427 } catch (NumberFormatException ex) { 428 return null; 429 } 430 } 431 } 432