1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 package java.lang; 19 20 import dalvik.annotation.optimization.FastNative; 21 import java.io.Serializable; 22 import java.io.UnsupportedEncodingException; 23 import java.nio.ByteBuffer; 24 import java.nio.CharBuffer; 25 import java.nio.charset.Charset; 26 import java.util.Arrays; 27 import java.util.Comparator; 28 import libcore.util.CharsetUtils; 29 import libcore.util.EmptyArray; 30 31 /** 32 * Class used to generate strings instead of calling String.<init>. 33 * 34 * @hide 35 */ 36 public final class StringFactory { 37 38 // TODO: Remove once native methods are in place. 39 private static final char REPLACEMENT_CHAR = (char) 0xfffd; 40 newEmptyString()41 public static String newEmptyString() { 42 return newStringFromChars(EmptyArray.CHAR, 0, 0); 43 } 44 newStringFromBytes(byte[] data)45 public static String newStringFromBytes(byte[] data) { 46 return newStringFromBytes(data, 0, data.length); 47 } 48 newStringFromBytes(byte[] data, int high)49 public static String newStringFromBytes(byte[] data, int high) { 50 return newStringFromBytes(data, high, 0, data.length); 51 } 52 newStringFromBytes(byte[] data, int offset, int byteCount)53 public static String newStringFromBytes(byte[] data, int offset, int byteCount) { 54 return newStringFromBytes(data, offset, byteCount, Charset.defaultCharset()); 55 } 56 57 @FastNative newStringFromBytes(byte[] data, int high, int offset, int byteCount)58 public static native String newStringFromBytes(byte[] data, int high, int offset, int byteCount); 59 newStringFromBytes(byte[] data, int offset, int byteCount, String charsetName)60 public static String newStringFromBytes(byte[] data, int offset, int byteCount, String charsetName) throws UnsupportedEncodingException { 61 return newStringFromBytes(data, offset, byteCount, Charset.forNameUEE(charsetName)); 62 } 63 newStringFromBytes(byte[] data, String charsetName)64 public static String newStringFromBytes(byte[] data, String charsetName) throws UnsupportedEncodingException { 65 return newStringFromBytes(data, 0, data.length, Charset.forNameUEE(charsetName)); 66 } 67 68 private static final int[] TABLE_UTF8_NEEDED = new int[] { 69 // 0 1 2 3 4 5 6 7 8 9 a b c d e f 70 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xc0 - 0xcf 71 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xd0 - 0xdf 72 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xe0 - 0xef 73 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xf0 - 0xff 74 }; 75 76 // TODO: Implement this method natively. newStringFromBytes(byte[] data, int offset, int byteCount, Charset charset)77 public static String newStringFromBytes(byte[] data, int offset, int byteCount, Charset charset) { 78 if ((offset | byteCount) < 0 || byteCount > data.length - offset) { 79 throw new StringIndexOutOfBoundsException(data.length, offset, byteCount); 80 } 81 82 char[] value; 83 int length; 84 85 // We inline UTF-8, ISO-8859-1, and US-ASCII decoders for speed. 86 String canonicalCharsetName = charset.name(); 87 if (canonicalCharsetName.equals("UTF-8")) { 88 /* 89 This code converts a UTF-8 byte sequence to a Java String (UTF-16). 90 It implements the W3C recommended UTF-8 decoder. 91 https://www.w3.org/TR/encoding/#utf-8-decoder 92 93 Unicode 3.2 Well-Formed UTF-8 Byte Sequences 94 Code Points First Second Third Fourth 95 U+0000..U+007F 00..7F 96 U+0080..U+07FF C2..DF 80..BF 97 U+0800..U+0FFF E0 A0..BF 80..BF 98 U+1000..U+CFFF E1..EC 80..BF 80..BF 99 U+D000..U+D7FF ED 80..9F 80..BF 100 U+E000..U+FFFF EE..EF 80..BF 80..BF 101 U+10000..U+3FFFF F0 90..BF 80..BF 80..BF 102 U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF 103 U+100000..U+10FFFF F4 80..8F 80..BF 80..BF 104 105 Please refer to Unicode as the authority. 106 p.126 Table 3-7 in http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf 107 108 Handling Malformed Input 109 The maximal subpart should be replaced by a single U+FFFD. Maximal subpart is 110 the longest code unit subsequence starting at an unconvertible offset that is either 111 1) the initial subsequence of a well-formed code unit sequence, or 112 2) a subsequence of length one: 113 One U+FFFD should be emitted for every sequence of bytes that is an incomplete prefix 114 of a valid sequence, and with the conversion to restart after the incomplete sequence. 115 116 For example, in byte sequence "41 C0 AF 41 F4 80 80 41", the maximal subparts are 117 "C0", "AF", and "F4 80 80". "F4 80 80" can be the initial subsequence of "F4 80 80 80", 118 but "C0" can't be the initial subsequence of any well-formed code unit sequence. 119 Thus, the output should be "A\ufffd\ufffdA\ufffdA". 120 121 Please refer to section "Best Practices for Using U+FFFD." in 122 http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf 123 */ 124 byte[] d = data; 125 char[] v = new char[byteCount]; 126 127 int idx = offset; 128 int last = offset + byteCount; 129 int s = 0; 130 131 int codePoint = 0; 132 int utf8BytesSeen = 0; 133 int utf8BytesNeeded = 0; 134 int lowerBound = 0x80; 135 int upperBound = 0xbf; 136 137 while (idx < last) { 138 int b = d[idx++] & 0xff; 139 if (utf8BytesNeeded == 0) { 140 if ((b & 0x80) == 0) { // ASCII char. 0xxxxxxx 141 v[s++] = (char) b; 142 continue; 143 } 144 145 if ((b & 0x40) == 0) { // 10xxxxxx is illegal as first byte 146 v[s++] = REPLACEMENT_CHAR; 147 continue; 148 } 149 150 // 11xxxxxx 151 int tableLookupIndex = b & 0x3f; 152 utf8BytesNeeded = TABLE_UTF8_NEEDED[tableLookupIndex]; 153 if (utf8BytesNeeded == 0) { 154 v[s++] = REPLACEMENT_CHAR; 155 continue; 156 } 157 158 // utf8BytesNeeded 159 // 1: b & 0x1f 160 // 2: b & 0x0f 161 // 3: b & 0x07 162 codePoint = b & (0x3f >> utf8BytesNeeded); 163 if (b == 0xe0) { 164 lowerBound = 0xa0; 165 } else if (b == 0xed) { 166 upperBound = 0x9f; 167 } else if (b == 0xf0) { 168 lowerBound = 0x90; 169 } else if (b == 0xf4) { 170 upperBound = 0x8f; 171 } 172 } else { 173 if (b < lowerBound || b > upperBound) { 174 // The bytes seen are ill-formed. Substitute them with U+FFFD 175 v[s++] = REPLACEMENT_CHAR; 176 codePoint = 0; 177 utf8BytesNeeded = 0; 178 utf8BytesSeen = 0; 179 lowerBound = 0x80; 180 upperBound = 0xbf; 181 /* 182 * According to the Unicode Standard, 183 * "a UTF-8 conversion process is required to never consume well-formed 184 * subsequences as part of its error handling for ill-formed subsequences" 185 * The current byte could be part of well-formed subsequences. Reduce the 186 * index by 1 to parse it in next loop. 187 */ 188 idx--; 189 continue; 190 } 191 192 lowerBound = 0x80; 193 upperBound = 0xbf; 194 codePoint = (codePoint << 6) | (b & 0x3f); 195 utf8BytesSeen++; 196 if (utf8BytesNeeded != utf8BytesSeen) { 197 continue; 198 } 199 200 // Encode chars from U+10000 up as surrogate pairs 201 if (codePoint < 0x10000) { 202 v[s++] = (char) codePoint; 203 } else { 204 v[s++] = (char) ((codePoint >> 10) + 0xd7c0); 205 v[s++] = (char) ((codePoint & 0x3ff) + 0xdc00); 206 } 207 208 utf8BytesSeen = 0; 209 utf8BytesNeeded = 0; 210 codePoint = 0; 211 } 212 } 213 214 // The bytes seen are ill-formed. Substitute them by U+FFFD 215 if (utf8BytesNeeded != 0) { 216 v[s++] = REPLACEMENT_CHAR; 217 } 218 219 if (s == byteCount) { 220 // We guessed right, so we can use our temporary array as-is. 221 value = v; 222 length = s; 223 } else { 224 // Our temporary array was too big, so reallocate and copy. 225 value = new char[s]; 226 length = s; 227 System.arraycopy(v, 0, value, 0, s); 228 } 229 } else if (canonicalCharsetName.equals("ISO-8859-1")) { 230 value = new char[byteCount]; 231 length = byteCount; 232 CharsetUtils.isoLatin1BytesToChars(data, offset, byteCount, value); 233 } else if (canonicalCharsetName.equals("US-ASCII")) { 234 value = new char[byteCount]; 235 length = byteCount; 236 CharsetUtils.asciiBytesToChars(data, offset, byteCount, value); 237 } else { 238 CharBuffer cb = charset.decode(ByteBuffer.wrap(data, offset, byteCount)); 239 length = cb.length(); 240 // The call to newStringFromChars below will copy length bytes out of value, so it does 241 // not matter that cb.array().length may be > cb.length() or that a Charset could keep a 242 // reference to the CharBuffer it returns and later mutate it. 243 value = cb.array(); 244 } 245 return newStringFromChars(value, 0, length); 246 } 247 newStringFromBytes(byte[] data, Charset charset)248 public static String newStringFromBytes(byte[] data, Charset charset) { 249 return newStringFromBytes(data, 0, data.length, charset); 250 } 251 newStringFromChars(char[] data)252 public static String newStringFromChars(char[] data) { 253 return newStringFromChars(data, 0, data.length); 254 } 255 newStringFromChars(char[] data, int offset, int charCount)256 public static String newStringFromChars(char[] data, int offset, int charCount) { 257 if ((offset | charCount) < 0 || charCount > data.length - offset) { 258 throw new StringIndexOutOfBoundsException(data.length, offset, charCount); 259 } 260 return newStringFromChars(offset, charCount, data); 261 } 262 263 // The char array passed as {@code java_data} must not be a null reference. 264 @FastNative newStringFromChars(int offset, int charCount, char[] data)265 static native String newStringFromChars(int offset, int charCount, char[] data); 266 267 @FastNative newStringFromString(String toCopy)268 public static native String newStringFromString(String toCopy); 269 newStringFromStringBuffer(StringBuffer stringBuffer)270 public static String newStringFromStringBuffer(StringBuffer stringBuffer) { 271 synchronized (stringBuffer) { 272 return newStringFromChars(stringBuffer.getValue(), 0, stringBuffer.length()); 273 } 274 } 275 276 // TODO: Implement this method natively. newStringFromCodePoints(int[] codePoints, int offset, int count)277 public static String newStringFromCodePoints(int[] codePoints, int offset, int count) { 278 if (codePoints == null) { 279 throw new NullPointerException("codePoints == null"); 280 } 281 if ((offset | count) < 0 || count > codePoints.length - offset) { 282 throw new StringIndexOutOfBoundsException(codePoints.length, offset, count); 283 } 284 char[] value = new char[count * 2]; 285 int end = offset + count; 286 int length = 0; 287 for (int i = offset; i < end; i++) { 288 length += Character.toChars(codePoints[i], value, length); 289 } 290 return newStringFromChars(value, 0, length); 291 } 292 newStringFromStringBuilder(StringBuilder stringBuilder)293 public static String newStringFromStringBuilder(StringBuilder stringBuilder) { 294 return newStringFromChars(stringBuilder.getValue(), 0, stringBuilder.length()); 295 } 296 } 297