1 /*
2  *  Licensed to the Apache Software Foundation (ASF) under one or more
3  *  contributor license agreements.  See the NOTICE file distributed with
4  *  this work for additional information regarding copyright ownership.
5  *  The ASF licenses this file to You under the Apache License, Version 2.0
6  *  (the "License"); you may not use this file except in compliance with
7  *  the License.  You may obtain a copy of the License at
8  *
9  *     http://www.apache.org/licenses/LICENSE-2.0
10  *
11  *  Unless required by applicable law or agreed to in writing, software
12  *  distributed under the License is distributed on an "AS IS" BASIS,
13  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  *  See the License for the specific language governing permissions and
15  *  limitations under the License.
16  */
17 
18 package java.lang;
19 
20 import dalvik.annotation.optimization.FastNative;
21 import java.io.Serializable;
22 import java.io.UnsupportedEncodingException;
23 import java.nio.ByteBuffer;
24 import java.nio.CharBuffer;
25 import java.nio.charset.Charset;
26 import java.util.Arrays;
27 import java.util.Comparator;
28 import libcore.util.CharsetUtils;
29 import libcore.util.EmptyArray;
30 
31 /**
32  * Class used to generate strings instead of calling String.<init>.
33  *
34  * @hide
35  */
36 public final class StringFactory {
37 
38     // TODO: Remove once native methods are in place.
39     private static final char REPLACEMENT_CHAR = (char) 0xfffd;
40 
newEmptyString()41     public static String newEmptyString() {
42         return newStringFromChars(EmptyArray.CHAR, 0, 0);
43     }
44 
newStringFromBytes(byte[] data)45     public static String newStringFromBytes(byte[] data) {
46         return newStringFromBytes(data, 0, data.length);
47     }
48 
newStringFromBytes(byte[] data, int high)49     public static String newStringFromBytes(byte[] data, int high) {
50         return newStringFromBytes(data, high, 0, data.length);
51     }
52 
newStringFromBytes(byte[] data, int offset, int byteCount)53     public static String newStringFromBytes(byte[] data, int offset, int byteCount) {
54         return newStringFromBytes(data, offset, byteCount, Charset.defaultCharset());
55     }
56 
57     @FastNative
newStringFromBytes(byte[] data, int high, int offset, int byteCount)58     public static native String newStringFromBytes(byte[] data, int high, int offset, int byteCount);
59 
newStringFromBytes(byte[] data, int offset, int byteCount, String charsetName)60     public static String newStringFromBytes(byte[] data, int offset, int byteCount, String charsetName) throws UnsupportedEncodingException {
61         return newStringFromBytes(data, offset, byteCount, Charset.forNameUEE(charsetName));
62     }
63 
newStringFromBytes(byte[] data, String charsetName)64     public static String newStringFromBytes(byte[] data, String charsetName) throws UnsupportedEncodingException {
65         return newStringFromBytes(data, 0, data.length, Charset.forNameUEE(charsetName));
66     }
67 
68     private static final int[] TABLE_UTF8_NEEDED = new int[] {
69     //      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f
70             0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xc0 - 0xcf
71             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xd0 - 0xdf
72             2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xe0 - 0xef
73             3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xf0 - 0xff
74     };
75 
76     // TODO: Implement this method natively.
newStringFromBytes(byte[] data, int offset, int byteCount, Charset charset)77     public static String newStringFromBytes(byte[] data, int offset, int byteCount, Charset charset) {
78         if ((offset | byteCount) < 0 || byteCount > data.length - offset) {
79             throw new StringIndexOutOfBoundsException(data.length, offset, byteCount);
80         }
81 
82         char[] value;
83         int length;
84 
85         // We inline UTF-8, ISO-8859-1, and US-ASCII decoders for speed.
86         String canonicalCharsetName = charset.name();
87         if (canonicalCharsetName.equals("UTF-8")) {
88             /*
89             This code converts a UTF-8 byte sequence to a Java String (UTF-16).
90             It implements the W3C recommended UTF-8 decoder.
91             https://www.w3.org/TR/encoding/#utf-8-decoder
92 
93             Unicode 3.2 Well-Formed UTF-8 Byte Sequences
94             Code Points        First  Second Third Fourth
95             U+0000..U+007F     00..7F
96             U+0080..U+07FF     C2..DF 80..BF
97             U+0800..U+0FFF     E0     A0..BF 80..BF
98             U+1000..U+CFFF     E1..EC 80..BF 80..BF
99             U+D000..U+D7FF     ED     80..9F 80..BF
100             U+E000..U+FFFF     EE..EF 80..BF 80..BF
101             U+10000..U+3FFFF   F0     90..BF 80..BF 80..BF
102             U+40000..U+FFFFF   F1..F3 80..BF 80..BF 80..BF
103             U+100000..U+10FFFF F4     80..8F 80..BF 80..BF
104 
105             Please refer to Unicode as the authority.
106             p.126 Table 3-7 in http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf
107 
108             Handling Malformed Input
109             The maximal subpart should be replaced by a single U+FFFD. Maximal subpart is
110             the longest code unit subsequence starting at an unconvertible offset that is either
111             1) the initial subsequence of a well-formed code unit sequence, or
112             2) a subsequence of length one:
113             One U+FFFD should be emitted for every sequence of bytes that is an incomplete prefix
114             of a valid sequence, and with the conversion to restart after the incomplete sequence.
115 
116             For example, in byte sequence "41 C0 AF 41 F4 80 80 41", the maximal subparts are
117             "C0", "AF", and "F4 80 80". "F4 80 80" can be the initial subsequence of "F4 80 80 80",
118             but "C0" can't be the initial subsequence of any well-formed code unit sequence.
119             Thus, the output should be "A\ufffd\ufffdA\ufffdA".
120 
121             Please refer to section "Best Practices for Using U+FFFD." in
122             http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf
123             */
124             byte[] d = data;
125             char[] v = new char[byteCount];
126 
127             int idx = offset;
128             int last = offset + byteCount;
129             int s = 0;
130 
131             int codePoint = 0;
132             int utf8BytesSeen = 0;
133             int utf8BytesNeeded = 0;
134             int lowerBound = 0x80;
135             int upperBound = 0xbf;
136 
137             while (idx < last) {
138                 int b = d[idx++] & 0xff;
139                 if (utf8BytesNeeded == 0) {
140                     if ((b & 0x80) == 0) { // ASCII char. 0xxxxxxx
141                         v[s++] = (char) b;
142                         continue;
143                     }
144 
145                     if ((b & 0x40) == 0) { // 10xxxxxx is illegal as first byte
146                         v[s++] = REPLACEMENT_CHAR;
147                         continue;
148                     }
149 
150                     // 11xxxxxx
151                     int tableLookupIndex = b & 0x3f;
152                     utf8BytesNeeded = TABLE_UTF8_NEEDED[tableLookupIndex];
153                     if (utf8BytesNeeded == 0) {
154                         v[s++] = REPLACEMENT_CHAR;
155                         continue;
156                     }
157 
158                     // utf8BytesNeeded
159                     // 1: b & 0x1f
160                     // 2: b & 0x0f
161                     // 3: b & 0x07
162                     codePoint = b & (0x3f >> utf8BytesNeeded);
163                     if (b == 0xe0) {
164                         lowerBound = 0xa0;
165                     } else if (b == 0xed) {
166                         upperBound = 0x9f;
167                     } else if (b == 0xf0) {
168                         lowerBound = 0x90;
169                     } else if (b == 0xf4) {
170                         upperBound = 0x8f;
171                     }
172                 } else {
173                     if (b < lowerBound || b > upperBound) {
174                         // The bytes seen are ill-formed. Substitute them with U+FFFD
175                         v[s++] = REPLACEMENT_CHAR;
176                         codePoint = 0;
177                         utf8BytesNeeded = 0;
178                         utf8BytesSeen = 0;
179                         lowerBound = 0x80;
180                         upperBound = 0xbf;
181                         /*
182                          * According to the Unicode Standard,
183                          * "a UTF-8 conversion process is required to never consume well-formed
184                          * subsequences as part of its error handling for ill-formed subsequences"
185                          * The current byte could be part of well-formed subsequences. Reduce the
186                          * index by 1 to parse it in next loop.
187                          */
188                         idx--;
189                         continue;
190                     }
191 
192                     lowerBound = 0x80;
193                     upperBound = 0xbf;
194                     codePoint = (codePoint << 6) | (b & 0x3f);
195                     utf8BytesSeen++;
196                     if (utf8BytesNeeded != utf8BytesSeen) {
197                         continue;
198                     }
199 
200                     // Encode chars from U+10000 up as surrogate pairs
201                     if (codePoint < 0x10000) {
202                         v[s++] = (char) codePoint;
203                     } else {
204                         v[s++] = (char) ((codePoint >> 10) + 0xd7c0);
205                         v[s++] = (char) ((codePoint & 0x3ff) + 0xdc00);
206                     }
207 
208                     utf8BytesSeen = 0;
209                     utf8BytesNeeded = 0;
210                     codePoint = 0;
211                 }
212             }
213 
214             // The bytes seen are ill-formed. Substitute them by U+FFFD
215             if (utf8BytesNeeded != 0) {
216                 v[s++] = REPLACEMENT_CHAR;
217             }
218 
219             if (s == byteCount) {
220                 // We guessed right, so we can use our temporary array as-is.
221                 value = v;
222                 length = s;
223             } else {
224                 // Our temporary array was too big, so reallocate and copy.
225                 value = new char[s];
226                 length = s;
227                 System.arraycopy(v, 0, value, 0, s);
228             }
229         } else if (canonicalCharsetName.equals("ISO-8859-1")) {
230             value = new char[byteCount];
231             length = byteCount;
232             CharsetUtils.isoLatin1BytesToChars(data, offset, byteCount, value);
233         } else if (canonicalCharsetName.equals("US-ASCII")) {
234             value = new char[byteCount];
235             length = byteCount;
236             CharsetUtils.asciiBytesToChars(data, offset, byteCount, value);
237         } else {
238             CharBuffer cb = charset.decode(ByteBuffer.wrap(data, offset, byteCount));
239             length = cb.length();
240             // The call to newStringFromChars below will copy length bytes out of value, so it does
241             // not matter that cb.array().length may be > cb.length() or that a Charset could keep a
242             // reference to the CharBuffer it returns and later mutate it.
243             value = cb.array();
244         }
245         return newStringFromChars(value, 0, length);
246     }
247 
newStringFromBytes(byte[] data, Charset charset)248     public static String newStringFromBytes(byte[] data, Charset charset) {
249         return newStringFromBytes(data, 0, data.length, charset);
250     }
251 
newStringFromChars(char[] data)252     public static String newStringFromChars(char[] data) {
253         return newStringFromChars(data, 0, data.length);
254     }
255 
newStringFromChars(char[] data, int offset, int charCount)256     public static String newStringFromChars(char[] data, int offset, int charCount) {
257         if ((offset | charCount) < 0 || charCount > data.length - offset) {
258             throw new StringIndexOutOfBoundsException(data.length, offset, charCount);
259         }
260         return newStringFromChars(offset, charCount, data);
261     }
262 
263     // The char array passed as {@code java_data} must not be a null reference.
264     @FastNative
newStringFromChars(int offset, int charCount, char[] data)265     static native String newStringFromChars(int offset, int charCount, char[] data);
266 
267     @FastNative
newStringFromString(String toCopy)268     public static native String newStringFromString(String toCopy);
269 
newStringFromStringBuffer(StringBuffer stringBuffer)270     public static String newStringFromStringBuffer(StringBuffer stringBuffer) {
271         synchronized (stringBuffer) {
272             return newStringFromChars(stringBuffer.getValue(), 0, stringBuffer.length());
273         }
274     }
275 
276     // TODO: Implement this method natively.
newStringFromCodePoints(int[] codePoints, int offset, int count)277     public static String newStringFromCodePoints(int[] codePoints, int offset, int count) {
278         if (codePoints == null) {
279             throw new NullPointerException("codePoints == null");
280         }
281         if ((offset | count) < 0 || count > codePoints.length - offset) {
282             throw new StringIndexOutOfBoundsException(codePoints.length, offset, count);
283         }
284         char[] value = new char[count * 2];
285         int end = offset + count;
286         int length = 0;
287         for (int i = offset; i < end; i++) {
288             length += Character.toChars(codePoints[i], value, length);
289         }
290         return newStringFromChars(value, 0, length);
291     }
292 
newStringFromStringBuilder(StringBuilder stringBuilder)293     public static String newStringFromStringBuilder(StringBuilder stringBuilder) {
294         return newStringFromChars(stringBuilder.getValue(), 0, stringBuilder.length());
295     }
296 }
297