1 // Tencent is pleased to support the open source community by making RapidJSON available.
2 //
3 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
4 //
5 // Licensed under the MIT License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // http://opensource.org/licenses/MIT
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #ifndef RAPIDJSON_ENCODEDSTREAM_H_
16 #define RAPIDJSON_ENCODEDSTREAM_H_
17 
18 #include "rapidjson.h"
19 
20 #ifdef __GNUC__
21 RAPIDJSON_DIAG_PUSH
22 RAPIDJSON_DIAG_OFF(effc++)
23 #endif
24 
25 RAPIDJSON_NAMESPACE_BEGIN
26 
27 //! Input byte stream wrapper with a statically bound encoding.
28 /*!
29     \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE.
30     \tparam InputByteStream Type of input byte stream. For example, FileReadStream.
31 */
32 template <typename Encoding, typename InputByteStream>
33 class EncodedInputStream {
34     RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
35 public:
36     typedef typename Encoding::Ch Ch;
37 
EncodedInputStream(InputByteStream & is)38     EncodedInputStream(InputByteStream& is) : is_(is) {
39         current_ = Encoding::TakeBOM(is_);
40     }
41 
Peek()42     Ch Peek() const { return current_; }
Take()43     Ch Take() { Ch c = current_; current_ = Encoding::Take(is_); return c; }
Tell()44     size_t Tell() const { return is_.Tell(); }
45 
46     // Not implemented
Put(Ch)47     void Put(Ch) { RAPIDJSON_ASSERT(false); }
Flush()48     void Flush() { RAPIDJSON_ASSERT(false); }
PutBegin()49     Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
PutEnd(Ch *)50     size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
51 
52 private:
53     EncodedInputStream(const EncodedInputStream&);
54     EncodedInputStream& operator=(const EncodedInputStream&);
55 
56     InputByteStream& is_;
57     Ch current_;
58 };
59 
60 //! Output byte stream wrapper with statically bound encoding.
61 /*!
62     \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE.
63     \tparam InputByteStream Type of input byte stream. For example, FileWriteStream.
64 */
65 template <typename Encoding, typename OutputByteStream>
66 class EncodedOutputStream {
67     RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
68 public:
69     typedef typename Encoding::Ch Ch;
70 
os_(os)71     EncodedOutputStream(OutputByteStream& os, bool putBOM = true) : os_(os) {
72         if (putBOM)
73             Encoding::PutBOM(os_);
74     }
75 
Put(Ch c)76     void Put(Ch c) { Encoding::Put(os_, c);  }
Flush()77     void Flush() { os_.Flush(); }
78 
79     // Not implemented
Peek()80     Ch Peek() const { RAPIDJSON_ASSERT(false); }
Take()81     Ch Take() { RAPIDJSON_ASSERT(false);  }
Tell()82     size_t Tell() const { RAPIDJSON_ASSERT(false);  return 0; }
PutBegin()83     Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
PutEnd(Ch *)84     size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
85 
86 private:
87     EncodedOutputStream(const EncodedOutputStream&);
88     EncodedOutputStream& operator=(const EncodedOutputStream&);
89 
90     OutputByteStream& os_;
91 };
92 
// Expands to a comma-separated list naming member x of each encoding instantiated with Ch,
// in the order UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE.
// A type named Ch must be in scope at the point of expansion.
#define RAPIDJSON_ENCODINGS_FUNC(x) \
    UTF8<Ch>::x, \
    UTF16LE<Ch>::x, \
    UTF16BE<Ch>::x, \
    UTF32LE<Ch>::x, \
    UTF32BE<Ch>::x
94 
95 //! Input stream wrapper with dynamically bound encoding and automatic encoding detection.
96 /*!
97     \tparam CharType Type of character for reading.
98     \tparam InputByteStream type of input byte stream to be wrapped.
99 */
100 template <typename CharType, typename InputByteStream>
101 class AutoUTFInputStream {
102     RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
103 public:
104     typedef CharType Ch;
105 
106     //! Constructor.
107     /*!
108         \param is input stream to be wrapped.
109         \param type UTF encoding type if it is not detected from the stream.
110     */
111     AutoUTFInputStream(InputByteStream& is, UTFType type = kUTF8) : is_(&is), type_(type), hasBOM_(false) {
112         RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE);
113         DetectType();
114         static const TakeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Take) };
115         takeFunc_ = f[type_];
116         current_ = takeFunc_(*is_);
117     }
118 
GetType()119     UTFType GetType() const { return type_; }
HasBOM()120     bool HasBOM() const { return hasBOM_; }
121 
Peek()122     Ch Peek() const { return current_; }
Take()123     Ch Take() { Ch c = current_; current_ = takeFunc_(*is_); return c; }
Tell()124     size_t Tell() const { return is_->Tell(); }
125 
126     // Not implemented
Put(Ch)127     void Put(Ch) { RAPIDJSON_ASSERT(false); }
Flush()128     void Flush() { RAPIDJSON_ASSERT(false); }
PutBegin()129     Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
PutEnd(Ch *)130     size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
131 
132 private:
133     AutoUTFInputStream(const AutoUTFInputStream&);
134     AutoUTFInputStream& operator=(const AutoUTFInputStream&);
135 
136     // Detect encoding type with BOM or RFC 4627
DetectType()137     void DetectType() {
138         // BOM (Byte Order Mark):
139         // 00 00 FE FF  UTF-32BE
140         // FF FE 00 00  UTF-32LE
141         // FE FF        UTF-16BE
142         // FF FE        UTF-16LE
143         // EF BB BF     UTF-8
144 
145         const unsigned char* c = (const unsigned char *)is_->Peek4();
146         if (!c)
147             return;
148 
149         unsigned bom = static_cast<unsigned>(c[0] | (c[1] << 8) | (c[2] << 16) | (c[3] << 24));
150         hasBOM_ = false;
151         if (bom == 0xFFFE0000)                  { type_ = kUTF32BE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); }
152         else if (bom == 0x0000FEFF)             { type_ = kUTF32LE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); }
153         else if ((bom & 0xFFFF) == 0xFFFE)      { type_ = kUTF16BE; hasBOM_ = true; is_->Take(); is_->Take();                           }
154         else if ((bom & 0xFFFF) == 0xFEFF)      { type_ = kUTF16LE; hasBOM_ = true; is_->Take(); is_->Take();                           }
155         else if ((bom & 0xFFFFFF) == 0xBFBBEF)  { type_ = kUTF8;    hasBOM_ = true; is_->Take(); is_->Take(); is_->Take();              }
156 
157         // RFC 4627: Section 3
158         // "Since the first two characters of a JSON text will always be ASCII
159         // characters [RFC0020], it is possible to determine whether an octet
160         // stream is UTF-8, UTF-16 (BE or LE), or UTF-32 (BE or LE) by looking
161         // at the pattern of nulls in the first four octets."
162         // 00 00 00 xx  UTF-32BE
163         // 00 xx 00 xx  UTF-16BE
164         // xx 00 00 00  UTF-32LE
165         // xx 00 xx 00  UTF-16LE
166         // xx xx xx xx  UTF-8
167 
168         if (!hasBOM_) {
169             unsigned pattern = (c[0] ? 1 : 0) | (c[1] ? 2 : 0) | (c[2] ? 4 : 0) | (c[3] ? 8 : 0);
170             switch (pattern) {
171             case 0x08: type_ = kUTF32BE; break;
172             case 0x0A: type_ = kUTF16BE; break;
173             case 0x01: type_ = kUTF32LE; break;
174             case 0x05: type_ = kUTF16LE; break;
175             case 0x0F: type_ = kUTF8;    break;
176             default: break; // Use type defined by user.
177             }
178         }
179 
180         // Runtime check whether the size of character type is sufficient. It only perform checks with assertion.
181         if (type_ == kUTF16LE || type_ == kUTF16BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 2);
182         if (type_ == kUTF32LE || type_ == kUTF32BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 4);
183     }
184 
185     typedef Ch (*TakeFunc)(InputByteStream& is);
186     InputByteStream* is_;
187     UTFType type_;
188     Ch current_;
189     TakeFunc takeFunc_;
190     bool hasBOM_;
191 };
192 
193 //! Output stream wrapper with dynamically bound encoding and automatic encoding detection.
194 /*!
195     \tparam CharType Type of character for writing.
196     \tparam InputByteStream type of output byte stream to be wrapped.
197 */
198 template <typename CharType, typename OutputByteStream>
199 class AutoUTFOutputStream {
200     RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
201 public:
202     typedef CharType Ch;
203 
204     //! Constructor.
205     /*!
206         \param os output stream to be wrapped.
207         \param type UTF encoding type.
208         \param putBOM Whether to write BOM at the beginning of the stream.
209     */
AutoUTFOutputStream(OutputByteStream & os,UTFType type,bool putBOM)210     AutoUTFOutputStream(OutputByteStream& os, UTFType type, bool putBOM) : os_(&os), type_(type) {
211         RAPIDJSON_ASSERT(type >= kUTF8 && type <= kUTF32BE);
212 
213         // Runtime check whether the size of character type is sufficient. It only perform checks with assertion.
214         if (type_ == kUTF16LE || type_ == kUTF16BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 2);
215         if (type_ == kUTF32LE || type_ == kUTF32BE) RAPIDJSON_ASSERT(sizeof(Ch) >= 4);
216 
217         static const PutFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Put) };
218         putFunc_ = f[type_];
219 
220         if (putBOM)
221             PutBOM();
222     }
223 
GetType()224     UTFType GetType() const { return type_; }
225 
Put(Ch c)226     void Put(Ch c) { putFunc_(*os_, c); }
Flush()227     void Flush() { os_->Flush(); }
228 
229     // Not implemented
Peek()230     Ch Peek() const { RAPIDJSON_ASSERT(false); }
Take()231     Ch Take() { RAPIDJSON_ASSERT(false); }
Tell()232     size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; }
PutBegin()233     Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
PutEnd(Ch *)234     size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
235 
236 private:
237     AutoUTFOutputStream(const AutoUTFOutputStream&);
238     AutoUTFOutputStream& operator=(const AutoUTFOutputStream&);
239 
PutBOM()240     void PutBOM() {
241         typedef void (*PutBOMFunc)(OutputByteStream&);
242         static const PutBOMFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(PutBOM) };
243         f[type_](*os_);
244     }
245 
246     typedef void (*PutFunc)(OutputByteStream&, Ch);
247 
248     OutputByteStream* os_;
249     UTFType type_;
250     PutFunc putFunc_;
251 };
252 
253 #undef RAPIDJSON_ENCODINGS_FUNC
254 
255 RAPIDJSON_NAMESPACE_END
256 
257 #ifdef __GNUC__
258 RAPIDJSON_DIAG_POP
259 #endif
260 
#endif // RAPIDJSON_ENCODEDSTREAM_H_
262