1 /*
2  * Copyright (C) 2014 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "utils/utf8_utils.h"
18 
19 #include <gtest/gtest.h>
20 
21 #include <vector>
22 
23 #include "utils/int_array_view.h"
24 
25 namespace latinime {
26 namespace dicttoolkit {
27 namespace {
28 
// Verifies that Utf8Utils::getCodePoints() decodes UTF-8 encoded strings into the expected
// sequence of code points, and that it returns an empty result for overlong encodings.
TEST(Utf8UtilsTests, TestGetCodePoints) {
    {
        // Empty input produces no code points.
        const std::vector<int> codePoints = Utf8Utils::getCodePoints("");
        EXPECT_EQ(0u, codePoints.size());
    }
    {
        // Pure ASCII input: every byte is its own code point.
        const std::vector<int> codePoints = Utf8Utils::getCodePoints("test");
        // ASSERT (not EXPECT) on the size: the element checks below index into the vector, so
        // continuing after a size mismatch would read out of bounds.
        ASSERT_EQ(4u, codePoints.size());
        EXPECT_EQ('t', codePoints[0]);
        EXPECT_EQ('e', codePoints[1]);
        EXPECT_EQ('s', codePoints[2]);
        EXPECT_EQ('t', codePoints[3]);
    }
    {
        // Mix of one-, two- and three-byte sequences.
        const std::vector<int> codePoints = Utf8Utils::getCodePoints(u8"\u3042a\u03C2\u0410");
        ASSERT_EQ(4u, codePoints.size());
        EXPECT_EQ(0x3042, codePoints[0]); // HIRAGANA LETTER A
        EXPECT_EQ('a', codePoints[1]);
        EXPECT_EQ(0x03C2, codePoints[2]); // GREEK SMALL LETTER FINAL SIGMA
        EXPECT_EQ(0x0410, codePoints[3]); // CYRILLIC CAPITAL LETTER A
    }
    {
        // Code points outside the Basic Multilingual Plane (four-byte sequences).
        const std::vector<int> codePoints = Utf8Utils::getCodePoints(u8"\U0001F36A?\U0001F752");
        ASSERT_EQ(3u, codePoints.size());
        EXPECT_EQ(0x1F36A, codePoints[0]); // COOKIE
        EXPECT_EQ('?', codePoints[1]);
        EXPECT_EQ(0x1F752, codePoints[2]); // ALCHEMICAL SYMBOL FOR STARRED TRIDENT
    }

    // Overlong (non-shortest-form) UTF-8 encodings must be rejected. Each input below is an
    // overlong encoding of U+002F ('/') using two, three and four bytes respectively.
    EXPECT_TRUE(Utf8Utils::getCodePoints("\xC0\xAF").empty());
    EXPECT_TRUE(Utf8Utils::getCodePoints("\xE0\x80\xAF").empty());
    EXPECT_TRUE(Utf8Utils::getCodePoints("\xF0\x80\x80\xAF").empty());
}
63 
// Checks that Utf8Utils::getUtf8String() encodes code point arrays into the expected
// UTF-8 strings.
TEST(Utf8UtilsTests, TestGetUtf8String) {
    {
        // ASCII-only code points.
        const std::vector<int> asciiCodePoints = {'t', 'e', 's', 't'};
        const CodePointArrayView asciiView(asciiCodePoints);
        EXPECT_EQ("test", Utf8Utils::getUtf8String(asciiView));
    }
    {
        // Non-ASCII code points, listed in increasing code point order.
        const std::vector<int> mixedCodePoints = {
                0x00E0 /* LATIN SMALL LETTER A WITH GRAVE */,
                0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */,
                0x0430 /* CYRILLIC SMALL LETTER A */,
                0x3042 /* HIRAGANA LETTER A */,
                0x1F36A /* COOKIE */,
                0x1F752 /* ALCHEMICAL SYMBOL FOR STARRED TRIDENT */
        };
        const CodePointArrayView mixedView(mixedCodePoints);
        EXPECT_EQ(u8"\u00E0\u03C2\u0430\u3042\U0001F36A\U0001F752",
                Utf8Utils::getUtf8String(mixedView));
    }
}
82 
83 } // namespace
84 } // namespace dicttoolkit
85 } // namespace latinime
86