1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "WordBreaker.h"
18 
19 #include <cstdio>
20 
21 #include <gtest/gtest.h>
22 #include <unicode/uclean.h>
23 #include <unicode/udata.h>
24 
25 #include "UnicodeUtils.h"
26 
27 #ifndef NELEM
28 #define NELEM(x) ((sizeof(x) / sizeof((x)[0])))
29 #endif
30 
31 #define UTF16(codepoint) U16_LEAD(codepoint), U16_TRAIL(codepoint)
32 
33 namespace minikin {
34 
// Two plain ASCII words separated by one space: verifies break positions, word ranges and
// break badness for the simplest input.
TEST(WordBreakerTest, basic) {
    uint16_t text[] = {'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First break is reported after "hello " (index 6); the word range is [0, 5).
    EXPECT_EQ(6, wb.followingWithLocale(Locale("en-US"), 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(5, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
    EXPECT_EQ(6, wb.current());

    // Second break is the end of the text; the word range is [6, 11), i.e. "world".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(6, wb.wordStart());
    EXPECT_EQ(11, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
    EXPECT_EQ(11, wb.current());
}
51 
// A SOFT HYPHEN (U+00AD) inside a word: the first break expected here is still the one after
// the following space, and the word range includes the soft hyphen.
TEST(WordBreakerTest, softHyphen) {
    uint16_t text[] = {'h', 'e', 'l', 0x00AD, 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // Break after "hel{SOFT HYPHEN}lo " (index 7); word range [0, 6) is "hel{SOFT HYPHEN}lo".
    EXPECT_EQ(7, wb.followingWithLocale(Locale("en-US"), 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(6, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of text; word range [7, 12) is "world".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(7, wb.wordStart());
    EXPECT_EQ(12, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
67 
// A hard hyphen ('-') is not expected to create a break opportunity: "sugar-free" is reported
// as one word ending at the end of the text.
TEST(WordBreakerTest, hardHyphen) {
    uint16_t text[] = {'s', 'u', 'g', 'a', 'r', '-', 'f', 'r', 'e', 'e'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // The only break is the end of the text, and the word covers the whole buffer.
    EXPECT_EQ(textEnd, wb.followingWithLocale(Locale("en-US"), 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(textEnd, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
79 
// Currency signs attached to a word (CENT SIGN U+00A2, YEN SIGN U+00A5) are expected to be
// reported as part of that word's range.
TEST(WordBreakerTest, postfixAndPrefix) {
    // "US<U+00A2> JP<U+00A5>"
    uint16_t text[] = {'U', 'S', 0x00A2, ' ', 'J', 'P', 0x00A5};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // Break after the space that follows CENT SIGN; word range [0, 3) includes the sign.
    EXPECT_EQ(4, wb.followingWithLocale(Locale("en-US"), 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(3, wb.wordEnd());

    // End of text; word range [4, end) includes YEN SIGN.
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(4, wb.wordStart());
    EXPECT_EQ(textEnd, wb.wordEnd());
}
94 
// A Myanmar kinzi sequence must not be split: the whole buffer is expected to be one word.
TEST(WordBreakerTest, myanmarKinzi) {
    // U+1004 NGA, U+103A ASAT, U+1039 VIRAMA, U+1000 KA, U+102C VOWEL SIGN AA
    uint16_t text[] = {0x1004, 0x103A, 0x1039, 0x1000, 0x102C};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // The first (and only) break is the end of the text; the word spans [0, end).
    EXPECT_EQ(textEnd, wb.followingWithLocale(Locale("en-US"), 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(textEnd, wb.wordEnd());
}
106 
// Emoji joined by ZERO WIDTH JOINER (U+200D) are expected to stay together: breaks are only
// reported between complete ZWJ sequences.
TEST(WordBreakerTest, zwjEmojiSequences) {
    uint16_t text[] = {
            // [0, 7): man + zwj + heart + zwj + man
            UTF16(0x1F468), 0x200D, 0x2764, 0x200D, UTF16(0x1F468),
            // [7, 17): woman + zwj + heart + zwj + kiss mark + zwj + woman
            UTF16(0x1F469), 0x200D, 0x2764, 0x200D, UTF16(0x1F48B), 0x200D, UTF16(0x1F469),
            // [17, 22): eye + zwj + left speech bubble
            UTF16(0x1F441), 0x200D, UTF16(0x1F5E8),
            // [22, 27): CAT FACE + zwj + BUST IN SILHOUETTE
            UTF16(0x1F431), 0x200D, UTF16(0x1F464),
    };
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After the man/heart/man sequence.
    EXPECT_EQ(7, wb.followingWithLocale(Locale("en-US"), 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(7, wb.wordEnd());

    // After the woman/heart/kiss mark/woman sequence.
    EXPECT_EQ(17, wb.next());
    EXPECT_EQ(7, wb.wordStart());
    EXPECT_EQ(17, wb.wordEnd());

    // After the eye/left speech bubble sequence.
    EXPECT_EQ(22, wb.next());
    EXPECT_EQ(17, wb.wordStart());
    EXPECT_EQ(22, wb.wordEnd());

    // End of text, after the cat face/bust sequence.
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(22, wb.wordStart());
    EXPECT_EQ(27, wb.wordEnd());
}
135 
// An emoji followed by a Fitzpatrick skin-tone modifier is expected to be kept as one unit,
// with a break only between the two modified emoji.
TEST(WordBreakerTest, emojiWithModifier) {
    uint16_t text[] = {
            // [0, 4): boy + type 1-2 fitzpatrick modifier
            UTF16(0x1F466), UTF16(0x1F3FB),
            // [4, 8): victory hand + emoji style (VS16) + type 6 fitzpatrick modifier
            0x270C, 0xFE0F, UTF16(0x1F3FF),
    };
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After boy + modifier.
    EXPECT_EQ(4, wb.followingWithLocale(Locale("en-US"), 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(4, wb.wordEnd());

    // End of text, after victory hand + VS16 + modifier.
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(4, wb.wordStart());
    EXPECT_EQ(8, wb.wordEnd());
}
153 
// Adjacent emoji (including ones added in Unicode 10 such as SLED) are expected to be
// separated by a break, while a variation selector stays attached to its base character.
TEST(WordBreakerTest, unicode10Emoji) {
    uint16_t text[] = {
            // SLED + SLED
            UTF16(0x1F6F7), UTF16(0x1F6F7),
            // SLED + VS15 + SLED
            UTF16(0x1F6F7), 0xFE0E, UTF16(0x1F6F7),
            // WHITE SMILING FACE + SLED
            0x263A, UTF16(0x1F6F7),
            // WHITE SMILING FACE + VS16 + SLED
            0x263A, 0xFE0F, UTF16(0x1F6F7),
    };

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First break: after the first SLED.
    EXPECT_EQ(2, wb.followingWithLocale(Locale("en"), 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(2, wb.wordEnd());

    // Every later break: the reported word range must run from the previous break to the
    // new one.
    const ssize_t kLaterBreaks[] = {4, 7, 9, 10, 12, 14, 16};
    ssize_t previousBreak = 2;
    for (const ssize_t expectedBreak : kLaterBreaks) {
        EXPECT_EQ(expectedBreak, wb.next());
        EXPECT_EQ(previousBreak, wb.wordStart());
        EXPECT_EQ(expectedBreak, wb.wordEnd());
        previousBreak = expectedBreak;
    }
}
201 
// Two consecutive U+1F3F4 characters with no tag sequence: a break is expected between
// them.
TEST(WordBreakerTest, flagsSequenceSingleFlag) {
    const std::string kSingleFlag = "U+1F3F4";
    const std::string twoFlags = kSingleFlag + " " + kSingleFlag;

    // U+1F3F4 is a supplementary code point, so each flag takes two UTF-16 code units.
    constexpr int kFlagLength = 2;
    constexpr size_t kBufSize = kFlagLength * 2;

    uint16_t text[kBufSize];
    size_t textSize = 0;
    ParseUnicode(text, kBufSize, twoFlags.c_str(), &textSize, nullptr);

    WordBreaker wb;
    wb.setText(text, textSize);
    EXPECT_EQ(0, wb.current());

    // Break at the end of the first flag.
    EXPECT_EQ(kFlagLength, wb.followingWithLocale(Locale("en-US"), 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(kFlagLength, wb.wordEnd());

    // Break at the end of the text, after the second flag.
    EXPECT_EQ(static_cast<ssize_t>(textSize), wb.next());
    EXPECT_EQ(kFlagLength, wb.wordStart());
    EXPECT_EQ(kFlagLength * 2, wb.wordEnd());
}
224 
// Two consecutive emoji tag sequences: each sequence is expected to stay intact, with a
// single break between them.
TEST(WordBreakerTest, flagsSequence) {
    // U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F is the emoji tag sequence for
    // the flag of Scotland.
    const std::string kScotlandFlag = "U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F";
    const std::string twoFlags = kScotlandFlag + " " + kScotlandFlag;

    // Seven supplementary code points, two UTF-16 code units each.
    constexpr int kFlagLength = 14;
    constexpr size_t kBufSize = kFlagLength * 2;

    uint16_t text[kBufSize];
    size_t textSize = 0;
    ParseUnicode(text, kBufSize, twoFlags.c_str(), &textSize, nullptr);

    WordBreaker wb;
    wb.setText(text, textSize);
    EXPECT_EQ(0, wb.current());

    // Break at the end of the first flag sequence.
    EXPECT_EQ(kFlagLength, wb.followingWithLocale(Locale("en-US"), 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(kFlagLength, wb.wordEnd());

    // Break at the end of the text, after the second flag sequence.
    EXPECT_EQ(static_cast<ssize_t>(textSize), wb.next());
    EXPECT_EQ(kFlagLength, wb.wordStart());
    EXPECT_EQ(kFlagLength * 2, wb.wordEnd());
}
249 
// Leading and trailing punctuation: the break positions include the punctuation, while the
// positions asserted for wordStart() are those of the first letter of each word.
TEST(WordBreakerTest, punct) {
    // "<U+00A1><U+00A1>hello, world!!" (U+00A1 is INVERTED EXCLAMATION MARK)
    uint16_t text[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l', 'o', ',',
                       ' ',    'w',    'o', 'r', 'l', 'd', '!', '!'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // Break after the space following "hello," (index 9); the word starts at 'h' (index 2)
    // and the expected end is index 7.
    EXPECT_EQ(9, wb.followingWithLocale(Locale("en-US"), 0));
    EXPECT_EQ(2, wb.wordStart());
    EXPECT_EQ(7, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of text; the word range [9, 14) is "world" without the trailing "!!".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(9, wb.wordStart());
    EXPECT_EQ(14, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
265 
// An email address followed by a word: the break inside the address is expected to carry
// badness 1 and an empty word range (wordStart >= wordEnd).
TEST(WordBreakerTest, email) {
    // "foo@example.com x"
    uint16_t text[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p',
                       'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "foo@example": a break inside the address. EXPECT_GE (rather than
    // EXPECT_TRUE(a >= b)) prints both operands when the check fails.
    EXPECT_EQ(11, wb.followingWithLocale(Locale("en-US"), 0));
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After ".com ": badness 0, still an empty word range.
    EXPECT_EQ(16, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of text; the word range [16, 17) is "x".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(16, wb.wordStart());
    EXPECT_EQ(17, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
283 
// A "mailto:" URL followed by a word: breaks inside the URL are expected to carry badness 1
// and an empty word range (wordStart >= wordEnd).
TEST(WordBreakerTest, mailto) {
    // "mailto:foo@example.com x"
    uint16_t text[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@', 'e',
                       'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "mailto:". EXPECT_GE (rather than EXPECT_TRUE(a >= b)) prints both operands
    // when the check fails.
    EXPECT_EQ(7, wb.followingWithLocale(Locale("en-US"), 0));
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After "foo@example".
    EXPECT_EQ(18, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After ".com ": badness 0, still an empty word range.
    EXPECT_EQ(23, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of text; the word range [23, 24) is "x".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(23, wb.wordStart());
    EXPECT_EQ(24, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
304 
305 // The current logic always places a line break after a detected email address or URL
306 // and an immediately following non-ASCII character.
// An email address immediately followed by a non-ASCII character (U+4E00): a break with
// badness 0 is expected between the address and that character.
TEST(WordBreakerTest, emailNonAscii) {
    // "foo@example.com<U+4E00>"
    uint16_t text[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm',
                       'p', 'l', 'e', '.', 'c', 'o', 'm', 0x4E00};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "foo@example": a break inside the address. EXPECT_GE (rather than
    // EXPECT_TRUE(a >= b)) prints both operands when the check fails.
    EXPECT_EQ(11, wb.followingWithLocale(Locale("en-US"), 0));
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After ".com", directly before U+4E00: badness 0, empty word range.
    EXPECT_EQ(15, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of text; the word range [15, 16) is the single U+4E00 character.
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(15, wb.wordStart());
    EXPECT_EQ(16, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
324 
// An email address whose last letter carries a combining mark (U+0303 COMBINING TILDE): the
// mark is expected to stay with the address, so the break falls after the following space.
TEST(WordBreakerTest, emailCombining) {
    // "foo@example.com<U+0303> x"
    uint16_t text[] = {'f', 'o', 'o', '@', 'e', 'x', 'a',    'm', 'p',
                       'l', 'e', '.', 'c', 'o', 'm', 0x0303, ' ', 'x'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "foo@example": a break inside the address. EXPECT_GE (rather than
    // EXPECT_TRUE(a >= b)) prints both operands when the check fails.
    EXPECT_EQ(11, wb.followingWithLocale(Locale("en-US"), 0));
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After ".com<U+0303> ": badness 0, empty word range.
    EXPECT_EQ(17, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of text; the word range [17, 18) is "x".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(17, wb.wordStart());
    EXPECT_EQ(18, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
342 
// A stand-alone '@' surrounded by spaces: every break is expected to have badness 0, and
// the '@' itself yields an empty word range.
TEST(WordBreakerTest, lonelyAt) {
    // "a @ b"
    uint16_t text[] = {'a', ' ', '@', ' ', 'b'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "a "; the word range [0, 1) is "a".
    EXPECT_EQ(2, wb.followingWithLocale(Locale("en-US"), 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(1, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // After "@ ": empty word range. EXPECT_GE (rather than EXPECT_TRUE(a >= b)) prints
    // both operands when the check fails.
    EXPECT_EQ(4, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of text; the word range [4, 5) is "b".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(4, wb.wordStart());
    EXPECT_EQ(5, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
360 
// A simple http URL followed by a word: breaks inside the URL are expected to carry
// badness 1 and an empty word range (wordStart >= wordEnd).
TEST(WordBreakerTest, url) {
    // "http://example.com x"
    uint16_t text[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a',
                       'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "http:". EXPECT_GE (rather than EXPECT_TRUE(a >= b)) prints both operands when
    // the check fails.
    EXPECT_EQ(5, wb.followingWithLocale(Locale("en-US"), 0));
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After "//".
    EXPECT_EQ(7, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After "example".
    EXPECT_EQ(14, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After ".com ": badness 0, still an empty word range.
    EXPECT_EQ(19, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of text; the word range [19, 20) is "x".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(19, wb.wordStart());
    EXPECT_EQ(20, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
384 
385 // Breaks according to section 14.12 of Chicago Manual of Style, *URLs or DOIs and line breaks*
TEST(WordBreakerTest,urlBreakChars)386 TEST(WordBreakerTest, urlBreakChars) {
387     uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '.', 'b', '/',
388                       '~', 'c', ',', 'd', '-', 'e', '?', 'f', '=', 'g', '&',
389                       'h', '#', 'i', '%', 'j', '_', 'k', '/', 'l'};
390     WordBreaker breaker;
391     breaker.setText(buf, NELEM(buf));
392     EXPECT_EQ(0, breaker.current());
393     EXPECT_EQ(5, breaker.followingWithLocale(Locale("en-US"), 0));  // after "http:"
394     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
395     EXPECT_EQ(1, breaker.breakBadness());
396     EXPECT_EQ(7, breaker.next());  // after "//"
397     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
398     EXPECT_EQ(1, breaker.breakBadness());
399     EXPECT_EQ(8, breaker.next());  // after "a"
400     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
401     EXPECT_EQ(1, breaker.breakBadness());
402     EXPECT_EQ(10, breaker.next());  // after ".b"
403     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
404     EXPECT_EQ(1, breaker.breakBadness());
405     EXPECT_EQ(11, breaker.next());  // after "/"
406     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
407     EXPECT_EQ(1, breaker.breakBadness());
408     EXPECT_EQ(13, breaker.next());  // after "~c"
409     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
410     EXPECT_EQ(1, breaker.breakBadness());
411     EXPECT_EQ(15, breaker.next());  // after ",d"
412     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
413     EXPECT_EQ(1, breaker.breakBadness());
414     EXPECT_EQ(17, breaker.next());  // after "-e"
415     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
416     EXPECT_EQ(1, breaker.breakBadness());
417     EXPECT_EQ(19, breaker.next());  // after "?f"
418     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
419     EXPECT_EQ(1, breaker.breakBadness());
420     EXPECT_EQ(20, breaker.next());  // after "="
421     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
422     EXPECT_EQ(1, breaker.breakBadness());
423     EXPECT_EQ(21, breaker.next());  // after "g"
424     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
425     EXPECT_EQ(1, breaker.breakBadness());
426     EXPECT_EQ(22, breaker.next());  // after "&"
427     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
428     EXPECT_EQ(1, breaker.breakBadness());
429     EXPECT_EQ(23, breaker.next());  // after "h"
430     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
431     EXPECT_EQ(1, breaker.breakBadness());
432     EXPECT_EQ(25, breaker.next());  // after "#i"
433     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
434     EXPECT_EQ(1, breaker.breakBadness());
435     EXPECT_EQ(27, breaker.next());  // after "%j"
436     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
437     EXPECT_EQ(1, breaker.breakBadness());
438     EXPECT_EQ(29, breaker.next());  // after "_k"
439     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
440     EXPECT_EQ(1, breaker.breakBadness());
441     EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
442     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
443     EXPECT_EQ(0, breaker.breakBadness());
444 }
445 
// Inside a URL, a hyphen directly followed by '/' is expected to produce no break: after the
// break following "a", the next break reported is the end of the text.
TEST(WordBreakerTest, urlNoHyphenBreak) {
    // "http://a-/b"
    uint16_t text[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "http:". Every break in this URL must report an empty word range; EXPECT_GE
    // (rather than EXPECT_TRUE(a >= b)) prints both operands when the check fails.
    EXPECT_EQ(5, wb.followingWithLocale(Locale("en-US"), 0));
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // After "//".
    EXPECT_EQ(7, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // After "a".
    EXPECT_EQ(8, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // End of text: nothing is reported between "a" and the end.
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
}
460 
// A URL whose last character is '/': the final break is expected at the end of the text,
// with an empty word range at every break.
TEST(WordBreakerTest, urlEndsWithSlash) {
    // "http://a/"
    uint16_t text[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "http:". EXPECT_GE (rather than EXPECT_TRUE(a >= b)) prints both operands when
    // the check fails.
    EXPECT_EQ(5, wb.followingWithLocale(Locale("en-US"), 0));
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // After "//".
    EXPECT_EQ(7, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // After "a".
    EXPECT_EQ(8, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // End of text, after the trailing "/".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
}
475 
// An email-like string that begins with '/': the only break expected is the end of the text,
// reported with an empty word range (wordStart >= wordEnd).
TEST(WordBreakerTest, emailStartsWithSlash) {
    // "/a@b"
    uint16_t text[] = {'/', 'a', '@', 'b'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // EXPECT_GE (rather than EXPECT_TRUE(a >= b)) prints both operands when the check fails.
    EXPECT_EQ(textEnd, wb.followingWithLocale(Locale("en-US"), 0));
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
}
484 
// Calls followingWithLocale() again while iteration is already inside a URL and checks that
// the break positions reported afterwards are the same ones plain next() iteration gives.
TEST(WordBreakerTest, setLocaleInsideUrl) {
    std::vector<uint16_t> text = utf8ToUtf16("Hello http://abc/d.html World");

    WordBreaker wb;
    wb.setText(text.data(), text.size());
    EXPECT_EQ(0, wb.current());

    // After "Hello "; the word range [0, 5) is "Hello".
    EXPECT_EQ(6, wb.followingWithLocale(Locale("en-US"), 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(5, wb.wordEnd());

    EXPECT_EQ(6, wb.current());
    EXPECT_EQ(11, wb.next());  // after "http:"

    // Restart from offset 6, the first character of the URL: the same break (after "http:")
    // must be returned again. EXPECT_GE (rather than EXPECT_TRUE(a >= b)) prints both
    // operands when the check fails.
    EXPECT_EQ(11, wb.followingWithLocale(Locale("en-US"), 6));
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // After "//".
    EXPECT_EQ(13, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // Restart from offset 12, in the middle of "//": the same break (after "//") must be
    // returned again.
    EXPECT_EQ(13, wb.followingWithLocale(Locale("en-US"), 12));
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // After "abc".
    EXPECT_EQ(16, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // After "/d".
    EXPECT_EQ(18, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // After ".html " (the break position includes the trailing space).
    EXPECT_EQ(24, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // End of text; the word range [24, 29) is "World".
    EXPECT_EQ(29, wb.next());
    EXPECT_EQ(24, wb.wordStart());
    EXPECT_EQ(29, wb.wordEnd());
}
518 
519 // b/68669534
// For each of several space-like characters placed right after a regular U+0020 space,
// checks the break positions and word ranges reported for "a", the space pair and "b".
TEST(WordBreakerTest, spaceAfterSpace) {
    const std::vector<uint16_t> kSecondSpaces = {
            '\t',    // TAB
            0x1680,  // OGHAM SPACE MARK
            0x3000,  // IDEOGRAPHIC SPACE
    };

    constexpr uint16_t kAsciiSpace = 0x0020;

    for (uint16_t secondSpace : kSecondSpaces) {
        // Tag any failure below with the character under test.
        char traceMsg[64] = {};
        snprintf(traceMsg, sizeof(traceMsg), "Test Space: U+%04X", secondSpace);
        SCOPED_TRACE(traceMsg);

        // Layout: [0] 'a', [1] U+0020, [2] the space under test, [3] 'b'.
        std::vector<uint16_t> text = {'a', kAsciiSpace, secondSpace, 'b'};
        WordBreaker wb;
        wb.setText(text.data(), text.size());

        // First break: after 'a' and the U+0020 space; the word range [0, 1) is "a".
        EXPECT_EQ(0, wb.current());
        EXPECT_EQ(2, wb.followingWithLocale(Locale("en-US"), 0));
        EXPECT_EQ(0, wb.wordStart());
        EXPECT_EQ(1, wb.wordEnd());

        // Second break: after the space under test; the word range [2, 2) is empty.
        EXPECT_EQ(2, wb.current());
        EXPECT_EQ(3, wb.next());
        EXPECT_EQ(2, wb.wordStart());
        EXPECT_EQ(2, wb.wordEnd());

        // Third break: end of text; the word range [3, 4) is "b".
        EXPECT_EQ(3, wb.current());
        EXPECT_EQ(4, wb.next());
        EXPECT_EQ(3, wb.wordStart());
        EXPECT_EQ(4, wb.wordEnd());
    }
}
554 
555 class TestableICULineBreakerPoolImpl : public ICULineBreakerPoolImpl {
556 public:
TestableICULineBreakerPoolImpl()557     TestableICULineBreakerPoolImpl() : ICULineBreakerPoolImpl() {}
558 
559     using ICULineBreakerPoolImpl::getPoolSize;
560     using ICULineBreakerPoolImpl::MAX_POOL_SIZE;
561 };
562 
// Acquiring slots without releasing any: every acquire() must hand out a distinct, non-null
// breaker, and the locale id must match only between slots acquired for the same locale.
TEST(WordBreakerTest, LineBreakerPool_acquire_without_release) {
    TestableICULineBreakerPoolImpl pool;

    const Locale english("en-Latn-US");
    const Locale french("fr-Latn-FR");

    // Nothing is released in between, so these three slots must hold different instances.
    ICULineBreakerPool::Slot firstEnglish = pool.acquire(english);
    ICULineBreakerPool::Slot secondEnglish = pool.acquire(english);
    ICULineBreakerPool::Slot onlyFrench = pool.acquire(french);

    // Every slot holds a breaker.
    EXPECT_NE(nullptr, firstEnglish.breaker.get());
    EXPECT_NE(nullptr, secondEnglish.breaker.get());
    EXPECT_NE(nullptr, onlyFrench.breaker.get());

    // No two slots share a breaker instance.
    EXPECT_NE(firstEnglish.breaker.get(), secondEnglish.breaker.get());
    EXPECT_NE(firstEnglish.breaker.get(), onlyFrench.breaker.get());
    EXPECT_NE(secondEnglish.breaker.get(), onlyFrench.breaker.get());

    // Same locale gives the same locale id; different locales give different ids.
    EXPECT_EQ(firstEnglish.localeId, secondEnglish.localeId);
    EXPECT_NE(firstEnglish.localeId, onlyFrench.localeId);
    EXPECT_NE(secondEnglish.localeId, onlyFrench.localeId);
}
586 
// Releasing a slot and acquiring again: a released breaker must be reused for the same
// locale and must not be handed out for a different locale.
TEST(WordBreakerTest, LineBreakerPool_acquire_with_release) {
    TestableICULineBreakerPoolImpl pool;

    const Locale english("en-Latn-US");
    const Locale french("fr-Latn-FR");

    ICULineBreakerPool::Slot englishSlot = pool.acquire(english);

    // Remember the identity of the breaker before the slot is given back.
    uint64_t releasedLocaleId = englishSlot.localeId;
    UBreakIterator* releasedBreaker = englishSlot.breaker.get();

    // After release() the moved-from slot must no longer hold the breaker.
    pool.release(std::move(englishSlot));
    EXPECT_EQ(nullptr, englishSlot.breaker.get());

    // A different locale must not receive the released instance.
    ICULineBreakerPool::Slot frenchSlot = pool.acquire(french);
    EXPECT_NE(releasedBreaker, frenchSlot.breaker.get());
    EXPECT_NE(releasedLocaleId, frenchSlot.localeId);

    // The same locale must receive exactly the instance that was released.
    ICULineBreakerPool::Slot englishAgain = pool.acquire(english);
    EXPECT_EQ(releasedBreaker, englishAgain.breaker.get());
    EXPECT_EQ(releasedLocaleId, englishAgain.localeId);
}
612 
// Acquires twice as many slots as MAX_POOL_SIZE and releases them all: the pool size must
// stay 0 while only acquiring, grow by one per release, and stop growing at MAX_POOL_SIZE.
TEST(WordBreakerTest, LineBreakerPool_exceeds_pool_size) {
    const size_t kCapacity = TestableICULineBreakerPoolImpl::MAX_POOL_SIZE;
    const size_t kSlotCount = kCapacity * 2;
    TestableICULineBreakerPoolImpl pool;

    const Locale english("en-Latn-US");

    ICULineBreakerPool::Slot slots[kSlotCount];

    // Acquire everything up front; nothing has been released, so the pool holds nothing.
    for (size_t i = 0; i < kSlotCount; i++) {
        slots[i] = pool.acquire(english);
        EXPECT_EQ(0U, pool.getPoolSize());
    }

    // Release in order: the first kCapacity releases each add one entry to the pool, and
    // every release after that leaves the size at kCapacity.
    for (size_t i = 0; i < kSlotCount; i++) {
        pool.release(std::move(slots[i]));
        const size_t expectedPooled = (i < kCapacity) ? i + 1 : kCapacity;
        EXPECT_EQ(expectedPooled, pool.getPoolSize());
    }
}
637 
638 }  // namespace minikin
639