/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17 #include "WordBreaker.h"
18
19 #include <cstdio>
20
21 #include <gtest/gtest.h>
22 #include <unicode/uclean.h>
23 #include <unicode/udata.h>
24
25 #include "UnicodeUtils.h"
26
// Element count of a statically sized array. Guarded so a definition supplied by an earlier
// header is left untouched.
#ifndef NELEM
#define NELEM(x) (sizeof(x) / sizeof((x)[0]))
#endif

// Expands to the lead and trail UTF-16 code units of |codepoint|, separated by a comma, so a
// supplementary code point can be written inline in a uint16_t array initializer.
// NOTE(review): U16_LEAD / U16_TRAIL are expected to come from the ICU headers - confirm.
#define UTF16(codepoint) U16_LEAD(codepoint), U16_TRAIL(codepoint)
32
33 namespace minikin {
34
// Plain ASCII "hello world": expects a break after the space and one at the end of the text,
// with word ranges that leave the space out and no break penalty.
TEST(WordBreakerTest, basic) {
    uint16_t text[] = {'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First break: after "hello ".
    EXPECT_EQ(6, wb.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, wb.wordStart());  // "hello"
    EXPECT_EQ(5, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
    EXPECT_EQ(6, wb.current());

    // Second break: end of the text.
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(6, wb.wordStart());  // "world"
    EXPECT_EQ(11, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
    EXPECT_EQ(11, wb.current());
}
51
// A SOFT HYPHEN (U+00AD) inside "hello" must not produce a break of its own: the first reported
// break is still after the space, and the word range spans the soft hyphen.
TEST(WordBreakerTest, softHyphen) {
    uint16_t text[] = {'h', 'e', 'l', 0x00AD, 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First break: after "hel{SOFT HYPHEN}lo ".
    EXPECT_EQ(7, wb.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, wb.wordStart());  // "hel{SOFT HYPHEN}lo"
    EXPECT_EQ(6, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // Second break: end of the text.
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(7, wb.wordStart());  // "world"
    EXPECT_EQ(12, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
67
// Hyphens should not allow breaks anymore: "sugar-free" is expected to come back as a single
// word whose only break is the end of the text.
TEST(WordBreakerTest, hardHyphen) {
    uint16_t text[] = {'s', 'u', 'g', 'a', 'r', '-', 'f', 'r', 'e', 'e'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    EXPECT_EQ(textEnd, wb.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(textEnd, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
79
// Currency signs attached to letters ("US¢ JP¥") are expected to be reported as part of the
// adjacent word.
TEST(WordBreakerTest, postfixAndPrefix) {
    uint16_t text[] = {'U', 'S', 0x00A2, ' ', 'J', 'P', 0x00A5};  // US¢ JP¥
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First break: after the space that follows CENT SIGN.
    EXPECT_EQ(4, wb.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, wb.wordStart());  // "US¢"
    EXPECT_EQ(3, wb.wordEnd());

    // Second break: end of string.
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(4, wb.wordStart());  // "JP¥"
    EXPECT_EQ(textEnd, wb.wordEnd());
}
94
// A Myanmar kinzi sequence must not be split: the only expected break is the end of the text
// and the whole buffer is reported as one word.
TEST(WordBreakerTest, myanmarKinzi) {
    // NGA, ASAT, VIRAMA, KA, VOWEL SIGN AA
    uint16_t text[] = {0x1004, 0x103A, 0x1039, 0x1000, 0x102C};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // end of string
    EXPECT_EQ(textEnd, wb.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(textEnd, wb.wordEnd());
}
106
// Four emoji ZWJ sequences placed back to back: breaks are expected only between sequences,
// never next to a ZERO WIDTH JOINER.
TEST(WordBreakerTest, zwjEmojiSequences) {
    uint16_t text[] = {
            // man + zwj + heart + zwj + man (7 code units)
            UTF16(0x1F468), 0x200D, 0x2764, 0x200D, UTF16(0x1F468),
            // woman + zwj + heart + zwj + kiss mark + zwj + woman (10 code units)
            UTF16(0x1F469), 0x200D, 0x2764, 0x200D, UTF16(0x1F48B), 0x200D, UTF16(0x1F469),
            // eye + zwj + left speech bubble (5 code units)
            UTF16(0x1F441), 0x200D, UTF16(0x1F5E8),
            // CAT FACE + zwj + BUST IN SILHOUETTE (5 code units)
            UTF16(0x1F431), 0x200D, UTF16(0x1F464),
    };
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // after man + zwj + heart + zwj + man
    EXPECT_EQ(7, wb.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(7, wb.wordEnd());

    // after woman + zwj + heart + zwj + kiss mark + zwj + woman
    EXPECT_EQ(17, wb.next());
    EXPECT_EQ(7, wb.wordStart());
    EXPECT_EQ(17, wb.wordEnd());

    // after eye + zwj + left speech bubble
    EXPECT_EQ(22, wb.next());
    EXPECT_EQ(17, wb.wordStart());
    EXPECT_EQ(22, wb.wordEnd());

    // end
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(22, wb.wordStart());
    EXPECT_EQ(27, wb.wordEnd());
}
135
// An emoji followed by a Fitzpatrick modifier is expected to stay in one piece, including when
// a variation selector sits between the base and the modifier.
TEST(WordBreakerTest, emojiWithModifier) {
    uint16_t text[] = {
            // boy + type 1-2 fitzpatrick modifier
            UTF16(0x1F466), UTF16(0x1F3FB),
            // victory hand + emoji style + type 6 fitzpatrick modifier
            0x270C, 0xFE0F, UTF16(0x1F3FF),
    };
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // after boy + type 1-2 fitzpatrick modifier
    EXPECT_EQ(4, wb.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(4, wb.wordEnd());

    // end
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(4, wb.wordStart());
    EXPECT_EQ(8, wb.wordEnd());
}
153
// Should break between emojis: every emoji, together with a variation selector that directly
// follows it, is expected to form its own segment.
TEST(WordBreakerTest, unicode10Emoji) {
    uint16_t text[] = {
            // SLED + SLED
            UTF16(0x1F6F7), UTF16(0x1F6F7),
            // SLED + VS15 + SLED
            UTF16(0x1F6F7), 0xFE0E, UTF16(0x1F6F7),
            // WHITE SMILING FACE + SLED
            0x263A, UTF16(0x1F6F7),
            // WHITE SMILING FACE + VS16 + SLED
            0x263A, 0xFE0F, UTF16(0x1F6F7),
    };
    const Locale en("en");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // SLED
    EXPECT_EQ(2, wb.followingWithLocale(en, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(2, wb.wordEnd());

    // SLED
    EXPECT_EQ(4, wb.next());
    EXPECT_EQ(2, wb.wordStart());
    EXPECT_EQ(4, wb.wordEnd());

    // SLED + VS15
    EXPECT_EQ(7, wb.next());
    EXPECT_EQ(4, wb.wordStart());
    EXPECT_EQ(7, wb.wordEnd());

    // SLED
    EXPECT_EQ(9, wb.next());
    EXPECT_EQ(7, wb.wordStart());
    EXPECT_EQ(9, wb.wordEnd());

    // WHITE SMILING FACE
    EXPECT_EQ(10, wb.next());
    EXPECT_EQ(9, wb.wordStart());
    EXPECT_EQ(10, wb.wordEnd());

    // SLED
    EXPECT_EQ(12, wb.next());
    EXPECT_EQ(10, wb.wordStart());
    EXPECT_EQ(12, wb.wordEnd());

    // WHITE SMILING FACE + VS16
    EXPECT_EQ(14, wb.next());
    EXPECT_EQ(12, wb.wordStart());
    EXPECT_EQ(14, wb.wordEnd());

    // SLED
    EXPECT_EQ(16, wb.next());
    EXPECT_EQ(14, wb.wordStart());
    EXPECT_EQ(16, wb.wordEnd());
}
201
// Two consecutive U+1F3F4 code points (two UTF-16 code units each) are expected to be split
// exactly between them.
TEST(WordBreakerTest, flagsSequenceSingleFlag) {
    const std::string kFlag = "U+1F3F4";
    const std::string twoFlags = kFlag + " " + kFlag;

    const int kFlagLength = 2;  // UTF-16 code units per flag
    const size_t BUF_SIZE = kFlagLength * 2;

    uint16_t text[BUF_SIZE];
    size_t textLen;
    ParseUnicode(text, BUF_SIZE, twoFlags.c_str(), &textLen, nullptr);

    const Locale enUS("en-US");
    WordBreaker wb;
    wb.setText(text, textLen);
    EXPECT_EQ(0, wb.current());

    // end of the first flag
    EXPECT_EQ(kFlagLength, wb.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(kFlagLength, wb.wordEnd());

    // end of the second flag, which is also the end of the text
    EXPECT_EQ(static_cast<ssize_t>(textLen), wb.next());
    EXPECT_EQ(kFlagLength, wb.wordStart());
    EXPECT_EQ(kFlagLength * 2, wb.wordEnd());
}
224
// Two consecutive emoji tag sequences are expected to be split exactly between them, with no
// break inside either sequence.
TEST(WordBreakerTest, flagsSequence) {
    // U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F is emoji tag sequence for the flag
    // of Scotland.
    const std::string kFlagSequence = "U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F";
    const std::string twoSequences = kFlagSequence + " " + kFlagSequence;

    const int kFlagLength = 14;  // 7 supplementary code points, 2 UTF-16 code units each
    const size_t BUF_SIZE = kFlagLength * 2;

    uint16_t text[BUF_SIZE];
    size_t textLen;
    ParseUnicode(text, BUF_SIZE, twoSequences.c_str(), &textLen, nullptr);

    const Locale enUS("en-US");
    WordBreaker wb;
    wb.setText(text, textLen);
    EXPECT_EQ(0, wb.current());

    // end of the first flag sequence
    EXPECT_EQ(kFlagLength, wb.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(kFlagLength, wb.wordEnd());

    // end of the second flag sequence, which is also the end of the text
    EXPECT_EQ(static_cast<ssize_t>(textLen), wb.next());
    EXPECT_EQ(kFlagLength, wb.wordStart());
    EXPECT_EQ(kFlagLength * 2, wb.wordEnd());
}
249
// Leading and trailing punctuation is expected to stay outside the reported word range while
// the break positions are still after the space and at the end of the text.
TEST(WordBreakerTest, punct) {
    uint16_t text[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l', 'o', ',',
                       ' ',    'w',    'o', 'r', 'l', 'd', '!', '!'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First break: after "¡¡hello, ".
    EXPECT_EQ(9, wb.followingWithLocale(enUS, 0));
    EXPECT_EQ(2, wb.wordStart());  // "hello"
    EXPECT_EQ(7, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // Second break: end of the text.
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(9, wb.wordStart());  // "world"
    EXPECT_EQ(14, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
265
// An email address followed by a plain word. Breaks inside the address are expected to report
// an empty word range (wordStart >= wordEnd); the one in the middle of the address carries a
// break penalty of 1.
TEST(WordBreakerTest, email) {
    uint16_t text[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p',
                       'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // after "foo@example"
    EXPECT_EQ(11, wb.followingWithLocale(enUS, 0));
    // EXPECT_GE prints both offsets when the expectation fails.
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // after ".com "
    EXPECT_EQ(16, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // end
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(16, wb.wordStart());  // "x"
    EXPECT_EQ(17, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
283
// A "mailto:" URL followed by a plain word. Breaks inside the URL are expected to report an
// empty word range; all of them except the last one carry a break penalty of 1.
TEST(WordBreakerTest, mailto) {
    uint16_t text[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@', 'e',
                       'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // after "mailto:"
    EXPECT_EQ(7, wb.followingWithLocale(enUS, 0));
    // EXPECT_GE prints both offsets when the expectation fails.
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // after "foo@example"
    EXPECT_EQ(18, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // after ".com "
    EXPECT_EQ(23, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // end
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(23, wb.wordStart());  // "x"
    EXPECT_EQ(24, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
304
305 // The current logic always places a line break after a detected email address or URL
306 // and an immediately following non-ASCII character.
TEST(WordBreakerTest,emailNonAscii)307 TEST(WordBreakerTest, emailNonAscii) {
308 uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm',
309 'p', 'l', 'e', '.', 'c', 'o', 'm', 0x4E00};
310 WordBreaker breaker;
311 breaker.setText(buf, NELEM(buf));
312 EXPECT_EQ(0, breaker.current());
313 EXPECT_EQ(11, breaker.followingWithLocale(Locale("en-US"), 0)); // after "foo@example"
314 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
315 EXPECT_EQ(1, breaker.breakBadness());
316 EXPECT_EQ(15, breaker.next()); // after ".com"
317 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
318 EXPECT_EQ(0, breaker.breakBadness());
319 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
320 EXPECT_EQ(15, breaker.wordStart()); // "一"
321 EXPECT_EQ(16, breaker.wordEnd());
322 EXPECT_EQ(0, breaker.breakBadness());
323 }
324
// An email address whose last letter carries a combining mark (U+0303): the mark is expected to
// stay with the address, so the break after ".com" lands after the mark and the space.
TEST(WordBreakerTest, emailCombining) {
    uint16_t text[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p',
                       'l', 'e', '.', 'c', 'o', 'm', 0x0303, ' ', 'x'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // after "foo@example"
    EXPECT_EQ(11, wb.followingWithLocale(enUS, 0));
    // EXPECT_GE prints both offsets when the expectation fails.
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // after ".com̃ "
    EXPECT_EQ(17, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // end
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(17, wb.wordStart());  // "x"
    EXPECT_EQ(18, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
342
// A stand-alone '@' between spaces: the surrounding words are expected to break normally with
// no penalty, and the '@' segment itself reports an empty word range.
TEST(WordBreakerTest, lonelyAt) {
    uint16_t text[] = {'a', ' ', '@', ' ', 'b'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // after "a "
    EXPECT_EQ(2, wb.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, wb.wordStart());  // "a"
    EXPECT_EQ(1, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // after "@ "
    EXPECT_EQ(4, wb.next());
    // EXPECT_GE prints both offsets when the expectation fails.
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // end
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(4, wb.wordStart());  // "b"
    EXPECT_EQ(5, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
360
// An http URL followed by a plain word. Breaks inside the URL are expected to report an empty
// word range and a break penalty of 1; the break after the URL and its trailing space has no
// penalty.
TEST(WordBreakerTest, url) {
    uint16_t text[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a',
                       'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // after "http:"
    EXPECT_EQ(5, wb.followingWithLocale(enUS, 0));
    // EXPECT_GE prints both offsets when the expectation fails.
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // after "//"
    EXPECT_EQ(7, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // after "example"
    EXPECT_EQ(14, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // after ".com "
    EXPECT_EQ(19, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // end
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(19, wb.wordStart());  // "x"
    EXPECT_EQ(20, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
384
385 // Breaks according to section 14.12 of Chicago Manual of Style, *URLs or DOIs and line breaks*
TEST(WordBreakerTest,urlBreakChars)386 TEST(WordBreakerTest, urlBreakChars) {
387 uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '.', 'b', '/',
388 '~', 'c', ',', 'd', '-', 'e', '?', 'f', '=', 'g', '&',
389 'h', '#', 'i', '%', 'j', '_', 'k', '/', 'l'};
390 WordBreaker breaker;
391 breaker.setText(buf, NELEM(buf));
392 EXPECT_EQ(0, breaker.current());
393 EXPECT_EQ(5, breaker.followingWithLocale(Locale("en-US"), 0)); // after "http:"
394 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
395 EXPECT_EQ(1, breaker.breakBadness());
396 EXPECT_EQ(7, breaker.next()); // after "//"
397 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
398 EXPECT_EQ(1, breaker.breakBadness());
399 EXPECT_EQ(8, breaker.next()); // after "a"
400 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
401 EXPECT_EQ(1, breaker.breakBadness());
402 EXPECT_EQ(10, breaker.next()); // after ".b"
403 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
404 EXPECT_EQ(1, breaker.breakBadness());
405 EXPECT_EQ(11, breaker.next()); // after "/"
406 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
407 EXPECT_EQ(1, breaker.breakBadness());
408 EXPECT_EQ(13, breaker.next()); // after "~c"
409 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
410 EXPECT_EQ(1, breaker.breakBadness());
411 EXPECT_EQ(15, breaker.next()); // after ",d"
412 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
413 EXPECT_EQ(1, breaker.breakBadness());
414 EXPECT_EQ(17, breaker.next()); // after "-e"
415 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
416 EXPECT_EQ(1, breaker.breakBadness());
417 EXPECT_EQ(19, breaker.next()); // after "?f"
418 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
419 EXPECT_EQ(1, breaker.breakBadness());
420 EXPECT_EQ(20, breaker.next()); // after "="
421 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
422 EXPECT_EQ(1, breaker.breakBadness());
423 EXPECT_EQ(21, breaker.next()); // after "g"
424 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
425 EXPECT_EQ(1, breaker.breakBadness());
426 EXPECT_EQ(22, breaker.next()); // after "&"
427 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
428 EXPECT_EQ(1, breaker.breakBadness());
429 EXPECT_EQ(23, breaker.next()); // after "h"
430 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
431 EXPECT_EQ(1, breaker.breakBadness());
432 EXPECT_EQ(25, breaker.next()); // after "#i"
433 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
434 EXPECT_EQ(1, breaker.breakBadness());
435 EXPECT_EQ(27, breaker.next()); // after "%j"
436 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
437 EXPECT_EQ(1, breaker.breakBadness());
438 EXPECT_EQ(29, breaker.next()); // after "_k"
439 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
440 EXPECT_EQ(1, breaker.breakBadness());
441 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
442 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
443 EXPECT_EQ(0, breaker.breakBadness());
444 }
445
// Inside a URL, a hyphen directly followed by '/' is expected not to introduce a break: after
// "a" the next reported break is the end of the text.
TEST(WordBreakerTest, urlNoHyphenBreak) {
    uint16_t text[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // after "http:"
    EXPECT_EQ(5, wb.followingWithLocale(enUS, 0));
    // EXPECT_GE prints both offsets when the expectation fails.
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // after "//"
    EXPECT_EQ(7, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // after "a"
    EXPECT_EQ(8, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // end
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
}
460
// A URL whose last character is '/': after "a" the next reported break is expected to be the
// end of the text, with an empty word range at every step.
TEST(WordBreakerTest, urlEndsWithSlash) {
    uint16_t text[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // after "http:"
    EXPECT_EQ(5, wb.followingWithLocale(enUS, 0));
    // EXPECT_GE prints both offsets when the expectation fails.
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // after "//"
    EXPECT_EQ(7, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // after "a"
    EXPECT_EQ(8, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // end
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
}
475
// An address-like string that starts with '/': the only expected break is the end of the text,
// reported with an empty word range.
TEST(WordBreakerTest, emailStartsWithSlash) {
    uint16_t text[] = {'/', 'a', '@', 'b'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // end
    EXPECT_EQ(textEnd, wb.followingWithLocale(enUS, 0));
    // EXPECT_GE prints both offsets when the expectation fails.
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
}
484
// Calling followingWithLocale() again while the iteration is positioned at or inside a URL is
// expected to yield the same break points as uninterrupted iteration with next().
TEST(WordBreakerTest, setLocaleInsideUrl) {
    std::vector<uint16_t> text = utf8ToUtf16("Hello http://abc/d.html World");
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text.data(), text.size());
    EXPECT_EQ(0, wb.current());

    // after "Hello "
    EXPECT_EQ(6, wb.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(5, wb.wordEnd());

    EXPECT_EQ(6, wb.current());
    EXPECT_EQ(11, wb.next());  // after "http:"

    // Restart from the start of the URL. It should return the same previous break point.
    EXPECT_EQ(11, wb.followingWithLocale(enUS, 6));  // after "http:"
    // EXPECT_GE prints both offsets when the expectation fails.
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    EXPECT_EQ(13, wb.next());  // after "//"
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // Restart from middle point of the URL. It should return the same previous break point.
    EXPECT_EQ(13, wb.followingWithLocale(enUS, 12));  // after "//"
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    EXPECT_EQ(16, wb.next());  // after "abc"
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    EXPECT_EQ(18, wb.next());  // after "/d"
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    EXPECT_EQ(24, wb.next());  // after ".html "
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    EXPECT_EQ(29, wb.next());  // after "World"
    EXPECT_EQ(24, wb.wordStart());
    EXPECT_EQ(29, wb.wordEnd());
}
518
519 // b/68669534
TEST(WordBreakerTest,spaceAfterSpace)520 TEST(WordBreakerTest, spaceAfterSpace) {
521 const std::vector<uint16_t> SPACES = {
522 '\t', // TAB
523 0x1680, // OGHAM SPACE MARK
524 0x3000, // IDEOGRAPHIC SPACE
525 };
526
527 constexpr uint16_t CHAR_SPACE = 0x0020;
528
529 for (uint16_t sp : SPACES) {
530 char msg[64] = {};
531 snprintf(msg, sizeof(msg), "Test Space: U+%04X", sp);
532 SCOPED_TRACE(msg);
533
534 std::vector<uint16_t> buf = {'a', CHAR_SPACE, sp, 'b'};
535 WordBreaker breaker;
536 breaker.setText(buf.data(), buf.size());
537
538 EXPECT_EQ(0, breaker.current());
539 EXPECT_EQ(2, breaker.followingWithLocale(Locale("en-US"), 0)); // after "a "
540 EXPECT_EQ(0, breaker.wordStart());
541 EXPECT_EQ(1, breaker.wordEnd());
542
543 EXPECT_EQ(2, breaker.current());
544 EXPECT_EQ(3, breaker.next()); // after CHAR_SPACE character.
545 EXPECT_EQ(2, breaker.wordStart());
546 EXPECT_EQ(2, breaker.wordEnd());
547
548 EXPECT_EQ(3, breaker.current());
549 EXPECT_EQ(4, breaker.next()); // after sp character.
550 EXPECT_EQ(3, breaker.wordStart());
551 EXPECT_EQ(4, breaker.wordEnd());
552 }
553 }
554
555 class TestableICULineBreakerPoolImpl : public ICULineBreakerPoolImpl {
556 public:
TestableICULineBreakerPoolImpl()557 TestableICULineBreakerPoolImpl() : ICULineBreakerPoolImpl() {}
558
559 using ICULineBreakerPoolImpl::getPoolSize;
560 using ICULineBreakerPoolImpl::MAX_POOL_SIZE;
561 };
562
// Acquiring repeatedly without releasing must hand out a distinct, non-null breaker every time,
// even for the same locale; the locale id is expected to differ only between locales.
TEST(WordBreakerTest, LineBreakerPool_acquire_without_release) {
    TestableICULineBreakerPoolImpl pool;

    const Locale enUS("en-Latn-US");
    const Locale frFR("fr-Latn-FR");

    // All following three breakers must be the different instances.
    ICULineBreakerPool::Slot enSlotA = pool.acquire(enUS);
    ICULineBreakerPool::Slot enSlotB = pool.acquire(enUS);
    ICULineBreakerPool::Slot frSlot = pool.acquire(frFR);

    // Every slot holds a breaker.
    EXPECT_NE(nullptr, enSlotA.breaker.get());
    EXPECT_NE(nullptr, enSlotB.breaker.get());
    EXPECT_NE(nullptr, frSlot.breaker.get());

    // No two slots share a breaker.
    EXPECT_NE(enSlotA.breaker.get(), enSlotB.breaker.get());
    EXPECT_NE(enSlotA.breaker.get(), frSlot.breaker.get());
    EXPECT_NE(enSlotB.breaker.get(), frSlot.breaker.get());

    // Same locale, same id; different locale, different id.
    EXPECT_EQ(enSlotA.localeId, enSlotB.localeId);
    EXPECT_NE(enSlotA.localeId, frSlot.localeId);
    EXPECT_NE(enSlotB.localeId, frSlot.localeId);
}
586
// A breaker that was released must be handed out again by the next acquire() for the same
// locale, and must not be handed out for a different locale.
TEST(WordBreakerTest, LineBreakerPool_acquire_with_release) {
    TestableICULineBreakerPoolImpl pool;

    const Locale enUS("en-Latn-US");
    const Locale frFR("fr-Latn-FR");

    // Acquire one en-US breaker and remember its identity before giving it back.
    ICULineBreakerPool::Slot enSlot = pool.acquire(enUS);

    uint64_t releasedLocaleId = enSlot.localeId;
    UBreakIterator* releasedBreaker = enSlot.breaker.get();

    pool.release(std::move(enSlot));
    // The released slot is expected to be left without a breaker.
    EXPECT_EQ(nullptr, enSlot.breaker.get());

    // acquire must return a different instance if the locale is different.
    ICULineBreakerPool::Slot frSlot = pool.acquire(frFR);
    EXPECT_NE(releasedBreaker, frSlot.breaker.get());
    EXPECT_NE(releasedLocaleId, frSlot.localeId);

    // acquire must return the same instance as released before if the locale is the same.
    ICULineBreakerPool::Slot enSlotAgain = pool.acquire(enUS);
    EXPECT_EQ(releasedBreaker, enSlotAgain.breaker.get());
    EXPECT_EQ(releasedLocaleId, enSlotAgain.localeId);
}
612
// Releasing more breakers than MAX_POOL_SIZE: the pool size is expected to grow by one per
// release until it reaches MAX_POOL_SIZE and to stay there for every further release.
TEST(WordBreakerTest, LineBreakerPool_exceeds_pool_size) {
    const size_t kCapacity = TestableICULineBreakerPoolImpl::MAX_POOL_SIZE;
    const size_t kSlotCount = kCapacity * 2;
    TestableICULineBreakerPoolImpl pool;

    const Locale enUS("en-Latn-US");

    ICULineBreakerPool::Slot slots[kSlotCount];

    // Acquire twice as many breakers as the pool can hold. Nothing has been released yet, so
    // the pool size is expected to stay at zero throughout.
    for (size_t i = 0; i < kSlotCount; ++i) {
        slots[i] = pool.acquire(enUS);
        EXPECT_EQ(0U, pool.getPoolSize());
    }

    // Release the first half: each release is expected to add one entry, filling the pool.
    for (size_t i = 0; i < kCapacity; ++i) {
        pool.release(std::move(slots[i]));
        EXPECT_EQ(i + 1, pool.getPoolSize());
    }

    // Release the second half: the pool is full, so its size must no longer change.
    for (size_t i = kCapacity; i < kSlotCount; ++i) {
        pool.release(std::move(slots[i]));
        EXPECT_EQ(kCapacity, pool.getPoolSize());
    }
}
637
638 } // namespace minikin
639