1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "WordBreaker.h"
18 
19 #include <list>
20 #include <map>
21 
22 #include <unicode/ubrk.h>
23 #include <unicode/uchar.h>
24 #include <unicode/utf16.h>
25 
26 #include "minikin/Emoji.h"
27 #include "minikin/Hyphenator.h"
28 
29 #include "Locale.h"
30 #include "MinikinInternal.h"
31 
32 namespace minikin {
33 
34 namespace {
createNewIterator(const Locale & locale)35 static UBreakIterator* createNewIterator(const Locale& locale) {
36     // TODO: handle failure status
37     UErrorCode status = U_ZERO_ERROR;
38     char localeID[ULOC_FULLNAME_CAPACITY] = {};
39     uloc_forLanguageTag(locale.getString().c_str(), localeID, ULOC_FULLNAME_CAPACITY, nullptr,
40                         &status);
41     return ubrk_open(UBreakIteratorType::UBRK_LINE, localeID, nullptr, 0, &status);
42 }
43 }  // namespace
44 
acquire(const Locale & locale)45 ICULineBreakerPool::Slot ICULineBreakerPoolImpl::acquire(const Locale& locale) {
46     const uint64_t id = locale.getIdentifier();
47     std::lock_guard<std::mutex> lock(mMutex);
48     for (auto i = mPool.begin(); i != mPool.end(); i++) {
49         if (i->localeId == id) {
50             Slot slot = std::move(*i);
51             mPool.erase(i);
52             return slot;
53         }
54     }
55 
56     // Not found in pool. Create new one.
57     return {id, IcuUbrkUniquePtr(createNewIterator(locale))};
58 }
59 
release(ICULineBreakerPool::Slot && slot)60 void ICULineBreakerPoolImpl::release(ICULineBreakerPool::Slot&& slot) {
61     if (slot.breaker.get() == nullptr) {
62         return;  // Already released slot. Do nothing.
63     }
64     std::lock_guard<std::mutex> lock(mMutex);
65     if (mPool.size() >= MAX_POOL_SIZE) {
66         // Pool is full. Move to local variable, so that the given slot will be released when the
67         // variable leaves the scope.
68         Slot localSlot = std::move(slot);
69         return;
70     }
71     mPool.push_front(std::move(slot));
72 }
73 
WordBreaker()74 WordBreaker::WordBreaker() : mPool(&ICULineBreakerPoolImpl::getInstance()) {}
75 
WordBreaker(ICULineBreakerPool * pool)76 WordBreaker::WordBreaker(ICULineBreakerPool* pool) : mPool(pool) {}
77 
followingWithLocale(const Locale & locale,size_t from)78 ssize_t WordBreaker::followingWithLocale(const Locale& locale, size_t from) {
79     mIcuBreaker = mPool->acquire(locale);
80     UErrorCode status = U_ZERO_ERROR;
81     MINIKIN_ASSERT(mText != nullptr, "setText must be called first");
82     // TODO: handle failure status
83     ubrk_setUText(mIcuBreaker.breaker.get(), &mUText, &status);
84     if (mInEmailOrUrl) {
85         // Note:
86         // Don't reset mCurrent, mLast, or mScanOffset for keeping email/URL context.
87         // The email/URL detection doesn't support following() functionality, so that we can't
88         // restart from the specific position. This means following() can not be supported in
89         // general, but keeping old email/URL context works for LineBreaker since it just wants to
90         // re-calculate the next break point with the new locale.
91     } else {
92         mCurrent = mLast = mScanOffset = from;
93         next();
94     }
95     return mCurrent;
96 }
97 
setText(const uint16_t * data,size_t size)98 void WordBreaker::setText(const uint16_t* data, size_t size) {
99     mText = data;
100     mTextSize = size;
101     mLast = 0;
102     mCurrent = 0;
103     mScanOffset = 0;
104     mInEmailOrUrl = false;
105     UErrorCode status = U_ZERO_ERROR;
106     utext_openUChars(&mUText, reinterpret_cast<const UChar*>(data), size, &status);
107 }
108 
current() const109 ssize_t WordBreaker::current() const {
110     return mCurrent;
111 }
112 
113 /**
114  * Determine whether a line break at position i within the buffer buf is valid. This
115  * represents customization beyond the ICU behavior, because plain ICU provides some
116  * line break opportunities that we don't want.
117  **/
isValidBreak(const uint16_t * buf,size_t bufEnd,int32_t i)118 static bool isValidBreak(const uint16_t* buf, size_t bufEnd, int32_t i) {
119     const size_t position = static_cast<size_t>(i);
120     if (i == UBRK_DONE || position == bufEnd) {
121         // If the iterator reaches the end, treat as break.
122         return true;
123     }
124     uint32_t codePoint;
125     size_t prev_offset = position;
126     U16_PREV(buf, 0, prev_offset, codePoint);
127     // Do not break on hard or soft hyphens. These are handled by automatic hyphenation.
128     if (Hyphenator::isLineBreakingHyphen(codePoint) || codePoint == CHAR_SOFT_HYPHEN) {
129         return false;
130     }
131     // For Myanmar kinzi sequences, created by <consonant, ASAT, VIRAMA, consonant>. This is to go
132     // around a bug in ICU line breaking: http://bugs.icu-project.org/trac/ticket/12561. To avoid
133     // too much looking around in the strings, we simply avoid breaking after any Myanmar virama,
134     // where no line break could be imagined, since the Myanmar virama is a pure stacker.
135     if (codePoint == 0x1039) {  // MYANMAR SIGN VIRAMA
136         return false;
137     }
138 
139     uint32_t next_codepoint;
140     size_t next_offset = position;
141     U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
142 
143     // Rule LB8 for Emoji ZWJ sequences. We need to do this ourselves since we may have fresher
144     // emoji data than ICU does.
145     if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) {
146         return false;
147     }
148 
149     // Rule LB30b. We need to this ourselves since we may have fresher emoji data than ICU does.
150     if (isEmojiModifier(next_codepoint)) {
151         if (codePoint == 0xFE0F && prev_offset > 0) {
152             // skip over emoji variation selector
153             U16_PREV(buf, 0, prev_offset, codePoint);
154         }
155         if (isEmojiBase(codePoint)) {
156             return false;
157         }
158     }
159     return true;
160 }
161 
162 // Customized iteratorNext that takes care of both resets and our modifications
163 // to ICU's behavior.
iteratorNext()164 int32_t WordBreaker::iteratorNext() {
165     int32_t result = ubrk_following(mIcuBreaker.breaker.get(), mCurrent);
166     while (!isValidBreak(mText, mTextSize, result)) {
167         result = ubrk_next(mIcuBreaker.breaker.get());
168     }
169     return result;
170 }
171 
// Returns true for the characters after which a break is allowed inside URLs and email
// addresses (':', '=' and '&'), following the Chicago Manual of Style recommendation.
static bool breakAfter(uint16_t c) {
    switch (c) {
        case ':':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
176 
// Returns true for the characters before which a break is allowed inside URLs and email
// addresses, following the Chicago Manual of Style recommendation:
// '~' '.' ',' '-' '_' '?' '#' '%' '=' '&'.
static bool breakBefore(uint16_t c) {
    switch (c) {
        case '~':
        case '.':
        case ',':
        case '-':
        case '_':
        case '?':
        case '#':
        case '%':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
182 
// States of the scanner used to recognize an email address or URL.
// SAW_COLON, SAW_COLON_SLASH and SAW_COLON_SLASH_SLASH must stay consecutive: the scanner moves
// from one to the next by adding 1 to the current state.
enum ScanState {
    START = 0,                  // nothing recognized so far
    SAW_AT = 1,                 // saw '@'
    SAW_COLON = 2,              // saw ':'
    SAW_COLON_SLASH = 3,        // saw ":/"
    SAW_COLON_SLASH_SLASH = 4,  // saw "://"
};
190 
detectEmailOrUrl()191 void WordBreaker::detectEmailOrUrl() {
192     // scan forward from current ICU position for email address or URL
193     if (mLast >= mScanOffset) {
194         ScanState state = START;
195         size_t i;
196         for (i = mLast; i < mTextSize; i++) {
197             uint16_t c = mText[i];
198             // scan only ASCII characters, stop at space
199             if (!(' ' < c && c <= 0x007E)) {
200                 break;
201             }
202             if (state == START && c == '@') {
203                 state = SAW_AT;
204             } else if (state == START && c == ':') {
205                 state = SAW_COLON;
206             } else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
207                 if (c == '/') {
208                     state = static_cast<ScanState>((int)state + 1);  // next state adds a slash
209                 } else {
210                     state = START;
211                 }
212             }
213         }
214         if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
215             if (!ubrk_isBoundary(mIcuBreaker.breaker.get(), i)) {
216                 // If there are combining marks or such at the end of the URL or the email address,
217                 // consider them a part of the URL or the email, and skip to the next actual
218                 // boundary.
219                 i = ubrk_following(mIcuBreaker.breaker.get(), i);
220             }
221             mInEmailOrUrl = true;
222         } else {
223             mInEmailOrUrl = false;
224         }
225         mScanOffset = i;
226     }
227 }
228 
findNextBreakInEmailOrUrl()229 ssize_t WordBreaker::findNextBreakInEmailOrUrl() {
230     // special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.)
231     uint16_t lastChar = mText[mLast];
232     ssize_t i;
233     for (i = mLast + 1; i < mScanOffset; i++) {
234         if (breakAfter(lastChar)) {
235             break;
236         }
237         // break after double slash
238         if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') {
239             break;
240         }
241         const uint16_t thisChar = mText[i];
242         // never break after hyphen
243         if (lastChar != '-') {
244             if (breakBefore(thisChar)) {
245                 break;
246             }
247             // break before single slash
248             if (thisChar == '/' && lastChar != '/' &&
249                 !(i + 1 < mScanOffset && mText[i + 1] == '/')) {
250                 break;
251             }
252         }
253         lastChar = thisChar;
254     }
255     return i;
256 }
257 
next()258 ssize_t WordBreaker::next() {
259     mLast = mCurrent;
260 
261     detectEmailOrUrl();
262     if (mInEmailOrUrl) {
263         mCurrent = findNextBreakInEmailOrUrl();
264     } else {  // Business as usual
265         mCurrent = (ssize_t)iteratorNext();
266     }
267     return mCurrent;
268 }
269 
wordStart() const270 ssize_t WordBreaker::wordStart() const {
271     if (mInEmailOrUrl) {
272         return mLast;
273     }
274     ssize_t result = mLast;
275     while (result < mCurrent) {
276         UChar32 c;
277         ssize_t ix = result;
278         U16_NEXT(mText, ix, mCurrent, c);
279         const int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
280         // strip leading punctuation, defined as OP and QU line breaking classes,
281         // see UAX #14
282         if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
283             break;
284         }
285         result = ix;
286     }
287     return result;
288 }
289 
wordEnd() const290 ssize_t WordBreaker::wordEnd() const {
291     if (mInEmailOrUrl) {
292         return mLast;
293     }
294     ssize_t result = mCurrent;
295     while (result > mLast) {
296         UChar32 c;
297         ssize_t ix = result;
298         U16_PREV(mText, mLast, ix, c);
299         const int32_t gc_mask = U_GET_GC_MASK(c);
300         // strip trailing spaces, punctuation and control characters
301         if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK | U_GC_CC_MASK)) == 0) {
302             break;
303         }
304         result = ix;
305     }
306     return result;
307 }
308 
breakBadness() const309 int WordBreaker::breakBadness() const {
310     return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0;
311 }
312 
finish()313 void WordBreaker::finish() {
314     mText = nullptr;
315     // Note: calling utext_close multiply is safe
316     utext_close(&mUText);
317     mPool->release(std::move(mIcuBreaker));
318 }
319 
320 }  // namespace minikin
321