1 /*
2  * Copyright (C) 2014 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "minikin/GraphemeBreak.h"
18 
19 #include <algorithm>
20 #include <cstdint>
21 
22 #include <android-base/macros.h>
23 #include <unicode/uchar.h>
24 #include <unicode/utf16.h>
25 
26 #include "minikin/Emoji.h"
27 
28 namespace minikin {
29 
tailoredGraphemeClusterBreak(uint32_t c)30 int32_t tailoredGraphemeClusterBreak(uint32_t c) {
31     // Characters defined as Control that we want to treat them as Extend.
32     // These are curated manually.
33     if (c == 0x00AD                      // SHY
34         || c == 0x061C                   // ALM
35         || c == 0x180E                   // MONGOLIAN VOWEL SEPARATOR
36         || c == 0x200B                   // ZWSP
37         || c == 0x200E                   // LRM
38         || c == 0x200F                   // RLM
39         || (0x202A <= c && c <= 0x202E)  // LRE, RLE, PDF, LRO, RLO
40         || ((c | 0xF) == 0x206F)         // WJ, invisible math operators, LRI, RLI, FSI, PDI,
41                                          // and the deprecated invisible format controls
42         || c == 0xFEFF                   // BOM
43         || ((c | 0x7F) == 0xE007F))      // recently undeprecated tag characters in Plane 14
44         return U_GCB_EXTEND;
45     // THAI CHARACTER SARA AM is treated as a normal letter by most other implementations: they
46     // allow a grapheme break before it.
47     else if (c == 0x0E33)
48         return U_GCB_OTHER;
49     else
50         return u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
51 }
52 
53 // Returns true for all characters whose IndicSyllabicCategory is Pure_Killer.
54 // From http://www.unicode.org/Public/9.0.0/ucd/IndicSyllabicCategory.txt
isPureKiller(uint32_t c)55 bool isPureKiller(uint32_t c) {
56     return (c == 0x0E3A || c == 0x0E4E || c == 0x0F84 || c == 0x103A || c == 0x1714 ||
57             c == 0x1734 || c == 0x17D1 || c == 0x1BAA || c == 0x1BF2 || c == 0x1BF3 ||
58             c == 0xA806 || c == 0xA953 || c == 0xABED || c == 0x11134 || c == 0x112EA ||
59             c == 0x1172B);
60 }
61 
isGraphemeBreak(const float * advances,const uint16_t * buf,size_t start,size_t count,const size_t offset)62 bool GraphemeBreak::isGraphemeBreak(const float* advances, const uint16_t* buf, size_t start,
63                                     size_t count, const size_t offset) {
64     // This implementation closely follows Unicode Standard Annex #29 on
65     // Unicode Text Segmentation (http://www.unicode.org/reports/tr29/),
66     // implementing a tailored version of extended grapheme clusters.
67     // The GB rules refer to section 3.1.1, Grapheme Cluster Boundary Rules.
68 
69     // Rule GB1, sot ÷; Rule GB2, ÷ eot
70     if (offset <= start || offset >= start + count) {
71         return true;
72     }
73     if (U16_IS_TRAIL(buf[offset])) {
74         // Don't break a surrogate pair, but a lonely trailing surrogate pair is a break
75         return !U16_IS_LEAD(buf[offset - 1]);
76     }
77     uint32_t c1 = 0;
78     uint32_t c2 = 0;
79     size_t offset_back = offset;
80     size_t offset_forward = offset;
81     U16_PREV(buf, start, offset_back, c1);
82     U16_NEXT(buf, offset_forward, start + count, c2);
83     int32_t p1 = tailoredGraphemeClusterBreak(c1);
84     int32_t p2 = tailoredGraphemeClusterBreak(c2);
85     // Rule GB3, CR x LF
86     if (p1 == U_GCB_CR && p2 == U_GCB_LF) {
87         return false;
88     }
89     // Rule GB4, (Control | CR | LF) ÷
90     if (p1 == U_GCB_CONTROL || p1 == U_GCB_CR || p1 == U_GCB_LF) {
91         return true;
92     }
93     // Rule GB5, ÷ (Control | CR | LF)
94     if (p2 == U_GCB_CONTROL || p2 == U_GCB_CR || p2 == U_GCB_LF) {
95         return true;
96     }
97     // Rule GB6, L x ( L | V | LV | LVT )
98     if (p1 == U_GCB_L && (p2 == U_GCB_L || p2 == U_GCB_V || p2 == U_GCB_LV || p2 == U_GCB_LVT)) {
99         return false;
100     }
101     // Rule GB7, ( LV | V ) x ( V | T )
102     if ((p1 == U_GCB_LV || p1 == U_GCB_V) && (p2 == U_GCB_V || p2 == U_GCB_T)) {
103         return false;
104     }
105     // Rule GB8, ( LVT | T ) x T
106     if ((p1 == U_GCB_LVT || p1 == U_GCB_T) && p2 == U_GCB_T) {
107         return false;
108     }
109 
110     // This is used to decide font-dependent grapheme clusters. If we don't have the advance
111     // information, we become conservative in grapheme breaking and assume that it has no advance.
112     const bool c2_has_advance = (advances != nullptr && advances[offset - start] != 0.0);
113 
114     // All the following rules are font-dependent, in the way that if we know c2 has an advance,
115     // we definitely know that it cannot form a grapheme with the character(s) before it. So we
116     // make the decision in favor a grapheme break early.
117     if (c2_has_advance) {
118         return true;
119     }
120 
121     // Rule GB9, x (Extend | ZWJ); Rule GB9a, x SpacingMark; Rule GB9b, Prepend x
122     if (p2 == U_GCB_EXTEND || p2 == U_GCB_ZWJ || p2 == U_GCB_SPACING_MARK || p1 == U_GCB_PREPEND) {
123         return false;
124     }
125 
126     // Tailored version of Rule GB11
127     // \p{Extended_Pictographic} Extend* ZWJ x \p{Extended_Pictographic}
128     if (offset_back > start && p1 == U_GCB_ZWJ &&
129         u_hasBinaryProperty(c2, UCHAR_EXTENDED_PICTOGRAPHIC)) {
130         uint32_t c0 = 0;
131         size_t offset_backback = offset_back;
132         int32_t p0 = 0;
133 
134         U16_PREV(buf, start, offset_backback, c0);
135         p0 = tailoredGraphemeClusterBreak(c0);
136 
137         while (p0 == U_GCB_EXTEND && offset_backback > start) {
138             U16_PREV(buf, start, offset_backback, c0);
139             p0 = tailoredGraphemeClusterBreak(c0);
140         }
141 
142         if (u_hasBinaryProperty(c0, UCHAR_EXTENDED_PICTOGRAPHIC)) {
143             return false;
144         }
145     }
146 
147     // Tailored version of Rule GB12 and Rule GB13 that look at even-odd cases.
148     // sot   (RI RI)*  RI x RI
149     // [^RI] (RI RI)*  RI x RI
150     //
151     // If we have font information, we have already broken the cluster if and only if the second
152     // character had no advance, which means a ligature was formed. If we don't, we look back like
153     // UAX #29 recommends, but only up to 1000 code units.
154     if (p1 == U_GCB_REGIONAL_INDICATOR && p2 == U_GCB_REGIONAL_INDICATOR) {
155         if (advances != nullptr) {
156             // We have advances information. But if we are here, we already know c2 has no advance.
157             // So we should definitely disallow a break.
158             return false;
159         } else {
160             // Look at up to 1000 code units.
161             const size_t lookback_barrier = std::max((ssize_t)start, (ssize_t)offset_back - 1000);
162             size_t offset_backback = offset_back;
163             while (offset_backback > lookback_barrier) {
164                 uint32_t c0 = 0;
165                 U16_PREV(buf, lookback_barrier, offset_backback, c0);
166                 if (tailoredGraphemeClusterBreak(c0) != U_GCB_REGIONAL_INDICATOR) {
167                     offset_backback += U16_LENGTH(c0);
168                     break;
169                 }
170             }
171             // The number 4 comes from the number of code units in a whole flag.
172             return (offset - offset_backback) % 4 == 0;
173         }
174     }
175     // Cluster Indic syllables together (tailoring of UAX #29).
176     // Immediately after each virama (that is not just a pure killer) followed by a letter, we
177     // disallow grapheme breaks (if we are here, we don't know about advances, or we already know
178     // that c2 has no advance).
179     if (u_getIntPropertyValue(c1, UCHAR_CANONICAL_COMBINING_CLASS) == 9  // virama
180         && !isPureKiller(c1) &&
181         u_getIntPropertyValue(c2, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER) {
182         return false;
183     }
184     // Rule GB999, Any ÷ Any
185     return true;
186 }
187 
getTextRunCursor(const float * advances,const uint16_t * buf,size_t start,size_t count,size_t offset,MoveOpt opt)188 size_t GraphemeBreak::getTextRunCursor(const float* advances, const uint16_t* buf, size_t start,
189                                        size_t count, size_t offset, MoveOpt opt) {
190     switch (opt) {
191         case AFTER:
192             if (offset < start + count) {
193                 offset++;
194             }
195             FALLTHROUGH_INTENDED;
196         case AT_OR_AFTER:
197             while (!isGraphemeBreak(advances, buf, start, count, offset)) {
198                 offset++;
199             }
200             break;
201         case BEFORE:
202             if (offset > start) {
203                 offset--;
204             }
205             FALLTHROUGH_INTENDED;
206         case AT_OR_BEFORE:
207             while (!isGraphemeBreak(advances, buf, start, count, offset)) {
208                 offset--;
209             }
210             break;
211         case AT:
212             if (!isGraphemeBreak(advances, buf, start, count, offset)) {
213                 offset = (size_t)-1;
214             }
215             break;
216     }
217     return offset;
218 }
219 
220 }  // namespace minikin
221