1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "Locale.h"
18 
19 #include <algorithm>
20 
21 #include <hb.h>
22 
23 #include "minikin/LocaleList.h"
24 
25 #include "LocaleListCache.h"
26 #include "MinikinInternal.h"
27 #include "StringPiece.h"
28 
29 namespace minikin {
30 
// Mask selecting the low five bits of a value (0x1f). The unpack helpers in this file use it to
// extract one 5-bit field from a packed code.
constexpr uint32_t FIVE_BITS = (1u << 5) - 1u;
32 
registerLocaleList(const std::string & locales)33 uint32_t registerLocaleList(const std::string& locales) {
34     return LocaleListCache::getId(locales);
35 }
36 
// Returns true when buf begins with the given subtag and the match ends on a subtag boundary.
//
// buf/bufLen:       text to inspect; only the first bufLen bytes are considered.
// subtag/subtagLen: expected subtag, e.g. "loose" with length 5.
//
// A boundary is either the end of the buffer (bufLen == subtagLen) or one of the characters
// '\0', '-' or '_' directly after the matched text. This prevents "looser" from matching "loose".
static bool isSubtag(const char* buf, size_t bufLen, const char* subtag, size_t subtagLen) {
    const bool startsWithSubtag = bufLen >= subtagLen && strncmp(buf, subtag, subtagLen) == 0;
    if (!startsWithSubtag) {
        return false;
    }
    if (bufLen == subtagLen) {
        return true;  // The subtag is the whole buffer.
    }
    switch (buf[subtagLen]) {
        case '\0':
        case '-':
        case '_':
            return true;
        default:
            return false;
    }
}
49 
50 // Pack the three letter code into 15 bits and stored to 16 bit integer. The highest bit is 0.
51 // For the region code, the letters must be all digits in three letter case, so the number of
52 // possible values are 10. For the language code, the letters must be all small alphabets, so the
53 // number of possible values are 26. Thus, 5 bits are sufficient for each case and we can pack the
54 // three letter language code or region code to 15 bits.
55 //
56 // In case of two letter code, use fullbit(0x1f) for the first letter instead.
packLanguageOrRegion(const StringPiece & in,uint8_t twoLetterBase,uint8_t threeLetterBase)57 static uint16_t packLanguageOrRegion(const StringPiece& in, uint8_t twoLetterBase,
58                                      uint8_t threeLetterBase) {
59     if (in.length() == 2) {
60         return 0x7c00u |  // 0x1fu << 10
61                (uint16_t)(in[0] - twoLetterBase) << 5 | (uint16_t)(in[1] - twoLetterBase);
62     } else {
63         return ((uint16_t)(in[0] - threeLetterBase) << 10) |
64                (uint16_t)(in[1] - threeLetterBase) << 5 | (uint16_t)(in[2] - threeLetterBase);
65     }
66 }
67 
unpackLanguageOrRegion(uint16_t in,char * out,uint8_t twoLetterBase,uint8_t threeLetterBase)68 static size_t unpackLanguageOrRegion(uint16_t in, char* out, uint8_t twoLetterBase,
69                                      uint8_t threeLetterBase) {
70     uint8_t first = (in >> 10) & FIVE_BITS;
71     uint8_t second = (in >> 5) & FIVE_BITS;
72     uint8_t third = in & FIVE_BITS;
73 
74     if (first == 0x1f) {
75         out[0] = second + twoLetterBase;
76         out[1] = third + twoLetterBase;
77         return 2;
78     } else {
79         out[0] = first + threeLetterBase;
80         out[1] = second + threeLetterBase;
81         out[2] = third + threeLetterBase;
82         return 3;
83     }
84 }
85 
packLanguage(const StringPiece & in)86 static uint16_t packLanguage(const StringPiece& in) {
87     return packLanguageOrRegion(in, 'a', 'a');
88 }
89 
unpackLanguage(uint16_t in,char * out)90 static size_t unpackLanguage(uint16_t in, char* out) {
91     return unpackLanguageOrRegion(in, out, 'a', 'a');
92 }
93 
// Packs a four-letter script code into four consecutive 5-bit fields (bits 19..0).
// c1 is stored as an offset from 'A'; c2, c3 and c4 are stored as offsets from 'a'.
// The inputs are not range-checked or masked here.
constexpr uint32_t packScript(char c1, char c2, char c3, char c4) {
    constexpr char kUppercaseBase = 'A';
    constexpr char kLowercaseBase = 'a';
    const uint32_t first = static_cast<uint32_t>(c1 - kUppercaseBase);
    const uint32_t second = static_cast<uint32_t>(c2 - kLowercaseBase);
    const uint32_t third = static_cast<uint32_t>(c3 - kLowercaseBase);
    const uint32_t fourth = static_cast<uint32_t>(c4 - kLowercaseBase);
    return (first << 15) | (second << 10) | (third << 5) | fourth;
}
100 
packScript(uint32_t script)101 constexpr uint32_t packScript(uint32_t script) {
102     return packScript(script >> 24, (script >> 16) & 0xff, (script >> 8) & 0xff, script & 0xff);
103 }
104 
unpackScript(uint32_t packedScript)105 constexpr uint32_t unpackScript(uint32_t packedScript) {
106     constexpr char FIRST_LETTER_BASE = 'A';
107     constexpr char REST_LETTER_BASE = 'a';
108     const uint32_t first = (packedScript >> 15) + FIRST_LETTER_BASE;
109     const uint32_t second = ((packedScript >> 10) & FIVE_BITS) + REST_LETTER_BASE;
110     const uint32_t third = ((packedScript >> 5) & FIVE_BITS) + REST_LETTER_BASE;
111     const uint32_t fourth = (packedScript & FIVE_BITS) + REST_LETTER_BASE;
112 
113     return first << 24 | second << 16 | third << 8 | fourth;
114 }
115 
packRegion(const StringPiece & in)116 static uint16_t packRegion(const StringPiece& in) {
117     return packLanguageOrRegion(in, 'A', '0');
118 }
119 
unpackRegion(uint16_t in,char * out)120 static size_t unpackRegion(uint16_t in, char* out) {
121     return unpackLanguageOrRegion(in, out, 'A', '0');
122 }
123 
// Returns true if c is one of 'a'..'z'.
static inline bool isLowercase(char c) {
    return c >= 'a' && c <= 'z';
}
127 
// Returns true if c is one of 'A'..'Z'.
static inline bool isUppercase(char c) {
    return c >= 'A' && c <= 'Z';
}
131 
// Returns true if c is one of '0'..'9'.
static inline bool isDigit(char c) {
    return c >= '0' && c <= '9';
}
135 
136 // Returns true if the buffer is valid for language code.
isValidLanguageCode(const StringPiece & buffer)137 static inline bool isValidLanguageCode(const StringPiece& buffer) {
138     if (buffer.length() != 2 && buffer.length() != 3) return false;
139     if (!isLowercase(buffer[0])) return false;
140     if (!isLowercase(buffer[1])) return false;
141     if (buffer.length() == 3 && !isLowercase(buffer[2])) return false;
142     return true;
143 }
144 
145 // Returns true if buffer is valid for script code. The length of buffer must be 4.
isValidScriptCode(const StringPiece & buffer)146 static inline bool isValidScriptCode(const StringPiece& buffer) {
147     return buffer.size() == 4 && isUppercase(buffer[0]) && isLowercase(buffer[1]) &&
148            isLowercase(buffer[2]) && isLowercase(buffer[3]);
149 }
150 
151 // Returns true if the buffer is valid for region code.
isValidRegionCode(const StringPiece & buffer)152 static inline bool isValidRegionCode(const StringPiece& buffer) {
153     return (buffer.size() == 2 && isUppercase(buffer[0]) && isUppercase(buffer[1])) ||
154            (buffer.size() == 3 && isDigit(buffer[0]) && isDigit(buffer[1]) && isDigit(buffer[2]));
155 }
156 
157 // Parse BCP 47 language identifier into internal structure
Locale(const StringPiece & input)158 Locale::Locale(const StringPiece& input) : Locale() {
159     SplitIterator it(input, '-');
160 
161     StringPiece language = it.next();
162     if (isValidLanguageCode(language)) {
163         mLanguage = packLanguage(language);
164     } else {
165         // We don't understand anything other than two-letter or three-letter
166         // language codes, so we skip parsing the rest of the string.
167         return;
168     }
169 
170     if (!it.hasNext()) {
171         return;  // Language code only.
172     }
173     StringPiece token = it.next();
174 
175     if (isValidScriptCode(token)) {
176         mScript = packScript(token[0], token[1], token[2], token[3]);
177         mSubScriptBits = scriptToSubScriptBits(mScript);
178 
179         if (!it.hasNext()) {
180             goto finalize;  // No variant, emoji subtag and region code.
181         }
182         token = it.next();
183     }
184 
185     if (isValidRegionCode(token)) {
186         mRegion = packRegion(token);
187 
188         if (!it.hasNext()) {
189             goto finalize;  // No variant or emoji subtag.
190         }
191         token = it.next();
192     }
193 
194     if (language == "de") {  // We are only interested in German variants.
195         if (token == "1901") {
196             mVariant = Variant::GERMAN_1901_ORTHOGRAPHY;
197         } else if (token == "1996") {
198             mVariant = Variant::GERMAN_1996_ORTHOGRAPHY;
199         }
200 
201         if (mVariant != Variant::NO_VARIANT) {
202             if (!it.hasNext()) {
203                 goto finalize;  // No emoji subtag.
204             }
205 
206             token = it.next();
207         }
208     }
209 
210     resolveUnicodeExtension(input.data(), input.length());
211 
212 finalize:
213     if (mEmojiStyle == EmojiStyle::EMPTY) {
214         mEmojiStyle = scriptToEmojiStyle(mScript);
215     }
216 }
217 
resolveUnicodeExtension(const char * buf,size_t length)218 void Locale::resolveUnicodeExtension(const char* buf, size_t length) {
219     static const char kPrefix[] = "-u-";
220     const char* pos = std::search(buf, buf + length, kPrefix, kPrefix + strlen(kPrefix));
221     if (pos != buf + length) {
222         pos += strlen(kPrefix);
223         const size_t remainingLength = length - (pos - buf);
224         mLBStyle = resolveLineBreakStyle(pos, remainingLength);
225         mEmojiStyle = resolveEmojiStyle(pos, remainingLength);
226     }
227 }
228 
229 // static
230 // Lookup line break subtag and determine the line break style.
resolveLineBreakStyle(const char * buf,size_t length)231 LineBreakStyle Locale::resolveLineBreakStyle(const char* buf, size_t length) {
232     // 8 is the length of "-u-lb-loose", which is the shortest line break subtag,
233     // unnecessary comparison can be avoided if total length is smaller than 11.
234     const size_t kMinSubtagLength = 8;
235     if (length >= kMinSubtagLength) {
236         static const char kPrefix[] = "lb-";
237         const char* pos = std::search(buf, buf + length, kPrefix, kPrefix + strlen(kPrefix));
238         if (pos != buf + length) {  // found
239             pos += strlen(kPrefix);
240             const size_t remainingLength = length - (pos - buf);
241             if (isSubtag(pos, remainingLength, "loose", 5)) {
242                 return LineBreakStyle::LOOSE;
243             } else if (isSubtag(pos, remainingLength, "normal", 6)) {
244                 return LineBreakStyle::NORMAL;
245             } else if (isSubtag(pos, remainingLength, "strict", 6)) {
246                 return LineBreakStyle::STRICT;
247             }
248         }
249     }
250     return LineBreakStyle::EMPTY;
251 }
252 
253 // static
254 // Lookup emoji subtag and determine the emoji style.
resolveEmojiStyle(const char * buf,size_t length)255 EmojiStyle Locale::resolveEmojiStyle(const char* buf, size_t length) {
256     // 7 is the length of "-u-em-text", which is the shortest emoji subtag,
257     // unnecessary comparison can be avoided if total length is smaller than 10.
258     const size_t kMinSubtagLength = 7;
259     if (length >= kMinSubtagLength) {
260         static const char kPrefix[] = "em-";
261         const char* pos = std::search(buf, buf + length, kPrefix, kPrefix + strlen(kPrefix));
262         if (pos != buf + length) {  // found
263             pos += strlen(kPrefix);
264             const size_t remainingLength = length - (pos - buf);
265             if (isSubtag(pos, remainingLength, "emoji", 5)) {
266                 return EmojiStyle::EMOJI;
267             } else if (isSubtag(pos, remainingLength, "text", 4)) {
268                 return EmojiStyle::TEXT;
269             } else if (isSubtag(pos, remainingLength, "default", 7)) {
270                 return EmojiStyle::DEFAULT;
271             }
272         }
273     }
274     return EmojiStyle::EMPTY;
275 }
276 
scriptToEmojiStyle(uint32_t script)277 EmojiStyle Locale::scriptToEmojiStyle(uint32_t script) {
278     // If no emoji subtag was provided, resolve the emoji style from script code.
279     if (script == packScript('Z', 's', 'y', 'e')) {
280         return EmojiStyle::EMOJI;
281     } else if (script == packScript('Z', 's', 'y', 'm')) {
282         return EmojiStyle::TEXT;
283     }
284     return EmojiStyle::EMPTY;
285 }
286 
287 // static
scriptToSubScriptBits(uint32_t script)288 uint8_t Locale::scriptToSubScriptBits(uint32_t script) {
289     uint8_t subScriptBits = 0u;
290     switch (script) {
291         case packScript('B', 'o', 'p', 'o'):
292             subScriptBits = kBopomofoFlag;
293             break;
294         case packScript('H', 'a', 'n', 'g'):
295             subScriptBits = kHangulFlag;
296             break;
297         case packScript('H', 'a', 'n', 'b'):
298             // Bopomofo is almost exclusively used in Taiwan.
299             subScriptBits = kHanFlag | kBopomofoFlag;
300             break;
301         case packScript('H', 'a', 'n', 'i'):
302             subScriptBits = kHanFlag;
303             break;
304         case packScript('H', 'a', 'n', 's'):
305             subScriptBits = kHanFlag | kSimplifiedChineseFlag;
306             break;
307         case packScript('H', 'a', 'n', 't'):
308             subScriptBits = kHanFlag | kTraditionalChineseFlag;
309             break;
310         case packScript('H', 'i', 'r', 'a'):
311             subScriptBits = kHiraganaFlag;
312             break;
313         case packScript('H', 'r', 'k', 't'):
314             subScriptBits = kKatakanaFlag | kHiraganaFlag;
315             break;
316         case packScript('J', 'p', 'a', 'n'):
317             subScriptBits = kHanFlag | kKatakanaFlag | kHiraganaFlag;
318             break;
319         case packScript('K', 'a', 'n', 'a'):
320             subScriptBits = kKatakanaFlag;
321             break;
322         case packScript('K', 'o', 'r', 'e'):
323             subScriptBits = kHanFlag | kHangulFlag;
324             break;
325     }
326     return subScriptBits;
327 }
328 
getString() const329 std::string Locale::getString() const {
330     char buf[32] = {};
331     size_t i;
332     if (mLanguage == NO_LANGUAGE) {
333         buf[0] = 'u';
334         buf[1] = 'n';
335         buf[2] = 'd';
336         i = 3;
337     } else {
338         i = unpackLanguage(mLanguage, buf);
339     }
340     if (mScript != NO_SCRIPT) {
341         uint32_t rawScript = unpackScript(mScript);
342         buf[i++] = '-';
343         buf[i++] = (rawScript >> 24) & 0xFFu;
344         buf[i++] = (rawScript >> 16) & 0xFFu;
345         buf[i++] = (rawScript >> 8) & 0xFFu;
346         buf[i++] = rawScript & 0xFFu;
347     }
348     if (mRegion != NO_REGION) {
349         buf[i++] = '-';
350         i += unpackRegion(mRegion, buf + i);
351     }
352     if (mVariant != Variant::NO_VARIANT) {
353         buf[i++] = '-';
354         buf[i++] = '1';
355         buf[i++] = '9';
356         switch (mVariant) {
357             case Variant::GERMAN_1901_ORTHOGRAPHY:
358                 buf[i++] = '0';
359                 buf[i++] = '1';
360                 break;
361             case Variant::GERMAN_1996_ORTHOGRAPHY:
362                 buf[i++] = '9';
363                 buf[i++] = '6';
364                 break;
365             default:
366                 MINIKIN_ASSERT(false, "Must not reached.");
367         }
368     }
369     // Add line break unicode extension.
370     if (mLBStyle != LineBreakStyle::EMPTY) {
371         buf[i++] = '-';
372         buf[i++] = 'u';
373         buf[i++] = '-';
374         buf[i++] = 'l';
375         buf[i++] = 'b';
376         buf[i++] = '-';
377         switch (mLBStyle) {
378             case LineBreakStyle::LOOSE:
379                 buf[i++] = 'l';
380                 buf[i++] = 'o';
381                 buf[i++] = 'o';
382                 buf[i++] = 's';
383                 buf[i++] = 'e';
384                 break;
385             case LineBreakStyle::NORMAL:
386                 buf[i++] = 'n';
387                 buf[i++] = 'o';
388                 buf[i++] = 'r';
389                 buf[i++] = 'm';
390                 buf[i++] = 'a';
391                 buf[i++] = 'l';
392                 break;
393             case LineBreakStyle::STRICT:
394                 buf[i++] = 's';
395                 buf[i++] = 't';
396                 buf[i++] = 'r';
397                 buf[i++] = 'i';
398                 buf[i++] = 'c';
399                 buf[i++] = 't';
400                 break;
401             default:
402                 MINIKIN_ASSERT(false, "Must not reached.");
403         }
404     }
405     return std::string(buf, i);
406 }
407 
getPartialLocale(SubtagBits bits) const408 Locale Locale::getPartialLocale(SubtagBits bits) const {
409     Locale subLocale;
410     if ((bits & SubtagBits::LANGUAGE) != SubtagBits::EMPTY) {
411         subLocale.mLanguage = mLanguage;
412     } else {
413         subLocale.mLanguage = packLanguage("und");
414     }
415     if ((bits & SubtagBits::SCRIPT) != SubtagBits::EMPTY) {
416         subLocale.mScript = mScript;
417         subLocale.mSubScriptBits = mSubScriptBits;
418     }
419     if ((bits & SubtagBits::REGION) != SubtagBits::EMPTY) {
420         subLocale.mRegion = mRegion;
421     }
422     if ((bits & SubtagBits::VARIANT) != SubtagBits::EMPTY) {
423         subLocale.mVariant = mVariant;
424     }
425     if ((bits & SubtagBits::EMOJI) != SubtagBits::EMPTY) {
426         subLocale.mEmojiStyle = mEmojiStyle;
427     }
428     return subLocale;
429 }
430 
isEqualScript(const Locale & other) const431 bool Locale::isEqualScript(const Locale& other) const {
432     return other.mScript == mScript;
433 }
434 
435 // static
supportsScript(uint8_t providedBits,uint8_t requestedBits)436 bool Locale::supportsScript(uint8_t providedBits, uint8_t requestedBits) {
437     return requestedBits != 0 && (providedBits & requestedBits) == requestedBits;
438 }
439 
supportsHbScript(hb_script_t script) const440 bool Locale::supportsHbScript(hb_script_t script) const {
441     static_assert(unpackScript(packScript('J', 'p', 'a', 'n')) == HB_TAG('J', 'p', 'a', 'n'),
442                   "The Minikin script and HarfBuzz hb_script_t have different encodings.");
443     uint32_t packedScript = packScript(script);
444     if (packedScript == mScript) return true;
445     return supportsScript(mSubScriptBits, scriptToSubScriptBits(packedScript));
446 }
447 
calcScoreFor(const LocaleList & supported) const448 int Locale::calcScoreFor(const LocaleList& supported) const {
449     bool languageScriptMatch = false;
450     bool subtagMatch = false;
451     bool scriptMatch = false;
452 
453     for (size_t i = 0; i < supported.size(); ++i) {
454         if (mEmojiStyle != EmojiStyle::EMPTY && mEmojiStyle == supported[i].mEmojiStyle) {
455             subtagMatch = true;
456             if (mLanguage == supported[i].mLanguage) {
457                 return 4;
458             }
459         }
460         if (isEqualScript(supported[i]) ||
461             supportsScript(supported[i].mSubScriptBits, mSubScriptBits)) {
462             scriptMatch = true;
463             if (mLanguage == supported[i].mLanguage) {
464                 languageScriptMatch = true;
465             }
466         }
467     }
468 
469     if (supportsScript(supported.getUnionOfSubScriptBits(), mSubScriptBits)) {
470         scriptMatch = true;
471         if (mLanguage == supported[0].mLanguage && supported.isAllTheSameLocale()) {
472             return 3;
473         }
474     }
475 
476     if (languageScriptMatch) {
477         return 3;
478     } else if (subtagMatch) {
479         return 2;
480     } else if (scriptMatch) {
481         return 1;
482     }
483     return 0;
484 }
485 
buildHbLanguage(const Locale & locale)486 static hb_language_t buildHbLanguage(const Locale& locale) {
487     return locale.isSupported() ? hb_language_from_string(locale.getString().c_str(), -1)
488                                 : HB_LANGUAGE_INVALID;
489 }
490 
LocaleList(std::vector<Locale> && locales)491 LocaleList::LocaleList(std::vector<Locale>&& locales) : mLocales(std::move(locales)) {
492     mIsAllTheSameLocale = true;
493     mUnionOfSubScriptBits = 0u;
494     mHbLangs.reserve(mLocales.size());
495     mEmojiStyle = EmojiStyle::EMPTY;
496     const auto firstLanguage = mLocales.empty() ? NO_LANGUAGE : mLocales[0].mLanguage;
497     for (const Locale& locale : mLocales) {
498         mUnionOfSubScriptBits |= locale.mSubScriptBits;
499         if (mIsAllTheSameLocale && firstLanguage != locale.mLanguage) {
500             mIsAllTheSameLocale = false;
501         }
502         mHbLangs.push_back(buildHbLanguage(locale));
503         if (mEmojiStyle == EmojiStyle::EMPTY) {
504             mEmojiStyle = locale.getEmojiStyle();
505         }
506     }
507 }
508 
509 }  // namespace minikin
510