1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef MINIKIN_LOCALE_LIST_H
18 #define MINIKIN_LOCALE_LIST_H
19 
20 #include <string>
21 #include <vector>
22 
23 #include <hb.h>
24 
25 #include "StringPiece.h"
26 
27 namespace minikin {
28 
// Due to the limits in font fallback score calculation, we can't use anything more than 12 locales.
constexpr size_t FONT_LOCALE_LIMIT = 12;

// The language or region code is encoded to 15 bits. The all-ones 15-bit pattern is reserved as
// the "not specified" sentinel.
constexpr uint16_t NO_LANGUAGE = 0x7fff;
constexpr uint16_t NO_REGION = 0x7fff;
// The script code is encoded to 20 bits. The all-ones 20-bit pattern is reserved as the
// "not specified" sentinel.
constexpr uint32_t NO_SCRIPT = 0xfffff;

// Forward declaration; the full class is defined later in this header.
class LocaleList;
39 
// Bit flags naming the subtags of a Locale. Used for making a sub-locale from a Locale
// (see Locale::getPartialLocale). Flags can be combined with operator| and tested with operator&.
enum class SubtagBits : uint8_t {
    EMPTY = 0,
    LANGUAGE = 1u << 0,
    SCRIPT = 1u << 1,
    REGION = 1u << 2,
    VARIANT = 1u << 3,
    EMOJI = 1u << 4,
    // Union of every flag above.
    ALL = LANGUAGE | SCRIPT | REGION | VARIANT | EMOJI,
};
50 
51 inline constexpr SubtagBits operator&(SubtagBits l, SubtagBits r) {
52     return static_cast<SubtagBits>(static_cast<uint8_t>(l) & static_cast<uint8_t>(r));
53 }
54 inline constexpr SubtagBits operator|(SubtagBits l, SubtagBits r) {
55     return static_cast<SubtagBits>(static_cast<uint8_t>(l) | static_cast<uint8_t>(r));
56 }
57 
// Enum for emoji style.
// The numeric values are spelled out because Locale::getIdentifier() packs this enum into a
// 2-bit field of the identifier; values must stay within 0..3.
enum class EmojiStyle : uint8_t {
    // No emoji style is specified.
    EMPTY = 0x0,
    // Default emoji style is specified.
    DEFAULT = 0x1,
    // Emoji (color) emoji style is specified.
    EMOJI = 0x2,
    // Text (black/white) emoji style is specified.
    TEXT = 0x3,
};
65 
// Enum for line break style.
// The numeric values are spelled out because Locale::getIdentifier() packs this enum into a
// 2-bit field of the identifier; values must stay within 0..3.
enum class LineBreakStyle : uint8_t {
    // No line break style is specified.
    EMPTY = 0x0,
    // Line break style is loose.
    LOOSE = 0x1,
    // Line break style is normal.
    NORMAL = 0x2,
    // Line break style is strict.
    STRICT = 0x3,
};
73 
74 // Locale is a compact representation of a BCP 47 language tag.
75 // It does not capture all possible information, only what directly affects text layout:
76 // font rendering, hyphenation, word breaking, etc.
77 struct Locale {
78 public:
79     enum class Variant : uint16_t {
80         NO_VARIANT = 0x0000,
81         GERMAN_1901_ORTHOGRAPHY = 0x0001,
82         GERMAN_1996_ORTHOGRAPHY = 0x0002,
83     };
84 
85     // Default constructor creates the unsupported locale.
LocaleLocale86     Locale()
87             : mScript(NO_SCRIPT),
88               mLanguage(NO_LANGUAGE),
89               mRegion(NO_REGION),
90               mSubScriptBits(0ul),
91               mVariant(Variant::NO_VARIANT),
92               mEmojiStyle(EmojiStyle::EMPTY),
93               mLBStyle(LineBreakStyle::EMPTY) {}
94 
95     // Parse from string
96     Locale(const StringPiece& buf);
97 
98     bool operator==(const Locale other) const {
99         return !isUnsupported() && isEqualScript(other) && mLanguage == other.mLanguage &&
100                mRegion == other.mRegion && mVariant == other.mVariant &&
101                mLBStyle == other.mLBStyle && mEmojiStyle == other.mEmojiStyle;
102     }
103 
104     bool operator!=(const Locale other) const { return !(*this == other); }
105 
hasLanguageLocale106     inline bool hasLanguage() const { return mLanguage != NO_LANGUAGE; }
hasScriptLocale107     inline bool hasScript() const { return mScript != NO_SCRIPT; }
hasRegionLocale108     inline bool hasRegion() const { return mRegion != NO_REGION; }
hasVariantLocale109     inline bool hasVariant() const { return mVariant != Variant::NO_VARIANT; }
hasLBStyleLocale110     inline bool hasLBStyle() const { return mLBStyle != LineBreakStyle::EMPTY; }
hasEmojiStyleLocale111     inline bool hasEmojiStyle() const { return mEmojiStyle != EmojiStyle::EMPTY; }
112 
isSupportedLocale113     inline bool isSupported() const {
114         return hasLanguage() || hasScript() || hasRegion() || hasVariant() || hasLBStyle() ||
115                hasEmojiStyle();
116     }
117 
isUnsupportedLocale118     inline bool isUnsupported() const { return !isSupported(); }
119 
getEmojiStyleLocale120     EmojiStyle getEmojiStyle() const { return mEmojiStyle; }
121 
122     bool isEqualScript(const Locale& other) const;
123 
124     // Returns true if this script supports the given script. For example, ja-Jpan supports Hira,
125     // ja-Hira doesn't support Jpan.
126     bool supportsHbScript(hb_script_t script) const;
127 
128     std::string getString() const;
129 
130     // Calculates a matching score. This score represents how well the input locales cover this
131     // locale. The maximum score in the locale list is returned.
132     // 0 = no match, 1 = script match, 2 = script and primary language match.
133     int calcScoreFor(const LocaleList& supported) const;
134 
135     // Identifier pattern:
136     // |-------|-------|-------|-------|-------|-------|-------|-------|
137     // lllllllllllllll                                                   Language Code
138     //                ssssssssssssssssssss                               Script Code
139     //                                    rrrrrrrrrrrrrrr                Region Code
140     //                                                   ee              Emoji Style
141     //                                                     bb            Line Break Style
142     //                                                       XXXXXXXX    Free
143     //                                                               vv  German Variant
getIdentifierLocale144     uint64_t getIdentifier() const {
145         return ((uint64_t)mLanguage << 49) | ((uint64_t)mScript << 29) | ((uint64_t)mRegion << 14) |
146                ((uint64_t)mEmojiStyle << 12) | ((uint64_t)mLBStyle << 10) | (uint64_t)mVariant;
147     }
148 
149     Locale getPartialLocale(SubtagBits bits) const;
150 
151 private:
152     friend class LocaleList;  // for LocaleList constructor
153 
154     // ISO 15924 compliant script code. The 4 chars script code are packed into a 20 bit integer.
155     // If not specified, this is kInvalidScript.
156     uint32_t mScript;
157 
158     // ISO 639-1 or ISO 639-2 compliant language code.
159     // The two- or three-letter language code is packed into a 15 bit integer.
160     // mLanguage = 0 means the Locale is unsupported.
161     uint16_t mLanguage;
162 
163     // ISO 3166-1 or UN M.49 compliant region code. The two-letter or three-digit region code is
164     // packed into a 15 bit integer.
165     uint16_t mRegion;
166 
167     // For faster comparing, use 7 bits for specific scripts.
168     static const uint8_t kBopomofoFlag = 1u;
169     static const uint8_t kHanFlag = 1u << 1;
170     static const uint8_t kHangulFlag = 1u << 2;
171     static const uint8_t kHiraganaFlag = 1u << 3;
172     static const uint8_t kKatakanaFlag = 1u << 4;
173     static const uint8_t kSimplifiedChineseFlag = 1u << 5;
174     static const uint8_t kTraditionalChineseFlag = 1u << 6;
175     uint8_t mSubScriptBits;
176 
177     Variant mVariant;
178 
179     EmojiStyle mEmojiStyle;
180     LineBreakStyle mLBStyle;
181 
182     void resolveUnicodeExtension(const char* buf, size_t length);
183 
184     static uint8_t scriptToSubScriptBits(uint32_t rawScript);
185 
186     static LineBreakStyle resolveLineBreakStyle(const char* buf, size_t length);
187     static EmojiStyle resolveEmojiStyle(const char* buf, size_t length);
188     static EmojiStyle scriptToEmojiStyle(uint32_t script);
189 
190     // Returns true if the provide subscript bits has the requested subscript bits.
191     // Note that this function returns false if the requested subscript bits are empty.
192     static bool supportsScript(uint8_t providedBits, uint8_t requestedBits);
193 };
194 
195 // An immutable list of locale.
196 class LocaleList {
197 public:
198     explicit LocaleList(std::vector<Locale>&& locales);
LocaleList()199     LocaleList()
200             : mUnionOfSubScriptBits(0),
201               mIsAllTheSameLocale(false),
202               mEmojiStyle(EmojiStyle::EMPTY) {}
203     LocaleList(LocaleList&&) = default;
204 
size()205     size_t size() const { return mLocales.size(); }
empty()206     bool empty() const { return mLocales.empty(); }
207     const Locale& operator[](size_t n) const { return mLocales[n]; }
208 
getHbLanguage(size_t n)209     hb_language_t getHbLanguage(size_t n) const { return mHbLangs[n]; }
210 
211     // Returns an effective emoji style of this locale list.
212     // The effective means the first non empty emoji style in the list.
getEmojiStyle()213     EmojiStyle getEmojiStyle() const { return mEmojiStyle; }
214 
215 private:
216     friend struct Locale;  // for calcScoreFor
217 
218     std::vector<Locale> mLocales;
219 
220     // The languages to be passed to HarfBuzz shaper.
221     std::vector<hb_language_t> mHbLangs;
222     uint8_t mUnionOfSubScriptBits;
223     bool mIsAllTheSameLocale;
224     EmojiStyle mEmojiStyle;
225 
getUnionOfSubScriptBits()226     uint8_t getUnionOfSubScriptBits() const { return mUnionOfSubScriptBits; }
isAllTheSameLocale()227     bool isAllTheSameLocale() const { return mIsAllTheSameLocale; }
228 
229     // Do not copy and assign.
230     LocaleList(const LocaleList&) = delete;
231     void operator=(const LocaleList&) = delete;
232 };
233 
234 }  // namespace minikin
235 
236 #endif  // MINIKIN_LOCALE_LIST_H
237