1 /*
2  * Copyright (C) 2013 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.inputmethod.latin.utils;
18 
19 import android.text.InputType;
20 import android.text.TextUtils;
21 
22 import com.android.inputmethod.latin.WordComposer;
23 import com.android.inputmethod.latin.common.Constants;
24 import com.android.inputmethod.latin.common.StringUtils;
25 import com.android.inputmethod.latin.settings.SpacingAndPunctuations;
26 
27 import java.util.ArrayList;
28 import java.util.Locale;
29 
30 public final class CapsModeUtils {
CapsModeUtils()31     private CapsModeUtils() {
32         // This utility class is not publicly instantiable.
33     }
34 
35     /**
36      * Apply an auto-caps mode to a string.
37      *
38      * This intentionally does NOT apply manual caps mode. It only changes the capitalization if
39      * the mode is one of the auto-caps modes.
40      * @param s The string to capitalize.
41      * @param capitalizeMode The mode in which to capitalize.
42      * @param locale The locale for capitalizing.
43      * @return The capitalized string.
44      */
applyAutoCapsMode(final String s, final int capitalizeMode, final Locale locale)45     public static String applyAutoCapsMode(final String s, final int capitalizeMode,
46             final Locale locale) {
47         if (WordComposer.CAPS_MODE_AUTO_SHIFT_LOCKED == capitalizeMode) {
48             return s.toUpperCase(locale);
49         } else if (WordComposer.CAPS_MODE_AUTO_SHIFTED == capitalizeMode) {
50             return StringUtils.capitalizeFirstCodePoint(s, locale);
51         } else {
52             return s;
53         }
54     }
55 
56     /**
57      * Return whether a constant represents an auto-caps mode (either auto-shift or auto-shift-lock)
58      * @param mode The mode to test for
59      * @return true if this represents an auto-caps mode, false otherwise
60      */
isAutoCapsMode(final int mode)61     public static boolean isAutoCapsMode(final int mode) {
62         return WordComposer.CAPS_MODE_AUTO_SHIFTED == mode
63                 || WordComposer.CAPS_MODE_AUTO_SHIFT_LOCKED == mode;
64     }
65 
66     /**
67      * Helper method to find out if a code point is starting punctuation.
68      *
69      * This include the Unicode START_PUNCTUATION category, but also some other symbols that are
70      * starting, like the inverted question mark or the double quote.
71      *
72      * @param codePoint the code point
73      * @return true if it's starting punctuation, false otherwise.
74      */
isStartPunctuation(final int codePoint)75     private static boolean isStartPunctuation(final int codePoint) {
76         return (codePoint == Constants.CODE_DOUBLE_QUOTE || codePoint == Constants.CODE_SINGLE_QUOTE
77                 || codePoint == Constants.CODE_INVERTED_QUESTION_MARK
78                 || codePoint == Constants.CODE_INVERTED_EXCLAMATION_MARK
79                 || Character.getType(codePoint) == Character.START_PUNCTUATION);
80     }
81 
82     /**
83      * Determine what caps mode should be in effect at the current offset in
84      * the text. Only the mode bits set in <var>reqModes</var> will be
85      * checked. Note that the caps mode flags here are explicitly defined
86      * to match those in {@link InputType}.
87      *
88      * This code is a straight copy of TextUtils.getCapsMode (modulo namespace and formatting
89      * issues). This will change in the future as we simplify the code for our use and fix bugs.
90      *
91      * @param cs The text that should be checked for caps modes.
92      * @param reqModes The modes to be checked: may be any combination of
93      * {@link TextUtils#CAP_MODE_CHARACTERS}, {@link TextUtils#CAP_MODE_WORDS}, and
94      * {@link TextUtils#CAP_MODE_SENTENCES}.
95      * @param spacingAndPunctuations The current spacing and punctuations settings.
96      * @param hasSpaceBefore Whether we should consider there is a space inserted at the end of cs
97      *
98      * @return Returns the actual capitalization modes that can be in effect
99      * at the current position, which is any combination of
100      * {@link TextUtils#CAP_MODE_CHARACTERS}, {@link TextUtils#CAP_MODE_WORDS}, and
101      * {@link TextUtils#CAP_MODE_SENTENCES}.
102      */
getCapsMode(final CharSequence cs, final int reqModes, final SpacingAndPunctuations spacingAndPunctuations, final boolean hasSpaceBefore)103     public static int getCapsMode(final CharSequence cs, final int reqModes,
104             final SpacingAndPunctuations spacingAndPunctuations, final boolean hasSpaceBefore) {
105         // Quick description of what we want to do:
106         // CAP_MODE_CHARACTERS is always on.
107         // CAP_MODE_WORDS is on if there is some whitespace before the cursor.
108         // CAP_MODE_SENTENCES is on if there is some whitespace before the cursor, and the end
109         //   of a sentence just before that.
110         // We ignore opening parentheses and the like just before the cursor for purposes of
111         // finding whitespace for WORDS and SENTENCES modes.
112         // The end of a sentence ends with a period, question mark or exclamation mark. If it's
113         // a period, it also needs not to be an abbreviation, which means it also needs to either
114         // be immediately preceded by punctuation, or by a string of only letters with single
115         // periods interleaved.
116 
117         // Step 1 : check for cap MODE_CHARACTERS. If it's looked for, it's always on.
118         if ((reqModes & (TextUtils.CAP_MODE_WORDS | TextUtils.CAP_MODE_SENTENCES)) == 0) {
119             // Here we are not looking for MODE_WORDS or MODE_SENTENCES, so since we already
120             // evaluated MODE_CHARACTERS, we can return.
121             return TextUtils.CAP_MODE_CHARACTERS & reqModes;
122         }
123 
124         // Step 2 : Skip (ignore at the end of input) any opening punctuation. This includes
125         // opening parentheses, brackets, opening quotes, everything that *opens* a span of
126         // text in the linguistic sense. In RTL languages, this is still an opening sign, although
127         // it may look like a right parenthesis for example. We also include double quote and
128         // single quote since they aren't start punctuation in the unicode sense, but should still
129         // be skipped for English. TODO: does this depend on the language?
130         int i;
131         if (hasSpaceBefore) {
132             i = cs.length() + 1;
133         } else {
134             for (i = cs.length(); i > 0; i--) {
135                 final char c = cs.charAt(i - 1);
136                 if (!isStartPunctuation(c)) {
137                     break;
138                 }
139             }
140         }
141 
142         // We are now on the character that precedes any starting punctuation, so in the most
143         // frequent case this will be whitespace or a letter, although it may occasionally be a
144         // start of line, or some symbol.
145 
146         // Step 3 : Search for the start of a paragraph. From the starting point computed in step 2,
147         // we go back over any space or tab char sitting there. We find the start of a paragraph
148         // if the first char that's not a space or tab is a start of line (as in \n, start of text,
149         // or some other similar characters).
150         int j = i;
151         char prevChar = Constants.CODE_SPACE;
152         if (hasSpaceBefore) --j;
153         while (j > 0) {
154             prevChar = cs.charAt(j - 1);
155             if (!Character.isSpaceChar(prevChar) && prevChar != Constants.CODE_TAB) break;
156             j--;
157         }
158         if (j <= 0 || Character.isWhitespace(prevChar)) {
159             if (spacingAndPunctuations.mUsesGermanRules) {
160                 // In German typography rules, there is a specific case that the first character
161                 // of a new line should not be capitalized if the previous line ends in a comma.
162                 boolean hasNewLine = false;
163                 while (--j >= 0 && Character.isWhitespace(prevChar)) {
164                     if (Constants.CODE_ENTER == prevChar) {
165                         hasNewLine = true;
166                     }
167                     prevChar = cs.charAt(j);
168                 }
169                 if (Constants.CODE_COMMA == prevChar && hasNewLine) {
170                     return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS) & reqModes;
171                 }
172             }
173             // There are only spacing chars between the start of the paragraph and the cursor,
174             // defined as a isWhitespace() char that is neither a isSpaceChar() nor a tab. Both
175             // MODE_WORDS and MODE_SENTENCES should be active.
176             return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS
177                     | TextUtils.CAP_MODE_SENTENCES) & reqModes;
178         }
179         if (i == j) {
180             // If we don't have whitespace before index i, it means neither MODE_WORDS
181             // nor mode sentences should be on so we can return right away.
182             return TextUtils.CAP_MODE_CHARACTERS & reqModes;
183         }
184         if ((reqModes & TextUtils.CAP_MODE_SENTENCES) == 0) {
185             // Here we know we have whitespace before the cursor (if not, we returned in the above
186             // if i == j clause), so we need MODE_WORDS to be on. And we don't need to evaluate
187             // MODE_SENTENCES so we can return right away.
188             return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS) & reqModes;
189         }
190         // Please note that because of the reqModes & CAP_MODE_SENTENCES test a few lines above,
191         // we know that MODE_SENTENCES is being requested.
192 
193         // Step 4 : Search for MODE_SENTENCES.
194         // English is a special case in that "American typography" rules, which are the most common
195         // in English, state that a sentence terminator immediately following a quotation mark
196         // should be swapped with it and de-duplicated (included in the quotation mark),
197         // e.g. <<Did they say, "let's go home?">>
198         // No other language has such a rule as far as I know, instead putting inside the quotation
199         // mark as the exact thing quoted and handling the surrounding punctuation independently,
200         // e.g. <<Did they say, "let's go home"?>>
201         if (spacingAndPunctuations.mUsesAmericanTypography) {
202             for (; j > 0; j--) {
203                 // Here we look to go over any closing punctuation. This is because in dominant
204                 // variants of English, the final period is placed within double quotes and maybe
205                 // other closing punctuation signs. This is generally not true in other languages.
206                 final char c = cs.charAt(j - 1);
207                 if (c != Constants.CODE_DOUBLE_QUOTE && c != Constants.CODE_SINGLE_QUOTE
208                         && Character.getType(c) != Character.END_PUNCTUATION) {
209                     break;
210                 }
211             }
212         }
213 
214         if (j <= 0) return TextUtils.CAP_MODE_CHARACTERS & reqModes;
215         char c = cs.charAt(--j);
216 
217         // We found the next interesting chunk of text ; next we need to determine if it's the
218         // end of a sentence. If we have a sentence terminator (typically a question mark or an
219         // exclamation mark), then it's the end of a sentence; however, we treat the abbreviation
220         // marker specially because usually is the same char as the sentence separator (the
221         // period in most languages) and in this case we need to apply a heuristic to determine
222         // in which of these senses it's used.
223         if (spacingAndPunctuations.isSentenceTerminator(c)
224                 && !spacingAndPunctuations.isAbbreviationMarker(c)) {
225             return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS
226                     | TextUtils.CAP_MODE_SENTENCES) & reqModes;
227         }
228         // If we reach here, we know we have whitespace before the cursor and before that there
229         // is something that either does not terminate the sentence, or a symbol preceded by the
230         // start of the text, or it's the sentence separator AND it happens to be the same code
231         // point as the abbreviation marker.
232         // If it's a symbol or something that does not terminate the sentence, then we need to
233         // return caps for MODE_CHARACTERS and MODE_WORDS, but not for MODE_SENTENCES.
234         if (!spacingAndPunctuations.isSentenceSeparator(c) || j <= 0) {
235             return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS) & reqModes;
236         }
237 
238         // We found out that we have a period. We need to determine if this is a full stop or
239         // otherwise sentence-ending period, or an abbreviation like "e.g.". An abbreviation
240         // looks like (\w\.){2,}. Moreover, in German, you put periods after digits for dates
241         // and some other things, and in German specifically we need to not go into autocaps after
242         // a whitespace-digits-period sequence.
243         // To find out, we will have a simple state machine with the following states :
244         // START, WORD, PERIOD, ABBREVIATION, NUMBER
245         // On START : (just before the first period)
246         //           letter => WORD
247         //           digit => NUMBER if German; end with caps otherwise
248         //           whitespace => end with no caps (it was a stand-alone period)
249         //           otherwise => end with caps (several periods/symbols in a row)
250         // On WORD : (within the word just before the first period)
251         //           letter => WORD
252         //           period => PERIOD
253         //           otherwise => end with caps (it was a word with a full stop at the end)
254         // On PERIOD : (period within a potential abbreviation)
255         //           letter => LETTER
256         //           otherwise => end with caps (it was not an abbreviation)
257         // On LETTER : (letter within a potential abbreviation)
258         //           letter => LETTER
259         //           period => PERIOD
260         //           otherwise => end with no caps (it was an abbreviation)
261         // On NUMBER : (period immediately preceded by one or more digits)
262         //           digit => NUMBER
263         //           letter => LETTER (promote to word)
264         //           otherwise => end with no caps (it was a whitespace-digits-period sequence,
265         //            or a punctuation-digits-period sequence like "11.11.")
266         // "Not an abbreviation" in the above chart essentially covers cases like "...yes.". This
267         // should capitalize.
268 
269         final int START = 0;
270         final int WORD = 1;
271         final int PERIOD = 2;
272         final int LETTER = 3;
273         final int NUMBER = 4;
274         final int caps = (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS
275                 | TextUtils.CAP_MODE_SENTENCES) & reqModes;
276         final int noCaps = (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS) & reqModes;
277         int state = START;
278         while (j > 0) {
279             c = cs.charAt(--j);
280             switch (state) {
281             case START:
282                 if (Character.isLetter(c)) {
283                     state = WORD;
284                 } else if (Character.isWhitespace(c)) {
285                     return noCaps;
286                 } else if (Character.isDigit(c) && spacingAndPunctuations.mUsesGermanRules) {
287                     state = NUMBER;
288                 } else {
289                     return caps;
290                 }
291                 break;
292             case WORD:
293                 if (Character.isLetter(c)) {
294                     state = WORD;
295                 } else if (spacingAndPunctuations.isSentenceSeparator(c)) {
296                     state = PERIOD;
297                 } else {
298                     return caps;
299                 }
300                 break;
301             case PERIOD:
302                 if (Character.isLetter(c)) {
303                     state = LETTER;
304                 } else {
305                     return caps;
306                 }
307                 break;
308             case LETTER:
309                 if (Character.isLetter(c)) {
310                     state = LETTER;
311                 } else if (spacingAndPunctuations.isSentenceSeparator(c)) {
312                     state = PERIOD;
313                 } else {
314                     return noCaps;
315                 }
316                 break;
317             case NUMBER:
318                 if (Character.isLetter(c)) {
319                     state = WORD;
320                 } else if (Character.isDigit(c)) {
321                     state = NUMBER;
322                 } else {
323                     return noCaps;
324                 }
325             }
326         }
327         // Here we arrived at the start of the line. This should behave exactly like whitespace.
328         return (START == state || LETTER == state) ? noCaps : caps;
329     }
330 
331     /**
332      * Convert capitalize mode flags into human readable text.
333      *
334      * @param capsFlags The modes flags to be converted. It may be any combination of
335      * {@link TextUtils#CAP_MODE_CHARACTERS}, {@link TextUtils#CAP_MODE_WORDS}, and
336      * {@link TextUtils#CAP_MODE_SENTENCES}.
337      * @return the text that describe the <code>capsMode</code>.
338      */
flagsToString(final int capsFlags)339     public static String flagsToString(final int capsFlags) {
340         final int capsFlagsMask = TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS
341                 | TextUtils.CAP_MODE_SENTENCES;
342         if ((capsFlags & ~capsFlagsMask) != 0) {
343             return "unknown<0x" + Integer.toHexString(capsFlags) + ">";
344         }
345         final ArrayList<String> builder = new ArrayList<>();
346         if ((capsFlags & android.text.TextUtils.CAP_MODE_CHARACTERS) != 0) {
347             builder.add("characters");
348         }
349         if ((capsFlags & android.text.TextUtils.CAP_MODE_WORDS) != 0) {
350             builder.add("words");
351         }
352         if ((capsFlags & android.text.TextUtils.CAP_MODE_SENTENCES) != 0) {
353             builder.add("sentences");
354         }
355         return builder.isEmpty() ? "none" : TextUtils.join("|", builder);
356     }
357 }
358