/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package android.text.method;

import android.annotation.NonNull;
import android.compat.annotation.UnsupportedAppUsage;
import android.icu.lang.UCharacter;
import android.icu.lang.UProperty;
import android.icu.text.BreakIterator;
import android.text.CharSequenceCharacterIterator;
import android.text.Selection;

import java.util.Locale;

/**
 * Walks through cursor positions at word boundaries. Internally uses
 * {@link BreakIterator#getWordInstance()}, and caches {@link CharSequence}
 * for performance reasons.
 *
 * Also provides methods to determine word boundaries.
 *
 * All offsets passed to this class are validated against the window computed by the last
 * call to {@link #setCharSequence(CharSequence, int, int)}.
 * {@hide}
 */
public class WordIterator implements Selection.PositionIterator {
    // Size of the window for the word iterator, should be greater than the longest word's length
    private static final int WINDOW_WIDTH = 50;

    // Bounds of the analysed window within mCharSeq; both are valid offsets (end inclusive).
    private int mStart;
    private int mEnd;
    private CharSequence mCharSeq;
    private final BreakIterator mIterator;

    /**
     * Constructs a WordIterator using the default locale.
     */
    public WordIterator() {
        this(Locale.getDefault());
    }

    /**
     * Constructs a new WordIterator for the specified locale.
     * @param locale The locale to be used for analyzing the text.
     */
    @UnsupportedAppUsage
    public WordIterator(Locale locale) {
        mIterator = BreakIterator.getWordInstance(locale);
    }

    /**
     * Sets the text to analyse and the region of interest inside it.
     *
     * The analysed window is the requested region widened by {@code WINDOW_WIDTH} characters
     * on each side and clamped to the bounds of {@code charSequence}. Offsets later passed to
     * the other methods must lie inside that window.
     *
     * @param charSequence the text to iterate over.
     * @param start the start of the region of interest, must be non-negative.
     * @param end the end of the region of interest, must not exceed the text length.
     * @throws IndexOutOfBoundsException if {@code start} is negative or {@code end} is greater
     *         than the length of {@code charSequence}.
     */
    @UnsupportedAppUsage
    public void setCharSequence(@NonNull CharSequence charSequence, int start, int end) {
        if (0 <= start && end <= charSequence.length()) {
            mCharSeq = charSequence;
            mStart = Math.max(0, start - WINDOW_WIDTH);
            mEnd = Math.min(charSequence.length(), end + WINDOW_WIDTH);
            mIterator.setText(new CharSequenceCharacterIterator(charSequence, mStart, mEnd));
        } else {
            throw new IndexOutOfBoundsException("input indexes are outside the CharSequence");
        }
    }

    /** {@inheritDoc} */
    @UnsupportedAppUsage
    public int preceding(int offset) {
        checkOffsetIsValid(offset);
        // Keep stepping back until the boundary sits on a letter or digit, or the iterator
        // runs out of boundaries.
        while (true) {
            offset = mIterator.preceding(offset);
            if (offset == BreakIterator.DONE || isOnLetterOrDigit(offset)) {
                return offset;
            }
        }
    }

    /** {@inheritDoc} */
    @UnsupportedAppUsage
    public int following(int offset) {
        checkOffsetIsValid(offset);
        // Keep stepping forward until the boundary sits right after a letter or digit, or the
        // iterator runs out of boundaries.
        while (true) {
            offset = mIterator.following(offset);
            if (offset == BreakIterator.DONE || isAfterLetterOrDigit(offset)) {
                return offset;
            }
        }
    }

    /** {@inheritDoc} */
    @UnsupportedAppUsage
    public boolean isBoundary(int offset) {
        checkOffsetIsValid(offset);
        return mIterator.isBoundary(offset);
    }

    /**
     * Returns the position of next boundary after the given offset. Returns
     * {@code DONE} if there is no boundary after the given offset.
     *
     * @param offset the given start position to search from.
     * @return the position of the first boundary following the given offset.
     * @throws IllegalArgumentException if offset is not valid.
     */
    @UnsupportedAppUsage
    public int nextBoundary(int offset) {
        checkOffsetIsValid(offset);
        return mIterator.following(offset);
    }

    /**
     * Returns the position of boundary preceding the given offset or
     * {@code DONE} if the given offset specifies the starting position.
     *
     * @param offset the given start position to search from.
     * @return the position of the last boundary preceding the given offset.
     * @throws IllegalArgumentException if offset is not valid.
     */
    @UnsupportedAppUsage
    public int prevBoundary(int offset) {
        checkOffsetIsValid(offset);
        return mIterator.preceding(offset);
    }

    /**
     * If {@code offset} is within a word, returns the index of the first character of that
     * word, otherwise returns BreakIterator.DONE.
     *
     * The offsets that are considered to be part of a word are the indexes of its characters,
     * as well as the index of its last character plus one.
     * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
     *
     * Valid range for offset is the window established by the last call to
     * {@link #setCharSequence(CharSequence, int, int)} (note the inclusive upper bound).
     * The returned value is within [0..offset] or BreakIterator.DONE.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    @UnsupportedAppUsage
    public int getBeginning(int offset) {
        // TODO: Check if usage of this can be updated to getBeginning(offset, true) if
        // so this method can be removed.
        return getBeginning(offset, false);
    }

    /**
     * If {@code offset} is within a word, returns the index of the last character of that
     * word plus one, otherwise returns BreakIterator.DONE.
     *
     * The offsets that are considered to be part of a word are the indexes of its characters,
     * as well as the index of its last character plus one.
     * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
     *
     * Valid range for offset is the window established by the last call to
     * {@link #setCharSequence(CharSequence, int, int)} (note the inclusive upper bound).
     * The returned value is within [offset..textLength] or BreakIterator.DONE.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    @UnsupportedAppUsage
    public int getEnd(int offset) {
        // TODO: Check if usage of this can be updated to getEnd(offset, true), if
        // so this method can be removed.
        return getEnd(offset, false);
    }

    /**
     * If the {@code offset} is within a word or on a word boundary that can only be
     * considered the start of a word (e.g. _word where "_" is any character that would not
     * be considered part of the word) then this returns the index of the first character of
     * that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, this would return the start of the previous word, AA.
     *
     * Returns BreakIterator.DONE if there is no previous boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    @UnsupportedAppUsage
    public int getPrevWordBeginningOnTwoWordsBoundary(int offset) {
        return getBeginning(offset, true);
    }

    /**
     * If the {@code offset} is within a word or on a word boundary that can only be
     * considered the end of a word (e.g. word_ where "_" is any character that would not
     * be considered part of the word) then this returns the index of the last character
     * plus one of that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, this would return the end of the next word, BB.
     *
     * Returns BreakIterator.DONE if there is no next boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    @UnsupportedAppUsage
    public int getNextWordEndOnTwoWordBoundary(int offset) {
        return getEnd(offset, true);
    }

    /**
     * If the {@code offset} is within a word or on a word boundary that can only be
     * considered the start of a word (e.g. _word where "_" is any character that would not
     * be considered part of the word) then this returns the index of the first character of
     * that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, and getPrevWordBeginningOnTwoWordsBoundary is true then this would
     * return the start of the previous word, AA. Otherwise it would return the current offset,
     * the start of BB.
     *
     * Returns BreakIterator.DONE if there is no previous boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    private int getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary) {
        checkOffsetIsValid(offset);

        if (isOnLetterOrDigit(offset)) {
            // The offset is already a word start unless it also ends a previous word and the
            // caller asked for that previous word.
            if (mIterator.isBoundary(offset)
                    && (!isAfterLetterOrDigit(offset)
                            || !getPrevWordBeginningOnTwoWordsBoundary)) {
                return offset;
            } else {
                return mIterator.preceding(offset);
            }
        } else {
            // Not on a word character: only the "last character plus one" position counts.
            if (isAfterLetterOrDigit(offset)) {
                return mIterator.preceding(offset);
            }
        }
        return BreakIterator.DONE;
    }

    /**
     * If the {@code offset} is within a word or on a word boundary that can only be
     * considered the end of a word (e.g. word_ where "_" is any character that would not be
     * considered part of the word) then this returns the index of the last character plus one
     * of that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, and getNextWordEndOnTwoWordBoundary is true then this would return
     * the end of the next word, BB. Otherwise it would return the current offset, the end
     * of AA.
     *
     * Returns BreakIterator.DONE if there is no next boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    private int getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary) {
        checkOffsetIsValid(offset);

        if (isAfterLetterOrDigit(offset)) {
            // The offset is already a word end unless it also starts a next word and the
            // caller asked for that next word.
            if (mIterator.isBoundary(offset)
                    && (!isOnLetterOrDigit(offset) || !getNextWordEndOnTwoWordBoundary)) {
                return offset;
            } else {
                return mIterator.following(offset);
            }
        } else {
            // Not preceded by a word character: only a word's first character counts.
            if (isOnLetterOrDigit(offset)) {
                return mIterator.following(offset);
            }
        }
        return BreakIterator.DONE;
    }

    /**
     * If {@code offset} is within a group of punctuation as defined
     * by {@link #isPunctuation(int)}, returns the index of the first character
     * of that group, otherwise returns BreakIterator.DONE.
     *
     * @param offset the offset to search from.
     * @throws IllegalArgumentException if offset is not valid.
     */
    @UnsupportedAppUsage
    public int getPunctuationBeginning(int offset) {
        checkOffsetIsValid(offset);
        while (offset != BreakIterator.DONE && !isPunctuationStartBoundary(offset)) {
            offset = prevBoundary(offset);
        }
        // No need to shift offset, prevBoundary handles that.
        return offset;
    }

    /**
     * If {@code offset} is within a group of punctuation as defined
     * by {@link #isPunctuation(int)}, returns the index of the last character
     * of that group plus one, otherwise returns BreakIterator.DONE.
     *
     * @param offset the offset to search from.
     * @throws IllegalArgumentException if offset is not valid.
     */
    @UnsupportedAppUsage
    public int getPunctuationEnd(int offset) {
        checkOffsetIsValid(offset);
        while (offset != BreakIterator.DONE && !isPunctuationEndBoundary(offset)) {
            offset = nextBoundary(offset);
        }
        // No need to shift offset, nextBoundary handles that.
        return offset;
    }

    /**
     * Indicates if the provided offset is after a punctuation character
     * as defined by {@link #isPunctuation(int)}.
     *
     * @param offset the offset to check from.
     * @return Whether the offset is after a punctuation character. Always {@code false} when
     *         the offset is outside the current window or at its very start.
     */
    @UnsupportedAppUsage
    public boolean isAfterPunctuation(int offset) {
        if (mStart < offset && offset <= mEnd) {
            final int codePoint = Character.codePointBefore(mCharSeq, offset);
            return isPunctuation(codePoint);
        }
        return false;
    }

    /**
     * Indicates if the provided offset is at a punctuation character
     * as defined by {@link #isPunctuation(int)}.
     *
     * @param offset the offset to check from.
     * @return Whether the offset is at a punctuation character. Always {@code false} when
     *         the offset is outside the current window or at its very end.
     */
    @UnsupportedAppUsage
    public boolean isOnPunctuation(int offset) {
        if (mStart <= offset && offset < mEnd) {
            final int codePoint = Character.codePointAt(mCharSeq, offset);
            return isPunctuation(codePoint);
        }
        return false;
    }

    /**
     * Indicates if the codepoint is a mid-word-only punctuation.
     *
     * At the moment, this is locale-independent, and includes all the characters in
     * the MidLetter, MidNumLet, and Single_Quote class of Unicode word breaking algorithm (see
     * UAX #29 "Unicode Text Segmentation" at http://unicode.org/reports/tr29/). These are all the
     * characters that according to the rules WB6 and WB7 of UAX #29 prevent word breaks if they are
     * in the middle of a word, but they become word breaks if they happen at the end of a word
     * (according to rule WB999 that breaks word in any place that is not prohibited otherwise).
     *
     * @param locale the locale to consider the codepoint in. Presently ignored.
     * @param codePoint the codepoint to check.
     * @return True if the codepoint is a mid-word punctuation.
     */
    public static boolean isMidWordPunctuation(Locale locale, int codePoint) {
        final int wb = UCharacter.getIntPropertyValue(codePoint, UProperty.WORD_BREAK);
        return (wb == UCharacter.WordBreak.MIDLETTER
                || wb == UCharacter.WordBreak.MIDNUMLET
                || wb == UCharacter.WordBreak.SINGLE_QUOTE);
    }

    /** Returns true if a punctuation character starts at offset and none precedes it. */
    private boolean isPunctuationStartBoundary(int offset) {
        return isOnPunctuation(offset) && !isAfterPunctuation(offset);
    }

    /** Returns true if a punctuation character ends at offset and none follows it. */
    private boolean isPunctuationEndBoundary(int offset) {
        return !isOnPunctuation(offset) && isAfterPunctuation(offset);
    }

    /** Returns true if the code point's general category is one of the punctuation types. */
    private static boolean isPunctuation(int cp) {
        final int type = Character.getType(cp);
        return (type == Character.CONNECTOR_PUNCTUATION
                || type == Character.DASH_PUNCTUATION
                || type == Character.END_PUNCTUATION
                || type == Character.FINAL_QUOTE_PUNCTUATION
                || type == Character.INITIAL_QUOTE_PUNCTUATION
                || type == Character.OTHER_PUNCTUATION
                || type == Character.START_PUNCTUATION);
    }

    /**
     * Returns true if the code point ending right before offset is a letter or digit.
     * Offsets outside (mStart, mEnd] yield false.
     */
    private boolean isAfterLetterOrDigit(int offset) {
        if (mStart < offset && offset <= mEnd) {
            final int codePoint = Character.codePointBefore(mCharSeq, offset);
            return Character.isLetterOrDigit(codePoint);
        }
        return false;
    }

    /**
     * Returns true if the code point starting at offset is a letter or digit.
     * Offsets outside [mStart, mEnd) yield false.
     */
    private boolean isOnLetterOrDigit(int offset) {
        if (mStart <= offset && offset < mEnd) {
            final int codePoint = Character.codePointAt(mCharSeq, offset);
            return Character.isLetterOrDigit(codePoint);
        }
        return false;
    }

    /**
     * Ensures offset lies inside the current window [mStart, mEnd] (both ends inclusive).
     *
     * @throws IllegalArgumentException if offset is outside the window.
     */
    private void checkOffsetIsValid(int offset) {
        if (!(mStart <= offset && offset <= mEnd)) {
            throw new IllegalArgumentException("Invalid offset: " + offset
                    + ". Valid range is [" + mStart + ", " + mEnd + "]");
        }
    }
}