// Copyright 2013 The Flutter Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

part of engine;

/// Direction in which [WordBreaker] scans for the nearest word boundary.
enum _FindBreakDirection {
  /// Scan towards the end of the text.
  forward,

  /// Scan towards the start of the text.
  backward,
}

/// [WordBreaker] exposes static methods to identify word boundaries.
///
/// Boundaries are decided by [_isBreak], which follows the word boundary rules
/// of Unicode Standard Annex #29:
/// http://unicode.org/reports/tr29/#Word_Boundary_Rules
abstract class WordBreaker {
  /// Walks forward from [index] and returns the first word boundary found in
  /// [text].
  ///
  /// When [index] is outside `0..text.length - 1` no scan happens and [index]
  /// is returned unchanged.
  static int nextBreakIndex(String text, int index) {
    return _findBreakIndex(_FindBreakDirection.forward, text, index);
  }

  /// Walks backward from [index] and returns the first word boundary found in
  /// [text].
  ///
  /// When [index] is outside `1..text.length` no scan happens and [index] is
  /// returned unchanged.
  static int prevBreakIndex(String text, int index) {
    return _findBreakIndex(_FindBreakDirection.backward, text, index);
  }

  /// Moves one position at a time from [index] in the given [direction] and
  /// returns the first position that [_isBreak] reports as a boundary.
  ///
  /// The starting position itself is never tested: the cursor is advanced
  /// before each check.
  static int _findBreakIndex(
    _FindBreakDirection direction,
    String text,
    int index,
  ) {
    final bool scanForward = direction == _FindBreakDirection.forward;
    final int step = scanForward ? 1 : -1;
    // Range of starting positions from which one more step is still allowed.
    final int lowest = scanForward ? 0 : 1;
    final int highest = scanForward ? text.length - 1 : text.length;

    int cursor = index;
    while (lowest <= cursor && cursor <= highest) {
      cursor += step;
      if (_isBreak(text, cursor)) {
        return cursor;
      }
    }
    return cursor;
  }

  /// Find out if there's a word break between [index - 1] and [index].
  /// http://unicode.org/reports/tr29/#Word_Boundary_Rules
  ///
  /// The rules are evaluated in the order of the specification; the first one
  /// that matches decides the result.
  static bool _isBreak(String text, int index) {
    // Break at the start and end of text.
    // WB1: sot ÷ Any
    // WB2: Any ÷ eot
    if (index <= 0 || index >= text.length) {
      return true;
    }

    // Do not break inside surrogate pair.
    // NOTE(review): this matches any surrogate code unit at [index - 1], so it
    // also fires when that unit is the trailing half of a pair, i.e. for the
    // position right after a complete pair. Confirm that this is intended.
    if (_isUtf16Surrogate(text.codeUnitAt(index - 1))) {
      return false;
    }

    final CharProperty right = getCharProperty(text, index);
    CharProperty left = getCharProperty(text, index - 1);

    // Do not break within CRLF.
    // WB3: CR × LF
    if (left == CharProperty.CR && right == CharProperty.LF) {
      return false;
    }

    // Otherwise break before and after Newlines (including CR and LF)
    // WB3a: (Newline | CR | LF) ÷
    // WB3b: ÷ (Newline | CR | LF)
    if (_isNewline(left) || _isNewline(right)) {
      return true;
    }

    // WB3c: ZWJ × \p{Extended_Pictographic}
    // TODO(flutter_web): What's the right way to implement this?

    // Keep horizontal whitespace together.
    // WB3d: WSegSpace × WSegSpace
    if (left == CharProperty.WSegSpace && right == CharProperty.WSegSpace) {
      return false;
    }

    // Ignore Format and Extend characters, except after sot, CR, LF, and
    // Newline.
    // WB4: X (Extend | Format | ZWJ)* → X
    if (_isIgnorable(right)) {
      // An Extend|Format|ZWJ character on the right belongs to whatever is on
      // its left, so this is not a boundary.
      return false;
    }

    // The boundary sits at the end of an Extend|Format|ZWJ run: walk left to
    // the character that the run is attached to and use its property instead.
    int leftPos = index - 1;
    while (_isIgnorable(left)) {
      leftPos--;
      if (leftPos < 0) {
        // Reached the beginning of text.
        return true;
      }
      left = getCharProperty(text, leftPos);
    }

    final bool leftIsLetter = _isAHLetter(left);
    final bool rightIsLetter = _isAHLetter(right);
    final bool leftIsNumeric = left == CharProperty.Numeric;
    final bool rightIsNumeric = right == CharProperty.Numeric;

    // Do not break between most letters.
    // WB5: (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
    if (leftIsLetter && rightIsLetter) {
      return false;
    }

    // Some tests beyond this point require more context. We need to get that
    // context while also respecting rule WB4. So ignore Format, Extend and ZWJ.

    // First character after [index] that is not Format, Extend or ZWJ.
    // NOTE(review): the scan may run past the end of [text]; it presumably
    // relies on getCharProperty returning a non-ignorable value there —
    // confirm against its implementation.
    int rightPos = index + 1;
    CharProperty rightContext = getCharProperty(text, rightPos);
    while (_isIgnorable(rightContext)) {
      rightPos++;
      rightContext = getCharProperty(text, rightPos);
    }

    // First character before the (collapsed) left character that is not
    // Format, Extend or ZWJ. The same out-of-range caveat applies on this side.
    int leftContextPos = leftPos - 1;
    CharProperty leftContext = getCharProperty(text, leftContextPos);
    while (_isIgnorable(leftContext)) {
      leftContextPos--;
      leftContext = getCharProperty(text, leftContextPos);
    }

    // Do not break letters across certain punctuation.
    // WB6: (AHLetter) × (MidLetter | MidNumLet | Single_Quote) (AHLetter)
    if (leftIsLetter &&
        _isMidLetterLike(right) &&
        _isAHLetter(rightContext)) {
      return false;
    }

    // WB7: (AHLetter) (MidLetter | MidNumLet | Single_Quote) × (AHLetter)
    if (_isAHLetter(leftContext) &&
        _isMidLetterLike(left) &&
        rightIsLetter) {
      return false;
    }

    // WB7a: Hebrew_Letter × Single_Quote
    if (left == CharProperty.HebrewLetter &&
        right == CharProperty.SingleQuote) {
      return false;
    }

    // WB7b: Hebrew_Letter × Double_Quote Hebrew_Letter
    if (left == CharProperty.HebrewLetter &&
        right == CharProperty.DoubleQuote &&
        rightContext == CharProperty.HebrewLetter) {
      return false;
    }

    // WB7c: Hebrew_Letter Double_Quote × Hebrew_Letter
    if (leftContext == CharProperty.HebrewLetter &&
        left == CharProperty.DoubleQuote &&
        right == CharProperty.HebrewLetter) {
      return false;
    }

    // Do not break within sequences of digits, or digits adjacent to letters
    // (“3a”, or “A3”).
    // WB8: Numeric × Numeric
    if (leftIsNumeric && rightIsNumeric) {
      return false;
    }

    // WB9: AHLetter × Numeric
    if (leftIsLetter && rightIsNumeric) {
      return false;
    }

    // WB10: Numeric × AHLetter
    if (leftIsNumeric && rightIsLetter) {
      return false;
    }

    // Do not break within sequences, such as “3.2” or “3,456.789”.
    // WB11: Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
    if (leftContext == CharProperty.Numeric &&
        _isMidNumLike(left) &&
        rightIsNumeric) {
      return false;
    }

    // WB12: Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
    if (leftIsNumeric &&
        _isMidNumLike(right) &&
        rightContext == CharProperty.Numeric) {
      return false;
    }

    // Do not break between Katakana.
    // WB13: Katakana × Katakana
    if (left == CharProperty.Katakana && right == CharProperty.Katakana) {
      return false;
    }

    // Do not break from extenders.
    // WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
    final bool leftTakesExtender = leftIsLetter ||
        leftIsNumeric ||
        left == CharProperty.Katakana ||
        left == CharProperty.ExtendNumLet;
    if (leftTakesExtender && right == CharProperty.ExtendNumLet) {
      return false;
    }

    // WB13b: ExtendNumLet × (AHLetter | Numeric | Katakana)
    final bool rightFollowsExtender =
        rightIsLetter || rightIsNumeric || right == CharProperty.Katakana;
    if (left == CharProperty.ExtendNumLet && rightFollowsExtender) {
      return false;
    }

    // Do not break within emoji flag sequences. That is, do not break between
    // regional indicator (RI) symbols if there is an odd number of RI
    // characters before the break point.
    // WB15: sot (RI RI)* RI × RI
    // TODO(mdebbar): implement this.

    // WB16: [^RI] (RI RI)* RI × RI
    // TODO(mdebbar): implement this.

    // Otherwise, break everywhere (including around ideographs).
    // WB999: Any ÷ Any
    return true;
  }

  /// Whether [value] is a UTF-16 surrogate code unit, i.e. lies in the range
  /// 0xD800..0xDFFF (leading or trailing half).
  static bool _isUtf16Surrogate(int value) {
    return (value & 0xF800) == 0xD800;
  }

  /// Whether [value] equals any of the supplied choices.
  ///
  /// Optional choices that were not supplied are skipped.
  static bool _oneOf(
    CharProperty value,
    CharProperty choice1,
    CharProperty choice2, [
    CharProperty choice3,
    CharProperty choice4,
    CharProperty choice5,
  ]) {
    return value == choice1 ||
        value == choice2 ||
        (choice3 != null && value == choice3) ||
        (choice4 != null && value == choice4) ||
        (choice5 != null && value == choice5);
  }

  /// Whether [property] is `ALetter` or `Hebrew_Letter` ("AHLetter" in the
  /// specification).
  static bool _isAHLetter(CharProperty property) {
    return _oneOf(property, CharProperty.ALetter, CharProperty.HebrewLetter);
  }

  /// Whether [property] is `Newline`, `CR` or `LF`.
  static bool _isNewline(CharProperty property) {
    return _oneOf(
      property,
      CharProperty.Newline,
      CharProperty.CR,
      CharProperty.LF,
    );
  }

  /// Whether [property] is one of the properties skipped by rule WB4:
  /// `Extend`, `Format` or `ZWJ`.
  static bool _isIgnorable(CharProperty property) {
    return _oneOf(
      property,
      CharProperty.Extend,
      CharProperty.Format,
      CharProperty.ZWJ,
    );
  }

  /// Whether [property] is `MidLetter`, `MidNumLet` or `Single_Quote`.
  static bool _isMidLetterLike(CharProperty property) {
    return _oneOf(
      property,
      CharProperty.MidLetter,
      CharProperty.MidNumLet,
      CharProperty.SingleQuote,
    );
  }

  /// Whether [property] is `MidNum`, `MidNumLet` or `Single_Quote`.
  static bool _isMidNumLike(CharProperty property) {
    return _oneOf(
      property,
      CharProperty.MidNum,
      CharProperty.MidNumLet,
      CharProperty.SingleQuote,
    );
  }
}