// Copyright 2013 The Flutter Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
5part of engine;
6
/// The direction in which a word boundary search walks through the text.
enum _FindBreakDirection {
  /// Search towards the end of the text (increasing indices).
  forward,

  /// Search towards the start of the text (decreasing indices).
  backward,
}
14
/// [WordBreaker] exposes static methods to identify word boundaries.
///
/// Positions are UTF-16 code unit offsets into the text: position `i` sits
/// between the code units at `i - 1` and `i`. Boundaries are decided by
/// [_isBreak], which follows the Unicode word boundary rules:
/// http://unicode.org/reports/tr29/#Word_Boundary_Rules
abstract class WordBreaker {
  /// Returns the first word boundary in [text] found when walking forward
  /// from [index].
  ///
  /// The search starts at `index + 1`. `text.length` is returned when no
  /// earlier boundary exists or when [index] is already at the end of text.
  /// An [index] outside of `0..text.length` yields the nearest end of that
  /// range.
  static int nextBreakIndex(String text, int index) =>
      _findBreakIndex(_FindBreakDirection.forward, text, index);

  /// Returns the first word boundary in [text] found when walking backward
  /// from [index].
  ///
  /// The search starts at `index - 1`. `0` is returned when no later boundary
  /// exists or when [index] is already at the start of text. An [index]
  /// outside of `0..text.length` yields the nearest end of that range.
  static int prevBreakIndex(String text, int index) =>
      _findBreakIndex(_FindBreakDirection.backward, text, index);

  /// Walks from [index] in the given [direction], one code unit at a time,
  /// and returns the first position for which [_isBreak] is true.
  ///
  /// The returned value is always within `0..text.length`.
  static int _findBreakIndex(
    _FindBreakDirection direction,
    String text,
    int index,
  ) {
    // An out-of-range starting point has nothing around it to inspect. Snap
    // it to the closest text edge, which is always a boundary (WB1 and WB2),
    // instead of handing an invalid offset back to the caller.
    if (index < 0) {
      return 0;
    }
    if (index > text.length) {
      return text.length;
    }

    // [min] and [max] delimit the starting positions from which one more
    // step in [direction] still lands inside `0..text.length`.
    int step, min, max;
    if (direction == _FindBreakDirection.forward) {
      step = 1;
      min = 0;
      max = text.length - 1;
    } else {
      step = -1;
      min = 1;
      max = text.length;
    }

    int i = index;
    while (i >= min && i <= max) {
      i += step;
      if (_isBreak(text, i)) {
        break;
      }
    }
    return i;
  }

  /// Find out if there's a word break between `index - 1` and `index`.
  ///
  /// The rules are evaluated in order and the first one that matches decides
  /// the result, so the order of the checks below must be preserved.
  ///
  /// http://unicode.org/reports/tr29/#Word_Boundary_Rules
  static bool _isBreak(String text, int index) {
    // Break at the start and end of text.
    // WB1: sot ÷ Any
    // WB2: Any ÷ eot
    if (index <= 0 || index >= text.length) {
      return true;
    }

    // Do not break inside surrogate pair.
    // NOTE(review): only the code unit on the left is tested, and the test
    // matches high and low surrogates alike, so the position right after a
    // complete surrogate pair is also reported as "no break". Confirm whether
    // that is intended before changing it.
    if (_isUtf16Surrogate(text.codeUnitAt(index - 1))) {
      return false;
    }

    final CharProperty immediateRight = getCharProperty(text, index);
    CharProperty immediateLeft = getCharProperty(text, index - 1);

    // Do not break within CRLF.
    // WB3: CR × LF
    if (immediateLeft == CharProperty.CR && immediateRight == CharProperty.LF) {
      return false;
    }

    // Otherwise break before and after Newlines (including CR and LF)
    // WB3a: (Newline | CR | LF) ÷
    if (_oneOf(
      immediateLeft,
      CharProperty.Newline,
      CharProperty.CR,
      CharProperty.LF,
    )) {
      return true;
    }

    // WB3b: ÷ (Newline | CR | LF)
    if (_oneOf(
      immediateRight,
      CharProperty.Newline,
      CharProperty.CR,
      CharProperty.LF,
    )) {
      return true;
    }

    // WB3c: ZWJ × \p{Extended_Pictographic}
    // TODO(flutter_web): What's the right way to implement this?

    // Keep horizontal whitespace together.
    // WB3d: WSegSpace × WSegSpace
    if (immediateLeft == CharProperty.WSegSpace &&
        immediateRight == CharProperty.WSegSpace) {
      return false;
    }

    // Ignore Format and Extend characters, except after sot, CR, LF, and
    // Newline.
    // WB4: X (Extend | Format | ZWJ)* → X
    if (_isExtendFormatOrZwj(immediateRight)) {
      // The Extend|Format|ZWJ character is to the right, so it is attached
      // to a character to the left, don't split here
      return false;
    }

    // We've reached the end of an Extend|Format|ZWJ sequence, collapse it.
    // [l] counts how many such characters were skipped; afterwards
    // [immediateLeft] holds the property of the character at `index - l - 1`.
    int l = 0;
    while (_isExtendFormatOrZwj(immediateLeft)) {
      l++;
      if (index - l - 1 < 0) {
        // Reached the beginning of text.
        return true;
      }
      immediateLeft = getCharProperty(text, index - l - 1);
    }

    // Do not break between most letters.
    // WB5: (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
    if (_isAHLetter(immediateLeft) && _isAHLetter(immediateRight)) {
      return false;
    }

    // Some tests beyond this point require more context. We need to get that
    // context while also respecting rule WB4. So ignore Format, Extend and ZWJ.
    //
    // NOTE(review): `index + r` and `index - l - 1` below can step outside of
    // the text; this relies on getCharProperty tolerating such indices.
    // Confirm against its implementation.

    // Skip all Format, Extend and ZWJ to the right.
    int r = 0;
    CharProperty nextRight;
    do {
      r++;
      nextRight = getCharProperty(text, index + r);
    } while (_isExtendFormatOrZwj(nextRight));

    // Skip all Format, Extend and ZWJ to the left. [l] continues from the
    // collapse above, so the scan starts just before [immediateLeft].
    CharProperty nextLeft;
    do {
      l++;
      nextLeft = getCharProperty(text, index - l - 1);
    } while (_isExtendFormatOrZwj(nextLeft));

    // Do not break letters across certain punctuation.
    // WB6: (AHLetter) × (MidLetter | MidNumLet | Single_Quote) (AHLetter)
    if (_isAHLetter(immediateLeft) &&
        _oneOf(
          immediateRight,
          CharProperty.MidLetter,
          CharProperty.MidNumLet,
          CharProperty.SingleQuote,
        ) &&
        _isAHLetter(nextRight)) {
      return false;
    }

    // WB7: (AHLetter) (MidLetter | MidNumLet | Single_Quote) × (AHLetter)
    if (_isAHLetter(nextLeft) &&
        _oneOf(
          immediateLeft,
          CharProperty.MidLetter,
          CharProperty.MidNumLet,
          CharProperty.SingleQuote,
        ) &&
        _isAHLetter(immediateRight)) {
      return false;
    }

    // WB7a: Hebrew_Letter × Single_Quote
    if (immediateLeft == CharProperty.HebrewLetter &&
        immediateRight == CharProperty.SingleQuote) {
      return false;
    }

    // WB7b: Hebrew_Letter × Double_Quote Hebrew_Letter
    if (immediateLeft == CharProperty.HebrewLetter &&
        immediateRight == CharProperty.DoubleQuote &&
        nextRight == CharProperty.HebrewLetter) {
      return false;
    }

    // WB7c: Hebrew_Letter Double_Quote × Hebrew_Letter
    if (nextLeft == CharProperty.HebrewLetter &&
        immediateLeft == CharProperty.DoubleQuote &&
        immediateRight == CharProperty.HebrewLetter) {
      return false;
    }

    // Do not break within sequences of digits, or digits adjacent to letters
    // (“3a”, or “A3”).
    // WB8: Numeric × Numeric
    if (immediateLeft == CharProperty.Numeric &&
        immediateRight == CharProperty.Numeric) {
      return false;
    }

    // WB9: AHLetter × Numeric
    if (_isAHLetter(immediateLeft) && immediateRight == CharProperty.Numeric) {
      return false;
    }

    // WB10: Numeric × AHLetter
    if (immediateLeft == CharProperty.Numeric && _isAHLetter(immediateRight)) {
      return false;
    }

    // Do not break within sequences, such as “3.2” or “3,456.789”.
    // WB11: Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
    if (nextLeft == CharProperty.Numeric &&
        _oneOf(
          immediateLeft,
          CharProperty.MidNum,
          CharProperty.MidNumLet,
          CharProperty.SingleQuote,
        ) &&
        immediateRight == CharProperty.Numeric) {
      return false;
    }

    // WB12: Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
    if (immediateLeft == CharProperty.Numeric &&
        _oneOf(
          immediateRight,
          CharProperty.MidNum,
          CharProperty.MidNumLet,
          CharProperty.SingleQuote,
        ) &&
        nextRight == CharProperty.Numeric) {
      return false;
    }

    // Do not break between Katakana.
    // WB13: Katakana × Katakana
    if (immediateLeft == CharProperty.Katakana &&
        immediateRight == CharProperty.Katakana) {
      return false;
    }

    // Do not break from extenders.
    // WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
    if (_oneOf(
          immediateLeft,
          CharProperty.ALetter,
          CharProperty.HebrewLetter,
          CharProperty.Numeric,
          CharProperty.Katakana,
          CharProperty.ExtendNumLet,
        ) &&
        immediateRight == CharProperty.ExtendNumLet) {
      return false;
    }

    // WB13b: ExtendNumLet × (AHLetter | Numeric | Katakana)
    if (immediateLeft == CharProperty.ExtendNumLet &&
        _oneOf(
          immediateRight,
          CharProperty.ALetter,
          CharProperty.HebrewLetter,
          CharProperty.Numeric,
          CharProperty.Katakana,
        )) {
      return false;
    }

    // Do not break within emoji flag sequences. That is, do not break between
    // regional indicator (RI) symbols if there is an odd number of RI
    // characters before the break point.
    // WB15: sot (RI RI)* RI × RI
    // TODO(mdebbar): implement this.

    // WB16: [^RI] (RI RI)* RI × RI
    // TODO(mdebbar): implement this.

    // Otherwise, break everywhere (including around ideographs).
    // WB999: Any ÷ Any
    return true;
  }

  /// Whether [value] is a UTF-16 surrogate code unit, high or low
  /// (`0xD800` to `0xDFFF`).
  static bool _isUtf16Surrogate(int value) {
    // In Dart `&` binds tighter than `==`, so this masks first, then compares.
    return value & 0xF800 == 0xD800;
  }

  /// Whether [value] equals any of the given choices.
  ///
  /// [choice3], [choice4] and [choice5] may be omitted. An omitted choice is
  /// null and is never matched, even when [value] itself is null.
  static bool _oneOf(
    CharProperty value,
    CharProperty choice1,
    CharProperty choice2, [
    CharProperty choice3,
    CharProperty choice4,
    CharProperty choice5,
  ]) {
    return value == choice1 ||
        value == choice2 ||
        (choice3 != null && value == choice3) ||
        (choice4 != null && value == choice4) ||
        (choice5 != null && value == choice5);
  }

  /// Whether [property] is one of the properties that rule WB4 ignores:
  /// Extend, Format or ZWJ.
  static bool _isExtendFormatOrZwj(CharProperty property) {
    return _oneOf(
      property,
      CharProperty.Extend,
      CharProperty.Format,
      CharProperty.ZWJ,
    );
  }

  /// Whether [property] is ALetter or Hebrew_Letter ("AHLetter" in the rules).
  static bool _isAHLetter(CharProperty property) {
    return _oneOf(property, CharProperty.ALetter, CharProperty.HebrewLetter);
  }
}
337