• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#
2#   Copyright (C) 2016 and later: Unicode, Inc. and others.
3#   License & terms of use: http://www.unicode.org/copyright.html
4#   Copyright (C) 2002-2016, International Business Machines Corporation and others.
5#       All Rights Reserved.
6#
7#   file:  char.txt
8#
9#   ICU Character Break Rules, also known as Grapheme Cluster Boundaries
10#      See Unicode Standard Annex #29.
11#      These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
12#      Plus revisions to rule GB 11 from http://unicode.org/cldr/trac/ticket/10088
13#      Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
14
15!!quoted_literals_only;
16
17#
18#  Character Class Definitions.
19#  Each set holds the code points with the named Grapheme_Cluster_Break property value.
20$CR          = [\p{Grapheme_Cluster_Break = CR}];
21$LF          = [\p{Grapheme_Cluster_Break = LF}];
22$Control     = [[\p{Grapheme_Cluster_Break = Control}]];
23# TODO: Enable Virama & LinkingConsonant definitions once rule builder allows empty sets.
24#$Virama      = [[\p{Grapheme_Cluster_Break = Virama}]];
25#$LinkingConsonant = [[\p{Grapheme_Cluster_Break = LinkingConsonant}]];
26$Extend      = [[\p{Grapheme_Cluster_Break = Extend}]];
27$ZWJ         = [\p{Grapheme_Cluster_Break = ZWJ}];
28$Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
29$Prepend     = [\p{Grapheme_Cluster_Break = Prepend}];
30$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];
31
32#
33# Korean Syllable Definitions
34#   Sets for the Grapheme_Cluster_Break values L, V, T, LV and LVT.
35$L           = [\p{Grapheme_Cluster_Break = L}];
36$V           = [\p{Grapheme_Cluster_Break = V}];
37$T           = [\p{Grapheme_Cluster_Break = T}];
38
39$LV          = [\p{Grapheme_Cluster_Break = LV}];
40$LVT         = [\p{Grapheme_Cluster_Break = LVT}];
41
42# Emoji definitions
43#   $Extended_Pict is the set produced by the [:ExtPict:] property expression; it is used by the GB 11 rule.
44$Extended_Pict = [:ExtPict:];
45
46## -------------------------------------------------
47!!chain;  # rule option: rule chaining
48!!lookAheadHardBreak;  # NOTE(review): presumably makes the '/' position of a look-ahead rule a hard break (see GB 12-13 note); confirm in ICU docs
49# Keep a CR LF pair together (corresponds to GB 3 of UAX #29).
50$CR $LF;
51# Hangul syllable sequences (correspond to GB 6, GB 7 and GB 8 of UAX #29).
52$L ($L | $V | $LV | $LVT);
53($LV | $V) ($V | $T);
54($LVT | $T) $T;
55
56# GB 9  Do not break before Extend or ZWJ, unless the preceding character is Control, CR or LF.
57[^$Control $CR $LF] ($Extend | $ZWJ);
58
59# GB 9a (only for extended grapheme clusters)  Do not break before SpacingMark, with the same exclusion.
60[^$Control $CR $LF] $SpacingMark;
61
62# GB 9b  Do not break after Prepend, unless the following character is Control, CR or LF.
63$Prepend [^$Control $CR $LF];
64
65# GB 11 Do not break within emoji modifier sequences or emoji zwj sequences.
66$Extended_Pict $Extend* $ZWJ $Extended_Pict;
67
68# GB 12-13. Keep pairs of regional indicators together.
69#           Note that hard break '/' rule triggers only if there are three or more initial RIs.
70#           NOTE(review): the leading '^' presumably prevents chaining into these rules from a preceding match; confirm against the ICU rule syntax docs.
71^$Prepend* $Regional_Indicator $Regional_Indicator / $Regional_Indicator;
72^$Prepend* $Regional_Indicator $Regional_Indicator;
73
74# GB 999 Match a single code point if no other rule applies.
75.;
76
77