#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (C) 2002-2016, International Business Machines Corporation and others.
# All Rights Reserved.
#
#   file:  char.txt
#
#   ICU Character Break Rules, also known as Grapheme Cluster Boundaries
#      See Unicode Standard Annex #29.
#      These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
#      Plus revisions to rule GB 11 from http://unicode.org/cldr/trac/ticket/10088
#      Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
#
#   Layout of this file:
#      1. Rule-builder options.
#      2. Character class ($variable) definitions, one per Grapheme_Cluster_Break value used below.
#      3. The break rules themselves. Each rule lists a sequence that must NOT be split;
#         the final catch-all rule matches any single code point.

# NOTE(review): per the ICU break-rule syntax this option requires literal characters
#               in rules to be quoted - confirm against the rule-builder documentation.
!!quoted_literals_only;

#
#  Character Class Definitions.
#
$CR                 = [\p{Grapheme_Cluster_Break = CR}];
$LF                 = [\p{Grapheme_Cluster_Break = LF}];
$Control            = [[\p{Grapheme_Cluster_Break = Control}]];
# TODO: Enable Virama & LinkingConsonant definitions once rule builder allows empty sets.
#$Virama            = [[\p{Grapheme_Cluster_Break = Virama}]];
#$LinkingConsonant  = [[\p{Grapheme_Cluster_Break = LinkingConsonant}]];
$Extend             = [[\p{Grapheme_Cluster_Break = Extend}]];
$ZWJ                = [\p{Grapheme_Cluster_Break = ZWJ}];
$Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
$Prepend            = [\p{Grapheme_Cluster_Break = Prepend}];
$SpacingMark        = [\p{Grapheme_Cluster_Break = SpacingMark}];

#
#  Korean Syllable Definitions
#
$L                  = [\p{Grapheme_Cluster_Break = L}];
$V                  = [\p{Grapheme_Cluster_Break = V}];
$T                  = [\p{Grapheme_Cluster_Break = T}];

$LV                 = [\p{Grapheme_Cluster_Break = LV}];
$LVT                = [\p{Grapheme_Cluster_Break = LVT}];

#  Emoji definitions

$Extended_Pict      = [:ExtPict:];

## -------------------------------------------------
#
#  Break rules.
#
#  NOTE(review): per the ICU break-rule syntax, !!chain enables rule chaining, so the
#                end of one rule's match can serve as the start of another's - confirm.
#  NOTE(review): !!lookAheadHardBreak presumably makes a '/' look-ahead match force a
#                boundary right away (see the GB 12-13 note below) - confirm against the
#                rule-builder documentation.
!!chain;
!!lookAheadHardBreak;

# GB 3  Keep CR LF together.
$CR $LF;

# GB 6-8  Do not break Hangul syllable sequences.
$L ($L | $V | $LV | $LVT);
($LV | $V) ($V | $T);
($LVT | $T) $T;

# GB 9  Do not break before extending characters or ZWJ.
[^$Control $CR $LF] ($Extend | $ZWJ);

# GB 9a (only for extended grapheme clusters)  Do not break before a SpacingMark.
[^$Control $CR $LF] $SpacingMark;

# GB 9b  Do not break after Prepend characters.
$Prepend [^$Control $CR $LF];

# GB 11 Do not break within emoji modifier sequences or emoji zwj sequences.
$Extended_Pict $Extend* $ZWJ $Extended_Pict;

# GB 12-13. Keep pairs of regional indicators together.
#      Note that the hard break '/' rule triggers only if there are three or more initial RIs;
#      an isolated pair is matched by the second rule.
#      NOTE(review): the leading '^' presumably prevents chaining into these rules, so that
#                    RIs are always paired starting from the first one - confirm.

^$Prepend* $Regional_Indicator $Regional_Indicator / $Regional_Indicator;
^$Prepend* $Regional_Indicator $Regional_Indicator;

# GB 999 Match a single code point if no other rule applies.
.;