#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.

# file: word.txt
#
# Reference Word Break rules for intltest rbbi/RBBIMonkeyTest
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
#       They are expected to change with review and the addition of support for rule tailoring.
#
# Layout note: every statement ends with ';' and a '#' comment runs to the end
# of its physical line, so each statement has to stay on a line of its own.


type   = word;      # one of grapheme | word | line | sentence
locale = en;

# Han script set. It is subtracted from Extend below and is also an input to
# the KanaKanji set defined later in this file.
Han                = [:Han:];

# One set per Word_Break property value. Names match the property value names.
CR                 = [\p{Word_Break = CR}];
LF                 = [\p{Word_Break = LF}];
Newline            = [\p{Word_Break = Newline}];
# Extend with every Han character removed.
Extend             = [\p{Word_Break = Extend}-Han];
ZWJ                = [\p{Word_Break = ZWJ}];
Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
Format             = [\p{Word_Break = Format}];
Katakana           = [\p{Word_Break = Katakana}];
Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
# ALetter is narrowed further down, where the dictionary characters are removed.
ALetter            = [\p{Word_Break = ALetter}];
Single_Quote       = [\p{Word_Break = Single_Quote}];
Double_Quote       = [\p{Word_Break = Double_Quote}];
MidNumLet          = [\p{Word_Break = MidNumLet}];
MidLetter          = [\p{Word_Break = MidLetter}];
MidNum             = [\p{Word_Break = MidNum}];
Numeric            = [\p{Word_Break = Numeric}];
ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
WSegSpace          = [\p{Word_Break = WSegSpace}];
# Unlike the sets above, this one comes from the ExtPict property, not Word_Break.
Extended_Pict      = [:ExtPict:];

# Define dictionary, with the effect being that those characters don't appear in test data.
# Building blocks for the dictionary set.
Hiragana       = [:Hiragana:];

# NOTE(review): Control is not referenced by any other statement visible in this file.
Control        = [\p{Grapheme_Cluster_Break = Control}];
HangulSyllable = [\uac00-\ud7a3];
ComplexContext = [:LineBreak = Complex_Context:];
KanaKanji      = [Han Hiragana Katakana];
dictionaryCJK  = [KanaKanji HangulSyllable];
dictionary     = [ComplexContext dictionaryCJK];

# leave dictionary scripts out of ALetter
# (this redefines ALetter in terms of its earlier value, so it must come after
#  both the original ALetter definition and the dictionary set)

ALetter        = [ALetter - dictionary];

# Helper unions shared by the rules below.
AHLetter       = [ALetter Hebrew_Letter];
MidNumLetQ     = [MidNumLet Single_Quote];
ExtFmt         = [Extend Format ZWJ];

# Rules, named after the UAX #29 word boundary rules they model.
# '÷' marks a break position; a rule without '÷' presumably keeps the matched
# sequence unbroken (TODO confirm against the monkey test's rule parser).
# Statement order is kept exactly as in the original file.

WB3:     CR LF;
WB3a:    (Newline | CR | LF) ÷;
WB3b:    . ÷ (Newline | CR | LF);          # actually redundant? No other rule combines.
                                           # (but needed with UAX treat-as scheme.)
WB3c:    ZWJ Extended_Pict;
WB3d:    WSegSpace WSegSpace;

WB5:     AHLetter ExtFmt* AHLetter;

# includes both WB6 and WB7
WB6:     AHLetter ExtFmt* (MidLetter | MidNumLetQ) ExtFmt* AHLetter;

WB7a:    Hebrew_Letter ExtFmt* Single_Quote;
WB7b:    Hebrew_Letter ExtFmt* Double_Quote ExtFmt* Hebrew_Letter;   # Include WB7c

WB8:     Numeric ExtFmt* Numeric;
WB9:     AHLetter ExtFmt* Numeric;
WB10:    Numeric ExtFmt* AHLetter;

WB11:    Numeric ExtFmt* (MidNum | MidNumLetQ) ExtFmt* Numeric;     # includes WB12
WB13:    Katakana ExtFmt* Katakana;

WB13a:   (AHLetter | Numeric | Katakana | ExtendNumLet) ExtFmt* ExtendNumLet;
WB13b:   ExtendNumLet ExtFmt* (AHLetter | Numeric | Katakana);

# WB rule 15 - 17, pairs of Regional Indicators stay unbroken.
#                  Interacts with WB3c.
WB15:    Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ Extended_Pict;
WB17:    Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷;

# Rule WB 999   Any ÷ Any
#    Interacts with WB3c, do not break between ZWJ and (Extended_Pict | EBG).
#    NOTE(review): no EBG set is defined in the visible file and the rule below
#    only names Extended_Pict; the EBG mention looks stale - confirm.
WB999.1: . ExtFmt* ZWJ Extended_Pict;
WB999.2: . ExtFmt* ÷;