#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.

# file: word_POSIX.txt
#
# Reference Word Break rules for intltest rbbi/RBBIMonkeyTest
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
#       They are expected to change with review and the addition of support for rule tailoring.

type = word; # one of grapheme | word | line | sentence
locale = en_US_POSIX;

# Han is referenced by the Extend definition below, which subtracts it.
Han = [:Han:];

# Character classes built from the Unicode Word_Break property.
CR = [\p{Word_Break = CR}];
LF = [\p{Word_Break = LF}];
Newline = [\p{Word_Break = Newline}];
# Word_Break = Extend with all Han characters removed.
Extend = [\p{Word_Break = Extend}-Han];
ZWJ = [\p{Word_Break = ZWJ}];
Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
Format = [\p{Word_Break = Format}];
Katakana = [\p{Word_Break = Katakana}];
Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
ALetter = [\p{Word_Break = ALetter}];
Single_Quote = [\p{Word_Break = Single_Quote}];
Double_Quote = [\p{Word_Break = Double_Quote}];
# The next three sets deviate from the raw property values:
#   - '.' is removed from MidNumLet and added to MidNum
#   - ':' is removed from MidLetter
# NOTE(review): presumably this is what distinguishes the en_US_POSIX rules
#               from the default word rules - confirm against the production rules.
MidNumLet = [\p{Word_Break = MidNumLet} - [.]];
MidLetter = [\p{Word_Break = MidLetter} - [\:]];
MidNum = [\p{Word_Break = MidNum} [.]];
Numeric = [\p{Word_Break = Numeric}];
ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
WSegSpace = [\p{Word_Break = WSegSpace}];
Extended_Pict = [:ExtPict:];

# Define dictionary, with the effect being that those characters don't appear in test data.
# Component sets that are combined into the "dictionary" set below.

Hiragana = [:Hiragana:];

# NOTE(review): Control is not referenced by any other definition or rule
#               visible in this file - confirm whether it is still needed.
Control = [\p{Grapheme_Cluster_Break = Control}];
HangulSyllable = [\uac00-\ud7a3];
ComplexContext = [:LineBreak = Complex_Context:];
KanaKanji = [Han Hiragana Katakana];
dictionaryCJK = [KanaKanji HangulSyllable];
dictionary = [ComplexContext dictionaryCJK];

# leave dictionary scripts out of ALetter
# (this redefines ALetter in terms of its previous value)

ALetter = [ALetter - dictionary];

# Helper unions used by the rules below.
AHLetter = [ALetter Hebrew_Letter];
MidNumLetQ = [MidNumLet Single_Quote];
ExtFmt = [Extend Format ZWJ];

# Rule notation, as used below: a "÷" marks a break position (see WB999.2,
# "Any ÷ Any"); a rule without "÷" describes a sequence that is not broken
# (see the comments on WB15 and WB999.1). "ExtFmt*" after an element allows
# any run of Extend, Format and ZWJ characters to follow it inside the rule.

WB3: CR LF;
WB3a: (Newline | CR | LF) ÷;
WB3b: . ÷ (Newline | CR | LF); # actually redundant? No other rule combines.
                               # (but needed with UAX treat-as scheme.)
WB3c: ZWJ Extended_Pict;
WB3d: WSegSpace WSegSpace;

WB5: AHLetter ExtFmt* AHLetter;

# includes both WB6 and WB7
WB6: AHLetter ExtFmt* (MidLetter | MidNumLetQ) ExtFmt* AHLetter;

WB7a: Hebrew_Letter ExtFmt* Single_Quote;
WB7b: Hebrew_Letter ExtFmt* Double_Quote ExtFmt* Hebrew_Letter; # includes WB7c

WB8: Numeric ExtFmt* Numeric;
WB9: AHLetter ExtFmt* Numeric;
WB10: Numeric ExtFmt* AHLetter;

WB11: Numeric ExtFmt* (MidNum | MidNumLetQ) ExtFmt* Numeric; # includes WB12
WB13: Katakana ExtFmt* Katakana;

WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) ExtFmt* ExtendNumLet;
WB13b: ExtendNumLet ExtFmt* (AHLetter | Numeric | Katakana);

# WB rule 15 - 17, pairs of Regional Indicators stay unbroken.
#     Interacts with WB3c: WB15 covers a pair that is followed by ZWJ Extended_Pict,
#     WB17 places the break after the pair (and any trailing ExtFmt) otherwise.
WB15: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ Extended_Pict;
WB17: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷;

# Rule WB 999   Any ÷ Any
#     Interacts with WB3c, do not break between ZWJ and (Extended_Pict | EBG).
#     NOTE(review): no set named EBG is defined in this file; the rule below
#                   only uses Extended_Pict - the EBG mention may be stale.
WB999.1: . ExtFmt* ZWJ Extended_Pict;
WB999.2: . ExtFmt* ÷;
