• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#
2# Copyright (C) 2016 and later: Unicode, Inc. and others.
3# License & terms of use: http://www.unicode.org/copyright.html
4# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
5
6# file: word_POSIX.txt
7#
8# Reference Word Break rules for intltest rbbi/RBBIMonkeyTest
9#
10# Note: Rule syntax and the monkey test itself are still a work in progress.
11#       They are expected to change with review and the addition of support for rule tailoring.
12
13type = word;      # rule set type; one of grapheme | word | line | sentence
14locale = en_US_POSIX;   # locale setting for this rule set
15
16Han            = [:Han:];   # defined first: Extend and KanaKanji below both refer to it
17
18CR                 = [\p{Word_Break = CR}];
19LF                 = [\p{Word_Break = LF}];
20Newline            = [\p{Word_Break = Newline}];
21Extend             = [\p{Word_Break = Extend}-Han];   # Word_Break=Extend with the Han set removed
22ZWJ                = [\p{Word_Break = ZWJ}];
23Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
24Format             = [\p{Word_Break = Format}];
25Katakana           = [\p{Word_Break = Katakana}];
26Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
27ALetter            = [\p{Word_Break = ALetter}];   # redefined further down to exclude the dictionary set
28Single_Quote       = [\p{Word_Break = Single_Quote}];
29Double_Quote       = [\p{Word_Break = Double_Quote}];
30MidNumLet          = [\p{Word_Break = MidNumLet} - [.]];   # '.' is removed here and added to MidNum below
31MidLetter          = [\p{Word_Break = MidLetter} - [\:]];  # ':' is removed from the property set
32MidNum             = [\p{Word_Break = MidNum} [.]];        # property set plus '.'; NOTE(review): presumably the POSIX-specific difference - confirm
33Numeric            = [\p{Word_Break = Numeric}];
34ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
35WSegSpace          = [\p{Word_Break = WSegSpace}];
36Extended_Pict      = [:ExtPict:];   # taken from the ExtPict property, not from a Word_Break value
37
38# Define the dictionary set, with the effect being that those characters don't appear in test data.
39
40Hiragana       = [:Hiragana:];
41
42Control        = [\p{Grapheme_Cluster_Break = Control}];   # not referenced by any other set or rule visible in this file
43HangulSyllable = [\uac00-\ud7a3];                           # code point range U+AC00..U+D7A3
44ComplexContext = [:LineBreak = Complex_Context:];
45KanaKanji      = [Han Hiragana Katakana];                   # union of the three sets
46dictionaryCJK  = [KanaKanji HangulSyllable];
47dictionary     = [ComplexContext dictionaryCJK];
48
49# Leave the dictionary characters out of ALetter.
50
51ALetter        = [ALetter - dictionary];   # redefinition: the earlier ALetter minus everything in dictionary
52
53AHLetter       = [ALetter  Hebrew_Letter];   # uses the reduced ALetter defined just above
54MidNumLetQ     = [MidNumLet  Single_Quote];  # MidNumLet plus Single_Quote
55ExtFmt         = [Extend Format ZWJ];        # appears as ExtFmt* between elements of the rules below
56
57WB3:   CR LF;   # NOTE(review): a rule with no ÷ presumably means "no break inside the match" - confirm against the monkey test
58WB3a:  (Newline | CR | LF) ÷;
59WB3b:  . ÷ (Newline | CR | LF);   # Possibly redundant: no other rule combines with these.
60                                  # (It is needed, however, with the UAX treat-as scheme.)
61WB3c:   ZWJ Extended_Pict;
62WB3d:   WSegSpace WSegSpace;
63
64WB5:    AHLetter ExtFmt* AHLetter;   # letter, optional Extend/Format/ZWJ run, letter
65
66# This single rule covers both WB6 and WB7.
67WB6:    AHLetter ExtFmt* (MidLetter | MidNumLetQ) ExtFmt*  AHLetter;
68
69WB7a:   Hebrew_Letter ExtFmt* Single_Quote;
70WB7b:   Hebrew_Letter ExtFmt* Double_Quote ExtFmt* Hebrew_Letter;   # also covers WB7c
71
72WB8:    Numeric ExtFmt* Numeric;    # digit, optional ExtFmt run, digit
73WB9:    AHLetter ExtFmt* Numeric;   # letter then digit
74WB10:   Numeric ExtFmt* AHLetter;   # digit then letter
75
76WB11:   Numeric ExtFmt* (MidNum | MidNumLetQ) ExtFmt* Numeric;    # also covers WB12
77WB13:   Katakana ExtFmt* Katakana;
78
79WB13a:  (AHLetter | Numeric | Katakana | ExtendNumLet) ExtFmt* ExtendNumLet;   # ExtendNumLet on the right-hand side
80WB13b:  ExtendNumLet ExtFmt* (AHLetter | Numeric | Katakana);                  # ExtendNumLet on the left-hand side
81
82# WB rules 15 - 17: pairs of Regional Indicators stay unbroken.
83#   Interacts with WB3c: WB15 continues the pair through a following ZWJ Extended_Pict.
84WB15:  Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ Extended_Pict;
85WB17:  Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷;
86
87# Rule WB 999   Any ÷ Any
88#    Interacts with WB3c: do not break between ZWJ and Extended_Pict (no EBG set is defined in this file).
89WB999.1: . ExtFmt* ZWJ Extended_Pict;
90WB999.2: . ExtFmt* ÷;
91
92