• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#
2# Copyright (C) 2016 and later: Unicode, Inc. and others.
3# License & terms of use: http://www.unicode.org/copyright.html#License
4
5# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
6# file: sentence.txt
7
8type = sentence;      # one of grapheme | word | line | sentence
9locale = en;          # locale setting for this rule file (here: en); NOTE(review): exact use by the consumer is not visible here
10
11CR        = [\p{Sentence_Break = CR}];         # this and each definition through Close: the named set of characters having that Sentence_Break property value
12LF        = [\p{Sentence_Break = LF}];
13Extend    = [\p{Sentence_Break = Extend}];     # combined with Format into ExtFmt
14Sep       = [\p{Sentence_Break = Sep}];        # combined with CR and LF into ParaSep
15Format    = [\p{Sentence_Break = Format}];     # combined with Extend into ExtFmt
16Sp        = [\p{Sentence_Break = Sp}];
17Lower     = [\p{Sentence_Break = Lower}];
18Upper     = [\p{Sentence_Break = Upper}];
19OLetter   = [\p{Sentence_Break = OLetter}];
20Numeric   = [\p{Sentence_Break = Numeric}];
21ATerm     = [\p{Sentence_Break = ATerm}];      # combined with STerm into SATerm
22SContinue = [\p{Sentence_Break = SContinue}];
23STerm     = [\p{Sentence_Break = STerm}];      # combined with ATerm into SATerm
24Close     = [\p{Sentence_Break = Close}];
25
26ParaSep   = [Sep CR LF];       # any character in Sep, CR or LF; used by SB4, SB8 and SB9
27SATerm    = [STerm ATerm];     # any character in STerm or ATerm; used by SB8, SB8a and SB9
28ExtFmt    = [Extend Format];   # any character in Extend or Format; appears as ExtFmt* after elements in SB6..SB12 (see SB5)
29
30# SB2:  ÷  eot
31#       Conventional regular expression matching for '$' as end-of-text also matches
32#       at a line separator just preceding the physical end of text.
33#       Instead, use a look-ahead assertion that there is no following character.
34SB2:    . ÷ (?!.);      # any character, then ÷, then a negative look-ahead requiring that nothing follows
35
36SB3:    CR LF;          # CR immediately followed by LF; the rule contains no ÷ (presumably: no break between them)
37SB4:    ParaSep ÷;      # ÷ directly after any ParaSep character (Sep, CR or LF)
38
39# SB5: ignore Format and Extend characters. There is no separate SB5 rule; the rules below carry ExtFmt* after matched elements instead.
40
41SB6:    ATerm ExtFmt* Numeric;      # ATerm followed by Numeric; no ÷ in the rule (presumably: no break here)
42SB7:    (Upper | Lower) ExtFmt* ATerm ExtFmt* Upper;      # ATerm between an Upper/Lower and a following Upper; no ÷ in the rule
43SB8:    ATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* ([^OLetter Upper Lower ParaSep SATerm ExtFmt] ExtFmt *)* Lower;      # ATerm, optional Close and Sp runs, then any run of characters outside the listed sets, ending at a Lower; no ÷
44SB8a:   SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (SContinue | SATerm);      # SATerm, optional Close and Sp runs, then SContinue or another SATerm; no ÷
45
46SB9:    SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (CR LF | ParaSep)? ÷;      # ÷ after SATerm plus its optional Close run, Sp run, and at most one CR LF pair or ParaSep
47        # Also covers SB10, SB11.
48
49SB12:   . ExtFmt* [^ExtFmt]?;      # any character, its trailing ExtFmt run, and optionally one non-ExtFmt character; no ÷ (presumably the catch-all "no break" rule — confirm)
50
51