#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html

# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
# file: sentence.txt

type = sentence;      # one of grapheme | word | line | sentence
locale = en;

# Character classes: one named set per value of the Sentence_Break property.
CR        = [\p{Sentence_Break = CR}];
LF        = [\p{Sentence_Break = LF}];
Extend    = [\p{Sentence_Break = Extend}];
Sep       = [\p{Sentence_Break = Sep}];
Format    = [\p{Sentence_Break = Format}];
Sp        = [\p{Sentence_Break = Sp}];
Lower     = [\p{Sentence_Break = Lower}];
Upper     = [\p{Sentence_Break = Upper}];
OLetter   = [\p{Sentence_Break = OLetter}];
Numeric   = [\p{Sentence_Break = Numeric}];
ATerm     = [\p{Sentence_Break = ATerm}];
SContinue = [\p{Sentence_Break = SContinue}];
STerm     = [\p{Sentence_Break = STerm}];
Close     = [\p{Sentence_Break = Close}];

# Derived sets: unions of the classes above, used to keep the rules short.
ParaSep   = [Sep CR LF];
SATerm    = [STerm ATerm];
ExtFmt    = [Extend Format];

# Rules. Each rule is labelled with its SBn rule number; a '÷' inside a rule
# marks a break position. ExtFmt* is written after each element so that
# Extend and Format characters are skipped over (see SB5 below).

# SB2:  ÷  eot
#       Conventional regular expression matching for '$' as end-of-text also matches
#         at a line separator just preceding the physical end of text.
#         Instead, use a look-ahead assertion that there is no following character.
SB2:     . ÷ (?!.);

SB3:     CR LF;
SB4:     ParaSep ÷;

# SB5:   ignore Format and Extend characters.

SB6:     ATerm ExtFmt* Numeric;
SB7:     (Upper | Lower) ExtFmt* ATerm ExtFmt* Upper;
SB8:     ATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* ([^OLetter Upper Lower ParaSep SATerm ExtFmt] ExtFmt *)* Lower;
SB8a:    SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (SContinue | SATerm);

SB9:     SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (CR LF | ParaSep)? ÷;
         # Also covers SB10, SB11.

SB12:    . ExtFmt* [^ExtFmt]?;