# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
#
#   Copyright (C) 2002-2015, International Business Machines Corporation and others.
#       All Rights Reserved.
#
#   file:  sent.txt
#
#   ICU Sentence Break Rules
#      See Unicode Standard Annex #29.
#      These rules are based on UAX #29 Revision 26 for Unicode Version 8.0
#

!!quoted_literals_only;

#
#  Base character classes, as defined in TR 29.
#  Each variable is the set of characters carrying one Sentence_Break
#  property value.
#
$CR         = [\p{Sentence_Break = CR}];
$LF         = [\p{Sentence_Break = LF}];
$Extend     = [\p{Sentence_Break = Extend}];
$Sep        = [\p{Sentence_Break = Sep}];
$Format     = [\p{Sentence_Break = Format}];
$Sp         = [\p{Sentence_Break = Sp}];
$Lower      = [\p{Sentence_Break = Lower}];
$Upper      = [\p{Sentence_Break = Upper}];
$OLetter    = [\p{Sentence_Break = OLetter}];
$Numeric    = [\p{Sentence_Break = Numeric}];
$ATerm      = [\p{Sentence_Break = ATerm}];
$SContinue  = [\p{Sentence_Break = SContinue}];
$STerm      = [\p{Sentence_Break = STerm}];
$Close      = [\p{Sentence_Break = Close}];

#
#  Extended forms of the character classes (Rules 4 and 5).
#  Each *Ex variable is the base class followed by any number of
#  trailing Extend or Format characters, so the rules below can treat
#  a base character plus its attached Extend/Format run as one unit.
#

$SpEx        = $Sp        ($Extend | $Format)*;
$LowerEx     = $Lower     ($Extend | $Format)*;
$UpperEx     = $Upper     ($Extend | $Format)*;
$OLetterEx   = $OLetter   ($Extend | $Format)*;
$NumericEx   = $Numeric   ($Extend | $Format)*;
$ATermEx     = $ATerm     ($Extend | $Format)*;
$SContinueEx = $SContinue ($Extend | $Format)*;
$STermEx     = $STerm     ($Extend | $Format)*;
$CloseEx     = $Close     ($Extend | $Format)*;


## -------------------------------------------------

!!chain;

# Rule 3 - break after separators.
#          A CR immediately followed by LF is matched as a single
#          sequence, which keeps the CR/LF pair together.
#
$CR $LF;


# Rule 4 - Break after $Sep.
# Rule 5 - Ignore $Format and $Extend.
#          Matches an optional character that is not Sep, CR or LF,
#          followed by any run of Extend / Format characters.
#
[^$Sep $CR $LF]? ($Extend | $Format)*;


# Rule 6 - ATerm followed by Numeric.
$ATermEx $NumericEx;

# Rule 7 - (Upper | Lower), then ATerm, then Upper.
($UpperEx | $LowerEx) $ATermEx $UpperEx;

# Rule 8 - ATerm, optional Close and Sp runs, then any number of
#          "not a letter" characters, then a Lower.
#          $NotLettersEx is any character that is none of OLetter, Upper,
#          Lower, Sep, CR, LF, ATerm or STerm, plus its trailing
#          Extend / Format characters.
#          Note that the rule ends in the bare $Lower class, not $LowerEx.
$NotLettersEx = [^$OLetter $Upper $Lower $Sep $CR $LF $ATerm $STerm] ($Extend | $Format)*;
$ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;

# Rule 8a - (STerm | ATerm), optional Close and Sp runs, then one of
#           SContinue, STerm or ATerm.
($STermEx | $ATermEx) $CloseEx* $SpEx* ($SContinueEx | $STermEx | $ATermEx);

# Rules 9, 10, 11 - (STerm | ATerm), optional Close and Sp runs, then an
#                   optional Sep, CR or LF.
($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?;

# Rule 12 - two rules sharing the same prefix: a starting character that
#           is none of STerm, ATerm, Close, Sp, Sep, LF, CR, Format or
#           Extend (or the {bof} marker), followed by any run of
#           Extend / Format / Close / Sp characters.
#           The first rule then accepts any single character.
#           The second accepts Sep, LF, CR, the {eof} marker, or a CR LF
#           pair, and carries the {100} tag.
#           NOTE(review): {100} looks like a rule status value - confirm
#           its meaning to callers against the ICU break-rule documentation.
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .;
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100};